964 files changed, 86623 insertions, 77064 deletions
diff --git a/storage/archive/ha_archive.cc b/storage/archive/ha_archive.cc
index 9526306a601..2df54567285 100644
--- a/storage/archive/ha_archive.cc
+++ b/storage/archive/ha_archive.cc
@@ -215,7 +215,6 @@ int archive_db_init(void *p)
 #endif
 
   archive_hton= (handlerton *)p;
-  archive_hton->state= SHOW_OPTION_YES;
   archive_hton->db_type= DB_TYPE_ARCHIVE_DB;
   archive_hton->create= archive_create_handler;
   archive_hton->flags= HTON_NO_FLAGS;
@@ -293,7 +292,7 @@ int archive_discover(handlerton *hton, THD* thd, TABLE_SHARE *share)
   if (frm_stream.frm_length == 0)
     DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
-  frm_ptr= (uchar *)my_malloc(sizeof(char) * frm_stream.frm_length,
+  frm_ptr= (uchar *)my_malloc(PSI_INSTRUMENT_ME, frm_stream.frm_length,
                               MYF(MY_THREAD_SPECIFIC | MY_WME));
   if (!frm_ptr)
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -719,7 +718,7 @@ int ha_archive::frm_copy(azio_stream *src, azio_stream *dst)
     return 0;
   }
 
-  if (!(frm_ptr= (uchar *) my_malloc(src->frm_length,
+  if (!(frm_ptr= (uchar *) my_malloc(PSI_INSTRUMENT_ME, src->frm_length,
                                      MYF(MY_THREAD_SPECIFIC | MY_WME))))
     return HA_ERR_OUT_OF_MEM;
 
@@ -1228,8 +1227,8 @@ bool ha_archive::fix_rec_buff(unsigned int length)
   if (length > record_buffer->length)
   {
     uchar *newptr;
-    if (!(newptr=(uchar*) my_realloc((uchar*) record_buffer->buffer, 
-                                    length,
+    if (!(newptr=(uchar*) my_realloc(PSI_INSTRUMENT_ME,
+                                     (uchar*) record_buffer->buffer, length,
 				    MYF(MY_ALLOW_ZERO_PTR))))
       DBUG_RETURN(1);
     record_buffer->buffer= newptr;
@@ -1903,16 +1902,14 @@ archive_record_buffer *ha_archive::create_record_buffer(unsigned int length)
 {
   DBUG_ENTER("ha_archive::create_record_buffer");
   archive_record_buffer *r;
-  if (!(r= 
-        (archive_record_buffer*) my_malloc(sizeof(archive_record_buffer),
-                                           MYF(MY_WME))))
+  if (!(r= (archive_record_buffer*) my_malloc(PSI_INSTRUMENT_ME,
+                                 sizeof(archive_record_buffer), MYF(MY_WME))))
   {
     DBUG_RETURN(NULL); /* purecov: inspected */
   }
   r->length= (int)length;
 
-  if (!(r->buffer= (uchar*) my_malloc(r->length,
-                                    MYF(MY_WME))))
+  if (!(r->buffer= (uchar*) my_malloc(PSI_INSTRUMENT_ME, r->length, MYF(MY_WME))))
   {
     my_free(r);
     DBUG_RETURN(NULL); /* purecov: inspected */
diff --git a/storage/archive/ha_archive.h b/storage/archive/ha_archive.h
index 35291e469cd..2bb5079868b 100644
--- a/storage/archive/ha_archive.h
+++ b/storage/archive/ha_archive.h
@@ -61,7 +61,7 @@ public:
 */
 #define ARCHIVE_VERSION 3
 
-class ha_archive: public handler
+class ha_archive final : public handler
 {
   THR_LOCK_DATA lock;        /* MySQL lock */
   Archive_share *share;      /* Shared lock info */
diff --git a/storage/blackhole/ha_blackhole.cc b/storage/blackhole/ha_blackhole.cc
index 1b64db142e0..0134032351e 100644
--- a/storage/blackhole/ha_blackhole.cc
+++ b/storage/blackhole/ha_blackhole.cc
@@ -323,9 +323,8 @@ static st_blackhole_share *get_share(const char *table_name)
         my_hash_search(&blackhole_open_tables,
                        (uchar*) table_name, length)))
   {
-    if (!(share= (st_blackhole_share*) my_malloc(sizeof(st_blackhole_share) +
-                                                 length,
-                                                 MYF(MY_WME | MY_ZEROFILL))))
+    if (!(share= (st_blackhole_share*) my_malloc(PSI_INSTRUMENT_ME,
+              sizeof(st_blackhole_share) + length, MYF(MY_WME | MY_ZEROFILL))))
       goto error;
 
     share->table_name_length= length;
@@ -398,14 +397,15 @@ static int blackhole_init(void *p)
 #endif
 
   blackhole_hton= (handlerton *)p;
-  blackhole_hton->state= SHOW_OPTION_YES;
   blackhole_hton->db_type= DB_TYPE_BLACKHOLE_DB;
   blackhole_hton->create= blackhole_create_handler;
+  blackhole_hton->drop_table= [](handlerton *, const char*) { return -1; };
   blackhole_hton->flags= HTON_CAN_RECREATE;
 
   mysql_mutex_init(bh_key_mutex_blackhole,
                    &blackhole_mutex, MY_MUTEX_INIT_FAST);
-  (void) my_hash_init(&blackhole_open_tables, system_charset_info,32,0,0,
+  (void) my_hash_init(PSI_INSTRUMENT_ME, &blackhole_open_tables,
+                      system_charset_info, 32, 0, 0,
                       (my_hash_get_key) blackhole_get_key,
                       (my_hash_free_key) blackhole_free_key, 0);
 
@@ -423,23 +423,6 @@ static int blackhole_fini(void *p)
 struct st_mysql_storage_engine blackhole_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
-mysql_declare_plugin(blackhole)
-{
-  MYSQL_STORAGE_ENGINE_PLUGIN,
-  &blackhole_storage_engine,
-  "BLACKHOLE",
-  "MySQL AB",
-  "/dev/null storage engine (anything you write to it disappears)",
-  PLUGIN_LICENSE_GPL,
-  blackhole_init, /* Plugin Init */
-  blackhole_fini, /* Plugin Deinit */
-  0x0100 /* 1.0 */,
-  NULL,                       /* status variables                */
-  NULL,                       /* system variables                */
-  NULL,                       /* config options                  */
-  0,                          /* flags                           */
-}
-mysql_declare_plugin_end;
 maria_declare_plugin(blackhole)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
diff --git a/storage/blackhole/ha_blackhole.h b/storage/blackhole/ha_blackhole.h
index e827cd25bfc..baa140bc04a 100644
--- a/storage/blackhole/ha_blackhole.h
+++ b/storage/blackhole/ha_blackhole.h
@@ -37,7 +37,7 @@ struct st_blackhole_share {
   Class definition for the blackhole storage engine
   "Dumbest named feature ever"
 */
-class ha_blackhole: public handler
+class ha_blackhole final : public handler
 {
   THR_LOCK_DATA lock;      /* MySQL lock */
   st_blackhole_share *share;
@@ -96,6 +96,10 @@ public:
   THR_LOCK_DATA **store_lock(THD *thd,
                              THR_LOCK_DATA **to,
                              enum thr_lock_type lock_type);
+  int delete_table(const char *name)
+  {
+    return 0;
+  }
 private:
   virtual int write_row(const uchar *buf);
   virtual int update_row(const uchar *old_data, const uchar *new_data);
diff --git a/storage/cassandra/CMakeLists.txt b/storage/cassandra/CMakeLists.txt
index fe32f69c10b..6dfa1d40cb9 100644
--- a/storage/cassandra/CMakeLists.txt
+++ b/storage/cassandra/CMakeLists.txt
@@ -59,5 +59,5 @@ LINK_DIRECTORIES(${LINK_DIR})
 
 IF(CASSANDRASE_OK)
   MYSQL_ADD_PLUGIN(cassandra ${cassandra_sources} STORAGE_ENGINE
-    MODULE_ONLY LINK_LIBRARIES thrift COMPONENT cassandra-engine)
+    MODULE_ONLY LINK_LIBRARIES thrift COMPONENT cassandra-engine DISABLED)
 ENDIF(CASSANDRASE_OK)
diff --git a/storage/cassandra/ha_cassandra.cc b/storage/cassandra/ha_cassandra.cc
index 1d2331c1a5e..54812b51db5 100644
--- a/storage/cassandra/ha_cassandra.cc
+++ b/storage/cassandra/ha_cassandra.cc
@@ -1,5 +1,5 @@
 /*
-   Copyright (c) 2012, Monty Program Ab
+   Copyright (c) 2012, 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -242,10 +242,9 @@ static int cassandra_init_func(void *p)
 
   cassandra_hton= (handlerton *)p;
   mysql_mutex_init(ex_key_mutex_example, &cassandra_mutex, MY_MUTEX_INIT_FAST);
-  (void) my_hash_init(&cassandra_open_tables,system_charset_info,32,0,0,
+  (void) my_hash_init(PSI_INSTRUMENT_ME, &cassandra_open_tables,system_charset_info,32,0,0,
                       (my_hash_get_key) cassandra_get_key,0,0);
 
-  cassandra_hton->state=   SHOW_OPTION_YES;
   cassandra_hton->create=  cassandra_create_handler;
   /*
     Don't specify HTON_CAN_RECREATE in flags. re-create is used by TRUNCATE
@@ -298,7 +297,7 @@ static CASSANDRA_SHARE *get_share(const char *table_name, TABLE *table)
                                               length)))
   {
     if (!(share=(CASSANDRA_SHARE *)
-          my_multi_malloc(MYF(MY_WME | MY_ZEROFILL),
+          my_multi_malloc(MYF(MY_WME | MY_ZEROFILL), PSI_INSTRUMENT_ME,
                           &share, sizeof(*share),
                           &tmp_name, length+1,
                           NullS)))
@@ -866,7 +865,7 @@ static void alloc_strings_memroot(MEM_ROOT *mem_root)
       The mem_root used to allocate UUID (of length 36 + \0) so make
       appropriate allocated size
     */
-    init_alloc_root(mem_root, "cassandra",
+    init_alloc_root(PSI_INSTRUMENT_ME, mem_root,
                     (36 + 1 + ALIGN_SIZE(sizeof(USED_MEM))) * 10 +
                     ALLOC_ROOT_MIN_BLOCK_SIZE,
                     (36 + 1 + ALIGN_SIZE(sizeof(USED_MEM))) * 10 +
@@ -1105,7 +1104,7 @@ bool cassandra_to_dyncol_strUTF8(const char *cass_data,
                                  MEM_ROOT *mem_root __attribute__((unused)))
 {
   return cassandra_to_dyncol_strStr(cass_data, cass_data_len, value,
-                                    &my_charset_utf8_unicode_ci);
+                                    &my_charset_utf8mb3_unicode_ci);
 }
 
 bool dyncol_to_cassandraUTF8(DYNAMIC_COLUMN_VALUE *value,
@@ -1113,7 +1112,7 @@ bool dyncol_to_cassandraUTF8(DYNAMIC_COLUMN_VALUE *value,
                              void* buff, void **freemem)
 {
   return dyncol_to_cassandraStr(value, cass_data, cass_data_len,
-                                buff, freemem, &my_charset_utf8_unicode_ci);
+                                buff, freemem, &my_charset_utf8mb3_unicode_ci);
 }
 
 bool cassandra_to_dyncol_strUUID(const char *cass_data,
@@ -1447,7 +1446,7 @@ bool ha_cassandra::setup_field_converters(Field **field_arg, uint n_fields)
   size_t memsize= sizeof(ColumnDataConverter*) * n_fields +
     (sizeof(LEX_STRING) + sizeof(CASSANDRA_TYPE_DEF))*
     (dyncol_set ? max_non_default_fields : 0);
-  if (!(field_converters= (ColumnDataConverter**)my_malloc(memsize, MYF(0))))
+  if (!(field_converters= (ColumnDataConverter**)my_malloc(PSI_INSTRUMENT_ME, memsize, MYF(0))))
     DBUG_RETURN(true);
   bzero(field_converters, memsize);
   n_field_converters= n_fields;
@@ -1459,12 +1458,12 @@ bool ha_cassandra::setup_field_converters(Field **field_arg, uint n_fields)
     special_type_field_names=
       ((LEX_STRING*)(special_type_field_converters + max_non_default_fields));
 
-    if (my_init_dynamic_array(&dynamic_values,
+    if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &dynamic_values,
                            sizeof(DYNAMIC_COLUMN_VALUE),
                            DYNCOL_USUAL, DYNCOL_DELTA, MYF(0)))
       DBUG_RETURN(true);
     else
-      if (my_init_dynamic_array(&dynamic_names,
+      if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &dynamic_names,
                              sizeof(LEX_STRING),
                              DYNCOL_USUAL, DYNCOL_DELTA,MYF(0)))
       {
@@ -2171,7 +2170,7 @@ int ha_cassandra::info(uint flag)
 }
 
 
-void key_copy(uchar *to_key, const uchar *from_record, KEY *key_info,
+void key_copy(uchar *to_key, const uchar *from_record, const KEY *key_info,
               uint key_length, bool with_zerofill);
 
 
@@ -2528,14 +2527,6 @@ THR_LOCK_DATA **ha_cassandra::store_lock(THD *thd,
 }
 
 
-ha_rows ha_cassandra::records_in_range(uint inx, key_range *min_key,
-                                       key_range *max_key)
-{
-  DBUG_ENTER("ha_cassandra::records_in_range");
-  DBUG_RETURN(HA_POS_ERROR); /* Range scans are not supported */
-}
-
-
 /**
   check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
   if new and old definition are compatible
diff --git a/storage/cassandra/ha_cassandra.h b/storage/cassandra/ha_cassandra.h
index a36d58fa4da..29987ec804b 100644
--- a/storage/cassandra/ha_cassandra.h
+++ b/storage/cassandra/ha_cassandra.h
@@ -1,5 +1,5 @@
 /*
-   Copyright (c) 2012, Monty Program Ab
+   Copyright (c) 2012, 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -129,13 +129,13 @@ public:
     The name of the index type that will be used for display.
     Don't implement this method unless you really have indexes.
    */
-  const char *index_type(uint inx) { return "HASH"; }
+  const char *index_type(uint) override { return "HASH"; }
 
   /** @brief
     This is a list of flags that indicate what functionality the storage engine
     implements. The current table flags are documented in handler.h
   */
-  ulonglong table_flags() const
+  ulonglong table_flags() const override
   {
     return HA_BINLOG_STMT_CAPABLE |
            HA_REC_NOT_IN_SEQ |
@@ -157,7 +157,7 @@ public:
     If all_parts is set, MySQL wants to know the flags for the combined
     index, up to and including 'part'.
   */
-  ulong index_flags(uint inx, uint part, bool all_parts) const
+  ulong index_flags(uint, uint, bool) const override
   {
     return 0;
   }
@@ -169,11 +169,11 @@ public:
     send. Return *real* limits of your storage engine here; MySQL will do
     min(your_limits, MySQL_limits) automatically.
    */
-  uint max_supported_record_length() const { return HA_MAX_REC_LENGTH; }
+  uint max_supported_record_length() const override {return HA_MAX_REC_LENGTH;}
 
   /* Support only one Primary Key, for now */
-  uint max_supported_keys()          const { return 1; }
-  uint max_supported_key_parts()     const { return 1; }
+  uint max_supported_keys()          const override { return 1; }
+  uint max_supported_key_parts()     const override { return 1; }
 
   /** @brief
     unireg.cc will call this to make sure that the storage engine can handle
@@ -184,42 +184,48 @@ public:
     There is no need to implement ..._key_... methods if your engine doesn't
     support indexes.
    */
-  uint max_supported_key_length()    const { return 16*1024; /* just to return something*/ }
+  uint max_supported_key_length() const override
+  { return 16*1024; /* just to return something*/ }
 
-  int index_init(uint idx, bool sorted);
+  int index_init(uint idx, bool sorted) override;
 
   int index_read_map(uchar * buf, const uchar * key,
                      key_part_map keypart_map,
-                     enum ha_rkey_function find_flag);
+                     enum ha_rkey_function find_flag) override;
 
   /** @brief
     Called in test_quick_select to determine if indexes should be used.
   */
-  virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; }
+  double scan_time() override
+  { return (double) (stats.records+stats.deleted) / 20.0+10; }
 
   /** @brief
     This method will never be called if you do not implement indexes.
   */
-  virtual double read_time(uint, uint, ha_rows rows)
+  double read_time(uint, uint, ha_rows rows) override
   { return (double) rows /  20.0+1; }
 
-  virtual void start_bulk_insert(ha_rows rows, uint flags);
-  virtual int end_bulk_insert();
+  void start_bulk_insert(ha_rows rows, uint flags) override;
+  int end_bulk_insert() override;
 
-  virtual int reset();
+  int reset() override;
 
 
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
-  int multi_range_read_next(range_id_t *range_info);
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+    override;
+  int multi_range_read_next(range_id_t *range_info) override;
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param,
                                       uint n_ranges, uint *bufsz,
-                                      uint *flags, Cost_estimate *cost);
+                                      uint *flags, Cost_estimate *cost)
+    override;
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                 uint key_parts, uint *bufsz,
-                                uint *flags, Cost_estimate *cost);
-  int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size);
+                                uint *flags, Cost_estimate *cost)
+    override;
+  int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size)
+    override;
 
 private:
   bool source_exhausted;
@@ -236,12 +242,12 @@ private:
   CASSANDRA_TYPE_DEF * get_cassandra_field_def(char *cass_name,
                                                int cass_name_length);
 public:
-  int open(const char *name, int mode, uint test_if_locked);
-  int close(void);
+  int open(const char *name, int mode, uint test_if_locked) override;
+  int close() override;
 
-  int write_row(const uchar *buf);
-  int update_row(const uchar *old_data, const uchar *new_data);
-  int delete_row(const uchar *buf);
+  int write_row(const uchar *buf) override;
+  int update_row(const uchar *old_data, const uchar *new_data) override;
+  int delete_row(const uchar *buf) override;
 
   /** @brief
     Unlike index_init(), rnd_init() can be called two consecutive times
@@ -251,28 +257,31 @@ public:
     cursor to the start of the table; no need to deallocate and allocate
     it again. This is a required method.
   */
-  int rnd_init(bool scan);                                      //required
-  int rnd_end();
-  int rnd_next(uchar *buf);                                     ///< required
-  int rnd_pos(uchar *buf, uchar *pos);                          ///< required
-  void position(const uchar *record);                           ///< required
-  int info(uint);                                               ///< required
-  int delete_all_rows(void);
-  ha_rows records_in_range(uint inx, key_range *min_key,
-                           key_range *max_key);
+  int rnd_init(bool scan) override;
+  int rnd_end() override;
+  int rnd_next(uchar *buf) override;
+  int rnd_pos(uchar *buf, uchar *pos) override;
+  void position(const uchar *record) override;
+  int info(uint) override;
+  int delete_all_rows() override;
+  ha_rows records_in_range(uint, const key_range *min_key,
+                           const key_range *max_key,
+                           page_range *res) override
+  { return HA_POS_ERROR; /* Range scans are not supported */ }
+
   int create(const char *name, TABLE *form,
-             HA_CREATE_INFO *create_info);                      ///< required
+             HA_CREATE_INFO *create_info) override;
   bool check_if_incompatible_data(HA_CREATE_INFO *info,
-                                  uint table_changes);
+                                  uint table_changes) override;
 
   THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
-                             enum thr_lock_type lock_type);     ///< required
+                             enum thr_lock_type lock_type) override;
 
   my_bool register_query_cache_table(THD *thd, const char *table_key,
                                      uint key_length,
                                      qc_engine_callback
                                      *engine_callback,
-                                     ulonglong *engine_data)
+                                     ulonglong *engine_data) override
   {
     /* 
       Do not put data from Cassandra tables into query cache (because there 
diff --git a/storage/columnstore/CMakeLists.txt b/storage/columnstore/CMakeLists.txt
new file mode 100644
index 00000000000..81690f14b15
--- /dev/null
+++ b/storage/columnstore/CMakeLists.txt
@@ -0,0 +1,36 @@
+#set(PLUGIN_COLUMNSTORE "NO" CACHE STRING "Enable ColumnStore engine")
+
+if("NO" STREQUAL "${PLUGIN_COLUMNSTORE}")
+  return()
+endif()
+
+# Read variable V as defined in the columnstore subdirectory, append the
+# extra macro arguments to it, and set the result in the parent scope.
+macro(APPEND_FOR_CPACK V)
+  get_directory_property(var DIRECTORY columnstore DEFINITION ${V})
+  set(${V} "${var}${ARGN}" PARENT_SCOPE)
+endmacro()
+
+IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR
+CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
+    add_subdirectory(columnstore)
+
+    IF(TARGET columnstore)
+        # Needed to bump the component changes up to the main scope
+        APPEND_FOR_CPACK(CPACK_COMPONENTS_ALL)
+        IF (RPM)
+            APPEND_FOR_CPACK(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES " binutils net-tools python3")
+            APPEND_FOR_CPACK(CPACK_RPM_columnstore-engine_PACKAGE_RECOMMENDS " jemalloc")
+            APPEND_FOR_CPACK(CPACK_RPM_columnstore-engine_USER_FILELIST ";%ignore /var/lib;%ignore /var")
+            APPEND_FOR_CPACK(CPACK_RPM_columnstore-engine_PACKAGE_CONFLICTS " thrift MariaDB-columnstore-platform MariaDB-columnstore-libs")
+            # these three don't have the list semantics, so no append here
+            SET(CPACK_RPM_columnstore-engine_PRE_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/columnstore/build/preInstall_storage_engine.sh PARENT_SCOPE)
+            SET(CPACK_RPM_columnstore-engine_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/columnstore/build/postInstall_storage_engine.sh PARENT_SCOPE)
+            SET(CPACK_RPM_columnstore-engine_PRE_UNINSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/columnstore/build/preUn_storage_engine.sh PARENT_SCOPE)
+            APPEND_FOR_CPACK(CPACK_RPM_columnstore-engine_PACKAGE_VERSION  "")
+            SET(CPACK_RPM_columnstore-engine_PACKAGE_SUMMARY "MariaDB ColumnStore storage engine" PARENT_SCOPE)
+            SET(CPACK_RPM_columnstore-engine_PACKAGE_DESCRIPTION "The MariaDB ColumnStore storage engine is a high-performance columnar analytical engine, aimed at rapid processing of analytical queries on very large amounts of data." PARENT_SCOPE)
+        ENDIF()
+        INSTALL_MYSQL_TEST("${CMAKE_CURRENT_SOURCE_DIR}/mysql-test/" "plugin/columnstore")
+    ENDIF()
+ENDIF()
diff --git a/storage/columnstore/columnstore b/storage/columnstore/columnstore
new file mode 160000
+Subproject 105aeb3e1e25dbb3dee6a53425dc665c7b7332d
diff --git a/storage/columnstore/mysql-test/columnstore/include/cleanup_columnstore.inc b/storage/columnstore/mysql-test/columnstore/include/cleanup_columnstore.inc
new file mode 100644
index 00000000000..c1286a5ea64
--- /dev/null
+++ b/storage/columnstore/mysql-test/columnstore/include/cleanup_columnstore.inc
@@ -0,0 +1,4 @@
+--disable_query_log
+DROP FUNCTION IF EXISTS mcssystemready;
+--enable_query_log
+
diff --git a/storage/columnstore/mysql-test/columnstore/include/have_columnstore.inc b/storage/columnstore/mysql-test/columnstore/include/have_columnstore.inc
new file mode 100644
index 00000000000..f6b97fed471
--- /dev/null
+++ b/storage/columnstore/mysql-test/columnstore/include/have_columnstore.inc
@@ -0,0 +1,13 @@
+if (`SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'columnstore' AND support IN ('YES', 'DEFAULT', 'ENABLED')`)
+{
+  --skip Test requires engine ColumnStore.
+}
+
+--disable_query_log
+CREATE FUNCTION mcssystemready RETURNS INTEGER SONAME 'ha_columnstore.so';
+--enable_query_log
+
+if (`SELECT mcssystemready() = 0`)
+{
+  --skip Test requires ColumnStore to be running
+}
diff --git a/storage/columnstore/mysql-test/columnstore/r/basic.result b/storage/columnstore/mysql-test/columnstore/r/basic.result
new file mode 100644
index 00000000000..3be76759732
--- /dev/null
+++ b/storage/columnstore/mysql-test/columnstore/r/basic.result
@@ -0,0 +1,17 @@
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (a INT, b VARCHAR(255)) ENGINE=columnstore;
+INSERT INTO t1 (a, b) VALUES (1, 'columnstore'), (2, 'test');
+SELECT * FROM t1;
+a	b
+1	columnstore
+2	test
+UPDATE t1 SET a=a+100;
+SELECT * FROM t1;
+a	b
+101	columnstore
+102	test
+DELETE FROM t1 WHERE a=101;
+SELECT * FROM t1;
+a	b
+102	test
+DROP TABLE t1;
diff --git a/storage/columnstore/mysql-test/columnstore/suite.opt b/storage/columnstore/mysql-test/columnstore/suite.opt
new file mode 100644
index 00000000000..fbd322fdd72
--- /dev/null
+++ b/storage/columnstore/mysql-test/columnstore/suite.opt
@@ -0,0 +1 @@
+--plugin-load-add=$HA_COLUMNSTORE_SO
diff --git a/storage/columnstore/mysql-test/columnstore/suite.pm b/storage/columnstore/mysql-test/columnstore/suite.pm
new file mode 100644
index 00000000000..5579df4048e
--- /dev/null
+++ b/storage/columnstore/mysql-test/columnstore/suite.pm
@@ -0,0 +1,7 @@
+package My::Suite::ColumnStore;
+
+use My::SysInfo;
+
+@ISA = qw(My::Suite);
+
+bless { };
diff --git a/storage/columnstore/mysql-test/columnstore/t/basic.test b/storage/columnstore/mysql-test/columnstore/t/basic.test
new file mode 100644
index 00000000000..34ab07d7f64
--- /dev/null
+++ b/storage/columnstore/mysql-test/columnstore/t/basic.test
@@ -0,0 +1,20 @@
+--source include/have_columnstore.inc
+
+--disable_warnings
+DROP TABLE IF EXISTS t1;
+--enable_warnings
+
+CREATE TABLE t1 (a INT, b VARCHAR(255)) ENGINE=columnstore;
+
+INSERT INTO t1 (a, b) VALUES (1, 'columnstore'), (2, 'test');
+SELECT * FROM t1;
+
+UPDATE t1 SET a=a+100;
+SELECT * FROM t1;
+
+DELETE FROM t1 WHERE a=101;
+SELECT * FROM t1;
+
+DROP TABLE t1;
+
+--source include/cleanup_columnstore.inc
diff --git a/storage/connect/CMakeLists.txt b/storage/connect/CMakeLists.txt
index be1197b4c58..c5e54daf093 100644
--- a/storage/connect/CMakeLists.txt
+++ b/storage/connect/CMakeLists.txt
@@ -152,10 +152,6 @@ ADD_FEATURE_INFO(CONNECT_LIBXML2 CONNECT_WITH_LIBXML2
 
 
 IF(WIN32)
-  # /MP option of the Microsoft compiler does not work well with COM #import
-  string(REPLACE "/MP" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-  string(REPLACE "/MP" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-
   OPTION(CONNECT_WITH_MSXML "Compile CONNECT storage engine with MSXML support" ON)
   IF(CONNECT_WITH_MSXML)
     add_definitions(-DMSX6 -DDOMDOC_SUPPORT)
@@ -414,6 +410,19 @@ IF(NOT TARGET connect)
   RETURN()
 ENDIF()
 
+IF(MSVC AND (CMAKE_CXX_FLAGS MATCHES "/MP"))
+  # domdoc.cpp uses the compiler directive #import, which is not compatible
+  # with the /MP option and results in compiler error C2813.
+  # Drop /MP globally and re-enable it for every source except domdoc.cpp.
+  SET(src_list ${CONNECT_SOURCES})
+  LIST(FIND src_list  domdoc.cpp idx)
+  IF(idx GREATER -1)
+    STRING(REPLACE "/MP" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    LIST(REMOVE_AT src_list ${idx})
+    SET_SOURCE_FILES_PROPERTIES(${src_list} PROPERTIES COMPILE_FLAGS "/MP")
+  ENDIF()
+ENDIF()
+
 IF(WIN32)
   IF (libmongoc-1.0_FOUND)
     SET_TARGET_PROPERTIES(connect PROPERTIES LINK_FLAGS
diff --git a/storage/connect/bson.cpp b/storage/connect/bson.cpp
index bf03f47a10d..fa9cd5f1e52 100644
--- a/storage/connect/bson.cpp
+++ b/storage/connect/bson.cpp
@@ -83,7 +83,7 @@ BDOC::BDOC(PGLOBAL G) : BJSON(G, NULL)
 PBVAL BDOC::ParseJson(PGLOBAL g, char* js, size_t lng)
 {
   size_t i;
-  bool  b = false, ptyp = (bool *)pty;
+  bool  b = false;
   PBVAL bvp = NULL;
 
   s = js;
@@ -145,7 +145,7 @@ PBVAL BDOC::ParseJson(PGLOBAL g, char* js, size_t lng)
           b = false;
           break;
         } // endif b
-
+        /* fall through */
       default:
         if (bvp->Type != TYPE_UNKNOWN) {
           bvp->To_Val = ParseAsArray(i);
@@ -683,7 +683,7 @@ bool BDOC::SerializeArray(OFFSET arp, bool b)
   } else if (jp->WriteChr('['))
     return true;
 
-  for (vp; vp; vp = MVP(vp->Next)) {
+  for (; vp; vp = MVP(vp->Next)) {
     if (first)
       first = false;
     else if ((!b || jp->Prty()) && jp->WriteChr(','))
@@ -718,7 +718,7 @@ bool BDOC::SerializeObject(OFFSET obp)
   if (jp->WriteChr('{'))
     return true;
 
-  for (prp; prp; prp = GetNext(prp)) {
+  for (; prp; prp = GetNext(prp)) {
     if (first)
       first = false;
     else if (jp->WriteChr(','))
diff --git a/storage/connect/bsonudf.cpp b/storage/connect/bsonudf.cpp
index 491f388bc5f..12f19484458 100644
--- a/storage/connect/bsonudf.cpp
+++ b/storage/connect/bsonudf.cpp
@@ -88,6 +88,7 @@ static PBSON BbinAlloc(PGLOBAL g, ulong len, PBVAL jsp)
 /*********************************************************************************/
 /*  SubAlloc a new BJNX class with protection against memory exhaustion.         */
 /*********************************************************************************/
+#ifdef NOT_USED
 static PBJNX BjnxNew(PGLOBAL g, PBVAL vlp, int type, int len)
 {
 	PBJNX bjnx;
@@ -104,7 +105,7 @@ static PBJNX BjnxNew(PGLOBAL g, PBVAL vlp, int type, int len)
 
 	return bjnx;
 } /* end of BjnxNew */
-
+#endif
 /* ----------------------------------- BSNX ------------------------------------ */
 
 /*********************************************************************************/
@@ -287,7 +288,7 @@ my_bool BJNX::ParseJpath(PGLOBAL g)
 {
 	char* p, * p1 = NULL, * p2 = NULL, * pbuf = NULL;
 	int     i;
-	my_bool a, mul = false;
+	my_bool a;
 
 	if (Parsed)
 		return false;                       // Already done
@@ -498,7 +499,8 @@ void BJNX::SetJsonValue(PGLOBAL g, PVAL vp, PBVAL vlp)
 			break;
 		case TYPE_NULL:
 			vp->SetNull(true);
-		default:
+                        /* fall through */
+                default:
 			vp->Reset();
 		} // endswitch Type
 
@@ -541,7 +543,6 @@ PVAL BJNX::GetColumnValue(PGLOBAL g, PBVAL row, int i)
 /*********************************************************************************/
 PBVAL BJNX::GetRowValue(PGLOBAL g, PBVAL row, int i)
 {
-	my_bool expd = false;
 	PBVAL   bap;
 	PBVAL   vlp = NULL;
 
@@ -1084,7 +1085,7 @@ my_bool BJNX::CheckPath(PGLOBAL g, UDF_ARGS *args, PBVAL jsp, PBVAL& jvp, int n)
 PSZ BJNX::Locate(PGLOBAL g, PBVAL jsp, PBVAL jvp, int k)
 {
 	PSZ     str = NULL;
-	my_bool b = false, err = true;
+	my_bool err = true;
 
 	g->Message[0] = 0;
 
@@ -1205,7 +1206,7 @@ my_bool BJNX::LocateValue(PGLOBAL g, PBVAL jvp)
 PSZ BJNX::LocateAll(PGLOBAL g, PBVAL jsp, PBVAL bvp, int mx)
 {
 	PSZ     str = NULL;
-	my_bool b = false, err = true;
+	my_bool err = true;
 	PJPN    jnp;
 
 	if (!jsp) {
@@ -2902,7 +2903,7 @@ my_bool bson_array_grp_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
 		return true;
 
 	PGLOBAL g = (PGLOBAL)initid->ptr;
-	PBJNX   bxp = new(g) BJNX(g);
+	(void) new(g) BJNX(g);
 
 	JsonMemSave(g);
 	return false;
@@ -2975,7 +2976,7 @@ my_bool bson_object_grp_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
 		return true;
 
 	PGLOBAL g = (PGLOBAL)initid->ptr;
-	PBJNX   bxp = new(g) BJNX(g);
+	(void) new(g) BJNX(g);
 
 	JsonMemSave(g);
 	return false;
@@ -3045,7 +3046,7 @@ my_bool bson_test_init(UDF_INIT* initid, UDF_ARGS* args, char* message) {
 
 char* bson_test(UDF_INIT* initid, UDF_ARGS* args, char* result,
 	unsigned long* res_length, char* is_null, char* error) {
-	char* str = NULL, * sap = NULL, * fn = NULL;
+	char* str = NULL, * fn = NULL;
 	int     pretty = 1;
 	PBVAL   bvp;
 	PGLOBAL g = (PGLOBAL)initid->ptr;
@@ -4690,7 +4691,7 @@ char *bfile_convert(UDF_INIT* initid, UDF_ARGS* args, char* result,
 		str = (char*)g->Xchk;
 
 	if (!str) {
-		PUSH_WARNING(*g->Message ? g->Message : "Unexpected error");
+		PUSH_WARNING(g->Message[0] != '\0' ? g->Message : "Unexpected error");
 		*is_null = 1;
 		*error = 1;
 		*res_length = 0;
@@ -4813,7 +4814,7 @@ char *bfile_bjson(UDF_INIT *initid, UDF_ARGS *args, char *result,
 		str = (char*)g->Xchk;
 
 	if (!str) {
-		if (*g->Message)
+		if (g->Message[0] != '\0')
 			str = strcpy(result, g->Message);
 		else
 			str = strcpy(result, "Unexpected error");
@@ -5043,7 +5044,7 @@ char* bbin_array_add_values(UDF_INIT* initid, UDF_ARGS* args, char* result,
 		if (!CheckMemory(g, initid, args, args->arg_count, true)) {
 			uint  i = 0;
 			BJNX  bnx(g);
-			PBVAL arp, top, jvp = NULL;
+			PBVAL arp, top;
 			PBVAL bvp = bnx.MakeValue(args, 0, true, &top);
 
 			if (bvp->Type == TYPE_JAR) {
@@ -5667,7 +5668,7 @@ char *bbin_get_item(UDF_INIT *initid, UDF_ARGS *args, char *result,
 	if (g->Xchk) {
 		bsp = (PBSON)g->Xchk;
 	} else if (!CheckMemory(g, initid, args, 1, true, true)) {
-		char *path = MakePSZ(g, args, 1);
+                // char *path = MakePSZ(g, args, 1);
 		BJNX  bnx(g, NULL, TYPE_STRING, initid->max_length);
 		PBVAL top, jvp = NULL;
 		PBVAL jsp = bnx.MakeValue(args, 0, true, &top);
diff --git a/storage/connect/colblk.cpp b/storage/connect/colblk.cpp
index 19e98537d27..d531685950d 100644
--- a/storage/connect/colblk.cpp
+++ b/storage/connect/colblk.cpp
@@ -81,13 +81,12 @@ COLBLK::COLBLK(PCOL col1, PTDB tdbp)
 
   if (tdbp) {
     // Attach the new column to the table block
-    if (!tdbp->GetColumns())
+    if (!tdbp->GetColumns()) {
       tdbp->SetColumns(this);
-    else {
+    } else {
       for (colp = tdbp->GetColumns(); colp->Next; colp = colp->Next) ;
-
       colp->Next = this;
-      } // endelse
+    } // endelse
   }
 
   } // end of COLBLK copy constructor
diff --git a/storage/connect/filamdbf.cpp b/storage/connect/filamdbf.cpp
index 71bf626c08c..3d1718b9983 100644
--- a/storage/connect/filamdbf.cpp
+++ b/storage/connect/filamdbf.cpp
@@ -337,7 +337,7 @@ PQRYRES DBFColumns(PGLOBAL g, PCSZ dp, PCSZ fn, PTOS topt, bool info)
           hp->Encryptflag, hp->Mdxflag, hp->Language);
     htrc("%hd records, last changed %02d/%02d/%d\n",
           hp->Records(), hp->Filedate[1], hp->Filedate[2],
-          hp->Filedate[0] + (hp->Filedate[0] <= 30) ? 2000 : 1900);
+          hp->Filedate[0] + ((hp->Filedate[0] <= 30) ? 2000 : 1900));
     htrc("Field    Type  Offset  Len  Dec  Set  Mdx\n");
     } // endif trace
 
@@ -949,7 +949,7 @@ int DBFFAM::DeleteRecords(PGLOBAL g, int irc)
     }
     *Tdbp->GetLine() = '*';
     Modif++;                         // Modified line in Delete mode
-    } // endif irc
+  } // endif irc
 
   return RC_OK;
   } // end of DeleteRecords
diff --git a/storage/connect/filamfix.cpp b/storage/connect/filamfix.cpp
index 1b1cb7ca3f1..46f3ea01129 100644
--- a/storage/connect/filamfix.cpp
+++ b/storage/connect/filamfix.cpp
@@ -135,8 +135,6 @@ bool FIXFAM::AllocateBuffer(PGLOBAL g)
       // The buffer must be prepared depending on column types
       int     n = 0;
       bool    b = false;
-      PDOSDEF defp __attribute__((unused))= (PDOSDEF)Tdbp->GetDef();
-//    PCOLDEF cdp;
       PBINCOL colp;
 
       // Prepare the first line of the buffer
diff --git a/storage/connect/filamgz.cpp b/storage/connect/filamgz.cpp
index 1fe632b0bcf..634599eced3 100644
--- a/storage/connect/filamgz.cpp
+++ b/storage/connect/filamgz.cpp
@@ -647,7 +647,7 @@ int ZBKFAM::WriteBuffer(PGLOBAL g)
 int ZBKFAM::DeleteRecords(PGLOBAL g, int irc)
   {
   if (irc == RC_EF) {
-    LPCSTR  name __attribute__((unused)) = Tdbp->GetName();
+    (void) Tdbp->GetName();                     // XXX Should be removed?
     PDOSDEF defp = (PDOSDEF)Tdbp->GetDef();
 
     defp->SetBlock(0);
@@ -673,7 +673,7 @@ void ZBKFAM::CloseTableFile(PGLOBAL g, bool)
   int rc = RC_OK;
 
   if (Tdbp->GetMode() == MODE_INSERT) {
-    LPCSTR  name __attribute__((unused))= Tdbp->GetName();
+    (void) Tdbp->GetName();                     // XXX Should be removed?
     PDOSDEF defp = (PDOSDEF)Tdbp->GetDef();
 
     if (CurNum && !Closing) {
@@ -1356,7 +1356,7 @@ void ZLBFAM::CloseTableFile(PGLOBAL g, bool)
   int rc = RC_OK;
 
   if (Tdbp->GetMode() == MODE_INSERT) {
-    LPCSTR  name __attribute__((unused))= Tdbp->GetName();
+    (void) Tdbp->GetName();                     // XXX Should be removed?
     PDOSDEF defp = (PDOSDEF)Tdbp->GetDef();
 
     // Closing is True if last Write was in error
diff --git a/storage/connect/filamvct.cpp b/storage/connect/filamvct.cpp
index fd3ce79c762..2cd80f5d43f 100644
--- a/storage/connect/filamvct.cpp
+++ b/storage/connect/filamvct.cpp
@@ -1164,7 +1164,6 @@ bool VCTFAM::ResetTableSize(PGLOBAL g, int block, int last)
     if (!Header) {
       // Update catalog values for Block and Last
       PVCTDEF defp = (PVCTDEF)Tdbp->GetDef();
-      LPCSTR  name __attribute__((unused))= Tdbp->GetName();
 
       defp->SetBlock(Block);
       defp->SetLast(Last);
diff --git a/storage/connect/filamzip.cpp b/storage/connect/filamzip.cpp
index 4e9d008b455..77a97b95107 100644
--- a/storage/connect/filamzip.cpp
+++ b/storage/connect/filamzip.cpp
@@ -1207,7 +1207,6 @@ int UZDFAM::Cardinality(PGLOBAL g)
 		return 1;
 
 	int card = -1;
-	GetFileLength(g);
 
 	card = Records;
 
diff --git a/storage/connect/filter.cpp b/storage/connect/filter.cpp
index fd6a6a68de0..9d8518ec3a5 100644
--- a/storage/connect/filter.cpp
+++ b/storage/connect/filter.cpp
@@ -856,7 +856,7 @@ PFIL FILTER::LinkFilter(PGLOBAL g, PFIL fp2)
 
 /***********************************************************************/
 /*  Checks whether filter contains reference to a previous table that  */
-/*  is not logically joined to the currently openned table, or whether */
+/*  is not logically joined to the currently opened table, or whether  */
 /*  it is a Sub-Select filter.  In any case, local is set to FALSE.    */
 /*  Note: This function is now applied to de-linearized filters.       */
 /***********************************************************************/
@@ -1220,7 +1220,8 @@ bool FILTER::Eval(PGLOBAL g)
   int     i; // n = 0;
 //PSUBQ   subp = NULL;
   PARRAY  ap = NULL;
-  PDBUSER dup __attribute__((unused)) = PlgGetUser(g);
+
+  (void) PlgGetUser(g);
 
   if (Opc <= OP_XX)
   {
diff --git a/storage/connect/ha_connect.cc b/storage/connect/ha_connect.cc
index 5785bc0f99e..ce04852324b 100644
--- a/storage/connect/ha_connect.cc
+++ b/storage/connect/ha_connect.cc
@@ -170,12 +170,12 @@
 #define JSONMAX      50             // JSON Default max grp size
 
 extern "C" {
-       char version[]= "Version 1.07.0003 June 06, 2021";
+       char version[]= "Version 1.07.0002 March 22, 2021";
 #if defined(_WIN32)
-       char compver[]= "Version 1.07.0003 " __DATE__ " "  __TIME__;
-       char slash= '\\';
+       char compver[]= "Version 1.07.0002 " __DATE__ " "  __TIME__;
+       static char slash= '\\';
 #else   // !_WIN32
-       char slash= '/';
+       static char slash= '/';
 #endif  // !_WIN32
 } // extern "C"
 
@@ -824,7 +824,6 @@ static int connect_init_func(void *p)
   init_connect_psi_keys();
 
   connect_hton= (handlerton *)p;
-  connect_hton->state= SHOW_OPTION_YES;
   connect_hton->create= connect_create_handler;
   connect_hton->flags= HTON_TEMPORARY_NOT_SUPPORTED;
   connect_hton->table_options= connect_table_option_list;
@@ -1178,7 +1177,8 @@ ulonglong ha_connect::table_flags() const
 //                   HA_NULL_IN_KEY |    not implemented yet
 //                   HA_FAST_KEY_READ |  causes error when sorting (???)
                      HA_NO_TRANSACTIONS | HA_DUPLICATE_KEY_NOT_IN_ORDER |
-                     HA_NO_BLOBS | HA_MUST_USE_TABLE_CONDITION_PUSHDOWN;
+                     HA_NO_BLOBS | HA_MUST_USE_TABLE_CONDITION_PUSHDOWN |
+                     HA_REUSES_FILE_NAMES;
   ha_connect *hp= (ha_connect*)this;
   PTOS        pos= hp->GetTableOptionStruct();
 
@@ -2273,7 +2273,7 @@ int ha_connect::MakeRecord(char *buf)
           case TYPE_DECIM:
             p= value->GetCharString(val);
             charset= tdbp->data_charset();
-            rc= fp->store(p, strlen(p), charset, CHECK_FIELD_WARN);
+            rc= fp->store_text(p, strlen(p), charset, CHECK_FIELD_WARN);
             break;
 					case TYPE_BIN:
 						p= value->GetCharValue();
@@ -5263,6 +5263,14 @@ int ha_connect::delete_or_rename_table(const char *name, const char *to)
     thd->push_internal_handler(&error_handler);
     bool got_error= open_table_def(thd, share);
     thd->pop_internal_handler();
+    if (!got_error && share->db_type() != connect_hton)
+    {
+      /* The .frm file is not for the connect engine. Something is wrong! */
+      got_error= 1;
+      rc= HA_ERR_INTERNAL_ERROR;
+      my_error(HA_ERR_INTERNAL_ERROR, MYF(0),
+               "TABLE_SHARE is not for the CONNECT engine");
+    }
     if (!got_error) {
       // Now we can work
       if ((pos= share->option_struct)) {
@@ -5275,7 +5283,8 @@ int ha_connect::delete_or_rename_table(const char *name, const char *to)
         } // endif pos
 
       } // endif open_table_def
-
+      else
+        rc= ENOENT;
     free_table_share(share);
   } else              // Temporary file
     ok= true;
@@ -5315,8 +5324,11 @@ int ha_connect::rename_table(const char *from, const char *to)
   @see
   check_quick_keys() in opt_range.cc
 */
-ha_rows ha_connect::records_in_range(uint inx, key_range *min_key,
-                                               key_range *max_key)
+ha_rows ha_connect::records_in_range(uint inx,
+                                     const key_range *min_key,
+                                     const key_range *max_key,
+                                     page_range *pages)
+
 {
   ha_rows rows;
   DBUG_ENTER("ha_connect::records_in_range");
@@ -5387,7 +5399,7 @@ static char *encode(PGLOBAL g, const char *cnm)
   char  *buf= (char*)PlugSubAlloc(g, NULL, strlen(cnm) * 3);
   uint   dummy_errors;
   uint32 len= copy_and_convert(buf, strlen(cnm) * 3,
-                               &my_charset_utf8_general_ci,
+                               &my_charset_utf8mb3_general_ci,
                                cnm, strlen(cnm),
                                &my_charset_latin1,
                                &dummy_errors);
@@ -6511,7 +6523,7 @@ int ha_connect::create(const char *name, TABLE *table_arg,
       DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
       } // endif charset
 
-    if (type == TAB_XML && data_charset != &my_charset_utf8_general_ci) {
+    if (type == TAB_XML && data_charset != &my_charset_utf8mb3_general_ci) {
       my_printf_error(ER_UNKNOWN_ERROR,
                       "DATA_CHARSET='%s' is not supported for TABLE_TYPE=XML",
                         MYF(0), options->data_charset);
@@ -7193,7 +7205,8 @@ ha_connect::check_if_supported_inplace_alter(TABLE *altered_table,
     ALTER_ADD_UNIQUE_INDEX |
     ALTER_DROP_UNIQUE_INDEX |
     ALTER_ADD_PK_INDEX |
-    ALTER_DROP_PK_INDEX;
+    ALTER_DROP_PK_INDEX |
+    ALTER_INDEX_ORDER;
 
   alter_table_operations inplace_offline_operations=
     ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE |
diff --git a/storage/connect/ha_connect.h b/storage/connect/ha_connect.h
index 218819d0b73..d1aca22b01f 100644
--- a/storage/connect/ha_connect.h
+++ b/storage/connect/ha_connect.h
@@ -146,7 +146,7 @@ typedef class ha_connect *PHC;
 /** @brief
   Class definition for the storage engine
 */
-class ha_connect: public handler
+class ha_connect final : public handler
 {
   THR_LOCK_DATA lock;      ///< MySQL lock
   CONNECT_SHARE *share;        ///< Shared lock info
@@ -463,8 +463,8 @@ int index_prev(uchar *buf);
   int start_stmt(THD *thd, thr_lock_type lock_type);
   int external_lock(THD *thd, int lock_type);                   ///< required
   int delete_all_rows(void);
-  ha_rows records_in_range(uint inx, key_range *min_key,
-                           key_range *max_key);
+  ha_rows records_in_range(uint inx, const key_range *start_key,
+                           const key_range *end_key, page_range *pages);
   /**
     These methods can be overridden, but their default implementation
     provide useful functionality.
diff --git a/storage/connect/inihandl.cpp b/storage/connect/inihandl.cpp
index de04fa91883..cd06f7fadd7 100644
--- a/storage/connect/inihandl.cpp
+++ b/storage/connect/inihandl.cpp
@@ -112,8 +112,6 @@ static PROFILE *MRUProfile[N_CACHED_PROFILES] = {NULL};
 
 //static CRITICAL_SECTION PROFILE_CritSect = CRITICAL_SECTION_INIT("PROFILE_CritSect");
 
-static const char hex[17] = "0123456789ABCDEF";
-
 BOOL  WritePrivateProfileString(LPCSTR section, LPCSTR entry,
                                 LPCSTR string, LPCSTR filename);
 
@@ -1340,7 +1338,7 @@ BOOL WritePrivateProfileSection(LPCSTR section,
  * Note that when the buffer is big enough then the return value may be any
  * value between 1 and len-1 (or len in Win95), including len-2.
  */
-#ifdef NOT_USED
+#ifdef TEST_MODULE
 static DWORD
 GetPrivateProfileSectionNames(LPSTR buffer, DWORD size,  LPCSTR filename)
 {
@@ -1357,12 +1355,11 @@ GetPrivateProfileSectionNames(LPSTR buffer, DWORD size,  LPCSTR filename)
   LeaveCriticalSection(&PROFILE_CritSect);
   return ret;
 }  // end of GetPrivateProfileSectionNames
-#endif
+
 
 /************************************************************************
  * Program to test the above
  ************************************************************************/
-#ifdef TEST_MODULE
 int main(int argc, char**argv) {
   char  buff[128];
   char *p, *inifile = "D:\\Plug\\Data\\contact.ini";
diff --git a/storage/connect/ioapi.h b/storage/connect/ioapi.h
index 4fa73002053..e2148c56bac 100644
--- a/storage/connect/ioapi.h
+++ b/storage/connect/ioapi.h
@@ -82,7 +82,7 @@
 #include "mz64conf.h"
 #endif
 
-/* a type choosen by DEFINE */
+/* a type chosen by DEFINE */
 #ifdef HAVE_64BIT_INT_CUSTOM
 typedef  64BIT_INT_CUSTOM_TYPE ZPOS64_T;
 #else
diff --git a/storage/connect/javaconn.cpp b/storage/connect/javaconn.cpp
index 364d49f8e7c..de37d5b6970 100644
--- a/storage/connect/javaconn.cpp
+++ b/storage/connect/javaconn.cpp
@@ -349,7 +349,7 @@ bool JAVAConn::GetJVM(PGLOBAL g)
 /***********************************************************************/
 bool JAVAConn::Open(PGLOBAL g)
 {
-	bool		 brc = true, err = false;
+         bool		 brc = true;
 	jboolean jt = (trace(1));
 
 	// Link or check whether jvm library was linked
diff --git a/storage/connect/jdbconn.cpp b/storage/connect/jdbconn.cpp
index 20918745316..1035c2b8305 100644
--- a/storage/connect/jdbconn.cpp
+++ b/storage/connect/jdbconn.cpp
@@ -769,7 +769,6 @@ bool JDBConn::Connect(PJPARM sop)
 	int      irc = RC_FX;
 	bool		 err = false;
 	jint     rc;
-	jboolean jt = (trace(1));
 	PGLOBAL& g = m_G;
 
 	/*******************************************************************/
@@ -939,7 +938,7 @@ int JDBConn::Rewind(PCSZ sql)
 		if (gmID(m_G, fetchid, "Fetch", "(I)Z"))
 			return -1;
 
-		jboolean b = env->CallBooleanMethod(job, fetchid, 0);
+		(void) env->CallBooleanMethod(job, fetchid, 0);
 
 		rbuf = m_Rows;
 	} else if (ExecuteCommand(sql) != RC_FX)
@@ -1191,7 +1190,7 @@ int JDBConn::ExecuteUpdate(PCSZ sql)
 /***********************************************************************/
 int JDBConn::GetResultSize(PCSZ sql, PCOL colp)
 {
-	int rc, n = 0;
+	int rc;
 
 	if ((rc = ExecuteQuery(sql)) != RC_OK)
 		return -1;
@@ -1498,7 +1497,6 @@ bool JDBConn::SetParam(JDBCCOL *colp)
 		PCSZ     fnc = "Unknown";
 		uint     n;
 		short    len, tp;
-		int      crow = 0;
 		PQRYRES  qrp = cap->Qrp;
 		PCOLRES  crp;
 		jboolean rc = false;
diff --git a/storage/connect/json.cpp b/storage/connect/json.cpp
index b9f9492320b..17c6ba9791a 100644
--- a/storage/connect/json.cpp
+++ b/storage/connect/json.cpp
@@ -1051,7 +1051,7 @@ int JOBJECT::GetSize(bool b) {
 
   for (PJPR jpp = First; jpp; jpp = jpp->Next)
     // If b return only non null pairs
-    if (!b || jpp->Val && !jpp->Val->IsNull())
+    if (!b || (jpp->Val && !jpp->Val->IsNull()))
       n++;
 
   return n;
@@ -1581,10 +1581,12 @@ PVAL JVALUE::GetValue(PGLOBAL g)
   PVAL valp = NULL;
 
   if (DataType != TYPE_JSON)
+  {
     if (DataType == TYPE_STRG)
       valp = AllocateValue(g, Strp, DataType, Nd);
     else
       valp = AllocateValue(g, &LLn, DataType, Nd);
+  }
 
   return valp;
 } // end of GetValue
@@ -1755,6 +1757,7 @@ void JVALUE::SetValue(PGLOBAL g, PVAL valp)
   case TYPE_TINY:
     B = valp->GetTinyValue() != 0;
     DataType = TYPE_BOOL;
+    break;
   case TYPE_INT:
     N = valp->GetIntValue();
     DataType = TYPE_INTG;
diff --git a/storage/connect/jsonudf.cpp b/storage/connect/jsonudf.cpp
index 18332362086..04558469762 100644
--- a/storage/connect/jsonudf.cpp
+++ b/storage/connect/jsonudf.cpp
@@ -5313,7 +5313,7 @@ char *jbin_object_delete(UDF_INIT *initid, UDF_ARGS *args, char *result,
 		PCSZ  key;
 		PJOB  jobp;
 		PJVAL jvp = MakeValue(g, args, 0, &top);
-		PJSON jsp __attribute__((unused)) = jvp->GetJson();
+		(void) jvp->GetJson();          // XXX Should be removed?
 
 		if (CheckPath(g, args, top, jvp, 2))
 			PUSH_WARNING(g->Message);
@@ -5954,7 +5954,7 @@ char *jfile_convert(UDF_INIT* initid, UDF_ARGS* args, char* result,
 		str = (char*)g->Xchk;
 
 	if (!str) {
-		PUSH_WARNING(*g->Message ? g->Message : "Unexpected error");
+		PUSH_WARNING(g->Message[0] != '\0' ? g->Message : "Unexpected error");
 		*is_null = 1;
 		*error = 1;
 		*res_length = 0;
@@ -6082,7 +6082,7 @@ char *jfile_bjson(UDF_INIT *initid, UDF_ARGS *args, char *result,
 		str = (char*)g->Xchk;
 
 	if (!str) {
-		if (*g->Message)
+		if (g->Message[0] != '\0')
 			str = strcpy(result, g->Message);
 		else
 			str = strcpy(result, "Unexpected error");
diff --git a/storage/connect/libdoc.cpp b/storage/connect/libdoc.cpp
index 61921555ad7..3b2696b066d 100644
--- a/storage/connect/libdoc.cpp
+++ b/storage/connect/libdoc.cpp
@@ -765,8 +765,8 @@ int LIBXMLDOC::Decode(xmlChar *cnt, char *buf, int n)
   {
   const char *txt = (const char *)cnt;
   uint   dummy_errors;
-  uint32 len= copy_and_convert(buf, n, &my_charset_utf8_general_ci, txt,
-                               strlen(txt), &my_charset_utf8_general_ci,
+  uint32 len= copy_and_convert(buf, n, &my_charset_utf8mb3_general_ci, txt,
+                               strlen(txt), &my_charset_utf8mb3_general_ci,
                                &dummy_errors);
   buf[len]= '\0';
   return 0;
@@ -777,8 +777,8 @@ int LIBXMLDOC::Decode(xmlChar *cnt, char *buf, int n)
 /******************************************************************/
 xmlChar *LIBXMLDOC::Encode(PGLOBAL g, char *txt)
   {
-  const CHARSET_INFO *ics= &my_charset_utf8_general_ci;
-  const CHARSET_INFO *ocs= &my_charset_utf8_general_ci;
+  const CHARSET_INFO *ics= &my_charset_utf8mb3_general_ci;
+  const CHARSET_INFO *ocs= &my_charset_utf8mb3_general_ci;
   size_t      i = strlen(txt);
   size_t      o = i * ocs->mbmaxlen / ics->mbmaxlen + 1;
   char        *buf;
diff --git a/storage/connect/maputil.cpp b/storage/connect/maputil.cpp
index b2e55e619a9..b722a438011 100644
--- a/storage/connect/maputil.cpp
+++ b/storage/connect/maputil.cpp
@@ -190,7 +190,7 @@ bool CloseMemMap(void *memory, size_t dwSize)
   {
   if (memory) {
     // All this must be redesigned
-    int rc __attribute__((unused))= msync((char*)memory, dwSize, MS_SYNC);
+    msync((char*)memory, dwSize, MS_SYNC);
     return (munmap((char*)memory, dwSize) < 0) ? true : false;
   } else
     return false;
diff --git a/storage/connect/myconn.cpp b/storage/connect/myconn.cpp
index 89514d6c3c2..e0f30a159e9 100644
--- a/storage/connect/myconn.cpp
+++ b/storage/connect/myconn.cpp
@@ -88,8 +88,8 @@ static MYSQL_RES *connect_use_result(MYSQL *mysql)
     DBUG_RETURN(NULL);
     } // endif status
 
-  if (!(result = (MYSQL_RES*) my_malloc(sizeof(*result) +
-				          sizeof(ulong) * mysql->field_count,
+  if (!(result = (MYSQL_RES*) my_malloc(PSI_NOT_INSTRUMENTED,
+                  sizeof(*result) + sizeof(ulong) * mysql->field_count,
 				          MYF(MY_WME | MY_ZEROFILL))))
     DBUG_RETURN(NULL);
 
@@ -97,8 +97,8 @@ static MYSQL_RES *connect_use_result(MYSQL *mysql)
   result->methods = mysql->methods;
 
   /* Ptrs: to one row */
-  if (!(result->row = (MYSQL_ROW)my_malloc(sizeof(result->row[0]) *
-                                (mysql->field_count+1), MYF(MY_WME)))) {
+  if (!(result->row = (MYSQL_ROW)my_malloc(PSI_NOT_INSTRUMENTED,
+                sizeof(result->row[0]) * (mysql->field_count+1), MYF(MY_WME)))) {
     my_free(result);
     DBUG_RETURN(NULL);
     }  // endif row
@@ -120,7 +120,7 @@ static MYSQL_RES *connect_use_result(MYSQL *mysql)
 /************************************************************************/
 /*  MyColumns: constructs the result blocks containing all columns      */
 /*  of a MySQL table or view.                                           */
-/*  info = TRUE to get catalog column informations.                     */
+/*  info = TRUE to get catalog column information.                      */
 /************************************************************************/
 PQRYRES MyColumns(PGLOBAL g, THD *thd, const char *host, const char *db,
                   const char *user, const char *pwd,
diff --git a/storage/connect/mysql-test/connect/r/alter_engine.result b/storage/connect/mysql-test/connect/r/alter_engine.result
new file mode 100644
index 00000000000..530574d276d
--- /dev/null
+++ b/storage/connect/mysql-test/connect/r/alter_engine.result
@@ -0,0 +1,11 @@
+#
+# MDEV-24422 Server crashes in GetTypeID / ha_connect::GetRealType upon
+# altering table engine
+#
+CREATE TABLE t1 (f INT) ENGINE=CONNECT;
+Warnings:
+Warning	1105	No table_type. Will be set to DOS
+Warning	1105	No file name. Table will use t1.dos
+ALTER TABLE t1 ENGINE InnoDB;
+ALTER TABLE t1 ENGINE CONNECT;
+DROP TABLE t1;
diff --git a/storage/connect/mysql-test/connect/r/drop-open-error.result b/storage/connect/mysql-test/connect/r/drop-open-error.result
index f0ad8553d8b..34f58a845dc 100644
--- a/storage/connect/mysql-test/connect/r/drop-open-error.result
+++ b/storage/connect/mysql-test/connect/r/drop-open-error.result
@@ -2,6 +2,8 @@ create table t1 (c varchar(8));
 create table tcon engine=connect table_type=mysql CONNECTION='mysql://root@localhost/test/t1' SRCDEF='select c from t1 where c in ("foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", 
"bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", 
"foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", 
"foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", 
"qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", 
"foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", 
"foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", 
"qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", 
"foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", 
"foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", 
"qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", 
"foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", 
"foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", 
"qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", 
"foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", 
"foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", 
"qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", 
"foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar", "foo", "bar", "qux", "foobar")';
 ERROR HY000: Too long value for 'SRCDEF'
 drop table mdev9949;
+Warnings:
+Warning	1017	Can't find file: './test/mdev9949.dos' (errno: 2 "No such file or directory")
 drop table t1;
 select @@secure_file_priv 'must be NULL';
 must be NULL
diff --git a/storage/connect/mysql-test/connect/r/mysql_index.result b/storage/connect/mysql-test/connect/r/mysql_index.result
index e81caf671a4..54acc7be08d 100644
--- a/storage/connect/mysql-test/connect/r/mysql_index.result
+++ b/storage/connect/mysql-test/connect/r/mysql_index.result
@@ -52,8 +52,8 @@ id	msg
 SELECT * FROM t2 WHERE id >= 3;
 id	msg
 3	Trois
-5	Cinq
 4	Four
+5	Cinq
 6	Six
 SELECT * FROM t2 WHERE id < 3;
 id	msg
@@ -302,18 +302,18 @@ matricule	nom	prenom
 403	HERMITTE	PHILIPPE
 7626	HENIN	PHILIPPE
 9096	HELENA	PHILIPPE
-SELECT matricule, nom, prenom FROM t2 ORDER BY nom LIMIT 10;
+SELECT matricule, nom, prenom FROM t2 ORDER BY nom,prenom LIMIT 10;
 matricule	nom	prenom
-1340	ABBE	MICHELE
-2945	ABBEVILLE	PASCAL
-307	ABBAYE	ANNICK
 4552	ABBADIE	MONIQUE
-6399	ABEILLES	RENE
+307	ABBAYE	ANNICK
 6627	ABBAYE	GERALD
 7961	ABBE	KATIA
+1340	ABBE	MICHELE
+9270	ABBE	SOPHIE
+2945	ABBEVILLE	PASCAL
 8596	ABEBERRY	PATRICK
+6399	ABEILLES	RENE
 8673	ABEL	JEAN PIERRE
-9270	ABBE	SOPHIE
 SELECT a.nom, a.prenom, b.nom FROM t1 a STRAIGHT_JOIN t2 b ON a.prenom = b.prenom WHERE a.nom = 'FOCH' AND a.nom != b.nom;
 nom	prenom	nom
 FOCH	BERNADETTE	BERTIN
diff --git a/storage/connect/mysql-test/connect/r/type_inet6.result b/storage/connect/mysql-test/connect/r/type_inet6.result
new file mode 100644
index 00000000000..495a7a515b7
--- /dev/null
+++ b/storage/connect/mysql-test/connect/r/type_inet6.result
@@ -0,0 +1,15 @@
+#
+# MDEV-21764 CONNECT table with INET6 field produces warnings upon SELECT
+#
+CREATE TABLE t1 (a INET6) ENGINE=CONNECT TABLE_TYPE=DOS;
+Warnings:
+Warning	1105	No file name. Table will use t1.dos
+INSERT INTO t1 VALUES ('::');
+INSERT INTO t1 VALUES ('::ffff');
+INSERT INTO t1 VALUES ('ffff::ffff');
+SELECT * FROM t1;
+a
+::
+::ffff
+ffff::ffff
+DROP TABLE t1;
diff --git a/storage/connect/mysql-test/connect/t/alter_engine.test b/storage/connect/mysql-test/connect/t/alter_engine.test
new file mode 100644
index 00000000000..789a0955d3b
--- /dev/null
+++ b/storage/connect/mysql-test/connect/t/alter_engine.test
@@ -0,0 +1,11 @@
+--source include/have_innodb.inc
+
+--echo #
+--echo # MDEV-24422 Server crashes in GetTypeID / ha_connect::GetRealType upon
+--echo # altering table engine
+--echo #
+
+CREATE TABLE t1 (f INT) ENGINE=CONNECT;
+ALTER TABLE t1 ENGINE InnoDB;
+ALTER TABLE t1 ENGINE CONNECT;
+DROP TABLE t1;
diff --git a/storage/connect/mysql-test/connect/t/mysql_index.test b/storage/connect/mysql-test/connect/t/mysql_index.test
index ebc4965e8e1..cb4a332cdf8 100644
--- a/storage/connect/mysql-test/connect/t/mysql_index.test
+++ b/storage/connect/mysql-test/connect/t/mysql_index.test
@@ -50,6 +50,7 @@ SELECT * FROM t2 WHERE id = 3;
 SELECT * FROM t2 WHERE id IN (2,4);
 SELECT * FROM t2 WHERE id IN (2,4) AND msg = 'Two';
 SELECT * FROM t2 WHERE id > 4;
+--sorted_result
 SELECT * FROM t2 WHERE id >= 3;
 SELECT * FROM t2 WHERE id < 3;
 SELECT * FROM t2 WHERE id < 2 OR id > 4;
@@ -130,8 +131,7 @@ SELECT matricule, nom, prenom FROM t2 WHERE nom > 'HELEN' AND nom < 'HEROS';
 SELECT matricule, nom, prenom FROM t2 WHERE nom BETWEEN 'HELEN' AND 'HEROS';
 --sorted_result
 SELECT matricule, nom, prenom FROM t2 WHERE nom BETWEEN 'HELEN' AND 'HEROS' AND prenom = 'PHILIPPE';
---sorted_result
-SELECT matricule, nom, prenom FROM t2 ORDER BY nom LIMIT 10;
+SELECT matricule, nom, prenom FROM t2 ORDER BY nom,prenom LIMIT 10;
 --sorted_result
 SELECT a.nom, a.prenom, b.nom FROM t1 a STRAIGHT_JOIN t2 b ON a.prenom = b.prenom WHERE a.nom = 'FOCH' AND a.nom != b.nom;
 
diff --git a/storage/connect/mysql-test/connect/t/type_inet6.test b/storage/connect/mysql-test/connect/t/type_inet6.test
new file mode 100644
index 00000000000..19f5c13e270
--- /dev/null
+++ b/storage/connect/mysql-test/connect/t/type_inet6.test
@@ -0,0 +1,10 @@
+--echo #
+--echo # MDEV-21764 CONNECT table with INET6 field produces warnings upon SELECT
+--echo #
+
+CREATE TABLE t1 (a INET6) ENGINE=CONNECT TABLE_TYPE=DOS;
+INSERT INTO t1 VALUES ('::');
+INSERT INTO t1 VALUES ('::ffff');
+INSERT INTO t1 VALUES ('ffff::ffff');
+SELECT * FROM t1;
+DROP TABLE t1;
diff --git a/storage/connect/odbconn.cpp b/storage/connect/odbconn.cpp
index 9175a4c9053..48003dbf3fd 100644
--- a/storage/connect/odbconn.cpp
+++ b/storage/connect/odbconn.cpp
@@ -1289,10 +1289,7 @@ bool ODBConn::DriverConnect(DWORD Options)
 #else   // !_WIN32
   HWND    hWnd = (HWND)1;
 #endif  // !_WIN32
-  PGLOBAL& g = m_G;
-  PDBUSER dup = PlgGetUser(g);
 
-//if (Options & noOdbcDialog || dup->Remote)
     wConnectOption = SQL_DRIVER_NOPROMPT;
 //else if (Options & forceOdbcDialog)
 //  wConnectOption = SQL_DRIVER_PROMPT;
@@ -1691,7 +1688,7 @@ int ODBConn::PrepareSQL(char *sql)
     b = false;
 
     if (m_hstmt) {
-      RETCODE rc = SQLFreeStmt(m_hstmt, SQL_CLOSE);
+      SQLFreeStmt(m_hstmt, SQL_CLOSE);
 
       hstmt = m_hstmt;
       m_hstmt = NULL;
@@ -1699,7 +1696,7 @@ int ODBConn::PrepareSQL(char *sql)
       if (m_Tdb->GetAmType() != TYPE_AM_XDBC)
         ThrowDBX(MSG(SEQUENCE_ERROR));
 
-      } // endif m_hstmt
+    } // endif m_hstmt
 
     rc = SQLAllocStmt(m_hdbc, &hstmt);
 
diff --git a/storage/connect/plgdbutl.cpp b/storage/connect/plgdbutl.cpp
index 3624420dfed..55c7b8110fd 100644
--- a/storage/connect/plgdbutl.cpp
+++ b/storage/connect/plgdbutl.cpp
@@ -425,12 +425,13 @@ char *ExtractFromPath(PGLOBAL g, char *pBuff, char *FileName, OPVAL op)
   return pBuff;
   } // end of PlgExtractFromPath
 
+
+#ifdef NOT_USED
 /***********************************************************************/
-/*  Check the occurence and matching of a pattern against a string.    */
+/*  Check the occurrence and matching of a pattern against a string.    */
 /*  Because this function is only used for catalog name checking,      */
 /*  it must be case insensitive.                                       */
 /***********************************************************************/
-#ifdef NOT_USED
 static bool PlugCheckPattern(PGLOBAL g, LPCSTR string, LPCSTR pat)
   {
   if (pat && strlen(pat)) {
@@ -444,7 +445,7 @@ static bool PlugCheckPattern(PGLOBAL g, LPCSTR string, LPCSTR pat)
     return true;
 
   } // end of PlugCheckPattern
-#endif
+#endif /* NOT_USED */
 
 /***********************************************************************/
 /*  PlugEvalLike: evaluates a LIKE clause.                             */
@@ -571,7 +572,7 @@ bool EvalLikePattern(LPCSTR sp, LPCSTR tp)
     b = (t || !*sp);                  /*   true if %  or void strg.    */
   else if (!t) {
     /*******************************************************************/
-    /*  No character to skip, check occurence of <subtring-specifier>  */
+    /*  No character to skip, check occurrence of <substring-specifier> */
     /*  at the very beginning of remaining string.                     */
     /*******************************************************************/
     if (p) {
@@ -585,7 +586,7 @@ bool EvalLikePattern(LPCSTR sp, LPCSTR tp)
     if (p)
       /*****************************************************************/
       /*  Here is the case explaining why we need a recursive routine. */
-      /*  The test must be done not only against the first occurence   */
+      /*  The test must be done not only against the first occurrence   */
       /*  of the <substring-specifier> in the remaining string,        */
       /*  but also with all eventual succeeding ones.                  */
       /*****************************************************************/
diff --git a/storage/connect/tabbson.cpp b/storage/connect/tabbson.cpp
index 59d2b7ed1b0..23bac92efd8 100644
--- a/storage/connect/tabbson.cpp
+++ b/storage/connect/tabbson.cpp
@@ -2017,6 +2017,7 @@ PSZ BSONCOL::GetJpath(PGLOBAL g, bool proj)
     } // endif
 
     for (p1 = p2 = mgopath; *p1; p1++)
+    {
       if (i) {                 // Inside []
         if (isdigit(*p1)) {
           if (!proj)
@@ -2055,17 +2056,18 @@ PSZ BSONCOL::GetJpath(PGLOBAL g, bool proj)
           Sgfy = true;
           break;
         } // endif p2
-
+        /* fall through */
       default:
         *p2++ = *p1;
         break;
       } // endswitch p1;
+    }
 
-      if (*(p2 - 1) == '.')
-        p2--;
+    if (*(p2 - 1) == '.')
+      p2--;
 
-      *p2 = 0;
-      return mgopath;
+    *p2 = 0;
+    return mgopath;
   } else
     return NULL;
 
diff --git a/storage/connect/tabdos.cpp b/storage/connect/tabdos.cpp
index cb6154c45e3..2e70fdfc558 100644
--- a/storage/connect/tabdos.cpp
+++ b/storage/connect/tabdos.cpp
@@ -648,8 +648,9 @@ int TDBDOS::MakeBlockValues(PGLOBAL g)
   PDOSDEF    defp = (PDOSDEF)To_Def;
   PDOSCOL    colp = NULL;
   PDBUSER    dup = PlgGetUser(g);
-  PCATLG     cat __attribute__((unused))= defp->GetCat();
 //void      *memp = cat->GetDescp();
+  (void) defp->GetCat();                        // XXX Should be removed?
+
 
   if ((nrec = defp->GetElemt()) < 2) {
     if (!To_Def->Partitioned()) {
@@ -1018,8 +1019,10 @@ bool TDBDOS::GetBlockValues(PGLOBAL g)
   FILE      *opfile;
   PCOLDEF    cdp;
   PDOSDEF    defp = (PDOSDEF)To_Def;
-  PCATLG     cat __attribute__((unused))= defp->GetCat();
-	PDBUSER    dup = PlgGetUser(g);
+  PDBUSER    dup = PlgGetUser(g);
+
+  (void) defp->GetCat();                        // XXX Should be removed?
+
 
 #if 0
   if (Mode == MODE_INSERT && Txfp->GetAmType() == TYPE_AM_DOS)
diff --git a/storage/connect/tabext.cpp b/storage/connect/tabext.cpp
index 42658425fbd..4dc0f4e86d9 100644
--- a/storage/connect/tabext.cpp
+++ b/storage/connect/tabext.cpp
@@ -280,7 +280,7 @@ int TDBEXT::Decode(PCSZ txt, char *buf, size_t n)
 	uint   dummy_errors;
 	uint32 len = copy_and_convert(buf, n, &my_charset_latin1,
 		txt, strlen(txt),
-		&my_charset_utf8_general_ci,
+		&my_charset_utf8mb3_general_ci,
 		&dummy_errors);
 	buf[len] = '\0';
 	return 0;
diff --git a/storage/connect/tabfix.cpp b/storage/connect/tabfix.cpp
index 5deb5fd0d40..60c37075f66 100644
--- a/storage/connect/tabfix.cpp
+++ b/storage/connect/tabfix.cpp
@@ -52,8 +52,6 @@
 /*  DB static variables.                                               */
 /***********************************************************************/
 extern int num_read, num_there, num_eq[2];               // Statistics
-static const longlong M2G = 0x80000000;
-static const longlong M4G = (longlong)2 * M2G;
 char BINCOL::Endian = 'H';
 
 /***********************************************************************/
diff --git a/storage/connect/tabfmt.cpp b/storage/connect/tabfmt.cpp
index 53af8d84053..49233f4f799 100644
--- a/storage/connect/tabfmt.cpp
+++ b/storage/connect/tabfmt.cpp
@@ -802,8 +802,7 @@ bool TDBCSV::OpenDB(PGLOBAL g)
             Fldtyp[i] = IsTypeNum(cdp->GetType());
             } // endif cdp
     }
-
-    } // endif Use
+  } // endif Use
 
   if (Header) {
     // Check that the Lrecl is at least equal to the header line length
@@ -1082,8 +1081,7 @@ bool TDBCSV::PrepareWriting(PGLOBAL g)
       else
         strcat(To_Line, Field[i]);
     }
-
-    } // endfor i
+  } // endfor i
 
 #if defined(_DEBUG)
   assert ((unsigned)nlen == strlen(To_Line));
@@ -1137,6 +1135,7 @@ int TDBCSV::CheckWrite(PGLOBAL g)
 
   // Check whether record is too int
   for (int i = 0; i < Fields; i++)
+  {
     if (Field[i]) {
       if (!(n = strlen(Field[i])))
         n += (Quoted > 2 ? 2 : 0);
@@ -1162,7 +1161,7 @@ int TDBCSV::CheckWrite(PGLOBAL g)
         } // endif nlen
 
       } // endif Field
-
+  }
   return nlen;
   } // end of CheckWrite
 
diff --git a/storage/connect/tabjdbc.cpp b/storage/connect/tabjdbc.cpp
index 9721c62be7d..1268bcfd1de 100644
--- a/storage/connect/tabjdbc.cpp
+++ b/storage/connect/tabjdbc.cpp
@@ -381,7 +381,7 @@ bool TDBJDBC::MakeInsert(PGLOBAL g)
 	int    len = 0;
 	uint   pos;
 	bool   b = false;
-	PTABLE tablep = To_Table;
+	// PTABLE tablep = To_Table;
 	PCOL   colp;
 
 	for (colp = Columns; colp; colp = colp->GetNext())
@@ -585,11 +585,13 @@ bool TDBJDBC::OpenDB(PGLOBAL g)
 			if (Memory < 3) {
 				// Method will depend on cursor type
 				if ((Rbuf = Query ? Jcp->Rewind(Query->GetStr()) : 0) < 0)
+                                {
 					if (Mode != MODE_READX) {
 						Jcp->Close();
 						return true;
 					} else
 						Rbuf = 0;
+                                }
 
 			} else
 				Rbuf = Qrp->Nblin;
@@ -1022,7 +1024,7 @@ JDBCCOL::JDBCCOL(JDBCCOL *col1, PTDB tdbp) : EXTCOL(col1, tdbp)
 void JDBCCOL::ReadColumn(PGLOBAL g)
 {
 	PTDBJDBC tdbp = (PTDBJDBC)To_Tdb;
-	int i = tdbp->Fpos - 1, n = tdbp->CurNum;
+	int i = tdbp->Fpos - 1;
 
 	if (tdbp->Memory == 3) {
 		// Get the value from the stored memory
@@ -1142,8 +1144,6 @@ int TDBXJDC::GetMaxSize(PGLOBAL g)
 /***********************************************************************/
 bool TDBXJDC::OpenDB(PGLOBAL g)
 {
-	bool rc = false;
-
 	if (trace(1))
 		htrc("JDBC OpenDB: tdbp=%p tdb=R%d use=%d mode=%d\n",
 		this, Tdb_No, Use, Mode);
diff --git a/storage/connect/tabjson.cpp b/storage/connect/tabjson.cpp
index 2697be2da13..4321a906cf0 100644
--- a/storage/connect/tabjson.cpp
+++ b/storage/connect/tabjson.cpp
@@ -1068,7 +1068,7 @@ bool TDBJSN::OpenDB(PGLOBAL g)
 		/*********************************************************************/
 		/*  Lrecl is Ok.                      															 */
 		/*********************************************************************/
-		size_t linelen = Lrecl;
+
     MODE   mode = Mode;
 
     // Buffer must be allocated in g->Sarea
@@ -1729,7 +1729,6 @@ PVAL JSONCOL::MakeJson(PGLOBAL g, PJSON jsp, int n)
 /***********************************************************************/
 PJVAL JSONCOL::GetRowValue(PGLOBAL g, PJSON row, int i)
 {
-  int   n = Nod - 1;
   PJVAL val = NULL;
 
   for (; i < Nod && row; i++) {
@@ -1862,7 +1861,6 @@ void JSONCOL::ReadColumn(PGLOBAL g)
 /***********************************************************************/
 PVAL JSONCOL::GetColumnValue(PGLOBAL g, PJSON row, int i)
 {
-  int   n = Nod - 1;
   PJAR  arp;
   PJVAL val = NULL;
 
diff --git a/storage/connect/tabmul.cpp b/storage/connect/tabmul.cpp
index 131b96ffbf5..d260149514d 100644
--- a/storage/connect/tabmul.cpp
+++ b/storage/connect/tabmul.cpp
@@ -671,8 +671,8 @@ TDBDIR::TDBDIR(PSZ fpat) : TDBASE((PTABDEF)NULL)
 /***********************************************************************/
 char* TDBDIR::Path(PGLOBAL g)
   {
-  PCATLG cat __attribute__((unused))= PlgGetCatalog(g);
-  PTABDEF defp = (PTABDEF)To_Def;
+    (void) PlgGetCatalog(g);                    // XXX Should be removed?
+    PTABDEF defp = (PTABDEF)To_Def;
 
 #if defined(_WIN32)
   if (!*Drive) {
@@ -711,7 +711,6 @@ int TDBDIR::GetMaxSize(PGLOBAL g)
     int n = -1;
 #if defined(_WIN32)
     int rc;
-
     // Start searching files in the target directory.
 		hSearch = FindFirstFile(Path(g), &FileData);
 
@@ -1047,8 +1046,8 @@ int TDBSDR::FindInDir(PGLOBAL g)
 
   // Start searching files in the target directory.
 #if defined(_WIN32)
-	HANDLE h;
 	int rc;
+	HANDLE h;
 
 #if defined(PATHMATCHSPEC)
 	if (!*Drive)
@@ -1176,7 +1175,7 @@ int TDBSDR::FindInDir(PGLOBAL g)
       // Look in the name sub-directory
       strcat(strcat(Direc, Entry->d_name), "/");
 
-      if ((k = FindInDir(g)) < 0)
+      if ((k= FindInDir(g)) < 0)
         return k;
       else
         n += k;
diff --git a/storage/connect/tabmysql.cpp b/storage/connect/tabmysql.cpp
index b677f862cc8..fb5d1b35bc2 100644
--- a/storage/connect/tabmysql.cpp
+++ b/storage/connect/tabmysql.cpp
@@ -246,7 +246,7 @@ bool MYSQLDEF::ParseURL(PGLOBAL g, char *url, bool b)
       // Found that if the string is:
       // user:@hostname:port/db/table
       // Then password is a null string, so set to NULL
-			if ((pwd[0] == 0))
+			if (pwd[0] == 0)
 				Password = NULL;
 			else
 				Password = pwd;
diff --git a/storage/connect/taboccur.cpp b/storage/connect/taboccur.cpp
index c3cb5be2e8d..718b8a066d1 100644
--- a/storage/connect/taboccur.cpp
+++ b/storage/connect/taboccur.cpp
@@ -294,7 +294,7 @@ TDBOCCUR::TDBOCCUR(POCCURDEF tdp) : TDBPRX(tdp)
 	Col = NULL;                        // To source column blocks array
 	Mult = PrepareColist(Colist);      // Multiplication factor
 	N = 0;									           // The current table index
-	M = 0;                             // The occurence rank
+	M = 0;                             // The occurrence rank
 	RowFlag = 0;    				           // 0: Ok, 1: Same, 2: Skip
 } // end of TDBOCCUR constructor
 
@@ -433,7 +433,7 @@ int TDBOCCUR::GetMaxSize(PGLOBAL g)
 
 /***********************************************************************/
 /*  In this sample, ROWID will be the (virtual) row number,            */
-/*  while ROWNUM will be the occurence rank in the multiple column.    */
+/*  while ROWNUM will be the occurrence rank in the multiple column.    */
 /***********************************************************************/
 int TDBOCCUR::RowNumber(PGLOBAL, bool b)
 {
diff --git a/storage/connect/taboccur.h b/storage/connect/taboccur.h
index 4538d3d71f2..13bc055cd6f 100644
--- a/storage/connect/taboccur.h
+++ b/storage/connect/taboccur.h
@@ -35,7 +35,7 @@ class OCCURDEF : public PRXDEF {          /* Logical table description */
  protected:
   // Members
 	char   *Colist;						 /* The source column list                 */
-  char   *Xcol;              /* The multiple occurence column          */
+  char   *Xcol;              /* The multiple occurrence column          */
   char   *Rcol;              /* The rank column                        */
   }; // end of OCCURDEF
 
@@ -76,12 +76,12 @@ class TDBOCCUR : public TDBPRX {
 	PCOL     *Col;									  // To source multiple columns
 	int       Mult;										// Multiplication factor
 	int       N;											// The current table index
-	int		    M;                      // The occurence rank
+	int		    M;                      // The occurrence rank
 	BYTE      RowFlag;								// 0: Ok, 1: Same, 2: Skip
   }; // end of class TDBOCCUR
 
 /***********************************************************************/
-/*  Class OCCURCOL: for the multiple occurence column.                 */
+/*  Class OCCURCOL: for the multiple occurrence column.                 */
 /***********************************************************************/
 class OCCURCOL : public COLBLK {
  public:
@@ -106,7 +106,7 @@ class OCCURCOL : public COLBLK {
   }; // end of class OCCURCOL
 
 /***********************************************************************/
-/*  Class RANKCOL: for the multiple occurence column ranking.          */
+/*  Class RANKCOL: for the multiple occurrence column ranking.          */
 /***********************************************************************/
 class RANKCOL : public COLBLK {
  public:
diff --git a/storage/connect/tabodbc.cpp b/storage/connect/tabodbc.cpp
index bede19f7344..4e8b4417a33 100644
--- a/storage/connect/tabodbc.cpp
+++ b/storage/connect/tabodbc.cpp
@@ -301,7 +301,6 @@ bool TDBODBC::MakeInsert(PGLOBAL g)
 	char  *catp = NULL, buf[NAM_LEN * 3];
 	int    len = 0;
 	bool   oom, b = false;
-	PTABLE tablep = To_Table;
 	PCOL   colp;
 
   for (colp = Columns; colp; colp = colp->GetNext())
@@ -322,9 +321,6 @@ bool TDBODBC::MakeInsert(PGLOBAL g)
 	if (catp)
 		len += strlen(catp) + 1;
 
-	//if (tablep->GetSchema())
-	//	schmp = (char*)tablep->GetSchema();
-	//else 
 	if (Schema && *Schema)
 		schmp = Schema;
 
@@ -557,15 +553,17 @@ bool TDBODBC::OpenDB(PGLOBAL g)
 
     if (Memory < 3) {
       // Method will depend on cursor type
-      if ((Rbuf = Ocp->Rewind(Query->GetStr(), (PODBCCOL)Columns)) < 0)
-				if (Mode != MODE_READX) {
-	        Ocp->Close();
-		      return true;
-				}	else
-					Rbuf = 0;
-
-    } else
+      if ((Rbuf = Ocp->Rewind(Query->GetStr(), (PODBCCOL)Columns)) < 0) {
+        if (Mode != MODE_READX) {
+          Ocp->Close();
+          return true;
+        } else {
+          Rbuf = 0;
+        }
+      }
+    } else {
       Rbuf = Qrp->Nblin;
+    }
 
     CurNum = 0;
     Fpos = 0;
@@ -1215,8 +1213,6 @@ int TDBXDBC::GetMaxSize(PGLOBAL g)
 /***********************************************************************/
 bool TDBXDBC::OpenDB(PGLOBAL g)
 {
-  bool rc = false;
-
   if (trace(1))
     htrc("ODBC OpenDB: tdbp=%p tdb=R%d use=%dmode=%d\n",
             this, Tdb_No, Use, Mode);
diff --git a/storage/connect/tabpivot.cpp b/storage/connect/tabpivot.cpp
index 1047a139f3b..5ba4b511528 100644
--- a/storage/connect/tabpivot.cpp
+++ b/storage/connect/tabpivot.cpp
@@ -187,7 +187,7 @@ PQRYRES PIVAID::MakePivotColumns(PGLOBAL g)
 		} // endif picol
 
 	  // Prepare the column list
-		for (pcrp = &Qryp->Colresp; (crp = *pcrp); )
+		for (pcrp = &Qryp->Colresp; (crp = *pcrp); ) {
 			if (SkipColumn(crp, skc)) {
 				// Ignore this column
 				*pcrp = crp->Next;
@@ -204,7 +204,7 @@ PQRYRES PIVAID::MakePivotColumns(PGLOBAL g)
 				*pcrp = crp->Next;
 			} else
 				pcrp = &crp->Next;
-
+		}
 		if (!Rblkp) {
 			strcpy(g->Message, MSG(NO_DEF_PIVOTCOL));
 			goto err;
@@ -340,7 +340,6 @@ int PIVAID::Qcompare(int *i1, int *i2)
 bool PIVOTDEF::DefineAM(PGLOBAL g, LPCSTR am, int poff)
   {
   char *p1, *p2;
-  PHC    hc __attribute__((unused))= ((MYCAT*)Cat)->GetHandler();
 
   if (PRXDEF::DefineAM(g, am, poff))
     return TRUE;
@@ -405,7 +404,7 @@ TDBPIVOT::TDBPIVOT(PPIVOTDEF tdp) : TDBPRX(tdp)
   Accept = tdp->Accept;
   Mult = -1;                // Estimated table size
   N = 0;                    // The current table index
-  M = 0;                    // The occurence rank
+  M = 0;                    // The occurrence rank
   FileStatus = 0;           // Logical End-of-File
   RowFlag = 0;              // 0: Ok, 1: Same, 2: Skip
   } // end of TDBPIVOT constructor
@@ -645,7 +644,7 @@ int TDBPIVOT::GetMaxSize(PGLOBAL g __attribute__((unused)))
 
 /***********************************************************************/
 /*  In this sample, ROWID will be the (virtual) row number,            */
-/*  while ROWNUM will be the occurence rank in the multiple column.    */
+/*  while ROWNUM will be the occurrence rank in the multiple column.    */
 /***********************************************************************/
 int TDBPIVOT::RowNumber(PGLOBAL, bool b)
   {
diff --git a/storage/connect/tabpivot.h b/storage/connect/tabpivot.h
index 6c2d53e9527..d819d55a61a 100644
--- a/storage/connect/tabpivot.h
+++ b/storage/connect/tabpivot.h
@@ -138,7 +138,7 @@ class TDBPIVOT : public TDBPRX {
   int     Mult;                   // Multiplication factor
   int     Ncol;                   // The number of generated columns
   int     N;                      // The current table index
-  int     M;                      // The occurence rank
+  int     M;                      // The occurrence rank
   int     Port;                   // MySQL port number 
   BYTE    FileStatus;             // 0: First 1: Rows 2: End-of-File
   BYTE    RowFlag;                // 0: Ok, 1: Same, 2: Skip
diff --git a/storage/connect/tabrest.h b/storage/connect/tabrest.h
index 9066a89b306..901d9102e95 100644
--- a/storage/connect/tabrest.h
+++ b/storage/connect/tabrest.h
@@ -6,9 +6,7 @@
 #pragma once
 
 #if defined(_WIN32)
-static PCSZ slash = "\\";
 #else // !_WIN32
-static PCSZ slash = "/";
 #define stricmp strcasecmp
 #endif // !_WIN32
 
diff --git a/storage/connect/tabutil.cpp b/storage/connect/tabutil.cpp
index e23ada8cde9..941073b35e6 100644
--- a/storage/connect/tabutil.cpp
+++ b/storage/connect/tabutil.cpp
@@ -536,7 +536,7 @@ int TDBPRX::GetMaxSize(PGLOBAL g)
 
 /***********************************************************************/
 /*  In this sample, ROWID will be the (virtual) row number,            */
-/*  while ROWNUM will be the occurence rank in the multiple column.    */
+/*  while ROWNUM will be the occurrence rank in the multiple column.    */
 /***********************************************************************/
 int TDBPRX::RowNumber(PGLOBAL g, bool b)
 	{
@@ -686,7 +686,7 @@ char *PRXCOL::Decode(PGLOBAL g, const char *cnm)
   uint32 len= copy_and_convert(buf, strlen(cnm) + 1,
                                &my_charset_latin1,
                                cnm, strlen(cnm),
-                               &my_charset_utf8_general_ci,
+                               &my_charset_utf8mb3_general_ci,
                                &dummy_errors);
   buf[len]= '\0';
   return buf;
diff --git a/storage/connect/tabxcl.cpp b/storage/connect/tabxcl.cpp
index 5b1cf57cebb..d354f556ca1 100644
--- a/storage/connect/tabxcl.cpp
+++ b/storage/connect/tabxcl.cpp
@@ -103,7 +103,7 @@ TDBXCL::TDBXCL(PXCLDEF tdp) : TDBPRX(tdp)
 	Xcolp = NULL;										// To the XCLCOL column
 	Mult = tdp->Mult;								// Multiplication factor
 	N = 0;													// The current table index
-	M = 0;                          // The occurence rank
+	M = 0;                          // The occurrence rank
 	RowFlag = 0;    								// 0: Ok, 1: Same, 2: Skip
 	New = TRUE;						          // TRUE for new line
 	Sep = tdp->Sep;                 // The Xcol separator
@@ -142,7 +142,7 @@ int TDBXCL::GetMaxSize(PGLOBAL g)
 
 /***********************************************************************/
 /*  For this table type, ROWID is the (virtual) row number,            */
-/*  while ROWNUM is be the occurence rank in the multiple column.      */
+/*  while ROWNUM is the occurrence rank in the multiple column.        */
 /***********************************************************************/
 int TDBXCL::RowNumber(PGLOBAL, bool b)
 	{
diff --git a/storage/connect/tabxcl.h b/storage/connect/tabxcl.h
index fde000ee709..2ae96703548 100644
--- a/storage/connect/tabxcl.h
+++ b/storage/connect/tabxcl.h
@@ -72,7 +72,7 @@ class TDBXCL : public TDBPRX {
 	PXCLCOL Xcolp;									// To the XCVCOL column
 	int     Mult;										// Multiplication factor
 	int     N;											// The current table index
-	int			M;                      // The occurence rank
+	int			M;                      // The occurrence rank
 	BYTE    RowFlag;								// 0: Ok, 1: Same, 2: Skip
 	bool    New;						        // TRUE for new line
 	char    Sep;										// The Xcol separator
diff --git a/storage/connect/tabxml.cpp b/storage/connect/tabxml.cpp
index 693450668db..0691d6380d8 100644
--- a/storage/connect/tabxml.cpp
+++ b/storage/connect/tabxml.cpp
@@ -710,7 +710,7 @@ PTDB TDBXML::Clone(PTABS t)
 /***********************************************************************/
 const CHARSET_INFO *TDBXML::data_charset()
 {
-	return &my_charset_utf8_general_ci;
+	return &my_charset_utf8mb3_general_ci;
 }	// end of data_charset
 
 /***********************************************************************/
diff --git a/storage/connect/user_connect.cc b/storage/connect/user_connect.cc
index 5268651d080..ba446a3e2f3 100644
--- a/storage/connect/user_connect.cc
+++ b/storage/connect/user_connect.cc
@@ -112,7 +112,7 @@ bool user_connect::user_init()
     if (g)
       printf("%s\n", g->Message);
 
-    g= PlugExit(g);
+    (void) PlugExit(g);
 
 		if (dup)
 	    free(dup);
diff --git a/storage/connect/xindex.cpp b/storage/connect/xindex.cpp
index f4a5b1fe1fa..6ed70f21a85 100644
--- a/storage/connect/xindex.cpp
+++ b/storage/connect/xindex.cpp
@@ -1211,7 +1211,6 @@ bool XINDEX::MapInit(PGLOBAL g)
   PCOL    colp;
   PXCOL   prev = NULL, kcp = NULL;
   PDOSDEF defp = (PDOSDEF)Tdbp->To_Def;
-  PDBUSER dup __attribute__((unused))= PlgGetUser(g);
 
   /*********************************************************************/
   /*  Get the estimated table size.                                    */
diff --git a/storage/csv/ha_tina.cc b/storage/csv/ha_tina.cc
index 6e7ee48d2eb..ec569feec9d 100644
--- a/storage/csv/ha_tina.cc
+++ b/storage/csv/ha_tina.cc
@@ -112,6 +112,11 @@ static uchar* tina_get_key(TINA_SHARE *share, size_t *length,
   return (uchar*) share->table_name;
 }
 
+static PSI_memory_key csv_key_memory_tina_share;
+static PSI_memory_key csv_key_memory_blobroot;
+static PSI_memory_key csv_key_memory_tina_set;
+static PSI_memory_key csv_key_memory_row;
+
 #ifdef HAVE_PSI_INTERFACE
 
 static PSI_mutex_key csv_key_mutex_tina, csv_key_mutex_TINA_SHARE_mutex;
@@ -132,6 +137,15 @@ static PSI_file_info all_tina_files[]=
   { &csv_key_file_update, "update", 0}
 };
 
+static PSI_memory_info all_tina_memory[]=
+{
+  { &csv_key_memory_tina_share, "TINA_SHARE", PSI_FLAG_GLOBAL},
+  { &csv_key_memory_blobroot, "blobroot", 0},
+  { &csv_key_memory_tina_set, "tina_set", 0},
+  { &csv_key_memory_row, "row", 0},
+  { &csv_key_memory_Transparent_file, "Transparent_file", 0}
+};
+
 static void init_tina_psi_keys(void)
 {
   const char* category= "csv";
@@ -142,6 +156,9 @@ static void init_tina_psi_keys(void)
 
   count= array_elements(all_tina_files);
   mysql_file_register(category, all_tina_files, count);
+
+  count= array_elements(all_tina_memory);
+  mysql_memory_register(category, all_tina_memory, count);
 }
 #endif /* HAVE_PSI_INTERFACE */
 
@@ -166,9 +183,9 @@ static int tina_init_func(void *p)
 
   tina_hton= (handlerton *)p;
   mysql_mutex_init(csv_key_mutex_tina, &tina_mutex, MY_MUTEX_INIT_FAST);
-  (void) my_hash_init(&tina_open_tables,system_charset_info,32,0,0,
-                      (my_hash_get_key) tina_get_key,0,0);
-  tina_hton->state= SHOW_OPTION_YES;
+  (void) my_hash_init(csv_key_memory_tina_share, &tina_open_tables,
+                      system_charset_info, 32, 0, 0, (my_hash_get_key)
+                      tina_get_key, 0, 0);
   tina_hton->db_type= DB_TYPE_CSV_DB;
   tina_hton->create= tina_create_handler;
   tina_hton->flags= (HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES | 
@@ -205,14 +222,11 @@ static TINA_SHARE *get_share(const char *table_name, TABLE *table)
     If share is not present in the hash, create a new share and
     initialize its members.
   */
-  if (!(share=(TINA_SHARE*) my_hash_search(&tina_open_tables,
-                                           (uchar*) table_name,
-                                           length)))
+  if (!(share=(TINA_SHARE*) my_hash_search(&tina_open_tables, (uchar*)
+                                           table_name, length)))
   {
-    if (!my_multi_malloc(MYF(MY_WME | MY_ZEROFILL),
-                         &share, sizeof(*share),
-                         &tmp_name, length+1,
-                         NullS))
+    if (!my_multi_malloc(csv_key_memory_tina_share, MYF(MY_WME | MY_ZEROFILL),
+                         &share, sizeof(*share), &tmp_name, length+1, NullS))
     {
       mysql_mutex_unlock(&tina_mutex);
       return NULL;
@@ -514,7 +528,8 @@ ha_tina::ha_tina(handlerton *hton, TABLE_SHARE *table_arg)
   buffer.set((char*)byte_buffer, IO_SIZE, &my_charset_bin);
   chain= chain_buffer;
   file_buff= new Transparent_file();
-  init_alloc_root(&blobroot, "ha_tina", BLOB_MEMROOT_ALLOC_SIZE, 0, MYF(0));
+  init_alloc_root(csv_key_memory_blobroot, &blobroot, BLOB_MEMROOT_ALLOC_SIZE,
+                  0, MYF(0));
 }
 
 
@@ -628,14 +643,15 @@ int ha_tina::chain_append()
       chain_size += DEFAULT_CHAIN_LENGTH;
       if (chain_alloced)
       {
-        /* Must cast since my_malloc unlike malloc doesn't have a void ptr */
-        if ((chain= (tina_set *) my_realloc((uchar*)chain,
-                                            chain_size, MYF(MY_WME))) == NULL)
+        if ((chain= (tina_set *) my_realloc(csv_key_memory_tina_set,
+                                            (uchar*)chain, chain_size,
+                                            MYF(MY_WME))) == NULL)
           return -1;
       }
       else
       {
-        tina_set *ptr= (tina_set *) my_malloc(chain_size * sizeof(tina_set),
+        tina_set *ptr= (tina_set *) my_malloc(csv_key_memory_tina_set,
+                                              chain_size * sizeof(tina_set),
                                               MYF(MY_WME));
         memcpy(ptr, chain, DEFAULT_CHAIN_LENGTH * sizeof(tina_set));
         chain= ptr;
@@ -828,8 +844,8 @@ int ha_tina::find_current_row(uchar *buf)
         Thus, for enums we silence the warning, as it doesn't really mean
         an invalid value.
       */
-      if ((*field)->store(buffer.ptr(), buffer.length(), buffer.charset(),
-                          is_enum ? CHECK_FIELD_IGNORE : CHECK_FIELD_WARN))
+      if ((*field)->store_text(buffer.ptr(), buffer.length(), buffer.charset(),
+                               is_enum ? CHECK_FIELD_IGNORE : CHECK_FIELD_WARN))
       {
         if (!is_enum)
           goto err;
@@ -866,10 +882,11 @@ err:
   for CSV engine. For more details see mysys/thr_lock.c
 */
 
-void tina_get_status(void* param, my_bool concurrent_insert)
+my_bool tina_get_status(void* param, my_bool concurrent_insert)
 {
   ha_tina *tina= (ha_tina*) param;
   tina->get_status();
+  return 0;                       /* unconditional; presumably 0 == success for the lock code - confirm */
 }
 
 void tina_update_status(void* param)
@@ -975,7 +992,8 @@ int ha_tina::open(const char *name, int mode, uint open_options)
   */
   thr_lock_data_init(&share->lock, &lock, (void*) this);
   ref_length= sizeof(my_off_t);
-  init_alloc_root(&blobroot, "ha_tina", BLOB_MEMROOT_ALLOC_SIZE, 0, MYF(0));
+  init_alloc_root(csv_key_memory_blobroot, &blobroot, BLOB_MEMROOT_ALLOC_SIZE,
+                  0, MYF(0));
 
   share->lock.get_status= tina_get_status;
   share->lock.update_status= tina_update_status;
@@ -1528,7 +1546,8 @@ int ha_tina::repair(THD* thd, HA_CHECK_OPT* check_opt)
   if (init_data_file())
     DBUG_RETURN(HA_ERR_CRASHED_ON_REPAIR);
 
-  if (!(buf= (uchar*) my_malloc(table->s->reclength, MYF(MY_WME))))
+  if (!(buf= (uchar*) my_malloc(csv_key_memory_row, table->s->reclength,
+                                MYF(MY_WME))))
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
   /*
     Local_saved_data_file_length is initialized during the lock phase.
@@ -1742,7 +1761,8 @@ int ha_tina::check(THD* thd, HA_CHECK_OPT* check_opt)
    if (init_data_file())
      DBUG_RETURN(HA_ERR_CRASHED);
 
-  if (!(buf= (uchar*) my_malloc(table->s->reclength, MYF(MY_WME))))
+  if (!(buf= (uchar*) my_malloc(csv_key_memory_row, table->s->reclength,
+                                MYF(MY_WME))))
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
 
   /*
diff --git a/storage/csv/ha_tina.h b/storage/csv/ha_tina.h
index aae535c271e..043183444da 100644
--- a/storage/csv/ha_tina.h
+++ b/storage/csv/ha_tina.h
@@ -57,7 +57,7 @@ struct tina_set {
   my_off_t end;
 };
 
-class ha_tina: public handler
+class ha_tina final : public handler
 {
   THR_LOCK_DATA lock;      /* MySQL lock */
   TINA_SHARE *share;       /* Shared lock info */
diff --git a/storage/csv/transparent_file.cc b/storage/csv/transparent_file.cc
index 47d63f2caaf..443e61d08d8 100644
--- a/storage/csv/transparent_file.cc
+++ b/storage/csv/transparent_file.cc
@@ -23,9 +23,12 @@
 #include "transparent_file.h"
 #include "my_sys.h"          // MY_WME, MY_ALLOW_ZERO_PTR, MY_SEEK_SET
 
+PSI_memory_key csv_key_memory_Transparent_file;
+
 Transparent_file::Transparent_file() : lower_bound(0), buff_size(IO_SIZE)
 { 
-  buff= (uchar *) my_malloc(buff_size*sizeof(uchar),  MYF(MY_WME)); 
+  buff= (uchar *) my_malloc(csv_key_memory_Transparent_file,
+                            buff_size*sizeof(uchar),  MYF(MY_WME));
 }
 
 Transparent_file::~Transparent_file()
diff --git a/storage/csv/transparent_file.h b/storage/csv/transparent_file.h
index 024fc0e3a38..545643d0f17 100644
--- a/storage/csv/transparent_file.h
+++ b/storage/csv/transparent_file.h
@@ -18,6 +18,7 @@
 #include <sys/stat.h>
 #include <my_dir.h>
 
+extern PSI_memory_key csv_key_memory_Transparent_file;
 
 class Transparent_file
 {
diff --git a/storage/example/ha_example.cc b/storage/example/ha_example.cc
index fd7df7ee06f..8f2015070be 100644
--- a/storage/example/ha_example.cc
+++ b/storage/example/ha_example.cc
@@ -215,6 +215,8 @@ static void init_example_psi_keys()
   count= array_elements(all_example_mutexes);
   mysql_mutex_register(category, all_example_mutexes, count);
 }
+#else
+static void init_example_psi_keys() { }
 #endif
 
 
@@ -252,17 +254,15 @@ static int example_init_func(void *p)
 {
   DBUG_ENTER("example_init_func");
 
-#ifdef HAVE_PSI_INTERFACE
   init_example_psi_keys();
-#endif
 
   example_hton= (handlerton *)p;
-  example_hton->state=   SHOW_OPTION_YES;
   example_hton->create=  example_create_handler;
   example_hton->flags=   HTON_CAN_RECREATE;
   example_hton->table_options= example_table_option_list;
   example_hton->field_options= example_field_option_list;
   example_hton->tablefile_extensions= ha_example_exts;
+  example_hton->drop_table= [](handlerton *, const char*) { return -1; };
 
   DBUG_RETURN(0);
 }
@@ -840,6 +840,10 @@ int ha_example::delete_table(const char *name)
   @brief
   Given a starting key and an ending key, estimate the number of rows that
   will exist between the two keys.
+  The handler can also optionally update the 'pages' parameter with the page
+  numbers that contain the min and max keys. This helps the optimizer
+  determine whether two ranges are partly on the same pages and whether the
+  min and max keys are on the same page.
 
   @details
   end_key may be empty, in which case determine if start_key matches any rows.
@@ -849,8 +853,10 @@ int ha_example::delete_table(const char *name)
   @see
   check_quick_keys() in opt_range.cc
 */
-ha_rows ha_example::records_in_range(uint inx, key_range *min_key,
-                                     key_range *max_key)
+ha_rows ha_example::records_in_range(uint inx,
+                                     const key_range *min_key,
+                                     const key_range *max_key,
+                                     page_range *pages)
 {
   DBUG_ENTER("ha_example::records_in_range");
   DBUG_RETURN(10);                         // low number to force index usage
@@ -1093,23 +1099,6 @@ static struct st_mysql_show_var func_status[]=
 struct st_mysql_daemon unusable_example=
 { MYSQL_DAEMON_INTERFACE_VERSION };
 
-mysql_declare_plugin(example)
-{
-  MYSQL_STORAGE_ENGINE_PLUGIN,
-  &example_storage_engine,
-  "EXAMPLE",
-  "Brian Aker, MySQL AB",
-  "Example storage engine",
-  PLUGIN_LICENSE_GPL,
-  example_init_func,                            /* Plugin Init */
-  NULL,                                         /* Plugin Deinit */
-  0x0001 /* 0.1 */,
-  func_status,                                  /* status variables */
-  example_system_variables,                     /* system variables */
-  NULL,                                         /* config options */
-  0,                                            /* flags */
-}
-mysql_declare_plugin_end;
 maria_declare_plugin(example)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
diff --git a/storage/example/ha_example.h b/storage/example/ha_example.h
index 0a08e871461..2d3fa6d4216 100644
--- a/storage/example/ha_example.h
+++ b/storage/example/ha_example.h
@@ -242,8 +242,8 @@ public:
   int extra(enum ha_extra_function operation);
   int external_lock(THD *thd, int lock_type);                   ///< required
   int delete_all_rows(void);
-  ha_rows records_in_range(uint inx, key_range *min_key,
-                           key_range *max_key);
+  ha_rows records_in_range(uint inx, const key_range *min_key,
+                           const key_range *max_key, page_range *pages);
   int delete_table(const char *from);
   int create(const char *name, TABLE *form,
              HA_CREATE_INFO *create_info);                      ///< required
diff --git a/storage/federated/ha_federated.cc b/storage/federated/ha_federated.cc
index 00407730c31..64a80808d6f 100644
--- a/storage/federated/ha_federated.cc
+++ b/storage/federated/ha_federated.cc
@@ -1,5 +1,5 @@
 /* Copyright (c) 2004, 2015, Oracle and/or its affiliates.
-   Copyright (c) 2017, MariaDB Corporation.
+   Copyright (c) 2017, 2020, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -480,11 +480,11 @@ int federated_db_init(void *p)
 #endif /* HAVE_PSI_INTERFACE */
 
   handlerton *federated_hton= (handlerton *)p;
-  federated_hton->state= SHOW_OPTION_YES;
   federated_hton->db_type= DB_TYPE_FEDERATED_DB;
   federated_hton->commit= federated_commit;
   federated_hton->rollback= federated_rollback;
   federated_hton->create= federated_create_handler;
+  federated_hton->drop_table= [](handlerton *, const char*) { return -1; };
   federated_hton->flags= HTON_ALTER_NOT_SUPPORTED | HTON_NO_PARTITION;
 
   /*
@@ -497,8 +497,8 @@ int federated_db_init(void *p)
   if (mysql_mutex_init(fe_key_mutex_federated,
                        &federated_mutex, MY_MUTEX_INIT_FAST))
     goto error;
-  if (!my_hash_init(&federated_open_tables, &my_charset_bin, 32, 0, 0,
-                    (my_hash_get_key) federated_get_key, 0, 0))
+  if (!my_hash_init(PSI_INSTRUMENT_ME, &federated_open_tables, &my_charset_bin,
+                    32, 0, 0, (my_hash_get_key) federated_get_key, 0, 0))
   {
     DBUG_RETURN(FALSE);
   }
@@ -561,7 +561,7 @@ static bool append_ident(String *string, const char *name, size_t length,
     for (name_end= name+length; name < name_end; name+= clen)
     {
       uchar c= *(uchar *) name;
-      clen= my_charlen_fix(system_charset_info, name, name_end);
+      clen= system_charset_info->charlen_fix(name, name_end);
       if (clen == 1 && c == (uchar) quote_char &&
           (result= string->append(&quote_char, 1, system_charset_info)))
         goto err;
@@ -960,7 +960,7 @@ uint ha_federated::convert_row_to_internal_format(uchar *record,
       if (bitmap_is_set(table->read_set, (*field)->field_index))
       {
         (*field)->set_notnull();
-        (*field)->store(*row, *lengths, &my_charset_bin);
+        (*field)->store_text(*row, *lengths, &my_charset_bin);
       }
     }
     (*field)->move_field_offset(-old_ptr);
@@ -1516,7 +1516,7 @@ static FEDERATED_SHARE *get_share(const char *table_name, TABLE *table)
   */
   query.length(0);
 
-  init_alloc_root(&mem_root, "federated_share", 256, 0, MYF(0));
+  init_alloc_root(PSI_INSTRUMENT_ME, &mem_root, 256, 0, MYF(0));
 
   mysql_mutex_lock(&federated_mutex);
 
@@ -1603,8 +1603,10 @@ static int free_share(FEDERATED_SHARE *share)
 }
 
 
-ha_rows ha_federated::records_in_range(uint inx, key_range *start_key,
-                                       key_range *end_key)
+ha_rows ha_federated::records_in_range(uint inx,
+                                       const key_range *start_key,
+                                       const key_range *end_key,
+                                       page_range *pages)
 {
   /*
 
@@ -1642,7 +1644,7 @@ int ha_federated::open(const char *name, int mode, uint test_if_locked)
   ref_length= sizeof(MYSQL_RES *) + sizeof(MYSQL_ROW_OFFSET);
   DBUG_PRINT("info", ("ref_length: %u", ref_length));
 
-  my_init_dynamic_array(&results, sizeof(MYSQL_RES *), 4, 4, MYF(0));
+  my_init_dynamic_array(PSI_INSTRUMENT_ME, &results, sizeof(MYSQL_RES *), 4, 4, MYF(0));
   reset();
 
   DBUG_RETURN(0);
@@ -3386,30 +3388,13 @@ int ha_federated::execute_simple_query(const char *query, int len)
 struct st_mysql_storage_engine federated_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
-mysql_declare_plugin(federated)
-{
-  MYSQL_STORAGE_ENGINE_PLUGIN,
-  &federated_storage_engine,
-  "FEDERATED",
-  "Patrick Galbraith and Brian Aker, MySQL AB",
-  "Federated MySQL storage engine",
-  PLUGIN_LICENSE_GPL,
-  federated_db_init, /* Plugin Init */
-  federated_done, /* Plugin Deinit */
-  0x0100 /* 1.0 */,
-  NULL,                       /* status variables                */
-  NULL,                       /* system variables                */
-  NULL,                       /* config options                  */
-  0,                          /* flags                           */
-}
-mysql_declare_plugin_end;
 maria_declare_plugin(federated)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
   &federated_storage_engine,
   "FEDERATED",
   "Patrick Galbraith and Brian Aker, MySQL AB",
-  "Allows to access tables on other MariaDB servers",
+  "Allows accessing tables on other MariaDB servers",
   PLUGIN_LICENSE_GPL,
   federated_db_init, /* Plugin Init */
   federated_done, /* Plugin Deinit */
diff --git a/storage/federated/ha_federated.h b/storage/federated/ha_federated.h
index 8d6c28556c4..0c6285f3ac6 100644
--- a/storage/federated/ha_federated.h
+++ b/storage/federated/ha_federated.h
@@ -257,13 +257,18 @@ public:
   void update_auto_increment(void);
   int repair(THD* thd, HA_CHECK_OPT* check_opt);
   int optimize(THD* thd, HA_CHECK_OPT* check_opt);
-
+  int delete_table(const char *name)
+  {
+    return 0;                     /* does no work; 'name' is unused and 0 is always returned */
+  }
   int delete_all_rows(void);
   int truncate();
   int create(const char *name, TABLE *form,
              HA_CREATE_INFO *create_info);                      //required
-  ha_rows records_in_range(uint inx, key_range *start_key,
-                                   key_range *end_key);
+  ha_rows records_in_range(uint inx,
+                           const key_range *start_key,
+                           const key_range *end_key,
+                           page_range *pages);
   uint8 table_cache_type() { return HA_CACHE_TBL_NOCACHE; }
 
   THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
diff --git a/storage/federatedx/federatedx_io_mysql.cc b/storage/federatedx/federatedx_io_mysql.cc
index f33cf45a241..d420de738b9 100644
--- a/storage/federatedx/federatedx_io_mysql.cc
+++ b/storage/federatedx/federatedx_io_mysql.cc
@@ -140,7 +140,7 @@ federatedx_io_mysql::federatedx_io_mysql(FEDERATEDX_SERVER *aserver)
   bzero(&mysql, sizeof(MYSQL));
   bzero(&savepoints, sizeof(DYNAMIC_ARRAY));
 
-  my_init_dynamic_array(&savepoints, sizeof(SAVEPT), 16, 16, MYF(0));
+  my_init_dynamic_array(PSI_INSTRUMENT_ME, &savepoints, sizeof(SAVEPT), 16, 16, MYF(0));
   
   DBUG_VOID_RETURN;
 }
diff --git a/storage/federatedx/federatedx_pushdown.cc b/storage/federatedx/federatedx_pushdown.cc
index 15b0b0d3d4e..664f0570238 100644
--- a/storage/federatedx/federatedx_pushdown.cc
+++ b/storage/federatedx/federatedx_pushdown.cc
@@ -1,5 +1,5 @@
 /*
-   Copyright (c) 2019 MariaDB
+   Copyright (c) 2019, 2020, MariaDB
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -300,5 +300,3 @@ void ha_federatedx_select_handler::print_error(int error, myf error_flag)
 {
   select_handler::print_error(error, error_flag);
 }
-
-
diff --git a/storage/federatedx/ha_federatedx.cc b/storage/federatedx/ha_federatedx.cc
index aa29c726d16..3d695391439 100644
--- a/storage/federatedx/ha_federatedx.cc
+++ b/storage/federatedx/ha_federatedx.cc
@@ -1,5 +1,6 @@
 /*
 Copyright (c) 2008-2009, Patrick Galbraith & Antony Curtis
+Copyright (c) 2020, 2022, MariaDB Corporation.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -426,7 +427,6 @@ int federatedx_db_init(void *p)
   DBUG_ENTER("federatedx_db_init");
   init_federated_psi_keys();
   federatedx_hton= (handlerton *)p;
-  federatedx_hton->state= SHOW_OPTION_YES;
   /* Needed to work with old .frm files */
   federatedx_hton->db_type= DB_TYPE_FEDERATED_DB;
   federatedx_hton->savepoint_offset= sizeof(ulong);
@@ -438,6 +438,7 @@ int federatedx_db_init(void *p)
   federatedx_hton->rollback= ha_federatedx::rollback;
   federatedx_hton->discover_table_structure= ha_federatedx::discover_assisted;
   federatedx_hton->create= federatedx_create_handler;
+  federatedx_hton->drop_table= [](handlerton *, const char*) { return -1; };
   federatedx_hton->flags= HTON_ALTER_NOT_SUPPORTED;
   federatedx_hton->create_derived= create_federatedx_derived_handler;
   federatedx_hton->create_select= create_federatedx_select_handler;
@@ -445,9 +446,9 @@ int federatedx_db_init(void *p)
   if (mysql_mutex_init(fe_key_mutex_federatedx,
                        &federatedx_mutex, MY_MUTEX_INIT_FAST))
     goto error;
-  if (!my_hash_init(&federatedx_open_tables, &my_charset_bin, 32, 0, 0,
+  if (!my_hash_init(PSI_INSTRUMENT_ME, &federatedx_open_tables, &my_charset_bin, 32, 0, 0,
                  (my_hash_get_key) federatedx_share_get_key, 0, 0) &&
-      !my_hash_init(&federatedx_open_servers, &my_charset_bin, 32, 0, 0,
+      !my_hash_init(PSI_INSTRUMENT_ME, &federatedx_open_servers, &my_charset_bin, 32, 0, 0,
                  (my_hash_get_key) federatedx_server_get_key, 0, 0))
   {
     DBUG_RETURN(FALSE);
@@ -511,7 +512,7 @@ bool append_ident(String *string, const char *name, size_t length,
     for (name_end= name+length; name < name_end; name+= clen)
     {
       uchar c= *(uchar *) name;
-      clen= my_charlen_fix(system_charset_info, name, name_end);
+      clen= system_charset_info->charlen_fix(name, name_end);
       if (clen == 1 && c == (uchar) quote_char &&
           (result= string->append(&quote_char, 1, system_charset_info)))
         goto err;
@@ -894,7 +895,8 @@ uint ha_federatedx::convert_row_to_internal_format(uchar *record,
       if (bitmap_is_set(table->read_set, (*field)->field_index))
       {
         (*field)->set_notnull();
-        (*field)->store(io->get_column_data(row, column), lengths[column], &my_charset_bin);
+        (*field)->store_text(io->get_column_data(row, column), lengths[column],
+                             &my_charset_bin);
       }
     }
     (*field)->move_field_offset(-old_ptr);
@@ -1538,7 +1540,7 @@ static FEDERATEDX_SERVER *get_server(FEDERATEDX_SHARE *share, TABLE *table)
 
   mysql_mutex_assert_owner(&federatedx_mutex);
 
-  init_alloc_root(&mem_root, "federated", 4096, 4096, MYF(0));
+  init_alloc_root(PSI_INSTRUMENT_ME, &mem_root, 4096, 4096, MYF(0));
 
   fill_server(&mem_root, &tmp_server, share, table ? table->s->table_charset : 0);
 
@@ -1596,7 +1598,7 @@ static FEDERATEDX_SHARE *get_share(const char *table_name, TABLE *table)
   query.length(0);
 
   bzero(&tmp_share, sizeof(tmp_share));
-  init_alloc_root(&mem_root, "federated", 256, 0, MYF(0));
+  init_alloc_root(PSI_INSTRUMENT_ME, &mem_root, 256, 0, MYF(0));
 
   mysql_mutex_lock(&federatedx_mutex);
 
@@ -1729,8 +1731,10 @@ static void free_share(federatedx_txn *txn, FEDERATEDX_SHARE *share)
 }
 
 
-ha_rows ha_federatedx::records_in_range(uint inx, key_range *start_key,
-                                       key_range *end_key)
+ha_rows ha_federatedx::records_in_range(uint inx,
+                                        const key_range *start_key,
+                                        const key_range *end_key,
+                                        page_range *pages)
 {
   /*
 
@@ -1745,10 +1749,13 @@ ha_rows ha_federatedx::records_in_range(uint inx, key_range *start_key,
 
 federatedx_txn *ha_federatedx::get_txn(THD *thd, bool no_create)
 {
-  federatedx_txn **txnp= (federatedx_txn **) thd_ha_data(thd, ht);
-  if (!*txnp && !no_create)
-    *txnp= new federatedx_txn();
-  return *txnp;
+  federatedx_txn *txn= (federatedx_txn *) thd_get_ha_data(thd, federatedx_hton);
+  if (!txn && !no_create)         /* nothing found and the caller permits creation */
+  {
+    txn= new federatedx_txn();
+    thd_set_ha_data(thd, federatedx_hton, txn);  /* presumably makes later thd_get_ha_data() calls return it - confirm */
+  }
+  return txn;                     /* still NULL when nothing was found and no_create is set */
 }
 
 
@@ -1756,7 +1763,6 @@ int ha_federatedx::disconnect(handlerton *hton, MYSQL_THD thd)
 {
   federatedx_txn *txn= (federatedx_txn *) thd_get_ha_data(thd, hton);
   delete txn;
-  *((federatedx_txn **) thd_ha_data(thd, hton))= 0;
   return 0;
 }
 
@@ -1798,7 +1804,7 @@ int ha_federatedx::open(const char *name, int mode, uint test_if_locked)
 
   DBUG_PRINT("info", ("ref_length: %u", ref_length));
 
-  my_init_dynamic_array(&results, sizeof(FEDERATEDX_IO_RESULT*), 4, 4, MYF(0));
+  my_init_dynamic_array(PSI_INSTRUMENT_ME, &results, sizeof(FEDERATEDX_IO_RESULT*), 4, 4, MYF(0));
 
   reset();
 
@@ -3490,7 +3496,7 @@ int ha_federatedx::start_stmt(MYSQL_THD thd, thr_lock_type lock_type)
   if (!txn->in_transaction())
   {
     txn->stmt_begin();
-    trans_register_ha(thd, FALSE, ht);
+    trans_register_ha(thd, FALSE, ht, 0);
   }
   DBUG_RETURN(0);
 }
@@ -3513,12 +3519,12 @@ int ha_federatedx::external_lock(MYSQL_THD thd, int lock_type)
       if (!thd_test_options(thd, (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
       {
         txn->stmt_begin();
-        trans_register_ha(thd, FALSE, ht);
+        trans_register_ha(thd, FALSE, ht, 0);
       }
       else
       {
         txn->txn_begin();
-        trans_register_ha(thd, TRUE, ht);
+        trans_register_ha(thd, TRUE, ht, 0);
       }
     }
   }
@@ -3536,7 +3542,7 @@ int ha_federatedx::savepoint_set(handlerton *hton, MYSQL_THD thd, void *sv)
   if (txn && txn->has_connections())
   {
     if (txn->txn_begin())
-      trans_register_ha(thd, TRUE, hton);
+      trans_register_ha(thd, TRUE, hton, 0);
     
     txn->sp_acquire((ulong *) sv);
 
@@ -3711,7 +3717,7 @@ maria_declare_plugin(federatedx)
   &federatedx_storage_engine,
   "FEDERATED",
   "Patrick Galbraith",
-  "Allows to access tables on other MariaDB servers, supports transactions and more",
+  "Allows one to access tables on other MariaDB servers, supports transactions and more",
   PLUGIN_LICENSE_GPL,
   federatedx_db_init, /* Plugin Init */
   federatedx_done, /* Plugin Deinit */
@@ -3722,4 +3728,3 @@ maria_declare_plugin(federatedx)
   MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
 }
 maria_declare_plugin_end;
-
diff --git a/storage/federatedx/ha_federatedx.h b/storage/federatedx/ha_federatedx.h
index c9d80dd8282..374a623a975 100644
--- a/storage/federatedx/ha_federatedx.h
+++ b/storage/federatedx/ha_federatedx.h
@@ -262,7 +262,7 @@ public:
 /*
   Class definition for the storage engine
 */
-class ha_federatedx: public handler
+class ha_federatedx final : public handler
 {
   friend int federatedx_db_init(void *p);
 
@@ -447,12 +447,15 @@ public:
   void update_auto_increment(void);
   int repair(THD* thd, HA_CHECK_OPT* check_opt);
   int optimize(THD* thd, HA_CHECK_OPT* check_opt);
-
+  int delete_table(const char *name)
+  {
+    return 0;                     /* does no work; 'name' is unused and 0 is always returned */
+  }
   int delete_all_rows(void);
   int create(const char *name, TABLE *form,
              HA_CREATE_INFO *create_info);                      //required
-  ha_rows records_in_range(uint inx, key_range *start_key,
-                                   key_range *end_key);
+  ha_rows records_in_range(uint inx, const key_range *start_key,
+                           const key_range *end_key, page_range *pages);
   uint8 table_cache_type() { return HA_CACHE_TBL_NOCACHE; }
 
   THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
diff --git a/storage/heap/CMakeLists.txt b/storage/heap/CMakeLists.txt
index f3d10e1f186..a26124d0c1c 100644
--- a/storage/heap/CMakeLists.txt
+++ b/storage/heap/CMakeLists.txt
@@ -21,6 +21,11 @@ SET(HEAP_SOURCES  _check.c _rectest.c hp_block.c hp_clear.c hp_close.c hp_create
 
 MYSQL_ADD_PLUGIN(heap ${HEAP_SOURCES} STORAGE_ENGINE MANDATORY RECOMPILE_FOR_EMBEDDED)
 
+IF(CMAKE_SYSTEM_NAME MATCHES AIX AND CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+  # Workaround linker bug on AIX
+  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-berok")
+ENDIF()
+
 IF(WITH_UNIT_TESTS)
   ADD_EXECUTABLE(hp_test1 hp_test1.c)
   TARGET_LINK_LIBRARIES(hp_test1 heap mysys dbug strings)
diff --git a/storage/heap/ChangeLog b/storage/heap/ChangeLog
index 9d3ced84cc9..b6bd0e435bc 100644
--- a/storage/heap/ChangeLog
+++ b/storage/heap/ChangeLog
@@ -1,7 +1,7 @@
 Sun Sep  6 10:56:59 1992  Michael Widenius  (monty@bitch)
 
 	* Added functions for first,next,last,prev and clear of database-heap
-	* Added optional index to rsame for compability.
+	* Added optional index to rsame for compatibility.
 
 Fri Aug 14 14:34:35 1992  Michael Widenius  (monty@bitch)
 
diff --git a/storage/heap/ha_heap.cc b/storage/heap/ha_heap.cc
index 5107abdfdd2..5f7f0c1efa0 100644
--- a/storage/heap/ha_heap.cc
+++ b/storage/heap/ha_heap.cc
@@ -24,31 +24,35 @@
 #include "sql_priv.h"
 #include "sql_plugin.h"
 #include "ha_heap.h"
-#include "sql_base.h"                    // enum_tdc_remove_table_type
+#include "sql_base.h"
 
 static handler *heap_create_handler(handlerton *, TABLE_SHARE *, MEM_ROOT *);
 static int heap_prepare_hp_create_info(TABLE *, bool, HP_CREATE_INFO *);
 
 
-int heap_panic(handlerton *hton, ha_panic_function flag)
+static int heap_panic(handlerton *hton, ha_panic_function flag)
 {
   return hp_panic(flag);
 }
 
 
+static int heap_drop_table(handlerton *hton, const char *path)
+{
+  int error= heap_delete_table(path);  /* 'hton' is not used in this body */
+  return error == ENOENT ? -1 : error; /* ENOENT is reported as -1; any other code passes through */
+}
+
 int heap_init(void *p)
 {
   handlerton *heap_hton;
 
-#ifdef HAVE_PSI_INTERFACE
   init_heap_psi_keys();
-#endif
 
   heap_hton= (handlerton *)p;
-  heap_hton->state=      SHOW_OPTION_YES;
   heap_hton->db_type=    DB_TYPE_HEAP;
   heap_hton->create=     heap_create_handler;
   heap_hton->panic=      heap_panic;
+  heap_hton->drop_table= heap_drop_table;
   heap_hton->flags=      HTON_CAN_RECREATE;
 
   return 0;
@@ -149,7 +153,7 @@ int ha_heap::close(void)
 
 handler *ha_heap::clone(const char *name, MEM_ROOT *mem_root)
 {
-  handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
+  handler *new_handler= get_new_handler(table->s, mem_root, ht);
   if (new_handler && !new_handler->ha_open(table, file->s->name, table->db_stat,
                                            HA_OPEN_IGNORE_IF_LOCKED))
     return new_handler;
@@ -555,8 +559,7 @@ THR_LOCK_DATA **ha_heap::store_lock(THD *thd,
 
 int ha_heap::delete_table(const char *name)
 {
-  int error= heap_delete_table(name);
-  return error == ENOENT ? 0 : error;
+  return heap_drop_table(0, name); /* NOTE(review): ENOENT now comes back as -1 here, where it used to be 0 */
 }
 
 
@@ -573,8 +576,8 @@ int ha_heap::rename_table(const char * from, const char * to)
 }
 
 
-ha_rows ha_heap::records_in_range(uint inx, key_range *min_key,
-                                  key_range *max_key)
+ha_rows ha_heap::records_in_range(uint inx, const key_range *min_key,
+                                  const key_range *max_key, page_range *pages)
 {
   KEY *key=table->key_info+inx;
   if (key->algorithm == HA_KEY_ALG_BTREE)
@@ -612,7 +615,8 @@ static int heap_prepare_hp_create_info(TABLE *table_arg, bool internal_table,
   for (key= parts= 0; key < keys; key++)
     parts+= table_arg->key_info[key].user_defined_key_parts;
 
-  if (!my_multi_malloc(MYF(MY_WME | MY_THREAD_SPECIFIC),
+  if (!my_multi_malloc(hp_key_memory_HP_KEYDEF,
+                       MYF(MY_WME | MY_THREAD_SPECIFIC),
                        &keydef, keys * sizeof(HP_KEYDEF),
                        &seg, parts * sizeof(HA_KEYSEG),
                        NULL))
@@ -827,23 +831,6 @@ int ha_heap::find_unique_row(uchar *record, uint unique_idx)
 struct st_mysql_storage_engine heap_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
-mysql_declare_plugin(heap)
-{
-  MYSQL_STORAGE_ENGINE_PLUGIN,
-  &heap_storage_engine,
-  "MEMORY",
-  "MySQL AB",
-  "Hash based, stored in memory, useful for temporary tables",
-  PLUGIN_LICENSE_GPL,
-  heap_init,
-  NULL,
-  0x0100, /* 1.0 */
-  NULL,                       /* status variables                */
-  NULL,                       /* system variables                */
-  NULL,                       /* config options                  */
-  0,                          /* flags                           */
-}
-mysql_declare_plugin_end;
 maria_declare_plugin(heap)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
diff --git a/storage/heap/ha_heap.h b/storage/heap/ha_heap.h
index 370906bd1f6..3a41028c719 100644
--- a/storage/heap/ha_heap.h
+++ b/storage/heap/ha_heap.h
@@ -25,7 +25,7 @@
 #include <heap.h>
 #include "sql_class.h"                          /* THD */
 
-class ha_heap: public handler
+class ha_heap final : public handler
 {
   HP_INFO *file;
   HP_SHARE *internal_share;
@@ -65,9 +65,11 @@ public:
   double scan_time()
   { return (double) (stats.records+stats.deleted) / 20.0+10; }
   double read_time(uint index, uint ranges, ha_rows rows)
-  { return (double) rows / 20.0+1; }
+  { return (double) (rows +1)/ 20.0; }
   double keyread_time(uint index, uint ranges, ha_rows rows)
-  { return (double) rows / 20.0+1; }
+  { return (double) (rows + ranges) / 20.0 ; }
+  double avg_io_cost()
+  { return 0.05; }                              /* 1/20 */
   int open(const char *name, int mode, uint test_if_locked);
   int close(void);
   void set_keys_for_scanning(void);
@@ -102,7 +104,8 @@ public:
   int disable_indexes(uint mode);
   int enable_indexes(uint mode);
   int indexes_are_disabled(void);
-  ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key);
+  ha_rows records_in_range(uint inx, const key_range *start_key,
+                           const key_range *end_key, page_range *pages);
   int delete_table(const char *from);
   void drop_table(const char *name);
   int rename_table(const char * from, const char * to);
diff --git a/storage/heap/heapdef.h b/storage/heap/heapdef.h
index b3ceb617bd4..ffd5382b6f7 100644
--- a/storage/heap/heapdef.h
+++ b/storage/heap/heapdef.h
@@ -103,9 +103,15 @@ extern uint hp_rb_pack_key(HP_KEYDEF *keydef, uchar *key, const uchar *old,
 
 extern mysql_mutex_t THR_LOCK_heap;
 
+extern PSI_memory_key hp_key_memory_HP_SHARE;
+extern PSI_memory_key hp_key_memory_HP_INFO;
+extern PSI_memory_key hp_key_memory_HP_PTRS;
+extern PSI_memory_key hp_key_memory_HP_KEYDEF;
+
 #ifdef HAVE_PSI_INTERFACE
-extern PSI_mutex_key hp_key_mutex_HP_SHARE_intern_lock;
 void init_heap_psi_keys();
+#else
+#define init_heap_psi_keys() do { } while(0)
 #endif /* HAVE_PSI_INTERFACE */
 
 C_MODE_END
diff --git a/storage/heap/hp_block.c b/storage/heap/hp_block.c
index 395637db351..324efc8b4af 100644
--- a/storage/heap/hp_block.c
+++ b/storage/heap/hp_block.c
@@ -78,7 +78,7 @@ int hp_get_new_block(HP_SHARE *info, HP_BLOCK *block, size_t *alloc_length)
    */
   *alloc_length= (sizeof(HP_PTRS) * ((i == block->levels) ? i : i - 1) +
                   (ulonglong)block->records_in_block * block->recbuffer);
-  if (!(root=(HP_PTRS*) my_malloc(*alloc_length,
+  if (!(root=(HP_PTRS*) my_malloc(hp_key_memory_HP_PTRS, *alloc_length,
                                   MYF(MY_WME |
                                       (info->internal ?
                                        MY_THREAD_SPECIFIC : 0)))))
diff --git a/storage/heap/hp_create.c b/storage/heap/hp_create.c
index 171756071eb..935c6f8d0fd 100644
--- a/storage/heap/hp_create.c
+++ b/storage/heap/hp_create.c
@@ -1,5 +1,5 @@
 /* Copyright (c) 2000, 2018, Oracle and/or its affiliates.
-   Copyright (c) 2010, 2018, MariaDB Corporation.
+   Copyright (c) 2010, 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -93,7 +93,7 @@ int heap_create(const char *name, HP_CREATE_INFO *create_info,
 	  keyinfo->seg[j].flag|= HA_SWAP_KEY;
           break;
         case HA_KEYTYPE_VARBINARY1:
-          /* Case-insensitiveness is handled in coll->hash_sort */
+          /* Case-insensitiveness is handled in hash_sort */
           keyinfo->seg[j].type= HA_KEYTYPE_VARTEXT1;
           /* fall through */
         case HA_KEYTYPE_VARTEXT1:
@@ -110,7 +110,7 @@ int heap_create(const char *name, HP_CREATE_INFO *create_info,
           keyinfo->seg[j].bit_start= 1;
           break;
         case HA_KEYTYPE_VARBINARY2:
-          /* Case-insensitiveness is handled in coll->hash_sort */
+          /* Case-insensitiveness is handled in hash_sort */
           /* fall_through */
         case HA_KEYTYPE_VARTEXT2:
           keyinfo->flag|= HA_VAR_LENGTH_KEY;
@@ -159,7 +159,8 @@ int heap_create(const char *name, HP_CREATE_INFO *create_info,
           keyinfo->get_key_length= hp_rb_key_length;
       }
     }
-    if (!(share= (HP_SHARE*) my_malloc((uint) sizeof(HP_SHARE)+
+    if (!(share= (HP_SHARE*) my_malloc(hp_key_memory_HP_SHARE,
+                                       sizeof(HP_SHARE)+
 				       keys*sizeof(HP_KEYDEF)+
 				       key_segs*sizeof(HA_KEYSEG),
 				       MYF(MY_ZEROFILL |
@@ -222,7 +223,7 @@ int heap_create(const char *name, HP_CREATE_INFO *create_info,
     share->create_time= (long) time((time_t*) 0);
     share->internal= create_info->internal_table;
     /* Must be allocated separately for rename to work */
-    if (!(share->name= my_strdup(name,MYF(0))))
+    if (!(share->name= my_strdup(hp_key_memory_HP_SHARE, name, MYF(0))))
     {
       my_free(share);
       goto err;
@@ -231,8 +232,6 @@ int heap_create(const char *name, HP_CREATE_INFO *create_info,
     if (!create_info->internal_table)
     {
       thr_lock_init(&share->lock);
-      mysql_mutex_init(hp_key_mutex_HP_SHARE_intern_lock,
-                       &share->intern_lock, MY_MUTEX_INIT_FAST);
       share->open_list.data= (void*) share;
       heap_share_list= list_add(heap_share_list,&share->open_list);
     }
@@ -361,7 +360,6 @@ void hp_free(HP_SHARE *share)
   {
     heap_share_list= list_delete(heap_share_list, &share->open_list);
     thr_lock_delete(&share->lock);
-    mysql_mutex_destroy(&share->intern_lock);
   }
   hp_clear(share);			/* Remove blocks from memory */
   my_free(share->name);
diff --git a/storage/heap/hp_hash.c b/storage/heap/hp_hash.c
index 5c4b3b50290..a0139151730 100644
--- a/storage/heap/hp_hash.c
+++ b/storage/heap/hp_hash.c
@@ -1,5 +1,6 @@
 /*
    Copyright (c) 2000, 2010, Oracle and/or its affiliates
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -19,6 +20,14 @@
 #include "heapdef.h"
 #include <m_ctype.h>
 
+
+static inline size_t
+hp_charpos(CHARSET_INFO *cs, const uchar *b, const uchar *e, size_t num)
+{
+  return my_ci_charpos(cs, (const char*) b, (const char *) e, num);
+}
+
+
 static ulong hp_hashnr(HP_KEYDEF *keydef, const uchar *key);
 /*
   Find out how many rows there is in the given range
@@ -46,8 +55,9 @@ static ulong hp_hashnr(HP_KEYDEF *keydef, const uchar *key);
 			the range.
 */
 
-ha_rows hp_rb_records_in_range(HP_INFO *info, int inx,  key_range *min_key,
-                               key_range *max_key)
+ha_rows hp_rb_records_in_range(HP_INFO *info, int inx,
+                               const key_range *min_key,
+                               const key_range *max_key)
 {
   ha_rows start_pos, end_pos;
   HP_KEYDEF *keyinfo= info->s->keydef + inx;
@@ -242,10 +252,10 @@ static ulong hp_hashnr(HP_KEYDEF *keydef, const uchar *key)
        if (cs->mbmaxlen > 1)
        {
          size_t char_length;
-         char_length= my_charpos(cs, pos, pos + length, length/cs->mbmaxlen);
+         char_length= hp_charpos(cs, pos, pos + length, length/cs->mbmaxlen);
          set_if_smaller(length, char_length);
        }
-       cs->coll->hash_sort(cs, pos, length, &nr, &nr2);
+       my_ci_hash_sort(cs, pos, length, &nr, &nr2);
     }
     else if (seg->type == HA_KEYTYPE_VARTEXT1)  /* Any VARCHAR segments */
     {
@@ -255,12 +265,12 @@ static ulong hp_hashnr(HP_KEYDEF *keydef, const uchar *key)
        if (cs->mbmaxlen > 1)
        {
          size_t char_length;
-         char_length= my_charpos(cs, pos +pack_length,
+         char_length= hp_charpos(cs, pos +pack_length,
                                  pos +pack_length + length,
                                  seg->length/cs->mbmaxlen);
          set_if_smaller(length, char_length);
        }
-       cs->coll->hash_sort(cs, pos+pack_length, length, &nr, &nr2);
+       my_ci_hash_sort(cs, pos+pack_length, length, &nr, &nr2);
        key+= pack_length;
     }
     else
@@ -302,11 +312,11 @@ ulong hp_rec_hashnr(register HP_KEYDEF *keydef, register const uchar *rec)
       size_t char_length= seg->length;
       if (cs->mbmaxlen > 1)
       {
-        char_length= my_charpos(cs, pos, pos + char_length,
+        char_length= hp_charpos(cs, pos, pos + char_length,
                                 char_length / cs->mbmaxlen);
         set_if_smaller(char_length, seg->length); /* QQ: ok to remove? */
       }
-      cs->coll->hash_sort(cs, pos, char_length, &nr, &nr2);
+      my_ci_hash_sort(cs, pos, char_length, &nr, &nr2);
     }
     else if (seg->type == HA_KEYTYPE_VARTEXT1)  /* Any VARCHAR segments */
     {
@@ -316,14 +326,14 @@ ulong hp_rec_hashnr(register HP_KEYDEF *keydef, register const uchar *rec)
       if (cs->mbmaxlen > 1)
       {
         size_t char_length;
-        char_length= my_charpos(cs, pos + pack_length,
+        char_length= hp_charpos(cs, pos + pack_length,
                                 pos + pack_length + length,
                                 seg->length/cs->mbmaxlen);
         set_if_smaller(length, char_length);
       }
       else
         set_if_smaller(length, seg->length);
-      cs->coll->hash_sort(cs, pos+pack_length, length, &nr, &nr2);
+      my_ci_hash_sort(cs, pos+pack_length, length, &nr, &nr2);
     }
     else
     {
@@ -392,18 +402,18 @@ int hp_rec_key_cmp(HP_KEYDEF *keydef, const uchar *rec1, const uchar *rec2)
       if (cs->mbmaxlen > 1)
       {
         size_t char_length= seg->length / cs->mbmaxlen;
-        char_length1= my_charpos(cs, pos1, pos1 + seg->length, char_length);
+        char_length1= hp_charpos(cs, pos1, pos1 + seg->length, char_length);
         set_if_smaller(char_length1, seg->length);
-        char_length2= my_charpos(cs, pos2, pos2 + seg->length, char_length);
+        char_length2= hp_charpos(cs, pos2, pos2 + seg->length, char_length);
         set_if_smaller(char_length2, seg->length);
       }
       else
       {
         char_length1= char_length2= seg->length;
       }
-      if (seg->charset->coll->strnncollsp(seg->charset,
-      					  pos1,char_length1,
-					  pos2,char_length2))
+      if (my_ci_strnncollsp(seg->charset,
+                            pos1, char_length1,
+                            pos2, char_length2))
 	return 1;
     }
     else if (seg->type == HA_KEYTYPE_VARTEXT1)  /* Any VARCHAR segments */
@@ -430,9 +440,9 @@ int hp_rec_key_cmp(HP_KEYDEF *keydef, const uchar *rec1, const uchar *rec2)
         size_t safe_length1= char_length1;
         size_t safe_length2= char_length2;
         size_t char_length= seg->length / cs->mbmaxlen;
-        char_length1= my_charpos(cs, pos1, pos1 + char_length1, char_length);
+        char_length1= hp_charpos(cs, pos1, pos1 + char_length1, char_length);
         set_if_smaller(char_length1, safe_length1);
-        char_length2= my_charpos(cs, pos2, pos2 + char_length2, char_length);
+        char_length2= hp_charpos(cs, pos2, pos2 + char_length2, char_length);
         set_if_smaller(char_length2, safe_length2);
       }
       else
@@ -441,9 +451,9 @@ int hp_rec_key_cmp(HP_KEYDEF *keydef, const uchar *rec1, const uchar *rec2)
         set_if_smaller(char_length2, seg->length);
       }
 
-      if (cs->coll->strnncollsp(seg->charset,
-                                pos1, char_length1,
-                                pos2, char_length2))
+      if (my_ci_strnncollsp(seg->charset,
+                            pos1, char_length1,
+                            pos2, char_length2))
 	return 1;
     }
     else
@@ -498,9 +508,9 @@ int hp_key_cmp(HP_KEYDEF *keydef, const uchar *rec, const uchar *key)
       if (cs->mbmaxlen > 1)
       {
         size_t char_length= seg->length / cs->mbmaxlen;
-        char_length_key= my_charpos(cs, key, key + seg->length, char_length);
+        char_length_key= hp_charpos(cs, key, key + seg->length, char_length);
         set_if_smaller(char_length_key, seg->length);
-        char_length_rec= my_charpos(cs, pos, pos + seg->length, char_length);
+        char_length_rec= hp_charpos(cs, pos, pos + seg->length, char_length);
         set_if_smaller(char_length_rec, seg->length);
       }
       else
@@ -509,9 +519,9 @@ int hp_key_cmp(HP_KEYDEF *keydef, const uchar *rec, const uchar *key)
         char_length_rec= seg->length;
       }
       
-      if (seg->charset->coll->strnncollsp(seg->charset,
-					  (uchar*) pos, char_length_rec,
-					  (uchar*) key, char_length_key))
+      if (my_ci_strnncollsp(seg->charset,
+                            pos, char_length_rec,
+                            key, char_length_key))
 	return 1;
     }
     else if (seg->type == HA_KEYTYPE_VARTEXT1)  /* Any VARCHAR segments */
@@ -529,17 +539,17 @@ int hp_key_cmp(HP_KEYDEF *keydef, const uchar *rec, const uchar *key)
       {
         size_t char_length1, char_length2;
         char_length1= char_length2= seg->length / cs->mbmaxlen; 
-        char_length1= my_charpos(cs, key, key + char_length_key, char_length1);
+        char_length1= hp_charpos(cs, key, key + char_length_key, char_length1);
         set_if_smaller(char_length_key, char_length1);
-        char_length2= my_charpos(cs, pos, pos + char_length_rec, char_length2);
+        char_length2= hp_charpos(cs, pos, pos + char_length_rec, char_length2);
         set_if_smaller(char_length_rec, char_length2);
       }
       else
         set_if_smaller(char_length_rec, seg->length);
 
-      if (cs->coll->strnncollsp(seg->charset,
-                                (uchar*) pos, char_length_rec,
-                                (uchar*) key, char_length_key))
+      if (my_ci_strnncollsp(seg->charset,
+                            pos, char_length_rec,
+                            key, char_length_key))
 	return 1;
     }
     else
@@ -578,7 +588,7 @@ void hp_make_key(HP_KEYDEF *keydef, uchar *key, const uchar *rec)
       *key++= MY_TEST(rec[seg->null_pos] & seg->null_bit);
     if (cs->mbmaxlen > 1)
     {
-      char_length= my_charpos(cs, pos, pos + seg->length,
+      char_length= hp_charpos(cs, pos, pos + seg->length,
                               char_length / cs->mbmaxlen);
       set_if_smaller(char_length, seg->length); /* QQ: ok to remove? */
     }
@@ -598,7 +608,7 @@ void hp_make_key(HP_KEYDEF *keydef, uchar *key, const uchar *rec)
 #define FIX_LENGTH(cs, pos, length, char_length)                        \
   do {                                                                  \
     if (length > char_length)                                           \
-      char_length= my_charpos(cs, pos, pos+length, char_length);        \
+      char_length= hp_charpos(cs, pos, pos+length, char_length);        \
     set_if_smaller(char_length,length);                                 \
   } while(0)
 
@@ -676,12 +686,12 @@ uint hp_rb_make_key(HP_KEYDEF *keydef, uchar *key,
     char_length= seg->length;
     if (seg->charset->mbmaxlen > 1)
     {
-      char_length= my_charpos(seg->charset, 
+      char_length= hp_charpos(seg->charset, 
                               rec + seg->start, rec + seg->start + char_length,
                               char_length / seg->charset->mbmaxlen);
       set_if_smaller(char_length, seg->length); /* QQ: ok to remove? */
       if (char_length < seg->length)
-        seg->charset->cset->fill(seg->charset, (char*) key + char_length,
+        my_ci_fill(seg->charset, (char*) key + char_length,
                                  seg->length - char_length, ' ');
     }
     if (seg->type == HA_KEYTYPE_BIT && seg->bit_length)
@@ -750,11 +760,11 @@ uint hp_rb_pack_key(HP_KEYDEF *keydef, uchar *key, const uchar *old,
     char_length= seg->length;
     if (seg->charset->mbmaxlen > 1)
     {
-      char_length= my_charpos(seg->charset, old, old+char_length,
+      char_length= hp_charpos(seg->charset, old, old+char_length,
                               char_length / seg->charset->mbmaxlen);
       set_if_smaller(char_length, seg->length); /* QQ: ok to remove? */
       if (char_length < seg->length)
-        seg->charset->cset->fill(seg->charset, (char*) key + char_length, 
+        my_ci_fill(seg->charset, (char*) key + char_length,
                                  seg->length - char_length, ' ');
     }
     memcpy(key, old, (size_t) char_length);
diff --git a/storage/heap/hp_open.c b/storage/heap/hp_open.c
index 65186d77e4f..272c4a3af23 100644
--- a/storage/heap/hp_open.c
+++ b/storage/heap/hp_open.c
@@ -30,8 +30,8 @@ HP_INFO *heap_open_from_share(HP_SHARE *share, int mode)
   HP_INFO *info;
   DBUG_ENTER("heap_open_from_share");
 
-  if (!(info= (HP_INFO*) my_malloc(sizeof(HP_INFO) +
-				  2 * share->max_key_length,
+  if (!(info= (HP_INFO*) my_malloc(hp_key_memory_HP_INFO,
+                                   sizeof(HP_INFO) + 2 * share->max_key_length,
                                    MYF(MY_ZEROFILL +
                                        (share->internal ?
                                         MY_THREAD_SPECIFIC : 0)))))
diff --git a/storage/heap/hp_rename.c b/storage/heap/hp_rename.c
index 34e82bbc531..7343644b5d9 100644
--- a/storage/heap/hp_rename.c
+++ b/storage/heap/hp_rename.c
@@ -28,7 +28,8 @@ int heap_rename(const char *old_name, const char *new_name)
   mysql_mutex_lock(&THR_LOCK_heap);
   if ((info = hp_find_named_heap(old_name)))
   {
-    if (!(name_buff=(char*) my_strdup(new_name,MYF(MY_WME))))
+    if (!(name_buff=(char*) my_strdup(hp_key_memory_HP_SHARE,
+                                      new_name, MYF(MY_WME))))
     {
       mysql_mutex_unlock(&THR_LOCK_heap);
       DBUG_RETURN(my_errno);
diff --git a/storage/heap/hp_static.c b/storage/heap/hp_static.c
index 9191e23b399..9a4410eead9 100644
--- a/storage/heap/hp_static.c
+++ b/storage/heap/hp_static.c
@@ -24,16 +24,19 @@
 
 LIST *heap_open_list=0,*heap_share_list=0;
 
+PSI_memory_key hp_key_memory_HP_SHARE;
+PSI_memory_key hp_key_memory_HP_INFO;
+PSI_memory_key hp_key_memory_HP_PTRS;
+PSI_memory_key hp_key_memory_HP_KEYDEF;
+
 #ifdef HAVE_PSI_INTERFACE
-PSI_mutex_key hp_key_mutex_HP_SHARE_intern_lock;
 
-static PSI_mutex_info all_heap_mutexes[]=
+static PSI_memory_info all_heap_memory[]=
 {
-  { & hp_key_mutex_HP_SHARE_intern_lock, "HP_SHARE::intern_lock", 0}
-  /*
-    Note:
-    THR_LOCK_heap is part of mysys, not storage/heap.
-  */
+  { & hp_key_memory_HP_SHARE, "HP_SHARE", 0},
+  { & hp_key_memory_HP_INFO, "HP_INFO", 0},
+  { & hp_key_memory_HP_PTRS, "HP_PTRS", 0},
+  { & hp_key_memory_HP_KEYDEF, "HP_KEYDEF", 0}
 };
 
 void init_heap_psi_keys()
@@ -44,8 +47,8 @@ void init_heap_psi_keys()
   if (PSI_server == NULL)
     return;
 
-  count= array_elements(all_heap_mutexes);
-  PSI_server->register_mutex(category, all_heap_mutexes, count);
+  count= array_elements(all_heap_memory);
+  mysql_memory_register(category, all_heap_memory, count);
 }
 #endif /* HAVE_PSI_INTERFACE */
 
diff --git a/storage/heap/hp_write.c b/storage/heap/hp_write.c
index 877c1bcecb6..670f628a2d5 100644
--- a/storage/heap/hp_write.c
+++ b/storage/heap/hp_write.c
@@ -145,21 +145,21 @@ static uchar *next_free_record_pos(HP_SHARE *info)
     DBUG_PRINT("exit",("Used old position: %p", pos));
     DBUG_RETURN(pos);
   }
+  if ((info->records > info->max_records && info->max_records) ||
+      (info->data_length + info->index_length >= info->max_table_size))
+  {
+    DBUG_PRINT("error",
+                ("record file full. records: %lu  max_records: %lu  "
+                "data_length: %llu  index_length: %llu  "
+                "max_table_size: %llu",
+                info->records, info->max_records,
+                info->data_length, info->index_length,
+                info->max_table_size));
+    my_errno=HA_ERR_RECORD_FILE_FULL;
+    DBUG_RETURN(NULL);
+  }
   if (!(block_pos=(info->records % info->block.records_in_block)))
   {
-    if ((info->records > info->max_records && info->max_records) ||
-        (info->data_length + info->index_length >= info->max_table_size))
-    {
-      DBUG_PRINT("error",
-                 ("record file full. records: %lu  max_records: %lu  "
-                  "data_length: %llu  index_length: %llu  "
-                  "max_table_size: %llu",
-                  info->records, info->max_records,
-                  info->data_length, info->index_length,
-                  info->max_table_size));
-      my_errno=HA_ERR_RECORD_FILE_FULL;
-      DBUG_RETURN(NULL);
-    }
     if (hp_get_new_block(info, &info->block,&length))
       DBUG_RETURN(NULL);
     info->data_length+=length;
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index 8a4fc0e79d9..9daf7009ff3 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 # Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
 # Copyright (c) 2014, 2021, MariaDB Corporation.
 #
@@ -19,13 +20,13 @@
 
 
 INCLUDE(innodb.cmake)
+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/tpool)
 
 SET(INNOBASE_SOURCES
 	btr/btr0btr.cc
 	btr/btr0bulk.cc
 	btr/btr0cur.cc
 	btr/btr0pcur.cc
-	btr/btr0scrub.cc
 	btr/btr0sea.cc
 	btr/btr0defragment.cc
 	buf/buf0block_hint.cc
@@ -57,9 +58,7 @@ SET(INNOBASE_SOURCES
 	fsp/fsp0space.cc
 	fsp/fsp0sysspace.cc
 	fut/fut0lst.cc
-	ha/ha0ha.cc
 	ha/ha0storage.cc
-	ha/hash0hash.cc
 	fts/fts0fts.cc
 	fts/fts0ast.cc
 	fts/fts0blex.cc
@@ -77,6 +76,214 @@ SET(INNOBASE_SOURCES
 	handler/handler0alter.cc
 	handler/i_s.cc
 	ibuf/ibuf0ibuf.cc
+	include/btr0btr.h
+	include/btr0btr.inl
+	include/btr0bulk.h
+	include/btr0cur.h
+	include/btr0cur.inl
+	include/btr0defragment.h
+	include/btr0pcur.h
+	include/btr0pcur.inl
+	include/btr0sea.h
+	include/btr0sea.inl
+	include/btr0types.h
+	include/buf0buddy.h
+	include/buf0buf.h
+	include/buf0buf.inl
+	include/buf0checksum.h
+	include/buf0dblwr.h
+	include/buf0dump.h
+	include/buf0flu.h
+	include/buf0flu.inl
+	include/buf0lru.h
+	include/buf0rea.h
+	include/buf0types.h
+	include/data0data.h
+	include/data0data.inl
+	include/data0type.h
+	include/data0type.inl
+	include/data0types.h
+	include/db0err.h
+	include/dict0boot.h
+	include/dict0boot.inl
+	include/dict0crea.h
+	include/dict0crea.inl
+	include/dict0defrag_bg.h
+	include/dict0dict.h
+	include/dict0dict.inl
+	include/dict0load.h
+	include/dict0mem.h
+	include/dict0mem.inl
+	include/dict0pagecompress.h
+	include/dict0pagecompress.inl
+	include/dict0priv.h
+	include/dict0priv.inl
+	include/dict0stats.h
+	include/dict0stats.inl
+	include/dict0stats_bg.h
+	include/dict0types.h
+	include/dyn0buf.h
+	include/dyn0types.h
+	include/eval0eval.h
+	include/eval0eval.inl
+	include/eval0proc.h
+	include/eval0proc.inl
+	include/fil0crypt.h
+	include/fil0crypt.inl
+	include/fil0fil.h
+	include/fil0fil.inl
+	include/fil0pagecompress.h
+	include/fsp0file.h
+	include/fsp0fsp.h
+	include/fsp0space.h
+	include/fsp0sysspace.h
+	include/fsp0types.h
+	include/fts0ast.h
+	include/fts0blex.h
+	include/fts0fts.h
+	include/fts0opt.h
+	include/fts0pars.h
+	include/fts0plugin.h
+	include/fts0priv.h
+	include/fts0priv.inl
+	include/fts0tlex.h
+	include/fts0tokenize.h
+	include/fts0types.h
+	include/fts0types.inl
+	include/fts0vlc.h
+	include/fut0fut.h
+	include/fut0lst.h
+	include/gis0geo.h
+	include/gis0rtree.h
+	include/gis0rtree.inl
+	include/gis0type.h
+	include/ha_prototypes.h
+	include/ha0ha.h
+	include/ha0ha.inl
+	include/ha0storage.h
+	include/ha0storage.inl
+	include/handler0alter.h
+	include/hash0hash.h
+	include/ib0mutex.h
+	include/ibuf0ibuf.h
+	include/ibuf0ibuf.inl
+	include/ibuf0types.h
+	include/lock0iter.h
+	include/lock0lock.h
+	include/lock0lock.inl
+	include/lock0prdt.h
+	include/lock0priv.h
+	include/lock0priv.inl
+	include/lock0types.h
+	include/log0crypt.h
+	include/log0log.h
+	include/log0log.inl
+	include/log0recv.h
+	include/log0types.h
+	include/mach0data.h
+	include/mach0data.inl
+	include/mem0mem.h
+	include/mem0mem.inl
+	include/mtr0log.h
+	include/mtr0mtr.h
+	include/mtr0mtr.inl
+	include/mtr0types.h
+	include/os0event.h
+	include/os0file.h
+	include/os0file.inl
+	include/os0thread.h
+	include/page0cur.h
+	include/page0cur.inl
+	include/page0page.h
+	include/page0page.inl
+	include/page0types.h
+	include/page0zip.h
+	include/page0zip.inl
+	include/pars0grm.h
+	include/pars0opt.h
+	include/pars0pars.h
+	include/pars0sym.h
+	include/pars0types.h
+	include/que0que.h
+	include/que0que.inl
+	include/que0types.h
+	include/read0types.h
+	include/rem0cmp.h
+	include/rem0cmp.inl
+	include/rem0rec.h
+	include/rem0rec.inl
+	include/rem0types.h
+	include/row0ext.h
+	include/row0ext.inl
+	include/row0ftsort.h
+	include/row0import.h
+	include/row0ins.h
+	include/row0log.h
+	include/row0log.inl
+	include/row0merge.h
+	include/row0mysql.h
+	include/row0purge.h
+	include/row0quiesce.h
+	include/row0row.h
+	include/row0row.inl
+	include/row0sel.h
+	include/row0sel.inl
+	include/row0types.h
+	include/row0uins.h
+	include/row0umod.h
+	include/row0undo.h
+	include/row0upd.h
+	include/row0upd.inl
+	include/row0vers.h
+	include/srv0mon.h
+	include/srv0mon.inl
+	include/srv0srv.h
+	include/srv0start.h
+	include/sync0arr.h
+	include/sync0arr.inl
+	include/sync0debug.h
+	include/sync0policy.h
+	include/sync0rw.h
+	include/sync0rw.inl
+	include/sync0sync.h
+	include/sync0types.h
+	include/trx0i_s.h
+	include/trx0purge.h
+	include/trx0rec.h
+	include/trx0rec.inl
+	include/trx0roll.h
+	include/trx0rseg.h
+	include/trx0rseg.inl
+	include/trx0sys.h
+	include/trx0trx.h
+	include/trx0trx.inl
+	include/trx0types.h
+	include/trx0undo.h
+	include/trx0undo.inl
+	include/trx0xa.h
+	include/univ.i
+	include/ut0byte.h
+	include/ut0byte.inl
+	include/ut0counter.h
+	include/ut0dbg.h
+	include/ut0list.h
+	include/ut0list.inl
+	include/ut0lst.h
+	include/ut0mem.h
+	include/ut0mem.inl
+	include/ut0mutex.h
+	include/ut0new.h
+	include/ut0pool.h
+	include/ut0rbt.h
+	include/ut0rnd.h
+	include/ut0rnd.inl
+	include/ut0sort.h
+	include/ut0stage.h
+	include/ut0ut.h
+	include/ut0ut.inl
+	include/ut0vec.h
+	include/ut0vec.inl
+	include/ut0wqueue.h
 	lock/lock0iter.cc
 	lock/lock0prdt.cc
 	lock/lock0lock.cc
@@ -84,12 +291,10 @@ SET(INNOBASE_SOURCES
 	log/log0log.cc
 	log/log0recv.cc
 	log/log0crypt.cc
-	mach/mach0data.cc
+	log/log0sync.cc
 	mem/mem0mem.cc
-	mtr/mtr0log.cc
 	mtr/mtr0mtr.cc
 	os/os0file.cc
-	os/os0proc.cc
 	os/os0event.cc
 	os/os0thread.cc
 	page/page0cur.cc
@@ -120,7 +325,6 @@ SET(INNOBASE_SOURCES
 	row/row0upd.cc
 	row/row0quiesce.cc
 	row/row0vers.cc
-	srv/srv0conc.cc
 	srv/srv0mon.cc
 	srv/srv0srv.cc
 	srv/srv0start.cc
@@ -136,7 +340,6 @@ SET(INNOBASE_SOURCES
 	trx/trx0sys.cc
 	trx/trx0trx.cc
 	trx/trx0undo.cc
-	ut/ut0crc32.cc
 	ut/ut0dbg.cc
 	ut/ut0list.cc
 	ut/ut0mem.cc
@@ -152,10 +355,10 @@ MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
   DEFAULT RECOMPILE_FOR_EMBEDDED
   LINK_LIBRARIES
 	${ZLIB_LIBRARY}
-	${CRC32_LIBRARY}
 	${NUMA_LIBRARY}
 	${LIBSYSTEMD}
-	${LINKER_SCRIPT})
+	${LINKER_SCRIPT}
+	${LIBPMEM})
 
 IF(NOT TARGET innobase)
   RETURN()
@@ -193,6 +396,7 @@ IF(MSVC)
 ENDIF()
 
 IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC))
+  TARGET_LINK_LIBRARIES(innobase tpool mysys)
   ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup)
 ENDIF()
 
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index dd08b3e6b63..4c832eb77e8 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -55,7 +55,7 @@ bool
 btr_can_merge_with_page(
 /*====================*/
 	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
-	ulint		page_no,	/*!< in: a sibling page */
+	uint32_t	page_no,	/*!< in: a sibling page */
 	buf_block_t**	merge_block,	/*!< out: the merge block */
 	mtr_t*		mtr);		/*!< in: mini-transaction */
 
@@ -65,7 +65,7 @@ btr_can_merge_with_page(
 void btr_corruption_report(const buf_block_t* block, const dict_index_t* index)
 {
 	ib::fatal()
-		<< "Flag mismatch in page " << block->page.id
+		<< "Flag mismatch in page " << block->page.id()
 		<< " index " << index->name
 		<< " of table " << index->table->name;
 }
@@ -215,7 +215,7 @@ buf_block_t*
 btr_root_block_get(
 /*===============*/
 	const dict_index_t*	index,	/*!< in: index tree */
-	ulint			mode,	/*!< in: either RW_S_LATCH
+	rw_lock_type_t		mode,	/*!< in: either RW_S_LATCH
 					or RW_X_LATCH */
 	mtr_t*			mtr)	/*!< in: mtr */
 {
@@ -223,10 +223,8 @@ btr_root_block_get(
 		return NULL;
 	}
 
-	buf_block_t*	block = btr_block_get(
-		page_id_t(index->table->space_id, index->page),
-		index->table->space->zip_size(), mode,
-		index, mtr);
+	buf_block_t* block = btr_block_get(*index, index->page, mode, false,
+					   mtr);
 
 	if (!block) {
 		index->table->file_unreadable = true;
@@ -270,8 +268,7 @@ btr_root_get(
 	/* Intended to be used for segment list access.
 	SX lock doesn't block reading user data by other threads.
 	And block the segment list access by others.*/
-	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
-					       mtr);
+	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
 	return(root ? buf_block_get_frame(root) : NULL);
 }
 
@@ -290,10 +287,9 @@ btr_height_get(
 	buf_block_t*	root_block;
 
 	ut_ad(srv_read_only_mode
-	      || mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					   MTR_MEMO_S_LOCK
-					   | MTR_MEMO_X_LOCK
-					   | MTR_MEMO_SX_LOCK));
+	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
+					    | MTR_MEMO_X_LOCK
+					    | MTR_MEMO_SX_LOCK));
 
 	/* S latches the page */
 	root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
@@ -352,11 +348,8 @@ btr_root_adjust_on_import(
 	dberr_t			err;
 	mtr_t			mtr;
 	page_t*			page;
-	buf_block_t*		block;
 	page_zip_des_t*		page_zip;
 	dict_table_t*		table = index->table;
-	const page_id_t		page_id(table->space_id, index->page);
-	const ulint		zip_size = table->space->zip_size();
 
 	DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
 			return(DB_CORRUPTION););
@@ -365,7 +358,17 @@ btr_root_adjust_on_import(
 
 	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
 
-	block = btr_block_get(page_id, zip_size, RW_X_LATCH, index, &mtr);
+	buf_block_t* block = buf_page_get_gen(
+		page_id_t(table->space->id, index->page),
+		table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET,
+		__FILE__, __LINE__,
+		&mtr, &err);
+	if (!block) {
+		ut_ad(err != DB_SUCCESS);
+		goto func_exit;
+	}
+
+	buf_block_dbg_add_level(block, SYNC_TREE_NODE);
 
 	page = buf_block_get_frame(block);
 	page_zip = buf_block_get_page_zip(block);
@@ -416,6 +419,7 @@ btr_root_adjust_on_import(
 		err = DB_CORRUPTION;
 	}
 
+func_exit:
 	mtr_commit(&mtr);
 
 	return(err);
@@ -433,25 +437,34 @@ btr_page_create(
 	ulint		level,	/*!< in: the B-tree level of the page */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	page_t*		page = buf_block_get_frame(block);
-
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-
-	if (page_zip) {
-		page_create_zip(block, index, level, 0, mtr);
-	} else {
-		page_create(block, mtr, dict_table_is_comp(index->table),
-			    dict_index_is_spatial(index));
-		/* Set the level of the new index page */
-		btr_page_set_level(page, NULL, level, mtr);
-	}
-
-	/* For Spatial Index, initialize the Split Sequence Number */
-	if (dict_index_is_spatial(index)) {
-		page_set_ssn_id(block, page_zip, 0, mtr);
-	}
-
-	btr_page_set_index_id(page, page_zip, index->id, mtr);
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
+                                       block->frame);
+
+  if (UNIV_LIKELY_NULL(page_zip))
+  {
+    mach_write_to_8(index_id, index->id);
+    page_create_zip(block, index, level, 0, mtr);
+  }
+  else
+  {
+    page_create(block, mtr, dict_table_is_comp(index->table));
+    if (index->is_spatial())
+    {
+      static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                    FIL_PAGE_RTREE, "compatibility");
+      mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+                    byte(FIL_PAGE_RTREE));
+      if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM))
+        mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+    }
+    /* Set the level of the new index page */
+    mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+                                   my_assume_aligned<2>(PAGE_HEADER +
+                                                        PAGE_LEVEL +
+                                                        block->frame), level);
+    mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id);
+  }
 }
 
 /**************************************************************//**
@@ -465,15 +478,13 @@ btr_page_alloc_for_ibuf(
 	dict_index_t*	index,	/*!< in: index tree */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	fil_addr_t	node_addr;
-	page_t*		root;
-	page_t*		new_page;
 	buf_block_t*	new_block;
 
-	root = btr_root_get(index, mtr);
+	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
 
-	node_addr = flst_get_first(root + PAGE_HEADER
-				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
+	fil_addr_t node_addr = flst_get_first(PAGE_HEADER
+					      + PAGE_BTR_IBUF_FREE_LIST
+					      + root->frame);
 	ut_a(node_addr.page != FIL_NULL);
 
 	new_block = buf_page_get(
@@ -481,14 +492,12 @@ btr_page_alloc_for_ibuf(
 		index->table->space->zip_size(),
 		RW_X_LATCH, mtr);
 
-	new_page = buf_block_get_frame(new_block);
 	buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);
 
-	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+	flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		    new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
 		    mtr);
-	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-			    mtr));
+	ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
 
 	return(new_block);
 }
@@ -496,16 +505,13 @@ btr_page_alloc_for_ibuf(
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents!
-@retval NULL if no page could be allocated
-@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block (not allocated or initialized) otherwise */
+@retval NULL if no page could be allocated */
 static MY_ATTRIBUTE((nonnull, warn_unused_result))
 buf_block_t*
 btr_page_alloc_low(
 /*===============*/
 	dict_index_t*	index,		/*!< in: index */
-	ulint		hint_page_no,	/*!< in: hint of a good page */
+	uint32_t	hint_page_no,	/*!< in: hint of a good page */
 	byte		file_direction,	/*!< in: direction where a possible
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
@@ -514,21 +520,14 @@ btr_page_alloc_low(
 					for the allocation */
 	mtr_t*		init_mtr)	/*!< in/out: mtr or another
 					mini-transaction in which the
-					page should be initialized.
-					If init_mtr!=mtr, but the page
-					is already X-latched in mtr, do
-					not initialize the page. */
+					page should be initialized. */
 {
-	fseg_header_t*	seg_header;
-	page_t*		root;
-
-	root = btr_root_get(index, mtr);
+	page_t* root = btr_root_get(index, mtr);
 
-	if (level == 0) {
-		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-	} else {
-		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
-	}
+	fseg_header_t* seg_header = (level
+				     ? PAGE_HEADER + PAGE_BTR_SEG_TOP
+				     : PAGE_HEADER + PAGE_BTR_SEG_LEAF)
+		+ root;
 
 	/* Parameter TRUE below states that the caller has made the
 	reservation for free extents, and thus we know that a page can
@@ -536,23 +535,7 @@ btr_page_alloc_low(
 
 	buf_block_t* block = fseg_alloc_free_page_general(
 		seg_header, hint_page_no, file_direction,
-		TRUE, mtr, init_mtr);
-
-#ifdef UNIV_DEBUG_SCRUBBING
-	if (block != NULL) {
-		fprintf(stderr,
-			"alloc %lu:%lu to index: %lu root: %lu\n",
-			buf_block_get_page_no(block),
-			buf_block_get_space(block),
-			index->id,
-			dict_index_get_page(index));
-	} else {
-		fprintf(stderr,
-			"failed alloc index: %lu root: %lu\n",
-			index->id,
-			dict_index_get_page(index));
-	}
-#endif /* UNIV_DEBUG_SCRUBBING */
+		true, mtr, init_mtr);
 
 	return block;
 }
@@ -560,15 +543,12 @@ btr_page_alloc_low(
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents!
-@retval NULL if no page could be allocated
-@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block (not allocated or initialized) otherwise */
+@retval NULL if no page could be allocated */
 buf_block_t*
 btr_page_alloc(
 /*===========*/
 	dict_index_t*	index,		/*!< in: index */
-	ulint		hint_page_no,	/*!< in: hint of a good page */
+	uint32_t	hint_page_no,	/*!< in: hint of a good page */
 	byte		file_direction,	/*!< in: direction where a possible
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
@@ -607,42 +587,34 @@ btr_get_size(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
 				is s-latched */
 {
-	fseg_header_t*	seg_header;
-	page_t*		root;
 	ulint		n=0;
-	ulint		dummy;
 
 	ut_ad(srv_read_only_mode
-	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
-				   MTR_MEMO_S_LOCK));
+	      || mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
+	ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
 
 	if (index->page == FIL_NULL
 	    || dict_index_is_online_ddl(index)
-	    || !index->is_committed()) {
+	    || !index->is_committed()
+	    || !index->table->space) {
 		return(ULINT_UNDEFINED);
 	}
 
-	root = btr_root_get(index, mtr);
-
-	if (root) {
-		if (flag == BTR_N_LEAF_PAGES) {
-			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
-			fseg_n_reserved_pages(seg_header, &n, mtr);
-
-		} else if (flag == BTR_TOTAL_SIZE) {
-			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
-
-			n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
-
-			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
-			n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
-		} else {
-			ut_error;
-		}
+	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
+	if (!root) {
+		return ULINT_UNDEFINED;
+	}
+	mtr_x_lock_space(index->table->space, mtr);
+	if (flag == BTR_N_LEAF_PAGES) {
+		fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+				      + root->frame, &n, mtr);
 	} else {
-		n = ULINT_UNDEFINED;
+		ulint dummy;
+		n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
+					  + root->frame, &dummy, mtr);
+		n += fseg_n_reserved_pages(*root,
+					   PAGE_HEADER + PAGE_BTR_SEG_LEAF
+					   + root->frame, &dummy, mtr);
 	}
 
 	return(n);
@@ -662,38 +634,33 @@ btr_get_size_and_reserved(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
 				is s-latched */
 {
-	fseg_header_t*	seg_header;
-	page_t*		root;
-	ulint		n=ULINT_UNDEFINED;
 	ulint		dummy;
 
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
-				MTR_MEMO_S_LOCK));
-
+	ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
 	ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
 
 	if (index->page == FIL_NULL
 	    || dict_index_is_online_ddl(index)
-	    || !index->is_committed()) {
+	    || !index->is_committed()
+	    || !index->table->space) {
 		return(ULINT_UNDEFINED);
 	}
 
-	root = btr_root_get(index, mtr);
+	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
 	*used = 0;
+	if (!root) {
+		return ULINT_UNDEFINED;
+	}
 
-	if (root) {
-
-		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
-		n = fseg_n_reserved_pages(seg_header, used, mtr);
-
-		if (flag == BTR_TOTAL_SIZE) {
-			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
-
-			n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
-			*used += dummy;
+	mtr_x_lock_space(index->table->space, mtr);
 
-		}
+	ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+					+ root->frame, used, mtr);
+	if (flag == BTR_TOTAL_SIZE) {
+		n += fseg_n_reserved_pages(*root,
+					   PAGE_HEADER + PAGE_BTR_SEG_TOP
+					   + root->frame, &dummy, mtr);
+		*used += dummy;
 	}
 
 	return(n);
@@ -710,17 +677,14 @@ btr_page_free_for_ibuf(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	page_t*		root;
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	root = btr_root_get(index, mtr);
+	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
 
-	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-		       buf_block_get_frame(block)
-		       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+	flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		       block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
 
-	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-			    mtr));
+	ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
 }
 
 /** Free an index page.
@@ -731,16 +695,17 @@ btr_page_free_for_ibuf(
 void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
 		   bool blob)
 {
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 #ifdef BTR_CUR_HASH_ADAPT
 	if (block->index && !block->index->freed()) {
 		ut_ad(!blob);
 		ut_ad(page_is_leaf(block->frame));
 	}
 #endif
-	ut_ad(index->table->space_id == block->page.id.space());
+	const page_id_t id(block->page.id());
+	ut_ad(index->table->space_id == id.space());
 	/* The root page is freed by btr_free_root(). */
-	ut_ad(block->page.id.page_no() != index->page);
+	ut_ad(id.page_no() != index->page);
 	ut_ad(mtr->is_named_space(index->table->space));
 
 	/* The page gets invalid for optimistic searches: increment the frame
@@ -760,54 +725,41 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
 	fseg_header_t* seg_header = &root[blob || page_is_leaf(block->frame)
 					  ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
 					  : PAGE_HEADER + PAGE_BTR_SEG_TOP];
-	fseg_free_page(seg_header,
-		       index->table->space, block->page.id.page_no(),
-		       !block->page.flush_observer, mtr);
+	fil_space_t* space= index->table->space;
+	const uint32_t page= id.page_no();
+
+	fseg_free_page(seg_header, space, page, mtr);
+	buf_page_free(space, page, mtr, __FILE__, __LINE__);
 
 	/* The page was marked free in the allocation bitmap, but it
 	should remain exclusively latched until mtr_t::commit() or until it
 	is explicitly freed from the mini-transaction. */
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-
-	/* MDEV-15528 FIXME: Zero out the page after the redo log for
-	this mini-transaction has been durably written.
-	This must be done unconditionally if
-	srv_immediate_scrub_data_uncompressed is set. */
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 }
 
-/**************************************************************//**
+/** Set the child page number in a node pointer record.
+@param[in,out]  block   non-leaf index page
+@param[in,out]  rec     node pointer record in the page
+@param[in]      offsets rec_get_offsets(rec)
+@param[in]      page_no child page number
+@param[in,out]  mtr     mini-transaction
 Sets the child node file address in a node pointer. */
-UNIV_INLINE
-void
-btr_node_ptr_set_child_page_no(
-/*===========================*/
-	rec_t*		rec,	/*!< in: node pointer record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
-				part will be updated, or NULL */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		page_no,/*!< in: child node address */
-	mtr_t*		mtr)	/*!< in: mtr */
+inline void btr_node_ptr_set_child_page_no(buf_block_t *block,
+                                           rec_t *rec, const rec_offs *offsets,
+                                           ulint page_no, mtr_t *mtr)
 {
-	byte*	field;
-	ulint	len;
-
-	ut_ad(rec_offs_validate(rec, NULL, offsets));
-	ut_ad(!page_rec_is_leaf(rec));
-	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
-
-	/* The child address is in the last field */
-	field = rec_get_nth_field(rec, offsets,
-				  rec_offs_n_fields(offsets) - 1, &len);
-
-	ut_ad(len == REC_NODE_PTR_SIZE);
-
-	if (page_zip) {
-		page_zip_write_node_ptr(page_zip, rec,
-					rec_offs_data_size(offsets),
-					page_no, mtr);
-	} else {
-		mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
-	}
+  ut_ad(rec_offs_validate(rec, NULL, offsets));
+  ut_ad(!page_rec_is_leaf(rec));
+  ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+  const ulint offs= rec_offs_data_size(offsets);
+  ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) ==
+        REC_NODE_PTR_SIZE);
+
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+    page_zip_write_node_ptr(block, rec, offs, page_no, mtr);
+  else
+    mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no);
 }
 
 /************************************************************//**
@@ -827,10 +779,9 @@ btr_node_ptr_get_child(
 	      == page_get_space_id(page_align(node_ptr)));
 
 	return btr_block_get(
-		page_id_t(index->table->space_id,
-			  btr_node_ptr_get_child_page_no(node_ptr, offsets)),
-		index->table->space->zip_size(),
-		RW_SX_LATCH, index, mtr);
+		*index, btr_node_ptr_get_child_page_no(node_ptr, offsets),
+		RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1,
+		mtr);
 }
 
 /************************************************************//**
@@ -862,14 +813,13 @@ btr_page_get_father_node_ptr_func(
 	ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
 	      || latch_mode == BTR_CONT_SEARCH_TREE);
 
-	page_no = btr_cur_get_block(cursor)->page.id.page_no();
+	page_no = btr_cur_get_block(cursor)->page.id().page_no();
 	index = btr_cur_get_index(cursor);
 	ut_ad(!dict_index_is_spatial(index));
 
 	ut_ad(srv_read_only_mode
-	      || mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					   MTR_MEMO_X_LOCK
-					   | MTR_MEMO_SX_LOCK));
+	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					    | MTR_MEMO_SX_LOCK));
 
 	ut_ad(dict_index_get_page(index) != page_no);
 
@@ -983,40 +933,32 @@ void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
 	mem_heap_free(heap);
 }
 
+#ifdef UNIV_DEBUG
 /** PAGE_INDEX_ID value for freed index B-trees */
-static const index_id_t	BTR_FREED_INDEX_ID = 0;
+constexpr index_id_t	BTR_FREED_INDEX_ID = 0;
+#endif
 
 /** Free a B-tree root page. btr_free_but_not_root() must already
 have been called.
 In a persistent tablespace, the caller must invoke fsp_init_file_page()
 before mtr.commit().
 @param[in,out]	block		index root page
-@param[in,out]	mtr		mini-transaction
-@param[in]	invalidate	whether to invalidate PAGE_INDEX_ID */
-static void btr_free_root(buf_block_t* block, mtr_t* mtr, bool invalidate)
+@param[in,out]	mtr		mini-transaction */
+static void btr_free_root(buf_block_t *block, mtr_t *mtr)
 {
-	fseg_header_t*	header;
-
-	ut_ad(mtr_memo_contains_flagged(mtr, block, MTR_MEMO_PAGE_X_FIX
-					| MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr->is_named_space(block->page.id.space()));
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->is_named_space(block->page.id().space()));
 
-	btr_search_drop_page_hash_index(block);
+  btr_search_drop_page_hash_index(block);
 
-	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
 #ifdef UNIV_BTR_DEBUG
-	ut_a(btr_root_fseg_validate(header, block->page.id.space()));
+  ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame,
+			      block->page.id().space()));
 #endif /* UNIV_BTR_DEBUG */
-	if (invalidate) {
-		btr_page_set_index_id(
-			buf_block_get_frame(block),
-			buf_block_get_page_zip(block),
-			BTR_FREED_INDEX_ID, mtr);
-	}
 
-	while (!fseg_free_step(header, mtr)) {
-		/* Free the entire segment in small steps. */
-	}
+  /* Free the entire segment in small steps. */
+  while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, mtr));
 }
 
 /** Prepare to free a B-tree.
@@ -1061,11 +1003,11 @@ btr_free_root_check(
 @param[in]	type			type of the index
 @param[in]	index_id		index id
 @param[in,out]	space			tablespace where created
-@param[in]	index			index
+@param[in]	index			index, or NULL to create a system table
 @param[in,out]	mtr			mini-transaction
 @return	page number of the created root
 @retval	FIL_NULL	if did not succeed */
-ulint
+uint32_t
 btr_create(
 	ulint			type,
 	fil_space_t*		space,
@@ -1074,8 +1016,6 @@ btr_create(
 	mtr_t*			mtr)
 {
 	buf_block_t*		block;
-	page_t*			page;
-	page_zip_des_t*		page_zip;
 
 	ut_ad(mtr->is_named_space(space));
 	ut_ad(index_id != BTR_FREED_INDEX_ID);
@@ -1097,7 +1037,7 @@ btr_create(
 		buf_block_dbg_add_level(
 			ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);
 
-		ut_ad(ibuf_hdr_block->page.id.page_no()
+		ut_ad(ibuf_hdr_block->page.id().page_no()
 		      == IBUF_HEADER_PAGE_NO);
 		/* Allocate then the next page to the segment: it will be the
 		tree root page */
@@ -1112,14 +1052,14 @@ btr_create(
 			return(FIL_NULL);
 		}
 
-		ut_ad(block->page.id.page_no() == IBUF_TREE_ROOT_PAGE_NO);
+		ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO));
 
 		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
 
 		flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
 	} else {
-		block = fseg_create(space,
-				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
+		block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP,
+				    mtr);
 
 		if (block == NULL) {
 			return(FIL_NULL);
@@ -1127,13 +1067,11 @@ btr_create(
 
 		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
 
-		if (!fseg_create(space,
-				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
+		if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
 				 false, block)) {
 			/* Not enough space for new segment, free root
 			segment before return. */
-			btr_free_root(block, mtr,
-				      !index->table->is_temporary());
+			btr_free_root(block, mtr);
 			return(FIL_NULL);
 		}
 
@@ -1142,40 +1080,38 @@ btr_create(
 		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
 	}
 
-	/* Create a new index page on the allocated segment page */
-	page_zip = buf_block_get_page_zip(block);
+	ut_ad(!page_has_siblings(block->frame));
 
-	if (page_zip) {
-		page = page_create_zip(block, index, 0, 0, mtr);
-	} else {
-		page = page_create(block, mtr,
-				   dict_table_is_comp(index->table),
-				   dict_index_is_spatial(index));
-		/* Set the level of the new index page */
-		btr_page_set_level(page, NULL, 0, mtr);
-	}
+	constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID;
 
-	/* Set the index id of the page */
-	btr_page_set_index_id(page, page_zip, index_id, mtr);
+	byte* page_index_id = my_assume_aligned<2>(field + block->frame);
 
-	/* Set the next node and previous node fields */
-	compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
-	compile_time_assert(FIL_NULL == 0xffffffff);
-#if MYSQL_VERSION_ID < 100500
-	if (UNIV_LIKELY_NULL(page_zip)) {
-		/* Avoid tripping the ut_a() in mlog_parse_nbytes()
-		when crash-downgrading to an earlier MariaDB 10.4 version. */
-		btr_page_set_next(page, page_zip, FIL_NULL, mtr);
-		btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
+	/* Create a new index page on the allocated segment page */
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+		mach_write_to_8(page_index_id, index_id);
+		ut_ad(!page_has_siblings(block->page.zip.data));
+		page_create_zip(block, index, 0, 0, mtr);
 	} else {
-		mlog_memset(block, FIL_PAGE_PREV, 8, 0xff, mtr);
-	}
-#else
-	mlog_memset(block, FIL_PAGE_PREV, 8, 0xff, mtr);
-	if (UNIV_LIKELY_NULL(page_zip)) {
-		memset(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+		page_create(block, mtr,
+			    index && index->table->not_redundant());
+		if (index && index->is_spatial()) {
+			static_assert(((FIL_PAGE_INDEX & 0xff00)
+				       | byte(FIL_PAGE_RTREE))
+				      == FIL_PAGE_RTREE, "compatibility");
+			mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+				      byte(FIL_PAGE_RTREE));
+			if (mach_read_from_8(block->frame
+					     + FIL_RTREE_SPLIT_SEQ_NUM)) {
+				mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+					    8, 0);
+			}
+		}
+		/* Set the level of the new index page */
+		mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+					       + block->frame, 0U);
+		mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id,
+					       index_id);
 	}
-#endif
 
 	/* We reset the free bits for the page in a separate
 	mini-transaction to allow creation of several trees in the
@@ -1184,7 +1120,8 @@ btr_create(
 
 	Note: Insert Buffering is disabled for temporary tables given that
 	most temporary tables are smaller in size and short-lived. */
-	if (!(type & DICT_CLUSTERED) && !index->table->is_temporary()) {
+	if (!(type & DICT_CLUSTERED)
+	    && (!index || !index->table->is_temporary())) {
 		ibuf_reset_free_bits(block);
 	}
 
@@ -1192,9 +1129,10 @@ btr_create(
 	allowed size fit on the root page: this fact is needed to ensure
 	correctness of split algorithms */
 
-	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
+	ut_ad(page_get_max_insert_size(block->frame, 2)
+	      > 2 * BTR_PAGE_MAX_REC_SIZE);
 
-	return(block->page.id.page_no());
+	return(block->page.id().page_no());
 }
 
 /** Free a B-tree except the root page. The root page MUST be freed after
@@ -1207,7 +1145,6 @@ btr_free_but_not_root(
 	buf_block_t*	block,
 	mtr_log_t	log_mode)
 {
-	ibool	finished;
 	mtr_t	mtr;
 
 	ut_ad(fil_page_index_page_check(block->frame));
@@ -1215,7 +1152,7 @@ btr_free_but_not_root(
 leaf_loop:
 	mtr_start(&mtr);
 	mtr_set_log_mode(&mtr, log_mode);
-	mtr.set_named_space_id(block->page.id.space());
+	mtr.set_named_space_id(block->page.id().space());
 
 	page_t*	root = block->frame;
 
@@ -1226,16 +1163,16 @@ leaf_loop:
 
 #ifdef UNIV_BTR_DEBUG
 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
-				    + root, block->page.id.space()));
+				    + root, block->page.id().space()));
 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
-				    + root, block->page.id.space()));
+				    + root, block->page.id().space()));
 #endif /* UNIV_BTR_DEBUG */
 
 	/* NOTE: page hash indexes are dropped when a page is freed inside
 	fsp0fsp. */
 
-	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
-				  &mtr);
+	bool finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+				       &mtr);
 	mtr_commit(&mtr);
 
 	if (!finished) {
@@ -1245,13 +1182,13 @@ leaf_loop:
 top_loop:
 	mtr_start(&mtr);
 	mtr_set_log_mode(&mtr, log_mode);
-	mtr.set_named_space_id(block->page.id.space());
+	mtr.set_named_space_id(block->page.id().space());
 
 	root = block->frame;
 
 #ifdef UNIV_BTR_DEBUG
 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
-				    + root, block->page.id.space()));
+				    + root, block->page.id().space()));
 #endif /* UNIV_BTR_DEBUG */
 
 	finished = fseg_free_step_not_header(
@@ -1284,7 +1221,7 @@ btr_free_if_exists(
 
 	btr_free_but_not_root(root, mtr->get_log_mode());
 	mtr->set_named_space_id(page_id.space());
-	btr_free_root(root, mtr, true);
+	btr_free_root(root, mtr);
 }
 
 /** Free an index tree in a temporary tablespace.
@@ -1299,7 +1236,7 @@ void btr_free(const page_id_t page_id)
 
 	if (block) {
 		btr_free_but_not_root(block, MTR_LOG_NO_REDO);
-		btr_free_root(block, &mtr, false);
+		btr_free_root(block, &mtr);
 	}
 	mtr.commit();
 }
@@ -1398,271 +1335,246 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
 	page_set_autoinc(buf_page_get(page_id_t(space->id, index->page),
 				      space->zip_size(),
 				      RW_SX_LATCH, &mtr),
-			 index, autoinc, &mtr, reset);
+			 autoinc, &mtr, reset);
 	mtr.commit();
 }
 
-/*************************************************************//**
-Reorganizes an index page.
-
-IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index. This has to
-be done either within the same mini-transaction, or by invoking
-ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
-IBUF_BITMAP_FREE is unaffected by reorganization.
-
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool
-btr_page_reorganize_low(
-/*====================*/
-	bool		recovery,/*!< in: true if called in recovery:
-				locks should not be updated, i.e.,
-				there cannot exist locks on the
-				page, and a hash index should not be
-				dropped: it cannot exist */
-	ulint		z_level,/*!< in: compression level to be used
-				if dealing with compressed page */
-	page_cur_t*	cursor,	/*!< in/out: page cursor */
-	dict_index_t*	index,	/*!< in: the index tree of the page */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+/** Reorganize an index page.
+@param cursor      index page cursor
+@param index       the index that the cursor belongs to
+@param mtr         mini-transaction */
+static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
+                                    mtr_t *mtr)
 {
-	buf_block_t*	block		= page_cur_get_block(cursor);
-	buf_pool_t*	buf_pool	= buf_pool_from_bpage(&block->page);
-	page_t*		page		= buf_block_get_frame(block);
-	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
-	buf_block_t*	temp_block;
-	page_t*		temp_page;
-	ulint		data_size1;
-	ulint		data_size2;
-	ulint		max_ins_size1;
-	ulint		max_ins_size2;
-	bool		success		= false;
-	ulint		pos;
-	bool		log_compressed;
-	bool		is_spatial;
-
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	btr_assert_not_corrupted(block, index);
-	ut_ad(fil_page_index_page_check(block->frame));
-	ut_ad(index->is_dummy
-	      || block->page.id.space() == index->table->space->id);
-	ut_ad(index->is_dummy
-	      || block->page.id.page_no() != index->page
-	      || !page_has_siblings(page));
-#ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-	data_size1 = page_get_data_size(page);
-	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
-	/* Turn logging off */
-	mtr_log_t	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+  const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
 
-	temp_block = buf_block_alloc(buf_pool);
-	temp_page = temp_block->frame;
+  buf_block_t *const block= cursor->block;
 
-	MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!is_buf_block_get_page_zip(block));
+  btr_assert_not_corrupted(block, index);
+  ut_ad(fil_page_index_page_check(block->frame));
+  ut_ad(index->is_dummy ||
+        block->page.id().space() == index->table->space->id);
+  ut_ad(index->is_dummy || block->page.id().page_no() != index->page ||
+        !page_has_siblings(block->frame));
 
-	/* This function can be called by log redo with a "dummy" index.
-	So we would trust more on the original page's type */
-	is_spatial = (fil_page_get_type(page) == FIL_PAGE_RTREE
-		      || dict_index_is_spatial(index));
+  buf_block_t *old= buf_block_alloc();
+  /* Copy the old page to temporary space */
+  memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->frame, block->frame, srv_page_size);
 
-	/* Copy the old page to temporary space */
-	buf_frame_copy(temp_page, page);
+  btr_search_drop_page_hash_index(block);
 
-	if (!recovery) {
-		btr_search_drop_page_hash_index(block);
-	}
-
-	/* Save the cursor position. */
-	pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor));
-
-	/* Recreate the page: note that global data on page (possible
-	segment headers, next page-field, etc.) is preserved intact */
-
-	page_create(block, mtr, dict_table_is_comp(index->table), is_spatial);
-
-	/* Copy the records from the temporary space to the recreated page;
-	do not copy the lock bits yet */
-
-	page_copy_rec_list_end_no_locks(block, temp_block,
-					page_get_infimum_rec(temp_page),
-					index, mtr);
-
-	/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
-	memcpy(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
-	       temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
-	/* PAGE_MAX_TRX_ID is unused in clustered index pages
-	(other than the root where it is repurposed as PAGE_ROOT_AUTO_INC),
-	non-leaf pages, and in temporary tables. It was always
-	zero-initialized in page_create() in all InnoDB versions.
-	PAGE_MAX_TRX_ID must be nonzero on dict_index_is_sec_or_ibuf()
-	leaf pages.
-
-	During redo log apply, dict_index_is_sec_or_ibuf() always
-	holds, even for clustered indexes. */
-	ut_ad(recovery || index->table->is_temporary()
-	      || !page_is_leaf(temp_page)
-	      || !dict_index_is_sec_or_ibuf(index)
-	      || page_get_max_trx_id(page) != 0);
-	/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
-	clustered index root pages. */
-	ut_ad(recovery
-	      || page_get_max_trx_id(page) == 0
-	      || (dict_index_is_sec_or_ibuf(index)
-		  ? page_is_leaf(temp_page)
-		  : block->page.id.page_no() == index->page));
-
-	/* If innodb_log_compressed_pages is ON, page reorganize should log the
-	compressed page image.*/
-	log_compressed = page_zip && page_zip_log_pages;
-
-	if (log_compressed) {
-		mtr_set_log_mode(mtr, log_mode);
-	}
-
-	if (page_zip
-	    && !page_zip_compress(page_zip, page, index, z_level, mtr)) {
-
-		/* Restore the old page and exit. */
-#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
-		/* Check that the bytes that we skip are identical. */
-		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
-		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
-			     PAGE_HEADER + PAGE_N_RECS + temp_page,
-			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
-		ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
-			     srv_page_size - FIL_PAGE_DATA_END + temp_page,
-			     FIL_PAGE_DATA_END));
-#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
-
-		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
-		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
-		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
-		       srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
-
-#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
-		ut_a(!memcmp(page, temp_page, srv_page_size));
-#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
-
-		goto func_exit;
-	}
-
-	data_size2 = page_get_data_size(page);
-	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
-
-	if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
-		ib::error()
-			<< "Page old data size " << data_size1
-			<< " new data size " << data_size2
-			<< ", page old max ins size " << max_ins_size1
-			<< " new max ins size " << max_ins_size2;
-
-		ib::error() << BUG_REPORT_MSG;
-		ut_ad(0);
-	} else {
-		success = true;
-	}
-
-	/* Restore the cursor position. */
-	if (pos > 0) {
-		cursor->rec = page_rec_get_nth(page, pos);
-	} else {
-		ut_ad(cursor->rec == page_get_infimum_rec(page));
-	}
+  /* Save the cursor position. */
+  const ulint pos= page_rec_get_n_recs_before(cursor->rec);
 
-#ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
+  page_create(block, mtr, index->table->not_redundant());
+  if (index->is_spatial())
+    block->frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
 
-	if (!recovery) {
-		if (block->page.id.page_no() == index->page
-		    && fil_page_get_type(temp_page) == FIL_PAGE_TYPE_INSTANT) {
-			/* Preserve the PAGE_INSTANT information. */
-			ut_ad(!page_zip);
-			ut_ad(index->is_instant());
-			memcpy(FIL_PAGE_TYPE + page,
-			       FIL_PAGE_TYPE + temp_page, 2);
-			memcpy(PAGE_HEADER + PAGE_INSTANT + page,
-			       PAGE_HEADER + PAGE_INSTANT + temp_page, 2);
-			if (!index->table->instant) {
-			} else if (page_is_comp(page)) {
-				memcpy(PAGE_NEW_INFIMUM + page,
-				       PAGE_NEW_INFIMUM + temp_page, 8);
-				memcpy(PAGE_NEW_SUPREMUM + page,
-				       PAGE_NEW_SUPREMUM + temp_page, 8);
-			} else {
-				memcpy(PAGE_OLD_INFIMUM + page,
-				       PAGE_OLD_INFIMUM + temp_page, 8);
-				memcpy(PAGE_OLD_SUPREMUM + page,
-				       PAGE_OLD_SUPREMUM + temp_page, 8);
-			}
-		}
+  static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                FIL_PAGE_RTREE, "compatibility");
 
-		if (!dict_table_is_locking_disabled(index->table)) {
-			/* Update the record lock bitmaps */
-			lock_move_reorganize_page(block, temp_block);
-		}
-	}
+  /* Copy the records from the temporary space to the recreated page;
+  do not copy the lock bits yet */
 
-func_exit:
-	buf_block_free(temp_block);
+  page_copy_rec_list_end_no_locks(block, old, page_get_infimum_rec(old->frame),
+                                  index, mtr);
 
-	/* Restore logging mode */
-	mtr_set_log_mode(mtr, log_mode);
-
-	if (success) {
-		mlog_id_t	type;
-		byte*		log_ptr;
-
-		/* Write the log record */
-		if (page_zip) {
-			ut_ad(page_is_comp(page));
-			type = MLOG_ZIP_PAGE_REORGANIZE;
-		} else if (page_is_comp(page)) {
-			type = MLOG_COMP_PAGE_REORGANIZE;
-		} else {
-			type = MLOG_PAGE_REORGANIZE;
-		}
-
-		log_ptr = log_compressed
-			? NULL
-			: mlog_open_and_write_index(
-				mtr, page, index, type,
-				page_zip ? 1 : 0);
-
-		/* For compressed pages write the compression level. */
-		if (log_ptr && page_zip) {
-			mach_write_to_1(log_ptr, z_level);
-			mlog_close(mtr, log_ptr + 1);
-		}
-
-		MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
-	}
-
-	if (UNIV_UNLIKELY(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT)) {
-		/* Log the PAGE_INSTANT information. */
-		ut_ad(!page_zip);
-		ut_ad(index->is_instant());
-		ut_ad(!recovery);
-		mlog_write_ulint(FIL_PAGE_TYPE + page, FIL_PAGE_TYPE_INSTANT,
-				 MLOG_2BYTES, mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + page,
-				 mach_read_from_2(PAGE_HEADER + PAGE_INSTANT
-						  + page),
-				 MLOG_2BYTES, mtr);
-		if (!index->table->instant) {
-		} else if (page_is_comp(page)) {
-			mlog_log_string(PAGE_NEW_INFIMUM + page, 8, mtr);
-			mlog_log_string(PAGE_NEW_SUPREMUM + page, 8, mtr);
-		} else {
-			mlog_log_string(PAGE_OLD_INFIMUM + page, 8, mtr);
-			mlog_log_string(PAGE_OLD_SUPREMUM + page, 8, mtr);
-		}
-	}
+  /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+  ut_ad(!page_get_max_trx_id(block->frame));
+  memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->frame,
+                    PAGE_MAX_TRX_ID + PAGE_HEADER + old->frame, 8);
+#ifdef UNIV_DEBUG
+  if (page_get_max_trx_id(block->frame))
+    /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+    clustered index root pages. */
+    ut_ad(dict_index_is_sec_or_ibuf(index)
+          ? page_is_leaf(block->frame)
+          : block->page.id().page_no() == index->page);
+  else
+    /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
+    the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
+    pages, and in temporary tables.  It was always zero-initialized in
+    page_create().  PAGE_MAX_TRX_ID must be nonzero on
+    dict_index_is_sec_or_ibuf() leaf pages. */
+    ut_ad(index->table->is_temporary() || !page_is_leaf(block->frame) ||
+          !dict_index_is_sec_or_ibuf(index));
+#endif
 
-	return(success);
+  const uint16_t data_size1= page_get_data_size(old->frame);
+  const uint16_t data_size2= page_get_data_size(block->frame);
+  const ulint max1= page_get_max_insert_size_after_reorganize(old->frame, 1);
+  const ulint max2= page_get_max_insert_size_after_reorganize(block->frame, 1);
+
+  if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2))
+    ib::fatal() << "Page old data size " << data_size1
+                << " new data size " << data_size2
+                << ", page old max ins size " << max1
+                << " new max ins size " << max2;
+
+  /* Restore the cursor position. */
+  if (pos)
+    cursor->rec = page_rec_get_nth(block->frame, pos);
+  else
+    ut_ad(cursor->rec == page_get_infimum_rec(block->frame));
+
+  if (block->page.id().page_no() == index->page &&
+      fil_page_get_type(old->frame) == FIL_PAGE_TYPE_INSTANT)
+  {
+    /* Preserve the PAGE_INSTANT information. */
+    ut_ad(index->is_instant());
+    memcpy_aligned<2>(FIL_PAGE_TYPE + block->frame,
+                      FIL_PAGE_TYPE + old->frame, 2);
+    memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->frame,
+                      PAGE_HEADER + PAGE_INSTANT + old->frame, 2);
+    if (!index->table->instant);
+    else if (page_is_comp(block->frame))
+    {
+      memcpy(PAGE_NEW_INFIMUM + block->frame,
+             PAGE_NEW_INFIMUM + old->frame, 8);
+      memcpy(PAGE_NEW_SUPREMUM + block->frame,
+             PAGE_NEW_SUPREMUM + old->frame, 8);
+    }
+    else
+    {
+      memcpy(PAGE_OLD_INFIMUM + block->frame,
+             PAGE_OLD_INFIMUM + old->frame, 8);
+      memcpy(PAGE_OLD_SUPREMUM + block->frame,
+             PAGE_OLD_SUPREMUM + old->frame, 8);
+    }
+  }
+
+  ut_ad(!memcmp(old->frame, block->frame, PAGE_HEADER));
+  ut_ad(!memcmp(old->frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+                block->frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+                PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER)));
+
+  if (!dict_table_is_locking_disabled(index->table))
+    lock_move_reorganize_page(block, old);
+
+  /* Write log for the changes, if needed. */
+  mtr->set_log_mode(log_mode);
+  if (log_mode == MTR_LOG_ALL)
+  {
+    /* Check and log the changes in the page header. */
+    ulint a, e;
+    for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++)
+    {
+      if (old->frame[a] == block->frame[a])
+        continue;
+      while (--e, old->frame[e] == block->frame[e]);
+      e++;
+      ut_ad(a < e);
+      /* Write log for the changed page header fields. */
+      mtr->memcpy(*block, a, e - a);
+      break;
+    }
+
+    const uint16_t top= page_header_get_offs(block->frame, PAGE_HEAP_TOP);
+
+    if (page_is_comp(block->frame))
+    {
+      /* info_bits=0, n_owned=1, heap_no=0, status */
+      ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + block->frame,
+                    PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + old->frame, 3));
+      /* If the 'next' pointer of the infimum record has changed, log it. */
+      a= PAGE_NEW_INFIMUM - 2;
+      e= a + 2;
+      if (block->frame[a] == old->frame[a])
+        a++;
+      if (--e, block->frame[e] != old->frame[e])
+        e++;
+      if (ulint len= e - a)
+        mtr->memcpy(*block, a, len);
+      /* The infimum record itself must not change. */
+      ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->frame,
+                    PAGE_NEW_INFIMUM + old->frame, 8));
+      /* Log any change of the n_owned of the supremum record. */
+      a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES;
+      if (block->frame[a] != old->frame[a])
+        mtr->memcpy(*block, a, 1);
+      /* The rest of the supremum record must not change. */
+      ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1],
+                    PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM +
+                    REC_N_NEW_EXTRA_BYTES - 1));
+
+      /* Log the differences in the payload. */
+      for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++)
+      {
+        if (old->frame[a] == block->frame[a])
+          continue;
+        while (--e, old->frame[e] == block->frame[e]);
+        e++;
+        ut_ad(a < e);
+	/* TODO: write MEMMOVE records to minimize this further! */
+        mtr->memcpy(*block, a, e - a);
+	break;
+      }
+    }
+    else
+    {
+      /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */
+      ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + block->frame,
+                    PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + old->frame, 4));
+      /* If the 'next' pointer of the infimum record has changed, log it. */
+      a= PAGE_OLD_INFIMUM - 2;
+      e= a + 2;
+      if (block->frame[a] == old->frame[a])
+        a++;
+      if (--e, block->frame[e] != old->frame[e])
+        e++;
+      if (ulint len= e - a)
+        mtr->memcpy(*block, a, len);
+      /* The infimum record itself must not change. */
+      ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->frame,
+                    PAGE_OLD_INFIMUM + old->frame, 8));
+      /* Log any change of the n_owned of the supremum record. */
+      a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES;
+      if (block->frame[a] != old->frame[a])
+        mtr->memcpy(*block, a, 1);
+      ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1],
+                    PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM +
+                    REC_N_OLD_EXTRA_BYTES - 1));
+
+      /* Log the differences in the payload. */
+      for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++)
+      {
+        if (old->frame[a] == block->frame[a])
+          continue;
+        while (--e, old->frame[e] == block->frame[e]);
+        e++;
+        ut_ad(a < e);
+	/* TODO: write MEMMOVE records to minimize this further! */
+        mtr->memcpy(*block, a, e - a);
+	break;
+      }
+    }
+
+    e= srv_page_size - PAGE_DIR;
+    a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->frame);
+
+    /* Zero out the payload area. */
+    mtr->memset(*block, top, a - top, 0);
+
+    /* Log changes to the page directory. */
+    for (; a < e; a++)
+    {
+      if (old->frame[a] == block->frame[a])
+        continue;
+      while (--e, old->frame[e] == block->frame[e]);
+      e++;
+      ut_ad(a < e);
+      /* Write log for the changed page directory slots. */
+      mtr->memcpy(*block, a, e - a);
+      break;
+    }
+  }
+
+  buf_block_free(old);
+
+  MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
+  MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
 }
 
 /*************************************************************//**
@@ -1678,22 +1590,21 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
 @retval false if it is a compressed page, and recompression failed */
 bool
 btr_page_reorganize_block(
-/*======================*/
-	bool		recovery,/*!< in: true if called in recovery:
-				locks should not be updated, i.e.,
-				there cannot exist locks on the
-				page, and a hash index should not be
-				dropped: it cannot exist */
 	ulint		z_level,/*!< in: compression level to be used
 				if dealing with compressed page */
 	buf_block_t*	block,	/*!< in/out: B-tree page */
 	dict_index_t*	index,	/*!< in: the index tree of the page */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
+	if (buf_block_get_page_zip(block)) { /* compressed page */
+		return page_zip_reorganize(block, index, z_level, mtr, true);
+	}
+
 	page_cur_t	cur;
 	page_cur_set_before_first(block, &cur);
 
-	return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr));
+	btr_page_reorganize_low(&cur, index, mtr); /* yields no status */
+	return true; /* only the compressed path can report failure */
 }
 
 /*************************************************************//**
@@ -1714,50 +1625,24 @@ btr_page_reorganize(
 	dict_index_t*	index,	/*!< in: the index tree of the page */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	return(btr_page_reorganize_low(false, page_zip_level,
-				       cursor, index, mtr));
-}
-
-/***********************************************************//**
-Parses a redo log record of reorganizing a page.
-@return end of log record or NULL */
-byte*
-btr_parse_page_reorganize(
-/*======================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	bool		compressed,/*!< in: true if compressed page */
-	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
-	mtr_t*		mtr)	/*!< in: mtr or NULL */
-{
-	ulint	level = page_zip_level;
-
-	ut_ad(ptr != NULL);
-	ut_ad(end_ptr != NULL);
-	ut_ad(index != NULL);
-
-	/* If dealing with a compressed page the record has the
-	compression level used during original compression written in
-	one byte. Otherwise record is empty. */
-	if (compressed) {
-		if (ptr == end_ptr) {
-			return(NULL);
-		}
-
-		level = mach_read_from_1(ptr);
-
-		ut_a(level <= 9);
-		++ptr;
-	} else {
-		level = page_zip_level;
+	if (!buf_block_get_page_zip(cursor->block)) {
+		btr_page_reorganize_low(cursor, index, mtr);
+		return true;
 	}
 
-	if (block != NULL) {
-		btr_page_reorganize_block(true, level, block, index, mtr);
+	ulint pos = page_rec_get_n_recs_before(cursor->rec);
+	if (!page_zip_reorganize(cursor->block, index, page_zip_level, mtr,
+				 true)) {
+		return false;
+	}
+	if (pos) {
+		cursor->rec = page_rec_get_nth(cursor->block->frame, pos);
+	} else {
+		ut_ad(cursor->rec == page_get_infimum_rec(
+			      cursor->block->frame));
 	}
 
-	return(ptr);
+	return true;
 }
 
 /** Empty an index page (possibly the root page). @see btr_page_create().
@@ -1774,14 +1659,12 @@ btr_page_empty(
 	ulint		level,
 	mtr_t*		mtr)
 {
-	page_t*	page = buf_block_get_frame(block);
-
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(page_zip == buf_block_get_page_zip(block));
 	ut_ad(!index->is_dummy);
-	ut_ad(index->table->space->id == block->page.id.space());
+	ut_ad(index->table->space->id == block->page.id().space());
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+	ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
 #endif /* UNIV_ZIP_DEBUG */
 
 	btr_search_drop_page_hash_index(block);
@@ -1793,19 +1676,31 @@ btr_page_empty(
 	root page. */
 	const ib_uint64_t	autoinc
 		= dict_index_is_clust(index)
-		&& index->page == block->page.id.page_no()
-		? page_get_autoinc(page)
+		&& index->page == block->page.id().page_no()
+		? page_get_autoinc(block->frame)
 		: 0;
 
 	if (page_zip) {
 		page_create_zip(block, index, level, autoinc, mtr);
 	} else {
-		page_create(block, mtr, dict_table_is_comp(index->table),
-			    dict_index_is_spatial(index));
-		btr_page_set_level(page, NULL, level, mtr);
+		page_create(block, mtr, index->table->not_redundant());
+		if (index->is_spatial()) {
+			static_assert(((FIL_PAGE_INDEX & 0xff00)
+				       | byte(FIL_PAGE_RTREE))
+				      == FIL_PAGE_RTREE, "compatibility");
+			mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+				      byte(FIL_PAGE_RTREE));
+			if (mach_read_from_8(block->frame
+					     + FIL_RTREE_SPLIT_SEQ_NUM)) {
+				mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+					    8, 0);
+			}
+		}
+		mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+					       + block->frame, level);
 		if (autoinc) {
-			mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID + page,
-				       autoinc, mtr);
+			mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+				      + block->frame, autoinc);
 		}
 	}
 }
@@ -1822,7 +1717,7 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
 	ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT
 	      || fil_page_get_type(root->frame) == FIL_PAGE_INDEX);
 	ut_ad(!page_has_siblings(root->frame));
-	ut_ad(root->page.id.page_no() == index.page);
+	ut_ad(root->page.id().page_no() == index.page);
 
 	rec_t* infimum = page_get_infimum_rec(root->frame);
 	rec_t* supremum = page_get_supremum_rec(root->frame);
@@ -1845,30 +1740,73 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
 		}
 		break;
 	default:
-		ut_ad(!"wrong page type");
+		ut_ad("wrong page type" == 0);
 		/* fall through */
 	case FIL_PAGE_INDEX:
 		ut_ad(!page_is_comp(root->frame)
 		      || !page_get_instant(root->frame));
 		ut_ad(!memcmp(infimum, "infimum", 8));
 		ut_ad(!memcmp(supremum, "supremum", 8));
-		mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT,
-				 MLOG_2BYTES, mtr);
+		mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT);
 		ut_ad(i <= PAGE_NO_DIRECTION);
-		i |= index.n_core_fields << 3;
-		mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + root->frame, i,
-				 MLOG_2BYTES, mtr);
+		i |= static_cast<uint16_t>(index.n_core_fields << 3);
+		mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + root->frame,
+			      i);
 		break;
 	}
 
 	if (index.table->instant) {
-		mlog_memset(root, infimum - root->frame, 8, 0, mtr);
-		mlog_memset(root, supremum - root->frame, 7, 0, mtr);
-		mlog_write_ulint(&supremum[7], index.n_core_null_bytes,
-				 MLOG_1BYTE, mtr);
+		mtr->memset(root, infimum - root->frame, 8, 0);
+		mtr->memset(root, supremum - root->frame, 7, 0);
+		mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7],
+					       index.n_core_null_bytes);
 	}
 }
 
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in]      index   clustered index with instant ALTER TABLE
+@param[in]      all     whether to reset FIL_PAGE_TYPE and PAGE_INSTANT as well
+@param[in,out]  mtr     mini-transaction used for the root page writes */
+ATTRIBUTE_COLD
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr)
+{
+  ut_ad(!index.table->is_temporary());
+  ut_ad(index.is_primary());
+  if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr))
+  { /* Nothing is done when no root block is returned. */
+    byte *page_type= root->frame + FIL_PAGE_TYPE;
+    if (all)
+    { /* Restore the page type and rewrite the PAGE_INSTANT field. */
+      ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT ||
+            mach_read_from_2(page_type) == FIL_PAGE_INDEX);
+      mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX);
+      byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame;
+      mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant,
+                                     page_ptr_get_direction(instant + 1));
+    }
+    else /* the page type is not written; it must be FIL_PAGE_TYPE_INSTANT */
+      ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT);
+    static const byte supremuminfimum[8 + 8] = "supremuminfimum";
+    uint16_t infimum, supremum; /* byte offsets of the records in the page */
+    if (page_is_comp(root->frame))
+    {
+      infimum= PAGE_NEW_INFIMUM;
+      supremum= PAGE_NEW_SUPREMUM;
+    }
+    else
+    {
+      infimum= PAGE_OLD_INFIMUM;
+      supremum= PAGE_OLD_SUPREMUM;
+    }
+    ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) ==
+          !memcmp(&root->frame[supremum], supremuminfimum, 8));
+    mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[infimum],
+                                  supremuminfimum + 8, 8); /* "infimum\0" */
+    mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[supremum],
+                                  supremuminfimum, 8); /* "supremum" */
+  }
+}
+
 /*************************************************************//**
 Makes tree one level higher by splitting the root, and inserts
 the tuple. It is assumed that mtr contains an x-latch on the tree.
@@ -1891,8 +1829,6 @@ btr_root_raise_and_insert(
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	dict_index_t*	index;
-	page_t*		root;
-	page_t*		new_page;
 	ulint		new_page_no;
 	rec_t*		rec;
 	dtuple_t*	node_ptr;
@@ -1901,40 +1837,39 @@ btr_root_raise_and_insert(
 	page_cur_t*	page_cursor;
 	page_zip_des_t*	root_page_zip;
 	page_zip_des_t*	new_page_zip;
-	buf_block_t*	root_block;
+	buf_block_t*	root;
 	buf_block_t*	new_block;
 
-	root = btr_cur_get_page(cursor);
-	root_block = btr_cur_get_block(cursor);
-	root_page_zip = buf_block_get_page_zip(root_block);
-	ut_ad(!page_is_empty(root));
+	root = btr_cur_get_block(cursor);
+	root_page_zip = buf_block_get_page_zip(root);
+	ut_ad(!page_is_empty(root->frame));
 	index = btr_cur_get_index(cursor);
 	ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
+	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root->frame,
+						 index));
 #endif /* UNIV_ZIP_DEBUG */
 #ifdef UNIV_BTR_DEBUG
 	if (!dict_index_is_ibuf(index)) {
 		ulint	space = index->table->space_id;
 
 		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
-					    + root, space));
+					    + root->frame, space));
 		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
-					    + root, space));
+					    + root->frame, space));
 	}
 
-	ut_a(dict_index_get_page(index) == page_get_page_no(root));
+	ut_a(dict_index_get_page(index) == root->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
-	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					MTR_MEMO_X_LOCK
-					| MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX));
 
 	/* Allocate a new page to the tree. Root splitting is done by first
 	moving the root records to the new page, emptying the root, putting
 	a node pointer to the new page, and then splitting the new page. */
 
-	level = btr_page_get_level(root);
+	level = btr_page_get_level(root->frame);
 
 	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
 
@@ -1942,7 +1877,6 @@ btr_root_raise_and_insert(
 		return(NULL);
 	}
 
-	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	ut_a(!new_page_zip == !root_page_zip);
 	ut_a(!new_page_zip
@@ -1950,25 +1884,17 @@ btr_root_raise_and_insert(
 	     == page_zip_get_size(root_page_zip));
 
 	btr_page_create(new_block, new_page_zip, index, level, mtr);
-
-	/* Set the next node and previous node fields of new page */
-	compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
-	compile_time_assert(FIL_NULL == 0xffffffff);
-#if MYSQL_VERSION_ID < 100500
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
-		/* Avoid tripping the ut_a() in mlog_parse_nbytes()
-		when crash-downgrading to an earlier MariaDB 10.4 version. */
-		btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
-		btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
-	} else {
-		mlog_memset(new_block, FIL_PAGE_PREV, 8, 0xff, mtr);
-	}
-#else
-	mlog_memset(new_block, FIL_PAGE_PREV, 8, 0xff, mtr);
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
-		memset(new_page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+	if (page_has_siblings(new_block->frame)) {
+		compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+		compile_time_assert(FIL_NULL == 0xffffffff);
+		static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+		memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8);
+		mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff);
+		if (UNIV_LIKELY_NULL(new_page_zip)) {
+			memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV,
+					  0xff, 8);
+		}
 	}
-#endif
 
 	/* Copy the records from root to the new page one by one. */
 
@@ -1976,53 +1902,57 @@ btr_root_raise_and_insert(
 #ifdef UNIV_ZIP_COPY
 	    || new_page_zip
 #endif /* UNIV_ZIP_COPY */
-	    || !page_copy_rec_list_end(new_block, root_block,
-				       page_get_infimum_rec(root),
+	    || !page_copy_rec_list_end(new_block, root,
+				       page_get_infimum_rec(root->frame),
 				       index, mtr)) {
 		ut_a(new_page_zip);
 
 		/* Copy the page byte for byte. */
-		page_zip_copy_recs(new_page_zip, new_page,
-				   root_page_zip, root, index, mtr);
+		page_zip_copy_recs(new_block,
+				   root_page_zip, root->frame, index, mtr);
 
 		/* Update the lock table and possible hash index. */
-		lock_move_rec_list_end(new_block, root_block,
-				       page_get_infimum_rec(root));
+		lock_move_rec_list_end(new_block, root,
+				       page_get_infimum_rec(root->frame));
 
 		/* Move any existing predicate locks */
 		if (dict_index_is_spatial(index)) {
-			lock_prdt_rec_move(new_block, root_block);
+			lock_prdt_rec_move(new_block, root);
 		} else {
 			btr_search_move_or_delete_hash_entries(
-				new_block, root_block);
+				new_block, root);
 		}
 	}
 
+	constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID;
 	if (dict_index_is_sec_or_ibuf(index)) {
 		/* In secondary indexes and the change buffer,
 		PAGE_MAX_TRX_ID can be reset on the root page, because
 		the field only matters on leaf pages, and the root no
 		longer is a leaf page. (Older versions of InnoDB did
 		set PAGE_MAX_TRX_ID on all secondary index pages.) */
-		if (root_page_zip) {
-			byte* p = PAGE_HEADER + PAGE_MAX_TRX_ID + root;
-			memset(p, 0, 8);
-			page_zip_write_header(root_page_zip, p, 8, mtr);
-		} else {
-			mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID
-				       + root, 0, mtr);
+		byte* p = my_assume_aligned<8>(
+			PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame);
+		if (mach_read_from_8(p)) {
+			mtr->memset(root, max_trx_id, 8, 0);
+			if (UNIV_LIKELY_NULL(root->page.zip.data)) {
+				memset_aligned<8>(max_trx_id
+						  + root->page.zip.data, 0, 8);
+			}
 		}
 	} else {
 		/* PAGE_ROOT_AUTO_INC is only present in the clustered index
 		root page; on other clustered index pages, we want to reserve
 		the field PAGE_MAX_TRX_ID for future use. */
-		if (new_page_zip) {
-			byte* p = PAGE_HEADER + PAGE_MAX_TRX_ID + new_page;
-			memset(p, 0, 8);
-			page_zip_write_header(new_page_zip, p, 8, mtr);
-		} else {
-			mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID
-				       + new_page, 0, mtr);
+		byte* p = my_assume_aligned<8>(
+			PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame);
+		if (mach_read_from_8(p)) {
+			mtr->memset(new_block, max_trx_id, 8, 0);
+			if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+				memset_aligned<8>(max_trx_id
+						  + new_block->page.zip.data,
+						  0, 8);
+			}
 		}
 	}
 
@@ -2032,7 +1962,7 @@ btr_root_raise_and_insert(
 	root page: we cannot discard the lock structs on the root page */
 
 	if (!dict_table_is_locking_disabled(index->table)) {
-		lock_update_root_raise(new_block, root_block);
+		lock_update_root_raise(new_block, root);
 	}
 
 	/* Create a memory heap where the node pointer is stored */
@@ -2040,8 +1970,8 @@ btr_root_raise_and_insert(
 		*heap = mem_heap_create(1000);
 	}
 
-	rec = page_rec_get_next(page_get_infimum_rec(new_page));
-	new_page_no = new_block->page.id.page_no();
+	rec = page_rec_get_next(page_get_infimum_rec(new_block->frame));
+	new_page_no = new_block->page.id().page_no();
 
 	/* Build the node pointer (= node key and page address) for the
 	child */
@@ -2063,28 +1993,28 @@ btr_root_raise_and_insert(
 			     | REC_INFO_MIN_REC_FLAG);
 
 	/* Rebuild the root page to get free space */
-	btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
+	btr_page_empty(root, root_page_zip, index, level + 1, mtr);
 	/* btr_page_empty() is supposed to zero-initialize the field. */
-	ut_ad(!page_get_instant(root_block->frame));
+	ut_ad(!page_get_instant(root->frame));
 
 	if (index->is_instant()) {
 		ut_ad(!root_page_zip);
-		btr_set_instant(root_block, *index, mtr);
+		btr_set_instant(root, *index, mtr);
 	}
 
-	ut_ad(!page_has_siblings(root));
+	ut_ad(!page_has_siblings(root->frame));
 
 	page_cursor = btr_cur_get_page_cur(cursor);
 
 	/* Insert node pointer to the root */
 
-	page_cur_set_before_first(root_block, page_cursor);
+	page_cur_set_before_first(root, page_cursor);
 
 	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
 					     index, offsets, heap, 0, mtr);
 
 	/* The root page should only contain the node pointer
-	to new_page at this point.  Thus, the data should fit. */
+	to new_block at this point.  Thus, the data should fit. */
 	ut_a(node_ptr_rec);
 
 	/* We play safe and reset the free bits for the new page */
@@ -2522,23 +2452,15 @@ btr_attach_half_pages(
 	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint		prev_page_no;
-	ulint		next_page_no;
-	ulint		level;
-	page_t*		page		= buf_block_get_frame(block);
-	page_t*		lower_page;
-	page_t*		upper_page;
-	ulint		lower_page_no;
-	ulint		upper_page_no;
-	page_zip_des_t*	lower_page_zip;
-	page_zip_des_t*	upper_page_zip;
 	dtuple_t*	node_ptr_upper;
 	mem_heap_t*	heap;
 	buf_block_t*	prev_block = NULL;
 	buf_block_t*	next_block = NULL;
+	buf_block_t*	lower_block;
+	buf_block_t*	upper_block;
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX));
 
 	/* Create a memory heap where the data tuple is stored */
 	heap = mem_heap_create(1024);
@@ -2549,12 +2471,8 @@ btr_attach_half_pages(
 		btr_cur_t	cursor;
 		rec_offs*	offsets;
 
-		lower_page = buf_block_get_frame(new_block);
-		lower_page_no = new_block->page.id.page_no();
-		lower_page_zip = buf_block_get_page_zip(new_block);
-		upper_page = buf_block_get_frame(block);
-		upper_page_no = block->page.id.page_no();
-		upper_page_zip = buf_block_get_page_zip(block);
+		lower_block = new_block;
+		upper_block = block;
 
 		/* Look up the index for the node pointer to page */
 		offsets = btr_page_get_father_block(NULL, heap, index,
@@ -2564,46 +2482,39 @@ btr_attach_half_pages(
 		address of the new lower half */
 
 		btr_node_ptr_set_child_page_no(
+			btr_cur_get_block(&cursor),
 			btr_cur_get_rec(&cursor),
-			btr_cur_get_page_zip(&cursor),
-			offsets, lower_page_no, mtr);
+			offsets, lower_block->page.id().page_no(), mtr);
 		mem_heap_empty(heap);
 	} else {
-		lower_page = buf_block_get_frame(block);
-		lower_page_no = block->page.id.page_no();
-		lower_page_zip = buf_block_get_page_zip(block);
-		upper_page = buf_block_get_frame(new_block);
-		upper_page_no = new_block->page.id.page_no();
-		upper_page_zip = buf_block_get_page_zip(new_block);
+		lower_block = block;
+		upper_block = new_block;
 	}
 
-	/* Get the previous and next pages of page */
-	prev_page_no = btr_page_get_prev(page);
-	next_page_no = btr_page_get_next(page);
+	/* Get the level of the split pages */
+	const ulint level = btr_page_get_level(buf_block_get_frame(block));
+	ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
 
-	const ulint	space = block->page.id.space();
+	/* Get the previous and next pages of page */
+	const uint32_t prev_page_no = btr_page_get_prev(block->frame);
+	const uint32_t next_page_no = btr_page_get_next(block->frame);
 
 	/* for consistency, both blocks should be locked, before change */
 	if (prev_page_no != FIL_NULL && direction == FSP_DOWN) {
-		prev_block = btr_block_get(
-			page_id_t(space, prev_page_no), block->zip_size(),
-			RW_X_LATCH, index, mtr);
+		prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH,
+					   !level, mtr);
 	}
 	if (next_page_no != FIL_NULL && direction != FSP_DOWN) {
-		next_block = btr_block_get(
-			page_id_t(space, next_page_no), block->zip_size(),
-			RW_X_LATCH, index, mtr);
+		next_block = btr_block_get(*index, next_page_no, RW_X_LATCH,
+					   !level, mtr);
 	}
 
-	/* Get the level of the split pages */
-	level = btr_page_get_level(buf_block_get_frame(block));
-	ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
-
 	/* Build the node pointer (= node key and page address) for the upper
 	half */
 
-	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
-						   upper_page_no, heap, level);
+	node_ptr_upper = dict_index_build_node_ptr(
+		index, split_rec, upper_block->page.id().page_no(),
+		heap, level);
 
 	/* Insert it next to the pointer to the lower half. Note that this
 	may generate recursion leading to a split on the higher level. */
@@ -2618,46 +2529,38 @@ btr_attach_half_pages(
 
 	if (prev_block) {
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
+		ut_a(page_is_comp(prev_block->frame)
+		     == page_is_comp(block->frame));
 		ut_a(btr_page_get_next(prev_block->frame)
-		     == block->page.id.page_no());
+		     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
-
-		btr_page_set_next(buf_block_get_frame(prev_block),
-				  buf_block_get_page_zip(prev_block),
-				  lower_page_no, mtr);
+		btr_page_set_next(prev_block, lower_block->page.id().page_no(),
+				  mtr);
 	}
 
 	if (next_block) {
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+		ut_a(page_is_comp(next_block->frame)
+		     == page_is_comp(block->frame));
 		ut_a(btr_page_get_prev(next_block->frame)
-		     == page_get_page_no(page));
+		     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
-
-		btr_page_set_prev(buf_block_get_frame(next_block),
-				  buf_block_get_page_zip(next_block),
-				  upper_page_no, mtr);
+		btr_page_set_prev(next_block, upper_block->page.id().page_no(),
+				  mtr);
 	}
 
 	if (direction == FSP_DOWN) {
-		/* lower_page is new */
-		btr_page_set_prev(lower_page, lower_page_zip,
-				  prev_page_no, mtr);
+		ut_ad(lower_block == new_block);
+		ut_ad(btr_page_get_next(upper_block->frame) == next_page_no);
+		btr_page_set_prev(lower_block, prev_page_no, mtr);
 	} else {
-		ut_ad(btr_page_get_prev(lower_page) == prev_page_no);
+		ut_ad(upper_block == new_block);
+		ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no);
+		btr_page_set_next(upper_block, next_page_no, mtr);
 	}
 
-	btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
-	btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
-
-	if (direction != FSP_DOWN) {
-		/* upper_page is new */
-		btr_page_set_next(upper_page, upper_page_zip,
-				  next_page_no, mtr);
-	} else {
-		ut_ad(btr_page_get_next(upper_page) == next_page_no);
-	}
+	btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr);
+	btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr);
 }
 
 /*************************************************************//**
@@ -2720,10 +2623,9 @@ btr_insert_into_right_sibling(
 	page_t*		page = buf_block_get_frame(block);
 	const uint32_t	next_page_no = btr_page_get_next(page);
 
-	ut_ad(mtr_memo_contains_flagged(
-		      mtr, dict_index_get_lock(cursor->index),
-		      MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(heap);
 
 	if (next_page_no == FIL_NULL || !page_rec_is_supremum(
@@ -2739,11 +2641,11 @@ btr_insert_into_right_sibling(
 	rec_t*		rec = NULL;
 	ulint		max_size;
 
-	const ulint	space = block->page.id.space();
-
-	next_block = btr_block_get(
-		page_id_t(space, next_page_no), block->zip_size(),
-		RW_X_LATCH, cursor->index, mtr);
+	next_block = btr_block_get(*cursor->index, next_page_no, RW_X_LATCH,
+				   page_is_leaf(page), mtr);
+	if (UNIV_UNLIKELY(!next_block)) {
+		return NULL;
+	}
 	next_page = buf_block_get_frame(next_block);
 
 	bool	is_leaf = page_is_leaf(next_page);
@@ -2802,7 +2704,7 @@ btr_insert_into_right_sibling(
 	}
 
 	dtuple_t*	node_ptr = dict_index_build_node_ptr(
-		cursor->index, rec, next_block->page.id.page_no(),
+		cursor->index, rec, next_block->page.id().page_no(),
 		heap, level);
 
 	btr_insert_on_non_leaf_level(
@@ -2882,9 +2784,8 @@ func_start:
 	mem_heap_empty(*heap);
 	*offsets = NULL;
 
-	ut_ad(mtr_memo_contains_flagged(mtr,
-					dict_index_get_lock(cursor->index),
-					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
 	ut_ad(!dict_index_is_online_ddl(cursor->index)
 	      || (flags & BTR_CREATE_FLAG)
 	      || dict_index_is_clust(cursor->index));
@@ -2895,7 +2796,7 @@ func_start:
 	page = buf_block_get_frame(block);
 	page_zip = buf_block_get_page_zip(block);
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(!page_is_empty(page));
 
 	/* try to insert to the next page if possible before split */
@@ -2908,7 +2809,7 @@ func_start:
 	tuple to be inserted should be the first record on the upper
 	half-page */
 	bool insert_left = false;
-	ulint hint_page_no = block->page.id.page_no() + 1;
+	uint32_t hint_page_no = block->page.id().page_no() + 1;
 	byte direction = FSP_UP;
 
 	if (tuple && n_iterations > 0) {
@@ -2944,8 +2845,9 @@ func_start:
 			return(NULL););
 
 	/* 2. Allocate a new page to the index */
+	const uint16_t page_level = btr_page_get_level(page);
 	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
-				   btr_page_get_level(page), mtr, mtr);
+				   page_level, mtr, mtr);
 
 	if (!new_block) {
 		return(NULL);
@@ -2953,10 +2855,16 @@ func_start:
 
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
+
+	if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+		/* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+		to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+		memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4);
+	}
 	btr_page_create(new_block, new_page_zip, cursor->index,
-			btr_page_get_level(page), mtr);
+			page_level, mtr);
 	/* Only record the leaf level page splits. */
-	if (page_is_leaf(page)) {
+	if (!page_level) {
 		cursor->index->stat_defrag_n_page_split ++;
 		cursor->index->stat_defrag_modified_counter ++;
 		btr_defragment_save_defrag_stats_if_needed(cursor->index);
@@ -3003,6 +2911,7 @@ insert_empty:
 
 	/* 4. Do first the modifications in the tree structure */
 
+	/* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */
 	btr_attach_half_pages(flags, cursor->index, block,
 			      first_rec, new_block, direction, mtr);
 
@@ -3058,7 +2967,7 @@ insert_empty:
 			as appropriate.  Deleting will always succeed. */
 			ut_a(new_page_zip);
 
-			page_zip_copy_recs(new_page_zip, new_page,
+			page_zip_copy_recs(new_block,
 					   page_zip, page, cursor->index, mtr);
 			page_delete_rec_list_end(move_limit - page + new_page,
 						 new_block, cursor->index,
@@ -3101,7 +3010,7 @@ insert_empty:
 			as appropriate.  Deleting will always succeed. */
 			ut_a(new_page_zip);
 
-			page_zip_copy_recs(new_page_zip, new_page,
+			page_zip_copy_recs(new_block,
 					   page_zip, page, cursor->index, mtr);
 			page_delete_rec_list_start(move_limit - page
 						   + new_page, new_block,
@@ -3232,145 +3141,65 @@ func_exit:
 }
 
 /** Remove a page from the level list of pages.
-@param[in]	space		space where removed
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out]	page		page to remove
+@param[in]	block		page to remove
 @param[in]	index		index tree
-@param[in,out]	mtr		mini-transaction */
-dberr_t
-btr_level_list_remove_func(
-	ulint			space,
-	ulint			zip_size,
-	page_t*			page,
-	dict_index_t*		index,
-	mtr_t*			mtr)
+@param[in,out]	mtr		mini-transaction
+@return DB_SUCCESS, or DB_ERROR if a sibling block could not be obtained */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+                              const dict_index_t& index, mtr_t* mtr)
 {
-	ut_ad(page != NULL);
-	ut_ad(mtr != NULL);
-	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
-	ut_ad(space == page_get_space_id(page));
+	ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(block.zip_size() == index.table->space->zip_size());
+	ut_ad(index.table->space->id == block.page.id().space());
 	/* Get the previous and next page numbers of page */
 
+	const page_t* page = block.frame;
 	const uint32_t	prev_page_no = btr_page_get_prev(page);
 	const uint32_t	next_page_no = btr_page_get_next(page);
 
 	/* Update page links of the level */
 
 	if (prev_page_no != FIL_NULL) {
-		buf_block_t*	prev_block
-			= btr_block_get(page_id_t(space, prev_page_no),
-					zip_size, RW_X_LATCH, index, mtr);
-
-		page_t*		prev_page
-			= buf_block_get_frame(prev_block);
+		buf_block_t*	prev_block = btr_block_get(
+			index, prev_page_no, RW_X_LATCH, page_is_leaf(page),
+			mtr);
+		if (!prev_block) {
+			/* Same failure handling as for next_block. */
+			return DB_ERROR;
+		}
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(prev_page) == page_is_comp(page));
-		ut_a(!memcmp(prev_page + FIL_PAGE_NEXT, page + FIL_PAGE_OFFSET,
-			     4));
+		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
+		static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
+		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+		ut_a(!memcmp_aligned<4>(prev_block->frame + FIL_PAGE_NEXT,
+					page + FIL_PAGE_OFFSET, 4));
 #endif /* UNIV_BTR_DEBUG */
 
-		btr_page_set_next(prev_page,
-				  buf_block_get_page_zip(prev_block),
-				  next_page_no, mtr);
+		btr_page_set_next(prev_block, next_page_no, mtr);
 	}
 
 	if (next_page_no != FIL_NULL) {
-		buf_block_t*	next_block
-			= btr_block_get(
-				page_id_t(space, next_page_no), zip_size,
-				RW_X_LATCH, index, mtr);
+		buf_block_t*	next_block = btr_block_get(
+			index, next_page_no, RW_X_LATCH, page_is_leaf(page),
+			mtr);
 
 		if (!next_block) {
 			return DB_ERROR;
 		}
-
-		page_t*		next_page
-			= buf_block_get_frame(next_block);
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(next_page) == page_is_comp(page));
-		ut_a(!memcmp(next_page + FIL_PAGE_PREV, page + FIL_PAGE_OFFSET,
-			     4));
+		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+		static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
+		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+		ut_a(!memcmp_aligned<4>(next_block->frame + FIL_PAGE_PREV,
+					page + FIL_PAGE_OFFSET, 4));
 #endif /* UNIV_BTR_DEBUG */
 
-		btr_page_set_prev(next_page,
-				  buf_block_get_page_zip(next_block),
-				  prev_page_no, mtr);
+		btr_page_set_prev(next_block, prev_page_no, mtr);
 	}
 
 	return DB_SUCCESS;
 }
 
-/****************************************************************//**
-Writes the redo log record for setting an index record as the predefined
-minimum record. */
-UNIV_INLINE
-void
-btr_set_min_rec_mark_log(
-/*=====================*/
-	rec_t*		rec,	/*!< in: record */
-	mlog_id_t	type,	/*!< in: MLOG_COMP_REC_MIN_MARK or
-				MLOG_REC_MIN_MARK */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	mlog_write_initial_log_record(rec, type, mtr);
-
-	/* Write rec offset as a 2-byte ulint */
-	mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
-}
-
-/****************************************************************//**
-Parses the redo log record for setting an index record as the predefined
-minimum record.
-@return end of log record or NULL */
-byte*
-btr_parse_set_min_rec_mark(
-/*=======================*/
-	byte*	ptr,	/*!< in: buffer */
-	byte*	end_ptr,/*!< in: buffer end */
-	ulint	comp,	/*!< in: nonzero=compact page format */
-	page_t*	page,	/*!< in: page or NULL */
-	mtr_t*	mtr)	/*!< in: mtr or NULL */
-{
-	rec_t*	rec;
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	if (page) {
-		ut_a(!page_is_comp(page) == !comp);
-
-		rec = page + mach_read_from_2(ptr);
-
-		btr_set_min_rec_mark(rec, mtr);
-	}
-
-	return(ptr + 2);
-}
-
-/** Sets a record as the predefined minimum record. */
-void btr_set_min_rec_mark(rec_t* rec, mtr_t* mtr)
-{
-	const bool comp = page_rec_is_comp(rec);
-
-	ut_ad(rec == page_rec_get_next_const(page_get_infimum_rec(
-						     page_align(rec))));
-	ut_ad(!(rec_get_info_bits(page_rec_get_next(rec), comp)
-		& REC_INFO_MIN_REC_FLAG));
-
-	size_t info_bits = rec_get_info_bits(rec, comp);
-	if (comp) {
-		rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
-
-		btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
-	} else {
-		rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
-
-		btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
-	}
-}
-
 /*************************************************************//**
 If page is the only on its level, this function moves its records to the
 father page, thus reducing the tree height.
@@ -3387,7 +3211,6 @@ btr_lift_page_up(
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	buf_block_t*	father_block;
-	page_t*		father_page;
 	ulint		page_level;
 	page_zip_des_t*	father_page_zip;
 	page_t*		page		= buf_block_get_frame(block);
@@ -3399,7 +3222,7 @@ btr_lift_page_up(
 	buf_block_t*	block_orig	= block;
 
 	ut_ad(!page_has_siblings(page));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 
 	page_level = btr_page_get_level(page);
 	root_page_no = dict_index_get_page(index);
@@ -3424,7 +3247,6 @@ btr_lift_page_up(
 		}
 		father_block = btr_cur_get_block(&cursor);
 		father_page_zip = buf_block_get_page_zip(father_block);
-		father_page = buf_block_get_frame(father_block);
 
 		n_blocks = 0;
 
@@ -3434,7 +3256,7 @@ btr_lift_page_up(
 		the first level, the tree is in an inconsistent state
 		and can not be searched. */
 		for (b = father_block;
-		     b->page.id.page_no() != root_page_no; ) {
+		     b->page.id().page_no() != root_page_no; ) {
 			ut_a(n_blocks < BTR_MAX_LEVELS);
 
 			if (dict_index_is_spatial(index)) {
@@ -3467,12 +3289,11 @@ btr_lift_page_up(
 			page_level = btr_page_get_level(page);
 
 			ut_ad(!page_has_siblings(page));
-			ut_ad(mtr_memo_contains(
-				      mtr, block, MTR_MEMO_PAGE_X_FIX));
+			ut_ad(mtr->memo_contains_flagged(block,
+							 MTR_MEMO_PAGE_X_FIX));
 
 			father_block = blocks[0];
 			father_page_zip = buf_block_get_page_zip(father_block);
-			father_page = buf_block_get_frame(father_block);
 		}
 
 		mem_heap_free(heap);
@@ -3486,7 +3307,7 @@ btr_lift_page_up(
 	ut_ad(!page_get_instant(father_block->frame));
 
 	if (index->is_instant()
-	    && father_block->page.id.page_no() == root_page_no) {
+	    && father_block->page.id().page_no() == root_page_no) {
 		ut_ad(!father_page_zip);
 		btr_set_instant(father_block, *index, mtr);
 	}
@@ -3507,7 +3328,7 @@ btr_lift_page_up(
 		ut_a(page_zip);
 
 		/* Copy the page byte for byte. */
-		page_zip_copy_recs(father_page_zip, father_page,
+		page_zip_copy_recs(father_block,
 				   page_zip, page, index, mtr);
 
 		/* Update the lock table and possible hash index. */
@@ -3529,7 +3350,7 @@ btr_lift_page_up(
 		if (dict_index_is_spatial(index)) {
 			lock_mutex_enter();
 			lock_prdt_page_free_from_discard(
-				block, lock_sys.prdt_page_hash);
+				block, &lock_sys.prdt_page_hash);
 			lock_mutex_exit();
 		}
 		lock_update_copy_and_discard(father_block, block);
@@ -3537,15 +3358,8 @@ btr_lift_page_up(
 
 	/* Go upward to root page, decrementing levels by one. */
 	for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
-		page_t*		page	= buf_block_get_frame(blocks[i]);
-		page_zip_des_t*	page_zip= buf_block_get_page_zip(blocks[i]);
-
-		ut_ad(btr_page_get_level(page) == page_level + 1);
-
-		btr_page_set_level(page, page_zip, page_level, mtr);
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
+		ut_ad(btr_page_get_level(blocks[i]->frame) == page_level + 1);
+		btr_page_set_level(blocks[i], page_level, mtr);
 	}
 
 	if (dict_index_is_spatial(index)) {
@@ -3560,7 +3374,7 @@ btr_lift_page_up(
 	    && !index->table->is_temporary()) {
 		ibuf_reset_free_bits(father_block);
 	}
-	ut_ad(page_validate(father_page, index));
+	ut_ad(page_validate(father_block->frame, index));
 	ut_ad(btr_check_node_ptr(index, father_block, mtr));
 
 	return(lift_father_up ? block_orig : father_block);
@@ -3588,8 +3402,6 @@ btr_compress(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	dict_index_t*	index;
-	ulint		left_page_no;
-	ulint		right_page_no;
 	buf_block_t*	merge_block;
 	page_t*		merge_page = NULL;
 	page_zip_des_t*	merge_page_zip;
@@ -3612,25 +3424,14 @@ btr_compress(
 
 	btr_assert_not_corrupted(block, index);
 
-#ifdef UNIV_DEBUG
-	if (dict_index_is_spatial(index)) {
-		ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-						MTR_MEMO_X_LOCK));
-	} else {
-		ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-						MTR_MEMO_X_LOCK
-						| MTR_MEMO_SX_LOCK));
-	}
-#endif /* UNIV_DEBUG */
-
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-
-	const ulint zip_size = index->table->space->zip_size();
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 
 	MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
 
-	left_page_no = btr_page_get_prev(page);
-	right_page_no = btr_page_get_next(page);
+	const uint32_t left_page_no = btr_page_get_prev(page);
+	const uint32_t right_page_no = btr_page_get_next(page);
 
 #ifdef UNIV_DEBUG
 	if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
@@ -3645,16 +3446,15 @@ btr_compress(
 	if (dict_index_is_spatial(index)) {
 		offsets = rtr_page_get_father_block(
 			NULL, heap, index, block, mtr, cursor, &father_cursor);
-		ut_ad(cursor->page_cur.block->page.id.page_no()
-		      == block->page.id.page_no());
+		ut_ad(cursor->page_cur.block->page.id() == block->page.id());
 		rec_t*  my_rec = father_cursor.page_cur.rec;
 
 		ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets);
 
-		if (page_no != block->page.id.page_no()) {
+		if (page_no != block->page.id().page_no()) {
 			ib::info() << "father positioned on page "
 				<< page_no << "instead of "
-				<< block->page.id.page_no();
+				<< block->page.id().page_no();
 			offsets = btr_page_get_father_block(
 				NULL, heap, index, block, mtr, &father_cursor);
 		}
@@ -3705,25 +3505,13 @@ retry:
 #ifdef UNIV_BTR_DEBUG
 	if (is_left) {
 		ut_a(btr_page_get_next(merge_page)
-		     == block->page.id.page_no());
+		     == block->page.id().page_no());
 	} else {
 		ut_a(btr_page_get_prev(merge_page)
-		     == block->page.id.page_no());
+		     == block->page.id().page_no());
 	}
 #endif /* UNIV_BTR_DEBUG */
 
-#ifdef UNIV_GIS_DEBUG
-	if (dict_index_is_spatial(index)) {
-		if (is_left) {
-			fprintf(stderr, "GIS_DIAG: merge left  %ld to %ld \n",
-				(long) block->page.id.page_no(), left_page_no);
-		} else {
-			fprintf(stderr, "GIS_DIAG: merge right %ld to %ld\n",
-				(long) block->page.id.page_no(), right_page_no);
-		}
-	}
-#endif /* UNIV_GIS_DEBUG */
-
 	ut_ad(page_validate(merge_page, index));
 
 	merge_page_zip = buf_block_get_page_zip(merge_block);
@@ -3783,9 +3571,7 @@ retry:
 		btr_search_drop_page_hash_index(block);
 
 		/* Remove the page from the level list */
-		if (DB_SUCCESS != btr_level_list_remove(index->table->space_id,
-							zip_size, page, index,
-							mtr)) {
+		if (DB_SUCCESS != btr_level_list_remove(*block, *index, mtr)) {
 			goto err_exit;
 		}
 
@@ -3795,13 +3581,10 @@ retry:
 			ulint page_no = btr_node_ptr_get_child_page_no(
 						my_rec, offsets);
 
-			if (page_no != block->page.id.page_no()) {
-
+			if (page_no != block->page.id().page_no()) {
 				ib::fatal() << "father positioned on "
 					<< page_no << " instead of "
-					<< block->page.id.page_no();
-
-				ut_ad(0);
+					<< block->page.id().page_no();
 			}
 
 			if (mbr_changed) {
@@ -3823,7 +3606,7 @@ retry:
 			/* No GAP lock needs to be worrying about */
 			lock_mutex_enter();
 			lock_prdt_page_free_from_discard(
-				block, lock_sys.prdt_page_hash);
+				block, &lock_sys.prdt_page_hash);
 			lock_rec_free_all_from_discard_page(block);
 			lock_mutex_exit();
 		} else {
@@ -3876,11 +3659,12 @@ retry:
 			invoked by page_copy_rec_list_end() below,
 			requires that FIL_PAGE_PREV be FIL_NULL.
 			Clear the field, but prepare to restore it. */
+			static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
 #ifdef UNIV_BTR_DEBUG
 			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
 #endif /* UNIV_BTR_DEBUG */
 			compile_time_assert(FIL_NULL == 0xffffffffU);
-			memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
+			memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4);
 		}
 
 		orig_succ = page_copy_rec_list_end(merge_block, block,
@@ -3916,21 +3700,19 @@ retry:
 #endif /* UNIV_BTR_DEBUG */
 
 		/* Remove the page from the level list */
-		if (DB_SUCCESS != btr_level_list_remove(index->table->space_id,
-							zip_size, page, index,
-							mtr)) {
+		if (DB_SUCCESS != btr_level_list_remove(*block, *index, mtr)) {
 			goto err_exit;
 		}
 
 		ut_ad(btr_node_ptr_get_child_page_no(
-			btr_cur_get_rec(&father_cursor), offsets)
-			== block->page.id.page_no());
+			      btr_cur_get_rec(&father_cursor), offsets)
+		      == block->page.id().page_no());
 
 		/* Replace the address of the old child node (= page) with the
 		address of the merge page to the right */
 		btr_node_ptr_set_child_page_no(
+			btr_cur_get_block(&father_cursor),
 			btr_cur_get_rec(&father_cursor),
-			btr_cur_get_page_zip(&father_cursor),
 			offsets, right_page_no, mtr);
 
 #ifdef UNIV_DEBUG
@@ -3980,7 +3762,7 @@ retry:
 			}
 			lock_mutex_enter();
 			lock_prdt_page_free_from_discard(
-				block, lock_sys.prdt_page_hash);
+				block, &lock_sys.prdt_page_hash);
 			lock_rec_free_all_from_discard_page(block);
 			lock_mutex_exit();
 		} else {
@@ -4029,7 +3811,7 @@ retry:
 		committed mini-transaction, because in crash recovery,
 		the free bits could momentarily be set too high. */
 
-		if (zip_size) {
+		if (merge_block->zip_size()) {
 			/* Because the free bits may be incremented
 			and we cannot update the insert buffer bitmap
 			in the same mini-transaction, the only safe
@@ -4054,13 +3836,6 @@ retry:
 #endif /* UNIV_ZIP_DEBUG */
 
 	if (dict_index_is_spatial(index)) {
-#ifdef UNIV_GIS_DEBUG
-		fprintf(stderr, "GIS_DIAG: compressed away  %ld\n",
-			(long) block->page.id.page_no());
-		fprintf(stderr, "GIS_DIAG: merged to %ld\n",
-			(long) merge_block->page.id.page_no());
-#endif
-
 		rtr_check_discard_page(index, NULL, block);
 	}
 
@@ -4089,9 +3864,8 @@ func_exit:
 
 err_exit:
 	/* We play it safe and reset the free bits. */
-	if (zip_size
-	    && merge_page
-	    && page_is_leaf(merge_page)
+	if (merge_block && merge_block->zip_size()
+	    && page_is_leaf(merge_block->frame)
 	    && !dict_index_is_clust(index)) {
 
 		ibuf_reset_free_bits(merge_block);
@@ -4124,7 +3898,7 @@ btr_discard_only_page_on_level(
 	const rec_t* r = page_rec_get_next(page_get_infimum_rec(block->frame));
 	ut_ad(rec_is_metadata(r, *index) == index->is_instant());
 
-	while (block->page.id.page_no() != dict_index_get_page(index)) {
+	while (block->page.id().page_no() != dict_index_get_page(index)) {
 		btr_cur_t	cursor;
 		buf_block_t*	father;
 		const page_t*	page	= buf_block_get_frame(block);
@@ -4133,8 +3907,8 @@ btr_discard_only_page_on_level(
 		ut_a(page_level == btr_page_get_level(page));
 		ut_a(!page_has_siblings(page));
 		ut_ad(fil_page_index_page_check(page));
-		ut_ad(block->page.id.space() == index->table->space->id);
-		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+		ut_ad(block->page.id().space() == index->table->space->id);
+		ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 		btr_search_drop_page_hash_index(block);
 
 		if (dict_index_is_spatial(index)) {
@@ -4176,8 +3950,10 @@ btr_discard_only_page_on_level(
 	mem_heap_t* heap = nullptr;
 	const rec_t* rec = nullptr;
 	rec_offs* offsets = nullptr;
-	if (index->table->instant) {
-		if (rec_is_alter_metadata(r, *index)) {
+	if (index->table->instant || index->must_avoid_clear_instant_add()) {
+		if (!rec_is_metadata(r, *index)) {
+		} else if (!index->table->instant
+			   || rec_is_alter_metadata(r, *index)) {
 			heap = mem_heap_create(srv_page_size);
 			offsets = rec_get_offsets(r, index, nullptr,
 						  index->n_core_fields,
@@ -4196,12 +3972,13 @@ btr_discard_only_page_on_level(
 
 	if (index->is_primary()) {
 		if (rec) {
+			page_cur_t cur;
+			page_cur_set_before_first(block, &cur);
 			DBUG_ASSERT(index->table->instant);
 			DBUG_ASSERT(rec_is_alter_metadata(rec, *index));
 			btr_set_instant(block, *index, mtr);
-			rec = page_cur_insert_rec_low(
-				page_get_infimum_rec(block->frame),
-				index, rec, offsets, mtr);
+			rec = page_cur_insert_rec_low(&cur, index, rec,
+						      offsets, mtr);
 			ut_ad(rec);
 			mem_heap_free(heap);
 		} else if (index->is_instant()) {
@@ -4230,24 +4007,18 @@ btr_discard_page(
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	dict_index_t*	index;
-	ulint		left_page_no;
-	ulint		right_page_no;
 	buf_block_t*	merge_block;
-	page_t*		merge_page;
 	buf_block_t*	block;
-	page_t*		page;
-	rec_t*		node_ptr;
 	btr_cur_t	parent_cursor;
 
 	block = btr_cur_get_block(cursor);
 	index = btr_cur_get_index(cursor);
 
-	ut_ad(dict_index_get_page(index) != block->page.id.page_no());
+	ut_ad(dict_index_get_page(index) != block->page.id().page_no());
 
-	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
-
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 
 	MONITOR_INC(MONITOR_INDEX_DISCARD);
 
@@ -4259,20 +4030,16 @@ btr_discard_page(
 
 	/* Decide the page which will inherit the locks */
 
-	left_page_no = btr_page_get_prev(buf_block_get_frame(block));
-	right_page_no = btr_page_get_next(buf_block_get_frame(block));
+	const uint32_t left_page_no = btr_page_get_prev(block->frame);
+	const uint32_t right_page_no = btr_page_get_next(block->frame);
 
-	const ulint zip_size = index->table->space->zip_size();
 	ut_d(bool parent_is_different = false);
 	if (left_page_no != FIL_NULL) {
-		merge_block = btr_block_get(
-			page_id_t(index->table->space_id, left_page_no),
-			zip_size, RW_X_LATCH, index, mtr);
-
-		merge_page = buf_block_get_frame(merge_block);
+		merge_block = btr_block_get(*index, left_page_no, RW_X_LATCH,
+					    true, mtr);
 #ifdef UNIV_BTR_DEBUG
-		ut_a(btr_page_get_next(merge_page)
-		     == block->page.id.page_no());
+		ut_a(btr_page_get_next(merge_block->frame)
+		     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
 		ut_d(parent_is_different =
 			(page_rec_get_next(
@@ -4281,43 +4048,34 @@ btr_discard_page(
 						&parent_cursor)))
 			 == btr_cur_get_rec(&parent_cursor)));
 	} else if (right_page_no != FIL_NULL) {
-		merge_block = btr_block_get(
-			page_id_t(index->table->space_id, right_page_no),
-			zip_size, RW_X_LATCH, index, mtr);
-
-		merge_page = buf_block_get_frame(merge_block);
+		merge_block = btr_block_get(*index, right_page_no, RW_X_LATCH,
+					    true, mtr);
 #ifdef UNIV_BTR_DEBUG
-		ut_a(btr_page_get_prev(merge_page)
-		     == block->page.id.page_no());
+		ut_a(btr_page_get_prev(merge_block->frame)
+		     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
 		ut_d(parent_is_different = page_rec_is_supremum(
 			page_rec_get_next(btr_cur_get_rec(&parent_cursor))));
+		if (!page_is_leaf(merge_block->frame)) {
+			rec_t* node_ptr = page_rec_get_next(
+				page_get_infimum_rec(merge_block->frame));
+			ut_ad(page_rec_is_user_rec(node_ptr));
+			/* We have to mark the leftmost node pointer as the
+			predefined minimum record. */
+			btr_set_min_rec_mark<true>(node_ptr, *merge_block,
+						   mtr);
+		}
 	} else {
 		btr_discard_only_page_on_level(index, block, mtr);
 
 		return;
 	}
 
-	page = buf_block_get_frame(block);
-	ut_a(page_is_comp(merge_page) == page_is_comp(page));
+	ut_a(page_is_comp(merge_block->frame) == page_is_comp(block->frame));
+	ut_ad(!memcmp_aligned<2>(&merge_block->frame[PAGE_HEADER + PAGE_LEVEL],
+				 &block->frame[PAGE_HEADER + PAGE_LEVEL], 2));
 	btr_search_drop_page_hash_index(block);
 
-	if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
-
-		/* We have to mark the leftmost node pointer on the right
-		side page as the predefined minimum record */
-		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
-
-		ut_ad(page_rec_is_user_rec(node_ptr));
-
-		/* This will make page_zip_validate() fail on merge_page
-		until btr_level_list_remove() completes.  This is harmless,
-		because everything will take place within a single
-		mini-transaction and because writing to the redo log
-		is an atomic operation (performed by mtr_commit()). */
-		btr_set_min_rec_mark(node_ptr, mtr);
-	}
-
 	if (dict_index_is_spatial(index)) {
 		rtr_node_ptr_delete(&parent_cursor, mtr);
 	} else {
@@ -4325,15 +4083,15 @@ btr_discard_page(
 	}
 
 	/* Remove the page from the level list */
-	ut_a(DB_SUCCESS == btr_level_list_remove(index->table->space_id,
-						 zip_size, page, index, mtr));
+	ut_a(DB_SUCCESS == btr_level_list_remove(*block, *index, mtr));
 
 #ifdef UNIV_ZIP_DEBUG
 	{
 		page_zip_des_t*	merge_page_zip
 			= buf_block_get_page_zip(merge_block);
 		ut_a(!merge_page_zip
-		     || page_zip_validate(merge_page_zip, merge_page, index));
+		     || page_zip_validate(merge_page_zip, merge_block->frame,
+					  index));
 	}
 #endif /* UNIV_ZIP_DEBUG */
 
@@ -4361,7 +4119,8 @@ btr_discard_page(
 	ut_ad(parent_is_different
 	      || btr_check_node_ptr(index, merge_block, mtr));
 
-	if (btr_cur_get_block(&parent_cursor)->page.id.page_no() == index->page
+	if (btr_cur_get_block(&parent_cursor)->page.id().page_no()
+	    == index->page
 	    && !page_has_siblings(btr_cur_get_page(&parent_cursor))
 	    && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) {
 		btr_lift_page_up(index, merge_block, mtr);
@@ -4427,7 +4186,7 @@ btr_print_recursive(
 	ulint		i	= 0;
 	mtr_t		mtr2;
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_SX_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX));
 
 	ib::info() << "NODE ON LEVEL " << btr_page_get_level(page)
 		<< " page " << block->page.id;
@@ -4521,9 +4280,9 @@ btr_check_node_ptr(
 	btr_cur_t	cursor;
 	page_t*		page = buf_block_get_frame(block);
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 
-	if (dict_index_get_page(index) == block->page.id.page_no()) {
+	if (dict_index_get_page(index) == block->page.id().page_no()) {
 
 		return(TRUE);
 	}
@@ -4553,7 +4312,7 @@ btr_check_node_ptr(
 	if (dict_index_is_spatial(index)) {
 		ut_a(!cmp_dtuple_rec_with_gis(
 			tuple, btr_cur_get_rec(&cursor),
-			offsets, PAGE_CUR_WITHIN));
+			PAGE_CUR_WITHIN));
 	} else {
 		ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
 	}
@@ -4819,7 +4578,7 @@ btr_validate_report1(
 	const buf_block_t*	block)	/*!< in: index page */
 {
 	ib::error	error;
-	error << "In page " << block->page.id.page_no()
+	error << "In page " << block->page.id().page_no()
 		<< " of index " << index->name
 		<< " of table " << index->table->name;
 
@@ -4839,14 +4598,13 @@ btr_validate_report2(
 	const buf_block_t*	block1,	/*!< in: first index page */
 	const buf_block_t*	block2)	/*!< in: second index page */
 {
-	ib::error	error;
-	error << "In pages " << block1->page.id
-		<< " and " << block2->page.id << " of index " << index->name
-		<< " of table " << index->table->name;
+  ib::error error;
+  error << "In pages " << block1->page.id()
+	<< " and " << block2->page.id() << " of index " << index->name
+	<< " of table " << index->table->name;
 
-	if (level > 0) {
-		error << ", index tree level " << level;
-	}
+  if (level)
+    error << ", index tree level " << level;
 }
 
 /************************************************************//**
@@ -4869,8 +4627,6 @@ btr_validate_level(
 	btr_cur_t	node_cur;
 	btr_cur_t	right_node_cur;
 	rec_t*		rec;
-	ulint		right_page_no;
-	ulint		left_page_no;
 	page_cur_t	cursor;
 	dtuple_t*	node_ptr_tuple;
 	bool		ret	= true;
@@ -4883,8 +4639,8 @@ btr_validate_level(
 #endif /* UNIV_ZIP_DEBUG */
 	ulint		savepoint = 0;
 	ulint		savepoint2 = 0;
-	ulint		parent_page_no = FIL_NULL;
-	ulint		parent_right_page_no = FIL_NULL;
+	uint32_t	parent_page_no = FIL_NULL;
+	uint32_t	parent_right_page_no = FIL_NULL;
 	bool		rightmost_child = false;
 
 	mtr.start();
@@ -4901,12 +4657,11 @@ btr_validate_level(
 	page = buf_block_get_frame(block);
 
 	fil_space_t*		space	= index->table->space;
-	const ulint		zip_size = space->zip_size();
 
 	while (level != btr_page_get_level(page)) {
 		const rec_t*	node_ptr;
 
-		if (fseg_page_is_free(space, block->page.id.page_no())) {
+		if (fseg_page_is_free(space, block->page.id().page_no())) {
 
 			btr_validate_report1(index, level, block);
 
@@ -4915,8 +4670,8 @@ btr_validate_level(
 			ret = false;
 		}
 
-		ut_a(index->table->space_id == block->page.id.space());
-		ut_a(block->page.id.space() == page_get_space_id(page));
+		ut_a(index->table->space_id == block->page.id().space());
+		ut_a(block->page.id().space() == page_get_space_id(page));
 #ifdef UNIV_ZIP_DEBUG
 		page_zip = buf_block_get_page_zip(block);
 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
@@ -4941,7 +4696,7 @@ btr_validate_level(
 		does not use such scan for any of its DML or query
 		operations  */
 		if (dict_index_is_spatial(index)) {
-			left_page_no = btr_page_get_prev(page);
+			uint32_t left_page_no = btr_page_get_prev(page);
 
 			while (left_page_no != FIL_NULL) {
 				/* To obey latch order of tree blocks,
@@ -4951,11 +4706,9 @@ btr_validate_level(
 					&mtr, savepoint2, block);
 
 				savepoint2 = mtr_set_savepoint(&mtr);
-				block = btr_block_get(
-					page_id_t(index->table->space_id,
-						  left_page_no),
-					zip_size,
-					RW_SX_LATCH, index, &mtr);
+				block = btr_block_get(*index, left_page_no,
+						      RW_SX_LATCH, false,
+						      &mtr);
 				page = buf_block_get_frame(block);
 				left_page_no = btr_page_get_prev(page);
 			}
@@ -4981,9 +4734,9 @@ loop:
 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	ut_a(block->page.id.space() == index->table->space_id);
+	ut_a(block->page.id().space() == index->table->space_id);
 
-	if (fseg_page_is_free(space, block->page.id.page_no())) {
+	if (fseg_page_is_free(space, block->page.id().page_no())) {
 
 		btr_validate_report1(index, level, block);
 
@@ -5012,8 +4765,8 @@ loop:
 
 	ut_a(btr_page_get_level(page) == level);
 
-	right_page_no = btr_page_get_next(page);
-	left_page_no = btr_page_get_prev(page);
+	uint32_t right_page_no = btr_page_get_next(page);
+	uint32_t left_page_no = btr_page_get_prev(page);
 
 	ut_a(!page_is_empty(page)
 	     || (level == 0
@@ -5023,11 +4776,8 @@ loop:
 		const rec_t*	right_rec;
 		savepoint = mtr_set_savepoint(&mtr);
 
-		right_block = btr_block_get(
-			page_id_t(index->table->space_id, right_page_no),
-			zip_size,
-			RW_SX_LATCH, index, &mtr);
-
+		right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
+					    !level, &mtr);
 		right_page = buf_block_get_frame(right_block);
 
 		if (btr_page_get_prev(right_page) != page_get_page_no(page)) {
@@ -5098,7 +4848,7 @@ loop:
 	2) Search parent from root is very costly for R-tree.
 	We will add special validation mechanism for R-tree later (WL #7520) */
 	if (!dict_index_is_spatial(index)
-	    && block->page.id.page_no() != dict_index_get_page(index)) {
+	    && block->page.id().page_no() != dict_index_get_page(index)) {
 
 		/* Check father node pointers */
 		rec_t*	node_ptr;
@@ -5127,7 +4877,7 @@ loop:
 
 		if (node_ptr != btr_cur_get_rec(&node_cur)
 		    || btr_node_ptr_get_child_page_no(node_ptr, offsets)
-				     != block->page.id.page_no()) {
+		    != block->page.id().page_no()) {
 
 			btr_validate_report1(index, level, block);
 
@@ -5139,8 +4889,7 @@ loop:
 
 			rec = btr_cur_get_rec(&node_cur);
 			fprintf(stderr, "\n"
-				"InnoDB: node ptr child page n:o "
-				ULINTPF "\n",
+				"InnoDB: node ptr child page n:o %u\n",
 				btr_node_ptr_get_child_page_no(rec, offsets));
 
 			fputs("InnoDB: record on page ", stderr);
@@ -5201,19 +4950,16 @@ loop:
 					&mtr, savepoint, right_block);
 
 				if (parent_right_page_no != FIL_NULL) {
-					btr_block_get(
-						page_id_t(index->table
-							  ->space_id,
-							  parent_right_page_no),
-						zip_size,
-						RW_SX_LATCH, index, &mtr);
+					btr_block_get(*index,
+						      parent_right_page_no,
+						      RW_SX_LATCH, false,
+						      &mtr);
 				}
 
-				right_block = btr_block_get(
-					page_id_t(index->table->space_id,
-						  right_page_no),
-					zip_size,
-					RW_SX_LATCH, index, &mtr);
+				right_block = btr_block_get(*index,
+							    right_page_no,
+							    RW_SX_LATCH,
+							    !level, &mtr);
 			}
 
 			btr_cur_position(
@@ -5286,27 +5032,19 @@ node_ptr_fails:
 		if (!lockout) {
 			if (rightmost_child) {
 				if (parent_right_page_no != FIL_NULL) {
-					btr_block_get(
-						page_id_t(
-							index->table->space_id,
-							parent_right_page_no),
-						zip_size,
-						RW_SX_LATCH, index, &mtr);
+					btr_block_get(*index,
+						      parent_right_page_no,
+						      RW_SX_LATCH, false,
+						      &mtr);
 				}
 			} else if (parent_page_no != FIL_NULL) {
-				btr_block_get(
-					page_id_t(index->table->space_id,
-						  parent_page_no),
-					zip_size,
-					RW_SX_LATCH, index, &mtr);
+				btr_block_get(*index, parent_page_no,
+					      RW_SX_LATCH, false, &mtr);
 			}
 		}
 
-		block = btr_block_get(
-			page_id_t(index->table->space_id, right_page_no),
-			zip_size,
-			RW_SX_LATCH, index, &mtr);
-
+		block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
+				      !level, &mtr);
 		page = buf_block_get_frame(block);
 
 		goto loop;
@@ -5386,7 +5124,7 @@ bool
 btr_can_merge_with_page(
 /*====================*/
 	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
-	ulint		page_no,	/*!< in: a sibling page */
+	uint32_t	page_no,	/*!< in: a sibling page */
 	buf_block_t**	merge_block,	/*!< out: the merge block */
 	mtr_t*		mtr)		/*!< in: mini-transaction */
 {
@@ -5408,10 +5146,8 @@ btr_can_merge_with_page(
 	index = btr_cur_get_index(cursor);
 	page = btr_cur_get_page(cursor);
 
-	const page_id_t		page_id(index->table->space_id, page_no);
-	const ulint zip_size = index->table->space->zip_size();
-
-	mblock = btr_block_get(page_id, zip_size, RW_X_LATCH, index, mtr);
+	mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page),
+			       mtr);
 	mpage = buf_block_get_frame(mblock);
 
 	n_recs = page_get_n_recs(page);
@@ -5427,7 +5163,7 @@ btr_can_merge_with_page(
 	/* If compression padding tells us that merging will result in
 	too packed up page i.e.: which is likely to cause compression
 	failure then don't merge the pages. */
-	if (zip_size && page_is_leaf(mpage)
+	if (mblock->page.zip.data && page_is_leaf(mpage)
 	    && (page_get_data_size(mpage) + data_size
 		>= dict_index_zip_pad_optimal_page_size(index))) {
 
@@ -5437,12 +5173,9 @@ btr_can_merge_with_page(
 	max_ins_size = page_get_max_insert_size(mpage, n_recs);
 
 	if (data_size > max_ins_size) {
-
 		/* We have to reorganize mpage */
-
-		if (!btr_page_reorganize_block(
-			    false, page_zip_level, mblock, index, mtr)) {
-
+		if (!btr_page_reorganize_block(page_zip_level, mblock, index,
+					       mtr)) {
 			goto error;
 		}
 
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index 65cb6e83783..9004064a1b9 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -34,8 +34,6 @@ Created 03/11/2014 Shaohua Wang
 
 /** Innodb B-tree index fill factor for bulk load. */
 uint	innobase_fill_factor;
-/** whether to reduce redo logging during ALTER TABLE */
-my_bool	innodb_log_optimize_ddl;
 
 /** Initialize members, allocate page if needed and start mtr.
 Note: we commit all mtrs on failure.
@@ -45,20 +43,12 @@ PageBulk::init()
 {
 	buf_block_t*	new_block;
 	page_t*		new_page;
-	page_zip_des_t*	new_page_zip;
-	ulint		new_page_no;
 
 	ut_ad(m_heap == NULL);
 	m_heap = mem_heap_create(1000);
 
 	m_mtr.start();
-
-	if (m_flush_observer) {
-		m_mtr.set_log_mode(MTR_LOG_NO_REDO);
-		m_mtr.set_flush_observer(m_flush_observer);
-	} else {
-		m_index->set_modified(m_mtr);
-	}
+	m_index->set_modified(m_mtr);
 
 	if (m_page_no == FIL_NULL) {
 		mtr_t	alloc_mtr;
@@ -70,12 +60,10 @@ PageBulk::init()
 		alloc_mtr.start();
 		m_index->set_modified(alloc_mtr);
 
-		ulint	n_reserved;
-		bool	success;
-		success = fsp_reserve_free_extents(&n_reserved,
-						   m_index->table->space,
-						   1, FSP_NORMAL, &alloc_mtr);
-		if (!success) {
+		uint32_t n_reserved;
+		if (!fsp_reserve_free_extents(&n_reserved,
+					      m_index->table->space,
+					      1, FSP_NORMAL, &alloc_mtr)) {
 			alloc_mtr.commit();
 			m_mtr.commit();
 			return(DB_OUT_OF_FILE_SPACE);
@@ -90,60 +78,49 @@ PageBulk::init()
 		alloc_mtr.commit();
 
 		new_page = buf_block_get_frame(new_block);
-		new_page_zip = buf_block_get_page_zip(new_block);
-		new_page_no = page_get_page_no(new_page);
+		m_page_no = new_block->page.id().page_no();
 
-		if (new_page_zip) {
+		byte* index_id = my_assume_aligned<2>
+			(PAGE_HEADER + PAGE_INDEX_ID + new_page);
+		compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+		compile_time_assert(FIL_NULL == 0xffffffff);
+		memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8);
+
+		if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+			mach_write_to_8(index_id, m_index->id);
 			page_create_zip(new_block, m_index, m_level, 0,
 					&m_mtr);
-			memset(FIL_PAGE_PREV + new_page, 0xff, 8);
-			page_zip_write_header(new_page_zip,
-					      FIL_PAGE_PREV + new_page,
-					      8, &m_mtr);
-			mach_write_to_8(PAGE_HEADER + PAGE_INDEX_ID + new_page,
-					m_index->id);
-			page_zip_write_header(new_page_zip,
-					      PAGE_HEADER + PAGE_INDEX_ID
-					      + new_page, 8, &m_mtr);
 		} else {
-			ut_ad(!dict_index_is_spatial(m_index));
+			ut_ad(!m_index->is_spatial());
 			page_create(new_block, &m_mtr,
-				    m_index->table->not_redundant(),
-				    false);
-			compile_time_assert(FIL_PAGE_NEXT
-					    == FIL_PAGE_PREV + 4);
-			compile_time_assert(FIL_NULL == 0xffffffff);
-			mlog_memset(new_block, FIL_PAGE_PREV, 8, 0xff, &m_mtr);
-			mlog_write_ulint(PAGE_HEADER + PAGE_LEVEL + new_page,
-					 m_level, MLOG_2BYTES, &m_mtr);
-			mlog_write_ull(PAGE_HEADER + PAGE_INDEX_ID + new_page,
-				       m_index->id, &m_mtr);
+				    m_index->table->not_redundant());
+			m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff);
+			m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+							+ PAGE_LEVEL
+							+ new_page, m_level);
+			m_mtr.write<8>(*new_block, index_id, m_index->id);
 		}
 	} else {
-		new_block = btr_block_get(
-			page_id_t(m_index->table->space_id, m_page_no),
-			m_index->table->space->zip_size(),
-			RW_X_LATCH, m_index, &m_mtr);
+		new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH,
+					  false, &m_mtr);
 
 		new_page = buf_block_get_frame(new_block);
-		new_page_zip = buf_block_get_page_zip(new_block);
-		new_page_no = page_get_page_no(new_page);
-		ut_ad(m_page_no == new_page_no);
+		ut_ad(new_block->page.id().page_no() == m_page_no);
 
 		ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
 
-		btr_page_set_level(new_page, new_page_zip, m_level, &m_mtr);
+		btr_page_set_level(new_block, m_level, &m_mtr);
 	}
 
+	m_page_zip = buf_block_get_page_zip(new_block);
+
 	if (!m_level && dict_index_is_sec_or_ibuf(m_index)) {
-		page_update_max_trx_id(new_block, new_page_zip, m_trx_id,
+		page_update_max_trx_id(new_block, m_page_zip, m_trx_id,
 				       &m_mtr);
 	}
 
 	m_block = new_block;
 	m_page = new_page;
-	m_page_zip = new_page_zip;
-	m_page_no = new_page_no;
 	m_cur_rec = page_get_infimum_rec(new_page);
 	ut_ad(m_is_comp == !!page_is_comp(new_page));
 	m_free_space = page_get_free_space_of_empty(m_is_comp);
@@ -166,89 +143,353 @@ PageBulk::init()
 	ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION);
 	m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0;
 	ut_d(m_total_data = 0);
-	/* See page_copy_rec_list_end_to_created_page() */
-	ut_d(page_header_set_field(m_page, NULL, PAGE_HEAP_TOP,
-				   srv_page_size - 1));
 
 	return(DB_SUCCESS);
 }
 
 /** Insert a record in the page.
-@param[in]	rec		record
+@tparam fmt     the page format: REDUNDANT, DYNAMIC or COMPRESSED
+@param[in,out]	rec		record; its header fields are overwritten unless fmt is COMPRESSED
 @param[in]	offsets		record offsets */
-void
-PageBulk::insert(
-	const rec_t*		rec,
-	rec_offs*		offsets)
+template<PageBulk::format fmt>
+inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets)
 {
-	ulint		rec_size;
-
-	ut_ad(m_heap != NULL);
+  ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+  ut_ad((fmt != REDUNDANT) == m_is_comp);
+  ut_ad(page_align(m_heap_top) == m_page);
+  ut_ad(m_heap);
 
-	rec_size = rec_offs_size(offsets);
-	ut_d(const bool is_leaf = page_rec_is_leaf(m_cur_rec));
+  const ulint rec_size= rec_offs_size(offsets);
+  const ulint extra_size= rec_offs_extra_size(offsets);
+  ut_ad(page_align(m_heap_top + rec_size) == m_page);
+  ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec));
 
 #ifdef UNIV_DEBUG
-	/* Check whether records are in order. */
-	if (!page_rec_is_infimum_low(page_offset(m_cur_rec))) {
-		rec_t*	old_rec = m_cur_rec;
-		rec_offs* old_offsets = rec_get_offsets(
-			old_rec, m_index, NULL,	is_leaf
-			? m_index->n_core_fields : 0,
-			ULINT_UNDEFINED, &m_heap);
-
-		ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index)
-		      > 0);
-	}
+  /* Check whether records are in order. */
+  if (page_offset(m_cur_rec) !=
+      (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+  {
+    const rec_t *old_rec = m_cur_rec;
+    rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf
+                                           ? m_index->n_core_fields : 0,
+                                           ULINT_UNDEFINED, &m_heap);
+    ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0);
+  }
 
-	m_total_data += rec_size;
+  m_total_data+= rec_size;
 #endif /* UNIV_DEBUG */
 
-	/* 1. Copy the record to page. */
-	rec_t*	insert_rec = rec_copy(m_heap_top, rec, offsets);
-	rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
-
-	/* 2. Insert the record in the linked list. */
-	/* 3. Set the n_owned field in the inserted record to zero,
-	and set the heap_no field. */
-	if (m_is_comp) {
-		ulint next_offs = rec_get_next_offs(m_cur_rec, TRUE);
-		rec_set_next_offs_new(insert_rec, next_offs);
-		rec_set_next_offs_new(m_cur_rec, page_offset(insert_rec));
-
-		rec_set_n_owned_new(insert_rec, NULL, 0);
-		rec_set_heap_no_new(insert_rec,
-				    PAGE_HEAP_NO_USER_LOW + m_rec_no);
-	} else {
-		ulint next_offs = rec_get_next_offs(m_cur_rec, FALSE);
-		rec_set_next_offs_old(insert_rec, next_offs);
-		rec_set_next_offs_old(m_cur_rec, page_offset(insert_rec));
+  rec_t* const insert_rec= m_heap_top + extra_size; /* rec, once copied to the page */
 
-		rec_set_n_owned_old(insert_rec, 0);
-		rec_set_heap_no_old(insert_rec,
-				    PAGE_HEAP_NO_USER_LOW + m_rec_no);
-	}
+  /* Insert the record in the linked list. */
+  if (fmt != REDUNDANT) /* REC_NEXT is relative to the record itself */
+  {
+    const rec_t *next_rec= m_page +
+      page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT));
+    if (fmt != COMPRESSED)
+      m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT,
+                     static_cast<uint16_t>(insert_rec - m_cur_rec));
+    else
+    {
+      mach_write_to_2(m_cur_rec - REC_NEXT,
+                      static_cast<uint16_t>(insert_rec - m_cur_rec));
+      memcpy(m_heap_top, rec - extra_size, rec_size);
+    }
+
+    rec_t * const this_rec= fmt != COMPRESSED
+      ? const_cast<rec_t*>(rec) : insert_rec;
+    rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
+                        REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+                        REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    mach_write_to_2(this_rec - REC_NEXT,
+                    static_cast<uint16_t>(next_rec - insert_rec));
+  }
+  else /* REDUNDANT: REC_NEXT is an offset within the page */
+  {
+    memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2);
+    m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec));
+    rec_set_bit_field_1(const_cast<rec_t*>(rec), 0,
+                        REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(const_cast<rec_t*>(rec),
+                        PAGE_HEAP_NO_USER_LOW + m_rec_no,
+                        REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+  }
 
-	/* 4. Set member variables. */
-	ulint		slot_size;
-	slot_size = page_dir_calc_reserved_space(m_rec_no + 1)
-		- page_dir_calc_reserved_space(m_rec_no);
+  if (fmt == COMPRESSED)
+    /* We already wrote the record. Log is written in PageBulk::compress(). */;
+  else if (page_offset(m_cur_rec) ==
+           (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+    m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size);
+  else
+  {
+    /* Try to copy common prefix from the preceding record. */
+    const byte *r= rec - extra_size;
+    const byte * const insert_rec_end= m_heap_top + rec_size;
+    byte *b= m_heap_top;
+
+    /* Skip the leading bytes that the page already contains at m_heap_top. */
+    for (; * b == *r; b++, r++);
+
+    ut_ad(b < insert_rec_end);
+
+    const byte *c= m_cur_rec - (rec - r);
+    const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets),
+                                       m_heap_top);
+
+    /* Try to copy any bytes of the preceding record. */
+    if (UNIV_LIKELY(c >= m_page && c < c_end))
+    {
+      const byte *cm= c;
+      byte *bm= b;
+      const byte *rm= r;
+      for (; cm < c_end && *rm == *cm; cm++, bm++, rm++);
+      ut_ad(bm <= insert_rec_end);
+      size_t len= static_cast<size_t>(rm - r);
+      ut_ad(!memcmp(r, c, len));
+      if (len > 2)
+      {
+        memcpy(b, c, len);
+        m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len);
+        c= cm;
+        b= bm;
+        r= rm;
+      }
+    }
+
+    if (c < m_cur_rec)
+    {
+      if (!rec_offs_data_size(offsets))
+      {
+no_data:
+        m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+        goto rec_done;
+      }
+      /* Some header bytes differ. Compare the data separately. */
+      const byte *cd= m_cur_rec;
+      byte *bd= insert_rec;
+      const byte *rd= rec;
+      /* Skip the leading data bytes that the page already contains. */
+      for (;; cd++, bd++, rd++)
+        if (bd == insert_rec_end)
+          goto no_data;
+        else if (*bd != *rd)
+          break;
+
+      /* Try to copy any data bytes of the preceding record. */
+      if (c_end - cd > 2)
+      {
+        const byte *cdm= cd;
+        const byte *rdm= rd;
+        for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++) /* body: the ut_ad() below */
+        ut_ad(rdm - rd + bd <= insert_rec_end);
+        size_t len= static_cast<size_t>(rdm - rd);
+        ut_ad(!memcmp(rd, cd, len));
+        if (len > 2)
+        {
+          m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+          memcpy(bd, cd, len);
+          m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len);
+          c= cdm;
+          b= rdm - rd + bd;
+          r= rdm;
+        }
+      }
+    }
+
+    if (size_t len= static_cast<size_t>(insert_rec_end - b))
+      m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len);
+  }
 
-	ut_ad(m_free_space >= rec_size + slot_size);
-	ut_ad(m_heap_top + rec_size < m_page + srv_page_size);
+rec_done:
+  ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size));
+  rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
 
-	m_free_space -= rec_size + slot_size;
-	m_heap_top += rec_size;
-	m_rec_no += 1;
+  /* Update the member variables. */
+  ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) -
+    page_dir_calc_reserved_space(m_rec_no);
 
-	if (!m_flush_observer && !m_page_zip) {
-		/* For ROW_FORMAT=COMPRESSED, redo log may be written
-		in PageBulk::compress(). */
-		page_cur_insert_rec_write_log(insert_rec, rec_size,
-					      m_cur_rec, m_index, &m_mtr);
-	}
+  ut_ad(m_free_space >= rec_size + slot_size);
+  ut_ad(m_heap_top + rec_size < m_page + srv_page_size);
 
-	m_cur_rec = insert_rec;
+  m_free_space-= rec_size + slot_size;
+  m_heap_top+= rec_size;
+  m_rec_no++;
+  m_cur_rec= insert_rec;
+}
+
+/** Insert a record in the page, dispatching on the page format.
+@param[in]	rec		record; its header bytes are left unchanged on return
+@param[in]	offsets		record offsets */
+inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets)
+{
+  byte rec_hdr[REC_N_OLD_EXTRA_BYTES]; /* backup of the header of rec */
+  static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format");
+
+  if (UNIV_LIKELY_NULL(m_page_zip))
+    insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets);
+  else if (m_is_comp)
+  {
+    memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES);
+    insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets);
+    memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr,
+           REC_N_NEW_EXTRA_BYTES);
+  }
+  else
+  {
+    memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES);
+    insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets);
+    memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr,
+           REC_N_OLD_EXTRA_BYTES);
+  }
+}
+
+/** Set the n_owned field (REC_NEW_N_OWNED) of a record in the uncompressed
+page of a ROW_FORMAT=COMPRESSED index page, without redo-logging. */
+static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned)
+{
+  rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/** Mark end of insertion to the page. Scan all records to set page dirs,
+and set page header members.
+@tparam fmt  page format: REDUNDANT, DYNAMIC or COMPRESSED */
+template<PageBulk::format fmt>
+inline void PageBulk::finishPage()
+{
+  ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+  ut_ad((fmt != REDUNDANT) == m_is_comp);
+
+  ulint count= 0;
+  ulint n_recs= 0; /* incremented below, but never read in this function */
+  byte *slot= my_assume_aligned<2>(m_page + srv_page_size -
+                                   (PAGE_DIR + PAGE_DIR_SLOT_SIZE));
+  const page_dir_slot_t *const slot0 = slot; /* further slots go at lower addresses */
+  compile_time_assert(PAGE_DIR_SLOT_SIZE == 2);
+  if (fmt != REDUNDANT)
+  {
+    uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page);
+    ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM);
+    offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM);
+    /* Set owner & dir: one slot per (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 records. */
+    while (offset != PAGE_NEW_SUPREMUM)
+    {
+      ut_ad(offset >= PAGE_NEW_SUPREMUM);
+      ut_ad(offset < page_offset(slot));
+      count++;
+      n_recs++;
+
+      if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+      {
+        slot-= PAGE_DIR_SLOT_SIZE;
+        mach_write_to_2(slot, offset);
+
+        if (fmt != COMPRESSED)
+          page_rec_set_n_owned<false>(m_block, m_page + offset, count, true,
+                                      &m_mtr);
+        else
+          rec_set_n_owned_zip(m_page + offset, count);
+
+        count= 0;
+      }
+
+      uint16_t next= static_cast<uint16_t>
+        ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) &
+         (srv_page_size - 1));
+      ut_ad(next);
+      offset= next;
+    }
+
+    if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+                          PAGE_DIR_SLOT_MAX_N_OWNED))
+    {
+      /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+      count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+      rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+      if (fmt != COMPRESSED)
+        page_rec_set_n_owned<false>(m_block, rec, 0, true, &m_mtr);
+      else
+        rec_set_n_owned_zip(rec, 0);
+    }
+    else
+      slot-= PAGE_DIR_SLOT_SIZE;
+
+    mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+    if (fmt != COMPRESSED)
+      page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM,
+                                  count + 1, true, &m_mtr);
+    else
+      rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1);
+  }
+  else
+  {
+    rec_t *insert_rec= m_page +
+      mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page);
+
+    /* Set owner & dir: one slot per (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 records. */
+    while (insert_rec != m_page + PAGE_OLD_SUPREMUM)
+    {
+      count++;
+      n_recs++;
+
+      if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+      {
+        slot-= PAGE_DIR_SLOT_SIZE;
+        mach_write_to_2(slot, page_offset(insert_rec));
+        page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr);
+        count= 0;
+      }
+
+      insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT);
+    }
+
+    if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+                          PAGE_DIR_SLOT_MAX_N_OWNED))
+    {
+      /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+      count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+      rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+      page_rec_set_n_owned<false>(m_block, rec, 0, false, &m_mtr);
+    }
+    else
+      slot-= PAGE_DIR_SLOT_SIZE;
+
+    mach_write_to_2(slot, PAGE_OLD_SUPREMUM);
+    page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1,
+                                false, &m_mtr);
+  }
+
+  if (!m_rec_no); /* no records: the page header fields below are not written */
+  else if (fmt != COMPRESSED)
+  {
+    static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility");
+    alignas(8) byte page_header[PAGE_N_HEAP + 2];
+    mach_write_to_2(page_header + PAGE_N_DIR_SLOTS,
+                    1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+    mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page);
+    mach_write_to_2(page_header + PAGE_N_HEAP,
+                    (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
+                    uint16_t{fmt != REDUNDANT} << 15);
+    m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header,
+                 sizeof page_header);
+    m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+    m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot);
+  }
+  else
+  {
+    /* For ROW_FORMAT=COMPRESSED, redo log may be written in
+    PageBulk::compress(). */
+    mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
+                    1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+    mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
+                    static_cast<ulint>(m_heap_top - m_page));
+    mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page,
+                    (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15);
+    mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+  }
 }
 
 inline bool PageBulk::needs_finish() const
@@ -279,120 +520,37 @@ inline bool PageBulk::needs_finish() const
 
 /** Mark end of insertion to the page. Scan all records to set page dirs,
 and set page header members.
-Note: we refer to page_copy_rec_list_end_to_created_page. */
-void
-PageBulk::finish()
+Dispatches to finishPage() for the format of this page, if needs_finish(). */
+inline void PageBulk::finish()
 {
-	ut_ad(!dict_index_is_spatial(m_index));
-
-	if (!needs_finish()) {
-		return;
-	}
-
-	ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no)
-	      <= page_get_free_space_of_empty(m_is_comp));
-#ifdef UNIV_DEBUG
-	/* See page_copy_rec_list_end_to_created_page() */
-	if (m_rec_no) {
-		page_dir_set_n_slots(m_page, NULL, srv_page_size / 2);
-	}
-	mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
-			ulint(m_heap_top - m_page));
-#endif
-
-	ulint	count = 0;
-	ulint	n_recs = 0;
-	ulint	slot_index = 0;
-	rec_t*	insert_rec = page_rec_get_next(page_get_infimum_rec(m_page));
-	page_dir_slot_t* slot = NULL;
-
-	/* Set owner & dir. */
-	while (!page_rec_is_supremum(insert_rec)) {
-		count++;
-		n_recs++;
-
-		if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) {
-
-			slot_index++;
-
-			slot = page_dir_get_nth_slot(m_page, slot_index);
-
-			page_dir_slot_set_rec(slot, insert_rec);
-			page_dir_slot_set_n_owned(slot, NULL, count);
-
-			count = 0;
-		}
-
-		insert_rec = page_rec_get_next(insert_rec);
-	}
-
-	if (slot_index > 0
-	    && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2
-		<= PAGE_DIR_SLOT_MAX_N_OWNED)) {
-		/* We can merge the two last dir slots. This operation is
-		here to make this function imitate exactly the equivalent
-		task made using page_cur_insert_rec, which we use in database
-		recovery to reproduce the task performed by this function.
-		To be able to check the correctness of recovery, it is good
-		that it imitates exactly. */
-
-		count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
-
-		page_dir_slot_set_n_owned(slot, NULL, 0);
-
-		slot_index--;
-	}
+  ut_ad(!m_index->is_spatial());
 
-	slot = page_dir_get_nth_slot(m_page, 1 + slot_index);
-	page_dir_slot_set_rec(slot, page_get_supremum_rec(m_page));
-	page_dir_slot_set_n_owned(slot, NULL, count + 1);
-
-	ut_ad(!page_get_instant(m_page));
-
-	if (!m_rec_no) {
-		/* Restore PAGE_DIRECTION_B from 0 to
-		PAGE_NO_DIRECTION like it should be on an empty page,
-		again without writing redo log. */
-		m_page[PAGE_HEADER + PAGE_DIRECTION_B] = PAGE_NO_DIRECTION;
-	} else if (!m_flush_observer && !m_page_zip) {
-		mlog_write_ulint(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
-				 2 + slot_index, MLOG_2BYTES, &m_mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
-				 ulint(m_heap_top - m_page),
-				 MLOG_2BYTES, &m_mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_N_HEAP + m_page,
-				 (PAGE_HEAP_NO_USER_LOW + m_rec_no)
-				 | ulint(m_is_comp) << 15,
-				 MLOG_2BYTES, &m_mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no,
-				 MLOG_2BYTES, &m_mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_LAST_INSERT + m_page,
-				 ulint(m_cur_rec - m_page),
-				 MLOG_2BYTES, &m_mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_DIRECTION_B - 1 + m_page,
-				 PAGE_RIGHT, MLOG_2BYTES, &m_mtr);
-		mlog_write_ulint(PAGE_HEADER + PAGE_N_DIRECTION + m_page, 0,
-				 MLOG_2BYTES, &m_mtr);
-	} else {
-		/* For ROW_FORMAT=COMPRESSED, redo log may be written
-		in PageBulk::compress(). */
-		mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
-				2 + slot_index);
-		mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
-				ulint(m_heap_top - m_page));
-		mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page,
-				(PAGE_HEAP_NO_USER_LOW + m_rec_no)
-				| ulint(m_is_comp) << 15);
-		mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
-		mach_write_to_2(PAGE_HEADER + PAGE_LAST_INSERT + m_page,
-				ulint(m_cur_rec - m_page));
-		mach_write_to_2(PAGE_HEADER + PAGE_DIRECTION_B - 1 + m_page,
-				PAGE_RIGHT);
-		mach_write_to_2(PAGE_HEADER + PAGE_N_DIRECTION + m_page, 0);
-	}
-
-	ut_ad(!needs_finish());
-	ut_ad(page_validate(m_page, m_index));
+  if (!needs_finish());
+  else if (UNIV_LIKELY_NULL(m_page_zip))
+    finishPage<COMPRESSED>();
+  else if (m_is_comp)
+    finishPage<DYNAMIC>();
+  else
+    finishPage<REDUNDANT>();
+
+  /* In MariaDB 10.2, 10.3, 10.4, we would initialize
+  PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT
+  in the same way as we would during normal INSERT operations.
+  Starting with MariaDB Server 10.5, bulk insert will not
+  touch those fields. */
+  ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]);
+  /* Restore the temporary change of PageBulk::init() that was necessary to
+  ensure that PageBulk::needs_finish() holds on an empty page. */
+  m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION;
+
+  ut_ad(!page_header_get_field(m_page, PAGE_FREE));
+  ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE));
+  ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT));
+  ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION));
+  ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <=
+        page_get_free_space_of_empty(m_is_comp));
+  ut_ad(!needs_finish());
+  ut_ad(page_validate(m_page, m_index));
 }
 
 /** Commit inserts done to the page
@@ -413,8 +571,7 @@ PageBulk::compress()
 {
 	ut_ad(m_page_zip != NULL);
 
-	return(page_zip_compress(m_page_zip, m_page, m_index,
-				 page_zip_level, &m_mtr));
+	return page_zip_compress(m_block, m_index, page_zip_level, &m_mtr);
 }
 
 /** Get node pointer
@@ -543,7 +700,10 @@ PageBulk::copyOut(
 
 	offsets = rec_get_offsets(rec, m_index, offsets, n_core,
 				  ULINT_UNDEFINED, &m_heap);
-	page_rec_set_next(rec, page_get_supremum_rec(m_page));
+	mach_write_to_2(rec - REC_NEXT, m_is_comp
+			? static_cast<uint16_t>
+			(PAGE_NEW_SUPREMUM - page_offset(rec))
+			: PAGE_OLD_SUPREMUM);
 
 	/* Set related members */
 	m_cur_rec = rec;
@@ -567,28 +727,24 @@ PageBulk::copyOut(
 @param[in]	next_page_no	next page no */
 inline void PageBulk::setNext(ulint next_page_no)
 {
-	if (UNIV_LIKELY_NULL(m_page_zip)) {
-		/* For ROW_FORMAT=COMPRESSED, redo log may be written
-		in PageBulk::compress(). */
-		mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no);
-	} else {
-		mlog_write_ulint(m_page + FIL_PAGE_NEXT, next_page_no,
-				 MLOG_4BYTES, &m_mtr);
-	}
+  if (UNIV_LIKELY_NULL(m_page_zip))
+    /* For ROW_FORMAT=COMPRESSED, write FIL_PAGE_NEXT directly, bypassing
+    m_mtr; redo log may be written in PageBulk::compress(). */
+    mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no);
+  else
+    m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no);
 }
 
 /** Set previous page
 @param[in]	prev_page_no	previous page no */
 inline void PageBulk::setPrev(ulint prev_page_no)
 {
-	if (UNIV_LIKELY_NULL(m_page_zip)) {
-		/* For ROW_FORMAT=COMPRESSED, redo log may be written
-		in PageBulk::compress(). */
-		mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no);
-	} else {
-		mlog_write_ulint(m_page + FIL_PAGE_PREV, prev_page_no,
-				 MLOG_4BYTES, &m_mtr);
-	}
+  if (UNIV_LIKELY_NULL(m_page_zip))
+    /* For ROW_FORMAT=COMPRESSED, write FIL_PAGE_PREV directly, bypassing
+    m_mtr; redo log may be written in PageBulk::compress(). */
+    mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no);
+  else
+    m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no);
 }
 
 /** Check if required space is available in the page for the rec to be inserted.
@@ -696,15 +852,9 @@ dberr_t
 PageBulk::latch()
 {
 	m_mtr.start();
+	m_index->set_modified(m_mtr);
 
-	if (m_flush_observer) {
-		m_mtr.set_log_mode(MTR_LOG_NO_REDO);
-		m_mtr.set_flush_observer(m_flush_observer);
-	} else {
-		m_index->set_modified(m_mtr);
-	}
-
-	ut_ad(m_block->page.buf_fix_count);
+	ut_ad(m_block->page.buf_fix_count());
 
 	/* In case the block is S-latched by page_cleaner. */
 	if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock,
@@ -724,7 +874,7 @@ PageBulk::latch()
 
 	buf_block_buf_fix_dec(m_block);
 
-	ut_ad(m_block->page.buf_fix_count);
+	ut_ad(m_block->page.buf_fix_count());
 
 	ut_ad(m_cur_rec > m_page && m_cur_rec < m_heap_top);
 
@@ -748,7 +898,7 @@ BtrBulk::pageSplit(
 
 	/* Initialize a new page */
 	PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL,
-			       page_bulk->getLevel(), m_flush_observer);
+			       page_bulk->getLevel());
 	dberr_t	err = new_page_bulk.init();
 	if (err != DB_SUCCESS) {
 		return(err);
@@ -798,9 +948,10 @@ BtrBulk::pageCommit(
 		page_bulk->setNext(next_page_bulk->getPageNo());
 		next_page_bulk->setPrev(page_bulk->getPageNo());
 	} else {
-		/** Suppose a page is released and latched again, we need to
+		ut_ad(!page_has_next(page_bulk->getPage()));
+		/* If a page is released and latched again, we need to
 		mark it modified in mini-transaction.  */
-		page_bulk->setNext(FIL_NULL);
+		page_bulk->set_modified();
 	}
 
 	ut_ad(!rw_lock_own_flagged(&m_index->lock,
@@ -831,10 +982,10 @@ BtrBulk::pageCommit(
 /** Log free check */
 inline void BtrBulk::logFreeCheck()
 {
-	if (log_sys.check_flush_or_checkpoint) {
+	if (log_sys.check_flush_or_checkpoint()) {
 		release();
 
-		log_free_check();
+		log_check_margins();
 
 		latch();
 	}
@@ -881,7 +1032,7 @@ BtrBulk::insert(
 	if (level + 1 > m_page_bulks.size()) {
 		PageBulk*	new_page_bulk
 			= UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL,
-						level, m_flush_observer));
+						level));
 		err = new_page_bulk->init();
 		if (err != DB_SUCCESS) {
 			UT_DELETE(new_page_bulk);
@@ -935,8 +1086,7 @@ BtrBulk::insert(
 		/* Create a sibling page_bulk. */
 		PageBulk*	sibling_page_bulk;
 		sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id,
-							  FIL_NULL, level,
-							  m_flush_observer));
+							  FIL_NULL, level));
 		err = sibling_page_bulk->init();
 		if (err != DB_SUCCESS) {
 			UT_DELETE(sibling_page_bulk);
@@ -961,21 +1111,13 @@ BtrBulk::insert(
 		/* Important: log_free_check whether we need a checkpoint. */
 		if (page_is_leaf(sibling_page_bulk->getPage())) {
 			if (trx_is_interrupted(m_trx)) {
-				if (m_flush_observer) {
-					m_flush_observer->interrupted();
-				}
-
 				err = DB_INTERRUPTED;
 				goto func_exit;
 			}
 
-			/* Wake up page cleaner to flush dirty pages. */
 			srv_inc_activity_count();
-			os_event_set(buf_flush_event);
-
 			logFreeCheck();
 		}
-
 	}
 
 	/* Convert tuple to rec. */
@@ -1022,7 +1164,7 @@ if no error occurs.
 dberr_t
 BtrBulk::finish(dberr_t	err)
 {
-	ulint		last_page_no = FIL_NULL;
+	uint32_t last_page_no = FIL_NULL;
 
 	ut_ad(!m_index->table->is_temporary());
 
@@ -1057,18 +1199,15 @@ BtrBulk::finish(dberr_t	err)
 		mtr_t		mtr;
 		buf_block_t*	last_block;
 		PageBulk	root_page_bulk(m_index, m_trx->id,
-					       m_index->page, m_root_level,
-					       m_flush_observer);
+					       m_index->page, m_root_level);
 
 		mtr.start();
 		m_index->set_modified(mtr);
 		mtr_x_lock_index(m_index, &mtr);
 
 		ut_ad(last_page_no != FIL_NULL);
-		last_block = btr_block_get(
-			page_id_t(m_index->table->space_id, last_page_no),
-			m_index->table->space->zip_size(),
-			RW_X_LATCH, m_index, &mtr);
+		last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH,
+					   false, &mtr);
 		first_rec = page_rec_get_next(
 			page_get_infimum_rec(last_block->frame));
 		ut_ad(page_rec_is_user_rec(first_rec));
@@ -1085,9 +1224,6 @@ BtrBulk::finish(dberr_t	err)
 		/* Remove last page. */
 		btr_page_free(m_index, last_block, &mtr);
 
-		/* Do not flush the last page. */
-		last_block->page.flush_observer = NULL;
-
 		mtr.commit();
 
 		err = pageCommit(&root_page_bulk, NULL, false);
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index abb633b51b4..4a4ef0deb9a 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -157,8 +157,7 @@ static
 void
 btr_cur_unmark_extern_fields(
 /*=========================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in/out: index page */
 	rec_t*		rec,	/*!< in/out: record in a clustered index */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
@@ -184,8 +183,7 @@ btr_rec_free_updated_extern_fields(
 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
 				X-latched */
 	rec_t*		rec,	/*!< in: record */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in: index page of rec */
 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	const upd_t*	update,	/*!< in: update vector */
 	bool		rollback,/*!< in: performing rollback? */
@@ -201,8 +199,7 @@ btr_rec_free_externally_stored_fields(
 				tree MUST be X-latched */
 	rec_t*		rec,	/*!< in: record */
 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in: index page of rec */
 	bool		rollback,/*!< in: performing rollback? */
 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
 				an X-latch to record page and to the index
@@ -212,8 +209,6 @@ btr_rec_free_externally_stored_fields(
 
 /** Latches the leaf page or pages requested.
 @param[in]	block		leaf page where the search converged
-@param[in]	page_id		page id of the leaf
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	latch_mode	BTR_SEARCH_LEAF, ...
 @param[in]	cursor		cursor
 @param[in]	mtr		mini-transaction
@@ -221,26 +216,24 @@ btr_rec_free_externally_stored_fields(
 btr_latch_leaves_t
 btr_cur_latch_leaves(
 	buf_block_t*		block,
-	const page_id_t		page_id,
-	ulint			zip_size,
 	ulint			latch_mode,
 	btr_cur_t*		cursor,
 	mtr_t*			mtr)
 {
-	ulint		mode;
-	ulint		left_page_no;
-	ulint		right_page_no;
+	rw_lock_type_t	mode;
+	uint32_t	left_page_no;
+	uint32_t	right_page_no;
 	buf_block_t*	get_block;
-	page_t*		page = buf_block_get_frame(block);
 	bool		spatial;
 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
 
 	compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
 	compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
 	compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
+	ut_ad(block->page.id().space() == cursor->index->table->space->id);
 
 	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
-	ut_ad(buf_page_in_file(&block->page));
+	ut_ad(block->page.in_file());
 
 	switch (latch_mode) {
 	case BTR_SEARCH_LEAF:
@@ -253,11 +246,13 @@ btr_cur_latch_leaves(
 
 		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
-		get_block = btr_block_get(page_id, zip_size, mode,
-					  cursor->index, mtr);
+		get_block = btr_block_get(*cursor->index,
+					  block->page.id().page_no(), mode,
+					  true, mtr);
 		latch_leaves.blocks[1] = get_block;
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+		ut_a(page_is_comp(get_block->frame)
+		     == page_is_comp(block->frame));
 #endif /* UNIV_BTR_DEBUG */
 		if (spatial) {
 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
@@ -268,13 +263,11 @@ btr_cur_latch_leaves(
 	case BTR_MODIFY_TREE:
 		/* It is exclusive for other operations which calls
 		btr_page_set_prev() */
-		ut_ad(mtr_memo_contains_flagged(
-			      mtr,
-			      dict_index_get_lock(cursor->index),
-			      MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+		ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+						 MTR_MEMO_X_LOCK
+						 | MTR_MEMO_SX_LOCK));
 		/* x-latch also siblings from left to right */
-		left_page_no = btr_page_get_prev(page);
-		mode = latch_mode;
+		left_page_no = btr_page_get_prev(block->frame);
 
 		if (left_page_no != FIL_NULL) {
 
@@ -285,8 +278,8 @@ btr_cur_latch_leaves(
 
 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
 			get_block = btr_block_get(
-				page_id_t(page_id.space(), left_page_no),
-				zip_size, RW_X_LATCH, cursor->index, mtr);
+				*cursor->index, left_page_no, RW_X_LATCH,
+				true, mtr);
 			latch_leaves.blocks[0] = get_block;
 
 			if (spatial) {
@@ -302,18 +295,20 @@ btr_cur_latch_leaves(
 
 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
 		get_block = btr_block_get(
-			page_id, zip_size, RW_X_LATCH, cursor->index, mtr);
+			*cursor->index, block->page.id().page_no(),
+			RW_X_LATCH, true, mtr);
 		latch_leaves.blocks[1] = get_block;
 
 #ifdef UNIV_BTR_DEBUG
 		/* Sanity check only after both the blocks are latched. */
 		if (latch_leaves.blocks[0] != NULL) {
 			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
-			     == page_is_comp(page));
+			     == page_is_comp(block->frame));
 			ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame)
-			     == page_get_page_no(page));
+			     == block->page.id().page_no());
 		}
-		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+		ut_a(page_is_comp(get_block->frame)
+		     == page_is_comp(block->frame));
 #endif /* UNIV_BTR_DEBUG */
 
 		if (spatial) {
@@ -321,7 +316,7 @@ btr_cur_latch_leaves(
 				= get_block;
 		}
 
-		right_page_no = btr_page_get_next(page);
+		right_page_no = btr_page_get_next(block->frame);
 
 		if (right_page_no != FIL_NULL) {
 			if (spatial) {
@@ -330,16 +325,16 @@ btr_cur_latch_leaves(
 								mtr);
 			}
 			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
-			get_block = btr_block_get(
-				page_id_t(page_id.space(), right_page_no),
-				zip_size, RW_X_LATCH, cursor->index, mtr);
+			get_block = btr_block_get(*cursor->index,
+						  right_page_no, RW_X_LATCH,
+						  true, mtr);
 			latch_leaves.blocks[2] = get_block;
 #ifdef UNIV_BTR_DEBUG
 			if (get_block) {
 				ut_a(page_is_comp(get_block->frame)
-				     == page_is_comp(page));
+				     == page_is_comp(block->frame));
 				ut_a(btr_page_get_prev(get_block->frame)
-				     == page_get_page_no(page));
+				     == block->page.id().page_no());
 			}
 #endif /* UNIV_BTR_DEBUG */
 			if (spatial) {
@@ -355,30 +350,32 @@ btr_cur_latch_leaves(
 		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
 		/* latch also left sibling */
 		rw_lock_s_lock(&block->lock);
-		left_page_no = btr_page_get_prev(page);
+		left_page_no = btr_page_get_prev(block->frame);
 		rw_lock_s_unlock(&block->lock);
 
 		if (left_page_no != FIL_NULL) {
 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
 			get_block = btr_block_get(
-				page_id_t(page_id.space(), left_page_no),
-				zip_size, mode, cursor->index, mtr);
+				*cursor->index, left_page_no, mode,
+				true, mtr);
 			latch_leaves.blocks[0] = get_block;
 			cursor->left_block = get_block;
 #ifdef UNIV_BTR_DEBUG
 			ut_a(page_is_comp(get_block->frame)
-			     == page_is_comp(page));
+			     == page_is_comp(block->frame));
 			ut_a(btr_page_get_next(get_block->frame)
-			     == page_get_page_no(page));
+			     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
 		}
 
 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
-		get_block = btr_block_get(page_id, zip_size, mode,
-					  cursor->index, mtr);
+		get_block = btr_block_get(*cursor->index,
+					  block->page.id().page_no(), mode,
+					  true, mtr);
 		latch_leaves.blocks[1] = get_block;
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+		ut_a(page_is_comp(get_block->frame)
+		     == page_is_comp(block->frame));
 #endif /* UNIV_BTR_DEBUG */
 		return(latch_leaves);
 	case BTR_CONT_MODIFY_TREE:
@@ -699,15 +696,15 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
 
 	switch (fil_page_get_type(page)) {
 	default:
-		ut_ad(!"wrong page type");
+		ut_ad("wrong page type" == 0);
 		return true;
 	case FIL_PAGE_INDEX:
 		/* The field PAGE_INSTANT is guaranteed 0 on clustered
 		index root pages of ROW_FORMAT=COMPACT or
 		ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
 		ut_ad(!page_is_comp(page) || !page_get_instant(page));
-		index->n_core_null_bytes = UT_BITS_IN_BYTES(
-			unsigned(index->n_nullable));
+		index->n_core_null_bytes = static_cast<uint8_t>(
+			UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
 		return false;
 	case FIL_PAGE_TYPE_INSTANT:
 		break;
@@ -726,7 +723,7 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
 		return true;
 	}
 
-	index->n_core_fields = n;
+	index->n_core_fields = n & dict_index_t::MAX_N_FIELDS;
 
 	const rec_t* infimum = page_get_infimum_rec(page);
 	const rec_t* supremum = page_get_supremum_rec(page);
@@ -742,8 +739,8 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
 
 		ut_ad(!index->is_dummy);
 		ut_d(index->is_dummy = true);
-		index->n_core_null_bytes = UT_BITS_IN_BYTES(
-			index->get_n_nullable(n));
+		index->n_core_null_bytes = static_cast<uint8_t>(
+			UT_BITS_IN_BYTES(index->get_n_nullable(n)));
 		ut_d(index->is_dummy = false);
 		return false;
 	}
@@ -779,33 +776,31 @@ btr_cur_optimistic_latch_leaves(
 	unsigned	line,
 	mtr_t*		mtr)
 {
-	ulint		mode;
-	ulint		left_page_no;
-	ulint		curr_page_no;
-	ut_ad(block->page.buf_fix_count);
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.buf_fix_count());
+	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
 
 	switch (*latch_mode) {
+	default:
+		ut_error;
+		return(false);
 	case BTR_SEARCH_LEAF:
 	case BTR_MODIFY_LEAF:
 		return(buf_page_optimistic_get(*latch_mode, block,
 				modify_clock, file, line, mtr));
 	case BTR_SEARCH_PREV:
 	case BTR_MODIFY_PREV:
-		mode = *latch_mode == BTR_SEARCH_PREV
-			? RW_S_LATCH : RW_X_LATCH;
-
 		rw_lock_s_lock(&block->lock);
 		if (block->modify_clock != modify_clock) {
 			rw_lock_s_unlock(&block->lock);
 			return false;
 		}
-
-		curr_page_no = block->page.id.page_no();
-		left_page_no = btr_page_get_prev(
-			buf_block_get_frame(block));
+		const uint32_t curr_page_no = block->page.id().page_no();
+		const uint32_t left_page_no = btr_page_get_prev(block->frame);
 		rw_lock_s_unlock(&block->lock);
 
+		const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV
+			? RW_S_LATCH : RW_X_LATCH;
+
 		if (left_page_no != FIL_NULL) {
 			dberr_t	err = DB_SUCCESS;
 			cursor->left_block = buf_page_get_gen(
@@ -815,11 +810,13 @@ btr_cur_optimistic_latch_leaves(
 				mode, nullptr, BUF_GET_POSSIBLY_FREED,
 				__FILE__, __LINE__, mtr, &err);
 
-			if (err == DB_DECRYPTION_FAILED) {
+			if (!cursor->left_block) {
 				cursor->index->table->file_unreadable = true;
 			}
 
-			if (btr_page_get_next(cursor->left_block->frame)
+			if (cursor->left_block->page.status
+			    == buf_page_t::FREED
+			    || btr_page_get_next(cursor->left_block->frame)
 			    != curr_page_no) {
 				/* release the left block */
 				btr_leaf_page_release(
@@ -832,13 +829,12 @@ btr_cur_optimistic_latch_leaves(
 
 		if (buf_page_optimistic_get(mode, block, modify_clock,
 					    file, line, mtr)) {
-			if (btr_page_get_prev(buf_block_get_frame(block))
-			    == left_page_no) {
+			if (btr_page_get_prev(block->frame) == left_page_no) {
 				/* block was already buffer-fixed while
 				entering the function and
 				buf_page_optimistic_get() buffer-fixes
 				it again. */
-				ut_ad(2 <= block->page.buf_fix_count);
+				ut_ad(2 <= block->page.buf_fix_count());
 				*latch_mode = mode;
 				return(true);
 			} else {
@@ -849,19 +845,15 @@ btr_cur_optimistic_latch_leaves(
 			}
 		}
 
-		ut_ad(block->page.buf_fix_count);
+		ut_ad(block->page.buf_fix_count());
 		/* release the left block */
 		if (cursor->left_block != NULL) {
 			btr_leaf_page_release(cursor->left_block,
 					      mode, mtr);
 		}
-
-		return(false);
-
-	default:
-		ut_error;
-		return(false);
 	}
+
+	return false;
 }
 
 /**
@@ -945,8 +937,8 @@ btr_cur_will_modify_tree(
 	mtr_t*		mtr)
 {
 	ut_ad(!page_is_leaf(page));
-	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
 
 	/* Pessimistic delete of the first record causes delete & insert
 	of node_ptr at upper level. And a subsequent page shrink is
@@ -1278,7 +1270,6 @@ btr_cur_search_to_nth_level_func(
 	ulint		up_bytes;
 	ulint		low_match;
 	ulint		low_bytes;
-	ulint		savepoint;
 	ulint		rw_latch;
 	page_cur_mode_t	page_mode;
 	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
@@ -1290,7 +1281,6 @@ btr_cur_search_to_nth_level_func(
 	ulint		root_height = 0; /* remove warning */
 	dberr_t		err = DB_SUCCESS;
 
-	ulint		upper_rw_latch, root_leaf_rw_latch;
 	btr_intention_t	lock_intention;
 	bool		modify_external;
 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
@@ -1348,10 +1338,8 @@ btr_cur_search_to_nth_level_func(
 
 	ut_ad(!s_latch_by_caller
 	      || srv_read_only_mode
-	      || mtr_memo_contains_flagged(mtr,
-					   dict_index_get_lock(index),
-					   MTR_MEMO_S_LOCK
-					   | MTR_MEMO_SX_LOCK));
+	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
+					    | MTR_MEMO_SX_LOCK));
 
 	/* These flags are mutually exclusive, they are lumped together
 	with the latch mode for historical reasons. It's possible for
@@ -1475,7 +1463,9 @@ btr_cur_search_to_nth_level_func(
 	/* Store the position of the tree latch we push to mtr so that we
 	know how to release it when we have latched leaf node(s) */
 
-	savepoint = mtr_set_savepoint(mtr);
+	ulint savepoint = mtr_set_savepoint(mtr);
+
+	rw_lock_type_t upper_rw_latch;
 
 	switch (latch_mode) {
 	case BTR_MODIFY_TREE:
@@ -1484,7 +1474,7 @@ btr_cur_search_to_nth_level_func(
 		for them, when the history list is glowing huge. */
 		if (lock_intention == BTR_INTENTION_DELETE
 		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
-			&& buf_get_n_pending_read_ios()) {
+		    && buf_pool.n_pend_reads) {
 x_latch_index:
 			mtr_x_lock_index(index, mtr);
 		} else if (index->is_spatial()
@@ -1502,10 +1492,9 @@ x_latch_index:
 	case BTR_CONT_SEARCH_TREE:
 		/* Do nothing */
 		ut_ad(srv_read_only_mode
-		      || mtr_memo_contains_flagged(mtr,
-						   dict_index_get_lock(index),
-						   MTR_MEMO_X_LOCK
-						   | MTR_MEMO_SX_LOCK));
+		      || mtr->memo_contains_flagged(&index->lock,
+						    MTR_MEMO_X_LOCK
+						    | MTR_MEMO_SX_LOCK));
 		if (dict_index_is_spatial(index)
 		    && latch_mode == BTR_CONT_MODIFY_TREE) {
 			/* If we are about to locating parent page for split
@@ -1536,7 +1525,8 @@ x_latch_index:
 			upper_rw_latch = RW_NO_LATCH;
 		}
 	}
-	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
+	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
+		latch_mode);
 
 	page_cursor = btr_cur_get_page_cur(cursor);
 
@@ -1624,7 +1614,8 @@ retry_page_get:
 	ut_ad(n_blocks < BTR_MAX_LEVELS);
 	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
 	block = buf_page_get_gen(page_id, zip_size, rw_latch, guess,
-				 buf_mode, file, line, mtr, &err);
+				 buf_mode, file, line, mtr, &err,
+				 height == 0 && !index->is_clust());
 	tree_blocks[n_blocks] = block;
 
 	/* Note that block==NULL signifies either an error or change
@@ -1699,11 +1690,11 @@ retry_page_get:
 				cursor->flag = BTR_CUR_DELETE_IBUF;
 			} else {
 				/* The purge could not be buffered. */
-				buf_pool_watch_unset(page_id);
+				buf_pool.watch_unset(page_id);
 				break;
 			}
 
-			buf_pool_watch_unset(page_id);
+			buf_pool.watch_unset(page_id);
 			goto func_exit;
 
 		default:
@@ -1720,7 +1711,7 @@ retry_page_get:
 
 	if (retrying_for_search_prev && height != 0) {
 		/* also latch left sibling */
-		ulint		left_page_no;
+		uint32_t	left_page_no;
 		buf_block_t*	get_block;
 
 		ut_ad(rw_latch == RW_NO_LATCH);
@@ -1768,8 +1759,9 @@ retry_page_get:
 			tree_blocks[n_blocks]);
 
 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-		block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL,
-					 buf_mode, file, line, mtr, &err);
+		block = buf_page_get_gen(page_id, zip_size,
+					 rw_latch, NULL, buf_mode,
+					 file, line, mtr, &err);
 		tree_blocks[n_blocks] = block;
 
 		if (err != DB_SUCCESS) {
@@ -1854,10 +1846,8 @@ retry_page_get:
 
 	if (height == 0) {
 		if (rw_latch == RW_NO_LATCH) {
-
 			latch_leaves = btr_cur_latch_leaves(
-				block, page_id, zip_size, latch_mode,
-				cursor, mtr);
+				block, latch_mode, cursor, mtr);
 		}
 
 		switch (latch_mode) {
@@ -1896,10 +1886,10 @@ retry_page_get:
 				if (n_releases == 0
 				    && (modify_external || autoinc)) {
 					/* keep the root page latch */
-					ut_ad(mtr_memo_contains_flagged(
-						mtr, tree_blocks[n_releases],
-						MTR_MEMO_PAGE_SX_FIX
-						| MTR_MEMO_PAGE_X_FIX));
+					ut_ad(mtr->memo_contains_flagged(
+						      tree_blocks[n_releases],
+						      MTR_MEMO_PAGE_SX_FIX
+						      | MTR_MEMO_PAGE_X_FIX));
 					continue;
 				}
 
@@ -2109,9 +2099,8 @@ need_opposite_intention:
 
 				if (latch_mode == BTR_MODIFY_TREE
 				    && rw_latch == RW_NO_LATCH) {
-					ut_ad(mtr_memo_contains_flagged(
-						mtr, dict_index_get_lock(index),
-						MTR_MEMO_X_LOCK
+					ut_ad(mtr->memo_contains_flagged(
+						&index->lock, MTR_MEMO_X_LOCK
 						| MTR_MEMO_SX_LOCK));
 					rw_lock_s_lock(&block->lock);
 					add_latch = true;
@@ -2160,7 +2149,7 @@ need_opposite_intention:
 
 		/* If the first or the last record of the page
 		or the same key value to the first record or last record,
-		the another page might be choosen when BTR_CONT_MODIFY_TREE.
+		the another page might be chosen when BTR_CONT_MODIFY_TREE.
 		So, the parent page should not released to avoiding deadlock
 		with blocking the another search with the same key value. */
 		if (!detected_same_key_root
@@ -2267,9 +2256,9 @@ need_opposite_intention:
 		    && !retrying_for_search_prev) {
 			/* block should be latched for consistent
 			   btr_page_get_prev() */
-			ut_ad(mtr_memo_contains_flagged(mtr, block,
-				MTR_MEMO_PAGE_S_FIX
-				| MTR_MEMO_PAGE_X_FIX));
+			ut_ad(mtr->memo_contains_flagged(
+				      block, MTR_MEMO_PAGE_S_FIX
+				      | MTR_MEMO_PAGE_X_FIX));
 
 			if (page_has_prev(page)
 			    && page_rec_is_first(node_ptr, page)) {
@@ -2299,7 +2288,7 @@ need_opposite_intention:
 					- (leftmost_from_level - 1);
 
 				page_id.set_page_no(
-					tree_blocks[idx]->page.id.page_no());
+					tree_blocks[idx]->page.id().page_no());
 
 				for (ulint i = n_blocks
 					       - (leftmost_from_level - 1);
@@ -2413,32 +2402,24 @@ need_opposite_intention:
 		ut_ad(!autoinc);
 
 		if (upper_rw_latch == RW_NO_LATCH) {
-			/* latch the page */
-			buf_block_t*	child_block;
-
-			if (latch_mode == BTR_CONT_MODIFY_TREE) {
-				child_block = btr_block_get(
-					page_id, zip_size, RW_X_LATCH,
-					index, mtr);
-			} else {
-				ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
-				child_block = btr_block_get(
-					page_id, zip_size, RW_SX_LATCH,
-					index, mtr);
-			}
-
+			ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
+			      || latch_mode == BTR_CONT_SEARCH_TREE);
+			buf_block_t* child_block = btr_block_get(
+				*index, page_id.page_no(),
+				latch_mode == BTR_CONT_MODIFY_TREE
+				? RW_X_LATCH : RW_SX_LATCH, false, mtr);
 			btr_assert_not_corrupted(child_block, index);
 		} else {
-			ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
+			ut_ad(mtr->memo_contains_flagged(block,
+							 upper_rw_latch));
 			btr_assert_not_corrupted(block, index);
 
 			if (s_latch_by_caller) {
 				ut_ad(latch_mode == BTR_SEARCH_TREE);
 				/* to exclude modifying tree operations
 				should sx-latch the index. */
-				ut_ad(mtr_memo_contains(
-					mtr, dict_index_get_lock(index),
-					MTR_MEMO_SX_LOCK));
+				ut_ad(mtr->memo_contains(index->lock,
+							 MTR_MEMO_SX_LOCK));
 				/* because has sx-latch of index,
 				can release upper blocks. */
 				for (; n_releases < n_blocks; n_releases++) {
@@ -2461,8 +2442,7 @@ need_opposite_intention:
 		cursor->up_bytes = up_bytes;
 
 		if (autoinc) {
-			page_set_autoinc(tree_blocks[0],
-					 index, autoinc, mtr, false);
+			page_set_autoinc(tree_blocks[0], autoinc, mtr, false);
 		}
 
 #ifdef BTR_CUR_HASH_ADAPT
@@ -2558,8 +2538,6 @@ btr_cur_open_at_index_side_func(
 	ulint		root_height = 0; /* remove warning */
 	rec_t*		node_ptr;
 	ulint		estimate;
-	ulint		savepoint;
-	ulint		upper_rw_latch, root_leaf_rw_latch;
 	btr_intention_t	lock_intention;
 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
 	ulint		tree_savepoints[BTR_MAX_LEVELS];
@@ -2596,7 +2574,9 @@ btr_cur_open_at_index_side_func(
 	/* Store the position of the tree latch we push to mtr so that we
 	know how to release it when we have latched the leaf node */
 
-	savepoint = mtr_set_savepoint(mtr);
+	ulint savepoint = mtr_set_savepoint(mtr);
+
+	rw_lock_type_t upper_rw_latch;
 
 	switch (latch_mode) {
 	case BTR_CONT_MODIFY_TREE:
@@ -2609,7 +2589,7 @@ btr_cur_open_at_index_side_func(
 		for them, when the history list is glowing huge. */
 		if (lock_intention == BTR_INTENTION_DELETE
 		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
-		    && buf_get_n_pending_read_ios()) {
+		    && buf_pool.n_pend_reads) {
 			mtr_x_lock_index(index, mtr);
 		} else {
 			mtr_sx_lock_index(index, mtr);
@@ -2618,10 +2598,9 @@ btr_cur_open_at_index_side_func(
 		break;
 	default:
 		ut_ad(!s_latch_by_caller
-		      || mtr_memo_contains_flagged(mtr,
-						 dict_index_get_lock(index),
-						 MTR_MEMO_SX_LOCK
-						 | MTR_MEMO_S_LOCK));
+		      || mtr->memo_contains_flagged(&index->lock,
+						    MTR_MEMO_SX_LOCK
+						    | MTR_MEMO_S_LOCK));
 		if (!srv_read_only_mode) {
 			if (!s_latch_by_caller) {
 				/* BTR_SEARCH_TREE is intended to be used with
@@ -2635,7 +2614,9 @@ btr_cur_open_at_index_side_func(
 			upper_rw_latch = RW_NO_LATCH;
 		}
 	}
-	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
+
+	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
+		latch_mode);
 
 	page_cursor = btr_cur_get_page_cur(cursor);
 	cursor->index = index;
@@ -2650,22 +2631,17 @@ btr_cur_open_at_index_side_func(
 	height = ULINT_UNDEFINED;
 
 	for (;;) {
-		buf_block_t*	block;
-		ulint		rw_latch;
-
 		ut_ad(n_blocks < BTR_MAX_LEVELS);
-
-		if (height != 0
-		    && (latch_mode != BTR_MODIFY_TREE
-			|| height == level)) {
-			rw_latch = upper_rw_latch;
-		} else {
-			rw_latch = RW_NO_LATCH;
-		}
-
 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-		block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL,
-					 BUF_GET, file, line, mtr, &err);
+
+		const ulint rw_latch = height
+			&& (latch_mode != BTR_MODIFY_TREE || height == level)
+			? upper_rw_latch : RW_NO_LATCH;
+		buf_block_t* block = buf_page_get_gen(page_id, zip_size,
+						      rw_latch, NULL, BUF_GET,
+						      file, line, mtr, &err,
+						      height == 0
+						      && !index->is_clust());
 		ut_ad((block != NULL) == (err == DB_SUCCESS));
 		tree_blocks[n_blocks] = block;
 
@@ -2717,77 +2693,62 @@ btr_cur_open_at_index_side_func(
 			ut_ad(height == btr_page_get_level(page));
 		}
 
-		if (height == level) {
-			if (srv_read_only_mode) {
-				btr_cur_latch_leaves(
-					block, page_id, zip_size,
-					latch_mode, cursor, mtr);
-			} else if (height == 0) {
-				if (rw_latch == RW_NO_LATCH) {
-					btr_cur_latch_leaves(
-						block, page_id, zip_size,
-						latch_mode, cursor, mtr);
-				}
-				/* In versions <= 3.23.52 we had
-				forgotten to release the tree latch
-				here. If in an index scan we had to
-				scan far to find a record visible to
-				the current transaction, that could
-				starve others waiting for the tree
-				latch. */
-
-				switch (latch_mode) {
-				case BTR_MODIFY_TREE:
-				case BTR_CONT_MODIFY_TREE:
-				case BTR_CONT_SEARCH_TREE:
-					break;
-				default:
-					if (!s_latch_by_caller) {
-						/* Release the tree s-latch */
-						mtr_release_s_latch_at_savepoint(
-							mtr, savepoint,
-							dict_index_get_lock(
-								index));
-					}
+		if (height == 0) {
+			if (rw_latch == RW_NO_LATCH) {
+				btr_cur_latch_leaves(block, latch_mode,
+						     cursor, mtr);
+			}
 
-					/* release upper blocks */
-					for (; n_releases < n_blocks;
-					     n_releases++) {
-						mtr_release_block_at_savepoint(
-							mtr,
-							tree_savepoints[
-								n_releases],
-							tree_blocks[
-								n_releases]);
-					}
+			/* In versions <= 3.23.52 we had forgotten to
+			release the tree latch here. If in an index
+			scan we had to scan far to find a record
+			visible to the current transaction, that could
+			starve others waiting for the tree latch. */
+
+			switch (latch_mode) {
+			case BTR_MODIFY_TREE:
+			case BTR_CONT_MODIFY_TREE:
+			case BTR_CONT_SEARCH_TREE:
+				break;
+			default:
+				if (UNIV_UNLIKELY(srv_read_only_mode)) {
+					break;
+				}
+				if (!s_latch_by_caller) {
+					/* Release the tree s-latch */
+					mtr_release_s_latch_at_savepoint(
+						mtr, savepoint, &index->lock);
 				}
-			} else { /* height != 0 */
-				/* We already have the block latched. */
-				ut_ad(latch_mode == BTR_SEARCH_TREE);
-				ut_ad(s_latch_by_caller);
-				ut_ad(upper_rw_latch == RW_S_LATCH);
 
-				ut_ad(mtr_memo_contains(mtr, block,
-							upper_rw_latch));
+				/* release upper blocks */
+				for (; n_releases < n_blocks; n_releases++) {
+					mtr_release_block_at_savepoint(
+						mtr,
+						tree_savepoints[n_releases],
+						tree_blocks[n_releases]);
+				}
+			}
+		} else if (height == level /* height != 0 */
+			   && UNIV_LIKELY(!srv_read_only_mode)) {
+			/* We already have the block latched. */
+			ut_ad(latch_mode == BTR_SEARCH_TREE);
+			ut_ad(s_latch_by_caller);
+			ut_ad(upper_rw_latch == RW_S_LATCH);
+			ut_ad(mtr->memo_contains_flagged(block,
+							 MTR_MEMO_PAGE_S_FIX));
 
-				if (s_latch_by_caller) {
-					/* to exclude modifying tree operations
-					should sx-latch the index. */
-					ut_ad(mtr_memo_contains(
+			if (s_latch_by_caller) {
+				/* to exclude modifying tree operations
+				should sx-latch the index. */
+				ut_ad(mtr->memo_contains(index->lock,
+							 MTR_MEMO_SX_LOCK));
+				/* because has sx-latch of index,
+				can release upper blocks. */
+				for (; n_releases < n_blocks; n_releases++) {
+					mtr_release_block_at_savepoint(
 						mtr,
-						dict_index_get_lock(index),
-						MTR_MEMO_SX_LOCK));
-					/* because has sx-latch of index,
-					can release upper blocks. */
-					for (; n_releases < n_blocks;
-					     n_releases++) {
-						mtr_release_block_at_savepoint(
-							mtr,
-							tree_savepoints[
-								n_releases],
-							tree_blocks[
-								n_releases]);
-					}
+						tree_savepoints[n_releases],
+						tree_blocks[n_releases]);
 				}
 			}
 		}
@@ -2927,8 +2888,6 @@ btr_cur_open_at_rnd_pos_func(
 	ulint		node_ptr_max_size = srv_page_size / 2;
 	ulint		height;
 	rec_t*		node_ptr;
-	ulint		savepoint;
-	ulint		upper_rw_latch, root_leaf_rw_latch;
 	btr_intention_t	lock_intention;
 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
 	ulint		tree_savepoints[BTR_MAX_LEVELS];
@@ -2945,7 +2904,9 @@ btr_cur_open_at_rnd_pos_func(
 
 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
 
-	savepoint = mtr_set_savepoint(mtr);
+	ulint savepoint = mtr_set_savepoint(mtr);
+
+	rw_lock_type_t upper_rw_latch;
 
 	switch (latch_mode) {
 	case BTR_MODIFY_TREE:
@@ -2954,7 +2915,7 @@ btr_cur_open_at_rnd_pos_func(
 		for them, when the history list is glowing huge. */
 		if (lock_intention == BTR_INTENTION_DELETE
 		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
-		    && buf_get_n_pending_read_ios()) {
+		    && buf_pool.n_pend_reads) {
 			mtr_x_lock_index(index, mtr);
 		} else {
 			mtr_sx_lock_index(index, mtr);
@@ -2992,7 +2953,8 @@ btr_cur_open_at_rnd_pos_func(
 		return(false);
 	}
 
-	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
+	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
+		latch_mode);
 
 	page_cursor = btr_cur_get_page_cur(cursor);
 	cursor->index = index;
@@ -3008,22 +2970,19 @@ btr_cur_open_at_rnd_pos_func(
 	height = ULINT_UNDEFINED;
 
 	for (;;) {
-		buf_block_t*	block;
 		page_t*		page;
-		ulint		rw_latch;
 
 		ut_ad(n_blocks < BTR_MAX_LEVELS);
-
-		if (height != 0
-		    && latch_mode != BTR_MODIFY_TREE) {
-			rw_latch = upper_rw_latch;
-		} else {
-			rw_latch = RW_NO_LATCH;
-		}
-
 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-		block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL,
-			BUF_GET, file, line, mtr, &err);
+
+		const rw_lock_type_t rw_latch = height
+			&& latch_mode != BTR_MODIFY_TREE
+			? upper_rw_latch : RW_NO_LATCH;
+		buf_block_t* block = buf_page_get_gen(page_id, zip_size,
+						      rw_latch, NULL, BUF_GET,
+						      file, line, mtr, &err,
+						      height == 0
+						      && !index->is_clust());
 		tree_blocks[n_blocks] = block;
 
 		ut_ad((block != NULL) == (err == DB_SUCCESS));
@@ -3074,9 +3033,8 @@ btr_cur_open_at_rnd_pos_func(
 		if (height == 0) {
 			if (rw_latch == RW_NO_LATCH
 			    || srv_read_only_mode) {
-				btr_cur_latch_leaves(
-					block, page_id, zip_size,
-					latch_mode, cursor, mtr);
+				btr_cur_latch_leaves(block, latch_mode, cursor,
+						     mtr);
 			}
 
 			/* btr_cur_open_at_index_side_func() and
@@ -3235,8 +3193,8 @@ btr_cur_insert_if_possible(
 
 	ut_ad(dtuple_check_typed(tuple));
 
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
-			       MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
 	page_cursor = btr_cur_get_page_cur(cursor);
 
 	/* Now, try the insert */
@@ -3369,33 +3327,30 @@ upd_sys:
 
 /**
 Prefetch siblings of the leaf for the pessimistic operation.
-@param block	leaf page */
-static
-void
-btr_cur_prefetch_siblings(
-	buf_block_t*	block)
+@param block	leaf page
+@param index    index of the page */
+static void btr_cur_prefetch_siblings(const buf_block_t *block,
+                                      const dict_index_t *index)
 {
-	page_t*	page = buf_block_get_frame(block);
+  ut_ad(page_is_leaf(block->frame));
 
-	ut_ad(page_is_leaf(page));
+  if (index->is_ibuf())
+    return;
 
-	ulint left_page_no = fil_page_get_prev(page);
-	ulint right_page_no = fil_page_get_next(page);
+  const page_t *page= block->frame;
+  uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+  uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
 
-	if (left_page_no != FIL_NULL) {
-		buf_read_page_background(
-			page_id_t(block->page.id.space(), left_page_no),
-			block->zip_size(), false);
-	}
-	if (right_page_no != FIL_NULL) {
-		buf_read_page_background(
-			page_id_t(block->page.id.space(), right_page_no),
-			block->zip_size(), false);
-	}
-	if (left_page_no != FIL_NULL
-	    || right_page_no != FIL_NULL) {
-		os_aio_simulated_wake_handler_threads();
-	}
+  fil_space_t *space= index->table->space;
+
+  if (prev == FIL_NULL);
+  else if (space->acquire())
+    buf_read_page_background(space, page_id_t(space->id, prev),
+                             block->zip_size());
+  if (next == FIL_NULL);
+  else if (space->acquire())
+    buf_read_page_background(space, page_id_t(space->id, next),
+                             block->zip_size());
 }
 
 /*************************************************************//**
@@ -3451,7 +3406,7 @@ btr_cur_optimistic_insert(
 	page = buf_block_get_frame(block);
 	index = cursor->index;
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(!dict_index_is_online_ddl(index)
 	      || dict_index_is_clust(index)
 	      || (flags & BTR_CREATE_FLAG));
@@ -3513,8 +3468,8 @@ fail:
 
 		/* prefetch siblings of the leaf for the pessimistic
 		operation, if the page is leaf. */
-		if (page_is_leaf(page) && !index->is_ibuf()) {
-			btr_cur_prefetch_siblings(block);
+		if (page_is_leaf(page)) {
+			btr_cur_prefetch_siblings(block, index);
 		}
 fail_err:
 
@@ -3660,7 +3615,7 @@ fail_err:
 		ut_ad(index->is_instant());
 		ut_ad(flags == BTR_NO_LOCKING_FLAG);
 	} else {
-		rw_lock_t* ahi_latch = btr_get_search_latch(index);
+		rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
 		if (!reorg && cursor->flag == BTR_CUR_HASH) {
 			btr_search_update_hash_node_on_insert(
 				cursor, ahi_latch);
@@ -3745,18 +3700,17 @@ btr_cur_pessimistic_insert(
 	dberr_t		err;
 	bool		inherit = false;
 	bool		success;
-	ulint		n_reserved	= 0;
+	uint32_t	n_reserved	= 0;
 
 	ut_ad(dtuple_check_typed(entry));
 	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
 
 	*big_rec = NULL;
 
-	ut_ad(mtr_memo_contains_flagged(
-		      mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
-		      MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
-			       MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
 	ut_ad(!dict_index_is_online_ddl(index)
 	      || dict_index_is_clust(index)
 	      || (flags & BTR_CREATE_FLAG));
@@ -3778,7 +3732,7 @@ btr_cur_pessimistic_insert(
 		of the index tree, so that the insert will not fail because
 		of lack of space */
 
-		ulint	n_extents = cursor->tree_height / 16 + 3;
+		uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
 
 		success = fsp_reserve_free_extents(&n_reserved,
 						   index->table->space,
@@ -3817,7 +3771,7 @@ btr_cur_pessimistic_insert(
 	}
 
 	if (dict_index_get_page(index)
-	    == btr_cur_get_block(cursor)->page.id.page_no()) {
+	    == btr_cur_get_block(cursor)->page.id().page_no()) {
 
 		/* The page is the root page */
 		*rec = btr_root_raise_and_insert(
@@ -3872,7 +3826,7 @@ btr_cur_pessimistic_insert(
 			ut_ad(!(flags & BTR_CREATE_FLAG));
 		} else {
 			btr_search_update_hash_on_insert(
-				cursor, btr_get_search_latch(index));
+				cursor, btr_search_sys.get_latch(*index));
 		}
 #endif /* BTR_CUR_HASH_ADAPT */
 		if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
@@ -3951,31 +3905,6 @@ btr_cur_upd_lock_and_undo(
 		       cmpl_info, rec, offsets, roll_ptr));
 }
 
-/** Copy DB_TRX_ID,DB_ROLL_PTR to the redo log.
-@param[in]	index	clustered index
-@param[in]	trx_id_t	DB_TRX_ID
-@param[in]	roll_ptr	DB_ROLL_PTR
-@param[in,out]	log_ptr		redo log buffer
-@return current end of the redo log buffer */
-static byte*
-btr_cur_log_sys(
-	const dict_index_t*	index,
-	trx_id_t		trx_id,
-	roll_ptr_t		roll_ptr,
-	byte*			log_ptr)
-{
-	log_ptr += mach_write_compressed(log_ptr, index->db_trx_id());
-	/* Yes, we are writing DB_ROLL_PTR,DB_TRX_ID in reverse order,
-	after emitting the position of DB_TRX_ID in the index.
-	This is how row_upd_write_sys_vals_to_log()
-	originally worked, and it is part of the redo log format. */
-	trx_write_roll_ptr(log_ptr, roll_ptr);
-	log_ptr += DATA_ROLL_PTR_LEN;
-	log_ptr += mach_u64_write_compressed(log_ptr, trx_id);
-
-	return log_ptr;
-}
-
 /** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
 @param[in,out]	entry		clustered index entry
 @param[in]	index		clustered index
@@ -3995,148 +3924,95 @@ static void btr_cur_write_sys(
 	trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
 }
 
-/***********************************************************//**
-Writes a redo log record of updating a record in-place. */
-void
-btr_cur_update_in_place_log(
-/*========================*/
-	ulint		flags,		/*!< in: flags */
-	const rec_t*	rec,		/*!< in: record */
-	dict_index_t*	index,		/*!< in: index of the record */
-	const upd_t*	update,		/*!< in: update vector */
-	trx_id_t	trx_id,		/*!< in: transaction id */
-	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
-	mtr_t*		mtr)		/*!< in: mtr */
+/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
+@param[in,out]  block           clustered index leaf page
+@param[in,out]  rec             clustered index record
+@param[in]      index           clustered index
+@param[in]      offsets         rec_get_offsets(rec, index)
+@param[in]      trx             transaction
+@param[in]      roll_ptr        DB_ROLL_PTR value
+@param[in,out]  mtr             mini-transaction */
+static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+                                dict_index_t *index, const rec_offs *offsets,
+                                const trx_t *trx, roll_ptr_t roll_ptr,
+                                mtr_t *mtr)
 {
-	byte*		log_ptr;
-	const page_t*	page	= page_align(rec);
-	ut_ad(flags < 256);
-	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
-
-	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
-					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
-					    : MLOG_REC_UPDATE_IN_PLACE,
-					    1 + DATA_ROLL_PTR_LEN + 14 + 2
-					    + MLOG_BUF_MARGIN);
-
-	if (!log_ptr) {
-		/* Logging in mtr is switched off during crash recovery */
-		return;
-	}
-
-	/* For secondary indexes, we could skip writing the dummy system fields
-	to the redo log but we have to change redo log parsing of
-	MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or we have to add
-	new redo log record. For now, just write dummy sys fields to the redo
-	log if we are updating a secondary index record.
-	*/
-	mach_write_to_1(log_ptr, flags);
-	log_ptr++;
-
-	if (dict_index_is_clust(index)) {
-		log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr);
-	} else {
-		/* Dummy system fields for a secondary index */
-		/* TRX_ID Position */
-		log_ptr += mach_write_compressed(log_ptr, 0);
-		/* ROLL_PTR */
-		trx_write_roll_ptr(log_ptr, 0);
-		log_ptr += DATA_ROLL_PTR_LEN;
-		/* TRX_ID */
-		log_ptr += mach_u64_write_compressed(log_ptr, 0);
-	}
-
-	mach_write_to_2(log_ptr, page_offset(rec));
-	log_ptr += 2;
-
-	row_upd_index_write_log(update, log_ptr, mtr);
-}
-
-/***********************************************************//**
-Parses a redo log record of updating a record in-place.
-@return end of log record or NULL */
-byte*
-btr_cur_parse_update_in_place(
-/*==========================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	page_t*		page,	/*!< in/out: page or NULL */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	dict_index_t*	index)	/*!< in: index corresponding to page */
-{
-	ulint		flags;
-	rec_t*		rec;
-	upd_t*		update;
-	ulint		pos;
-	trx_id_t	trx_id;
-	roll_ptr_t	roll_ptr;
-	ulint		rec_offset;
-	mem_heap_t*	heap;
-	rec_offs*	offsets;
-
-	if (end_ptr < ptr + 1) {
-
-		return(NULL);
-	}
-
-	flags = mach_read_from_1(ptr);
-	ptr++;
-
-	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
-
-	if (ptr == NULL) {
-
-		return(NULL);
-	}
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	rec_offset = mach_read_from_2(ptr);
-	ptr += 2;
-
-	ut_a(rec_offset <= srv_page_size);
-
-	heap = mem_heap_create(256);
-
-	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
-
-	if (!ptr || !page) {
-
-		goto func_exit;
-	}
-
-	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
-	rec = page + rec_offset;
-
-	/* We do not need to reserve search latch, as the page is only
-	being recovered, and there cannot be a hash index to it. */
-
-	/* The function rtr_update_mbr_field_in_place() is generating
-	these records on node pointer pages; therefore we have to
-	check if this is a leaf page. */
-
-	offsets = rec_get_offsets(rec, index, NULL,
-				  flags != (BTR_NO_UNDO_LOG_FLAG
-					    | BTR_NO_LOCKING_FLAG
-					    | BTR_KEEP_SYS_FLAG)
-				  || page_is_leaf(page)
-				  ? index->n_core_fields : 0,
-				  ULINT_UNDEFINED, &heap);
-
-	if (!(flags & BTR_KEEP_SYS_FLAG)) {
-		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
-						   pos, trx_id, roll_ptr);
-	}
-
-	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
-
-func_exit:
-	mem_heap_free(heap);
-
-	return(ptr);
+  ut_ad(index->is_primary());
+  ut_ad(rec_offs_validate(rec, index, offsets));
+
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+  { /* page.zip.data is set: delegate to the page_zip writer and return */
+    page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
+                                       trx->id, roll_ptr, mtr);
+    return;
+  }
+
+  ulint offset= index->trx_id_offset;
+
+  if (!offset) /* zero: compute the position from offsets instead */
+    offset= row_get_trx_id_offset(index, offsets);
+
+  compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+
+  /* During IMPORT the trx id in the record can be in the future, if
+  the .ibd file is being imported from another instance. During IMPORT
+  roll_ptr will be 0. */
+  ut_ad(roll_ptr == 0 ||
+        lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
+                                 rec, index, offsets));
+
+  byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; /* TRX_ID,ROLL_PTR image */
+
+  trx_write_trx_id(sys, trx->id);
+  trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
+
+  ulint d= 0; /* leading bytes of sys that are copied from prev */
+  const byte *src= nullptr;
+  byte *dest= rec + offset;
+  ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; /* bytes left to write */
+
+  if (UNIV_LIKELY(index->trx_id_offset))
+  {
+    const rec_t *prev= page_rec_get_prev_const(rec);
+    if (UNIV_UNLIKELY(prev == rec))
+      ut_ad(0);
+    else if (page_rec_is_infimum(prev)); /* nothing to compare: d stays 0 */
+    else
+      for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
+        if (src[d] != sys[d])
+          break; /* d = length of the common prefix of src and sys */
+    if (d > 6 && memcmp(dest, sys, d)) /* pays off, and dest differs */
+    {
+      /* We save space by replacing a single record
+
+      WRITE,page_offset(dest),byte[13]
+
+      with two records:
+
+      MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
+      WRITE|0x80,0,byte[13-d]
+
+      The single WRITE record would be x+13 bytes long, with x>2.
+      The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
+      second WRITE would be 1+1+13-d = 15-d bytes.
+
+      The total size is: x+13 versus x+4+15-d = x+19-d bytes.
+      To save space, we must have d>6, that is, the complete DB_TRX_ID and
+      the first byte(s) of DB_ROLL_PTR must match the previous record. */
+      memcpy(dest, src, d);
+      mtr->memmove(*block, page_offset(dest), page_offset(src), d);
+      dest+= d;
+      len-= d;
+      /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
+      DB_TRX_ID refers to an active transaction. */
+      ut_ad(len);
+    }
+    else
+      d= 0; /* no MEMMOVE: all of sys is written below */
+  }
+
+  if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
+    mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);
 }
 
 /*************************************************************//**
@@ -4226,6 +4102,143 @@ out_of_space:
 	return(false);
 }
 
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+@param[in,out]  rec     index record
+@param[in]      index   the index of the record
+@param[in]      offsets rec_get_offsets(rec, index)
+@param[in]      update  update vector
+@param[in,out]  block   index page
+@param[in,out]  mtr     mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+                              const rec_offs *offsets, const upd_t *update,
+                              buf_block_t *block, mtr_t *mtr)
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!index->table->skip_alter_undo);
+	ut_ad(!block->page.zip.data || index->table->not_redundant());
+
+#ifdef UNIV_DEBUG
+	if (rec_offs_comp(offsets)) {
+		switch (rec_get_status(rec)) {
+		case REC_STATUS_ORDINARY:
+			break;
+		case REC_STATUS_INSTANT:
+			ut_ad(index->is_instant());
+			break;
+		case REC_STATUS_NODE_PTR:
+		case REC_STATUS_INFIMUM:
+		case REC_STATUS_SUPREMUM:
+			ut_ad("wrong record status in update" == 0);
+		}
+	}
+#endif /* UNIV_DEBUG */
+	/* The info bits are unshifted, so they can be masked directly. */
+	static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility");
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+		ut_ad(rec_offs_comp(offsets));
+		byte* info_bits = &rec[-REC_NEW_INFO_BITS];
+		const bool flip_del_mark = (*info_bits ^ update->info_bits)
+			& REC_INFO_DELETED_FLAG;
+		*info_bits &= byte(~REC_INFO_BITS_MASK);
+		*info_bits |= update->info_bits;
+		/* page_zip is only told about a changed delete-mark. */
+		if (flip_del_mark) {
+			page_zip_rec_set_deleted(block, rec, update->info_bits
+						 & REC_INFO_DELETED_FLAG, mtr);
+		}
+	} else {
+		byte* info_bits = &rec[rec_offs_comp(offsets)
+				       ? -REC_NEW_INFO_BITS
+				       : -REC_OLD_INFO_BITS];
+
+		mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits,
+					       (*info_bits
+						& ~REC_INFO_BITS_MASK)
+					       | update->info_bits);
+	}
+	/* Apply each field of the update vector in turn. */
+	for (ulint i = 0; i < update->n_fields; i++) {
+		const upd_field_t* uf = upd_get_nth_field(update, i);
+		if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) {
+			continue;
+		}
+		const ulint n = uf->field_no;
+
+		ut_ad(!dfield_is_ext(&uf->new_val)
+		      == !rec_offs_nth_extern(offsets, n));
+		ut_ad(!rec_offs_nth_default(offsets, n));
+
+		if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+			if (rec_offs_nth_sql_null(offsets, n)) {
+				ut_ad(index->table->is_instant());
+				ut_ad(n >= index->n_core_fields);
+				continue; /* already NULL: nothing to write */
+			}
+			/* Zero-fill the field, then set its SQL NULL flag. */
+			ut_ad(!index->table->not_redundant());
+			switch (ulint size = rec_get_nth_field_size(rec, n)) {
+			case 0:
+				break;
+			case 1:
+				mtr->write<1,mtr_t::MAYBE_NOP>(
+					*block,
+					rec_get_field_start_offs(rec, n) + rec,
+					0U);
+				break;
+			default:
+				mtr->memset(
+					block,
+					page_offset(rec_get_field_start_offs(
+							    rec, n) + rec),
+					size, 0);
+			}
+			ulint l = rec_get_1byte_offs_flag(rec)
+				? (n + 1) : (n + 1) * 2;
+			byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
+			compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+					    == REC_2BYTE_SQL_NULL_MASK);
+			mtr->write<1>(*block, b,
+				      byte(*b | REC_1BYTE_SQL_NULL_MASK));
+			continue;
+		}
+		/* Non-NULL value: overwrite the field in place. */
+		ulint len;
+		byte* data = rec_get_nth_field(rec, offsets, n, &len);
+		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+			ut_ad(len == uf->new_val.len);
+			memcpy(data, uf->new_val.data, len);
+			continue; /* page_zip_write_rec() follows the loop */
+		}
+		/* Lengths may differ only for NULL to non-NULL (asserted). */
+		if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+			ut_ad(len == UNIV_SQL_NULL);
+			ut_ad(!rec_offs_comp(offsets));
+			len = uf->new_val.len;
+			ut_ad(len == rec_get_nth_field_size(rec, n));
+			ulint l = rec_get_1byte_offs_flag(rec)
+				? (n + 1) : (n + 1) * 2;
+			byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
+			compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+					    == REC_2BYTE_SQL_NULL_MASK);
+			mtr->write<1>(*block, b,
+				      byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
+		}
+
+		if (len) {
+			mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data,
+						      uf->new_val.data, len);
+		}
+	}
+	/* On a compressed page, finish with page_zip_write_rec(). */
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+		page_zip_write_rec(block, rec, index, offsets, 0, mtr);
+	}
+}
+
 /*************************************************************//**
 Updates a record when the update causes no size changes in its fields.
 We assume here that the ordering fields of the record do not change.
@@ -4252,8 +4265,6 @@ btr_cur_update_in_place(
 				further pages */
 {
 	dict_index_t*	index;
-	buf_block_t*	block;
-	page_zip_des_t*	page_zip;
 	dberr_t		err;
 	rec_t*		rec;
 	roll_ptr_t	roll_ptr	= 0;
@@ -4283,11 +4294,11 @@ btr_cur_update_in_place(
 		 << ") by " << ib::hex(trx_id) << ": "
 		 << rec_printer(rec, offsets).str());
 
-	block = btr_cur_get_block(cursor);
-	page_zip = buf_block_get_page_zip(block);
+	buf_block_t* block = btr_cur_get_block(cursor);
+	page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
 
 	/* Check that enough space is available on the compressed page. */
-	if (page_zip) {
+	if (UNIV_LIKELY_NULL(page_zip)) {
 		ut_ad(!index->table->is_temporary());
 
 		if (!btr_cur_update_alloc_zip(
@@ -4312,8 +4323,8 @@ btr_cur_update_in_place(
 	}
 
 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
-		row_upd_rec_sys_fields(rec, NULL, index, offsets,
-				       thr_get_trx(thr), roll_ptr);
+		btr_cur_upd_rec_sys(block, rec, index, offsets,
+				    thr_get_trx(thr), roll_ptr, mtr);
 	}
 
 	was_delete_marked = rec_get_deleted_flag(
@@ -4327,7 +4338,7 @@ btr_cur_update_in_place(
 #ifdef BTR_CUR_HASH_ADAPT
 	{
 		rw_lock_t* ahi_latch = block->index
-			? btr_get_search_latch(index) : NULL;
+			? btr_search_sys.get_latch(*index) : NULL;
 		if (ahi_latch) {
 			/* TO DO: Can we skip this if none of the fields
 			index->search_info->curr_n_fields
@@ -4352,7 +4363,8 @@ btr_cur_update_in_place(
 		assert_block_ahi_valid(block);
 #endif /* BTR_CUR_HASH_ADAPT */
 
-		row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+		btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
+					 mtr);
 
 #ifdef BTR_CUR_HASH_ADAPT
 		if (ahi_latch) {
@@ -4361,17 +4373,13 @@ btr_cur_update_in_place(
 	}
 #endif /* BTR_CUR_HASH_ADAPT */
 
-	btr_cur_update_in_place_log(flags, rec, index, update,
-				    trx_id, roll_ptr, mtr);
-
 	if (was_delete_marked
 	    && !rec_get_deleted_flag(
 		    rec, page_is_comp(buf_block_get_frame(block)))) {
 		/* The new updated record owns its possible externally
 		stored fields */
 
-		btr_cur_unmark_extern_fields(page_zip,
-					     rec, index, offsets, mtr);
+		btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
 	}
 
 	ut_ad(err == DB_SUCCESS);
@@ -4569,7 +4577,7 @@ btr_cur_optimistic_update(
 	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
 	      || index->table->is_temporary());
 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	/* This is intended only for leaf page updates */
 	ut_ad(page_is_leaf(page));
 	/* The insert buffer tree should never be updated in place. */
@@ -4612,7 +4620,7 @@ any_extern:
 
 		/* prefetch siblings of the leaf for the pessimistic
 		operation. */
-		btr_cur_prefetch_siblings(block);
+		btr_cur_prefetch_siblings(block, index);
 
 		return(DB_OVERFLOW);
 	}
@@ -4803,10 +4811,10 @@ func_exit:
 		}
 	}
 
-	if (err != DB_SUCCESS && !index->is_ibuf()) {
+	if (err != DB_SUCCESS) {
 		/* prefetch siblings of the leaf for the pessimistic
 		operation. */
-		btr_cur_prefetch_siblings(block);
+		btr_cur_prefetch_siblings(block, index);
 	}
 
 	return(err);
@@ -4839,17 +4847,18 @@ btr_cur_pess_upd_restore_supremum(
 
 	const uint32_t	prev_page_no = btr_page_get_prev(page);
 
-	const page_id_t	page_id(block->page.id.space(), prev_page_no);
+	const page_id_t	page_id(block->page.id().space(), prev_page_no);
 
 	ut_ad(prev_page_no != FIL_NULL);
 	prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(),
 						mtr);
 #ifdef UNIV_BTR_DEBUG
-	ut_a(btr_page_get_next(prev_block->frame) == block->page.id.page_no());
+	ut_a(btr_page_get_next(prev_block->frame)
+	     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
 
 	/* We must already have an x-latch on prev_block! */
-	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX));
 
 	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
 					     PAGE_HEAP_NO_SUPREMUM,
@@ -4895,31 +4904,27 @@ btr_cur_pessimistic_update(
 	big_rec_t*	dummy_big_rec;
 	dict_index_t*	index;
 	buf_block_t*	block;
-	page_t*		page;
 	page_zip_des_t*	page_zip;
 	rec_t*		rec;
 	page_cur_t*	page_cursor;
 	dberr_t		err;
 	dberr_t		optim_err;
 	roll_ptr_t	roll_ptr;
-	ibool		was_first;
-	ulint		n_reserved	= 0;
-	ulint		max_ins_size	= 0;
+	bool		was_first;
+	uint32_t	n_reserved	= 0;
 
 	*offsets = NULL;
 	*big_rec = NULL;
 
 	block = btr_cur_get_block(cursor);
-	page = buf_block_get_frame(block);
 	page_zip = buf_block_get_page_zip(block);
 	index = cursor->index;
 
-	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					MTR_MEMO_X_LOCK |
-					MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
+					 MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+	ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
 #endif /* UNIV_ZIP_DEBUG */
 	ut_ad(!page_zip || !index->table->is_temporary());
 	/* The insert buffer tree should never be updated in place. */
@@ -4952,7 +4957,7 @@ btr_cur_pessimistic_update(
 		if (page_zip
 		    && optim_err != DB_ZIP_OVERFLOW
 		    && !dict_index_is_clust(index)
-		    && page_is_leaf(page)) {
+		    && page_is_leaf(block->frame)) {
 			ut_ad(!index->table->is_temporary());
 			ibuf_update_free_bits_zip(block, mtr);
 		}
@@ -4999,7 +5004,7 @@ btr_cur_pessimistic_update(
 	/* We have to set appropriate extern storage bits in the new
 	record to be inserted: we have to remember which fields were such */
 
-	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
+	ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec));
 	ut_ad(rec_offs_validate(rec, index, *offsets));
 
 	if ((flags & BTR_NO_UNDO_LOG_FLAG)
@@ -5018,14 +5023,14 @@ btr_cur_pessimistic_update(
 		DEBUG_SYNC_C("blob_rollback_middle");
 
 		btr_rec_free_updated_extern_fields(
-			index, rec, page_zip, *offsets, update, true, mtr);
+			index, rec, block, *offsets, update, true, mtr);
 	}
 
 	ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0;
 
 	if (page_zip_rec_needs_ext(
 		    rec_get_converted_size(index, new_entry, n_ext),
-		    page_is_comp(page),
+		    page_is_comp(block->frame),
 		    dict_index_get_n_fields(index),
 		    block->zip_size())
 	    || (UNIV_UNLIKELY(update->is_alter_metadata())
@@ -5041,14 +5046,15 @@ btr_cur_pessimistic_update(
 			BTR_KEEP_IBUF_BITMAP. */
 #ifdef UNIV_ZIP_DEBUG
 			ut_a(!page_zip
-			     || page_zip_validate(page_zip, page, index));
+			     || page_zip_validate(page_zip, block->frame,
+						  index));
 #endif /* UNIV_ZIP_DEBUG */
 			index->table->space->release_free_extents(n_reserved);
 			err = DB_TOO_BIG_RECORD;
 			goto err_exit;
 		}
 
-		ut_ad(page_is_leaf(page));
+		ut_ad(page_is_leaf(block->frame));
 		ut_ad(dict_index_is_clust(index));
 		ut_ad(flags & BTR_KEEP_POS_FLAG);
 	}
@@ -5067,7 +5073,7 @@ btr_cur_pessimistic_update(
 		of the index tree, so that the update will not fail because
 		of lack of space */
 
-		ulint	n_extents = cursor->tree_height / 16 + 3;
+		uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
 
 		if (!fsp_reserve_free_extents(
 		            &n_reserved, index->table->space, n_extents,
@@ -5083,10 +5089,9 @@ btr_cur_pessimistic_update(
 		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
 	}
 
-	if (!page_zip) {
-		max_ins_size = page_get_max_insert_size_after_reorganize(
-				page, 1);
-	}
+	const ulint max_ins_size = page_zip
+		? 0 : page_get_max_insert_size_after_reorganize(block->frame,
+								1);
 
 	if (UNIV_UNLIKELY(is_metadata)) {
 		ut_ad(new_entry->is_metadata());
@@ -5114,7 +5119,7 @@ btr_cur_pessimistic_update(
 	}
 
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+	ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
 #endif /* UNIV_ZIP_DEBUG */
 	page_cursor = btr_cur_get_page_cur(cursor);
 
@@ -5136,7 +5141,7 @@ btr_cur_pessimistic_update(
 			btr_page_reorganize(page_cursor, index, mtr);
 			rec = page_cursor->rec;
 			rec_offs_make_valid(rec, index, true, *offsets);
-			if (page_cursor->block->page.id.page_no()
+			if (page_cursor->block->page.id().page_no()
 			    == index->page) {
 				btr_set_instant(page_cursor->block, *index,
 						mtr);
@@ -5150,8 +5155,8 @@ btr_cur_pessimistic_update(
 		    || rec_is_alter_metadata(rec, *index)) {
 			/* The new inserted record owns its possible externally
 			stored fields */
-			btr_cur_unmark_extern_fields(
-				page_zip, rec, index, *offsets, mtr);
+			btr_cur_unmark_extern_fields(btr_cur_get_block(cursor),
+						     rec, index, *offsets, mtr);
 		} else {
 			/* In delete-marked records, DB_TRX_ID must
 			always refer to an existing undo log record. */
@@ -5159,7 +5164,7 @@ btr_cur_pessimistic_update(
 		}
 
 		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
-		ut_ad(!adjust || page_is_leaf(page));
+		ut_ad(!adjust || page_is_leaf(block->frame));
 
 		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
 			if (adjust) {
@@ -5167,7 +5172,7 @@ btr_cur_pessimistic_update(
 						    true, *offsets);
 			}
 		} else if (!dict_index_is_clust(index)
-			   && page_is_leaf(page)) {
+			   && page_is_leaf(block->frame)) {
 			/* Update the free bits in the insert buffer.
 			This is the same block which was skipped by
 			BTR_KEEP_IBUF_BITMAP. */
@@ -5182,7 +5187,7 @@ btr_cur_pessimistic_update(
 
 		if (!srv_read_only_mode
 		    && !big_rec_vec
-		    && page_is_leaf(page)
+		    && page_is_leaf(block->frame)
 		    && !dict_index_is_online_ddl(index)) {
 
 			mtr_memo_release(mtr, dict_index_get_lock(index),
@@ -5207,13 +5212,13 @@ btr_cur_pessimistic_update(
 		BTR_KEEP_IBUF_BITMAP. */
 		if (!dict_index_is_clust(index)
 		    && !index->table->is_temporary()
-		    && page_is_leaf(page)) {
+		    && page_is_leaf(block->frame)) {
 			ibuf_reset_free_bits(block);
 		}
 	}
 
 	if (big_rec_vec != NULL) {
-		ut_ad(page_is_leaf(page));
+		ut_ad(page_is_leaf(block->frame));
 		ut_ad(dict_index_is_clust(index));
 		ut_ad(flags & BTR_KEEP_POS_FLAG);
 
@@ -5224,11 +5229,8 @@ btr_cur_pessimistic_update(
 		big_rec, so that row_upd_clust_rec() can store the
 		big_rec in the same mini-transaction. */
 
-		ut_ad(mtr_memo_contains_flagged(mtr,
-						dict_index_get_lock(index),
-						MTR_MEMO_X_LOCK |
-						MTR_MEMO_SX_LOCK));
-
+		ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+						 | MTR_MEMO_SX_LOCK));
 		mtr_sx_lock_index(index, mtr);
 	}
 
@@ -5262,28 +5264,20 @@ btr_cur_pessimistic_update(
 		/* Update PAGE_MAX_TRX_ID in the index page header.
 		It was not updated by btr_cur_pessimistic_insert()
 		because of BTR_NO_LOCKING_FLAG. */
-		buf_block_t*	rec_block;
-
-		rec_block = btr_cur_get_block(cursor);
-
-		page_update_max_trx_id(rec_block,
-				       buf_block_get_page_zip(rec_block),
+		page_update_max_trx_id(btr_cur_get_block(cursor),
+				       btr_cur_get_page_zip(cursor),
 				       trx_id, mtr);
 	}
 
 	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
 		/* The new inserted record owns its possible externally
 		stored fields */
-		buf_block_t*	rec_block = btr_cur_get_block(cursor);
-
 #ifdef UNIV_ZIP_DEBUG
-		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-		page = buf_block_get_frame(rec_block);
+		ut_a(!page_zip || page_zip_validate(page_zip, block->frame,
+						    index));
 #endif /* UNIV_ZIP_DEBUG */
-		page_zip = buf_block_get_page_zip(rec_block);
-
-		btr_cur_unmark_extern_fields(page_zip,
-					     rec, index, *offsets, mtr);
+		btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
+					     index, *offsets, mtr);
 	} else {
 		/* In delete-marked records, DB_TRX_ID must
 		always refer to an existing undo log record. */
@@ -5314,7 +5308,8 @@ btr_cur_pessimistic_update(
 
 return_after_reservations:
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+	ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
+					    btr_cur_get_page(cursor), index));
 #endif /* UNIV_ZIP_DEBUG */
 
 	index->table->space->release_free_extents(n_reserved);
@@ -5324,148 +5319,42 @@ return_after_reservations:
 
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
 
-/****************************************************************//**
-Writes the redo log record for delete marking or unmarking of an index
-record. */
-UNIV_INLINE
-void
-btr_cur_del_mark_set_clust_rec_log(
-/*===============================*/
-	rec_t*		rec,	/*!< in: record */
-	dict_index_t*	index,	/*!< in: index of the record */
-	trx_id_t	trx_id,	/*!< in: transaction id */
-	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
-	mtr_t*		mtr)	/*!< in: mtr */
+/** Modify the delete-mark flag of a record.
+@tparam         flag    the value of the delete-mark flag
+@param[in,out]  block   buffer block
+@param[in,out]  rec     record on a physical index page
+@param[in,out]  mtr     mini-transaction  */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
 {
-	byte*	log_ptr;
-
-	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
-	ut_ad(mtr->is_named_space(index->table->space));
-
-	log_ptr = mlog_open_and_write_index(mtr, rec, index,
-					    page_rec_is_comp(rec)
-					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
-					    : MLOG_REC_CLUST_DELETE_MARK,
-					    1 + 1 + DATA_ROLL_PTR_LEN
-					    + 14 + 2);
-
-	if (!log_ptr) {
-		/* Logging in mtr is switched off during crash recovery */
-		return;
-	}
-
-	*log_ptr++ = 0;
-	*log_ptr++ = 1;
-
-	log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr);
-	mach_write_to_2(log_ptr, page_offset(rec));
-	log_ptr += 2;
-
-	mlog_close(mtr, log_ptr);
+  if (page_rec_is_comp(rec))
+  {
+    byte *b= &rec[-REC_NEW_INFO_BITS];
+    const byte v= flag
+      ? (*b | REC_INFO_DELETED_FLAG)
+      : (*b & byte(~REC_INFO_DELETED_FLAG));
+    if (*b == v); /* the flag already has the requested value */
+    else if (UNIV_LIKELY_NULL(block->page.zip.data))
+    {
+      *b= v; /* changed directly; mtr is passed to the page_zip call */
+      page_zip_rec_set_deleted(block, rec, flag, mtr);
+    }
+    else
+      mtr->write<1>(*block, b, v);
+  }
+  else /* !page_rec_is_comp(rec) */
+  {
+    ut_ad(!block->page.zip.data);
+    byte *b= &rec[-REC_OLD_INFO_BITS];
+    const byte v = flag
+      ? (*b | REC_INFO_DELETED_FLAG)
+      : (*b & byte(~REC_INFO_DELETED_FLAG));
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v); /* v may equal *b here */
+  }
 }
 
-/****************************************************************//**
-Parses the redo log record for delete marking or unmarking of a clustered
-index record.
-@return end of log record or NULL */
-byte*
-btr_cur_parse_del_mark_set_clust_rec(
-/*=================================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	page_t*		page,	/*!< in/out: page or NULL */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	dict_index_t*	index)	/*!< in: index corresponding to page */
-{
-	ulint		flags;
-	ulint		val;
-	ulint		pos;
-	trx_id_t	trx_id;
-	roll_ptr_t	roll_ptr;
-	ulint		offset;
-	rec_t*		rec;
-
-	ut_ad(!page
-	      || !!page_is_comp(page) == dict_table_is_comp(index->table));
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	flags = mach_read_from_1(ptr);
-	ptr++;
-	val = mach_read_from_1(ptr);
-	ptr++;
-
-	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
-
-	if (ptr == NULL) {
-
-		return(NULL);
-	}
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	offset = mach_read_from_2(ptr);
-	ptr += 2;
-
-	ut_a(offset <= srv_page_size);
-
-	/* In delete-marked records, DB_TRX_ID must
-	always refer to an existing undo log record. */
-	ut_ad(trx_id || (flags & BTR_KEEP_SYS_FLAG));
-
-	if (page) {
-		rec = page + offset;
-
-		/* We do not need to reserve search latch, as the page
-		is only being recovered, and there cannot be a hash index to
-		it. Besides, these fields are being updated in place
-		and the adaptive hash index does not depend on them. */
-
-		btr_rec_set_deleted_flag(rec, page_zip, val);
-		/* pos is the offset of DB_TRX_ID in the clustered index.
-		Debug assertions may also access DB_ROLL_PTR at pos+1.
-		Therefore, we must compute offsets for the first pos+2
-		clustered index fields. */
-		ut_ad(pos <= MAX_REF_PARTS);
-
-		rec_offs offsets[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
-		rec_offs_init(offsets);
-		mem_heap_t*	heap	= NULL;
-
-		if (!(flags & BTR_KEEP_SYS_FLAG)) {
-			row_upd_rec_sys_fields_in_recovery(
-				rec, page_zip,
-				rec_get_offsets(rec, index, offsets,
-						index->n_core_fields,
-						pos + 2, &heap),
-				pos, trx_id, roll_ptr);
-		} else {
-			/* In delete-marked records, DB_TRX_ID must
-			always refer to an existing undo log record. */
-			ut_ad(memcmp(rec_get_nth_field(
-					     rec,
-					     rec_get_offsets(rec, index,
-							     offsets, index
-							     ->n_core_fields,
-							     pos, &heap),
-					     pos, &offset),
-				     field_ref_zero, DATA_TRX_ID_LEN));
-			ut_ad(offset == DATA_TRX_ID_LEN);
-		}
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-	}
-
-	return(ptr);
-}
+template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
+template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
 
 /***********************************************************//**
 Marks a clustered index record deleted. Writes an undo log record to
@@ -5487,7 +5376,6 @@ btr_cur_del_mark_set_clust_rec(
 {
 	roll_ptr_t	roll_ptr;
 	dberr_t		err;
-	page_zip_des_t*	page_zip;
 	trx_t*		trx;
 
 	ut_ad(dict_index_is_clust(index));
@@ -5525,9 +5413,7 @@ btr_cur_del_mark_set_clust_rec(
 	the adaptive hash index does not depend on the delete-mark
 	and the delete-mark is being updated in place. */
 
-	page_zip = buf_block_get_page_zip(block);
-
-	btr_rec_set_deleted_flag(rec, page_zip, TRUE);
+	btr_rec_set_deleted<true>(block, rec, mtr);
 
 	trx = thr_get_trx(thr);
 
@@ -5541,163 +5427,10 @@ btr_cur_del_mark_set_clust_rec(
 		row_log_table_delete(rec, index, offsets, NULL);
 	}
 
-	row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
-
-	btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
-					   roll_ptr, mtr);
-
+	btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr);
 	return(err);
 }
 
-/****************************************************************//**
-Writes the redo log record for a delete mark setting of a secondary
-index record. */
-UNIV_INLINE
-void
-btr_cur_del_mark_set_sec_rec_log(
-/*=============================*/
-	rec_t*		rec,	/*!< in: record */
-	ibool		val,	/*!< in: value to set */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	byte*	log_ptr;
-	ut_ad(val <= 1);
-
-	log_ptr = mlog_open(mtr, 11 + 1 + 2);
-
-	if (!log_ptr) {
-		/* Logging in mtr is switched off during crash recovery:
-		in that case mlog_open returns NULL */
-		return;
-	}
-
-	log_ptr = mlog_write_initial_log_record_fast(
-		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
-	mach_write_to_1(log_ptr, val);
-	log_ptr++;
-
-	mach_write_to_2(log_ptr, page_offset(rec));
-	log_ptr += 2;
-
-	mlog_close(mtr, log_ptr);
-}
-
-/****************************************************************//**
-Parses the redo log record for delete marking or unmarking of a secondary
-index record.
-@return end of log record or NULL */
-byte*
-btr_cur_parse_del_mark_set_sec_rec(
-/*===============================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	page_t*		page,	/*!< in/out: page or NULL */
-	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
-{
-	ulint	val;
-	ulint	offset;
-	rec_t*	rec;
-
-	if (end_ptr < ptr + 3) {
-
-		return(NULL);
-	}
-
-	val = mach_read_from_1(ptr);
-	ptr++;
-
-	offset = mach_read_from_2(ptr);
-	ptr += 2;
-
-	ut_a(offset <= srv_page_size);
-
-	if (page) {
-		rec = page + offset;
-
-		/* We do not need to reserve search latch, as the page
-		is only being recovered, and there cannot be a hash index to
-		it. Besides, the delete-mark flag is being updated in place
-		and the adaptive hash index does not depend on it. */
-
-		btr_rec_set_deleted_flag(rec, page_zip, val);
-	}
-
-	return(ptr);
-}
-
-/***********************************************************//**
-Sets a secondary index record delete mark to TRUE or FALSE.
-@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
-dberr_t
-btr_cur_del_mark_set_sec_rec(
-/*=========================*/
-	ulint		flags,	/*!< in: locking flag */
-	btr_cur_t*	cursor,	/*!< in: cursor */
-	ibool		val,	/*!< in: value to set */
-	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-{
-	buf_block_t*	block;
-	rec_t*		rec;
-	dberr_t		err;
-
-	block = btr_cur_get_block(cursor);
-	rec = btr_cur_get_rec(cursor);
-
-	err = lock_sec_rec_modify_check_and_lock(flags,
-						 btr_cur_get_block(cursor),
-						 rec, cursor->index, thr, mtr);
-	if (err != DB_SUCCESS) {
-
-		return(err);
-	}
-
-	ut_ad(!!page_rec_is_comp(rec)
-	      == dict_table_is_comp(cursor->index->table));
-
-	DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
-			      IB_ID_FMT ") by " TRX_ID_FMT,
-			      unsigned(val),
-			      block->page.id.space(), block->page.id.page_no(),
-			      unsigned(page_rec_get_heap_no(rec)),
-			      cursor->index->name(), cursor->index->id,
-			      trx_get_id_for_print(thr_get_trx(thr))));
-
-	/* We do not need to reserve search latch, as the
-	delete-mark flag is being updated in place and the adaptive
-	hash index does not depend on it. */
-	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
-
-	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
-
-	return(DB_SUCCESS);
-}
-
-/***********************************************************//**
-Sets a secondary index record's delete mark to the given value. This
-function is only used by the insert buffer merge mechanism. */
-void
-btr_cur_set_deleted_flag_for_ibuf(
-/*==============================*/
-	rec_t*		rec,		/*!< in/out: record */
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
-					corresponding to rec, or NULL
-					when the tablespace is
-					uncompressed */
-	ibool		val,		/*!< in: value to set */
-	mtr_t*		mtr)		/*!< in/out: mini-transaction */
-{
-	/* We do not need to reserve search latch, as the page
-	has just been read to the buffer pool and there cannot be
-	a hash index to it.  Besides, the delete-mark flag is being
-	updated in place and the adaptive hash index does not depend
-	on it. */
-
-	btr_rec_set_deleted_flag(rec, page_zip, val);
-
-	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
-}
-
 /*==================== B-TREE RECORD REMOVE =========================*/
 
 /*************************************************************//**
@@ -5717,23 +5450,19 @@ btr_cur_compress_if_useful(
 				cursor position even if compression occurs */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	ut_ad(mtr_memo_contains_flagged(
-		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
-		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
-			       MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
 
-	if (dict_index_is_spatial(cursor->index)) {
-		const page_t*   page = btr_cur_get_page(cursor);
-		const trx_t*	trx = NULL;
-
-		if (cursor->rtr_info->thr != NULL) {
-			trx = thr_get_trx(cursor->rtr_info->thr);
-		}
+	if (cursor->index->is_spatial()) {
+		const trx_t*	trx = cursor->rtr_info->thr
+			? thr_get_trx(cursor->rtr_info->thr)
+			: NULL;
+		const buf_block_t* block = btr_cur_get_block(cursor);
 
 		/* Check whether page lock prevents the compression */
-		if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
-					      page_get_page_no(page))) {
+		if (!lock_test_prdt_page_lock(trx, block->page.id())) {
 			return(false);
 		}
 	}
@@ -5770,8 +5499,8 @@ btr_cur_optimistic_delete_func(
 	rec_offs_init(offsets_);
 
 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
-				MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
 	ut_ad(mtr->is_named_space(cursor->index->table->space));
 	ut_ad(!cursor->index->is_dummy);
 
@@ -5779,7 +5508,7 @@ btr_cur_optimistic_delete_func(
 
 	block = btr_cur_get_block(cursor);
 
-	ut_ad(block->page.id.space() == cursor->index->table->space->id);
+	ut_ad(block->page.id().space() == cursor->index->table->space->id);
 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
 	ut_ad(!dict_index_is_online_ddl(cursor->index)
 	      || dict_index_is_clust(cursor->index)
@@ -5798,14 +5527,15 @@ btr_cur_optimistic_delete_func(
 	if (!no_compress_needed) {
 		/* prefetch siblings of the leaf for the pessimistic
 		operation. */
-		btr_cur_prefetch_siblings(block);
+		btr_cur_prefetch_siblings(block, cursor->index);
 		goto func_exit;
 	}
 
-	if (UNIV_UNLIKELY(block->page.id.page_no() == cursor->index->page
+	if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page
 			  && page_get_n_recs(block->frame) == 1
 			  + (cursor->index->is_instant()
-			     && !rec_is_metadata(rec, *cursor->index)))) {
+			     && !rec_is_metadata(rec, *cursor->index))
+			  && !cursor->index->must_avoid_clear_instant_add())) {
 		/* The whole index (and table) becomes logically empty.
 		Empty the whole page. That is, if we are deleting the
 		only user record, also delete the metadata record
@@ -5946,7 +5676,7 @@ btr_cur_pessimistic_delete(
 	page_zip_des_t*	page_zip;
 	dict_index_t*	index;
 	rec_t*		rec;
-	ulint		n_reserved	= 0;
+	uint32_t	n_reserved	= 0;
 	bool		success;
 	ibool		ret		= FALSE;
 	mem_heap_t*	heap;
@@ -5963,20 +5693,19 @@ btr_cur_pessimistic_delete(
 	ut_ad(!dict_index_is_online_ddl(index)
 	      || dict_index_is_clust(index)
 	      || (flags & BTR_CREATE_FLAG));
-	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					MTR_MEMO_X_LOCK
-					| MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(mtr->is_named_space(index->table->space));
 	ut_ad(!index->is_dummy);
-	ut_ad(block->page.id.space() == index->table->space->id);
+	ut_ad(block->page.id().space() == index->table->space->id);
 
 	if (!has_reserved_extents) {
 		/* First reserve enough free space for the file segments
 		of the index tree, so that the node pointer updates will
 		not fail because of lack of space */
 
-		ulint	n_extents = cursor->tree_height / 32 + 1;
+		uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);
 
 		success = fsp_reserve_free_extents(&n_reserved,
 						   index->table->space,
@@ -6002,7 +5731,7 @@ btr_cur_pessimistic_delete(
 
 	if (rec_offs_any_extern(offsets)) {
 		btr_rec_free_externally_stored_fields(index,
-						      rec, offsets, page_zip,
+						      rec, offsets, block,
 						      rollback, mtr);
 #ifdef UNIV_ZIP_DEBUG
 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
@@ -6027,12 +5756,13 @@ btr_cur_pessimistic_delete(
 			lock_update_delete(block, rec);
 		}
 
-		if (block->page.id.page_no() != index->page) {
+		if (block->page.id().page_no() != index->page) {
 			if (page_get_n_recs(page) < 2) {
 				goto discard_page;
 			}
 		} else if (page_get_n_recs(page) == 1
-			   + (index->is_instant() && !is_metadata)) {
+			   + (index->is_instant() && !is_metadata)
+			   && !index->must_avoid_clear_instant_add()) {
 			/* The whole index (and table) becomes logically empty.
 			Empty the whole page. That is, if we are deleting the
 			only user record, also delete the metadata record
@@ -6141,7 +5871,7 @@ discard_page:
 			const ulint	level = btr_page_get_level(page);
 			// FIXME: reuse the node_ptr from above
 			dtuple_t*	node_ptr = dict_index_build_node_ptr(
-				index, next_rec, block->page.id.page_no(),
+				index, next_rec, block->page.id().page_no(),
 				heap, level);
 
 			btr_insert_on_non_leaf_level(
@@ -6166,7 +5896,7 @@ discard_page:
 				    offsets, mtr);
 
 		if (min_mark_next_rec) {
-			btr_set_min_rec_mark(next_rec, mtr);
+			btr_set_min_rec_mark(next_rec, *block, mtr);
 		}
 
 #ifdef UNIV_ZIP_DEBUG
@@ -6182,10 +5912,10 @@ discard_page:
 					cursor, FALSE, mtr);
 			} else {
 				ib::warn() << "Not merging page "
-					   << block->page.id
+					   << block->page.id()
 					   << " in index " << index->name
 					   << " of " << index->table->name;
-				ut_ad(!"MDEV-14637");
+				ut_ad("MDEV-14637" == 0);
 			}
 		}
 	}
@@ -6215,8 +5945,8 @@ return_after_reservations:
 @param[in,out]	mtr	mini-transaction */
 void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
 {
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(parent),
-				MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
+					 MTR_MEMO_PAGE_X_FIX));
 	dberr_t err;
 	ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
 						      BTR_CREATE_FLAG, false,
@@ -6240,8 +5970,6 @@ btr_cur_add_path_info(
 	ulint		root_height)	/*!< in: root node height in tree */
 {
 	btr_path_t*	slot;
-	const rec_t*	rec;
-	const page_t*	page;
 
 	ut_a(cursor->path_arr);
 
@@ -6260,16 +5988,14 @@ btr_cur_add_path_info(
 		slot->nth_rec = ULINT_UNDEFINED;
 	}
 
-	rec = btr_cur_get_rec(cursor);
-
 	slot = cursor->path_arr + (root_height - height);
 
-	page = page_align(rec);
+	const buf_block_t* block = btr_cur_get_block(cursor);
 
-	slot->nth_rec = page_rec_get_n_recs_before(rec);
-	slot->n_recs = page_get_n_recs(page);
-	slot->page_no = page_get_page_no(page);
-	slot->page_level = btr_page_get_level(page);
+	slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+	slot->n_recs = page_get_n_recs(block->frame);
+	slot->page_no = block->page.id().page_no();
+	slot->page_level = btr_page_get_level(block->frame);
 }
 
 /*******************************************************************//**
@@ -6356,7 +6082,7 @@ btr_estimate_n_rows_in_range_on_level(
 
 		ut_ad((block != NULL) == (err == DB_SUCCESS));
 
-		if (err != DB_SUCCESS) {
+		if (!block) {
 			if (err == DB_DECRYPTION_FAILED) {
 				ib_push_warning((void *)NULL,
 					DB_DECRYPTION_FAILED,
@@ -6453,10 +6179,8 @@ static const ha_rows	rows_in_range_arbitrary_ret_val = 10;
 
 /** Estimates the number of rows in a given index range.
 @param[in]	index		index
-@param[in]	tuple1		range start, may also be empty tuple
-@param[in]	mode1		search mode for range start
-@param[in]	tuple2		range end, may also be empty tuple
-@param[in]	mode2		search mode for range end
+@param[in]	tuple1		range start
+@param[in]	tuple2		range end
 @param[in]	nth_attempt	if the tree gets modified too much while
 we are trying to analyze it, then we will retry (this function will call
 itself, incrementing this parameter)
@@ -6469,10 +6193,8 @@ static
 ha_rows
 btr_estimate_n_rows_in_range_low(
 	dict_index_t*	index,
-	const dtuple_t*	tuple1,
-	page_cur_mode_t	mode1,
-	const dtuple_t*	tuple2,
-	page_cur_mode_t	mode2,
+	btr_pos_t*	tuple1,
+	btr_pos_t*	tuple2,
 	unsigned	nth_attempt)
 {
 	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
@@ -6488,6 +6210,7 @@ btr_estimate_n_rows_in_range_low(
 	ulint		i;
 	mtr_t		mtr;
 	ha_rows		table_n_rows;
+        page_cur_mode_t mode2= tuple2->mode;
 
 	table_n_rows = dict_table_get_n_rows(index->table);
 
@@ -6506,9 +6229,10 @@ btr_estimate_n_rows_in_range_low(
 
 	bool	should_count_the_left_border;
 
-	if (dtuple_get_n_fields(tuple1) > 0) {
+	if (dtuple_get_n_fields(tuple1->tuple) > 0) {
 
-		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
+              btr_cur_search_to_nth_level(index, 0, tuple1->tuple,
+                                            tuple1->mode,
 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
 					    &cursor, 0,
 					    __FILE__, __LINE__, &mtr);
@@ -6548,6 +6272,8 @@ btr_estimate_n_rows_in_range_low(
 		should_count_the_left_border = false;
 	}
 
+        tuple1->page_id= cursor.page_cur.block->page.id();
+
 	mtr_commit(&mtr);
 
 	if (!index->is_readable()) {
@@ -6560,9 +6286,10 @@ btr_estimate_n_rows_in_range_low(
 
 	bool	should_count_the_right_border;
 
-	if (dtuple_get_n_fields(tuple2) > 0) {
+	if (dtuple_get_n_fields(tuple2->tuple) > 0) {
 
-		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
+		btr_cur_search_to_nth_level(index, 0, tuple2->tuple,
+                                            mode2,
 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
 					    &cursor, 0,
 					    __FILE__, __LINE__, &mtr);
@@ -6574,7 +6301,7 @@ btr_estimate_n_rows_in_range_low(
 		should_count_the_right_border
 			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
 			   /* and the record was found */
-			   && cursor.low_match >= dtuple_get_n_fields(tuple2))
+			   && cursor.low_match >= dtuple_get_n_fields(tuple2->tuple))
 			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
 			    /* and there are any records to match the criteria,
 			    i.e. if the minimum record on the tree is 5 and
@@ -6618,6 +6345,8 @@ btr_estimate_n_rows_in_range_low(
 		should_count_the_right_border = false;
 	}
 
+        tuple2->page_id= cursor.page_cur.block->page.id();
+
 	mtr_commit(&mtr);
 
 	/* We have the path information for the range in path1 and path2 */
@@ -6756,8 +6485,8 @@ btr_estimate_n_rows_in_range_low(
 				}
 
 				return btr_estimate_n_rows_in_range_low(
-					index, tuple1, mode1,
-					tuple2, mode2, nth_attempt + 1);
+                                       index, tuple1, tuple2,
+                                       nth_attempt + 1);
 			}
 
 			diverged = true;
@@ -6828,13 +6557,11 @@ btr_estimate_n_rows_in_range_low(
 ha_rows
 btr_estimate_n_rows_in_range(
 	dict_index_t*	index,
-	const dtuple_t*	tuple1,
-	page_cur_mode_t	mode1,
-	const dtuple_t*	tuple2,
-	page_cur_mode_t	mode2)
+        btr_pos_t       *tuple1,
+        btr_pos_t       *tuple2)
 {
 	return btr_estimate_n_rows_in_range_low(
-		index, tuple1, mode1, tuple2, mode2, 1);
+		index, tuple1, tuple2, 1);
 }
 
 /*******************************************************************//**
@@ -6992,11 +6719,12 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index)
 		n_pages = S < I? min(I,L) : I
                 */
 		if (index->stat_index_size > 1) {
-			n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) ?
-				ut_min(static_cast<ulint>(index->stat_index_size),
-					static_cast<ulint>(log2(index->stat_index_size)*srv_stats_transient_sample_pages))
+			n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+				? ut_min(index->stat_index_size,
+					 static_cast<ulint>(
+						 log2(double(index->stat_index_size))
+						 * double(srv_stats_transient_sample_pages)))
 				: index->stat_index_size;
-
 		}
 	}
 
@@ -7242,13 +6970,12 @@ static
 void
 btr_cur_set_ownership_of_extern_field(
 /*==================================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in/out: index page */
 	rec_t*		rec,	/*!< in/out: clustered index record */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
 	ulint		i,	/*!< in: field number */
-	ibool		val,	/*!< in: value to set */
+	bool		val,	/*!< in: value to set */
 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
 {
 	byte*	data;
@@ -7272,15 +6999,12 @@ btr_cur_set_ownership_of_extern_field(
 		byte_val |= BTR_EXTERN_OWNER_FLAG;
 	}
 
-	if (page_zip) {
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
-		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
-	} else if (mtr != NULL) {
-
-		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
-				 MLOG_1BYTE, mtr);
+		page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr);
 	} else {
-		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
+		mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len
+					       + BTR_EXTERN_LEN, byte_val);
 	}
 }
 
@@ -7292,25 +7016,22 @@ to free the field. */
 void
 btr_cur_disown_inherited_fields(
 /*============================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in/out: index page */
 	rec_t*		rec,	/*!< in/out: record in a clustered index */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
 	const upd_t*	update,	/*!< in: update vector */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	ulint	i;
-
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
 	ut_ad(rec_offs_any_extern(offsets));
 
-	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+	for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) {
 		if (rec_offs_nth_extern(offsets, i)
 		    && !upd_get_field_by_field_no(update, i, false)) {
 			btr_cur_set_ownership_of_extern_field(
-				page_zip, rec, index, offsets, i, FALSE, mtr);
+				block, rec, index, offsets, i, false, mtr);
 		}
 	}
 }
@@ -7323,29 +7044,23 @@ static
 void
 btr_cur_unmark_extern_fields(
 /*=========================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in/out: index page */
 	rec_t*		rec,	/*!< in/out: record in a clustered index */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
 {
-	ulint	n;
-	ulint	i;
-
 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
-	n = rec_offs_n_fields(offsets);
-
 	if (!rec_offs_any_extern(offsets)) {
-
 		return;
 	}
 
-	for (i = 0; i < n; i++) {
-		if (rec_offs_nth_extern(offsets, i)) {
+	const ulint n = rec_offs_n_fields(offsets);
 
+	for (ulint i = 0; i < n; i++) {
+		if (rec_offs_nth_extern(offsets, i)) {
 			btr_cur_set_ownership_of_extern_field(
-				page_zip, rec, index, offsets, i, TRUE, mtr);
+				block, rec, index, offsets, i, true, mtr);
 		}
 	}
 }
@@ -7354,7 +7069,7 @@ btr_cur_unmark_extern_fields(
 Returns the length of a BLOB part stored on the header page.
 @return part length */
 static
-ulint
+uint32_t
 btr_blob_get_part_len(
 /*==================*/
 	const byte*	blob_header)	/*!< in: blob header */
@@ -7366,7 +7081,7 @@ btr_blob_get_part_len(
 Returns the page number where the next BLOB part is stored.
 @return page number or FIL_NULL if no more pages */
 static
-ulint
+uint32_t
 btr_blob_get_next_page_no(
 /*======================*/
 	const byte*	blob_header)	/*!< in: blob header */
@@ -7374,35 +7089,27 @@ btr_blob_get_next_page_no(
 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
 }
 
-/*******************************************************************//**
-Deallocate a buffer block that was reserved for a BLOB part. */
-static
-void
-btr_blob_free(
-/*==========*/
-	buf_block_t*	block,	/*!< in: buffer block */
-	ibool		all,	/*!< in: TRUE=remove also the compressed page
-				if there is one */
-	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
+/** Deallocate a buffer block that was reserved for a BLOB part.
+@param block   buffer block
+@param all     flag whether to remove a ROW_FORMAT=COMPRESSED page
+@param mtr     mini-transaction to commit */
+static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_block(block);
-	const page_id_t	page_id(block->page.id);
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	mtr_commit(mtr);
+  const page_id_t page_id(block->page.id());
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  mtr->commit();
 
-	buf_pool_mutex_enter(buf_pool);
+  const ulint fold= page_id.fold();
 
-	if (buf_page_t* bpage = buf_page_hash_get(buf_pool, page_id)) {
-		if (!buf_LRU_free_page(bpage, all)
-		    && all && bpage->zip.data) {
-			/* Attempt to deallocate the uncompressed page
-			if the whole block cannot be deallocted. */
+  mysql_mutex_lock(&buf_pool.mutex);
 
-			buf_LRU_free_page(bpage, false);
-		}
-	}
+  if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+    if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
+      /* Attempt to deallocate the redundant copy of the uncompressed page
+      if the whole ROW_FORMAT=COMPRESSED block cannot be deallocated. */
+      buf_LRU_free_page(bpage, false);
 
-	buf_pool_mutex_exit(buf_pool);
+  mysql_mutex_unlock(&buf_pool.mutex);
 }
 
 /** Helper class used while writing blob pages, during insert or update. */
@@ -7454,15 +7161,13 @@ struct btr_blob_log_check_t {
 	{
 		dict_index_t*	index = m_pcur->index();
 		ulint		offs = 0;
-		ulint		page_no = ULINT_UNDEFINED;
-		FlushObserver*	observer = m_mtr->get_flush_observer();
+		uint32_t	page_no = FIL_NULL;
 
 		if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
 			offs = page_offset(*m_rec);
-			page_no = page_get_page_no(
-				buf_block_get_frame(*m_block));
-
+			page_no = (*m_block)->page.id().page_no();
 			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
+			ut_ad(page_no != FIL_NULL);
 		} else {
 			btr_pcur_store_position(m_pcur, m_mtr);
 		}
@@ -7478,13 +7183,10 @@ struct btr_blob_log_check_t {
 		m_mtr->start();
 		m_mtr->set_log_mode(log_mode);
 		index->set_modified(*m_mtr);
-		m_mtr->set_flush_observer(observer);
 
-		if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
+		if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
 			m_pcur->btr_cur.page_cur.block = btr_block_get(
-				page_id_t(index->table->space_id, page_no),
-				index->table->space->zip_size(),
-				RW_X_LATCH, index, m_mtr);
+				*index, page_no, RW_X_LATCH, false, m_mtr);
 			m_pcur->btr_cur.page_cur.rec
 				= m_pcur->btr_cur.page_cur.block->frame
 				+ offs;
@@ -7508,9 +7210,9 @@ struct btr_blob_log_check_t {
 		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
 
 		ut_ad((m_op == BTR_STORE_INSERT_BULK)
-		      == !mtr_memo_contains_flagged(m_mtr, &index->lock,
-						    MTR_MEMO_SX_LOCK
-						    | MTR_MEMO_X_LOCK));
+		      == !m_mtr->memo_contains_flagged(&index->lock,
+						       MTR_MEMO_SX_LOCK
+						       | MTR_MEMO_X_LOCK));
 	}
 };
 
@@ -7543,17 +7245,12 @@ btr_store_big_rec_extern_fields(
 					committed and restarted. */
 	enum blob_op	op)		/*! in: operation code */
 {
-	ulint		rec_page_no;
 	byte*		field_ref;
 	ulint		extern_len;
 	ulint		store_len;
-	ulint		page_no;
 	ulint		space_id;
-	ulint		prev_page_no;
-	ulint		hint_page_no;
 	ulint		i;
 	mtr_t		mtr;
-	mtr_t		mtr_bulk;
 	mem_heap_t*	heap = NULL;
 	page_zip_des_t*	page_zip;
 	z_stream	c_stream;
@@ -7565,18 +7262,16 @@ btr_store_big_rec_extern_fields(
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(rec_offs_any_extern(offsets));
 	ut_ad(op == BTR_STORE_INSERT_BULK
-	      || mtr_memo_contains_flagged(btr_mtr, &index->lock,
-					   MTR_MEMO_X_LOCK
-					   | MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
+	      || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+						| MTR_MEMO_SX_LOCK));
+	ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
 	ut_a(dict_index_is_clust(index));
 
 	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
 				      &rec, op);
 	page_zip = buf_block_get_page_zip(rec_block);
-	space_id = rec_block->page.id.space();
-	rec_page_no = rec_block->page.id.page_no();
+	space_id = rec_block->page.id().space();
 	ut_a(fil_page_index_page_check(page_align(rec))
 	     || op == BTR_STORE_INSERT_BULK);
 
@@ -7639,7 +7334,7 @@ btr_store_big_rec_extern_fields(
 		MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
 		ut_a(extern_len > 0);
 
-		prev_page_no = FIL_NULL;
+		uint32_t prev_page_no = FIL_NULL;
 
 		if (page_zip) {
 			int	err = deflateReset(&c_stream);
@@ -7652,9 +7347,8 @@ btr_store_big_rec_extern_fields(
 
 		for (ulint blob_npages = 0;; ++blob_npages) {
 			buf_block_t*	block;
-			page_t*		page;
 			const ulint	commit_freq = 4;
-			ulint		r_extents;
+			uint32_t	r_extents;
 
 			ut_ad(page_align(field_ref) == page_align(rec));
 
@@ -7666,60 +7360,39 @@ btr_store_big_rec_extern_fields(
 					rec, offsets, field_no);
 
 				page_zip = buf_block_get_page_zip(rec_block);
-				rec_page_no = rec_block->page.id.page_no();
 			}
 
 			mtr.start();
 			index->set_modified(mtr);
 			mtr.set_log_mode(btr_mtr->get_log_mode());
-			mtr.set_flush_observer(btr_mtr->get_flush_observer());
 
-			buf_page_get(rec_block->page.id,
+			buf_page_get(rec_block->page.id(),
 				     rec_block->zip_size(), RW_X_LATCH, &mtr);
 
-			if (prev_page_no == FIL_NULL) {
-				hint_page_no = 1 + rec_page_no;
-			} else {
-				hint_page_no = prev_page_no + 1;
-			}
-
-			mtr_t	*alloc_mtr;
-
-			if (UNIV_UNLIKELY(op == BTR_STORE_INSERT_BULK)) {
-				mtr_bulk.start();
-				mtr_bulk.set_spaces(mtr);
-				alloc_mtr = &mtr_bulk;
-			} else {
-				alloc_mtr = &mtr;
+			uint32_t hint_prev = prev_page_no;
+			if (hint_prev == FIL_NULL) {
+				hint_prev = rec_block->page.id().page_no();
 			}
 
 			if (!fsp_reserve_free_extents(&r_extents,
 						      index->table->space, 1,
-						      FSP_BLOB, alloc_mtr,
-						      1)) {
-
-				alloc_mtr->commit();
+						      FSP_BLOB, &mtr, 1)) {
+				mtr.commit();
 				error = DB_OUT_OF_FILE_SPACE;
 				goto func_exit;
 			}
 
-			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
-					       0, alloc_mtr, &mtr);
+			block = btr_page_alloc(index, hint_prev + 1,
+					       FSP_NO_DIR, 0, &mtr, &mtr);
 
 			index->table->space->release_free_extents(r_extents);
 
-			if (UNIV_UNLIKELY(op == BTR_STORE_INSERT_BULK)) {
-				mtr_bulk.commit();
-			}
-
 			ut_a(block != NULL);
 
-			page_no = block->page.id.page_no();
-			page = buf_block_get_frame(block);
+			const uint32_t page_no = block->page.id().page_no();
 
 			if (prev_page_no != FIL_NULL) {
 				buf_block_t*	prev_block;
-				page_t*		prev_page;
 
 				prev_block = buf_page_get(
 					page_id_t(space_id, prev_page_no),
@@ -7728,48 +7401,45 @@ btr_store_big_rec_extern_fields(
 
 				buf_block_dbg_add_level(prev_block,
 							SYNC_EXTERN_STORAGE);
-				prev_page = buf_block_get_frame(prev_block);
 
 				if (page_zip) {
-					mlog_write_ulint(
-						prev_page + FIL_PAGE_NEXT,
-						page_no, MLOG_4BYTES, &mtr);
-					memcpy(buf_block_get_page_zip(
-						       prev_block)
-					       ->data + FIL_PAGE_NEXT,
-					       prev_page + FIL_PAGE_NEXT, 4);
+					mtr.write<4>(*prev_block,
+						     prev_block->frame
+						     + FIL_PAGE_NEXT,
+						     page_no);
+					memcpy_aligned<4>(
+						buf_block_get_page_zip(
+							prev_block)
+						->data + FIL_PAGE_NEXT,
+						prev_block->frame
+						+ FIL_PAGE_NEXT, 4);
 				} else {
-					mlog_write_ulint(
-						prev_page + FIL_PAGE_DATA
-						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
-						page_no, MLOG_4BYTES, &mtr);
+					mtr.write<4>(*prev_block,
+						     BTR_BLOB_HDR_NEXT_PAGE_NO
+						     + FIL_PAGE_DATA
+						     + prev_block->frame,
+						     page_no);
 				}
-
 			} else if (dict_index_is_online_ddl(index)) {
 				row_log_table_blob_alloc(index, page_no);
 			}
 
+			ut_ad(!page_has_siblings(block->frame));
+			ut_ad(!fil_page_get_type(block->frame));
+
 			if (page_zip) {
 				int		err;
 				page_zip_des_t*	blob_page_zip;
 
-				/* Write FIL_PAGE_TYPE to the redo log
-				separately, before logging any other
-				changes to the page, so that the debug
-				assertions in
-				recv_parse_or_apply_log_rec_body() can
-				be made simpler.  Before InnoDB Plugin
-				1.0.4, the initialization of
-				FIL_PAGE_TYPE was logged as part of
-				the mlog_log_string() below. */
-
-				mlog_write_ulint(page + FIL_PAGE_TYPE,
-						 prev_page_no == FIL_NULL
-						 ? FIL_PAGE_TYPE_ZBLOB
-						 : FIL_PAGE_TYPE_ZBLOB2,
-						 MLOG_2BYTES, &mtr);
-
-				c_stream.next_out = page
+				mtr.write<1>(*block,
+					     FIL_PAGE_TYPE + 1 + block->frame,
+					     prev_page_no == FIL_NULL
+					     ? FIL_PAGE_TYPE_ZBLOB
+					     : FIL_PAGE_TYPE_ZBLOB2);
+				block->page.zip.data[FIL_PAGE_TYPE + 1]
+					= block->frame[FIL_PAGE_TYPE + 1];
+
+				c_stream.next_out = block->frame
 					+ FIL_PAGE_DATA;
 				c_stream.avail_out = static_cast<uInt>(
 					payload_size_zip);
@@ -7779,25 +7449,11 @@ btr_store_big_rec_extern_fields(
 				ut_a(err == Z_STREAM_END
 				     || c_stream.avail_out == 0);
 
-				compile_time_assert(FIL_PAGE_NEXT
-						    == FIL_PAGE_PREV + 4);
-				compile_time_assert(FIL_NULL == 0xffffffff);
-				mlog_memset(block, FIL_PAGE_PREV, 8, 0xff,
-					    &mtr);
-				mlog_log_string(page
-						+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
-						page_zip_get_size(page_zip)
-						- FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
-						- c_stream.avail_out,
-						&mtr);
-				/* Zero out the unused part of the page. */
-				if (c_stream.avail_out) {
-					mlog_memset(block,
-						    page_zip_get_size(page_zip)
-						    - c_stream.avail_out,
-						    c_stream.avail_out,
-						    0, &mtr);
-				}
+				mtr.memcpy(*block,
+					   FIL_PAGE_DATA,
+					   page_zip_get_size(page_zip)
+					   - FIL_PAGE_DATA
+					   - c_stream.avail_out);
 				/* Copy the page to compressed storage,
 				because it will be flushed to disk
 				from there. */
@@ -7805,7 +7461,7 @@ btr_store_big_rec_extern_fields(
 				ut_ad(blob_page_zip);
 				ut_ad(page_zip_get_size(blob_page_zip)
 				      == page_zip_get_size(page_zip));
-				memcpy(blob_page_zip->data, page,
+				memcpy(blob_page_zip->data, block->frame,
 				       page_zip_get_size(page_zip));
 
 				if (err == Z_OK && prev_page_no != FIL_NULL) {
@@ -7842,7 +7498,7 @@ btr_store_big_rec_extern_fields(
 				/* We compress a page when finish bulk insert.*/
 				if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
 					page_zip_write_blob_ptr(
-						page_zip, rec, index, offsets,
+						rec_block, rec, index, offsets,
 						field_no, &mtr);
 				}
 
@@ -7857,9 +7513,9 @@ next_zip_page:
 					break;
 				}
 			} else {
-				mlog_write_ulint(page + FIL_PAGE_TYPE,
-						 FIL_PAGE_TYPE_BLOB,
-						 MLOG_2BYTES, &mtr);
+				mtr.write<1>(*block, FIL_PAGE_TYPE + 1
+					     + block->frame,
+					     FIL_PAGE_TYPE_BLOB);
 
 				if (extern_len > payload_size) {
 					store_len = payload_size;
@@ -7867,47 +7523,44 @@ next_zip_page:
 					store_len = extern_len;
 				}
 
-				mlog_write_string(page + FIL_PAGE_DATA
-						  + BTR_BLOB_HDR_SIZE,
-						  (const byte*)
-						  big_rec_vec->fields[i].data
-						  + big_rec_vec->fields[i].len
-						  - extern_len,
-						  store_len, &mtr);
-				mlog_write_ulint(page + FIL_PAGE_DATA
-						 + BTR_BLOB_HDR_PART_LEN,
-						 store_len, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(page + FIL_PAGE_DATA
-						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
-						 FIL_NULL, MLOG_4BYTES, &mtr);
+				mtr.memcpy<mtr_t::MAYBE_NOP>(
+					*block,
+					FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
+					+ block->frame,
+					static_cast<const byte*>
+					(big_rec_vec->fields[i].data)
+					+ big_rec_vec->fields[i].len
+					- extern_len, store_len);
+				mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
+					     + FIL_PAGE_DATA + block->frame,
+					     store_len);
+				compile_time_assert(FIL_NULL == 0xffffffff);
+				mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
+					   + FIL_PAGE_DATA, 4, 0xff);
 
 				extern_len -= store_len;
 
 				ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
 							+ field_ref));
-				mlog_write_ulint(field_ref
-						 + BTR_EXTERN_LEN + 4,
-						 big_rec_vec->fields[i].len
-						 - extern_len,
-						 MLOG_4BYTES, &mtr);
+				mtr.write<4>(*rec_block,
+					     BTR_EXTERN_LEN + 4 + field_ref,
+					     big_rec_vec->fields[i].len
+					     - extern_len);
 
 				if (prev_page_no == FIL_NULL) {
 					ut_ad(blob_npages == 0);
-					mlog_write_ulint(field_ref
-							 + BTR_EXTERN_SPACE_ID,
-							 space_id, MLOG_4BYTES,
-							 &mtr);
-
-					mlog_write_ulint(field_ref
-							 + BTR_EXTERN_PAGE_NO,
-							 page_no, MLOG_4BYTES,
-							 &mtr);
-
-					mlog_write_ulint(field_ref
-							 + BTR_EXTERN_OFFSET,
-							 FIL_PAGE_DATA,
-							 MLOG_4BYTES,
-							 &mtr);
+					mtr.write<4,mtr_t::MAYBE_NOP>(
+						*rec_block,
+						field_ref + BTR_EXTERN_SPACE_ID,
+						space_id);
+
+					mtr.write<4>(*rec_block, field_ref
+						     + BTR_EXTERN_PAGE_NO,
+						     page_no);
+
+					mtr.write<4>(*rec_block, field_ref
+						     + BTR_EXTERN_OFFSET,
+						     FIL_PAGE_DATA);
 				}
 
 				prev_page_no = page_no;
@@ -7958,40 +7611,30 @@ func_exit:
 	return(error);
 }
 
-/*******************************************************************//**
-Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
-static
-void
-btr_check_blob_fil_page_type(
-/*=========================*/
-	ulint		space_id,	/*!< in: space id */
-	ulint		page_no,	/*!< in: page number */
-	const page_t*	page,		/*!< in: page */
-	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
+/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page.
+@param[in]      block   uncompressed BLOB page
+@param[in]      read    true=read, false=purge */
+static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read)
 {
-	ulint	type = fil_page_get_type(page);
-
-	ut_a(space_id == page_get_space_id(page));
-	ut_a(page_no == page_get_page_no(page));
-
-	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
-		ulint	flags = fil_space_get_flags(space_id);
-
-#ifndef UNIV_DEBUG /* Improve debug test coverage */
-		if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
-			/* Old versions of InnoDB did not initialize
-			FIL_PAGE_TYPE on BLOB pages.  Do not print
-			anything about the type mismatch when reading
-			a BLOB page that may be from old versions. */
-			return;
-		}
-#endif /* !UNIV_DEBUG */
-
-		ib::fatal() << "FIL_PAGE_TYPE=" << type
-			<< " on BLOB " << (read ? "read" : "purge")
-			<< " space " << space_id << " page " << page_no
-			<< " flags " << flags;
-	}
+  uint16_t type= fil_page_get_type(block.frame);
+
+  if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB))
+    return;
+  /* FIXME: take the tablespace as a parameter */
+  if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
+  {
+    /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
+    pages.  Do not print anything about the type mismatch when reading
+    a BLOB page that may be from old versions. */
+    if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags))
+    {
+      ib::fatal() << "FIL_PAGE_TYPE=" << type
+		  << (read ? " on BLOB read file " : " on BLOB purge file ")
+		  << space->chain.start->name
+		  << " page " << block.page.id().page_no();
+    }
+    space->release();
+  }
 }
 
 /*******************************************************************//**
@@ -8015,8 +7658,7 @@ btr_free_externally_stored_field(
 					page_zip_write_blob_ptr(), or NULL */
 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
 					or NULL */
-	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
-					to rec, or NULL if rec == NULL */
+	buf_block_t*	block,		/*!< in/out: page of field_ref */
 	ulint		i,		/*!< in: field number of field_ref;
 					ignored if rec == NULL */
 	bool		rollback,	/*!< in: performing rollback? */
@@ -8025,19 +7667,19 @@ btr_free_externally_stored_field(
 					X-latch to the index tree */
 {
 	page_t*		page;
-	const ulint	space_id	= mach_read_from_4(
+	const uint32_t	space_id	= mach_read_from_4(
 		field_ref + BTR_EXTERN_SPACE_ID);
-	const ulint	start_page	= mach_read_from_4(
+	const uint32_t	start_page	= mach_read_from_4(
 		field_ref + BTR_EXTERN_PAGE_NO);
-	ulint		page_no;
-	ulint		next_page_no;
+	uint32_t	page_no;
+	uint32_t	next_page_no;
 	mtr_t		mtr;
 
 	ut_ad(index->is_primary());
-	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
-					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
-	ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
-				     MTR_MEMO_PAGE_X_FIX));
+	ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					       | MTR_MEMO_SX_LOCK));
+	ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
+						    MTR_MEMO_PAGE_X_FIX));
 	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
 	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
 	ut_ad(local_mtr->is_named_space(
@@ -8061,10 +7703,8 @@ btr_free_externally_stored_field(
 	const ulint ext_zip_size = index->table->space->zip_size();
 	const ulint rec_zip_size = rec ? ext_zip_size : 0;
 
-	if (rec == NULL) {
-		/* This is a call from row_purge_upd_exist_or_extern(). */
-		ut_ad(!page_zip);
-	}
+	/* !rec holds in a call from purge when field_ref is in an undo page */
+	ut_ad(rec || !block->page.zip.data);
 
 	for (;;) {
 #ifdef UNIV_DEBUG
@@ -8133,43 +7773,40 @@ btr_free_externally_stored_field(
 
 			btr_page_free(index, ext_block, &mtr, true);
 
-			if (page_zip != NULL) {
+			if (UNIV_LIKELY_NULL(block->page.zip.data)) {
 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
 						next_page_no);
-				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
-						0);
-				page_zip_write_blob_ptr(page_zip, rec, index,
+				memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4);
+				page_zip_write_blob_ptr(block, rec, index,
 							offsets, i, &mtr);
 			} else {
-				mlog_write_ulint(field_ref
-						 + BTR_EXTERN_PAGE_NO,
-						 next_page_no,
-						 MLOG_4BYTES, &mtr);
-				mlog_write_ulint(field_ref
-						 + BTR_EXTERN_LEN + 4, 0,
-						 MLOG_4BYTES, &mtr);
+				mtr.write<4>(*block,
+					     BTR_EXTERN_PAGE_NO + field_ref,
+					     next_page_no);
+				mtr.write<4,mtr_t::MAYBE_NOP>(*block,
+							      BTR_EXTERN_LEN
+							      + 4 + field_ref,
+							      0U);
 			}
 		} else {
-			ut_a(!page_zip);
-			btr_check_blob_fil_page_type(space_id, page_no, page,
-						     FALSE);
+			ut_ad(!block->page.zip.data);
+			btr_check_blob_fil_page_type(*ext_block, false);
 
 			next_page_no = mach_read_from_4(
 				page + FIL_PAGE_DATA
 				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
 			btr_page_free(index, ext_block, &mtr, true);
 
-			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
-					 next_page_no,
-					 MLOG_4BYTES, &mtr);
+			mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
+				     next_page_no);
 			/* Zero out the BLOB length.  If the server
 			crashes during the execution of this function,
 			trx_rollback_all_recovered() could
 			dereference the half-deleted BLOB, fetching a
 			wrong prefix for the BLOB. */
-			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
-					 0,
-					 MLOG_4BYTES, &mtr);
+			mtr.write<4,mtr_t::MAYBE_NOP>(*block,
+						      BTR_EXTERN_LEN + 4
+						      + field_ref, 0U);
 		}
 
 		/* Commit mtr and release the BLOB block to save memory. */
@@ -8187,8 +7824,7 @@ btr_rec_free_externally_stored_fields(
 				tree MUST be X-latched */
 	rec_t*		rec,	/*!< in/out: record */
 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in: index page of rec */
 	bool		rollback,/*!< in: performing rollback? */
 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
 				an X-latch to record page and to the index
@@ -8198,7 +7834,7 @@ btr_rec_free_externally_stored_fields(
 	ulint	i;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
-	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(index->is_primary());
 	ut_ad(page_rec_is_leaf(rec));
 	/* Free possible externally stored fields in the record */
@@ -8210,7 +7846,7 @@ btr_rec_free_externally_stored_fields(
 		if (rec_offs_nth_extern(offsets, i)) {
 			btr_free_externally_stored_field(
 				index, btr_rec_get_field_ref(rec, offsets, i),
-				rec, offsets, page_zip, i, rollback, mtr);
+				rec, offsets, block, i, rollback, mtr);
 		}
 	}
 }
@@ -8225,8 +7861,7 @@ btr_rec_free_updated_extern_fields(
 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
 				X-latched */
 	rec_t*		rec,	/*!< in/out: record */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in: index page of rec */
 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	const upd_t*	update,	/*!< in: update vector */
 	bool		rollback,/*!< in: performing rollback? */
@@ -8237,7 +7872,7 @@ btr_rec_free_updated_extern_fields(
 	ulint	i;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
-	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
 
 	/* Free possible externally stored fields in the record */
 
@@ -8254,7 +7889,7 @@ btr_rec_free_updated_extern_fields(
 
 			btr_free_externally_stored_field(
 				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
-				rec, offsets, page_zip,
+				rec, offsets, block,
 				ufield->field_no, rollback, mtr);
 		}
 	}
@@ -8270,10 +7905,9 @@ btr_copy_blob_prefix(
 /*=================*/
 	byte*		buf,	/*!< out: the externally stored part of
 				the field, or a prefix of it */
-	ulint		len,	/*!< in: length of buf, in bytes */
-	ulint		space_id,/*!< in: space id of the BLOB pages */
-	ulint		page_no,/*!< in: page number of the first BLOB page */
-	ulint		offset)	/*!< in: offset on the first BLOB page */
+	uint32_t	len,	/*!< in: length of buf, in bytes */
+	page_id_t	id,	/*!< in: page identifier of the first BLOB page */
+	uint32_t	offset)	/*!< in: offset on the first BLOB page */
 {
 	ulint	copied_len	= 0;
 
@@ -8287,12 +7921,11 @@ btr_copy_blob_prefix(
 
 		mtr_start(&mtr);
 
-		block = buf_page_get(page_id_t(space_id, page_no),
-				     0, RW_S_LATCH, &mtr);
+		block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
 		page = buf_block_get_frame(block);
 
-		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
+		btr_check_blob_fil_page_type(*block, true);
 
 		blob_header = page + offset;
 		part_len = btr_blob_get_part_len(blob_header);
@@ -8302,11 +7935,11 @@ btr_copy_blob_prefix(
 		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
 		copied_len += copy_len;
 
-		page_no = btr_blob_get_next_page_no(blob_header);
+		id.set_page_no(btr_blob_get_next_page_no(blob_header));
 
 		mtr_commit(&mtr);
 
-		if (page_no == FIL_NULL || copy_len != part_len) {
+		if (id.page_no() == FIL_NULL || copy_len != part_len) {
 			MEM_CHECK_DEFINED(buf, copied_len);
 			return(copied_len);
 		}
@@ -8327,18 +7960,16 @@ by a lock or a page latch.
 or a prefix of it
 @param[in]	len		length of buf, in bytes
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
-@param[in]	space_id	space id of the BLOB pages
-@param[in]	offset		offset on the first BLOB page
+@param[in]	id		page identifier of the first BLOB page
 @return number of bytes written to buf */
 static
 ulint
 btr_copy_zblob_prefix(
 	byte*			buf,
-	ulint			len,
+	uint32_t		len,
 	ulint			zip_size,
-	ulint			space_id,
-	ulint			page_no,
-	ulint			offset)
+	page_id_t		id,
+	uint32_t		offset)
 {
 	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
 	mem_heap_t*	heap;
@@ -8357,25 +7988,23 @@ btr_copy_zblob_prefix(
 
 	ut_ad(zip_size);
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(space_id);
+	ut_ad(id.space());
 
 	err = inflateInit(&d_stream);
 	ut_a(err == Z_OK);
 
 	for (;;) {
 		buf_page_t*	bpage;
-		ulint		next_page_no;
+		uint32_t	next_page_no;
 
 		/* There is no latch on bpage directly.  Instead,
 		bpage is protected by the B-tree page latch that
 		is being held on the clustered index record, or,
 		in row_merge_copy_blobs(), by an exclusive table lock. */
-		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
-					 zip_size);
+		bpage = buf_page_get_zip(id, zip_size);
 
 		if (UNIV_UNLIKELY(!bpage)) {
-			ib::error() << "Cannot load compressed BLOB "
-				<< page_id_t(space_id, page_no);
+			ib::error() << "Cannot load compressed BLOB " << id;
 			goto func_exit;
 		}
 
@@ -8384,8 +8013,7 @@ btr_copy_zblob_prefix(
 
 			ib::error() << "Unexpected type "
 				<< fil_page_get_type(bpage->zip.data)
-				<< " of compressed BLOB page "
-				<< page_id_t(space_id, page_no);
+				<< " of compressed BLOB page " << id;
 
 			ut_ad(0);
 			goto end_of_blob;
@@ -8420,7 +8048,7 @@ btr_copy_zblob_prefix(
 		default:
 inflate_error:
 			ib::error() << "inflate() of compressed BLOB page "
-				<< page_id_t(space_id, page_no)
+				<< id
 				<< " returned " << err
 				<< " (" << d_stream.msg << ")";
 
@@ -8432,8 +8060,7 @@ inflate_error:
 			if (!d_stream.avail_in) {
 				ib::error()
 					<< "Unexpected end of compressed "
-					<< "BLOB page "
-					<< page_id_t(space_id, page_no);
+					<< "BLOB page " << id;
 			} else {
 				err = inflate(&d_stream, Z_FINISH);
 				switch (err) {
@@ -8455,7 +8082,7 @@ end_of_blob:
 		/* On other BLOB pages except the first
 		the BLOB header always is at the page header: */
 
-		page_no = next_page_no;
+		id.set_page_no(next_page_no);
 		offset = FIL_PAGE_NEXT;
 		page_type = FIL_PAGE_TYPE_ZBLOB2;
 	}
@@ -8474,31 +8101,24 @@ by a lock or a page latch.
 field, or a prefix of it
 @param[in]	len		length of buf, in bytes
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	space_id	space id of the first BLOB page
-@param[in]	page_no		page number of the first BLOB page
+@param[in]	id		page identifier of the first BLOB page
 @param[in]	offset		offset on the first BLOB page
 @return number of bytes written to buf */
 static
 ulint
 btr_copy_externally_stored_field_prefix_low(
 	byte*			buf,
-	ulint			len,
+	uint32_t		len,
 	ulint			zip_size,
-	ulint			space_id,
-	ulint			page_no,
-	ulint			offset)
+	page_id_t		id,
+	uint32_t		offset)
 {
-	if (len == 0) {
-		return(0);
-	}
+  if (len == 0)
+    return 0;
 
-	if (zip_size) {
-		return(btr_copy_zblob_prefix(buf, len, zip_size,
-					     space_id, page_no, offset));
-	} else {
-		return(btr_copy_blob_prefix(buf, len, space_id,
-					    page_no, offset));
-	}
+  return zip_size
+    ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset)
+    : btr_copy_blob_prefix(buf, len, id, offset);
 }
 
 /** Copies the prefix of an externally stored field of a record.
@@ -8520,10 +8140,6 @@ btr_copy_externally_stored_field_prefix(
 	const byte*		data,
 	ulint			local_len)
 {
-	ulint	space_id;
-	ulint	page_no;
-	ulint	offset;
-
 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
 
 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
@@ -8546,17 +8162,18 @@ btr_copy_externally_stored_field_prefix(
 		return(0);
 	}
 
-	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
-
-	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
-
-	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
+	uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
+	uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
+	uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
+	len -= local_len;
 
 	return(local_len
 	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
-							     len - local_len,
+							     uint32_t(len),
 							     zip_size,
-							     space_id, page_no,
+							     page_id_t(
+								     space_id,
+								     page_no),
 							     offset));
 }
 
@@ -8578,26 +8195,24 @@ btr_copy_externally_stored_field(
 	ulint			local_len,
 	mem_heap_t*		heap)
 {
-	ulint	space_id;
-	ulint	page_no;
-	ulint	offset;
-	ulint	extern_len;
 	byte*	buf;
 
 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
 
 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
 
-	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
-
-	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
-
-	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
+	uint32_t space_id = mach_read_from_4(data + local_len
+					     + BTR_EXTERN_SPACE_ID);
+	uint32_t page_no = mach_read_from_4(data + local_len
+					    + BTR_EXTERN_PAGE_NO);
+	uint32_t offset = mach_read_from_4(data + local_len
+					   + BTR_EXTERN_OFFSET);
 
 	/* Currently a BLOB cannot be bigger than 4 GB; we
 	leave the 4 upper bytes in the length field unused */
 
-	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
+	uint32_t extern_len = mach_read_from_4(data + local_len
+					       + BTR_EXTERN_LEN + 4);
 
 	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
 
@@ -8606,8 +8221,10 @@ btr_copy_externally_stored_field(
 		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
 							      extern_len,
 							      zip_size,
-							      space_id,
-							      page_no, offset);
+							      page_id_t(
+								      space_id,
+								      page_no),
+							      offset);
 
 	return(buf);
 }
@@ -8653,7 +8270,7 @@ btr_rec_copy_externally_stored_field(
 		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
 		/* The externally stored field was not written yet.
 		This record should only be seen by
-		recv_recovery_rollback_active() or any
+		trx_rollback_recovered() or any
 		TRX_ISO_READ_UNCOMMITTED transactions. */
 		return(NULL);
 	}
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
index 67076e7a515..09454aeccc1 100644
--- a/storage/innobase/btr/btr0defragment.cc
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -50,6 +50,21 @@ possible. From experimentation it seems that reduce the target size by 512 every
 time will make sure the page is compressible within a couple of iterations. */
 #define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE	512
 
+/** Item in the defragmentation work queue, processed by btr_defragment_chunk(). */
+struct btr_defragment_item_t
+{
+	btr_pcur_t*	pcur;		/* persistent cursor where
+					btr_defragment_n_pages should start */
+	os_event_t	event;		/* if not null, signal after work
+					is done */
+	bool		removed;	/* Mark an item as removed */
+	ulonglong	last_processed;	/* timestamp of last time this index
+					is processed by defragment thread */
+
+	btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
+	~btr_defragment_item_t();
+};
+
 /* Work queue for defragmentation. */
 typedef std::list<btr_defragment_item_t*>	btr_defragment_wq_t;
 static btr_defragment_wq_t	btr_defragment_wq;
@@ -71,6 +86,21 @@ The difference between btr_defragment_count and btr_defragment_failures shows
 the amount of effort wasted. */
 Atomic_counter<ulint> btr_defragment_count;
 
+bool btr_defragment_active;
+
+struct defragment_chunk_state_t
+{
+	btr_defragment_item_t* m_item;
+};
+
+static defragment_chunk_state_t defragment_chunk_state;
+static void btr_defragment_chunk(void*);
+
+static tpool::timer* btr_defragment_timer;
+static tpool::task_group task_group(1);
+static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
+static void btr_defragment_start();
+
 /******************************************************************//**
 Constructor for btr_defragment_item_t. */
 btr_defragment_item_t::btr_defragment_item_t(
@@ -94,6 +124,11 @@ btr_defragment_item_t::~btr_defragment_item_t() {
 	}
 }
 
+static void submit_defragment_task(void*arg=0)
+{
+	srv_thread_pool->submit_task(&btr_defragment_task);
+}
+
 /******************************************************************//**
 Initialize defragmentation. */
 void
@@ -101,6 +136,9 @@ btr_defragment_init()
 {
 	srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
 	mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex);
+	defragment_chunk_state.m_item = 0;
+	btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
+	btr_defragment_active = true;
 }
 
 /******************************************************************//**
@@ -108,6 +146,11 @@ Shutdown defragmentation. Release all resources. */
 void
 btr_defragment_shutdown()
 {
+	if (!btr_defragment_timer)
+		return;
+	delete btr_defragment_timer;
+	btr_defragment_timer = 0;
+	task_group.cancel_pending(&btr_defragment_task);
 	mutex_enter(&btr_defragment_mutex);
 	std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
 	while(iter != btr_defragment_wq.end()) {
@@ -117,6 +160,7 @@ btr_defragment_shutdown()
 	}
 	mutex_exit(&btr_defragment_mutex);
 	mutex_free(&btr_defragment_mutex);
+	btr_defragment_active = false;
 }
 
 
@@ -160,11 +204,7 @@ btr_defragment_add_index(
 	*err = DB_SUCCESS;
 
 	mtr_start(&mtr);
-	// Load index rood page.
-	buf_block_t* block = btr_block_get(
-		page_id_t(index->table->space_id, index->page),
-		index->table->space->zip_size(),
-		RW_NO_LATCH, index, &mtr);
+	buf_block_t* block = btr_root_block_get(index, RW_NO_LATCH, &mtr);
 	page_t* page = NULL;
 
 	if (block) {
@@ -196,6 +236,10 @@ btr_defragment_add_index(
 	btr_defragment_item_t*	item = new btr_defragment_item_t(pcur, event);
 	mutex_enter(&btr_defragment_mutex);
 	btr_defragment_wq.push_back(item);
+	if(btr_defragment_wq.size() == 1){
+		/* Kick off defragmentation work */
+		btr_defragment_start();
+	}
 	mutex_exit(&btr_defragment_mutex);
 	return event;
 }
@@ -368,7 +412,7 @@ btr_defragment_calc_n_recs_for_size(
 Merge as many records from the from_block to the to_block. Delete
 the from_block if all records are successfully merged to to_block.
 @return the to_block to target for next merge operation. */
-UNIV_INTERN
+static
 buf_block_t*
 btr_defragment_merge_pages(
 	dict_index_t*	index,		/*!< in: index tree */
@@ -414,7 +458,7 @@ btr_defragment_merge_pages(
 	// reorganizing the page, otherwise we need to reorganize the page
 	// first to release more space.
 	if (move_size > max_ins_size) {
-		if (!btr_page_reorganize_block(false, page_zip_level,
+		if (!btr_page_reorganize_block(page_zip_level,
 					       to_block, index,
 					       mtr)) {
 			if (!dict_index_is_clust(index)
@@ -486,10 +530,8 @@ btr_defragment_merge_pages(
 		lock_update_merge_left(to_block, orig_pred,
 				       from_block);
 		btr_search_drop_page_hash_index(from_block);
-
-		ut_a(DB_SUCCESS == btr_level_list_remove(
-			index->table->space_id,
-			zip_size, from_page, index, mtr));
+		ut_a(DB_SUCCESS == btr_level_list_remove(*from_block, *index,
+							 mtr));
 		btr_page_get_father(index, from_block, mtr, &parent);
 		btr_cur_node_ptr_delete(&parent, mtr);
 		/* btr_blob_dbg_remove(from_page, index,
@@ -582,7 +624,7 @@ btr_defragment_n_pages(
 	blocks[0] = block;
 	for (uint i = 1; i <= n_pages; i++) {
 		page_t* page = buf_block_get_frame(blocks[i-1]);
-		ulint page_no = btr_page_get_next(page);
+		uint32_t page_no = btr_page_get_next(page);
 		total_data_size += page_get_data_size(page);
 		total_n_recs += page_get_n_recs(page);
 		if (page_no == FIL_NULL) {
@@ -591,9 +633,8 @@ btr_defragment_n_pages(
 			break;
 		}
 
-		blocks[i] = btr_block_get(page_id_t(index->table->space_id,
-						    page_no), zip_size,
-					  RW_X_LATCH, index, mtr);
+		blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
+					  mtr);
 	}
 
 	if (n_pages == 1) {
@@ -640,8 +681,9 @@ btr_defragment_n_pages(
 		max_data_size = optimal_page_size;
 	}
 
-	reserved_space = ut_min((ulint)(optimal_page_size
-			      * (1 - srv_defragment_fill_factor)),
+	reserved_space = ut_min(static_cast<ulint>(
+					static_cast<double>(optimal_page_size)
+					* (1 - srv_defragment_fill_factor)),
 			     (data_size_per_rec
 			      * srv_defragment_fill_factor_n_recs));
 	optimal_page_size -= reserved_space;
@@ -681,14 +723,29 @@ btr_defragment_n_pages(
 	return current_block;
 }
 
-/** Whether btr_defragment_thread is active */
-bool btr_defragment_thread_active;
 
-/** Merge consecutive b-tree pages into fewer pages to defragment indexes */
-extern "C" UNIV_INTERN
-os_thread_ret_t
-DECLARE_THREAD(btr_defragment_thread)(void*)
+
+void btr_defragment_start() {
+	if (!srv_defragment)
+		return;
+	ut_ad(!btr_defragment_wq.empty());
+	submit_defragment_task();
+}
+
+
+/**
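+Task callback that performs defragmentation work; it is submitted by
+btr_defragment_start() and resubmitted by the defragment timer.
+
+The throttling "sleep" is implemented by rescheduling the thread pool
+timer: when the timer fires, the task is submitted again and the work
+resumes where it left off.
+The current work item is kept in the static defragment_chunk_state.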
+*/
+static void btr_defragment_chunk(void*)
 {
+	defragment_chunk_state_t* state = &defragment_chunk_state;
+
 	btr_pcur_t*	pcur;
 	btr_cur_t*	cursor;
 	dict_index_t*	index;
@@ -697,37 +754,24 @@ DECLARE_THREAD(btr_defragment_thread)(void*)
 	buf_block_t*	last_block;
 
 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-		ut_ad(btr_defragment_thread_active);
-
-		/* If defragmentation is disabled, sleep before
-		checking whether it's enabled. */
-		if (!srv_defragment) {
-			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
-			continue;
-		}
-		/* The following call won't remove the item from work queue.
-		We only get a pointer to it to work on. This will make sure
-		when user issue a kill command, all indices are in the work
-		queue to be searched. This also means that the user thread
-		cannot directly remove the item from queue (since we might be
-		using it). So user thread only marks index as removed. */
-		btr_defragment_item_t* item = btr_defragment_get_item();
-		/* If work queue is empty, sleep and check later. */
-		if (!item) {
-			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
-			continue;
+		if (!state->m_item) {
+			state->m_item = btr_defragment_get_item();
 		}
 		/* If an index is marked as removed, we remove it from the work
 		queue. No other thread could be using this item at this point so
 		it's safe to remove now. */
-		if (item->removed) {
-			btr_defragment_remove_item(item);
-			continue;
+		while (state->m_item && state->m_item->removed) {
+			btr_defragment_remove_item(state->m_item);
+			state->m_item = btr_defragment_get_item();
+		}
+		if (!state->m_item) {
+			/* Queue empty */
+			return;
 		}
 
-		pcur = item->pcur;
+		pcur = state->m_item->pcur;
 		ulonglong now = my_interval_timer();
-		ulonglong elapsed = now - item->last_processed;
+		ulonglong elapsed = now - state->m_item->last_processed;
 
 		if (elapsed < srv_defragment_interval) {
 			/* If we see an index again before the interval
@@ -736,12 +780,13 @@ DECLARE_THREAD(btr_defragment_thread)(void*)
 			defragmentation of all indices queue up on a single
 			thread, it's likely other indices that follow this one
 			don't need to sleep again. */
-			os_thread_sleep(static_cast<ulint>
-					((srv_defragment_interval - elapsed)
-					 / 1000));
+			int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
+			if (sleep_ms) {
+				btr_defragment_timer->set_time(sleep_ms, 0);
+				return;
+			}
 		}
-
-		now = my_interval_timer();
+		log_free_check();
 		mtr_start(&mtr);
 		cursor = btr_pcur_get_btr_cur(pcur);
 		index = btr_cur_get_index(cursor);
@@ -770,7 +815,7 @@ DECLARE_THREAD(btr_defragment_thread)(void*)
 			btr_pcur_store_position(pcur, &mtr);
 			mtr_commit(&mtr);
 			/* Update the last_processed time of this index. */
-			item->last_processed = now;
+			state->m_item->last_processed = now;
 		} else {
 			dberr_t err = DB_SUCCESS;
 			mtr_commit(&mtr);
@@ -793,11 +838,8 @@ DECLARE_THREAD(btr_defragment_thread)(void*)
 				}
 			}
 
-			btr_defragment_remove_item(item);
+			btr_defragment_remove_item(state->m_item);
+			state->m_item = NULL;
 		}
 	}
-
-	btr_defragment_thread_active = false;
-	os_thread_exit();
-	OS_THREAD_DUMMY_RETURN;
 }
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
index c31db34186c..76d2be211f8 100644
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -113,18 +113,16 @@ btr_pcur_store_position(
 
 	rec = page_cur_get_rec(page_cursor);
 	offs = rec - block->frame;
-	ut_ad(block->page.id.page_no() == page_get_page_no(block->frame));
-	ut_ad(block->page.buf_fix_count);
+	ut_ad(block->page.id().page_no() == page_get_page_no(block->frame));
+	ut_ad(block->page.buf_fix_count());
 	/* For spatial index, when we do positioning on parent
 	buffer if necessary, it might not hold latches, but the
 	tree must be locked to prevent change on the page */
-	ut_ad(mtr_memo_contains_flagged(mtr, block,
-					MTR_MEMO_PAGE_S_FIX
-					| MTR_MEMO_PAGE_X_FIX)
-	      || (dict_index_is_spatial(index)
-		  && mtr_memo_contains_flagged(
-			  mtr, dict_index_get_lock(index),
-			  MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX
+					 | MTR_MEMO_PAGE_X_FIX)
+	      || (index->is_spatial()
+		  && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+						| MTR_MEMO_SX_LOCK)));
 
 	cursor->old_stored = true;
 
@@ -135,7 +133,7 @@ btr_pcur_store_position(
 
 		ut_a(!page_has_siblings(block->frame));
 		ut_ad(page_is_leaf(block->frame));
-		ut_ad(block->page.id.page_no() == index->page);
+		ut_ad(block->page.id().page_no() == index->page);
 
 		if (page_rec_is_supremum_low(offs)) {
 			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
@@ -152,8 +150,15 @@ before_first:
 
 		ut_ad(!page_rec_is_infimum(rec));
 		if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) {
+#if 0 /* MDEV-22867 had to relax this */
+			/* If the table is emptied during an ALGORITHM=NOCOPY
+			DROP COLUMN ... that is not ALGORITHM=INSTANT,
+			then we must preserve any instant ADD metadata. */
 			ut_ad(index->table->instant
-			      || block->page.id.page_no() != index->page);
+			      || block->page.id().page_no() != index->page);
+#endif
+			ut_ad(index->is_instant()
+			      || block->page.id().page_no() != index->page);
 			ut_ad(page_get_n_recs(block->frame) == 1);
 			ut_ad(page_is_leaf(block->frame));
 			ut_ad(!page_has_prev(block->frame));
@@ -216,15 +221,15 @@ btr_pcur_copy_stored_position(
 					copied */
 {
 	ut_free(pcur_receive->old_rec_buf);
-	ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+	memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
 
 	if (pcur_donate->old_rec_buf) {
 
 		pcur_receive->old_rec_buf = (byte*)
 			ut_malloc_nokey(pcur_donate->buf_size);
 
-		ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
-			  pcur_donate->buf_size);
+		memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
+		       pcur_donate->buf_size);
 		pcur_receive->old_rec = pcur_receive->old_rec_buf
 			+ (pcur_donate->old_rec - pcur_donate->old_rec_buf);
 	}
@@ -320,7 +325,7 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
 		pos_state = BTR_PCUR_IS_POSITIONED;
 		block_when_stored.clear();
 
-		return NOT_SAME;
+		return restore_status::NOT_SAME;
 	}
 
 	ut_a(old_rec);
@@ -376,7 +381,7 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
 						   index));
 				mem_heap_free(heap);
 #endif /* UNIV_DEBUG */
-				return SAME_ALL;
+				return restore_status::SAME_ALL;
 			}
 			/* This is the same record as stored,
 			may need to be adjusted for BTR_PCUR_BEFORE/AFTER,
@@ -385,7 +390,7 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
 				pos_state
 					= BTR_PCUR_IS_POSITIONED_OPTIMISTIC;
 			}
-			return NOT_SAME;
+			return restore_status::NOT_SAME;
 		}
 	}
 
@@ -435,7 +440,7 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
 	      || rel_pos == BTR_PCUR_AFTER);
 	rec_offs offsets[REC_OFFS_NORMAL_SIZE];
 	rec_offs_init(offsets);
-	restore_status ret_val= NOT_SAME;
+	restore_status ret_val= restore_status::NOT_SAME;
 	if (rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(this)) {
 		ulint n_matched_fields= 0;
 		if (!cmp_dtuple_rec_with_match(
@@ -455,10 +460,10 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
 
 			mem_heap_free(heap);
 
-			return SAME_ALL;
+			return restore_status::SAME_ALL;
 		}
 		if (n_matched_fields >= index->n_uniq)
-			ret_val= SAME_UNIQ;
+			ret_val= restore_status::SAME_UNIQ;
 	}
 
 	mem_heap_free(heap);
@@ -484,29 +489,23 @@ btr_pcur_move_to_next_page(
 				last record of the current page */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	ulint		next_page_no;
-	page_t*		page;
-	buf_block_t*	next_block;
-	page_t*		next_page;
-	ulint		mode;
-
 	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
 	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
 	ut_ad(btr_pcur_is_after_last_on_page(cursor));
 
 	cursor->old_stored = false;
 
-	page = btr_pcur_get_page(cursor);
+	const page_t* page = btr_pcur_get_page(cursor);
 
 	if (UNIV_UNLIKELY(!page)) {
 		return;
 	}
 
-	next_page_no = btr_page_get_next(page);
+	const uint32_t next_page_no = btr_page_get_next(page);
 
 	ut_ad(next_page_no != FIL_NULL);
 
-	mode = cursor->latch_mode;
+	ulint mode = cursor->latch_mode;
 	switch (mode) {
 	case BTR_SEARCH_TREE:
 		mode = BTR_SEARCH_LEAF;
@@ -515,22 +514,19 @@ btr_pcur_move_to_next_page(
 		mode = BTR_MODIFY_LEAF;
 	}
 
-	buf_block_t*	block = btr_pcur_get_block(cursor);
-
-	next_block = btr_block_get(
-		page_id_t(block->page.id.space(), next_page_no),
-		block->zip_size(), mode,
-		btr_pcur_get_btr_cur(cursor)->index, mtr);
+	buf_block_t* next_block = btr_block_get(
+		*btr_pcur_get_btr_cur(cursor)->index, next_page_no, mode,
+		page_is_leaf(page), mtr);
 
 	if (UNIV_UNLIKELY(!next_block)) {
 		return;
 	}
 
-	next_page = buf_block_get_frame(next_block);
+	const page_t* next_page = buf_block_get_frame(next_block);
 #ifdef UNIV_BTR_DEBUG
 	ut_a(page_is_comp(next_page) == page_is_comp(page));
 	ut_a(btr_page_get_prev(next_page)
-	     == btr_pcur_get_block(cursor)->page.id.page_no());
+	     == btr_pcur_get_block(cursor)->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
 
 	btr_leaf_page_release(btr_pcur_get_block(cursor), mode, mtr);
diff --git a/storage/innobase/btr/btr0scrub.cc b/storage/innobase/btr/btr0scrub.cc
deleted file mode 100644
index c25d2e42555..00000000000
--- a/storage/innobase/btr/btr0scrub.cc
+++ /dev/null
@@ -1,908 +0,0 @@
-// Copyright (c) 2014, Google Inc.
-// Copyright (c) 2017, 2021, MariaDB Corporation.
-
-/**************************************************//**
-@file btr/btr0scrub.cc
-Scrubbing of btree pages
-
-*******************************************************/
-
-#include "btr0btr.h"
-#include "btr0cur.h"
-#include "btr0scrub.h"
-#include "ibuf0ibuf.h"
-#include "fsp0fsp.h"
-#include "dict0dict.h"
-#include "mtr0mtr.h"
-
-/* used when trying to acquire dict-lock */
-UNIV_INTERN bool fil_crypt_is_closing(ulint space);
-
-/**
-* scrub data at delete time (e.g purge thread)
-*/
-my_bool srv_immediate_scrub_data_uncompressed = false;
-
-/**
-* background scrub uncompressed data
-*
-* if srv_immediate_scrub_data_uncompressed is enabled
-* this is only needed to handle "old" data
-*/
-my_bool srv_background_scrub_data_uncompressed = false;
-
-/**
-* backgrounds scrub compressed data
-*
-* reorganize compressed page for scrubbing
-* (only way to scrub compressed data)
-*/
-my_bool srv_background_scrub_data_compressed = false;
-
-/* check spaces once per hour */
-UNIV_INTERN uint srv_background_scrub_data_check_interval = (60 * 60);
-
-/* default to scrub spaces that hasn't been scrubbed in a week */
-UNIV_INTERN uint srv_background_scrub_data_interval = (7 * 24 * 60 * 60);
-
-/**
-* statistics for scrubbing by background threads
-*/
-static btr_scrub_stat_t scrub_stat;
-static ib_mutex_t scrub_stat_mutex;
-#ifdef UNIV_PFS_MUTEX
-UNIV_INTERN mysql_pfs_key_t scrub_stat_mutex_key;
-#endif
-
-#ifdef UNIV_DEBUG
-/**
-* srv_scrub_force_testing
-*
-* - force scrubbing using background threads even for uncompressed tables
-* - force pessimistic scrubbing (page split) even if not needed
-*   (see test_pessimistic_scrub_pct)
-*/
-my_bool srv_scrub_force_testing = true;
-
-/**
-* Force pessimistic scrubbing in 50% of the cases (UNIV_DEBUG only)
-*/
-static int test_pessimistic_scrub_pct = 50;
-
-#endif
-static uint scrub_compression_level = page_zip_level;
-
-/**************************************************************//**
-Log a scrubbing failure */
-static
-void
-log_scrub_failure(
-/*===============*/
-	dict_index_t* index,     /*!< in: index */
-	btr_scrub_t* scrub_data, /*!< in: data to store statistics on */
-	buf_block_t* block,	 /*!< in: block */
-	dberr_t err)             /*!< in: error */
-{
-	const char* reason = "unknown";
-	switch(err) {
-	case DB_UNDERFLOW:
-		reason = "too few records on page";
-		scrub_data->scrub_stat.page_split_failures_underflow++;
-		break;
-	case DB_INDEX_CORRUPT:
-		reason = "unable to find index!";
-		scrub_data->scrub_stat.page_split_failures_missing_index++;
-		break;
-	case DB_OUT_OF_FILE_SPACE:
-		reason = "out of filespace";
-		scrub_data->scrub_stat.page_split_failures_out_of_filespace++;
-		break;
-	default:
-		ut_ad(0);
-		reason = "unknown";
-		scrub_data->scrub_stat.page_split_failures_unknown++;
-	}
-
-	ib::warn() << "Failed to scrub index " << index->name
-		   << " of table " << index->table->name
-		   << " page " << block->page.id << ": " << reason;
-}
-
-/****************************************************************
-Lock dict mutexes */
-static
-bool
-btr_scrub_lock_dict_func(ulint space_id, bool lock_to_close_table,
-			 const char * file, uint line)
-{
-	time_t start = time(0);
-	time_t last = start;
-
-	/* FIXME: this is not the proper way of doing things. The
-	dict_sys.mutex should not be held by any thread for longer
-	than a few microseconds. It must not be held during I/O,
-	for example. So, what is the purpose for this busy-waiting?
-	This function should be rewritten as part of MDEV-8139:
-	Fix scrubbing tests. */
-
-	while (mutex_enter_nowait(&dict_sys.mutex)) {
-		/* if we lock to close a table, we wait forever
-		* if we don't lock to close a table, we check if space
-		* is closing, and then instead give up
-		*/
-		if (lock_to_close_table) {
-		} else if (fil_space_t* space = fil_space_acquire(space_id)) {
-			bool stopping = space->is_stopping();
-			space->release();
-			if (stopping) {
-				return false;
-			}
-		} else {
-			return false;
-		}
-
-		os_thread_sleep(250000);
-
-		time_t now = time(0);
-
-		if (now >= last + 30) {
-			fprintf(stderr,
-				"WARNING: %s:%u waited %ld seconds for"
-				" dict_sys lock, space: " ULINTPF
-				" lock_to_close_table: %d\n",
-				file, line, long(now - start), space_id,
-				lock_to_close_table);
-
-			last = now;
-		}
-	}
-
-	ut_ad(mutex_own(&dict_sys.mutex));
-	return true;
-}
-
-#define btr_scrub_lock_dict(space, lock_to_close_table)			\
-	btr_scrub_lock_dict_func(space, lock_to_close_table, __FILE__, __LINE__)
-
-/****************************************************************
-Unlock dict mutexes */
-static
-void
-btr_scrub_unlock_dict()
-{
-	dict_mutex_exit_for_mysql();
-}
-
-/****************************************************************
-Release reference to table
-*/
-static
-void
-btr_scrub_table_close(
-/*==================*/
-	dict_table_t* table)  /*!< in: table */
-{
-	bool dict_locked = true;
-	bool try_drop = false;
-	table->stats_bg_flag &= ~BG_SCRUB_IN_PROGRESS;
-	dict_table_close(table, dict_locked, try_drop);
-}
-
-/****************************************************************
-Release reference to table
-*/
-static
-void
-btr_scrub_table_close_for_thread(
-	btr_scrub_t *scrub_data)
-{
-	if (scrub_data->current_table == NULL) {
-		return;
-	}
-
-	if (fil_space_t* space = fil_space_acquire(scrub_data->space)) {
-		/* If tablespace is not marked as stopping perform
-		the actual close. */
-		if (!space->is_stopping()) {
-			mutex_enter(&dict_sys.mutex);
-			/* perform the actual closing */
-			btr_scrub_table_close(scrub_data->current_table);
-			mutex_exit(&dict_sys.mutex);
-		}
-		space->release();
-	}
-
-	scrub_data->current_table = NULL;
-	scrub_data->current_index = NULL;
-}
-
-/**************************************************************//**
-Check if scrubbing is turned ON or OFF */
-static
-bool
-check_scrub_setting(
-/*=====================*/
-	btr_scrub_t*	scrub_data) /*!< in: scrub data  */
-{
-	if (scrub_data->compressed)
-		return srv_background_scrub_data_compressed;
-	else
-		return srv_background_scrub_data_uncompressed;
-}
-
-#define IBUF_INDEX_ID (DICT_IBUF_ID_MIN + IBUF_SPACE_ID)
-
-/**************************************************************//**
-Check if a page needs scrubbing */
-UNIV_INTERN
-int
-btr_page_needs_scrubbing(
-/*=====================*/
-	btr_scrub_t*	scrub_data, /*!< in: scrub data  */
-	buf_block_t*	block,	    /*!< in: block to check, latched */
-	btr_scrub_page_allocation_status_t allocated)  /*!< in: is block known
-						       to be allocated */
-{
-	/**
-	* Check if scrubbing has been turned OFF.
-	*
-	* at start of space, we check if scrubbing is ON or OFF
-	* here we only check if scrubbing is turned OFF.
-	*
-	* Motivation is that it's only valueable to have a full table (space)
-	* scrubbed.
-	*/
-	if (!check_scrub_setting(scrub_data)) {
-		bool before_value = scrub_data->scrubbing;
-		scrub_data->scrubbing = false;
-
-		if (before_value == true) {
-			/* we toggle scrubbing from on to off */
-			return BTR_SCRUB_TURNED_OFF;
-		}
-	}
-
-	if (scrub_data->scrubbing == false) {
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	const page_t*	page = buf_block_get_frame(block);
-
-	if (allocated == BTR_SCRUB_PAGE_ALLOCATED) {
-		if (fil_page_get_type(page) != FIL_PAGE_INDEX) {
-			/* this function is called from fil-crypt-threads.
-			* these threads iterate all pages of all tablespaces
-			* and don't know about fil_page_type.
-			* But scrubbing is only needed for index-pages. */
-
-			/**
-			* NOTE: scrubbing is also needed for UNDO pages,
-			* but they are scrubbed at purge-time, since they are
-			* uncompressed
-			*/
-
-			/* if encountering page type not needing scrubbing
-			release reference to table object */
-			return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-		}
-
-		if (!page_has_garbage(page)) {
-			/* no garbage (from deleted/shrunken records) */
-			return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-		}
-
-	} else if (allocated == BTR_SCRUB_PAGE_FREE ||
-		   allocated == BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN) {
-
-		switch (fil_page_get_type(page)) {
-		case FIL_PAGE_INDEX:
-		case FIL_PAGE_TYPE_ZBLOB:
-		case FIL_PAGE_TYPE_ZBLOB2:
-			break;
-		default:
-			/**
-			* If this is a dropped page, we also need to scrub
-			* BLOB pages
-			*/
-
-			/* if encountering page type not needing scrubbing
-			release reference to table object */
-			return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-		}
-	}
-
-	if (block->page.id.space() == TRX_SYS_SPACE
-	    && btr_page_get_index_id(page) == IBUF_INDEX_ID) {
-		/* skip ibuf */
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	return BTR_SCRUB_PAGE;
-}
-
-/****************************************************************
-Handle a skipped page
-*/
-UNIV_INTERN
-void
-btr_scrub_skip_page(
-/*==================*/
-	btr_scrub_t* scrub_data, /*!< in: data with scrub state */
-	int needs_scrubbing)     /*!< in: return code from
-				 btr_page_needs_scrubbing */
-{
-	switch(needs_scrubbing) {
-	case BTR_SCRUB_SKIP_PAGE:
-		/* nothing todo */
-		return;
-	case BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE:
-		btr_scrub_table_close_for_thread(scrub_data);
-		return;
-	case BTR_SCRUB_TURNED_OFF:
-	case BTR_SCRUB_SKIP_PAGE_AND_COMPLETE_SPACE:
-		btr_scrub_complete_space(scrub_data);
-		return;
-	}
-
-	/* unknown value. should not happen */
-	ut_a(0);
-}
-
-/****************************************************************
-Try to scrub a page using btr_page_reorganize_low
-return DB_SUCCESS on success or DB_OVERFLOW on failure */
-static
-dberr_t
-btr_optimistic_scrub(
-/*==================*/
-	btr_scrub_t* scrub_data, /*!< in: data with scrub state */
-	buf_block_t* block,      /*!< in: block to scrub */
-	dict_index_t* index,     /*!< in: index */
-	mtr_t* mtr)              /*!< in: mtr */
-{
-#ifdef UNIV_DEBUG
-	if (srv_scrub_force_testing &&
-	    page_get_n_recs(buf_block_get_frame(block)) > 2 &&
-	    (rand() % 100) < test_pessimistic_scrub_pct) {
-
-		log_scrub_failure(index, scrub_data, block, DB_OVERFLOW);
-		return DB_OVERFLOW;
-	}
-#endif
-
-	page_cur_t cur;
-	page_cur_set_before_first(block, &cur);
-	bool recovery = false;
-	if (!btr_page_reorganize_low(recovery, scrub_compression_level,
-				     &cur, index, mtr)) {
-		return DB_OVERFLOW;
-	}
-
-	/* We play safe and reset the free bits */
-	if (!dict_index_is_clust(index) &&
-	    block != NULL) {
-		buf_frame_t* frame = buf_block_get_frame(block);
-		if (frame &&
-		    page_is_leaf(frame)) {
-
-			ibuf_reset_free_bits(block);
-		}
-	}
-
-	scrub_data->scrub_stat.page_reorganizations++;
-
-	return DB_SUCCESS;
-}
-
-/****************************************************************
-Try to scrub a page by splitting it
-return DB_SUCCESS on success
-DB_UNDERFLOW if page has too few records
-DB_OUT_OF_FILE_SPACE if we can't find space for split */
-static
-dberr_t
-btr_pessimistic_scrub(
-/*==================*/
-	btr_scrub_t* scrub_data, /*!< in: data with scrub state */
-	buf_block_t* block,      /*!< in: block to scrub */
-	dict_index_t* index,     /*!< in: index */
-	mtr_t* mtr)              /*!< in: mtr */
-{
-	page_t*	page = buf_block_get_frame(block);
-
-	if (page_get_n_recs(page) < 2) {
-		/**
-		* There is no way we can split a page with < 2 records
-		*/
-		log_scrub_failure(index, scrub_data, block, DB_UNDERFLOW);
-		return DB_UNDERFLOW;
-	}
-
-	/**
-	* Splitting page needs new space, allocate it here
-	* so that splitting won't fail due to this */
-	ulint n_extents = 3;
-	ulint n_reserved = 0;
-	if (!fsp_reserve_free_extents(&n_reserved, index->table->space,
-				      n_extents, FSP_NORMAL, mtr)) {
-		log_scrub_failure(index, scrub_data, block,
-				  DB_OUT_OF_FILE_SPACE);
-		return DB_OUT_OF_FILE_SPACE;
-	}
-
-	/* read block variables */
-	const uint32_t page_no =  mach_read_from_4(page + FIL_PAGE_OFFSET);
-	const uint32_t left_page_no = btr_page_get_prev(page);
-	const uint32_t right_page_no = btr_page_get_next(page);
-	const ulint zip_size = index->table->space->zip_size();
-
-	/**
-	* When splitting page, we need X-latches on left/right brothers
-	* see e.g btr_cur_latch_leaves
-	*/
-
-	if (left_page_no != FIL_NULL) {
-		/**
-		* pages needs to be locked left-to-right, release block
-		* and re-lock. We still have x-lock on index
-		* so this should be safe
-		*/
-		mtr->release_block_at_savepoint(scrub_data->savepoint, block);
-
-		btr_block_get(
-			page_id_t(index->table->space_id, left_page_no),
-			zip_size, RW_X_LATCH, index, mtr);
-
-		/**
-		* Refetch block and re-initialize page
-		*/
-		block = btr_block_get(
-			page_id_t(index->table->space_id, page_no),
-			zip_size, RW_X_LATCH, index, mtr);
-
-		page = buf_block_get_frame(block);
-
-		/**
-		* structure should be unchanged
-		*/
-		ut_a(left_page_no == btr_page_get_prev(page));
-		ut_a(right_page_no == btr_page_get_next(page));
-	}
-
-	if (right_page_no != FIL_NULL) {
-		btr_block_get(
-			page_id_t(index->table->space_id, right_page_no),
-			zip_size, RW_X_LATCH, index, mtr);
-	}
-
-	/* arguments to btr_page_split_and_insert */
-	mem_heap_t* heap = NULL;
-	dtuple_t* entry = NULL;
-	rec_offs* offsets = NULL;
-	ulint n_ext = 0;
-	ulint flags = BTR_MODIFY_TREE;
-
-	/**
-	* position a cursor on first record on page
-	*/
-	rec_t* rec = page_rec_get_next(page_get_infimum_rec(page));
-	btr_cur_t cursor;
-	btr_cur_position(index, rec, block, &cursor);
-
-	/**
-	* call split page with NULL as argument for entry to insert
-	*/
-	if (dict_index_get_page(index) == page_no) {
-		/* The page is the root page
-		* NOTE: ibuf_reset_free_bits is called inside
-		* btr_root_raise_and_insert */
-		rec = btr_root_raise_and_insert(
-			flags, &cursor, &offsets, &heap, entry, n_ext, mtr);
-	} else {
-		/* We play safe and reset the free bits
-		* NOTE: need to call this prior to btr_page_split_and_insert */
-		if (!dict_index_is_clust(index) &&
-		    block != NULL) {
-			buf_frame_t* frame = buf_block_get_frame(block);
-			if (frame &&
-			    page_is_leaf(frame)) {
-
-				ibuf_reset_free_bits(block);
-			}
-		}
-
-		rec = btr_page_split_and_insert(
-			flags, &cursor, &offsets, &heap, entry, n_ext, mtr);
-	}
-
-	if (heap) {
-		mem_heap_free(heap);
-	}
-
-	index->table->space->release_free_extents(n_reserved);
-	scrub_data->scrub_stat.page_splits++;
-	return DB_SUCCESS;
-}
-
-/****************************************************************
-Location index by id for a table
-return index or NULL */
-static
-dict_index_t*
-find_index(
-/*========*/
-	dict_table_t* table, /*!< in: table */
-	index_id_t index_id) /*!< in: index id */
-{
-	if (table != NULL) {
-		dict_index_t* index = dict_table_get_first_index(table);
-		while (index != NULL) {
-			if (index->id == index_id)
-				return index;
-			index = dict_table_get_next_index(index);
-		}
-	}
-
-	return NULL;
-}
-
-/****************************************************************
-Check if table should be scrubbed
-*/
-static
-bool
-btr_scrub_table_needs_scrubbing(
-/*============================*/
-	dict_table_t* table) /*!< in: table */
-{
-	if (table == NULL)
-		return false;
-
-	if (table->stats_bg_flag & BG_STAT_SHOULD_QUIT) {
-		return false;
-	}
-
-	if (table->to_be_dropped) {
-		return false;
-	}
-
-	if (!table->is_readable()) {
-		return false;
-	}
-
-	return true;
-}
-
-/****************************************************************
-Check if index should be scrubbed
-*/
-static
-bool
-btr_scrub_index_needs_scrubbing(
-/*============================*/
-	dict_index_t* index) /*!< in: index */
-{
-	if (index == NULL)
-		return false;
-
-	if (dict_index_is_ibuf(index)) {
-		return false;
-	}
-
-	if (dict_index_is_online_ddl(index)) {
-		return false;
-	}
-
-	return true;
-}
-
-/****************************************************************
-Get table and index and store it on scrub_data
-*/
-static
-void
-btr_scrub_get_table_and_index(
-/*=========================*/
-	btr_scrub_t* scrub_data, /*!< in/out: scrub data */
-	index_id_t index_id)     /*!< in: index id */
-{
-	/* first check if it's an index to current table */
-	scrub_data->current_index = find_index(scrub_data->current_table,
-					       index_id);
-
-	if (scrub_data->current_index != NULL) {
-		/* yes it was */
-		return;
-	}
-
-	if (!btr_scrub_lock_dict(scrub_data->space, false)) {
-		btr_scrub_complete_space(scrub_data);
-		return;
-	}
-
-	/* close current table (if any) */
-	if (scrub_data->current_table != NULL) {
-		btr_scrub_table_close(scrub_data->current_table);
-		scrub_data->current_table = NULL;
-	}
-
-	/* open table based on index_id */
-	dict_table_t* table = dict_table_open_on_index_id(index_id);
-
-	if (table != NULL) {
-		/* mark table as being scrubbed */
-		table->stats_bg_flag |= BG_SCRUB_IN_PROGRESS;
-
-		if (!btr_scrub_table_needs_scrubbing(table)) {
-			btr_scrub_table_close(table);
-			btr_scrub_unlock_dict();
-			return;
-		}
-	}
-
-	btr_scrub_unlock_dict();
-	scrub_data->current_table = table;
-	scrub_data->current_index = find_index(table, index_id);
-}
-
-/****************************************************************
-Handle free page */
-UNIV_INTERN
-int
-btr_scrub_free_page(
-/*====================*/
-	btr_scrub_t* scrub_data,  /*!< in/out: scrub data */
-	buf_block_t* block,       /*!< in: block to scrub */
-	mtr_t* mtr)               /*!< in: mtr */
-{
-	// TODO(jonaso): scrub only what is actually needed
-
-	{
-		/* note: perform both the memset and setting of FIL_PAGE_TYPE
-		* wo/ logging. so that if we crash before page is flushed
-		* it will be found by scrubbing thread again
-		*/
-		memset(buf_block_get_frame(block) + PAGE_HEADER, 0,
-		       srv_page_size - PAGE_HEADER);
-
-		mach_write_to_2(buf_block_get_frame(block) + FIL_PAGE_TYPE,
-				FIL_PAGE_TYPE_ALLOCATED);
-	}
-
-	page_create(block, mtr,
-		    dict_table_is_comp(scrub_data->current_table),
-		    dict_index_is_spatial(scrub_data->current_index));
-
-	mtr_commit(mtr);
-
-	/* page doesn't need further processing => SKIP
-	* and close table/index so that we don't keep references too long */
-	return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-}
-
-/****************************************************************
-Recheck if a page needs scrubbing, and if it does load appropriate
-table and index */
-UNIV_INTERN
-int
-btr_scrub_recheck_page(
-/*====================*/
-	btr_scrub_t* scrub_data,  /*!< inut: scrub data */
-	buf_block_t* block,       /*!< in: block */
-	btr_scrub_page_allocation_status_t allocated, /*!< in: is block
-						      allocated or free */
-	mtr_t* mtr)               /*!< in: mtr */
-{
-	/* recheck if page needs scrubbing (knowing allocation status) */
-	int needs_scrubbing = btr_page_needs_scrubbing(
-		scrub_data, block, allocated);
-
-	if (needs_scrubbing != BTR_SCRUB_PAGE) {
-		mtr_commit(mtr);
-		return needs_scrubbing;
-	}
-
-	if (allocated == BTR_SCRUB_PAGE_FREE) {
-		/** we don't need to load table/index for free pages
-		* so scrub directly here */
-		/* mtr is committed inside btr_scrub_page_free */
-		return btr_scrub_free_page(scrub_data,
-					   block,
-					   mtr);
-	}
-
-	page_t*	page = buf_block_get_frame(block);
-	index_id_t index_id = btr_page_get_index_id(page);
-
-	if (scrub_data->current_index == NULL ||
-	    scrub_data->current_index->id != index_id) {
-
-		/**
-		* commit mtr (i.e release locks on block)
-		* and try to get table&index potentially loading it
-		* from disk
-		*/
-		mtr_commit(mtr);
-		btr_scrub_get_table_and_index(scrub_data, index_id);
-	} else {
-		/* we already have correct index
-		* commit mtr so that we can lock index before fetching page
-		*/
-		mtr_commit(mtr);
-	}
-
-	/* check if table is about to be dropped */
-	if (!btr_scrub_table_needs_scrubbing(scrub_data->current_table)) {
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	/* check if index is scrubbable */
-	if (!btr_scrub_index_needs_scrubbing(scrub_data->current_index)) {
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	mtr_start(mtr);
-	mtr_x_lock_index(scrub_data->current_index, mtr);
-	/** set savepoint for X-latch of block */
-	scrub_data->savepoint = mtr_set_savepoint(mtr);
-	return BTR_SCRUB_PAGE;
-}
-
-/****************************************************************
-Perform actual scrubbing of page */
-UNIV_INTERN
-int
-btr_scrub_page(
-/*============*/
-	btr_scrub_t* scrub_data,  /*!< in/out: scrub data */
-	buf_block_t* block,       /*!< in: block */
-	btr_scrub_page_allocation_status_t allocated, /*!< in: is block
-						      allocated or free */
-	mtr_t* mtr)               /*!< in: mtr */
-{
-	/* recheck if page needs scrubbing (knowing allocation status) */
-	int needs_scrubbing = BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-
-	if (block) {
-		btr_page_needs_scrubbing(scrub_data, block, allocated);
-	}
-
-	if (!block || needs_scrubbing != BTR_SCRUB_PAGE) {
-		mtr_commit(mtr);
-		return needs_scrubbing;
-	}
-
-	if (allocated == BTR_SCRUB_PAGE_FREE) {
-		/* mtr is committed inside btr_scrub_page_free */
-		return btr_scrub_free_page(scrub_data,
-					   block,
-					   mtr);
-	}
-
-	/* check that table/index still match now that they are loaded */
-
-	if (!scrub_data->current_table->space
-	    || scrub_data->current_table->space_id != scrub_data->space) {
-		/* this is truncate table */
-		mtr_commit(mtr);
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	if (scrub_data->current_index->table != scrub_data->current_table) {
-		/* this is truncate table */
-		mtr_commit(mtr);
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	if (scrub_data->current_index->page == FIL_NULL) {
-		/* this is truncate table */
-		mtr_commit(mtr);
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	buf_frame_t* frame = buf_block_get_frame(block);
-
-	if (!frame || btr_page_get_index_id(frame) !=
-	    scrub_data->current_index->id) {
-		/* page has been reallocated to new index */
-		mtr_commit(mtr);
-		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
-	}
-
-	/* check if I can scrub (reorganize) page wo/ overflow */
-	if (btr_optimistic_scrub(scrub_data,
-				 block,
-				 scrub_data->current_index,
-				 mtr) != DB_SUCCESS) {
-
-		/**
-		* Can't reorganize page...need to split it
-		*/
-		btr_pessimistic_scrub(scrub_data,
-				      block,
-				      scrub_data->current_index,
-				      mtr);
-	}
-	mtr_commit(mtr);
-
-	return BTR_SCRUB_SKIP_PAGE; // no further action needed
-}
-
-/**************************************************************//**
-Start iterating a space */
-bool btr_scrub_start_space(const fil_space_t &space, btr_scrub_t *scrub_data)
-{
-	scrub_data->space = space.id;
-	scrub_data->current_table = NULL;
-	scrub_data->current_index = NULL;
-	scrub_data->compressed = FSP_FLAGS_GET_ZIP_SSIZE(space.flags) != 0;
-	scrub_data->scrubbing = check_scrub_setting(scrub_data);
-	return scrub_data->scrubbing;
-}
-
-/***********************************************************************
-Update global statistics with thread statistics */
-static
-void
-btr_scrub_update_total_stat(btr_scrub_t *scrub_data)
-{
-	mutex_enter(&scrub_stat_mutex);
-	scrub_stat.page_reorganizations +=
-		scrub_data->scrub_stat.page_reorganizations;
-	scrub_stat.page_splits +=
-		scrub_data->scrub_stat.page_splits;
-	scrub_stat.page_split_failures_underflow +=
-		scrub_data->scrub_stat.page_split_failures_underflow;
-	scrub_stat.page_split_failures_out_of_filespace +=
-		scrub_data->scrub_stat.page_split_failures_out_of_filespace;
-	scrub_stat.page_split_failures_missing_index +=
-		scrub_data->scrub_stat.page_split_failures_missing_index;
-	scrub_stat.page_split_failures_unknown +=
-		scrub_data->scrub_stat.page_split_failures_unknown;
-	mutex_exit(&scrub_stat_mutex);
-
-	// clear stat
-	memset(&scrub_data->scrub_stat, 0, sizeof(scrub_data->scrub_stat));
-}
-
-/** Complete iterating a space.
-@param[in,out]	scrub_data	 scrub data */
-UNIV_INTERN
-void
-btr_scrub_complete_space(btr_scrub_t* scrub_data)
-{
-	ut_ad(scrub_data->scrubbing);
-	btr_scrub_table_close_for_thread(scrub_data);
-	btr_scrub_update_total_stat(scrub_data);
-}
-
-/*********************************************************************
-Return scrub statistics */
-void
-btr_scrub_total_stat(btr_scrub_stat_t *stat)
-{
-	mutex_enter(&scrub_stat_mutex);
-	*stat = scrub_stat;
-	mutex_exit(&scrub_stat_mutex);
-}
-
-/*********************************************************************
-Init global variables */
-UNIV_INTERN
-void
-btr_scrub_init()
-{
-	mutex_create(LATCH_ID_SCRUB_STAT_MUTEX, &scrub_stat_mutex);
-
-	memset(&scrub_stat, 0, sizeof(scrub_stat));
-}
-
-/*********************************************************************
-Cleanup globals */
-UNIV_INTERN
-void
-btr_scrub_cleanup()
-{
-	mutex_free(&scrub_stat_mutex);
-}
-
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index 9b9adf541e7..2f75f567933 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -39,9 +39,7 @@ Created 2/17/1996 Heikki Tuuri
 #include "btr0cur.h"
 #include "btr0pcur.h"
 #include "btr0btr.h"
-#include "ha0ha.h"
 #include "srv0mon.h"
-#include "sync0sync.h"
 
 /** Is search system enabled.
 Search system is protected by array of latches. */
@@ -57,25 +55,8 @@ ulint		btr_search_n_succ	= 0;
 ulint		btr_search_n_hash_fail	= 0;
 #endif /* UNIV_SEARCH_PERF_STAT */
 
-/** padding to prevent other memory update
-hotspots from residing on the same memory
-cache line as btr_search_latches */
-UNIV_INTERN byte		btr_sea_pad1[CACHE_LINE_SIZE];
-
-/** The latches protecting the adaptive search system: this latches protects the
-(1) positions of records on those pages where a hash index has been built.
-NOTE: It does not protect values of non-ordering fields within a record from
-being updated in-place! We can use fact (1) to perform unique searches to
-indexes. We will allocate the latches from dynamic memory to get it to the
-same DRAM page as other hotspot semaphores */
-rw_lock_t**	btr_search_latches;
-
-/** padding to prevent other memory update hotspots from residing on
-the same memory cache line */
-UNIV_INTERN byte		btr_sea_pad2[CACHE_LINE_SIZE];
-
 /** The adaptive hash index */
-btr_search_sys_t*	btr_search_sys;
+btr_search_sys_t btr_search_sys;
 
 /** If the number of records on the page divided by this parameter
 would have been successfully accessed using a hash index, the index
@@ -187,104 +168,23 @@ probable that, when have reserved the btr search system latch and we need to
 allocate a new node to the hash table, it will succeed. However, the check
 will not guarantee success.
 @param[in]	index	index handler */
-static
-void
-btr_search_check_free_space_in_heap(const dict_index_t* index)
+static void btr_search_check_free_space_in_heap(const dict_index_t *index)
 {
-	/* Note that we peek the value of heap->free_block without reserving
-	the latch: this is ok, because we will not guarantee that there will
-	be enough free space in the hash table. */
+  /* Note that we peek the value of heap->free_block without reserving
+  the latch: this is ok, because we will not guarantee that there will
+  be enough free space in the hash table. */
 
-	buf_block_t*	block = buf_block_alloc(NULL);
-	rw_lock_t*	latch = btr_get_search_latch(index);
-	hash_table_t*	table;
-	mem_heap_t*	heap;
+  buf_block_t *block= buf_block_alloc();
+  auto part= btr_search_sys.get_part(*index);
 
-	rw_lock_x_lock(latch);
-
-	if (!btr_search_enabled) {
-		goto func_exit;
-	}
+  rw_lock_x_lock(&part->latch);
 
-	table = btr_get_search_table(index);
-	heap = table->heap;
+  if (!btr_search_enabled || part->heap->free_block)
+    buf_block_free(block);
+  else
+    part->heap->free_block= block;
 
-	if (heap->free_block == NULL) {
-		heap->free_block = block;
-	} else {
-func_exit:
-		buf_block_free(block);
-	}
-
-	rw_lock_x_unlock(latch);
-}
-
-/** Creates and initializes the adaptive search system at a database start.
-@param[in]	hash_size	hash table size. */
-void btr_search_sys_create(ulint hash_size)
-{
-	/* Search System is divided into n parts.
-	Each part controls access to distinct set of hash buckets from
-	hash table through its own latch. */
-
-	/* Step-1: Allocate latches (1 per part). */
-	btr_search_latches = reinterpret_cast<rw_lock_t**>(
-		ut_malloc(sizeof(rw_lock_t*) * btr_ahi_parts, mem_key_ahi));
-
-	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-
-		btr_search_latches[i] = reinterpret_cast<rw_lock_t*>(
-			ut_malloc(sizeof(rw_lock_t), mem_key_ahi));
-
-		rw_lock_create(btr_search_latch_key,
-			       btr_search_latches[i], SYNC_SEARCH_SYS);
-	}
-
-	/* Step-2: Allocate hash tablees. */
-	btr_search_sys = reinterpret_cast<btr_search_sys_t*>(
-		ut_malloc(sizeof(btr_search_sys_t), mem_key_ahi));
-
-	btr_search_sys->hash_tables = NULL;
-
-	if (btr_search_enabled) {
-		btr_search_enable();
-	}
-}
-
-/** Frees the adaptive search system at a database shutdown. */
-void btr_search_sys_free()
-{
-  if (!btr_search_sys)
-  {
-    ut_ad(!btr_search_latches);
-    return;
-  }
-
-  ut_ad(btr_search_sys);
-  ut_ad(btr_search_latches);
-
-  if (btr_search_sys->hash_tables)
-  {
-    for (ulint i= 0; i < btr_ahi_parts; ++i)
-    {
-      mem_heap_free(btr_search_sys->hash_tables[i]->heap);
-      hash_table_free(btr_search_sys->hash_tables[i]);
-    }
-    ut_free(btr_search_sys->hash_tables);
-  }
-
-  ut_free(btr_search_sys);
-  btr_search_sys= NULL;
-
-  /* Free all latches. */
-  for (ulint i= 0; i < btr_ahi_parts; ++i)
-  {
-    rw_lock_free(btr_search_latches[i]);
-    ut_free(btr_search_latches[i]);
-  }
-
-  ut_free(btr_search_latches);
-  btr_search_latches= NULL;
+  rw_lock_x_unlock(&part->latch);
 }
 
 /** Set index->ref_count = 0 on all indexes of a table.
@@ -321,63 +221,6 @@ ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index)
   table->autoinc_mutex.unlock();
 }
 
-/** Clear the adaptive hash index on all pages in the buffer pool. */
-static void buf_pool_clear_hash_index()
-{
-  ut_ad(btr_search_own_all(RW_LOCK_X));
-  ut_ad(!btr_search_enabled);
-
-  std::set<dict_index_t*> garbage;
-
-  for (ulong p = 0; p < srv_buf_pool_instances; p++)
-  {
-    buf_pool_t *buf_pool= buf_pool_from_array(p);
-    buf_chunk_t *chunks= buf_pool->chunks;
-    buf_chunk_t *chunk= chunks + buf_pool->n_chunks;
-
-    while (--chunk >= chunks)
-    {
-      buf_block_t *block= chunk->blocks;
-      for (ulint i= chunk->size; i--; block++)
-      {
-        dict_index_t *index= block->index;
-        assert_block_ahi_valid(block);
-
-        /* We can clear block->index and block->n_pointers when
-        btr_search_own_all(RW_LOCK_X); see the comments in buf0buf.h */
-
-        if (!index)
-        {
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-          ut_a(!block->n_pointers);
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-          continue;
-        }
-
-        ut_d(buf_page_state state= buf_block_get_state(block));
-        /* Another thread may have set the state to
-        BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed().
-
-        The state change in buf_page_realloc() is not observable here,
-        because in that case we would have !block->index.
-
-        In the end, the entire adaptive hash index will be removed. */
-        ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-        block->n_pointers= 0;
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-        if (index->freed())
-          garbage.insert(index);
-        block->index= NULL;
-      }
-    }
-  }
-
-  for (std::set<dict_index_t*>::iterator i= garbage.begin();
-       i != garbage.end(); i++)
-    btr_search_lazy_free(*i);
-}
-
 /** Disable the adaptive hash search system and empty the index. */
 void btr_search_disable()
 {
@@ -412,52 +255,37 @@ void btr_search_disable()
 	mutex_exit(&dict_sys.mutex);
 
 	/* Set all block->index = NULL. */
-	buf_pool_clear_hash_index();
+	buf_pool.clear_hash_index();
 
 	/* Clear the adaptive hash index. */
-	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		mem_heap_free(btr_search_sys->hash_tables[i]->heap);
-		hash_table_free(btr_search_sys->hash_tables[i]);
-	}
-	ut_free(btr_search_sys->hash_tables);
-	btr_search_sys->hash_tables = NULL;
+	btr_search_sys.clear();
 
 	btr_search_x_unlock_all();
 }
 
 /** Enable the adaptive hash search system.
-@param resize whether buf_pool_resize() is the caller */
+@param resize whether buf_pool_t::resize() is the caller */
 void btr_search_enable(bool resize)
 {
 	if (!resize) {
-		buf_pool_mutex_enter_all();
-		if (srv_buf_pool_old_size != srv_buf_pool_size) {
-			buf_pool_mutex_exit_all();
+		mysql_mutex_lock(&buf_pool.mutex);
+		bool changed = srv_buf_pool_old_size != srv_buf_pool_size;
+		mysql_mutex_unlock(&buf_pool.mutex);
+		if (changed) {
 			return;
 		}
-		buf_pool_mutex_exit_all();
 	}
 
-	ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64;
 	btr_search_x_lock_all();
+	ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64;
 
-	if (btr_search_sys->hash_tables) {
+	if (btr_search_sys.parts[0].heap) {
 		ut_ad(btr_search_enabled);
 		btr_search_x_unlock_all();
 		return;
 	}
 
-	btr_search_sys->hash_tables = reinterpret_cast<hash_table_t**>(
-		ut_malloc(sizeof(hash_table_t*) * btr_ahi_parts, mem_key_ahi));
-	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		btr_search_sys->hash_tables[i] =
-			ib_create((hash_size / btr_ahi_parts),
-				  LATCH_ID_HASH_TABLE_MUTEX,
-				  0, MEM_HEAP_FOR_BTR_SEARCH);
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-                btr_search_sys->hash_tables[i]->adaptive = TRUE;
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	}
+	btr_search_sys.alloc(hash_size);
 
 	btr_search_enabled = true;
 	btr_search_x_unlock_all();
@@ -475,7 +303,6 @@ btr_search_info_update_hash(
 	btr_cur_t*	cursor)
 {
 	dict_index_t*	index = cursor->index;
-	ulint		n_unique;
 	int		cmp;
 
 	ut_ad(!btr_search_own_any(RW_LOCK_S));
@@ -488,7 +315,7 @@ btr_search_info_update_hash(
 		return;
 	}
 
-	n_unique = dict_index_get_n_unique_in_tree(index);
+	uint16_t n_unique = dict_index_get_n_unique_in_tree(index);
 
 	if (info->n_hash_potential == 0) {
 
@@ -530,16 +357,13 @@ set_new_recomm:
 
 	cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
 			  cursor->low_match, cursor->low_bytes);
-	if (cmp == 0) {
-		info->n_hash_potential = 0;
+	info->left_side = cmp >= 0;
+	info->n_hash_potential = cmp != 0;
 
+	if (cmp == 0) {
 		/* For extra safety, we set some sensible values here */
-
 		info->n_fields = 1;
 		info->n_bytes = 0;
-
-		info->left_side = TRUE;
-
 	} else if (cmp > 0) {
 		info->n_hash_potential = 1;
 
@@ -550,31 +374,31 @@ set_new_recomm:
 
 		} else if (cursor->low_match < cursor->up_match) {
 
-			info->n_fields = cursor->low_match + 1;
+			info->n_fields = static_cast<uint16_t>(
+				cursor->low_match + 1);
 			info->n_bytes = 0;
 		} else {
-			info->n_fields = cursor->low_match;
-			info->n_bytes = cursor->low_bytes + 1;
+			info->n_fields = static_cast<uint16_t>(
+				cursor->low_match);
+			info->n_bytes = static_cast<uint16_t>(
+				cursor->low_bytes + 1);
 		}
-
-		info->left_side = TRUE;
 	} else {
-		info->n_hash_potential = 1;
-
 		if (cursor->low_match >= n_unique) {
 
 			info->n_fields = n_unique;
 			info->n_bytes = 0;
 		} else if (cursor->low_match > cursor->up_match) {
 
-			info->n_fields = cursor->up_match + 1;
+			info->n_fields = static_cast<uint16_t>(
+				cursor->up_match + 1);
 			info->n_bytes = 0;
 		} else {
-			info->n_fields = cursor->up_match;
-			info->n_bytes = cursor->up_bytes + 1;
+			info->n_fields = static_cast<uint16_t>(
+				cursor->up_match);
+			info->n_bytes = static_cast<uint16_t>(
+				cursor->up_bytes + 1);
 		}
-
-		info->left_side = FALSE;
 	}
 }
 
@@ -593,8 +417,11 @@ btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block)
 				  RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
 
 	info->last_hash_succ = FALSE;
-
-	ut_a(buf_block_state_valid(block));
+	ut_d(auto state= block->page.state());
+	ut_ad(state == BUF_BLOCK_NOT_USED
+	      || state == BUF_BLOCK_FILE_PAGE
+	      || state == BUF_BLOCK_MEMORY
+	      || state == BUF_BLOCK_REMOVE_HASH);
 	ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
 
 	if ((block->n_hash_helps > 0)
@@ -642,6 +469,220 @@ btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block)
 	return(false);
 }
 
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Maximum number of records in a page */
+constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+__attribute__((nonnull))
+/**
+Insert an entry into the hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@param table hash table
+@param heap  memory heap
+@param fold  folded value of the record
+@param block buffer block containing the record
+@param data  the record
+@retval true on success
+@retval false if no more memory could be allocated */
+static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
+                               ulint fold,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+                               buf_block_t *block, /*!< buffer block of data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+                               const rec_t *data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(block->frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+  ut_ad(btr_search_enabled);
+
+  hash_cell_t *cell= &table->array[table->calc_hash(fold)];
+
+  for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev;
+       prev= prev->next)
+  {
+    if (prev->fold == fold)
+    {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+      buf_block_t *prev_block= prev->block;
+      ut_a(prev_block->frame == page_align(prev->data));
+      ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
+      ut_a(block->n_pointers++ < MAX_N_POINTERS);
+
+      prev->block= block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+      prev->data= data;
+      return true;
+    }
+  }
+
+  /* We have to allocate a new chain node */
+  ha_node_t *node= static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node));
+
+  if (!node)
+    return false;
+
+  ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(block->n_pointers++ < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  node->fold= fold;
+  node->next= nullptr;
+
+  ha_node_t *prev= static_cast<ha_node_t*>(cell->node);
+  if (!prev)
+    cell->node= node;
+  else
+  {
+    while (prev->next)
+      prev= prev->next;
+    prev->next= node;
+  }
+  return true;
+}
+
+__attribute__((nonnull))
+/** Delete a hash node and compact the heap of nodes.
+@param table     hash table
+@param heap      memory heap
+@param del_node  hash node to be deleted */
+static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap,
+                                ha_node_t *del_node)
+{
+  ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(del_node->block->frame == page_align(del_node->data));
+  ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  const ulint fold= del_node->fold;
+
+  HASH_DELETE(ha_node_t, next, table, fold, del_node);
+
+  ha_node_t *top= static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top));
+
+  if (del_node != top)
+  {
+    /* Compact the heap of nodes by moving the top in the place of del_node. */
+    *del_node= *top;
+    hash_cell_t *cell= &table->array[table->calc_hash(top->fold)];
+
+    /* Look for the pointer to the top node, to update it */
+    if (cell->node == top)
+      /* The top node is the first in the chain */
+      cell->node= del_node;
+    else
+    {
+      /* We have to look for the predecessor */
+      ha_node_t *node= static_cast<ha_node_t*>(cell->node);
+
+      while (top != HASH_GET_NEXT(next, node))
+        node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node));
+
+      /* Now we have the predecessor node */
+      node->next= del_node;
+    }
+  }
+
+  /* Free the occupied space */
+  mem_heap_free_top(heap, sizeof *top);
+}
+
+__attribute__((nonnull))
+/** Delete all pointers to a page from one hash chain.
+@param table     hash table
+@param heap      memory heap
+@param fold,page fold value of the chain; page whose nodes are deleted */
+static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap,
+                                        ulint fold, const page_t *page)
+{
+  for (ha_node_t *node= ha_chain_get_first(table, fold); node; )
+  {
+    if (page_align(ha_node_get_data(node)) == page)
+    {
+      ha_delete_hash_node(table, heap, node);
+      /* The deletion may compact the heap of nodes and move other nodes! */
+      node= ha_chain_get_first(table, fold);
+    }
+    else
+      node= ha_chain_get_next(node);
+  }
+#ifdef UNIV_DEBUG
+  /* Check that all nodes really got deleted */
+  for (ha_node_t *node= ha_chain_get_first(table, fold); node;
+       node= ha_chain_get_next(node))
+    ut_ad(page_align(ha_node_get_data(node)) != page);
+#endif /* UNIV_DEBUG */
+}
+
+/** Delete a record if found.
+@param table     hash table
+@param heap      memory heap for the hash bucket chain
+@param fold      folded value of the searched data
+@param data      pointer to the record
+@return whether the record was found */
+static bool ha_search_and_delete_if_found(hash_table_t *table,
+                                          mem_heap_t *heap,
+                                          ulint fold, const rec_t *data)
+{
+  if (ha_node_t *node= ha_search_with_data(table, fold, data))
+  {
+    ha_delete_hash_node(table, heap, node);
+    return true;
+  }
+
+  return false;
+}
+
+__attribute__((nonnull))
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table     hash table
+@param fold      folded value of the searched data
+@param data      pointer to the data
+@param new_data  new pointer to the data
+@return whether the element was found */
+static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold,
+                                          const rec_t *data,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+                                          /** block containing new_data */
+                                          buf_block_t *new_block,
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+                                          const rec_t *new_data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(new_block->frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  if (!btr_search_enabled)
+    return false;
+
+  if (ha_node_t *node= ha_search_with_data(table, fold, data))
+  {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+    ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
+    ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
+    node->block= new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+    node->data= new_data;
+
+    return true;
+  }
+
+  return false;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+#else
+# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d)
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+	ha_search_and_update_if_found(table,fold,data,new_data)
+#endif
+
 /** Updates a hash node reference when it has been unsuccessfully used in a
 search which could have succeeded with the used hash parameters. This can
 happen because when building a hash index for a page, we do not check
@@ -673,17 +714,17 @@ btr_search_update_hash_ref(
 		return;
 	}
 
-	if (cursor->index != index) {
-		ut_ad(cursor->index->id == index->id);
+	if (index != cursor->index) {
+		ut_ad(index->id == cursor->index->id);
 		btr_search_drop_page_hash_index(block);
 		return;
 	}
 
-	ut_ad(block->page.id.space() == index->table->space_id);
+	ut_ad(block->page.id().space() == index->table->space_id);
 	ut_ad(index == cursor->index);
 	ut_ad(!dict_index_is_ibuf(index));
-	rw_lock_t* const latch = btr_get_search_latch(index);
-	rw_lock_x_lock(latch);
+	auto part = btr_search_sys.get_part(*index);
+	rw_lock_x_lock(&part->latch);
 	ut_ad(!block->index || block->index == index);
 
 	if (block->index
@@ -712,14 +753,13 @@ btr_search_update_hash_ref(
 			mem_heap_free(heap);
 		}
 
-		ha_insert_for_fold(btr_get_search_table(index), fold,
-				   block, rec);
+		ha_insert_for_fold(&part->table, part->heap, fold, block, rec);
 
 		MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 	}
 
 func_exit:
-	rw_lock_x_unlock(latch);
+	rw_lock_x_unlock(&part->latch);
 }
 
 /** Checks if a guessed position for a tree cursor is right. Note that if
@@ -874,6 +914,88 @@ btr_search_failure(btr_search_t* info, btr_cur_t* cursor)
 	info->last_hash_succ = FALSE;
 }
 
+/** Clear the adaptive hash index on all pages in the buffer pool. */
+inline void buf_pool_t::clear_hash_index()
+{
+  ut_ad(btr_search_own_all(RW_LOCK_X));
+  ut_ad(!resizing);
+  ut_ad(!btr_search_enabled);
+
+  std::set<dict_index_t*> garbage;
+
+  for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; )
+  {
+    for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size;
+         block != end; block++)
+    {
+      dict_index_t *index= block->index;
+      assert_block_ahi_valid(block);
+
+      /* We can clear block->index and block->n_pointers when
+      btr_search_own_all(RW_LOCK_X); see the comments in buf0buf.h */
+
+      if (!index)
+      {
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+        ut_a(!block->n_pointers);
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+        continue;
+      }
+
+      ut_d(buf_page_state state= block->page.state());
+      /* Another thread may have set the state to
+      BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed().
+
+      The state change in buf_pool_t::realloc() is not observable
+      here, because in that case we would have !block->index.
+
+      In the end, the entire adaptive hash index will be removed. */
+      ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+      block->n_pointers= 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+      if (index->freed())
+        garbage.insert(index);
+      block->index= nullptr;
+    }
+  }
+
+  for (dict_index_t *index : garbage)
+    btr_search_lazy_free(index);
+}
+
+/** Get a buffer block from an adaptive hash index pointer.
+This function does not return if the block is not identified.
+@param ptr  pointer to within a page frame
+@return pointer to block, never NULL */
+inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const
+{
+  chunk_t::map *chunk_map = chunk_t::map_ref;
+  ut_ad(chunk_t::map_ref == chunk_t::map_reg);
+  ut_ad(!resizing);
+
+  chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr);
+  ut_a(it != chunk_map->begin());
+
+  chunk_t *chunk= it == chunk_map->end()
+    ? chunk_map->rbegin()->second
+    : (--it)->second;
+
+  const size_t offs= size_t(ptr - chunk->blocks->frame) >> srv_page_size_shift;
+  ut_a(offs < chunk->size);
+
+  buf_block_t *block= &chunk->blocks[offs];
+  /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
+  block[n].frame == block->frame + n * srv_page_size.  Check it. */
+  ut_ad(block->frame == page_align(ptr));
+  /* Read the state of the block without holding hash_lock.
+  A state transition from BUF_BLOCK_FILE_PAGE to
+  BUF_BLOCK_REMOVE_HASH is possible during this execution. */
+  ut_d(const buf_page_state state = block->page.state());
+  ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
+  return block;
+}
+
 /** Tries to guess the right search position based on the hash search info
 of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
 and the function returns TRUE, then cursor->up_match and cursor->low_match
@@ -905,10 +1027,8 @@ btr_search_guess_on_hash(
 {
 	ulint		fold;
 	index_id_t	index_id;
-#ifdef notdefined
-	btr_cur_t	cursor2;
-	btr_pcur_t	pcur;
-#endif
+
+	ut_ad(mtr->is_active());
 	ut_ad(!ahi_latch || rw_lock_own_flagged(
 		      ahi_latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
 
@@ -916,11 +1036,13 @@ btr_search_guess_on_hash(
 		return false;
 	}
 
-	ut_ad(index && info && tuple && cursor && mtr);
-	ut_ad(!dict_index_is_ibuf(index));
-	ut_ad(!ahi_latch || ahi_latch == btr_get_search_latch(index));
+	ut_ad(!index->is_ibuf());
+	ut_ad(!ahi_latch
+	      || ahi_latch == &btr_search_sys.get_part(*index)->latch);
 	ut_ad((latch_mode == BTR_SEARCH_LEAF)
 	      || (latch_mode == BTR_MODIFY_LEAF));
+	compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
+	compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
 
 	/* Not supported for spatial index */
 	ut_ad(!dict_index_is_spatial(index));
@@ -949,11 +1071,11 @@ btr_search_guess_on_hash(
 	cursor->fold = fold;
 	cursor->flag = BTR_CUR_HASH;
 
-	rw_lock_t* use_latch = ahi_latch ? NULL : btr_get_search_latch(index);
+	auto part = btr_search_sys.get_part(*index);
 	const rec_t* rec;
 
-	if (use_latch) {
-		rw_lock_s_lock(use_latch);
+	if (!ahi_latch) {
+		rw_lock_s_lock(&part->latch);
 
 		if (!btr_search_enabled) {
 			goto fail;
@@ -964,31 +1086,65 @@ btr_search_guess_on_hash(
 	}
 
 	rec = static_cast<const rec_t*>(
-		ha_search_and_get_data(btr_get_search_table(index), fold));
+		ha_search_and_get_data(&part->table, fold));
 
 	if (!rec) {
-		if (use_latch) {
+		if (!ahi_latch) {
 fail:
-			rw_lock_s_unlock(use_latch);
+			rw_lock_s_unlock(&part->latch);
 		}
 
 		btr_search_failure(info, cursor);
 		return false;
 	}
 
-	buf_block_t*	block = buf_block_from_ahi(rec);
+	buf_block_t* block = buf_pool.block_from_ahi(rec);
 
-	if (use_latch) {
-		if (!buf_page_get_known_nowait(
-			latch_mode, block, BUF_MAKE_YOUNG,
-			__FILE__, __LINE__, mtr)) {
+	if (!ahi_latch) {
+		page_hash_latch* hash_lock = buf_pool.hash_lock_get(
+			block->page.id());
+		hash_lock->read_lock();
+
+		if (block->page.state() == BUF_BLOCK_REMOVE_HASH) {
+			/* Another thread is just freeing the block
+			from the LRU list of the buffer pool: do not
+			try to access this page. */
+			hash_lock->read_unlock();
 			goto fail;
 		}
 
 		const bool fail = index != block->index
 			&& index_id == block->index->id;
 		ut_a(!fail || block->index->freed());
-		rw_lock_s_unlock(use_latch);
+		ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+		DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED);
+
+		buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+		hash_lock->read_unlock();
+		block->page.set_accessed();
+
+		buf_page_make_young_if_needed(&block->page);
+		mtr_memo_type_t	fix_type;
+		if (latch_mode == BTR_SEARCH_LEAF) {
+			if (!rw_lock_s_lock_nowait(&block->lock,
+						   __FILE__, __LINE__)) {
+got_no_latch:
+				buf_block_buf_fix_dec(block);
+				goto fail;
+			}
+			fix_type = MTR_MEMO_PAGE_S_FIX;
+		} else {
+			if (!rw_lock_x_lock_func_nowait_inline(
+				    &block->lock, __FILE__, __LINE__)) {
+				goto got_no_latch;
+			}
+			fix_type = MTR_MEMO_PAGE_X_FIX;
+		}
+		mtr->memo_push(block, fix_type);
+
+		buf_pool.stat.n_page_gets++;
+
+		rw_lock_s_unlock(&part->latch);
 
 		buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
 		if (UNIV_UNLIKELY(fail)) {
@@ -1000,9 +1156,9 @@ fail:
 		goto fail_and_release_page;
 	}
 
-	if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+	if (block->page.state() != BUF_BLOCK_FILE_PAGE) {
 
-		ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
+		ut_ad(block->page.state() == BUF_BLOCK_REMOVE_HASH);
 
 fail_and_release_page:
 		if (!ahi_latch) {
@@ -1074,17 +1230,12 @@ fail_and_release_page:
 #ifdef UNIV_SEARCH_PERF_STAT
 	btr_search_n_succ++;
 #endif
-	if (!ahi_latch && buf_page_peek_if_too_old(&block->page)) {
-
-		buf_page_make_young(&block->page);
-	}
-
 	/* Increment the page get statistics though we did not really
 	fix the page: for user info only */
-	{
-		buf_pool_t*	buf_pool = buf_pool_from_bpage(&block->page);
+	++buf_pool.stat.n_page_gets;
 
-		++buf_pool->stat.n_page_gets;
+	if (!ahi_latch) {
+		buf_page_make_young_if_needed(&block->page);
 	}
 
 	return true;
@@ -1094,7 +1245,7 @@ fail_and_release_page:
 @param[in,out]	block	block containing index page, s- or x-latched, or an
 			index page for which we know that
 			block->buf_fix_count == 0 or it is an index page which
-			has already been removed from the buf_pool->page_hash
+			has already been removed from the buf_pool.page_hash
 			i.e.: it is in state BUF_BLOCK_REMOVE_HASH */
 void btr_search_drop_page_hash_index(buf_block_t* block)
 {
@@ -1110,11 +1261,10 @@ void btr_search_drop_page_hash_index(buf_block_t* block)
 	ulint			i;
 	mem_heap_t*		heap;
 	rec_offs*		offsets;
-	rw_lock_t*		latch;
 
 retry:
 	/* This debug check uses a dirty read that could theoretically cause
-	false positives while buf_pool_clear_hash_index() is executing. */
+	false positives while buf_pool.clear_hash_index() is executing. */
 	assert_block_ahi_valid(block);
 	ut_ad(!btr_search_own_any(RW_LOCK_S));
 	ut_ad(!btr_search_own_any(RW_LOCK_X));
@@ -1123,8 +1273,8 @@ retry:
 		return;
 	}
 
-	ut_ad(block->page.buf_fix_count == 0
-	      || buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH
+	ut_ad(!block->page.buf_fix_count()
+	      || block->page.state() == BUF_BLOCK_REMOVE_HASH
 	      || rw_lock_own_flagged(&block->lock,
 				     RW_LOCK_FLAG_X | RW_LOCK_FLAG_S
 				     | RW_LOCK_FLAG_SX));
@@ -1136,28 +1286,27 @@ retry:
 
 	const index_id_t	index_id
 		= btr_page_get_index_id(block->frame);
-	const ulint		ahi_slot
-		= ut_fold_ulint_pair(static_cast<ulint>(index_id),
-				     static_cast<ulint>(block->page.id.space()))
-		% btr_ahi_parts;
-	latch = btr_search_latches[ahi_slot];
 
-	dict_index_t* index = block->index;
+	auto part = btr_search_sys.get_part(index_id,
+					    block->page.id().space());
 
+	dict_index_t* index = block->index;
 	bool is_freed = index && index->freed();
+
 	if (is_freed) {
-		rw_lock_x_lock(latch);
+		rw_lock_x_lock(&part->latch);
 	} else {
-		rw_lock_s_lock(latch);
+		rw_lock_s_lock(&part->latch);
 	}
 
 	assert_block_ahi_valid(block);
 
+
 	if (!index || !btr_search_enabled) {
 		if (is_freed) {
-			rw_lock_x_unlock(latch);
+			rw_lock_x_unlock(&part->latch);
 		} else {
-			rw_lock_s_unlock(latch);
+			rw_lock_s_unlock(&part->latch);
 		}
 		return;
 	}
@@ -1167,7 +1316,7 @@ retry:
 #endif
 	ut_ad(btr_search_enabled);
 
-	ut_ad(block->page.id.space() == index->table->space_id);
+	ut_ad(block->page.id().space() == index->table->space_id);
 	ut_a(index_id == index->id);
 	ut_ad(!dict_index_is_ibuf(index));
 
@@ -1178,7 +1327,7 @@ retry:
 	releasing search latch, as the index page might only be s-latched! */
 
 	if (!is_freed) {
-		rw_lock_s_unlock(latch);
+		rw_lock_s_unlock(&part->latch);
 	}
 
 	ut_a(n_fields > 0 || n_bytes > 0);
@@ -1231,7 +1380,7 @@ next_rec:
 	}
 
 	if (!is_freed) {
-		rw_lock_x_lock(latch);
+		rw_lock_x_lock(&part->latch);
 
 		if (UNIV_UNLIKELY(!block->index)) {
 			/* Someone else has meanwhile dropped the
@@ -1248,17 +1397,15 @@ next_rec:
 		/* Someone else has meanwhile built a new hash index on the
 		page, with different parameters */
 
-		rw_lock_x_unlock(latch);
+		rw_lock_x_unlock(&part->latch);
 
 		ut_free(folds);
 		goto retry;
 	}
 
 	for (i = 0; i < n_cached; i++) {
-
-		ha_remove_all_nodes_to_page(
-			btr_search_sys->hash_tables[ahi_slot],
-			folds[i], page);
+		ha_remove_all_nodes_to_page(&part->table, part->heap,
+					    folds[i], page);
 	}
 
 	switch (index->search_info->ref_count--) {
@@ -1277,7 +1424,7 @@ next_rec:
 
 cleanup:
 	assert_block_ahi_valid(block);
-	rw_lock_x_unlock(latch);
+	rw_lock_x_unlock(&part->latch);
 
 	ut_free(folds);
 }
@@ -1307,7 +1454,7 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id)
 
 		/* If AHI is still valid, page can't be in free state.
 		AHI is dropped when page is freed. */
-		ut_ad(!block->page.file_page_was_freed);
+		DBUG_ASSERT(block->page.status != buf_page_t::FREED);
 
 		buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
 
@@ -1341,9 +1488,9 @@ btr_search_build_page_hash_index(
 	dict_index_t*	index,
 	buf_block_t*	block,
 	rw_lock_t*	ahi_latch,
-	ulint		n_fields,
-	ulint		n_bytes,
-	ibool		left_side)
+	uint16_t	n_fields,
+	uint16_t	n_bytes,
+	bool		left_side)
 {
 	const rec_t*	rec;
 	const rec_t*	next_rec;
@@ -1365,15 +1512,15 @@ btr_search_build_page_hash_index(
 	}
 
 	rec_offs_init(offsets_);
-	ut_ad(ahi_latch == btr_get_search_latch(index));
+	ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch);
 	ut_ad(index);
-	ut_ad(block->page.id.space() == index->table->space_id);
+	ut_ad(block->page.id().space() == index->table->space_id);
 	ut_ad(!dict_index_is_ibuf(index));
 	ut_ad(page_is_leaf(block->frame));
 
 	ut_ad(rw_lock_own_flagged(&block->lock,
 				  RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-	ut_ad(block->page.id.page_no() >= 3);
+	ut_ad(block->page.id().page_no() >= 3);
 
 	rw_lock_s_lock(ahi_latch);
 
@@ -1436,7 +1583,7 @@ btr_search_build_page_hash_index(
 		btr_search_get_n_fields(n_fields, n_bytes),
 		&heap);
 	ut_ad(page_rec_is_supremum(rec)
-	      || n_fields + (n_bytes > 0) == rec_offs_n_fields(offsets));
+	      || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0));
 
 	fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
 
@@ -1511,15 +1658,16 @@ btr_search_build_page_hash_index(
 
 	block->n_hash_helps = 0;
 
-	block->curr_n_fields = unsigned(n_fields);
-	block->curr_n_bytes = unsigned(n_bytes);
-	block->curr_left_side = unsigned(left_side);
+	block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS;
+	block->curr_n_bytes = n_bytes & ((1U << 15) - 1);
+	block->curr_left_side = left_side;
 	block->index = index;
 
 	{
-		hash_table_t*	table = btr_get_search_table(index);
+		auto part = btr_search_sys.get_part(*index);
 		for (ulint i = 0; i < n_cached; i++) {
-			ha_insert_for_fold(table, folds[i], block, recs[i]);
+			ha_insert_for_fold(&part->table, part->heap,
+					   folds[i], block, recs[i]);
 		}
 	}
 
@@ -1542,8 +1690,8 @@ exit_func:
 void
 btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor)
 {
-	rw_lock_t*	ahi_latch = btr_get_search_latch(cursor->index);
-
+	rw_lock_t*	ahi_latch = &btr_search_sys.get_part(*cursor->index)
+		->latch;
 	ut_ad(!rw_lock_own_flagged(ahi_latch,
 				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
 
@@ -1612,7 +1760,9 @@ btr_search_move_or_delete_hash_entries(
 	assert_block_ahi_valid(block);
 	assert_block_ahi_valid(new_block);
 
-	rw_lock_t* ahi_latch = index ? btr_get_search_latch(index) : NULL;
+	rw_lock_t* ahi_latch = index
+		? &btr_search_sys.get_part(*index)->latch
+		: nullptr;
 
 	if (new_block->index) {
 drop_exit:
@@ -1624,18 +1774,16 @@ drop_exit:
 		return;
 	}
 
+	if (index->freed()) {
+		goto drop_exit;
+	}
+
 	rw_lock_s_lock(ahi_latch);
 
 	if (block->index) {
-
-		if (block->index != index) {
-			rw_lock_s_unlock(ahi_latch);
-			goto drop_exit;
-		}
-
-		ulint	n_fields = block->curr_n_fields;
-		ulint	n_bytes = block->curr_n_bytes;
-		ibool	left_side = block->curr_left_side;
+		uint16_t n_fields = block->curr_n_fields;
+		uint16_t n_bytes = block->curr_n_bytes;
+		bool left_side = block->curr_left_side;
 
 		new_block->n_fields = block->curr_n_fields;
 		new_block->n_bytes = block->curr_n_bytes;
@@ -1653,6 +1801,7 @@ drop_exit:
 		ut_ad(left_side == block->curr_left_side);
 		return;
 	}
+
 	rw_lock_s_unlock(ahi_latch);
 }
 
@@ -1691,12 +1840,11 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
 	}
 
 	if (index != cursor->index) {
-		ut_ad(index->id == cursor->index->id);
 		btr_search_drop_page_hash_index(block);
 		return;
 	}
 
-	ut_ad(block->page.id.space() == index->table->space_id);
+	ut_ad(block->page.id().space() == index->table->space_id);
 	ut_a(index == cursor->index);
 	ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0);
 	ut_ad(!dict_index_is_ibuf(index));
@@ -1711,27 +1859,25 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
 		mem_heap_free(heap);
 	}
 
-	rw_lock_t* ahi_latch = btr_get_search_latch(index);
+	auto part = btr_search_sys.get_part(*index);
 
-	rw_lock_x_lock(ahi_latch);
+	rw_lock_x_lock(&part->latch);
 	assert_block_ahi_valid(block);
 
-	if (btr_search_enabled) {
-		hash_table_t* table = btr_get_search_table(index);
-		if (block->index) {
-			ut_a(block->index == index);
-
-			if (ha_search_and_delete_if_found(table, fold, rec)) {
-				MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
-			} else {
-				MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
-			}
+	if (block->index && btr_search_enabled) {
+		ut_a(block->index == index);
 
-			assert_block_ahi_valid(block);
+		if (ha_search_and_delete_if_found(&part->table, part->heap,
+						  fold, rec)) {
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
+		} else {
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
 		}
+
+		assert_block_ahi_valid(block);
 	}
 
-	rw_lock_x_unlock(ahi_latch);
+	rw_lock_x_unlock(&part->latch);
 }
 
 /** Updates the page hash index when a single record is inserted on a page.
@@ -1742,12 +1888,11 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
 void
 btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 {
-	hash_table_t*	table;
 	buf_block_t*	block;
 	dict_index_t*	index;
 	rec_t*		rec;
 
-	ut_ad(ahi_latch == btr_get_search_latch(cursor->index));
+	ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch);
 	ut_ad(!btr_search_own_any(RW_LOCK_S));
 	ut_ad(!btr_search_own_any(RW_LOCK_X));
 #ifdef MYSQL_INDEX_DISABLE_AHI
@@ -1770,8 +1915,8 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 		return;
 	}
 
-	if (cursor->index != index) {
-		ut_ad(cursor->index->id == index->id);
+	if (index != cursor->index) {
+		ut_ad(index->id == cursor->index->id);
 		btr_search_drop_page_hash_index(block);
 		return;
 	}
@@ -1792,10 +1937,9 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 	    && (cursor->n_bytes == block->curr_n_bytes)
 	    && !block->curr_left_side) {
 
-		table = btr_get_search_table(index);
-
 		if (ha_search_and_update_if_found(
-			table, cursor->fold, rec, block,
+			&btr_search_sys.get_part(*cursor->index)->table,
+			cursor->fold, rec, block,
 			page_rec_get_next(rec))) {
 			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
 		}
@@ -1834,7 +1978,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 	rec_offs*	offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	ut_ad(ahi_latch == btr_get_search_latch(cursor->index));
+	ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch);
 	ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
 	ut_ad(!btr_search_own_any(RW_LOCK_S));
 	ut_ad(!btr_search_own_any(RW_LOCK_X));
@@ -1857,7 +2001,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 		return;
 	}
 
-	ut_ad(block->page.id.space() == index->table->space_id);
+	ut_ad(block->page.id().space() == index->table->space_id);
 	btr_search_check_free_space_in_heap(index);
 
 	rec = btr_cur_get_rec(cursor);
@@ -1894,8 +2038,8 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 				     n_bytes, index->id);
 	}
 
-	/* We must not look up "table" before acquiring ahi_latch. */
-	hash_table_t* table = NULL;
+	/* We must not look up "part" before acquiring ahi_latch. */
+	btr_search_sys_t::partition* part= nullptr;
 	bool locked = false;
 
 	if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) {
@@ -1912,8 +2056,10 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 				goto function_exit;
 			}
 
-			table = btr_get_search_table(index);
-			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+			part = btr_search_sys.get_part(*index);
+			ha_insert_for_fold(&part->table, part->heap,
+					   ins_fold, block, ins_rec);
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 		}
 
 		goto check_next_rec;
@@ -1929,14 +2075,17 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
 				goto function_exit;
 			}
 
-			table = btr_get_search_table(index);
+			part = btr_search_sys.get_part(*index);
 		}
 
 		if (!left_side) {
-			ha_insert_for_fold(table, fold, block, rec);
+			ha_insert_for_fold(&part->table, part->heap,
+					   fold, block, rec);
 		} else {
-			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+			ha_insert_for_fold(&part->table, part->heap,
+					   ins_fold, block, ins_rec);
 		}
+		MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 	}
 
 check_next_rec:
@@ -1951,10 +2100,12 @@ check_next_rec:
 					goto function_exit;
 				}
 
-				table = btr_get_search_table(index);
+				part = btr_search_sys.get_part(*index);
 			}
 
-			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+			ha_insert_for_fold(&part->table, part->heap,
+					   ins_fold, block, ins_rec);
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 		}
 
 		goto function_exit;
@@ -1969,14 +2120,17 @@ check_next_rec:
 				goto function_exit;
 			}
 
-			table = btr_get_search_table(index);
+			part = btr_search_sys.get_part(*index);
 		}
 
 		if (!left_side) {
-			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+			ha_insert_for_fold(&part->table, part->heap,
+					   ins_fold, block, ins_rec);
 		} else {
-			ha_insert_for_fold(table, next_fold, block, next_rec);
+			ha_insert_for_fold(&part->table, part->heap,
+					   next_fold, block, next_rec);
 		}
+		MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 	}
 
 function_exit:
@@ -1990,6 +2144,31 @@ function_exit:
 }
 
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+__attribute__((nonnull))
+/** @return whether a range of the cells is valid */
+static bool ha_validate(const hash_table_t *table,
+                        ulint start_index, ulint end_index)
+{
+  ut_a(start_index <= end_index);
+  ut_a(end_index < table->n_cells);
+
+  bool ok= true;
+
+  for (ulint i= start_index; i <= end_index; i++)
+  {
+    for (auto node= static_cast<const ha_node_t*>(table->array[i].node); node;
+         node= node->next)
+    {
+      if (table->calc_hash(node->fold) != i) {
+        ib::error() << "Hash table node fold value " << node->fold
+		    << " does not match the cell number " << i;
+	ok= false;
+      }
+    }
+  }
+
+  return ok;
+}
 
 /** Validates the search system for given hash table.
 @param[in]	hash_table_id	hash table to validate
@@ -2018,17 +2197,18 @@ btr_search_hash_table_validate(ulint hash_table_id)
 
 	rec_offs_init(offsets_);
 
-	buf_pool_mutex_enter_all();
+	mysql_mutex_lock(&buf_pool.mutex);
 
-	cell_count = hash_get_n_cells(
-			btr_search_sys->hash_tables[hash_table_id]);
+	auto &part = btr_search_sys.parts[hash_table_id];
+
+	cell_count = part.table.n_cells;
 
 	for (i = 0; i < cell_count; i++) {
 		/* We release search latches every once in a while to
 		give other queries a chance to run. */
 		if ((i != 0) && ((i % chunk_size) == 0)) {
 
-			buf_pool_mutex_exit_all();
+			mysql_mutex_unlock(&buf_pool.mutex);
 			btr_search_x_unlock_all();
 
 			os_thread_yield();
@@ -2040,10 +2220,9 @@ btr_search_hash_table_validate(ulint hash_table_id)
 				goto func_exit;
 			}
 
-			buf_pool_mutex_enter_all();
+			mysql_mutex_lock(&buf_pool.mutex);
 
-			ulint	curr_cell_count = hash_get_n_cells(
-				btr_search_sys->hash_tables[hash_table_id]);
+			ulint curr_cell_count = part.table.n_cells;
 
 			if (cell_count != curr_cell_count) {
 
@@ -2055,19 +2234,14 @@ btr_search_hash_table_validate(ulint hash_table_id)
 			}
 		}
 
-		node = (ha_node_t*) hash_get_nth_cell(
-			btr_search_sys->hash_tables[hash_table_id], i)->node;
+		node = static_cast<ha_node_t*>(part.table.array[i].node);
 
 		for (; node != NULL; node = node->next) {
 			const buf_block_t*	block
-				= buf_block_from_ahi((byte*) node->data);
-			const buf_block_t*	hash_block;
-			buf_pool_t*		buf_pool;
+				= buf_pool.block_from_ahi((byte*) node->data);
 			index_id_t		page_index_id;
 
-			buf_pool = buf_pool_from_bpage((buf_page_t*) block);
-
-			if (UNIV_LIKELY(buf_block_get_state(block)
+			if (UNIV_LIKELY(block->page.state()
 					== BUF_BLOCK_FILE_PAGE)) {
 
 				/* The space and offset are only valid
@@ -2075,32 +2249,24 @@ btr_search_hash_table_validate(ulint hash_table_id)
 				the block is being freed
 				(BUF_BLOCK_REMOVE_HASH, see the
 				assertion and the comment below) */
-				hash_block = buf_block_hash_get(
-					buf_pool,
-					block->page.id);
-			} else {
-				hash_block = NULL;
-			}
-
-			if (hash_block) {
-				ut_a(hash_block == block);
-			} else {
-				/* When a block is being freed,
-				buf_LRU_search_and_free_block() first
-				removes the block from
-				buf_pool->page_hash by calling
-				buf_LRU_block_remove_hashed_page().
-				After that, it invokes
-				btr_search_drop_page_hash_index() to
-				remove the block from
-				btr_search_sys->hash_tables[i]. */
-
-				ut_a(buf_block_get_state(block)
-				     == BUF_BLOCK_REMOVE_HASH);
+				const page_id_t id(block->page.id());
+				if (const buf_page_t* hash_page
+				    = buf_pool.page_hash_get_low(
+					    id, id.fold())) {
+					ut_ad(hash_page == &block->page);
+					goto state_ok;
+				}
 			}
 
+			/* When a block is being freed,
+			buf_LRU_search_and_free_block() first removes
+			the block from buf_pool.page_hash by calling
+			buf_LRU_block_remove_hashed_page(). Then it
+			invokes btr_search_drop_page_hash_index(). */
+			ut_a(block->page.state() == BUF_BLOCK_REMOVE_HASH);
+state_ok:
 			ut_ad(!dict_index_is_ibuf(block->index));
-			ut_ad(block->page.id.space()
+			ut_ad(block->page.id().space()
 			      == block->index->table->space_id);
 
 			page_index_id = btr_page_get_index_id(block->frame);
@@ -2125,8 +2291,7 @@ btr_search_hash_table_validate(ulint hash_table_id)
 
 				ib::error() << "Error in an adaptive hash"
 					<< " index pointer to page "
-					<< page_id_t(page_get_space_id(page),
-						     page_get_page_no(page))
+					<< block->page.id()
 					<< ", ptr mem address "
 					<< reinterpret_cast<const void*>(
 						node->data)
@@ -2152,8 +2317,7 @@ btr_search_hash_table_validate(ulint hash_table_id)
 		/* We release search latches every once in a while to
 		give other queries a chance to run. */
 		if (i != 0) {
-
-			buf_pool_mutex_exit_all();
+			mysql_mutex_unlock(&buf_pool.mutex);
 			btr_search_x_unlock_all();
 
 			os_thread_yield();
@@ -2165,10 +2329,9 @@ btr_search_hash_table_validate(ulint hash_table_id)
 				goto func_exit;
 			}
 
-			buf_pool_mutex_enter_all();
+			mysql_mutex_lock(&buf_pool.mutex);
 
-			ulint	curr_cell_count = hash_get_n_cells(
-				btr_search_sys->hash_tables[hash_table_id]);
+			ulint curr_cell_count = part.table.n_cells;
 
 			if (cell_count != curr_cell_count) {
 
@@ -2182,13 +2345,12 @@ btr_search_hash_table_validate(ulint hash_table_id)
 
 		ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
 
-		if (!ha_validate(btr_search_sys->hash_tables[hash_table_id],
-				 i, end_index)) {
+		if (!ha_validate(&part.table, i, end_index)) {
 			ok = FALSE;
 		}
 	}
 
-	buf_pool_mutex_exit_all();
+	mysql_mutex_unlock(&buf_pool.mutex);
 func_exit:
 	btr_search_x_unlock_all();
 
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
index 9f974e8304d..6d99d0b61f0 100644
--- a/storage/innobase/buf/buf0block_hint.cc
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -30,49 +30,30 @@ namespace buf {
 
 void Block_hint::buffer_fix_block_if_still_valid()
 {
-  /* We need to check if m_block points to one of chunks. For this to be
-  meaningful we need to prevent freeing memory while we check, and until we
-  buffer-fix the block. For this purpose it is enough to latch any of the many
-  latches taken by buf_resize().
-  However, for buffer-fixing to be meaningful, the block has to contain a page
-  (as opposed to being already empty, which might mean that buf_pool_resize()
-  can proceed and free it once we free the s-latch), so we confirm that the
-  block contains a page. However, it is not sufficient to check that this is
-  just any page, because just after we check it could get freed, unless we
-  have a latch which prevents this. This is tricky because page_hash latches
-  are sharded by page_id and we don't know the page_id until we look into the
-  block. To solve this chicken-and-egg problem somewhat, we latch the shard
-  for the m_page_id and compare block->page.id to it - so if is equal then we
-  can be reasonably sure that we have the correct latch.
-  There is still a theoretical problem here, where other threads might try
-  to modify the m_block->page.id while we are comparing it, but the chance of
-  accidentally causing the old space_id == m_page_id.m_space and the new
-  page_no == m_page_id.m_page_no is minimal as compilers emit a single 8-byte
-  comparison instruction to compare both at the same time atomically, and f()
-    will probably double-check the block->page.id again, anyway.
-  Finally, assuming that we have correct hash bucket latched, we should check if
-  the state of the block is BUF_BLOCK_FILE_PAGE before buffer-fixing the block,
-  as otherwise we risk buffer-fixing and operating on a block, which is already
-  meant to be freed. In particular, buf_LRU_free_page() first calls
-  buf_LRU_block_remove_hashed() under hash bucket latch protection to change the
-  state to BUF_BLOCK_REMOVE_HASH and then releases the latch. Later it calls
-  buf_LRU_block_free_hashed_page() without any latch to change the state to
-  BUF_BLOCK_MEMORY and reset the page's id, which means buf_resize() can free it
-  regardless of our buffer-fixing. */
+  /* To check if m_block belongs to the current buf_pool, we must
+  prevent freeing memory while we check, and until we buffer-fix the
+  block. For this purpose it is enough to latch any of the many
+  latches taken by buf_pool_t::resize().
+
+  Similar to buf_page_optimistic_get(), we must validate
+  m_block->page.id() after acquiring the hash_lock, because the object
+  may have been freed and not actually attached to buf_pool.page_hash
+  at the moment. (The block could have been reused to store a
+  different page, and that slice of buf_pool.page_hash could be protected
+  by another hash_lock that we are not holding.)
+
+  Finally, assuming that we have the correct hash bucket latched, we must
+  validate m_block->page.state() to ensure that the block is not being freed. */
   if (m_block)
   {
-    const buf_pool_t *const buf_pool= buf_pool_get(m_page_id);
-    rw_lock_t *latch= buf_page_hash_lock_get(buf_pool, m_page_id);
-    rw_lock_s_lock(latch);
-    /* If not own buf_pool_mutex, page_hash can be changed. */
-    latch= buf_page_hash_lock_s_confirm(latch, buf_pool, m_page_id);
-    if (buf_pool->is_block_field(m_block) &&
-        m_page_id == m_block->page.id &&
-        buf_block_get_state(m_block) == BUF_BLOCK_FILE_PAGE)
+    const ulint fold= m_page_id.fold();
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
+        m_block->page.state() == BUF_BLOCK_FILE_PAGE)
       buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
     else
       clear();
-    rw_lock_s_unlock(latch);
+    hash_lock->read_unlock();
   }
 }
 }  // namespace buf
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
index 593adf700b3..f822adc3389 100644
--- a/storage/innobase/buf/buf0buddy.cc
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -39,7 +39,7 @@ safe to look at BUF_BUDDY_STAMP_OFFSET.
 The answer lies in following invariants:
 * All blocks allocated by buddy allocator are used for compressed
 page frame.
-* A compressed table always have space_id < SRV_LOG_SPACE_FIRST_ID
+* A compressed table always has space_id < SRV_SPACE_ID_UPPER_BOUND
 * BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
 a frame.
   -- The above is true because we look at these fields when the
@@ -67,7 +67,7 @@ are written.*/
 
 /** Value that we stamp on all buffers that are currently on the zip_free
 list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
-#define BUF_BUDDY_STAMP_FREE	 SRV_LOG_SPACE_FIRST_ID
+#define BUF_BUDDY_STAMP_FREE	 SRV_SPACE_ID_UPPER_BOUND
 
 /** Stamp value for non-free buffers. Will be overwritten by a non-zero
 value by the consumer of the block */
@@ -176,38 +176,29 @@ struct	CheckZipFree {
 };
 
 /** Validate a buddy list.
-@param[in]	buf_pool	buffer pool instance
 @param[in]	i		buddy size to validate */
-static
-void
-buf_buddy_list_validate(
-	const buf_pool_t*	buf_pool,
-	ulint			i)
+static void buf_buddy_list_validate(ulint i)
 {
-	ut_list_validate(buf_pool->zip_free[i], CheckZipFree(i));
+	ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i));
 }
 
 /**********************************************************************//**
 Debug function to validate that a buffer is indeed free i.e.: in the
 zip_free[].
+@param[in]	buf		block to check
+@param[in]	i		index of buf_pool.zip_free[]
 @return true if free */
-UNIV_INLINE
-bool
-buf_buddy_check_free(
-/*=================*/
-	buf_pool_t*		buf_pool,/*!< in: buffer pool instance */
-	const buf_buddy_free_t*	buf,	/*!< in: block to check */
-	ulint			i)	/*!< in: index of buf_pool->zip_free[] */
+static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
 {
 	const ulint	size	= BUF_BUDDY_LOW << i;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_ad(!ut_align_offset(buf, size));
 	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 
 	buf_buddy_free_t* itr;
 
-	for (itr = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+	for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
 	     itr && itr != buf;
 	     itr = UT_LIST_GET_NEXT(list, itr)) {
 	}
@@ -227,7 +218,7 @@ buf_buddy_is_free(
 /*==============*/
 	buf_buddy_free_t*	buf,	/*!< in: block to check */
 	ulint			i)	/*!< in: index of
-					buf_pool->zip_free[] */
+					buf_pool.zip_free[] */
 {
 #ifdef UNIV_DEBUG
 	const ulint	size	= BUF_BUDDY_LOW << i;
@@ -263,90 +254,75 @@ buf_buddy_is_free(
 	       : BUF_BUDDY_STATE_PARTIALLY_USED);
 }
 
-/**********************************************************************//**
-Add a block to the head of the appropriate buddy free list. */
+/** Add a block to the head of the appropriate buddy free list.
+@param[in,out]	buf		block to be freed
+@param[in]	i		index of buf_pool.zip_free[] */
 UNIV_INLINE
 void
-buf_buddy_add_to_free(
-/*==================*/
-	buf_pool_t*		buf_pool,	/*!< in: buffer pool instance */
-	buf_buddy_free_t*	buf,		/*!< in,own: block to be freed */
-	ulint			i)		/*!< in: index of
-						buf_pool->zip_free[] */
+buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_pool->zip_free[i].start != buf);
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_pool.zip_free[i].start != buf);
 
 	buf_buddy_stamp_free(buf, i);
-	UT_LIST_ADD_FIRST(buf_pool->zip_free[i], buf);
-	ut_d(buf_buddy_list_validate(buf_pool, i));
+	UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf);
+	ut_d(buf_buddy_list_validate(i));
 }
 
-/**********************************************************************//**
-Remove a block from the appropriate buddy free list. */
+/** Remove a block from the appropriate buddy free list.
+@param[in,out]	buf		block to be removed from the free list
+@param[in]	i		index of buf_pool.zip_free[] */
 UNIV_INLINE
 void
-buf_buddy_remove_from_free(
-/*=======================*/
-	buf_pool_t*		buf_pool,	/*!< in: buffer pool instance */
-	buf_buddy_free_t*	buf,		/*!< in,own: block to be
-						freed */
-	ulint			i)		/*!< in: index of
-						buf_pool->zip_free[] */
+buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_buddy_check_free(buf_pool, buf, i));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_buddy_check_free(buf, i));
 
-	UT_LIST_REMOVE(buf_pool->zip_free[i], buf);
+	UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
 	buf_buddy_stamp_nonfree(buf, i);
 }
 
-/**********************************************************************//**
-Try to allocate a block from buf_pool->zip_free[].
-@return allocated block, or NULL if buf_pool->zip_free[] was empty */
-static
-buf_buddy_free_t*
-buf_buddy_alloc_zip(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		i)		/*!< in: index of buf_pool->zip_free[] */
+/** Try to allocate a block from buf_pool.zip_free[].
+@param[in]	i		index of buf_pool.zip_free[]
+@return allocated block, or NULL if buf_pool.zip_free[] was empty */
+static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
 {
 	buf_buddy_free_t*	buf;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_a(i < BUF_BUDDY_SIZES);
 	ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 
-	ut_d(buf_buddy_list_validate(buf_pool, i));
+	ut_d(buf_buddy_list_validate(i));
 
-	buf = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+	buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
 
-	if (buf_pool->curr_size < buf_pool->old_size
-	    && UT_LIST_GET_LEN(buf_pool->withdraw)
-		< buf_pool->withdraw_target) {
+	if (buf_pool.curr_size < buf_pool.old_size
+	    && UT_LIST_GET_LEN(buf_pool.withdraw)
+	    < buf_pool.withdraw_target) {
 
 		while (buf != NULL
-		       && buf_frame_will_withdrawn(
-			       buf_pool, reinterpret_cast<byte*>(buf))) {
+		       && buf_pool.will_be_withdrawn(
+			       reinterpret_cast<byte*>(buf))) {
 			/* This should be withdrawn, not to be allocated */
 			buf = UT_LIST_GET_NEXT(list, buf);
 		}
 	}
 
 	if (buf) {
-		buf_buddy_remove_from_free(buf_pool, buf, i);
+		buf_buddy_remove_from_free(buf, i);
 	} else if (i + 1 < BUF_BUDDY_SIZES) {
 		/* Attempt to split. */
-		buf = buf_buddy_alloc_zip(buf_pool, i + 1);
+		buf = buf_buddy_alloc_zip(i + 1);
 
 		if (buf) {
 			buf_buddy_free_t* buddy =
 				reinterpret_cast<buf_buddy_free_t*>(
 					reinterpret_cast<byte*>(buf)
 					+ (BUF_BUDDY_LOW << i));
-
-			ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
-			buf_buddy_add_to_free(buf_pool, buddy, i);
+			ut_ad(!buf_pool.contains_zip(buddy));
+			buf_buddy_add_to_free(buddy, i);
 		}
 	}
 
@@ -364,44 +340,37 @@ buf_buddy_alloc_zip(
 	return(buf);
 }
 
-/**********************************************************************//**
-Deallocate a buffer frame of srv_page_size. */
+/** Deallocate a buffer frame of srv_page_size.
+@param[in]	buf		buffer frame to deallocate */
 static
 void
-buf_buddy_block_free(
-/*=================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf)		/*!< in: buffer frame to deallocate */
+buf_buddy_block_free(void* buf)
 {
 	const ulint	fold	= BUF_POOL_ZIP_FOLD_PTR(buf);
 	buf_page_t*	bpage;
 	buf_block_t*	block;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_a(!ut_align_offset(buf, srv_page_size));
 
-	HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
-		    ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
-			  && bpage->in_zip_hash && !bpage->in_page_hash),
+	HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
+		    ut_ad(bpage->state() == BUF_BLOCK_MEMORY
+			  && bpage->in_zip_hash),
 		    ((buf_block_t*) bpage)->frame == buf);
 	ut_a(bpage);
-	ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
-	ut_ad(!bpage->in_page_hash);
+	ut_a(bpage->state() == BUF_BLOCK_MEMORY);
 	ut_ad(bpage->in_zip_hash);
-	ut_d(bpage->in_zip_hash = FALSE);
-	HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
+	ut_d(bpage->in_zip_hash = false);
+	HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
 
 	ut_d(memset(buf, 0, srv_page_size));
 	MEM_UNDEFINED(buf, srv_page_size);
 
 	block = (buf_block_t*) bpage;
-	buf_page_mutex_enter(block);
 	buf_LRU_block_free_non_file_page(block);
-	buf_page_mutex_exit(block);
 
-	ut_ad(buf_pool->buddy_n_frames > 0);
-	ut_d(buf_pool->buddy_n_frames--);
+	ut_ad(buf_pool.buddy_n_frames > 0);
+	ut_d(buf_pool.buddy_n_frames--);
 }
 
 /**********************************************************************//**
@@ -412,38 +381,27 @@ buf_buddy_block_register(
 /*=====================*/
 	buf_block_t*	block)	/*!< in: buffer frame to allocate */
 {
-	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 	const ulint	fold = BUF_POOL_ZIP_FOLD(block);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
-
-	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+	ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
 
 	ut_a(block->frame);
 	ut_a(!ut_align_offset(block->frame, srv_page_size));
 
-	ut_ad(!block->page.in_page_hash);
 	ut_ad(!block->page.in_zip_hash);
-	ut_d(block->page.in_zip_hash = TRUE);
-	HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
+	ut_d(block->page.in_zip_hash = true);
+	HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page);
 
-	ut_d(buf_pool->buddy_n_frames++);
+	ut_d(buf_pool.buddy_n_frames++);
 }
 
-/**********************************************************************//**
-Allocate a block from a bigger object.
+/** Allocate a block from a bigger object.
+@param[in]	buf		a block that is free to use
+@param[in]	i		index of buf_pool.zip_free[]
+@param[in]	j		size of buf as an index of buf_pool.zip_free[]
 @return allocated block */
 static
 void*
-buf_buddy_alloc_from(
-/*=================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: a block that is free to use */
-	ulint		i,		/*!< in: index of
-					buf_pool->zip_free[] */
-	ulint		j)		/*!< in: size of buf as an index
-					of buf_pool->zip_free[] */
+buf_buddy_alloc_from(void* buf, ulint i, ulint j)
 {
 	ulint	offs	= BUF_BUDDY_LOW << j;
 	ut_ad(j <= BUF_BUDDY_SIZES);
@@ -460,101 +418,79 @@ buf_buddy_alloc_from(
 
 		zip_buf = reinterpret_cast<buf_buddy_free_t*>(
 			reinterpret_cast<byte*>(buf) + offs);
-		buf_buddy_add_to_free(buf_pool, zip_buf, j);
+		buf_buddy_add_to_free(zip_buf, j);
 	}
 
 	buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
 	return(buf);
 }
 
-/**********************************************************************//**
-Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
-The buf_pool_mutex may be released and reacquired.
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i      index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru    assigned to true if buf_pool.mutex was temporarily released
 @return allocated block, never NULL */
-void*
-buf_buddy_alloc_low(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	ulint		i,		/*!< in: index of buf_pool->zip_free[],
-					or BUF_BUDDY_SIZES */
-	bool*		lru)		/*!< in: pointer to a variable that
-					will be assigned true if storage was
-					allocated from the LRU list and
-					buf_pool->mutex was temporarily
-					released */
+byte *buf_buddy_alloc_low(ulint i, bool *lru)
 {
 	buf_block_t*	block;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 
 	if (i < BUF_BUDDY_SIZES) {
 		/* Try to allocate from the buddy system. */
-		block = (buf_block_t*) buf_buddy_alloc_zip(buf_pool, i);
+		block = (buf_block_t*) buf_buddy_alloc_zip(i);
 
 		if (block) {
 			goto func_exit;
 		}
 	}
 
-	/* Try allocating from the buf_pool->free list. */
-	block = buf_LRU_get_free_only(buf_pool);
+	/* Try allocating from the buf_pool.free list. */
+	block = buf_LRU_get_free_only();
 
 	if (block) {
-
 		goto alloc_big;
 	}
 
 	/* Try replacing an uncompressed page in the buffer pool. */
-	buf_pool_mutex_exit(buf_pool);
-	block = buf_LRU_get_free_block(buf_pool);
-	*lru = true;
-	buf_pool_mutex_enter(buf_pool);
+	block = buf_LRU_get_free_block(true);
+	if (lru) {
+		*lru = true;
+	}
 
 alloc_big:
 	buf_buddy_block_register(block);
 
 	block = (buf_block_t*) buf_buddy_alloc_from(
-		buf_pool, block->frame, i, BUF_BUDDY_SIZES);
+		block->frame, i, BUF_BUDDY_SIZES);
 
 func_exit:
-	buf_pool->buddy_stat[i].used++;
-	return(block);
+	buf_pool.buddy_stat[i].used++;
+	return reinterpret_cast<byte*>(block);
 }
 
-/**********************************************************************//**
-Try to relocate a block.
+/** Try to relocate a block. The caller must hold zip_free_mutex, and this
+function will release and lock it again.
+@param[in]	src		block to relocate
+@param[in]	dst		free block to relocate to
+@param[in]	i		index of buf_pool.zip_free[]
+@param[in]	force		true if we must always relocate
 @return true if relocated */
-static
-bool
-buf_buddy_relocate(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		src,		/*!< in: block to relocate */
-	void*		dst,		/*!< in: free block to relocate to */
-	ulint		i,		/*!< in: index of
-					buf_pool->zip_free[] */
-	bool		force)		/*!< in: true if we must relocate
-					always */
+static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
 {
 	buf_page_t*	bpage;
 	const ulint	size = BUF_BUDDY_LOW << i;
-	ulint		space;
-	ulint		offset;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_ad(!ut_align_offset(src, size));
 	ut_ad(!ut_align_offset(dst, size));
 	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 	MEM_CHECK_ADDRESSABLE(dst, size);
 
-	space	= mach_read_from_4((const byte*) src
-				   + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-	offset	= mach_read_from_4((const byte*) src
-				   + FIL_PAGE_OFFSET);
+	uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+					  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+					   + FIL_PAGE_OFFSET);
 
 	/* Suppress Valgrind or MSAN warnings. */
 	MEM_MAKE_DEFINED(&space, sizeof space);
@@ -563,27 +499,16 @@ buf_buddy_relocate(
 	ut_ad(space != BUF_BUDDY_STAMP_FREE);
 
 	const page_id_t	page_id(space, offset);
+	const ulint fold= page_id.fold();
 
-	/* If space,offset is bogus, then we know that the
-	buf_page_hash_get_low() call below will return NULL. */
-	if (!force && buf_pool != buf_pool_get(page_id)) {
-		return(false);
-	}
-
-	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-
-	rw_lock_x_lock(hash_lock);
-
-	bpage = buf_page_hash_get_low(buf_pool, page_id);
+	bpage = buf_pool.page_hash_get_low(page_id, fold);
 
 	if (!bpage || bpage->zip.data != src) {
 		/* The block has probably been freshly
 		allocated by buf_LRU_get_free_block() but not
-		added to buf_pool->page_hash yet.  Obviously,
+		added to buf_pool.page_hash yet.  Obviously,
 		it cannot be relocated. */
 
-		rw_lock_x_unlock(hash_lock);
-
 		if (!force || space != 0 || offset != 0) {
 			return(false);
 		}
@@ -591,12 +516,10 @@ buf_buddy_relocate(
 		/* It might be just uninitialized page.
 		We should search from LRU list also. */
 
-		bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+		bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
 		while (bpage != NULL) {
 			if (bpage->zip.data == src) {
-				hash_lock = buf_page_hash_lock_get(
-					buf_pool, bpage->id);
-				rw_lock_x_lock(hash_lock);
+				ut_ad(bpage->id() == page_id);
 				break;
 			}
 			bpage = UT_LIST_GET_NEXT(LRU, bpage);
@@ -612,9 +535,6 @@ buf_buddy_relocate(
 		have to relocate all blocks covered by src.
 		For the sake of simplicity, give up. */
 		ut_ad(page_zip_get_size(&bpage->zip) < size);
-
-		rw_lock_x_unlock(hash_lock);
-
 		return(false);
 	}
 
@@ -622,11 +542,14 @@ buf_buddy_relocate(
 	contain uninitialized data. */
 	MEM_CHECK_ADDRESSABLE(src, size);
 
-	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
+	if (!bpage->can_relocate()) {
+		return false;
+	}
 
-	mutex_enter(block_mutex);
+	page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
+	hash_lock->write_lock();
 
-	if (buf_page_can_relocate(bpage)) {
+	if (bpage->can_relocate()) {
 		/* Relocate the compressed page. */
 		const ulonglong ns = my_interval_timer();
 
@@ -635,62 +558,53 @@ buf_buddy_relocate(
 		memcpy(dst, src, size);
 		bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
 
-		rw_lock_x_unlock(hash_lock);
-
-		mutex_exit(block_mutex);
+		hash_lock->write_unlock();
 
 		buf_buddy_mem_invalid(
 			reinterpret_cast<buf_buddy_free_t*>(src), i);
 
-		buf_buddy_stat_t*	buddy_stat = &buf_pool->buddy_stat[i];
+		buf_buddy_stat_t*	buddy_stat = &buf_pool.buddy_stat[i];
 		buddy_stat->relocated++;
 		buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000;
 		return(true);
 	}
 
-	rw_lock_x_unlock(hash_lock);
+	hash_lock->write_unlock();
 
-	mutex_exit(block_mutex);
 	return(false);
 }
 
-/**********************************************************************//**
-Deallocate a block. */
-void
-buf_buddy_free_low(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: block to be freed, must not be
-					pointed to by the buffer pool */
-	ulint		i)		/*!< in: index of buf_pool->zip_free[],
-					or BUF_BUDDY_SIZES */
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	i	index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i)
 {
 	buf_buddy_free_t*	buddy;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_ad(i <= BUF_BUDDY_SIZES);
 	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
-	ut_ad(buf_pool->buddy_stat[i].used > 0);
+	ut_ad(buf_pool.buddy_stat[i].used > 0);
 
-	buf_pool->buddy_stat[i].used--;
+	buf_pool.buddy_stat[i].used--;
 recombine:
 	MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
 
 	if (i == BUF_BUDDY_SIZES) {
-		buf_buddy_block_free(buf_pool, buf);
+		buf_buddy_block_free(buf);
 		return;
 	}
 
 	ut_ad(i < BUF_BUDDY_SIZES);
 	ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
-	ut_ad(!buf_pool_contains_zip(buf_pool, buf));
+	ut_ad(!buf_pool.contains_zip(buf));
 
 	/* Do not recombine blocks if there are few free blocks.
 	We may waste up to 15360*max_len bytes to free blocks
 	(1024 + 2048 + 4096 + 8192 = 15360) */
-	if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16
-	    && buf_pool->curr_size >= buf_pool->old_size) {
+	if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
+	    && buf_pool.curr_size >= buf_pool.old_size) {
 		goto func_exit;
 	}
 
@@ -702,36 +616,34 @@ recombine:
 	switch (buf_buddy_is_free(buddy, i)) {
 	case BUF_BUDDY_STATE_FREE:
 		/* The buddy is free: recombine */
-		buf_buddy_remove_from_free(buf_pool, buddy, i);
+		buf_buddy_remove_from_free(buddy, i);
 buddy_is_free:
-		ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+		ut_ad(!buf_pool.contains_zip(buddy));
 		i++;
 		buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
 
 		goto recombine;
 
 	case BUF_BUDDY_STATE_USED:
-		ut_d(buf_buddy_list_validate(buf_pool, i));
+		ut_d(buf_buddy_list_validate(i));
 
 		/* The buddy is not free. Is there a free block of
 		this size? */
 		if (buf_buddy_free_t* zip_buf =
-			UT_LIST_GET_FIRST(buf_pool->zip_free[i])) {
+			UT_LIST_GET_FIRST(buf_pool.zip_free[i])) {
 
 			/* Remove the block from the free list, because
 			a successful buf_buddy_relocate() will overwrite
 			zip_free->list. */
-			buf_buddy_remove_from_free(buf_pool, zip_buf, i);
+			buf_buddy_remove_from_free(zip_buf, i);
 
 			/* Try to relocate the buddy of buf to the free
 			block. */
-			if (buf_buddy_relocate(buf_pool, buddy, zip_buf, i,
-					       false)) {
-
+			if (buf_buddy_relocate(buddy, zip_buf, i, false)) {
 				goto buddy_is_free;
 			}
 
-			buf_buddy_add_to_free(buf_pool, zip_buf, i);
+			buf_buddy_add_to_free(zip_buf, i);
 		}
 
 		break;
@@ -743,40 +655,31 @@ buddy_is_free:
 
 func_exit:
 	/* Free the block to the buddy list. */
-	buf_buddy_add_to_free(buf_pool,
-			      reinterpret_cast<buf_buddy_free_t*>(buf),
-			      i);
+	buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i);
 }
 
-/** Reallocate a block.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	buf		block to be reallocated, must be pointed
-to by the buffer pool
-@param[in]	size		block size, up to srv_page_size
-@retval false	if failed because of no free blocks. */
+/** Try to reallocate a block.
+@param[in]	buf	buf_pool block to be reallocated
+@param[in]	size	block size, up to srv_page_size
+@return	false if no free block was available; true otherwise, even if the relocation itself failed
 bool
-buf_buddy_realloc(
-	buf_pool_t*	buf_pool,
-	void*		buf,
-	ulint		size)
+buf_buddy_realloc(void* buf, ulint size)
 {
 	buf_block_t*	block = NULL;
 	ulint		i = buf_buddy_get_slot(size);
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_ad(i <= BUF_BUDDY_SIZES);
 	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 
 	if (i < BUF_BUDDY_SIZES) {
 		/* Try to allocate from the buddy system. */
-		block = reinterpret_cast<buf_block_t*>(
-			buf_buddy_alloc_zip(buf_pool, i));
+		block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i));
 	}
 
 	if (block == NULL) {
-		/* Try allocating from the buf_pool->free list. */
-		block = buf_LRU_get_free_only(buf_pool);
+		/* Try allocating from the buf_pool.free list. */
+		block = buf_LRU_get_free_only();
 
 		if (block == NULL) {
 			return(false); /* free_list was not enough */
@@ -786,40 +689,37 @@ buf_buddy_realloc(
 
 		block = reinterpret_cast<buf_block_t*>(
 			buf_buddy_alloc_from(
-				buf_pool, block->frame, i, BUF_BUDDY_SIZES));
+				block->frame, i, BUF_BUDDY_SIZES));
 	}
 
-	buf_pool->buddy_stat[i].used++;
+	buf_pool.buddy_stat[i].used++;
 
 	/* Try to relocate the buddy of buf to the free block. */
-	if (buf_buddy_relocate(buf_pool, buf, block, i, true)) {
+	if (buf_buddy_relocate(buf, block, i, true)) {
 		/* succeeded */
-		buf_buddy_free_low(buf_pool, buf, i);
+		buf_buddy_free_low(buf, i);
 	} else {
 		/* failed */
-		buf_buddy_free_low(buf_pool, block, i);
+		buf_buddy_free_low(block, i);
 	}
 
 	return(true); /* free_list was enough */
 }
 
-/** Combine all pairs of free buddies.
-@param[in]	buf_pool	buffer pool instance */
-void
-buf_buddy_condense_free(
-	buf_pool_t*	buf_pool)
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free()
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_pool->curr_size < buf_pool->old_size);
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_pool.curr_size < buf_pool.old_size);
 
-	for (ulint i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
+	for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
 		buf_buddy_free_t* buf =
-			UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+			UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
 
 		/* seek to withdraw target */
 		while (buf != NULL
-		       && !buf_frame_will_withdrawn(
-				buf_pool, reinterpret_cast<byte*>(buf))) {
+		       && !buf_pool.will_be_withdrawn(
+			       reinterpret_cast<byte*>(buf))) {
 			buf = UT_LIST_GET_NEXT(list, buf);
 		}
 
@@ -836,8 +736,7 @@ buf_buddy_condense_free(
 			/* seek to the next withdraw target */
 			while (true) {
 				while (next != NULL
-				       && !buf_frame_will_withdrawn(
-						buf_pool,
+				       && !buf_pool.will_be_withdrawn(
 						reinterpret_cast<byte*>(next))) {
 					 next = UT_LIST_GET_NEXT(list, next);
 				}
@@ -853,10 +752,10 @@ buf_buddy_condense_free(
 			    == BUF_BUDDY_STATE_FREE) {
 				/* Both buf and buddy are free.
 				Try to combine them. */
-				buf_buddy_remove_from_free(buf_pool, buf, i);
-				buf_pool->buddy_stat[i].used++;
+				buf_buddy_remove_from_free(buf, i);
+				buf_pool.buddy_stat[i].used++;
 
-				buf_buddy_free_low(buf_pool, buf, i);
+				buf_buddy_free_low(buf, i);
 			}
 
 			buf = next;
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index b42964b8315..8bea22b3e85 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2021, Oracle and/or its affiliates.
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 Copyright (c) 2013, 2022, MariaDB Corporation.
 
@@ -31,6 +31,7 @@ The database buffer buf_pool
 Created 11/5/1995 Heikki Tuuri
 *******************************************************/
 
+#include "assume_aligned.h"
 #include "mtr0types.h"
 #include "mach0data.h"
 #include "buf0buf.h"
@@ -39,11 +40,13 @@ Created 11/5/1995 Heikki Tuuri
 #include <string.h>
 
 #ifndef UNIV_INNOCHECKSUM
+#include "my_cpu.h"
 #include "mem0mem.h"
 #include "btr0btr.h"
 #include "fil0fil.h"
 #include "fil0crypt.h"
 #include "buf0buddy.h"
+#include "buf0dblwr.h"
 #include "lock0lock.h"
 #include "sync0rw.h"
 #include "btr0sea.h"
@@ -59,7 +62,6 @@ Created 11/5/1995 Heikki Tuuri
 #include "srv0mon.h"
 #include "log0crypt.h"
 #include "fil0pagecompress.h"
-#include "fsp0pagecompress.h"
 #endif /* !UNIV_INNOCHECKSUM */
 #include "page0zip.h"
 #include "sync0sync.h"
@@ -67,14 +69,6 @@ Created 11/5/1995 Heikki Tuuri
 #include <map>
 #include <sstream>
 
-#ifdef UNIV_LINUX
-#include <stdlib.h>
-#endif
-
-#ifdef HAVE_LZO
-#include "lzo/lzo1x.h"
-#endif
-
 using st_::span;
 
 #ifdef HAVE_LIBNUMA
@@ -121,40 +115,10 @@ struct set_numa_interleave_t
 #define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
 #endif /* HAVE_LIBNUMA */
 
-#ifdef HAVE_SNAPPY
-#include "snappy-c.h"
-#endif
-
-#ifndef UNIV_INNOCHECKSUM
-buf_pool_t::io_buf_t::~io_buf_t()
-{
-	for (buf_tmp_buffer_t* s = slots, *e = slots + n_slots; s != e; s++) {
-		aligned_free(s->crypt_buf);
-		aligned_free(s->comp_buf);
-	}
-	ut_free(slots);
-}
-#endif /* !UNIV_INNOCHECKSUM */
-
 /*
 		IMPLEMENTATION OF THE BUFFER POOL
 		=================================
 
-Performance improvement:
-------------------------
-Thread scheduling in NT may be so slow that the OS wait mechanism should
-not be used even in waiting for disk reads to complete.
-Rather, we should put waiting query threads to the queue of
-waiting jobs, and let the OS thread do something useful while the i/o
-is processed. In this way we could remove most OS thread switches in
-an i/o-intensive benchmark like TPC-C.
-
-A possibility is to put a user space thread library between the database
-and NT. User space thread libraries might be very fast.
-
-SQL Server 7.0 can be configured to use 'fibers' which are lightweight
-threads in NT. These should be studied.
-
 		Buffer frames and blocks
 		------------------------
 Following the terminology of Gray and Reuter, we call the memory
@@ -168,21 +132,21 @@ in the file along with the file page, resides in the control block.
 The buffer buf_pool contains a single mutex which protects all the
 control data structures of the buf_pool. The content of a buffer frame is
 protected by a separate read-write lock in its control block, though.
-These locks can be locked and unlocked without owning the buf_pool->mutex.
+These locks can be locked and unlocked without owning the buf_pool.mutex.
 The OS events in the buf_pool struct can be waited for without owning the
-buf_pool->mutex.
+buf_pool.mutex.
 
-The buf_pool->mutex is a hot-spot in main memory, causing a lot of
+The buf_pool.mutex is a hot-spot in main memory, causing a lot of
 memory bus traffic on multiprocessor systems when processors
 alternately access the mutex. On our Pentium, the mutex is accessed
 maybe every 10 microseconds. We gave up the solution to have mutexes
 for each control block, for instance, because it seemed to be
 complicated.
 
-A solution to reduce mutex contention of the buf_pool->mutex is to
+A solution to reduce mutex contention of the buf_pool.mutex is to
 create a separate mutex for the page hash table. On Pentium,
 accessing the hash table takes 2 microseconds, about half
-of the total buf_pool->mutex hold time.
+of the total buf_pool.mutex hold time.
 
 		Control blocks
 		--------------
@@ -197,16 +161,6 @@ The buffer frames have to be aligned so that the start memory
 address of a frame is divisible by the universal page size, which
 is a power of two.
 
-We intend to make the buffer buf_pool size on-line reconfigurable,
-that is, the buf_pool size can be changed without closing the database.
-Then the database administarator may adjust it to be bigger
-at night, for example. The control block array must
-contain enough control blocks for the maximum buffer buf_pool size
-which is used in the particular database.
-If the buf_pool size is cut, we exploit the virtual memory mechanism of
-the OS, and just refrain from using frames at high addresses. Then the OS
-can swap them to disk.
-
 The control blocks containing file pages are put to a hash table
 according to the file address of the page.
 We could speed up the access to an individual page by using
@@ -226,7 +180,7 @@ in the database, using tables whose size is a power of 2.
 
 There are several lists of control blocks.
 
-The free list (buf_pool->free) contains blocks which are currently not
+The free list (buf_pool.free) contains blocks which are currently not
 used.
 
 The common LRU list contains all the blocks holding a file page
@@ -247,25 +201,25 @@ in the main memory, undisturbed.
 The unzip_LRU list contains a subset of the common LRU list.  The
 blocks on the unzip_LRU list hold a compressed file page and the
 corresponding uncompressed page frame.  A block is in unzip_LRU if and
-only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
+only if the predicate block->page.belongs_to_unzip_LRU()
 holds.  The blocks in unzip_LRU will be in same order as they are in
 the common LRU list.  That is, each manipulation of the common LRU
 list will result in the same manipulation of the unzip_LRU list.
 
-The chain of modified blocks (buf_pool->flush_list) contains the blocks
-holding file pages that have been modified in the memory
+The chain of modified blocks (buf_pool.flush_list) contains the blocks
+holding persistent file pages that have been modified in the memory
 but not written to disk yet. The block with the oldest modification
 which has not yet been written to disk is at the end of the chain.
-The access to this list is protected by buf_pool->flush_list_mutex.
+The access to this list is protected by buf_pool.flush_list_mutex.
 
-The chain of unmodified compressed blocks (buf_pool->zip_clean)
-contains the control blocks (buf_page_t) of those compressed pages
-that are not in buf_pool->flush_list and for which no uncompressed
-page has been allocated in the buffer pool.  The control blocks for
-uncompressed pages are accessible via buf_block_t objects that are
-reachable via buf_pool->chunks[].
+The control blocks for uncompressed pages are accessible via
+buf_block_t objects that are reachable via buf_pool.chunks[].
+The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages
+that are not in buf_pool.flush_list and for which no uncompressed
+page has been allocated in buf_pool are only accessible via
+buf_pool.LRU.
 
-The chains of free memory blocks (buf_pool->zip_free[]) are used by
+The chains of free memory blocks (buf_pool.zip_free[]) are used by
 the buddy allocator (buf0buddy.cc) to keep track of currently unused
 memory blocks of size sizeof(buf_page_t)..srv_page_size / 2.  These
 blocks are inside the srv_page_size-sized memory blocks of type
@@ -325,67 +279,67 @@ the read requests for the whole area.
 */
 
 #ifndef UNIV_INNOCHECKSUM
+void page_hash_latch::read_lock_wait()
+{
+  /* First, try busy spinning for a while. */
+  for (auto spin= srv_n_spin_wait_rounds; spin--; )
+  {
+    ut_delay(srv_spin_wait_delay);
+    if (read_trylock())
+      return;
+  }
+  /* Fall back to yielding to other threads. */
+  do
+    os_thread_yield();
+  while (!read_trylock());
+}
+
+void page_hash_latch::write_lock_wait()
+{
+  write_lock_wait_start();
+
+  /* First, try busy spinning for a while. */
+  for (auto spin= srv_n_spin_wait_rounds; spin--; )
+  {
+    if (write_lock_poll())
+      return;
+    ut_delay(srv_spin_wait_delay);
+  }
+
+  /* Fall back to yielding to other threads. */
+  do
+    os_thread_yield();
+  while (!write_lock_poll());
+}
+
 /** Value in microseconds */
-static const int WAIT_FOR_READ	= 100;
-static const int WAIT_FOR_WRITE = 100;
+constexpr int WAIT_FOR_READ= 100;
+constexpr int WAIT_FOR_WRITE= 100;
 /** Number of attempts made to read in a page in the buffer pool */
-static const ulint	BUF_PAGE_READ_MAX_RETRIES = 100;
-/** Number of pages to read ahead */
-static const ulint	BUF_READ_AHEAD_PAGES = 64;
+constexpr ulint	BUF_PAGE_READ_MAX_RETRIES= 100;
 /** The maximum portion of the buffer pool that can be used for the
 read-ahead buffer.  (Divide buf_pool size by this amount) */
-static const ulint	BUF_READ_AHEAD_PORTION = 32;
-
-/** The buffer pools of the database */
-buf_pool_t*	buf_pool_ptr;
-
-/** true when resizing buffer pool is in the critical path. */
-volatile bool	buf_pool_resizing;
+constexpr uint32_t BUF_READ_AHEAD_PORTION= 32;
 
-/** Map of buffer pool chunks by its first frame address
-This is newly made by initialization of buffer pool and buf_resize_thread.
-Currently, no need mutex protection for update. */
-typedef std::map<
-	const byte*,
-	buf_chunk_t*,
-	std::less<const byte*>,
-	ut_allocator<std::pair<const byte* const, buf_chunk_t*> > >
-	buf_pool_chunk_map_t;
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+const byte *field_ref_zero;
 
-static buf_pool_chunk_map_t*			buf_chunk_map_reg;
-
-/** Chunk map to be used to lookup.
-The map pointed by this should not be updated */
-static buf_pool_chunk_map_t*	buf_chunk_map_ref = NULL;
+/** The InnoDB buffer pool */
+buf_pool_t buf_pool;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref;
 
 #ifdef UNIV_DEBUG
 /** Disable resizing buffer pool to make assertion code not expensive. */
 my_bool			buf_disable_resize_buffer_pool_debug = TRUE;
-#endif /* UNIV_DEBUG */
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /** This is used to insert validation operations in execution
 in the debug version */
-static ulint	buf_dbg_counter	= 0;
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
-# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
-
-/* Buffer block mutexes and rwlocks can be registered
-in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
-is defined, register buffer block mutex and rwlock
-in one group after their initialization. */
-#  define PFS_GROUP_BUFFER_SYNC
-
-/* This define caps the number of mutexes/rwlocks can
-be registered with performance schema. Developers can
-modify this define if necessary. Please note, this would
-be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
-#  define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER	ULINT_MAX
-
-# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
-#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+static ulint buf_dbg_counter;
+#endif /* UNIV_DEBUG */
 
 /** Macro to determine whether the read of write counter is used depending
 on the io_type */
@@ -395,56 +349,6 @@ on the io_type */
 	 : (counter##_WRITTEN))
 
 
-/** Reserve a buffer slot for encryption, decryption or page compression.
-@param[in,out]	buf_pool	buffer pool
-@return reserved buffer slot */
-static buf_tmp_buffer_t* buf_pool_reserve_tmp_slot(buf_pool_t* buf_pool)
-{
-	buf_tmp_buffer_t* slot = buf_pool->io_buf.reserve();
-	ut_a(slot);
-	return slot;
-}
-
-/** Reserve a buffer for encryption, decryption or decompression.
-@param[in,out]	slot	reserved slot */
-static void buf_tmp_reserve_crypt_buf(buf_tmp_buffer_t* slot)
-{
-	if (!slot->crypt_buf) {
-		slot->crypt_buf = static_cast<byte*>(
-			aligned_malloc(srv_page_size, srv_page_size));
-	}
-}
-
-/** Reserve a buffer for compression.
-@param[in,out]	slot	reserved slot */
-static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
-{
-	if (!slot->comp_buf) {
-		/* Both snappy and lzo compression methods require that
-		output buffer used for compression is bigger than input
-		buffer. Increase the allocated buffer size accordingly. */
-		ulint size = srv_page_size;
-#ifdef HAVE_LZO
-		size += LZO1X_1_15_MEM_COMPRESS;
-#elif defined HAVE_SNAPPY
-		size = snappy_max_compressed_length(size);
-#endif
-		slot->comp_buf = static_cast<byte*>(
-			aligned_malloc(size, srv_page_size));
-	}
-}
-
-/** Registers a chunk to buf_pool_chunk_map
-@param[in]	chunk	chunk of buffers */
-static
-void
-buf_pool_register_chunk(
-	buf_chunk_t*	chunk)
-{
-	buf_chunk_map_reg->insert(buf_pool_chunk_map_t::value_type(
-		chunk->blocks->frame, chunk));
-}
-
 /** Decrypt a page for temporary tablespace.
 @param[in,out]	tmp_frame	Temporary buffer
 @param[in]	src_frame	Page to decrypt
@@ -473,11 +377,13 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
 		return false;
 	}
 
-	memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
-	       src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
-	       FIL_PAGE_FCRC32_CHECKSUM);
+	static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+	memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+			  src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+			  FIL_PAGE_FCRC32_CHECKSUM);
 
-	memcpy(src_frame, tmp_frame, srv_page_size);
+	memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(src_frame, tmp_frame,
+					       srv_page_size);
 	srv_stats.pages_decrypted.inc();
 	srv_stats.n_temp_blocks_decrypted.inc();
 
@@ -486,33 +392,36 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
 
 /** Decrypt a page.
 @param[in,out]	bpage	Page control block
-@param[in,out]	space	tablespace
+@param[in]	node	data file
 @return whether the operation was successful */
-static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
+static bool buf_page_decrypt_after_read(buf_page_t *bpage,
+                                        const fil_node_t &node)
 {
-	ut_ad(space->pending_io());
-	ut_ad(space->id == bpage->id.space());
+	ut_ad(node.space->referenced());
+	ut_ad(node.space->id == bpage->id().space());
+	const auto flags = node.space->flags;
 
 	byte* dst_frame = bpage->zip.data ? bpage->zip.data :
 		((buf_block_t*) bpage)->frame;
-	bool page_compressed = space->is_compressed()
-		&& buf_page_is_compressed(dst_frame, space->flags);
-	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+	bool page_compressed = node.space->is_compressed()
+		&& buf_page_is_compressed(dst_frame, flags);
+	const page_id_t id(bpage->id());
 
-	if (bpage->id.page_no() == 0) {
+	if (id.page_no() == 0) {
 		/* File header pages are not encrypted/compressed */
 		return (true);
 	}
 
-	if (space->purpose == FIL_TYPE_TEMPORARY
+	if (node.space->purpose == FIL_TYPE_TEMPORARY
 	    && innodb_encrypt_temporary_tables) {
-		buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool);
-		buf_tmp_reserve_crypt_buf(slot);
+		buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve();
+		ut_a(slot);
+		slot->allocate();
 
 		if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
 			slot->release();
-			ib::error() << "Encrypted page " << bpage->id
-				    << " in file " << space->chain.start->name;
+			ib::error() << "Encrypted page " << id
+				    << " in file " << node.name;
 			return false;
 		}
 
@@ -525,238 +434,73 @@ static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
 	also for pages first compressed and then encrypted. */
 
 	buf_tmp_buffer_t* slot;
-	uint key_version = buf_page_get_key_version(dst_frame, space->flags);
+	uint key_version = buf_page_get_key_version(dst_frame, flags);
 
 	if (page_compressed && !key_version) {
 		/* the page we read is unencrypted */
 		/* Find free slot from temporary memory array */
 decompress:
-		if (space->full_crc32()
-		    && buf_page_is_corrupted(true, dst_frame, space->flags)) {
+		if (fil_space_t::full_crc32(flags)
+		    && buf_page_is_corrupted(true, dst_frame, flags)) {
 			return false;
 		}
 
-		slot = buf_pool_reserve_tmp_slot(buf_pool);
-		/* For decompression, use crypt_buf. */
-		buf_tmp_reserve_crypt_buf(slot);
+		slot = buf_pool.io_buf_reserve();
+		ut_a(slot);
+		slot->allocate();
 
 decompress_with_slot:
-		ut_d(fil_page_type_validate(space, dst_frame));
+		ut_d(fil_page_type_validate(node.space, dst_frame));
 
 		ulint write_size = fil_page_decompress(
-			slot->crypt_buf, dst_frame, space->flags);
+			slot->crypt_buf, dst_frame, flags);
 		slot->release();
-
-		ut_ad(!write_size || fil_page_type_validate(space, dst_frame));
-		ut_ad(space->pending_io());
+		ut_ad(!write_size
+		      || fil_page_type_validate(node.space, dst_frame));
+		ut_ad(node.space->referenced());
 		return write_size != 0;
 	}
 
-	if (key_version && space->crypt_data) {
+	if (key_version && node.space->crypt_data) {
 		/* Verify encryption checksum before we even try to
 		decrypt. */
-		if (!buf_page_verify_crypt_checksum(dst_frame, space->flags)) {
+		if (!buf_page_verify_crypt_checksum(dst_frame, flags)) {
 decrypt_failed:
-			ib::error() << "Encrypted page " << bpage->id
-				    << " in file " << space->chain.start->name
+			ib::error() << "Encrypted page " << id
+				    << " in file " << node.name
 				    << " looks corrupted; key_version="
 				    << key_version;
 			return false;
 		}
 
-		/* Find free slot from temporary memory array */
-		slot = buf_pool_reserve_tmp_slot(buf_pool);
-		buf_tmp_reserve_crypt_buf(slot);
-
-		ut_d(fil_page_type_validate(space, dst_frame));
+		slot = buf_pool.io_buf_reserve();
+		ut_a(slot);
+		slot->allocate();
+		ut_d(fil_page_type_validate(node.space, dst_frame));
 
 		/* decrypt using crypt_buf to dst_frame */
-		if (!fil_space_decrypt(space, slot->crypt_buf, dst_frame)) {
+		if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) {
 			slot->release();
 			goto decrypt_failed;
 		}
 
-		ut_d(fil_page_type_validate(space, dst_frame));
+		ut_d(fil_page_type_validate(node.space, dst_frame));
 
-		if ((space->full_crc32() && page_compressed)
-		    || fil_page_is_compressed_encrypted(dst_frame)) {
+		if ((fil_space_t::full_crc32(flags) && page_compressed)
+		    || fil_page_get_type(dst_frame)
+		    == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
 			goto decompress_with_slot;
 		}
 
 		slot->release();
-	} else if (fil_page_is_compressed_encrypted(dst_frame)) {
+	} else if (fil_page_get_type(dst_frame)
+		   == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
 		goto decompress;
 	}
 
-	ut_ad(space->pending_io());
+	ut_ad(node.space->referenced());
 	return true;
 }
-
-/********************************************************************//**
-Gets the smallest oldest_modification lsn for any page in the pool. Returns
-zero if all modified pages have been flushed to disk.
-@return oldest modification in pool, zero if none */
-lsn_t
-buf_pool_get_oldest_modification(void)
-/*==================================*/
-{
-	lsn_t		lsn = 0;
-	lsn_t		oldest_lsn = 0;
-
-	/* When we traverse all the flush lists we don't want another
-	thread to add a dirty page to any flush list. */
-	log_flush_order_mutex_enter();
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		buf_page_t*	bpage;
-
-		/* We don't let log-checkpoint halt because pages from system
-		temporary are not yet flushed to the disk. Anyway, object
-		residing in system temporary doesn't generate REDO logging. */
-		for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-		     bpage != NULL
-			&& fsp_is_system_temporary(bpage->id.space());
-		     bpage = UT_LIST_GET_PREV(list, bpage)) {
-			/* Do nothing. */
-		}
-
-		if (bpage != NULL) {
-			ut_ad(bpage->in_flush_list);
-			lsn = bpage->oldest_modification;
-		}
-
-		buf_flush_list_mutex_exit(buf_pool);
-
-		if (!oldest_lsn || oldest_lsn > lsn) {
-			oldest_lsn = lsn;
-		}
-	}
-
-	log_flush_order_mutex_exit();
-
-	/* The returned answer may be out of date: the flush_list can
-	change after the mutex has been released. */
-
-	return(oldest_lsn);
-}
-
-/********************************************************************//**
-Get total buffer pool statistics. */
-void
-buf_get_total_list_len(
-/*===================*/
-	ulint*		LRU_len,	/*!< out: length of all LRU lists */
-	ulint*		free_len,	/*!< out: length of all free lists */
-	ulint*		flush_list_len)	/*!< out: length of all flush lists */
-{
-	ulint		i;
-
-	*LRU_len = 0;
-	*free_len = 0;
-	*flush_list_len = 0;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
-		*free_len += UT_LIST_GET_LEN(buf_pool->free);
-		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
-	}
-}
-
-/********************************************************************//**
-Get total list size in bytes from all buffer pools. */
-void
-buf_get_total_list_size_in_bytes(
-/*=============================*/
-	buf_pools_list_size_t*	buf_pools_list_size)	/*!< out: list sizes
-							in all buffer pools */
-{
-	ut_ad(buf_pools_list_size);
-	memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		/* We don't need mutex protection since this is
-		for statistics purpose */
-		buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
-		buf_pools_list_size->unzip_LRU_bytes +=
-			UT_LIST_GET_LEN(buf_pool->unzip_LRU)
-			<< srv_page_size_shift;
-		buf_pools_list_size->flush_list_bytes +=
-			buf_pool->stat.flush_list_bytes;
-	}
-}
-
-/********************************************************************//**
-Get total buffer pool statistics. */
-void
-buf_get_total_stat(
-/*===============*/
-	buf_pool_stat_t*	tot_stat)	/*!< out: buffer pool stats */
-{
-	ulint			i;
-
-	memset(tot_stat, 0, sizeof(*tot_stat));
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_stat_t*buf_stat;
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_stat = &buf_pool->stat;
-		tot_stat->n_page_gets += buf_stat->n_page_gets;
-		tot_stat->n_pages_read += buf_stat->n_pages_read;
-		tot_stat->n_pages_written += buf_stat->n_pages_written;
-		tot_stat->n_pages_created += buf_stat->n_pages_created;
-		tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
-		tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
-		tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
-		tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
-
-		tot_stat->n_pages_not_made_young +=
-			buf_stat->n_pages_not_made_young;
-	}
-}
-
-/********************************************************************//**
-Allocates a buffer block.
-@return own: the allocated block, in state BUF_BLOCK_MEMORY */
-buf_block_t*
-buf_block_alloc(
-/*============*/
-	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance,
-					or NULL for round-robin selection
-					of the buffer pool */
-{
-	buf_block_t*	block;
-	ulint		index;
-	static ulint	buf_pool_index;
-
-	if (buf_pool == NULL) {
-		/* We are allocating memory from any buffer pool, ensure
-		we spread the grace on all buffer pool instances. */
-		index = buf_pool_index++ % srv_buf_pool_instances;
-		buf_pool = buf_pool_from_array(index);
-	}
-
-	block = buf_LRU_get_free_block(buf_pool);
-
-	buf_block_set_state(block, BUF_BLOCK_MEMORY);
-
-	return(block);
-}
 #endif /* !UNIV_INNOCHECKSUM */
 
 /** Checks if the page is in crc32 checksum format.
@@ -924,18 +668,17 @@ static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
 {
 #ifndef UNIV_INNOCHECKSUM
 	if (check_lsn && recv_lsn_checks_on) {
-		lsn_t		current_lsn;
+		const lsn_t current_lsn = log_sys.get_lsn();
 		const lsn_t	page_lsn
 			= mach_read_from_8(read_buf + FIL_PAGE_LSN);
 
 		/* Since we are going to reset the page LSN during the import
 		phase it makes no sense to spam the log with error messages. */
+		if (current_lsn < page_lsn) {
 
-		if (log_peek_lsn(&current_lsn) && current_lsn < page_lsn) {
-
-			const ulint	space_id = mach_read_from_4(
+			const uint32_t space_id = mach_read_from_4(
 				read_buf + FIL_PAGE_SPACE_ID);
-			const ulint	page_no = mach_read_from_4(
+			const uint32_t page_no = mach_read_from_4(
 				read_buf + FIL_PAGE_OFFSET);
 
 			ib::error() << "Page " << page_id_t(space_id, page_no)
@@ -961,7 +704,7 @@ static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
 @return whether the buffer is all zeroes */
 bool buf_is_zeroes(span<const byte> buf)
 {
-  ut_ad(buf.size() <= sizeof field_ref_zero);
+  ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX);
   return memcmp(buf.data(), field_ref_zero, buf.size()) == 0;
 }
 
@@ -1006,12 +749,16 @@ buf_page_is_corrupted(
 				      size - FIL_PAGE_FCRC32_CHECKSUM)) {
 			return true;
 		}
+		static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment");
+		static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+		static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
 		if (!compressed
 		    && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION
 					 + read_buf)
-		    && memcmp(read_buf + (FIL_PAGE_LSN + 4),
-			      end - (FIL_PAGE_FCRC32_END_LSN
-				     - FIL_PAGE_FCRC32_CHECKSUM), 4)) {
+		    && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4),
+					 end - (FIL_PAGE_FCRC32_END_LSN
+						- FIL_PAGE_FCRC32_CHECKSUM),
+					 4)) {
 			return true;
 		}
 
@@ -1022,7 +769,7 @@ buf_page_is_corrupted(
 	size_t		checksum_field1 = 0;
 	size_t		checksum_field2 = 0;
 	const ulint zip_size = fil_space_t::zip_size(fsp_flags);
-	ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE);
+	const uint16_t page_type = fil_page_get_type(read_buf);
 
 	/* We can trust page type if page compression is set on tablespace
 	flags because page compression flag means file must have been
@@ -1042,10 +789,13 @@ buf_page_is_corrupted(
 		return(false);
 	}
 
-	if (!zip_size && memcmp(read_buf + FIL_PAGE_LSN + 4,
-				read_buf + srv_page_size
-				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+	static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+	static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment");
 
+	if (!zip_size
+	    && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4,
+				 read_buf + srv_page_size
+				 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
 		/* Stored log sequence numbers at the start and the end
 		of page do not match */
 
@@ -1073,7 +823,7 @@ buf_page_is_corrupted(
 	checksum_field2 = mach_read_from_4(
 		read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);
 
-	compile_time_assert(!(FIL_PAGE_LSN % 8));
+	static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
 
 	/* A page filled with NUL bytes is considered not corrupted.
 	Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7),
@@ -1204,8 +954,6 @@ int
 buf_madvise_do_dump()
 {
 	int ret= 0;
-	buf_pool_t*	buf_pool;
-	buf_chunk_t*	chunk;
 
 	/* mirrors allocation in log_t::create() */
 	if (log_sys.buf) {
@@ -1222,21 +970,14 @@ buf_madvise_do_dump()
 		ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP);
 	}
 
-	buf_pool_mutex_enter_all();
-
-	for (ulong i= 0; i < srv_buf_pool_instances; i++)
-	{
-		buf_pool = buf_pool_from_array(i);
-		chunk = buf_pool->chunks;
+	mysql_mutex_lock(&buf_pool.mutex);
+	auto chunk = buf_pool.chunks;
 
-		for (int n = buf_pool->n_chunks; n--; chunk++)
-		{
-			ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
-		}
+	for (ulint n = buf_pool.n_chunks; n--; chunk++) {
+		ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
 	}
 
-	buf_pool_mutex_exit_all();
-
+	mysql_mutex_unlock(&buf_pool.mutex);
 	return ret;
 }
 #endif
@@ -1408,729 +1149,404 @@ void buf_page_print(const byte* read_buf, ulint zip_size)
 	}
 }
 
-# ifdef PFS_GROUP_BUFFER_SYNC
-extern mysql_pfs_key_t	buffer_block_mutex_key;
-
-/********************************************************************//**
-This function registers mutexes and rwlocks in buffer blocks with
-performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
-defined to be a value less than chunk->size, then only mutexes
-and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
-blocks are registered. */
-static
-void
-pfs_register_buffer_block(
-/*======================*/
-	buf_chunk_t*	chunk)		/*!< in/out: chunk of buffers */
-{
-	buf_block_t*    block;
-	ulint		num_to_register;
-
-	block = chunk->blocks;
-
-	num_to_register = ut_min(
-		chunk->size, PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
-
-	for (ulint i = 0; i < num_to_register; i++) {
-#  ifdef UNIV_PFS_MUTEX
-		BPageMutex*	mutex;
-
-		mutex = &block->mutex;
-		mutex->pfs_add(buffer_block_mutex_key);
-#  endif /* UNIV_PFS_MUTEX */
-
-		rw_lock_t*	rwlock;
-
-#  ifdef UNIV_PFS_RWLOCK
-		rwlock = &block->lock;
-		ut_a(!rwlock->pfs_psi);
-		rwlock->pfs_psi = (PSI_server)
-			? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
-			: NULL;
-
-#   ifdef UNIV_DEBUG
-		rwlock = block->debug_latch;
-		ut_a(!rwlock->pfs_psi);
-		rwlock->pfs_psi = (PSI_server)
-			? PSI_server->init_rwlock(buf_block_debug_latch_key,
-						  rwlock)
-			: NULL;
-#   endif /* UNIV_DEBUG */
-
-#  endif /* UNIV_PFS_RWLOCK */
-		block++;
-	}
-}
-# endif /* PFS_GROUP_BUFFER_SYNC */
-
-/********************************************************************//**
-Initializes a buffer control block when the buf_pool is created. */
+/** Initialize a buffer page descriptor.
+@param[in,out]	block	buffer page descriptor
+@param[in]	frame	buffer page frame */
 static
 void
-buf_block_init(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_block_t*	block,		/*!< in: pointer to control block */
-	byte*		frame)		/*!< in: pointer to buffer frame */
+buf_block_init(buf_block_t* block, byte* frame)
 {
 	/* This function should only be executed at database startup or by
-	buf_pool_resize(). Either way, adaptive hash index must not exist. */
+	buf_pool.resize(). Either way, adaptive hash index must not exist. */
 	assert_block_ahi_empty_on_init(block);
 
 	block->frame = frame;
 
-	block->page.buf_pool_index = buf_pool_index(buf_pool);
-	block->page.flush_type = BUF_FLUSH_LRU;
-	block->page.state = BUF_BLOCK_NOT_USED;
-	block->page.buf_fix_count = 0;
-	block->page.io_fix = BUF_IO_NONE;
-	block->page.flush_observer = NULL;
-	block->page.real_size = 0;
 	block->modify_clock = 0;
-	block->page.slot = NULL;
-
-	ut_d(block->page.file_page_was_freed = FALSE);
-
+	block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL));
 #ifdef BTR_CUR_HASH_ADAPT
 	block->index = NULL;
 #endif /* BTR_CUR_HASH_ADAPT */
-	ut_d(block->page.in_page_hash = FALSE);
-	ut_d(block->page.in_zip_hash = FALSE);
-	ut_d(block->page.in_flush_list = FALSE);
-	ut_d(block->page.in_free_list = FALSE);
-	ut_d(block->page.in_LRU_list = FALSE);
-	ut_d(block->in_unzip_LRU_list = FALSE);
-	ut_d(block->in_withdraw_list = FALSE);
+	ut_d(block->in_unzip_LRU_list = false);
+	ut_d(block->in_withdraw_list = false);
 
 	page_zip_des_init(&block->page.zip);
 
-	mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex);
 	ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t)));
 
-#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
-	/* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
-	of buffer block rwlock with performance schema.
-
-	If PFS_GROUP_BUFFER_SYNC is defined, skip the registration
-	since buffer block rwlock will be registered later in
-	pfs_register_buffer_block(). */
-
 	rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
 
 	ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch,
 			    SYNC_LEVEL_VARYING));
 
-#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
-
-	rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
-
-	ut_d(rw_lock_create(buf_block_debug_latch_key,
-			    block->debug_latch, SYNC_LEVEL_VARYING));
-
-#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
-
 	block->lock.is_block_lock = 1;
 
 	ut_ad(rw_lock_validate(&(block->lock)));
 }
 
-/********************************************************************//**
-Allocates a chunk of buffer frames.
-@return chunk, or NULL on failure */
-static
-buf_chunk_t*
-buf_chunk_init(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
-	ulint		mem_size)	/*!< in: requested size in bytes */
+/** Allocate a chunk of buffer frames.
+@param bytes    requested size
+@return whether the allocation succeeded */
+inline bool buf_pool_t::chunk_t::create(size_t bytes)
 {
-	buf_block_t*	block;
-	byte*		frame;
-	ulint		i;
+  DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;);
+  /* Round down to a multiple of page size, although it already should be. */
+  bytes= ut_2pow_round<size_t>(bytes, srv_page_size);
 
-	/* Round down to a multiple of page size,
-	although it already should be. */
-	mem_size = ut_2pow_round<ulint>(mem_size, srv_page_size);
+  mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx);
 
-	DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL););
+  if (UNIV_UNLIKELY(!mem))
+    return false;
 
-	chunk->mem = buf_pool->allocator.allocate_large_dontdump(mem_size, &chunk->mem_pfx);
-
-	if (UNIV_UNLIKELY(chunk->mem == NULL)) {
-
-		return(NULL);
-	}
+  MEM_UNDEFINED(mem, mem_size());
 
 #ifdef HAVE_LIBNUMA
-	if (srv_numa_interleave) {
-		struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
-		int	st = mbind(chunk->mem, chunk->mem_size(),
-				   MPOL_INTERLEAVE,
-				   numa_mems_allowed->maskp,
-				   numa_mems_allowed->size,
-				   MPOL_MF_MOVE);
-		if (st != 0) {
-			ib::warn() << "Failed to set NUMA memory policy of"
-				" buffer pool page frames to MPOL_INTERLEAVE"
-				" (error: " << strerror(errno) << ").";
-		}
-		numa_bitmask_free(numa_mems_allowed);
-	}
+  if (srv_numa_interleave)
+  {
+    struct bitmask *numa_mems_allowed= numa_get_mems_allowed();
+    if (mbind(mem, mem_size(), MPOL_INTERLEAVE,
+              numa_mems_allowed->maskp, numa_mems_allowed->size,
+              MPOL_MF_MOVE))
+    {
+      ib::warn() << "Failed to set NUMA memory policy of"
+              " buffer pool page frames to MPOL_INTERLEAVE"
+              " (error: " << strerror(errno) << ").";
+    }
+    numa_bitmask_free(numa_mems_allowed);
+  }
 #endif /* HAVE_LIBNUMA */
 
 
-	/* Allocate the block descriptors from
-	the start of the memory block. */
-	chunk->blocks = (buf_block_t*) chunk->mem;
-
-	/* Align a pointer to the first frame.  Note that when
-	opt_large_page_size is smaller than srv_page_size,
-	we may allocate one fewer block than requested.  When
-	it is bigger, we may allocate more blocks than requested. */
-
-	frame = (byte*) ut_align(chunk->mem, srv_page_size);
-	chunk->size = (chunk->mem_pfx.m_size >> srv_page_size_shift)
-		- (frame != chunk->mem);
+  /* Allocate the block descriptors from
+  the start of the memory block. */
+  blocks= reinterpret_cast<buf_block_t*>(mem);
 
-	/* Subtract the space needed for block descriptors. */
-	{
-		ulint	size = chunk->size;
-
-		while (frame < (byte*) (chunk->blocks + size)) {
-			frame += srv_page_size;
-			size--;
-		}
+  /* Align a pointer to the first frame.  Note that when
+  opt_large_page_size is smaller than srv_page_size
+  (we do not expect any hardware to make this true,
+  because srv_page_size is at most 64KiB),
+  we may allocate one fewer block than requested.  When
+  it is bigger, we may allocate more blocks than requested. */
+  static_assert(sizeof(byte*) == sizeof(ulint), "pointer size");
 
-		chunk->size = size;
-	}
+  byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) +
+                                        srv_page_size - 1) &
+                                       ~ulint{srv_page_size - 1});
+  size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem);
 
-	/* Init block structs and assign frames for them. Then we
-	assign the frames to the first blocks (we already mapped the
-	memory above). */
+  /* Subtract the space needed for block descriptors. */
+  {
+    ulint s= size;
 
-	block = chunk->blocks;
+    while (frame < reinterpret_cast<const byte*>(blocks + s))
+    {
+      frame+= srv_page_size;
+      s--;
+    }
 
-	for (i = chunk->size; i--; ) {
+    size= s;
+  }
 
-		buf_block_init(buf_pool, block, frame);
-		MEM_UNDEFINED(block->frame, srv_page_size);
+  /* Initialize each block descriptor, assign it a page frame and add
+  it to buf_pool.free (the memory was already mapped above). */
 
-		/* Add the block to the free list */
-		UT_LIST_ADD_LAST(buf_pool->free, &block->page);
+  buf_block_t *block= blocks;
 
-		ut_d(block->page.in_free_list = TRUE);
-		ut_ad(buf_pool_from_block(block) == buf_pool);
+  for (auto i= size; i--; ) {
+    buf_block_init(block, frame);
+    MEM_UNDEFINED(block->frame, srv_page_size);
+    /* Add the block to the free list */
+    UT_LIST_ADD_LAST(buf_pool.free, &block->page);
 
-		block++;
-		frame += srv_page_size;
-	}
+    ut_d(block->page.in_free_list = TRUE);
+    block++;
+    frame+= srv_page_size;
+  }
 
-	buf_pool_register_chunk(chunk);
+  reg();
 
-#ifdef PFS_GROUP_BUFFER_SYNC
-	pfs_register_buffer_block(chunk);
-#endif /* PFS_GROUP_BUFFER_SYNC */
-	return(chunk);
+  return true;
 }
 
 #ifdef UNIV_DEBUG
-/*********************************************************************//**
-Finds a block in the given buffer chunk that points to a
-given compressed page.
-@return buffer block pointing to the compressed page, or NULL */
-static
-buf_block_t*
-buf_chunk_contains_zip(
-/*===================*/
-	buf_chunk_t*	chunk,	/*!< in: chunk being checked */
-	const void*	data)	/*!< in: pointer to compressed page */
+/** Check that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
 {
-	buf_block_t*	block;
-	ulint		i;
-
-	block = chunk->blocks;
-
-	for (i = chunk->size; i--; block++) {
-		if (block->page.zip.data == data) {
-
-			return(block);
-		}
-	}
-
-	return(NULL);
-}
-
-/*********************************************************************//**
-Finds a block in the buffer pool that points to a
-given compressed page.
-@return buffer block pointing to the compressed page, or NULL */
-buf_block_t*
-buf_pool_contains_zip(
-/*==================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	const void*	data)		/*!< in: pointer to compressed page */
-{
-	ulint		n;
-	buf_chunk_t*	chunk = buf_pool->chunks;
-
-	ut_ad(buf_pool);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	for (n = buf_pool->n_chunks; n--; chunk++) {
-
-		buf_block_t* block = buf_chunk_contains_zip(chunk, data);
-
-		if (block) {
-			return(block);
-		}
-	}
+  buf_block_t *block= blocks;
+  for (auto i= size; i--; block++)
+  {
+    switch (block->page.state()) {
+    case BUF_BLOCK_ZIP_PAGE:
+      /* The uncompressed buffer pool should never
+      contain ROW_FORMAT=COMPRESSED block descriptors. */
+      ut_error;
+      break;
+    case BUF_BLOCK_NOT_USED:
+    case BUF_BLOCK_MEMORY:
+    case BUF_BLOCK_REMOVE_HASH:
+      /* Skip blocks that are not being used for file pages. */
+      break;
+    case BUF_BLOCK_FILE_PAGE:
+      const lsn_t lsn= block->page.oldest_modification();
+
+      if (srv_read_only_mode)
+      {
+        /* The page cleaner is disabled in read-only mode.  No pages
+        can be dirtied, so all of them must be clean. */
+        ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn ||
+              srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+        ut_ad(!block->page.buf_fix_count());
+        ut_ad(block->page.io_fix() == BUF_IO_NONE);
+        break;
+      }
+
+      if (fsp_is_system_temporary(block->page.id().space()))
+      {
+        ut_ad(lsn == 0 || lsn == 2);
+        break;
+      }
+
+      if (lsn > 1 || !block->page.can_relocate())
+        return block;
+
+      break;
+    }
+  }
 
-	return(NULL);
+  return nullptr;
 }
 #endif /* UNIV_DEBUG */
 
-/*********************************************************************//**
-Checks that all file pages in the buffer chunk are in a replaceable state.
-@return address of a non-free block, or NULL if all freed */
-static
-const buf_block_t*
-buf_chunk_not_freed(
-/*================*/
-	buf_chunk_t*	chunk)	/*!< in: chunk being checked */
-{
-	buf_block_t*	block;
-	ulint		i;
-
-	block = chunk->blocks;
-
-	for (i = chunk->size; i--; block++) {
-		ibool	ready;
-
-		switch (buf_block_get_state(block)) {
-		case BUF_BLOCK_POOL_WATCH:
-		case BUF_BLOCK_ZIP_PAGE:
-		case BUF_BLOCK_ZIP_DIRTY:
-			/* The uncompressed buffer pool should never
-			contain compressed block descriptors. */
-			ut_error;
-			break;
-		case BUF_BLOCK_NOT_USED:
-		case BUF_BLOCK_READY_FOR_USE:
-		case BUF_BLOCK_MEMORY:
-		case BUF_BLOCK_REMOVE_HASH:
-			/* Skip blocks that are not being used for
-			file pages. */
-			break;
-		case BUF_BLOCK_FILE_PAGE:
-			if (srv_read_only_mode) {
-				/* The page cleaner is disabled in
-				read-only mode.  No pages can be
-				dirtied, so all of them must be clean. */
-				ut_ad(block->page.oldest_modification
-				      == block->page.newest_modification);
-				ut_ad(block->page.oldest_modification == 0
-				      || block->page.oldest_modification
-				      == recv_sys.recovered_lsn
-				      || srv_force_recovery
-				      == SRV_FORCE_NO_LOG_REDO);
-				ut_ad(block->page.buf_fix_count == 0);
-				ut_ad(block->page.io_fix == BUF_IO_NONE);
-				break;
-			}
-
-			buf_page_mutex_enter(block);
-			ready = buf_flush_ready_for_replace(&block->page);
-			buf_page_mutex_exit(block);
-
-			if (!ready) {
-				return(block);
-			}
-
-			break;
-		}
-	}
-
-	return(NULL);
-}
-
-/********************************************************************//**
-Set buffer pool size variables after resizing it */
-static
-void
-buf_pool_set_sizes(void)
-/*====================*/
-{
-	ulint	i;
-	ulint	curr_size = 0;
-
-	buf_pool_mutex_enter_all();
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		curr_size += buf_pool->curr_pool_size;
-	}
-
-	srv_buf_pool_curr_size = curr_size;
-	srv_buf_pool_old_size = srv_buf_pool_size;
-	srv_buf_pool_base_size = srv_buf_pool_size;
-
-	buf_pool_mutex_exit_all();
-}
-
 /** Free the synchronization objects of a buffer pool block descriptor
 @param[in,out]	block	buffer pool block descriptor */
 static void buf_block_free_mutexes(buf_block_t* block)
 {
-	mutex_free(&block->mutex);
 	rw_lock_free(&block->lock);
 	ut_d(rw_lock_free(block->debug_latch));
 	ut_d(ut_free(block->debug_latch));
 }
 
-/********************************************************************//**
-Initialize a buffer pool instance.
-@return DB_SUCCESS if all goes well. */
-static
-ulint
-buf_pool_init_instance(
-/*===================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		buf_pool_size,	/*!< in: size in bytes */
-	ulint		instance_no)	/*!< in: id of the instance */
+/** Create the hash table.
+@param n  the lower bound of n_cells */
+void buf_pool_t::page_hash_table::create(ulint n)
 {
-	ulint		i;
-	ulint		chunk_size;
-	buf_chunk_t*	chunk;
-
-	ut_ad(buf_pool_size % srv_buf_pool_chunk_unit == 0);
-
-	/* 1. Initialize general fields
-	------------------------------- */
-	mutex_create(LATCH_ID_BUF_POOL, &buf_pool->mutex);
-
-	mutex_create(LATCH_ID_BUF_POOL_ZIP, &buf_pool->zip_mutex);
-
-	new(&buf_pool->allocator)
-		ut_allocator<unsigned char>(mem_key_buf_buf_pool);
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool_size > 0) {
-		buf_pool->n_chunks
-			= buf_pool_size / srv_buf_pool_chunk_unit;
-		chunk_size = srv_buf_pool_chunk_unit;
-
-		buf_pool->chunks =
-			reinterpret_cast<buf_chunk_t*>(ut_zalloc_nokey(
-				buf_pool->n_chunks * sizeof(*chunk)));
-		buf_pool->chunks_old = NULL;
-
-		UT_LIST_INIT(buf_pool->LRU, &buf_page_t::LRU);
-		UT_LIST_INIT(buf_pool->free, &buf_page_t::list);
-		UT_LIST_INIT(buf_pool->withdraw, &buf_page_t::list);
-		buf_pool->withdraw_target = 0;
-		UT_LIST_INIT(buf_pool->flush_list, &buf_page_t::list);
-		UT_LIST_INIT(buf_pool->unzip_LRU, &buf_block_t::unzip_LRU);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		UT_LIST_INIT(buf_pool->zip_clean, &buf_page_t::list);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-		for (i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
-			UT_LIST_INIT(
-				buf_pool->zip_free[i], &buf_buddy_free_t::list);
-		}
-
-		buf_pool->curr_size = 0;
-		chunk = buf_pool->chunks;
-
-		do {
-			if (!buf_chunk_init(buf_pool, chunk, chunk_size)) {
-				while (--chunk >= buf_pool->chunks) {
-					buf_block_t*	block = chunk->blocks;
-
-					for (i = chunk->size; i--; block++) {
-						buf_block_free_mutexes(block);
-					}
-
-					buf_pool->allocator.deallocate_large_dodump(
-						chunk->mem, &chunk->mem_pfx, chunk->mem_size());
-				}
-				ut_free(buf_pool->chunks);
-				buf_pool_mutex_exit(buf_pool);
-
-				/* InnoDB should free the mutex which was
-				created so far before freeing the instance */
-				mutex_free(&buf_pool->mutex);
-				mutex_free(&buf_pool->zip_mutex);
-				return(DB_ERROR);
-			}
-
-			buf_pool->curr_size += chunk->size;
-		} while (++chunk < buf_pool->chunks + buf_pool->n_chunks);
-
-		buf_pool->instance_no = instance_no;
-		buf_pool->read_ahead_area =
-			ut_min(BUF_READ_AHEAD_PAGES,
-			       ut_2_power_up(buf_pool->curr_size /
-					     BUF_READ_AHEAD_PORTION));
-		buf_pool->curr_pool_size = buf_pool_size;
-
-		buf_pool->old_size = buf_pool->curr_size;
-		buf_pool->n_chunks_new = buf_pool->n_chunks;
-
-		/* Number of locks protecting page_hash must be a
-		power of two */
-		srv_n_page_hash_locks = static_cast<ulong>(
-			 ut_2_power_up(srv_n_page_hash_locks));
-		ut_a(srv_n_page_hash_locks != 0);
-		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
-
-		buf_pool->page_hash = ib_create(
-			2 * buf_pool->curr_size,
-			LATCH_ID_HASH_TABLE_RW_LOCK,
-			srv_n_page_hash_locks, MEM_HEAP_FOR_PAGE_HASH);
-
-		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
-
-		buf_pool->last_printout_time = time(NULL);
-	}
-	/* 2. Initialize flushing fields
-	-------------------------------- */
-
-	mutex_create(LATCH_ID_FLUSH_LIST, &buf_pool->flush_list_mutex);
-
-	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
-		buf_pool->no_flush[i] = os_event_create(0);
-	}
-
-	buf_pool->watch = (buf_page_t*) ut_zalloc_nokey(
-		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
-	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
-		buf_pool->watch[i].buf_pool_index
-			= unsigned(buf_pool->instance_no);
-	}
-
-	/* All fields are initialized by ut_zalloc_nokey(). */
-
-	buf_pool->try_LRU_scan = TRUE;
-
-	/* Initialize the hazard pointer for flush_list batches */
-	new(&buf_pool->flush_hp)
-		FlushHp(buf_pool, &buf_pool->flush_list_mutex);
-
-	/* Initialize the hazard pointer for LRU batches */
-	new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex);
-
-	/* Initialize the iterator for LRU scan search */
-	new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);
-
-	/* Initialize the iterator for single page scan search */
-	new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);
-
-	/* Initialize the temporal memory array and slots */
-	new(&buf_pool->io_buf) buf_pool_t::io_buf_t(
-		(srv_n_read_io_threads + srv_n_write_io_threads)
-		* (8 * OS_AIO_N_PENDING_IOS_PER_THREAD));
-
-	buf_pool_mutex_exit(buf_pool);
-
-	DBUG_EXECUTE_IF("buf_pool_init_instance_force_oom",
-		return(DB_ERROR); );
-
-	return(DB_SUCCESS);
+  n_cells= ut_find_prime(n);
+  const size_t size= pad(n_cells) * sizeof *array;
+  void* v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+  memset(v, 0, size);
+  array= static_cast<hash_cell_t*>(v);
 }
 
-/********************************************************************//**
-free one buffer pool instance */
-static
-void
-buf_pool_free_instance(
-/*===================*/
-	buf_pool_t*	buf_pool)	/* in,own: buffer pool instance
-					to free */
+/** Create the buffer pool.
+@return whether the creation failed */
+bool buf_pool_t::create()
 {
-	buf_chunk_t*	chunk;
-	buf_chunk_t*	chunks;
-	buf_page_t*	bpage;
-	buf_page_t*	prev_bpage = 0;
-
-	mutex_free(&buf_pool->mutex);
-	mutex_free(&buf_pool->zip_mutex);
-	mutex_free(&buf_pool->flush_list_mutex);
-
-	if (buf_pool->flush_rbt) {
-		rbt_free(buf_pool->flush_rbt);
-		buf_pool->flush_rbt = NULL;
-	}
-
-	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-	     bpage != NULL;
-	     bpage = prev_bpage) {
-
-		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
-		buf_page_state	state = buf_page_get_state(bpage);
-
-		ut_ad(buf_page_in_file(bpage));
-		ut_ad(bpage->in_LRU_list);
-
-		if (state != BUF_BLOCK_FILE_PAGE) {
-			/* We must not have any dirty block except
-			when doing a fast shutdown. */
-			ut_ad(state == BUF_BLOCK_ZIP_PAGE
-			      || srv_fast_shutdown == 2);
-			buf_page_free_descriptor(bpage);
-		}
-	}
-
-	ut_free(buf_pool->watch);
-	buf_pool->watch = NULL;
+  ut_ad(this == &buf_pool);
+  ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0);
+  ut_ad(!is_initialised());
+  ut_ad(srv_buf_pool_size > 0);
+  ut_ad(!resizing);
+  ut_ad(!chunks_old);
+  ut_ad(!field_ref_zero);
 
-	chunks = buf_pool->chunks;
-	chunk = chunks + buf_pool->n_chunks;
+  NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
 
-	while (--chunk >= chunks) {
-		buf_block_t*	block = chunk->blocks;
+  if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096))
+    field_ref_zero= static_cast<const byte*>
+      (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX));
+  else
+    return true;
 
-		for (ulint i = chunk->size; i--; block++) {
-			buf_block_free_mutexes(block);
-		}
+  chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map());
 
-		buf_pool->allocator.deallocate_large_dodump(
-			chunk->mem, &chunk->mem_pfx, chunk->mem_size());
-	}
+  new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
 
-	for (ulint i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) {
-		os_event_destroy(buf_pool->no_flush[i]);
-	}
+  n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit;
+  const size_t chunk_size= srv_buf_pool_chunk_unit;
 
-	ut_free(buf_pool->chunks);
-	ha_clear(buf_pool->page_hash);
-	hash_table_free(buf_pool->page_hash);
-	hash_table_free(buf_pool->zip_hash);
+  chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks));
+  UT_LIST_INIT(free, &buf_page_t::list);
+  curr_size= 0;
+  auto chunk= chunks;
 
-	buf_pool->io_buf.~io_buf_t();
-	buf_pool->allocator.~ut_allocator();
+  do
+  {
+    if (!chunk->create(chunk_size))
+    {
+      while (--chunk >= chunks)
+      {
+        buf_block_t* block= chunk->blocks;
+
+        for (auto i= chunk->size; i--; block++)
+          buf_block_free_mutexes(block);
+
+        allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+      }
+      ut_free(chunks);
+      chunks= nullptr;
+      UT_DELETE(chunk_t::map_reg);
+      chunk_t::map_reg= nullptr;
+      aligned_free(const_cast<byte*>(field_ref_zero));
+      field_ref_zero= nullptr;
+      ut_ad(!is_initialised());
+      return true;
+    }
+
+    curr_size+= chunk->size;
+  }
+  while (++chunk < chunks + n_chunks);
+
+  ut_ad(is_initialised());
+  mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
+
+  UT_LIST_INIT(LRU, &buf_page_t::LRU);
+  UT_LIST_INIT(withdraw, &buf_page_t::list);
+  withdraw_target= 0;
+  UT_LIST_INIT(flush_list, &buf_page_t::list);
+  UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU);
+
+  for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
+    UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
+  ulint s= curr_size;
+  old_size= s;
+  s/= BUF_READ_AHEAD_PORTION;
+  read_ahead_area= s >= READ_AHEAD_PAGES
+    ? READ_AHEAD_PAGES
+    : my_round_up_to_next_power(static_cast<uint32_t>(s));
+  curr_pool_size= srv_buf_pool_size;
+
+  n_chunks_new= n_chunks;
+
+  page_hash.create(2 * curr_size);
+  zip_hash.create(2 * curr_size);
+  last_printout_time= time(NULL);
+
+  mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex,
+                   MY_MUTEX_INIT_FAST);
+
+  pthread_cond_init(&done_flush_LRU, nullptr);
+  pthread_cond_init(&done_flush_list, nullptr);
+  pthread_cond_init(&do_flush_list, nullptr);
+  pthread_cond_init(&done_free, nullptr);
+
+  try_LRU_scan= true;
+
+  ut_d(flush_hp.m_mutex= &flush_list_mutex;);
+  ut_d(lru_hp.m_mutex= &mutex);
+  ut_d(lru_scan_itr.m_mutex= &mutex);
+
+  io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) *
+                OS_AIO_N_PENDING_IOS_PER_THREAD);
+
+  /* FIXME: remove some of these variables */
+  srv_buf_pool_curr_size= curr_pool_size;
+  srv_buf_pool_old_size= srv_buf_pool_size;
+  srv_buf_pool_base_size= srv_buf_pool_size;
+
+  last_activity_count= srv_get_activity_count();
+
+  chunk_t::map_ref= chunk_t::map_reg;
+  buf_LRU_old_ratio_update(100 * 3 / 8, false);
+  btr_search_sys_create();
+  ut_ad(is_initialised());
+  return false;
 }
 
-/********************************************************************//**
-Creates the buffer pool.
-@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
-dberr_t
-buf_pool_init(
-/*==========*/
-	ulint	total_size,	/*!< in: size of the total pool in bytes */
-	ulint	n_instances)	/*!< in: number of instances */
+/** Clean up after successful create() */
+void buf_pool_t::close()
 {
-	ulint		i;
-	const ulint	size	= total_size / n_instances;
-
-	ut_ad(n_instances > 0);
-	ut_ad(n_instances <= MAX_BUFFER_POOLS);
-	ut_ad(n_instances == srv_buf_pool_instances);
-
-	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
-
-	buf_pool_resizing = false;
-
-	buf_pool_ptr = (buf_pool_t*) ut_zalloc_nokey(
-		n_instances * sizeof *buf_pool_ptr);
-
-	buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
-
-	for (i = 0; i < n_instances; i++) {
-		buf_pool_t*	ptr	= &buf_pool_ptr[i];
-
-		if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
+  ut_ad(this == &buf_pool);
+  if (!is_initialised())
+    return;
 
-			/* Free all the instances created so far. */
-			buf_pool_free(i);
+  mysql_mutex_destroy(&mutex);
+  mysql_mutex_destroy(&flush_list_mutex);
 
-			return(DB_ERROR);
-		}
-	}
-
-	buf_chunk_map_ref = buf_chunk_map_reg;
-
-	buf_pool_set_sizes();
-	buf_LRU_old_ratio_update(100 * 3/ 8, FALSE);
-
-	btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
+       bpage= prev_bpage)
+  {
+    prev_bpage= UT_LIST_GET_PREV(LRU, bpage);
+    ut_ad(bpage->in_file());
+    ut_ad(bpage->in_LRU_list);
+    /* The buffer pool must be clean during normal shutdown.
+    Only on aborted startup (with recovery) or with innodb_fast_shutdown=2
+    we may discard changes. */
+    ut_d(const lsn_t oldest= bpage->oldest_modification();)
+    ut_ad(fsp_is_system_temporary(bpage->id().space())
+          ? (oldest == 0 || oldest == 2)
+          : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);
+
+    if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+      buf_page_free_descriptor(bpage);
+  }
 
-	return(DB_SUCCESS);
-}
+  for (auto chunk= chunks + n_chunks; --chunk >= chunks; )
+  {
+    buf_block_t *block= chunk->blocks;
 
-/********************************************************************//**
-Frees the buffer pool at shutdown.  This must not be invoked before
-freeing all mutexes. */
-void
-buf_pool_free(
-/*==========*/
-	ulint	n_instances)	/*!< in: numbere of instances to free */
-{
-	for (ulint i = 0; i < n_instances; i++) {
-		buf_pool_free_instance(buf_pool_from_array(i));
-	}
+    for (auto i= chunk->size; i--; block++)
+      buf_block_free_mutexes(block);
 
-	UT_DELETE(buf_chunk_map_reg);
-	buf_chunk_map_reg = buf_chunk_map_ref = NULL;
+    allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+  }
 
-	ut_free(buf_pool_ptr);
-	buf_pool_ptr = NULL;
+  pthread_cond_destroy(&done_flush_LRU);
+  pthread_cond_destroy(&done_flush_list);
+  pthread_cond_destroy(&do_flush_list);
+  pthread_cond_destroy(&done_free);
+
+  ut_free(chunks);
+  chunks= nullptr;
+  page_hash.free();
+  zip_hash.free();
+
+  io_buf.close();
+  UT_DELETE(chunk_t::map_reg);
+  chunk_t::map_reg= chunk_t::map_ref= nullptr;
+  aligned_free(const_cast<byte*>(field_ref_zero));
+  field_ref_zero= nullptr;
 }
 
-/** Reallocate a control block.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	block		pointer to control block
-@retval false	if failed because of no free blocks. */
-static
-bool
-buf_page_realloc(
-	buf_pool_t*	buf_pool,
-	buf_block_t*	block)
+/** Try to reallocate a control block.
+@param block  control block to reallocate
+@return whether the reallocation succeeded */
+inline bool buf_pool_t::realloc(buf_block_t *block)
 {
 	buf_block_t*	new_block;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	mysql_mutex_assert_owner(&mutex);
+	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
 
-	new_block = buf_LRU_get_free_only(buf_pool);
+	new_block = buf_LRU_get_free_only();
 
 	if (new_block == NULL) {
-		return(false); /* free_list was not enough */
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		page_cleaner_wakeup();
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+		return(false); /* free list was not enough */
 	}
 
-	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, block->page.id);
-
-	rw_lock_x_lock(hash_lock);
-	mutex_enter(&block->mutex);
+	const page_id_t id(block->page.id());
+	page_hash_latch* hash_lock = hash_lock_get(id);
+	hash_lock->write_lock();
 
-	if (buf_page_can_relocate(&block->page)) {
-		mutex_enter(&new_block->mutex);
-
-		memcpy(new_block->frame, block->frame, srv_page_size);
+	if (block->page.can_relocate()) {
+		memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
+			new_block->frame, block->frame, srv_page_size);
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
 		new (&new_block->page) buf_page_t(block->page);
 
 		/* relocate LRU list */
-		ut_ad(block->page.in_LRU_list);
-		ut_ad(!block->page.in_zip_hash);
-		ut_d(block->page.in_LRU_list = FALSE);
-
-		buf_LRU_adjust_hp(buf_pool, &block->page);
-
-		buf_page_t*	prev_b = UT_LIST_GET_PREV(LRU, &block->page);
-		UT_LIST_REMOVE(buf_pool->LRU, &block->page);
-
-		if (prev_b != NULL) {
-			UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, &new_block->page);
+		if (buf_page_t*	prev_b = buf_pool.LRU_remove(&block->page)) {
+			UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page);
 		} else {
-			UT_LIST_ADD_FIRST(buf_pool->LRU, &new_block->page);
+			UT_LIST_ADD_FIRST(LRU, &new_block->page);
 		}
 
-		if (buf_pool->LRU_old == &block->page) {
-			buf_pool->LRU_old = &new_block->page;
+		if (LRU_old == &block->page) {
+			LRU_old = &new_block->page;
 		}
 
 		ut_ad(new_block->page.in_LRU_list);
@@ -2138,55 +1554,54 @@ buf_page_realloc(
 		/* relocate unzip_LRU list */
 		if (block->page.zip.data != NULL) {
 			ut_ad(block->in_unzip_LRU_list);
-			ut_d(new_block->in_unzip_LRU_list = TRUE);
+			ut_d(new_block->in_unzip_LRU_list = true);
 
 			buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
-			UT_LIST_REMOVE(buf_pool->unzip_LRU, block);
+			UT_LIST_REMOVE(unzip_LRU, block);
 
-			ut_d(block->in_unzip_LRU_list = FALSE);
+			ut_d(block->in_unzip_LRU_list = false);
 			block->page.zip.data = NULL;
 			page_zip_set_size(&block->page.zip, 0);
 
 			if (prev_block != NULL) {
-				UT_LIST_INSERT_AFTER(buf_pool->unzip_LRU, prev_block, new_block);
+				UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block);
 			} else {
-				UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, new_block);
+				UT_LIST_ADD_FIRST(unzip_LRU, new_block);
 			}
 		} else {
 			ut_ad(!block->in_unzip_LRU_list);
-			ut_d(new_block->in_unzip_LRU_list = FALSE);
+			ut_d(new_block->in_unzip_LRU_list = false);
 		}
 
-		/* relocate buf_pool->page_hash */
+		/* relocate page_hash */
 		ut_ad(block->page.in_page_hash);
-		ut_ad(&block->page == buf_page_hash_get_low(buf_pool,
-							    block->page.id));
-		ut_d(block->page.in_page_hash = FALSE);
-		ulint	fold = block->page.id.fold();
-		ut_ad(fold == new_block->page.id.fold());
-		HASH_REPLACE(buf_page_t, hash, buf_pool->page_hash, fold,
-			     &block->page, &new_block->page);
-
 		ut_ad(new_block->page.in_page_hash);
+		const ulint fold = id.fold();
+		ut_ad(&block->page == page_hash_get_low(id, fold));
+		ut_d(block->page.in_page_hash = false);
+		HASH_REPLACE(buf_page_t, hash, &page_hash, fold,
+			     &block->page, &new_block->page);
 
 		buf_block_modify_clock_inc(block);
-		memset(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
-		memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+		memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
+		static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+			      "not perfect alignment");
+		memset_aligned<2>(block->frame
+				  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
 		MEM_UNDEFINED(block->frame, srv_page_size);
-		buf_block_set_state(block, BUF_BLOCK_REMOVE_HASH);
-		block->page.id
-		    = page_id_t(ULINT32_UNDEFINED, ULINT32_UNDEFINED);
-
-		/* Relocate buf_pool->flush_list. */
-		if (block->page.oldest_modification) {
-			buf_flush_relocate_on_flush_list(
-				&block->page, &new_block->page);
+		block->page.set_state(BUF_BLOCK_REMOVE_HASH);
+		if (!fsp_is_system_temporary(id.space())) {
+			buf_flush_relocate_on_flush_list(&block->page,
+							 &new_block->page);
 		}
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+		block->page.set_corrupt_id();
 
 		/* set other flags of buf_block_t */
 
 #ifdef BTR_CUR_HASH_ADAPT
-		/* This code should only be executed by buf_pool_resize(),
+		/* This code should only be executed by resize(),
 		while the adaptive hash index is disabled. */
 		assert_block_ahi_empty(block);
 		assert_block_ahi_empty_on_init(new_block);
@@ -2196,30 +1611,13 @@ buf_page_realloc(
 		new_block->n_fields	= 1;
 		new_block->left_side	= TRUE;
 #endif /* BTR_CUR_HASH_ADAPT */
-
-		new_block->lock_hash_val = block->lock_hash_val;
-		ut_ad(new_block->lock_hash_val == lock_rec_hash(
-			new_block->page.id.space(),
-			new_block->page.id.page_no()));
-
-		rw_lock_x_unlock(hash_lock);
-		mutex_exit(&new_block->mutex);
-
+		ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
 		/* free block */
-		buf_block_set_state(block, BUF_BLOCK_MEMORY);
-		buf_LRU_block_free_non_file_page(block);
-
-		mutex_exit(&block->mutex);
-	} else {
-		rw_lock_x_unlock(hash_lock);
-		mutex_exit(&block->mutex);
-
-		/* free new_block */
-		mutex_enter(&new_block->mutex);
-		buf_LRU_block_free_non_file_page(new_block);
-		mutex_exit(&new_block->mutex);
+		new_block = block;
 	}
 
+	hash_lock->write_unlock();
+	buf_LRU_block_free_non_file_page(new_block);
 	return(true); /* free_list was enough */
 }
 
@@ -2248,244 +1646,131 @@ buf_resize_status(
 	ib::info() << export_vars.innodb_buffer_pool_resize_status;
 }
 
-/** Determines if a block is intended to be withdrawn.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	block		pointer to control block
-@retval true	if will be withdrawn */
-bool
-buf_block_will_withdrawn(
-	buf_pool_t*		buf_pool,
-	const buf_block_t*	block)
-{
-	ut_ad(buf_pool->curr_size < buf_pool->old_size);
-	ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));
-
-	const buf_chunk_t*	chunk
-		= buf_pool->chunks + buf_pool->n_chunks_new;
-	const buf_chunk_t*	echunk
-		= buf_pool->chunks + buf_pool->n_chunks;
-
-	while (chunk < echunk) {
-		if (block >= chunk->blocks
-		    && block < chunk->blocks + chunk->size) {
-			return(true);
-		}
-		++chunk;
-	}
-
-	return(false);
-}
-
-/** Determines if a frame is intended to be withdrawn.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	ptr		pointer to a frame
-@retval true	if will be withdrawn */
-bool
-buf_frame_will_withdrawn(
-	buf_pool_t*	buf_pool,
-	const byte*	ptr)
-{
-	ut_ad(buf_pool->curr_size < buf_pool->old_size);
-	ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));
-
-	const buf_chunk_t*	chunk
-		= buf_pool->chunks + buf_pool->n_chunks_new;
-	const buf_chunk_t*	echunk
-		= buf_pool->chunks + buf_pool->n_chunks;
-
-	while (chunk < echunk) {
-		if (ptr >= chunk->blocks->frame
-		    && ptr < (chunk->blocks + chunk->size - 1)->frame
-			     + srv_page_size) {
-			return(true);
-		}
-		++chunk;
-	}
-
-	return(false);
-}
-
-/** Withdraw the buffer pool blocks from end of the buffer pool instance
-until withdrawn by buf_pool->withdraw_target.
-@param[in]	buf_pool	buffer pool instance
-@retval true	if retry is needed */
-static
-bool
-buf_pool_withdraw_blocks(
-	buf_pool_t*	buf_pool)
+/** Withdraw blocks from the buffer pool until meeting withdraw_target.
+@return whether retry is needed */
+inline bool buf_pool_t::withdraw_blocks()
 {
 	buf_block_t*	block;
 	ulint		loop_count = 0;
-	ulint		i = buf_pool_index(buf_pool);
 
-	ib::info() << "buffer pool " << i
-		<< " : start to withdraw the last "
-		<< buf_pool->withdraw_target << " blocks.";
+	ib::info() << "start to withdraw the last "
+		<< withdraw_target << " blocks";
 
-	/* Minimize buf_pool->zip_free[i] lists */
-	buf_pool_mutex_enter(buf_pool);
-	buf_buddy_condense_free(buf_pool);
-	buf_pool_mutex_exit(buf_pool);
+	/* Minimize zip_free[i] lists */
+	mysql_mutex_lock(&mutex);
+	buf_buddy_condense_free();
+	mysql_mutex_unlock(&mutex);
 
-	while (UT_LIST_GET_LEN(buf_pool->withdraw)
-	       < buf_pool->withdraw_target) {
+	while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
 
 		/* try to withdraw from free_list */
 		ulint	count1 = 0;
 
-		buf_pool_mutex_enter(buf_pool);
+		mysql_mutex_lock(&mutex);
 		block = reinterpret_cast<buf_block_t*>(
-			UT_LIST_GET_FIRST(buf_pool->free));
+			UT_LIST_GET_FIRST(free));
 		while (block != NULL
-		       && UT_LIST_GET_LEN(buf_pool->withdraw)
-			  < buf_pool->withdraw_target) {
+		       && UT_LIST_GET_LEN(withdraw) < withdraw_target) {
 			ut_ad(block->page.in_free_list);
-			ut_ad(!block->page.in_flush_list);
+			ut_ad(!block->page.oldest_modification());
 			ut_ad(!block->page.in_LRU_list);
-			ut_a(!buf_page_in_file(&block->page));
+			ut_a(!block->page.in_file());
 
 			buf_block_t*	next_block;
 			next_block = reinterpret_cast<buf_block_t*>(
 				UT_LIST_GET_NEXT(
 					list, &block->page));
 
-			if (buf_block_will_withdrawn(buf_pool, block)) {
+			if (will_be_withdrawn(block->page)) {
 				/* This should be withdrawn */
-				UT_LIST_REMOVE(
-					buf_pool->free,
-					&block->page);
-				UT_LIST_ADD_LAST(
-					buf_pool->withdraw,
-					&block->page);
-				ut_d(block->in_withdraw_list = TRUE);
+				UT_LIST_REMOVE(free, &block->page);
+				UT_LIST_ADD_LAST(withdraw, &block->page);
+				ut_d(block->in_withdraw_list = true);
 				count1++;
 			}
 
 			block = next_block;
 		}
-		buf_pool_mutex_exit(buf_pool);
+		mysql_mutex_unlock(&mutex);
 
 		/* reserve free_list length */
-		if (UT_LIST_GET_LEN(buf_pool->withdraw)
-		    < buf_pool->withdraw_target) {
-			ulint	scan_depth;
-			flush_counters_t n;
-
-			/* cap scan_depth with current LRU size. */
-			buf_pool_mutex_enter(buf_pool);
-			scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
-			buf_pool_mutex_exit(buf_pool);
-
-			scan_depth = ut_min(
-				ut_max(buf_pool->withdraw_target
-				       - UT_LIST_GET_LEN(buf_pool->withdraw),
-				       static_cast<ulint>(srv_LRU_scan_depth)),
-				scan_depth);
-
-			buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU,
-				scan_depth, 0, &n);
-			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
-
-			if (n.flushed) {
+		if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+			ulint n_flushed = buf_flush_LRU(
+				std::max<ulint>(withdraw_target
+						- UT_LIST_GET_LEN(withdraw),
+						srv_LRU_scan_depth));
+			buf_flush_wait_batch_end_acquiring_mutex(true);
+
+			if (n_flushed) {
 				MONITOR_INC_VALUE_CUMULATIVE(
 					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
 					MONITOR_LRU_BATCH_FLUSH_COUNT,
 					MONITOR_LRU_BATCH_FLUSH_PAGES,
-					n.flushed);
+					n_flushed);
 			}
 		}
 
 		/* relocate blocks/buddies in withdrawn area */
 		ulint	count2 = 0;
 
-		buf_pool_mutex_enter(buf_pool);
+		mysql_mutex_lock(&mutex);
 		buf_page_t*	bpage;
-		bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+		bpage = UT_LIST_GET_FIRST(LRU);
 		while (bpage != NULL) {
-			BPageMutex*	block_mutex;
-			buf_page_t*	next_bpage;
-
-			block_mutex = buf_page_get_mutex(bpage);
-			mutex_enter(block_mutex);
-
-			next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
-
+			buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
 			if (bpage->zip.data != NULL
-			    && buf_frame_will_withdrawn(
-				buf_pool,
-				static_cast<byte*>(bpage->zip.data))) {
-
-				if (buf_page_can_relocate(bpage)) {
-					mutex_exit(block_mutex);
-					buf_pool_mutex_exit_forbid(buf_pool);
-					if(!buf_buddy_realloc(
-						buf_pool, bpage->zip.data,
-						page_zip_get_size(
-							&bpage->zip))) {
-
-						/* failed to allocate block */
-						buf_pool_mutex_exit_allow(
-							buf_pool);
-						break;
-					}
-					buf_pool_mutex_exit_allow(buf_pool);
-					mutex_enter(block_mutex);
-					count2++;
+			    && will_be_withdrawn(bpage->zip.data)
+			    && bpage->can_relocate()) {
+				buf_pool_mutex_exit_forbid();
+				if (!buf_buddy_realloc(
+					    bpage->zip.data,
+					    page_zip_get_size(&bpage->zip))) {
+					/* failed to allocate block */
+					buf_pool_mutex_exit_allow();
+					break;
 				}
-				/* NOTE: if the page is in use,
-				not reallocated yet */
+				buf_pool_mutex_exit_allow();
+				count2++;
 			}
 
-			if (buf_page_get_state(bpage)
-			    == BUF_BLOCK_FILE_PAGE
-			    && buf_block_will_withdrawn(
-				buf_pool,
-				reinterpret_cast<buf_block_t*>(bpage))) {
-
-				if (buf_page_can_relocate(bpage)) {
-					mutex_exit(block_mutex);
-					buf_pool_mutex_exit_forbid(buf_pool);
-					if(!buf_page_realloc(
-						buf_pool,
+			if (bpage->state() == BUF_BLOCK_FILE_PAGE
+			    && will_be_withdrawn(*bpage)) {
+				if (bpage->can_relocate()) {
+					buf_pool_mutex_exit_forbid();
+					if (!realloc(
 						reinterpret_cast<buf_block_t*>(
 							bpage))) {
 						/* failed to allocate block */
-						buf_pool_mutex_exit_allow(
-							buf_pool);
+						buf_pool_mutex_exit_allow();
 						break;
 					}
-					buf_pool_mutex_exit_allow(buf_pool);
+					buf_pool_mutex_exit_allow();
 					count2++;
-				} else {
-					mutex_exit(block_mutex);
 				}
 				/* NOTE: if the page is in use,
-				not reallocated yet */
-			} else {
-				mutex_exit(block_mutex);
+				not relocated yet */
 			}
 
 			bpage = next_bpage;
 		}
-		buf_pool_mutex_exit(buf_pool);
+		mysql_mutex_unlock(&mutex);
 
 		buf_resize_status(
-			"buffer pool %lu : withdrawing blocks. (%lu/%lu)",
-			i, UT_LIST_GET_LEN(buf_pool->withdraw),
-			buf_pool->withdraw_target);
+			"withdrawing blocks. (" ULINTPF "/" ULINTPF ")",
+			UT_LIST_GET_LEN(withdraw),
+			withdraw_target);
 
-		ib::info() << "buffer pool " << i << " : withdrew "
+		ib::info() << "withdrew "
 			<< count1 << " blocks from free list."
 			<< " Tried to relocate " << count2 << " pages ("
-			<< UT_LIST_GET_LEN(buf_pool->withdraw) << "/"
-			<< buf_pool->withdraw_target << ").";
+			<< UT_LIST_GET_LEN(withdraw) << "/"
+			<< withdraw_target << ")";
 
 		if (++loop_count >= 10) {
 			/* give up for now.
 			retried after user threads paused. */
 
-			ib::info() << "buffer pool " << i
-				<< " : will retry to withdraw later.";
+			ib::info() << "will retry to withdraw later";
 
 			/* need retry later */
 			return(true);
@@ -2493,165 +1778,118 @@ buf_pool_withdraw_blocks(
 	}
 
 	/* confirm withdrawn enough */
-	const buf_chunk_t*	chunk
-		= buf_pool->chunks + buf_pool->n_chunks_new;
-	const buf_chunk_t*	echunk
-		= buf_pool->chunks + buf_pool->n_chunks;
-
-	while (chunk < echunk) {
+	for (const chunk_t* chunk = chunks + n_chunks_new,
+	     * const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
 		block = chunk->blocks;
 		for (ulint j = chunk->size; j--; block++) {
-			/* If !=BUF_BLOCK_NOT_USED block in the
-			withdrawn area, it means corruption
-			something */
-			ut_a(buf_block_get_state(block)
-				== BUF_BLOCK_NOT_USED);
+			ut_a(block->page.state() == BUF_BLOCK_NOT_USED);
 			ut_ad(block->in_withdraw_list);
 		}
-		++chunk;
 	}
 
-	ib::info() << "buffer pool " << i << " : withdrawn target "
-		<< UT_LIST_GET_LEN(buf_pool->withdraw) << " blocks.";
+	ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw)
+		   << " blocks";
 
 	return(false);
 }
 
-/** resize page_hash and zip_hash for a buffer pool instance.
-@param[in]	buf_pool	buffer pool instance */
-static
-void
-buf_pool_resize_hash(
-	buf_pool_t*	buf_pool)
-{
-	hash_table_t*	new_hash_table;
-
-	/* recreate page_hash */
-	new_hash_table = ib_recreate(
-		buf_pool->page_hash, 2 * buf_pool->curr_size);
-
-	for (ulint i = 0; i < hash_get_n_cells(buf_pool->page_hash); i++) {
-		buf_page_t*	bpage;
-
-		bpage = static_cast<buf_page_t*>(
-			HASH_GET_FIRST(
-				buf_pool->page_hash, i));
 
-		while (bpage) {
-			buf_page_t*	prev_bpage = bpage;
-			ulint		fold;
 
-			bpage = static_cast<buf_page_t*>(
-				HASH_GET_NEXT(
-					hash, prev_bpage));
-
-			fold = prev_bpage->id.fold();
-
-			HASH_DELETE(buf_page_t, hash,
-				buf_pool->page_hash, fold,
-				prev_bpage);
-
-			HASH_INSERT(buf_page_t, hash,
-				new_hash_table, fold,
-				prev_bpage);
-		}
-	}
-
-	/* Concurrent threads may be accessing
-	buf_pool->page_hash->n_cells, n_sync_obj and try to latch
-	sync_obj[i] while we are resizing. Therefore we never
-	deallocate page_hash, instead we overwrite n_cells (and other
-	fields) with the new values. The n_sync_obj and sync_obj are
-	actually same in both. */
-	std::swap(*buf_pool->page_hash, *new_hash_table);
-	hash_table_free(new_hash_table);
-
-	/* recreate zip_hash */
-	new_hash_table = hash_create(2 * buf_pool->curr_size);
-
-	for (ulint i = 0; i < hash_get_n_cells(buf_pool->zip_hash); i++) {
-		buf_page_t*	bpage;
+inline void buf_pool_t::page_hash_table::write_lock_all()
+{
+  for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+  {
+    reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+    if (!n)
+      break;
+  }
+}
 
-		bpage = static_cast<buf_page_t*>(
-			HASH_GET_FIRST(buf_pool->zip_hash, i));
 
-		while (bpage) {
-			buf_page_t*	prev_bpage = bpage;
-			ulint		fold;
+inline void buf_pool_t::page_hash_table::write_unlock_all()
+{
+  for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+  {
+    reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+    if (!n)
+      break;
+  }
+}
 
-			bpage = static_cast<buf_page_t*>(
-				HASH_GET_NEXT(
-					hash, prev_bpage));
 
-			fold = BUF_POOL_ZIP_FOLD(
-				reinterpret_cast<buf_block_t*>(
-					prev_bpage));
+namespace
+{
 
-			HASH_DELETE(buf_page_t, hash,
-				buf_pool->zip_hash, fold,
-				prev_bpage);
+struct find_interesting_trx
+{
+  void operator()(const trx_t &trx)
+  {
+    if (trx.state == TRX_STATE_NOT_STARTED)
+      return;
+    if (trx.mysql_thd == nullptr)
+      return;
+    if (withdraw_started <= trx.start_time)
+      return;
+
+    if (!found)
+    {
+      ib::warn() << "The following trx might hold "
+                    "the blocks in buffer pool to "
+                    "be withdrawn. Buffer pool "
+                    "resizing can complete only "
+                    "after all the transactions "
+                    "below release the blocks.";
+      found= true;
+    }
+
+    lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time);
+  }
 
-			HASH_INSERT(buf_page_t, hash,
-				new_hash_table, fold,
-				prev_bpage);
-		}
-	}
+  bool &found;
+  time_t withdraw_started;
+  time_t current_time;
+};
 
-	hash_table_free(buf_pool->zip_hash);
-	buf_pool->zip_hash = new_hash_table;
-}
+} // namespace
 
-/** Resize the buffer pool based on srv_buf_pool_size from
-srv_buf_pool_old_size. */
-static
-void
-buf_pool_resize()
+/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+inline void buf_pool_t::resize()
 {
-	buf_pool_t*	buf_pool;
-	ulint		new_instance_size;
+  ut_ad(this == &buf_pool);
+
 	bool		warning = false;
 
 	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
 
-	ut_ad(!buf_pool_resizing);
+	ut_ad(!resize_in_progress());
 	ut_ad(srv_buf_pool_chunk_unit > 0);
 
-	new_instance_size = srv_buf_pool_size / srv_buf_pool_instances;
-	new_instance_size >>= srv_page_size_shift;
+	ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift;
 
 	buf_resize_status("Resizing buffer pool from " ULINTPF " to "
 			  ULINTPF " (unit=" ULINTPF ").",
 			  srv_buf_pool_old_size, srv_buf_pool_size,
 			  srv_buf_pool_chunk_unit);
 
-	/* set new limit for all buffer pool for resizing */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool = buf_pool_from_array(i);
-		buf_pool_mutex_enter(buf_pool);
-
-		ut_ad(buf_pool->curr_size == buf_pool->old_size);
-		ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks);
-		ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
-		ut_ad(buf_pool->flush_rbt == NULL);
+	mysql_mutex_lock(&mutex);
+	ut_ad(curr_size == old_size);
+	ut_ad(n_chunks_new == n_chunks);
+	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
 
-		buf_pool->n_chunks_new =
-			(new_instance_size << srv_page_size_shift)
-			/ srv_buf_pool_chunk_unit;
+	n_chunks_new = (new_instance_size << srv_page_size_shift)
+		/ srv_buf_pool_chunk_unit;
+	curr_size = n_chunks_new * chunks->size;
+	mysql_mutex_unlock(&mutex);
 
-		buf_pool->curr_size = buf_pool->n_chunks_new * buf_pool->chunks->size;
-
-		buf_pool_mutex_exit(buf_pool);
-	}
 #ifdef BTR_CUR_HASH_ADAPT
 	/* disable AHI if needed */
-	bool	btr_search_disabled = false;
+	const bool btr_search_disabled = btr_search_enabled;
 
 	buf_resize_status("Disabling adaptive hash index.");
 
 	btr_search_s_lock_all();
-	if (btr_search_enabled) {
+	if (btr_search_disabled) {
 		btr_search_s_unlock_all();
-		btr_search_disabled = true;
 	} else {
 		btr_search_s_unlock_all();
 	}
@@ -2663,45 +1901,29 @@ buf_pool_resize()
 	}
 #endif /* BTR_CUR_HASH_ADAPT */
 
-	/* set withdraw target */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool = buf_pool_from_array(i);
-		if (buf_pool->curr_size < buf_pool->old_size) {
-			ulint	withdraw_target = 0;
+	if (curr_size < old_size) {
+		/* set withdraw target */
+		size_t w = 0;
 
-			const buf_chunk_t*	chunk
-				= buf_pool->chunks + buf_pool->n_chunks_new;
-			const buf_chunk_t*	echunk
-				= buf_pool->chunks + buf_pool->n_chunks;
+		for (const chunk_t* chunk = chunks + n_chunks_new,
+		     * const echunk = chunks + n_chunks;
+		     chunk != echunk; chunk++)
+			w += chunk->size;
 
-			while (chunk < echunk) {
-				withdraw_target += chunk->size;
-				++chunk;
-			}
-
-			ut_ad(buf_pool->withdraw_target == 0);
-			buf_pool->withdraw_target = withdraw_target;
-		}
+		ut_ad(withdraw_target == 0);
+		withdraw_target = w;
 	}
 
 	buf_resize_status("Withdrawing blocks to be shrunken.");
 
 	time_t		withdraw_started = time(NULL);
-	ulint		message_interval = 60;
+	double		message_interval = 60;
 	ulint		retry_interval = 1;
 
 withdraw_retry:
-	bool	should_retry_withdraw = false;
-
 	/* wait for the number of blocks fit to the new size (if needed)*/
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool = buf_pool_from_array(i);
-		if (buf_pool->curr_size < buf_pool->old_size) {
-
-			should_retry_withdraw |=
-				buf_pool_withdraw_blocks(buf_pool);
-		}
-	}
+	bool	should_retry_withdraw = curr_size < old_size
+		&& withdraw_blocks();
 
 	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
 		/* abort to resize for shutdown. */
@@ -2723,30 +1945,9 @@ withdraw_retry:
 		}
 
 		lock_mutex_enter();
-		mutex_enter(&trx_sys.mutex);
 		bool	found = false;
-		for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys.trx_list);
-		     trx != NULL;
-		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
-			if (trx->state != TRX_STATE_NOT_STARTED
-			    && trx->mysql_thd != NULL
-			    && withdraw_started > trx->start_time) {
-				if (!found) {
-					ib::warn() <<
-						"The following trx might hold"
-						" the blocks in buffer pool to"
-					        " be withdrawn. Buffer pool"
-						" resizing can complete only"
-						" after all the transactions"
-						" below release the blocks.";
-					found = true;
-				}
-
-				lock_trx_print_wait_and_mvcc_state(
-					stderr, trx, current_time);
-			}
-		}
-		mutex_exit(&trx_sys.mutex);
+		trx_sys.trx_list.for_each(find_interesting_trx{
+			found, withdraw_started, current_time});
 		lock_mutex_exit();
 
 		withdraw_started = current_time;
@@ -2766,7 +1967,6 @@ withdraw_retry:
 		goto withdraw_retry;
 	}
 
-
 	buf_resize_status("Latching whole of buffer pool.");
 
 #ifndef DBUG_OFF
@@ -2787,242 +1987,170 @@ withdraw_retry:
 	}
 
 	/* Indicate critical path */
-	buf_pool_resizing = true;
+	resizing.store(true, std::memory_order_relaxed);
 
-	/* Acquire all buf_pool_mutex/hash_lock */
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
+  mysql_mutex_lock(&mutex);
+  page_hash.write_lock_all();
 
-		buf_pool_mutex_enter(buf_pool);
-	}
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
+	chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
 
-		hash_lock_x_all(buf_pool->page_hash);
-	}
+	/* add/delete chunks */
 
-	buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
+	buf_resize_status("buffer pool resizing with chunks "
+			  ULINTPF " to " ULINTPF ".",
+			  n_chunks, n_chunks_new);
 
-	/* add/delete chunks */
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
-		buf_chunk_t*	chunk;
-		buf_chunk_t*	echunk;
-
-		buf_resize_status("buffer pool %lu :"
-			" resizing with chunks %lu to %lu.",
-			i, buf_pool->n_chunks, buf_pool->n_chunks_new);
-
-		if (buf_pool->n_chunks_new < buf_pool->n_chunks) {
-			/* delete chunks */
-			chunk = buf_pool->chunks
-				+ buf_pool->n_chunks_new;
-			echunk = buf_pool->chunks + buf_pool->n_chunks;
-
-			ulint	sum_freed = 0;
-
-			while (chunk < echunk) {
-				buf_block_t*	block = chunk->blocks;
-
-				/* buf_LRU_block_free_non_file_page()
-				invokes MEM_NOACCESS() on any blocks
-				that are in free_list. We must
-				cancel the effect of that. In MemorySanitizer,
-				MEM_NOACCESS() is no-op, so we must not do
-				anything special for it here. */
+	if (n_chunks_new < n_chunks) {
+		/* delete chunks */
+		chunk_t* chunk = chunks + n_chunks_new;
+		const chunk_t* const echunk = chunks + n_chunks;
+
+		ulint	sum_freed = 0;
+
+		while (chunk < echunk) {
+			/* buf_LRU_block_free_non_file_page() invokes
+			MEM_NOACCESS() on any buf_pool.free blocks.
+			We must cancel the effect of that. In
+			MemorySanitizer, MEM_NOACCESS() is no-op, so
+			we must not do anything special for it here. */
 #ifdef HAVE_valgrind
 # if !__has_feature(memory_sanitizer)
-				MEM_MAKE_DEFINED(chunk->mem,
-						 chunk->mem_size());
+			MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
 # endif
 #else
-				MEM_MAKE_ADDRESSABLE(chunk->mem,
-						     chunk->mem_size());
+			MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->mem_size());
 #endif
 
-				for (ulint j = chunk->size;
-				     j--; block++) {
-					buf_block_free_mutexes(block);
-				}
-
-				buf_pool->allocator.deallocate_large_dodump(
-					chunk->mem, &chunk->mem_pfx, chunk->mem_size());
-
-				sum_freed += chunk->size;
+			buf_block_t*	block = chunk->blocks;
 
-				++chunk;
+			for (ulint j = chunk->size; j--; block++) {
+				buf_block_free_mutexes(block);
 			}
 
-			/* discard withdraw list */
-			UT_LIST_INIT(buf_pool->withdraw,
-				     &buf_page_t::list);
-			buf_pool->withdraw_target = 0;
-
-			ib::info() << "buffer pool " << i << " : "
-				<< buf_pool->n_chunks - buf_pool->n_chunks_new
-				<< " chunks (" << sum_freed
-				<< " blocks) were freed.";
-
-			buf_pool->n_chunks = buf_pool->n_chunks_new;
+			allocator.deallocate_large_dodump(
+				chunk->mem, &chunk->mem_pfx);
+			sum_freed += chunk->size;
+			++chunk;
 		}
 
-		{
-			/* reallocate buf_pool->chunks */
-			const ulint	new_chunks_size
-				= buf_pool->n_chunks_new * sizeof(*chunk);
-
-			buf_chunk_t*	new_chunks
-				= reinterpret_cast<buf_chunk_t*>(
-					ut_zalloc_nokey_nofatal(new_chunks_size));
-
-			DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
-					ut_free(new_chunks);
-					new_chunks = NULL;);
-
-			if (new_chunks == NULL) {
-				ib::error() << "buffer pool " << i
-					<< " : failed to allocate"
-					" the chunk array.";
-				buf_pool->n_chunks_new
-					= buf_pool->n_chunks;
-				warning = true;
-				buf_pool->chunks_old = NULL;
-				for (ulint j = 0; j < buf_pool->n_chunks_new; j++) {
-					buf_pool_register_chunk(&(buf_pool->chunks[j]));
-				}
-				goto calc_buf_pool_size;
-			}
-
-			ulint	n_chunks_copy = ut_min(buf_pool->n_chunks_new,
-						       buf_pool->n_chunks);
+		/* discard withdraw list */
+		UT_LIST_INIT(withdraw, &buf_page_t::list);
+		withdraw_target = 0;
 
-			memcpy(new_chunks, buf_pool->chunks,
-			       n_chunks_copy * sizeof(*chunk));
+		ib::info() << n_chunks - n_chunks_new
+			   << " chunks (" << sum_freed
+			   << " blocks) were freed.";
 
-			for (ulint j = 0; j < n_chunks_copy; j++) {
-				buf_pool_register_chunk(&new_chunks[j]);
-			}
-
-			buf_pool->chunks_old = buf_pool->chunks;
-			buf_pool->chunks = new_chunks;
-		}
+		n_chunks = n_chunks_new;
+	}
 
+	{
+		/* reallocate chunks */
+		const size_t	new_chunks_size
+			= n_chunks_new * sizeof(chunk_t);
 
-		if (buf_pool->n_chunks_new > buf_pool->n_chunks) {
-			/* add chunks */
-			chunk = buf_pool->chunks + buf_pool->n_chunks;
-			echunk = buf_pool->chunks
-				+ buf_pool->n_chunks_new;
+		chunk_t*	new_chunks = static_cast<chunk_t*>(
+			ut_zalloc_nokey_nofatal(new_chunks_size));
 
-			ulint	sum_added = 0;
-			ulint	n_chunks = buf_pool->n_chunks;
+		DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+				ut_free(new_chunks); new_chunks= nullptr; );
 
-			while (chunk < echunk) {
-				ulong	unit = srv_buf_pool_chunk_unit;
+		if (!new_chunks) {
+			ib::error() << "failed to allocate"
+				" the chunk array.";
+			n_chunks_new = n_chunks;
+			warning = true;
+			chunks_old = NULL;
+			goto calc_buf_pool_size;
+		}
 
-				if (!buf_chunk_init(buf_pool, chunk, unit)) {
+		ulint	n_chunks_copy = ut_min(n_chunks_new,
+					       n_chunks);
 
-					ib::error() << "buffer pool " << i
-						<< " : failed to allocate"
-						" new memory.";
+		memcpy(new_chunks, chunks,
+		       n_chunks_copy * sizeof *new_chunks);
 
-					warning = true;
+		for (ulint j = 0; j < n_chunks_copy; j++) {
+			new_chunks[j].reg();
+		}
 
-					buf_pool->n_chunks_new
-						= n_chunks;
+		chunks_old = chunks;
+		chunks = new_chunks;
+	}
 
-					break;
-				}
+	if (n_chunks_new > n_chunks) {
+		/* add chunks */
+		ulint	sum_added = 0;
+		ulint	n = n_chunks;
+		const size_t unit = srv_buf_pool_chunk_unit;
 
-				sum_added += chunk->size;
+		for (chunk_t* chunk = chunks + n_chunks,
+		     * const echunk = chunks + n_chunks_new;
+		     chunk != echunk; chunk++) {
+			if (!chunk->create(unit)) {
+				ib::error() << "failed to allocate"
+					" memory for buffer pool chunk";
 
-				++n_chunks;
-				++chunk;
+				warning = true;
+				n_chunks_new = n; /* chunks usable so far */
+				break;
 			}
 
-			ib::info() << "buffer pool " << i << " : "
-				<< buf_pool->n_chunks_new - buf_pool->n_chunks
-				<< " chunks (" << sum_added
-				<< " blocks) were added.";
-
-			buf_pool->n_chunks = n_chunks;
+			sum_added += chunk->size;
+			++n;
 		}
-calc_buf_pool_size:
 
-		/* recalc buf_pool->curr_size */
-		ulint	new_size = 0;
+		ib::info() << n_chunks_new - n_chunks
+			   << " chunks (" << sum_added
+			   << " blocks) were added.";
+
+		n_chunks = n;
+	}
+calc_buf_pool_size:
+	/* recalc curr_size */
+	ulint	new_size = 0;
 
-		chunk = buf_pool->chunks;
+	{
+		chunk_t* chunk = chunks;
+		const chunk_t* const echunk = chunk + n_chunks;
 		do {
 			new_size += chunk->size;
-		} while (++chunk < buf_pool->chunks
-				   + buf_pool->n_chunks);
-
-		buf_pool->curr_size = new_size;
-		buf_pool->n_chunks_new = buf_pool->n_chunks;
-
-		if (buf_pool->chunks_old) {
-			ut_free(buf_pool->chunks_old);
-			buf_pool->chunks_old = NULL;
-		}
+		} while (++chunk != echunk);
 	}
 
-	buf_pool_chunk_map_t*	chunk_map_old = buf_chunk_map_ref;
-	buf_chunk_map_ref = buf_chunk_map_reg;
+	curr_size = new_size;
+	n_chunks_new = n_chunks;
 
-	/* set instance sizes */
-	{
-		ulint	curr_size = 0;
-
-		for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-			buf_pool = buf_pool_from_array(i);
+	if (chunks_old) {
+		ut_free(chunks_old);
+		chunks_old = NULL;
+	}
 
-			ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
+	chunk_t::map* chunk_map_old = chunk_t::map_ref;
+	chunk_t::map_ref = chunk_t::map_reg;
 
-			buf_pool->read_ahead_area =
-				ut_min(BUF_READ_AHEAD_PAGES,
-				       ut_2_power_up(buf_pool->curr_size /
-						      BUF_READ_AHEAD_PORTION));
-			buf_pool->curr_pool_size
-				= buf_pool->n_chunks * srv_buf_pool_chunk_unit;
-			curr_size += buf_pool->curr_pool_size;
-			buf_pool->old_size = buf_pool->curr_size;
-		}
-		srv_buf_pool_curr_size = curr_size;
-		innodb_set_buf_pool_size(buf_pool_size_align(curr_size));
-	}
+	/* set size */
+	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+  ulint s= curr_size;
+  old_size= s;
+  s/= BUF_READ_AHEAD_PORTION;
+  read_ahead_area= s >= READ_AHEAD_PAGES
+    ? READ_AHEAD_PAGES
+    : my_round_up_to_next_power(static_cast<uint32_t>(s));
+  curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+  srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+  innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size));
 
 	const bool	new_size_too_diff
 		= srv_buf_pool_base_size > srv_buf_pool_size * 2
 			|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;
 
-	/* Normalize page_hash and zip_hash,
-	if the new size is too different */
-	if (!warning && new_size_too_diff) {
-
-		buf_resize_status("Resizing hash tables.");
-
-		for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-			buf_pool_t*	buf_pool = buf_pool_from_array(i);
-
-			buf_pool_resize_hash(buf_pool);
-
-			ib::info() << "buffer pool " << i
-				<< " : hash tables were resized.";
-		}
-	}
-
-	/* Release all buf_pool_mutex/page_hash */
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
-
-		hash_unlock_x_all(buf_pool->page_hash);
-		buf_pool_mutex_exit(buf_pool);
-	}
+  mysql_mutex_unlock(&mutex);
+  page_hash.write_unlock_all();
 
 	UT_DELETE(chunk_map_old);
 
-	buf_pool_resizing = false;
+	resizing.store(false, std::memory_order_relaxed);
 
 	/* Normalize other components, if the new size is too different */
 	if (!warning && new_size_too_diff) {
@@ -3042,7 +2170,7 @@ calc_buf_pool_size:
 			" dictionary.";
 	}
 
-	/* normalize ibuf->max_size */
+	/* normalize ibuf.max_size */
 	ibuf_max_size_update(srv_change_buffer_max_size);
 
 	if (srv_buf_pool_old_size != srv_buf_pool_size) {
@@ -3072,576 +2200,269 @@ calc_buf_pool_size:
 			" finished resizing at %s.", now);
 	}
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_validate());
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+	ut_d(validate());
 
 	return;
 }
 
-/** This is the thread for resizing buffer pool. It waits for an event and
-when waked up either performs a resizing and sleeps again.
-@return	this function does not return, calls os_thread_exit()
-*/
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_resize_thread)(void*)
+/** Thread pool task invoked by innodb_buffer_pool_size changes. */
+static void buf_resize_callback(void *)
 {
-	my_thread_init();
-
-	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-		os_event_wait(srv_buf_resize_event);
-		os_event_reset(srv_buf_resize_event);
-
-		if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
-			break;
-		}
-
-		buf_pool_mutex_enter_all();
-		if (srv_buf_pool_old_size == srv_buf_pool_size) {
-			buf_pool_mutex_exit_all();
-			std::ostringstream sout;
-			sout << "Size did not change (old size = new size = "
-				<< srv_buf_pool_size << ". Nothing to do.";
-			buf_resize_status(sout.str().c_str());
-
-			/* nothing to do */
-			continue;
-		}
-		buf_pool_mutex_exit_all();
-
-		buf_pool_resize();
-	}
-
-	srv_buf_resize_thread_active = false;
-
-	my_thread_end();
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/********************************************************************//**
-Relocate a buffer control block.  Relocates the block on the LRU list
-and in buf_pool->page_hash.  Does not relocate bpage->list.
-The caller must take care of relocating bpage->list. */
-static
-void
-buf_relocate(
-/*=========*/
-	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
-				buf_page_get_state(bpage) must be
-				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
-	buf_page_t*	dpage)	/*!< in/out: destination control block */
-{
-	buf_page_t*	b;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
-	ut_a(bpage->buf_fix_count == 0);
-	ut_ad(bpage->in_LRU_list);
-	ut_ad(!bpage->in_zip_hash);
-	ut_ad(bpage->in_page_hash);
-	ut_ad(bpage == buf_page_hash_get_low(buf_pool, bpage->id));
-
-	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
-#ifdef UNIV_DEBUG
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_FILE_PAGE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_error;
-	case BUF_BLOCK_ZIP_DIRTY:
-	case BUF_BLOCK_ZIP_PAGE:
-		break;
-	}
-#endif /* UNIV_DEBUG */
-
-	new (dpage) buf_page_t(*bpage);
-
-	/* Important that we adjust the hazard pointer before
-	removing bpage from LRU list. */
-	buf_LRU_adjust_hp(buf_pool, bpage);
-
-	ut_d(bpage->in_LRU_list = FALSE);
-	ut_d(bpage->in_page_hash = FALSE);
-
-	/* relocate buf_pool->LRU */
-	b = UT_LIST_GET_PREV(LRU, bpage);
-	UT_LIST_REMOVE(buf_pool->LRU, bpage);
-
-	if (b != NULL) {
-		UT_LIST_INSERT_AFTER(buf_pool->LRU, b, dpage);
-	} else {
-		UT_LIST_ADD_FIRST(buf_pool->LRU, dpage);
-	}
-
-	if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
-		buf_pool->LRU_old = dpage;
-#ifdef UNIV_LRU_DEBUG
-		/* buf_pool->LRU_old must be the first item in the LRU list
-		whose "old" flag is set. */
-		ut_a(buf_pool->LRU_old->old);
-		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
-		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
-		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
-		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
-	} else {
-		/* Check that the "old" flag is consistent in
-		the block and its neighbours. */
-		buf_page_set_old(dpage, buf_page_is_old(dpage));
-#endif /* UNIV_LRU_DEBUG */
-	}
-
-        ut_d(CheckInLRUList::validate(buf_pool));
-
-	/* relocate buf_pool->page_hash */
-	ulint	fold = bpage->id.fold();
-	ut_ad(fold == dpage->id.fold());
-	HASH_REPLACE(buf_page_t, hash, buf_pool->page_hash, fold, bpage,
-		     dpage);
-}
-
-/** Hazard Pointer implementation. */
-
-/** Set current value
-@param bpage	buffer block to be set as hp */
-void
-HazardPointer::set(buf_page_t* bpage)
-{
-	ut_ad(mutex_own(m_mutex));
-	ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
-	ut_ad(!bpage || buf_page_in_file(bpage));
-
-	m_hp = bpage;
-}
-
-/** Checks if a bpage is the hp
-@param bpage    buffer block to be compared
-@return true if it is hp */
-
-bool
-HazardPointer::is_hp(const buf_page_t* bpage)
-{
-	ut_ad(mutex_own(m_mutex));
-	ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool);
-	ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
-
-	return(bpage == m_hp);
-}
-
-/** Adjust the value of hp. This happens when some other thread working
-on the same list attempts to remove the hp from the list.
-@param bpage	buffer block to be compared */
-
-void
-FlushHp::adjust(const buf_page_t* bpage)
-{
-	ut_ad(bpage != NULL);
-
-	/** We only support reverse traversal for now. */
-	if (is_hp(bpage)) {
-		m_hp = UT_LIST_GET_PREV(list, m_hp);
-	}
-
-	ut_ad(!m_hp || m_hp->in_flush_list);
-}
-
-/** Adjust the value of hp. This happens when some other thread working
-on the same list attempts to remove the hp from the list.
-@param bpage	buffer block to be compared */
-
-void
-LRUHp::adjust(const buf_page_t* bpage)
-{
-	ut_ad(bpage);
-
-	/** We only support reverse traversal for now. */
-	if (is_hp(bpage)) {
-		m_hp = UT_LIST_GET_PREV(LRU, m_hp);
-	}
-
-	ut_ad(!m_hp || m_hp->in_LRU_list);
-}
-
-/** Selects from where to start a scan. If we have scanned too deep into
-the LRU list it resets the value to the tail of the LRU list.
-@return buf_page_t from where to start scan. */
-
-buf_page_t*
-LRUItr::start()
-{
-	ut_ad(mutex_own(m_mutex));
-
-	if (!m_hp || m_hp->old) {
-		m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU);
-	}
-
-	return(m_hp);
-}
-
-/** Determine if a block is a sentinel for a buffer pool watch.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	bpage		block
-@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
-ibool
-buf_pool_watch_is_sentinel(
-	const buf_pool_t*	buf_pool,
-	const buf_page_t*	bpage)
-{
-	/* We must also own the appropriate hash lock. */
-	ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
-	ut_ad(buf_page_in_file(bpage));
-
-	if (bpage < &buf_pool->watch[0]
-	    || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
-
-		ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
-		      || bpage->zip.data != NULL);
-
-		return(FALSE);
-	}
-
-	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
-	ut_ad(!bpage->in_zip_hash);
-	ut_ad(bpage->in_page_hash);
-	ut_ad(bpage->zip.data == NULL);
-	return(TRUE);
+  DBUG_ENTER("buf_resize_callback");
+  ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+  /* Sample the requested size and compare it while holding buf_pool.mutex. */
+  mysql_mutex_lock(&buf_pool.mutex);
+  const auto requested= srv_buf_pool_size;
+  const bool unchanged= srv_buf_pool_old_size == requested;
+  mysql_mutex_unlock(&buf_pool.mutex);
+  if (unchanged)
+  {
+    std::ostringstream msg;
+    msg << "Size did not change: old size = new size = " << requested;
+    buf_resize_status(msg.str().c_str());
+  }
+  else
+    buf_pool.resize();
+  DBUG_VOID_RETURN;
 }
 
-/** Add watch for the given page to be read in. Caller must have
-appropriate hash_lock for the bpage. This function may release the
-hash_lock and reacquire it.
-@param[in]	page_id		page id
-@param[in,out]	hash_lock	hash_lock currently latched
-@return NULL if watch set, block if the page is in the buffer pool */
-static
-buf_page_t*
-buf_pool_watch_set(
-	const page_id_t		page_id,
-	rw_lock_t**		hash_lock)
-{
-	buf_page_t*	bpage;
-	ulint		i;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-	ut_ad(*hash_lock == buf_page_hash_lock_get(buf_pool, page_id));
-
-	ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X));
-
-	bpage = buf_page_hash_get_low(buf_pool, page_id);
-
-	if (bpage != NULL) {
-page_found:
-		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
-			/* The page was loaded meanwhile. */
-			return(bpage);
-		}
-
-		/* Add to an existing watch. */
-		bpage->fix();
-		return(NULL);
-	}
-
-	/* From this point this function becomes fairly heavy in terms
-	of latching. We acquire the buf_pool mutex as well as all the
-	hash_locks. buf_pool mutex is needed because any changes to
-	the page_hash must be covered by it and hash_locks are needed
-	because we don't want to read any stale information in
-	buf_pool->watch[]. However, it is not in the critical code path
-	as this function will be called only by the purge thread. */
-
-	/* To obey latching order first release the hash_lock. */
-	rw_lock_x_unlock(*hash_lock);
-
-	buf_pool_mutex_enter(buf_pool);
-	hash_lock_x_all(buf_pool->page_hash);
-
-	/* If not own buf_pool_mutex, page_hash can be changed. */
-	*hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-
-	/* We have to recheck that the page
-	was not loaded or a watch set by some other
-	purge thread. This is because of the small
-	time window between when we release the
-	hash_lock to acquire buf_pool mutex above. */
-
-	bpage = buf_page_hash_get_low(buf_pool, page_id);
-	if (UNIV_LIKELY_NULL(bpage)) {
-		buf_pool_mutex_exit(buf_pool);
-		hash_unlock_x_all_but(buf_pool->page_hash, *hash_lock);
-		goto page_found;
-	}
-
-	/* The maximum number of purge threads should never exceed
-	BUF_POOL_WATCH_SIZE. So there is no way for purge thread
-	instance to hold a watch when setting another watch. */
-	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
-		bpage = &buf_pool->watch[i];
-
-		ut_ad(bpage->access_time == 0);
-		ut_ad(bpage->newest_modification == 0);
-		ut_ad(bpage->oldest_modification == 0);
-		ut_ad(bpage->zip.data == NULL);
-		ut_ad(!bpage->in_zip_hash);
-
-		switch (bpage->state) {
-		case BUF_BLOCK_POOL_WATCH:
-			ut_ad(!bpage->in_page_hash);
-			ut_ad(bpage->buf_fix_count == 0);
-
-			/* bpage is pointing to buf_pool->watch[],
-			which is protected by buf_pool->mutex.
-			Normally, buf_page_t objects are protected by
-			buf_block_t::mutex or buf_pool->zip_mutex or both. */
-
-			bpage->state = BUF_BLOCK_ZIP_PAGE;
-			bpage->id = page_id;
-			bpage->buf_fix_count = 1;
-
-			ut_d(bpage->in_page_hash = TRUE);
-			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
-				    page_id.fold(), bpage);
-
-			buf_pool_mutex_exit(buf_pool);
-			/* Once the sentinel is in the page_hash we can
-			safely release all locks except just the
-			relevant hash_lock */
-			hash_unlock_x_all_but(buf_pool->page_hash,
-						*hash_lock);
-
-			return(NULL);
-		case BUF_BLOCK_ZIP_PAGE:
-			ut_ad(bpage->in_page_hash);
-			ut_ad(bpage->buf_fix_count > 0);
-			break;
-		default:
-			ut_error;
-		}
-	}
-
-	/* Allocation failed.  Either the maximum number of purge
-	threads should never exceed BUF_POOL_WATCH_SIZE, or this code
-	should be modified to return a special non-NULL value and the
-	caller should purge the record directly. */
-	ut_error;
+/* Ensure that the resize task never runs concurrently with itself, by setting max_concurrency to 1 for its task group. */
+static tpool::task_group single_threaded_group(1);
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+	nullptr, &single_threaded_group);
 
-	/* Fix compiler warning */
-	return(NULL);
-}
-
-/** Remove the sentinel block for the watch before replacing it with a
-real block. buf_page_watch_clear() or buf_page_watch_occurred() will notice
-that the block has been replaced with the real block.
-@param[in,out]	buf_pool	buffer pool instance
-@param[in,out]	watch		sentinel for watch
-@return reference count, to be added to the replacement block */
-static
-void
-buf_pool_watch_remove(
-	buf_pool_t*	buf_pool,
-	buf_page_t*	watch)
+void buf_resize_start()
 {
-#ifdef UNIV_DEBUG
-	/* We must also own the appropriate hash_bucket mutex. */
-	rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, watch->id);
-	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
-#endif /* UNIV_DEBUG */
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, watch->id.fold(),
-		    watch);
-	ut_d(watch->in_page_hash = FALSE);
-	watch->buf_fix_count = 0;
-	watch->state = BUF_BLOCK_POOL_WATCH;
+	srv_thread_pool->submit_task(&buf_resize_task);
 }
 
-/** Stop watching if the page has been read in.
-buf_pool_watch_set(same_page_id) must have returned NULL before.
-@param[in]	page_id	page id */
-void buf_pool_watch_unset(const page_id_t page_id)
+void buf_resize_shutdown()
 {
-	buf_page_t*	bpage;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-	/* We only need to have buf_pool mutex in case where we end
-	up calling buf_pool_watch_remove but to obey latching order
-	we acquire it here before acquiring hash_lock. This should
-	not cause too much grief as this function is only ever
-	called from the purge thread. */
-	buf_pool_mutex_enter(buf_pool);
-
-	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-	rw_lock_x_lock(hash_lock);
-
-	/* The page must exist because buf_pool_watch_set()
-	increments buf_fix_count. */
-	bpage = buf_page_hash_get_low(buf_pool, page_id);
-
-	if (bpage->unfix() == 0
-	    && buf_pool_watch_is_sentinel(buf_pool, bpage)) {
-		buf_pool_watch_remove(buf_pool, bpage);
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-	rw_lock_x_unlock(hash_lock);
+	buf_resize_task.wait();
 }
 
-/** Check if the page has been read in.
-This may only be called after buf_pool_watch_set(same_page_id)
-has returned NULL and before invoking buf_pool_watch_unset(same_page_id).
-@param[in]	page_id	page id
-@return false if the given page was not read in, true if it was */
-bool buf_pool_watch_occurred(const page_id_t page_id)
-{
-	bool		ret;
-	buf_page_t*	bpage;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-
-	rw_lock_s_lock(hash_lock);
-
-	/* If not own buf_pool_mutex, page_hash can be changed. */
-	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);
-
-	/* The page must exist because buf_pool_watch_set()
-	increments buf_fix_count. */
-	bpage = buf_page_hash_get_low(buf_pool, page_id);
-
-	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
-	rw_lock_s_unlock(hash_lock);
-
-	return(ret);
-}
 
-/********************************************************************//**
-Moves a page to the start of the buffer pool LRU list. This high-level
-function can be used to prevent an important page from slipping out of
-the buffer pool. */
-void
-buf_page_make_young(
-/*================*/
-	buf_page_t*	bpage)	/*!< in: buffer block of a file page */
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
+@param bpage   BUF_BLOCK_ZIP_PAGE block
+@param dpage   destination control block */
+static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	buf_pool_mutex_enter(buf_pool);
-
-	ut_a(buf_page_in_file(bpage));
+  const ulint fold= bpage->id().fold();
+  ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
+  ut_a(bpage->io_fix() == BUF_IO_NONE);
+  ut_a(!bpage->buf_fix_count());
+  ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
+  ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+  ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE); /* NOTE(review): repeated */
+  /* Copy-construct the destination control block in place from bpage. */
+  new (dpage) buf_page_t(*bpage);
+  /* Take bpage out of the LRU list and put dpage in its place: after the
+  element returned by LRU_remove(), or first in the list if there is none.
+  NOTE(review): LRU_remove() presumably adjusts hazard pointers; confirm. */
+  if (buf_page_t *b= buf_pool.LRU_remove(bpage))
+    UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage);
+  else
+    UT_LIST_ADD_FIRST(buf_pool.LRU, dpage);
+  /* Redirect buf_pool.LRU_old if it was pointing to the relocated block. */
+  if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage))
+  {
+    buf_pool.LRU_old= dpage;
+#ifdef UNIV_LRU_DEBUG
+    /* buf_pool.LRU_old must be the first item in the LRU list
+    whose "old" flag is set. */
+    ut_a(buf_pool.LRU_old->old);
+    ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) ||
+         !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+    ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) ||
+         UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+  } /* closes the if; this brace exists only with UNIV_LRU_DEBUG */
+  else
+  {
+    /* Check that the "old" flag is consistent in
+    the block and its neighbours. */
+    dpage->set_old(dpage->is_old());
+#endif /* UNIV_LRU_DEBUG */
+  } /* closes the else, or the if when UNIV_LRU_DEBUG is not defined */
 
-	buf_LRU_make_block_young(bpage);
+  ut_d(CheckInLRUList::validate());
 
-	buf_pool_mutex_exit(buf_pool);
+  /* relocate buf_pool.page_hash */
+  ut_ad(bpage->in_page_hash);
+  ut_ad(dpage->in_page_hash);
+  ut_d(bpage->in_page_hash= false);
+  HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage);
 }
 
-/********************************************************************//**
-Moves a page to the start of the buffer pool LRU list if it is too old.
-This high-level function can be used to prevent an important page from
-slipping out of the buffer pool. */
-static
-void
-buf_page_make_young_if_needed(
-/*==========================*/
-	buf_page_t*	bpage)		/*!< in/out: buffer block of a
-					file page */
+/** Register a watch for a page identifier. The caller must hold an
+exclusive page hash latch. The *hash_lock may be released,
+relocated, and reacquired.
+@param id         page identifier
+@param hash_lock  exclusively held page_hash latch
+@return a buffer pool block corresponding to id
+@retval nullptr   if the block was not present, and a watch was installed */
+inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
+                                         page_hash_latch **hash_lock)
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_a(buf_page_in_file(bpage));
+  const ulint fold= id.fold();
+  ut_ad(*hash_lock == page_hash.lock_get(fold));
+  ut_ad((*hash_lock)->is_write_locked());
 
-	if (buf_page_peek_if_too_old(bpage)) {
-		buf_page_make_young(bpage);
-	}
-}
+retry:
+  if (buf_page_t *bpage= page_hash_get_low(id, fold))
+  {
+    if (!watch_is_sentinel(*bpage))
+      /* The page was loaded meanwhile. */
+      return bpage;
+    /* Add to an existing watch. */
+    bpage->fix();
+    return nullptr;
+  }
 
-#ifdef UNIV_DEBUG
+  (*hash_lock)->write_unlock();
+  /* Allocate a watch[] and then try to insert it into the page_hash. */
+  mysql_mutex_lock(&mutex);
 
-/** Sets file_page_was_freed TRUE if the page is found in the buffer pool.
-This function should be called when we free a file page and want the
-debug version to check that it is not accessed any more unless
-reallocated.
-@param[in]	page_id	page id
-@return control block if found in page hash table, otherwise NULL */
-buf_page_t* buf_page_set_file_page_was_freed(const page_id_t page_id)
-{
-	buf_page_t*	bpage;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	rw_lock_t*	hash_lock;
-
-	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);
-
-	if (bpage) {
-		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
-		mutex_enter(block_mutex);
-		rw_lock_s_unlock(hash_lock);
-		/* bpage->file_page_was_freed can already hold
-		when this code is invoked from dict_drop_index_tree() */
-		bpage->file_page_was_freed = TRUE;
-		mutex_exit(block_mutex);
-	}
+  /* Scan watch[] from its last element down to watch[0] (never below it).
+  The maximum number of purge tasks should never exceed UT_ARR_SIZE(watch) - 1,
+  and a purge task cannot hold a watch when setting another watch. */
+  for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- > watch; )
+  {
+    ut_ad(w->access_time == 0);
+    ut_ad(!w->oldest_modification());
+    ut_ad(!w->zip.data);
+    ut_ad(!w->in_zip_hash);
+    if (w->state() == BUF_BLOCK_ZIP_PAGE)
+      /* This watch may be in use for some other page. */
+      continue;
+    ut_ad(w->state() == BUF_BLOCK_NOT_USED);
+    ut_ad(!w->buf_fix_count());
+    /* w is pointing to watch[], which is protected by mutex.
+    Normally, buf_page_t::id for objects that are reachable by
+    page_hash_get_low(id, fold) are protected by hash_lock. */
+    w->set_state(BUF_BLOCK_ZIP_PAGE);
+    w->id_= id;
+    /* *hash_lock was released above; look up the latch for fold again. */
+    *hash_lock= page_hash.lock_get(fold);
+    /* Recheck under mutex: the page or a watch may have appeared meanwhile. */
+    buf_page_t *bpage= page_hash_get_low(id, fold);
+    if (UNIV_LIKELY_NULL(bpage))
+    {
+      w->set_state(BUF_BLOCK_NOT_USED);
+      *hash_lock= page_hash.lock_get(fold);
+      (*hash_lock)->write_lock();
+      mysql_mutex_unlock(&mutex);
+      goto retry;
+    }
+    /* Still absent: insert w into page_hash, fixed once on our behalf. */
+    (*hash_lock)->write_lock();
+    ut_ad(!w->buf_fix_count_);
+    w->buf_fix_count_= 1;
+    ut_ad(!w->in_page_hash);
+    ut_d(w->in_page_hash= true);
+    HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+    mysql_mutex_unlock(&mutex);
+    return nullptr;
+  }
 
-	return(bpage);
+  ut_error;
+  mysql_mutex_unlock(&mutex);
+  return nullptr;
 }
 
-/** Sets file_page_was_freed FALSE if the page is found in the buffer pool.
-This function should be called when we free a file page and want the
-debug version to check that it is not accessed any more unless
-reallocated.
-@param[in]	page_id	page id
-@return control block if found in page hash table, otherwise NULL */
-buf_page_t* buf_page_reset_file_page_was_freed(const page_id_t page_id)
+/** Stop watching whether a page has been read in.
+watch_set(id) must have returned nullptr before.
+@param id   page identifier */
+void buf_pool_t::watch_unset(const page_id_t id)
 {
-	buf_page_t*	bpage;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	rw_lock_t*	hash_lock;
-
-	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);
-	if (bpage) {
-		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
-		mutex_enter(block_mutex);
-		rw_lock_s_unlock(hash_lock);
-		bpage->file_page_was_freed = FALSE;
-		mutex_exit(block_mutex);
-	}
-
-	return(bpage);
+  mysql_mutex_assert_not_owner(&mutex);
+  const ulint fold= id.fold();
+  page_hash_latch *hash_lock= page_hash.lock<true>(fold); /* write latch */
+  /* The page must exist because watch_set() increments buf_fix_count. */
+  buf_page_t *w= page_hash_get_low(id, fold);
+  ut_ad(w->in_page_hash);
+  const bool must_remove= watch_is_sentinel(*w) && w->buf_fix_count() == 1;
+  if (!must_remove)
+    w->unfix(); /* a real page or a shared watch: just drop our reference */
+  hash_lock->write_unlock();
+  /* Dropping the last reference to a sentinel is done below, under mutex. */
+  if (must_remove)
+  {
+    const auto old= w; /* the descriptor that was seen under the first latch */
+    /* Based on buf_pool_t::watch_remove(). This hash_lock shadows the outer. */
+    mysql_mutex_lock(&mutex);
+    w= page_hash_get_low(id, fold);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+    if (w->unfix() == 0 && w == old) /* unreferenced and not replaced */
+    {
+      ut_ad(w->in_page_hash);
+      ut_d(w->in_page_hash= false);
+      HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
+      // Now that the watch is detached from page_hash, release it to watch[].
+      ut_ad(w->id_ == id);
+      ut_ad(!w->buf_fix_count());
+      ut_ad(w->state() == BUF_BLOCK_ZIP_PAGE);
+      w->set_state(BUF_BLOCK_NOT_USED);
+    }
+    hash_lock->write_unlock();
+    mysql_mutex_unlock(&mutex);
+  }
 }
-#endif /* UNIV_DEBUG */
 
-/** Attempts to discard the uncompressed frame of a compressed page.
-The caller should not be holding any mutexes when this function is called.
-@param[in]	page_id	page id */
-static void buf_block_try_discard_uncompressed(const page_id_t page_id)
+/** Mark the page status as FREED for the given tablespace id and
+page number. If the page is not in buffer pool then ignore it.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction
+@param[in]	file	file name
+@param[in]	line	line where called */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
+                   const char *file, unsigned line)
 {
-	buf_page_t*	bpage;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
 
-	/* Since we need to acquire buf_pool mutex to discard
-	the uncompressed frame and because page_hash mutex resides
-	below buf_pool mutex in sync ordering therefore we must
-	first release the page_hash mutex. This means that the
-	block in question can move out of page_hash. Therefore
-	we need to check again if the block is still in page_hash. */
-	buf_pool_mutex_enter(buf_pool);
+  if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      || space->is_compressed()
+#endif
+      )
+    mtr->add_freed_offset(space, page);
+
+  buf_pool.stat.n_page_gets++;
+  const page_id_t page_id(space->id, page);
+  const ulint fold= page_id.fold();
+  page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+  if (buf_block_t *block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash_get_low(page_id, fold)))
+  {
+    if (block->page.state() != BUF_BLOCK_FILE_PAGE)
+      /* FIXME: convert, but avoid buf_zip_decompress() */;
+    else
+    {
+      buf_block_buf_fix_inc(block, file, line);
+      ut_ad(block->page.buf_fix_count());
+      hash_lock->read_unlock();
+
+      mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+      rw_lock_x_lock_inline(&block->lock, 0, file, line);
+      buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
 
-	bpage = buf_page_hash_get(buf_pool, page_id);
+#ifdef BTR_CUR_HASH_ADAPT
+      if (block->index)
+        btr_search_drop_page_hash_index(block);
+#endif /* BTR_CUR_HASH_ADAPT */
 
-	if (bpage) {
-		buf_LRU_free_page(bpage, false);
-	}
+      block->page.status= buf_page_t::FREED;
+      return;
+    }
+  }
 
-	buf_pool_mutex_exit(buf_pool);
+  hash_lock->read_unlock();
 }
 
 /** Get read access to a compressed page (usually of type
@@ -3656,125 +2477,96 @@ the same set of mutexes or latches.
 @return pointer to the block */
 buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
 {
-	buf_page_t*	bpage;
-	BPageMutex*	block_mutex;
-	rw_lock_t*	hash_lock;
-	ibool		discard_attempted = FALSE;
-	ibool		must_read;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-	ut_ad(zip_size);
-	ut_ad(ut_is_2pow(zip_size));
-	buf_pool->stat.n_page_gets++;
-
-	for (;;) {
-lookup:
-
-		/* The following call will also grab the page_hash
-		mutex if the page is found. */
-		bpage = buf_page_hash_get_s_locked(buf_pool, page_id,
-						   &hash_lock);
-		if (bpage) {
-			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
-			break;
-		}
+  ut_ad(zip_size);
+  ut_ad(ut_is_2pow(zip_size));
+  buf_pool.stat.n_page_gets++;
 
-		/* Page not in buf_pool: needs to be read from file */
+  bool discard_attempted= false;
+  const ulint fold= page_id.fold();
+  buf_page_t *bpage;
+  page_hash_latch *hash_lock;
 
-		ut_ad(!hash_lock);
-		dberr_t err = buf_read_page(page_id, zip_size);
+  for (;;)
+  {
+lookup:
+    bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock);
+    if (bpage)
+      break;
 
-		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-			ib::error() << "Reading compressed page " << page_id
-				<< " failed with error: " << err;
+    dberr_t err= buf_read_page(page_id, zip_size);
 
-			goto err_exit;
-		}
+    if (UNIV_UNLIKELY(err != DB_SUCCESS))
+    {
+      ib::error() << "Reading compressed page " << page_id
+                  << " failed with error: " << err;
+      return nullptr; /* not err_exit: assumes no latch after a miss; confirm */
+    }
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		ut_a(++buf_dbg_counter % 5771 || buf_validate());
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-	}
+#ifdef UNIV_DEBUG
+    if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  }
 
-	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
+  ut_ad(hash_lock->is_read_locked());
 
-	if (!bpage->zip.data) {
-		/* There is no compressed page. */
+  if (!bpage->zip.data)
+  {
+    /* There is no compressed page. */
 err_exit:
-		rw_lock_s_unlock(hash_lock);
-		return(NULL);
-	}
-
-	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
-
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
-		bpage->fix();
-		block_mutex = &buf_pool->zip_mutex;
-		goto got_block;
-	case BUF_BLOCK_FILE_PAGE:
-		/* Discard the uncompressed page frame if possible. */
-		if (!discard_attempted) {
-			rw_lock_s_unlock(hash_lock);
-			buf_block_try_discard_uncompressed(page_id);
-			discard_attempted = TRUE;
-			goto lookup;
-		}
-
-		buf_block_buf_fix_inc((buf_block_t*) bpage,
-				      __FILE__, __LINE__);
+    hash_lock->read_unlock();
+    return nullptr;
+  }
 
-		block_mutex = &((buf_block_t*) bpage)->mutex;
-		goto got_block;
-	default:
-		break;
-	}
+  ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+
+  switch (bpage->state()) {
+  case BUF_BLOCK_ZIP_PAGE:
+    bpage->fix();
+    goto got_block;
+  case BUF_BLOCK_FILE_PAGE:
+    /* Discard the uncompressed page frame if possible. */
+    if (!discard_attempted)
+    {
+      discard_attempted= true;
+      hash_lock->read_unlock();
+      mysql_mutex_lock(&buf_pool.mutex);
+      if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+        buf_LRU_free_page(bpage, false);
+      mysql_mutex_unlock(&buf_pool.mutex);
+      goto lookup;
+    }
+
+    buf_block_buf_fix_inc(reinterpret_cast<buf_block_t*>(bpage),
+                          __FILE__, __LINE__);
+    goto got_block;
+  default:
+    break;
+  }
 
-	ut_error;
-	goto err_exit;
+  ut_error;
+  goto err_exit;
 
 got_block:
-	mutex_enter(block_mutex);
-	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
-
-	rw_lock_s_unlock(hash_lock);
-
-	ut_ad(!bpage->file_page_was_freed);
-
-	buf_page_set_accessed(bpage);
-
-	mutex_exit(block_mutex);
+  bool must_read= bpage->io_fix() == BUF_IO_READ;
+  hash_lock->read_unlock();
 
-	buf_page_make_young_if_needed(bpage);
+  DBUG_ASSERT(bpage->status != buf_page_t::FREED);
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(++buf_dbg_counter % 5771 || buf_validate());
-	ut_a(bpage->buf_fix_count > 0);
-	ut_a(buf_page_in_file(bpage));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+  bpage->set_accessed();
+  buf_page_make_young_if_needed(bpage);
 
-	if (must_read) {
-		/* Let us wait until the read operation
-		completes */
-
-		for (;;) {
-			enum buf_io_fix	io_fix;
-
-			mutex_enter(block_mutex);
-			io_fix = buf_page_get_io_fix(bpage);
-			mutex_exit(block_mutex);
-
-			if (io_fix == BUF_IO_READ) {
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  ut_ad(bpage->buf_fix_count());
+  ut_ad(bpage->in_file());
 
-				os_thread_sleep(WAIT_FOR_READ);
-			} else {
-				break;
-			}
-		}
-	}
+  if (must_read)
+    /* Let us wait until the read operation completes */
+    while (bpage->io_fix() == BUF_IO_READ)
+      os_thread_sleep(WAIT_FOR_READ);
 
-	return(bpage);
+  return bpage;
 }
 
 /********************************************************************//**
@@ -3811,7 +2603,7 @@ buf_zip_decompress(
 	ulint		size = page_zip_get_size(&block->page.zip);
 	/* The tablespace will not be found if this function is called
 	during IMPORT. */
-	fil_space_t* space = fil_space_acquire_for_io(block->page.id.space());
+	fil_space_t* space= fil_space_t::get(block->page.id().space());
 	const unsigned key_version = mach_read_from_4(
 		frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
 	fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
@@ -3821,13 +2613,13 @@ buf_zip_decompress(
 		    || srv_encrypt_tables);
 
 	ut_ad(block->zip_size());
-	ut_a(block->page.id.space() != 0);
+	ut_a(block->page.id().space() != 0);
 
 	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
 
 		ib::error() << "Compressed page checksum mismatch for "
 			<< (space ? space->chain.start->name : "")
-			<< block->page.id << ": stored: "
+			<< block->page.id() << ": stored: "
 			<< mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
 			<< ", crc32: "
 			<< page_zip_calc_checksum(
@@ -3848,14 +2640,14 @@ buf_zip_decompress(
 		if (page_zip_decompress(&block->page.zip,
 					block->frame, TRUE)) {
 			if (space) {
-				space->release_for_io();
+				space->release();
 			}
 			return(TRUE);
 		}
 
 		ib::error() << "Unable to decompress "
 			<< (space ? space->chain.start->name : "")
-			<< block->page.id;
+			<< block->page.id();
 		goto err_exit;
 	case FIL_PAGE_TYPE_ALLOCATED:
 	case FIL_PAGE_INODE:
@@ -3867,7 +2659,7 @@ buf_zip_decompress(
 		/* Copy to uncompressed storage. */
 		memcpy(block->frame, frame, block->zip_size());
 		if (space) {
-			space->release_for_io();
+			space->release();
 		}
 
 		return(TRUE);
@@ -3876,7 +2668,7 @@ buf_zip_decompress(
 	ib::error() << "Unknown compressed page type "
 		<< fil_page_get_type(frame)
 		<< " in " << (space ? space->chain.start->name : "")
-		<< block->page.id;
+		<< block->page.id();
 
 err_exit:
 	if (encrypted) {
@@ -3891,99 +2683,12 @@ err_exit:
 			dict_set_corrupted_by_space(space);
 		}
 
-		space->release_for_io();
-	}
-
-	return(FALSE);
-}
-
-#ifdef BTR_CUR_HASH_ADAPT
-/** Get a buffer block from an adaptive hash index pointer.
-This function does not return if the block is not identified.
-@param[in]	ptr	pointer to within a page frame
-@return pointer to block, never NULL */
-buf_block_t*
-buf_block_from_ahi(const byte* ptr)
-{
-	buf_pool_chunk_map_t::iterator it;
-
-	buf_pool_chunk_map_t*	chunk_map = buf_chunk_map_ref;
-	ut_ad(buf_chunk_map_ref == buf_chunk_map_reg);
-	ut_ad(!buf_pool_resizing);
-
-	buf_chunk_t*	chunk;
-	it = chunk_map->upper_bound(ptr);
-
-	ut_a(it != chunk_map->begin());
-
-	if (it == chunk_map->end()) {
-		chunk = chunk_map->rbegin()->second;
-	} else {
-		chunk = (--it)->second;
-	}
-
-	ulint		offs = ulint(ptr - chunk->blocks->frame);
-
-	offs >>= srv_page_size_shift;
-
-	ut_a(offs < chunk->size);
-
-	buf_block_t*	block = &chunk->blocks[offs];
-
-	/* The function buf_chunk_init() invokes buf_block_init() so that
-	block[n].frame == block->frame + n * srv_page_size.  Check it. */
-	ut_ad(block->frame == page_align(ptr));
-	/* Read the state of the block without holding a mutex.
-	A state transition from BUF_BLOCK_FILE_PAGE to
-	BUF_BLOCK_REMOVE_HASH is possible during this execution. */
-	ut_d(const buf_page_state state = buf_block_get_state(block));
-	ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
-	return(block);
-}
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/********************************************************************//**
-Find out if a pointer belongs to a buf_block_t. It can be a pointer to
-the buf_block_t itself or a member of it
-@return TRUE if ptr belongs to a buf_block_t struct */
-ibool
-buf_pointer_is_block_field(
-/*=======================*/
-	const void*	ptr)	/*!< in: pointer not dereferenced */
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		if (buf_pool_from_array(i)->is_block_field(ptr)) {
-			return(TRUE);
-		}
+		space->release();
 	}
 
 	return(FALSE);
 }
 
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/********************************************************************//**
-Return true if probe is enabled.
-@return true if probe enabled. */
-static
-bool
-buf_debug_execute_is_force_flush()
-/*==============================*/
-{
-	DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
-
-	/* This is used during queisce testing, we want to ensure maximum
-	buffering by the change buffer. */
-
-	if (srv_ibuf_disable_background_merge) {
-		return(true);
-	}
-
-	return(false);
-}
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
 /** Wait for the block to be read in.
 @param[in]	block	The block to check */
 static
@@ -3993,35 +2698,15 @@ buf_wait_for_read(
 {
 	/* Note:
 
-	We are using the block->lock to check for IO state (and a dirty read).
-	We set the IO_READ state under the protection of the hash_lock
-	(and block->mutex). This is safe because another thread can only
+	We are using the block->lock to check for IO state.
+	We set the IO_READ state under the protection of the hash_lock.
+	This is safe because another thread can only
 	access the block (and check for IO state) after the block has been
 	added to the page hashtable. */
 
-	if (buf_block_get_io_fix(block) == BUF_IO_READ) {
-
-		/* Wait until the read operation completes */
-
-		BPageMutex*	mutex = buf_page_get_mutex(&block->page);
-
-		for (;;) {
-			buf_io_fix	io_fix;
-
-			mutex_enter(mutex);
-
-			io_fix = buf_block_get_io_fix(block);
-
-			mutex_exit(mutex);
-
-			if (io_fix == BUF_IO_READ) {
-				/* Wait by temporaly s-latch */
-				rw_lock_s_lock(&block->lock);
-				rw_lock_s_unlock(&block->lock);
-			} else {
-				break;
-			}
-		}
+	while (block->page.io_fix() == BUF_IO_READ) {
+		rw_lock_s_lock(&block->lock);
+		rw_lock_s_unlock(&block->lock);
 	}
 }
 
@@ -4112,17 +2797,21 @@ done:
   return block;
 }
 
-/** This is the low level function used to get access to a database page.
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in]	guess		guessed block or NULL
-@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
+/** Low level function used to get access to a database page.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in]	file		file name
-@param[in]	line		line where called
-@param[in]	mtr		mini-transaction
-@param[out]	err		DB_SUCCESS or error code
+@param[in]	file			file name
+@param[in]	line			line where called
+@param[in]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge to happen
+while reading the page from file. If set, buffered changes for a
+leaf index page are merged into the page before the block is
+returned to the caller.
 @return pointer to the block or NULL */
 buf_block_t*
 buf_page_get_low(
@@ -4134,14 +2823,13 @@ buf_page_get_low(
 	const char*		file,
 	unsigned		line,
 	mtr_t*			mtr,
-	dberr_t*		err)
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
 {
 	buf_block_t*	block;
 	unsigned	access_time;
-	rw_lock_t*	hash_lock;
-	buf_block_t*	fix_block;
 	ulint		retries = 0;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
+	const ulint	fold = page_id.fold();
 
 	ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
 	ut_ad(!mtr || mtr->is_active());
@@ -4149,6 +2837,11 @@ buf_page_get_low(
 	      || (rw_latch == RW_X_LATCH)
 	      || (rw_latch == RW_SX_LATCH)
 	      || (rw_latch == RW_NO_LATCH));
+	ut_ad(!allow_ibuf_merge
+	      || mode == BUF_GET
+	      || mode == BUF_GET_POSSIBLY_FREED
+	      || mode == BUF_GET_IF_IN_POOL
+	      || mode == BUF_GET_IF_IN_POOL_OR_WATCH);
 
 	if (err) {
 		*err = DB_SUCCESS;
@@ -4169,105 +2862,74 @@ buf_page_get_low(
 		break;
 	default:
 		ut_error;
+	case BUF_GET_POSSIBLY_FREED:
+		break;
 	case BUF_GET_NO_LATCH:
 		ut_ad(rw_latch == RW_NO_LATCH);
 		/* fall through */
 	case BUF_GET:
 	case BUF_GET_IF_IN_POOL_OR_WATCH:
-	case BUF_GET_POSSIBLY_FREED:
-		fil_space_t* s = fil_space_acquire_for_io(page_id.space());
+		fil_space_t* s = fil_space_get(page_id.space());
 		ut_ad(s);
 		ut_ad(s->zip_size() == zip_size);
-		s->release_for_io();
 	}
 #endif /* UNIV_DEBUG */
 
 	ut_ad(!mtr || !ibuf_inside(mtr)
 	      || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL));
 
-	buf_pool->stat.n_page_gets++;
-	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
+	buf_pool.stat.n_page_gets++;
 loop:
+	buf_block_t* fix_block;
 	block = guess;
 
-	rw_lock_s_lock(hash_lock);
+	page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
 
-	/* If not own buf_pool_mutex, page_hash can be changed. */
-	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);
-
-	if (block != NULL) {
+	if (block) {
 
 		/* If the guess is a compressed page descriptor that
 		has been allocated by buf_page_alloc_descriptor(),
 		it may have been freed by buf_relocate(). */
 
-		if (!buf_pool->is_block_field(block)
-		    || page_id != block->page.id
-		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
-
+		if (!buf_pool.is_uncompressed(block)
+		    || page_id != block->page.id()
+		    || block->page.state() != BUF_BLOCK_FILE_PAGE) {
 			/* Our guess was bogus or things have changed
 			since. */
-			block = guess = NULL;
+			guess = nullptr;
+			goto lookup;
 		} else {
 			ut_ad(!block->page.in_zip_hash);
 		}
+	} else {
+lookup:
+		block = reinterpret_cast<buf_block_t*>(
+			buf_pool.page_hash_get_low(page_id, fold));
 	}
 
-	if (block == NULL) {
-		block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
-	}
-
-	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
-		rw_lock_s_unlock(hash_lock);
-		block = NULL;
+	if (!block || buf_pool.watch_is_sentinel(block->page)) {
+		hash_lock->read_unlock();
+		block = nullptr;
 	}
 
-	if (block == NULL) {
-
+	if (UNIV_UNLIKELY(!block)) {
 		/* Page not in buf_pool: needs to be read from file */
-
 		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
-			rw_lock_x_lock(hash_lock);
-
-			/* If not own buf_pool_mutex,
-			page_hash can be changed. */
-			hash_lock = buf_page_hash_lock_x_confirm(
-				hash_lock, buf_pool, page_id);
+			hash_lock = buf_pool.page_hash.lock<true>(fold);
 
-			block = (buf_block_t*) buf_pool_watch_set(
-				page_id, &hash_lock);
-
-			if (block) {
+			if (buf_page_t *bpage= buf_pool.watch_set(
+				    page_id, &hash_lock)) {
 				/* We can release hash_lock after we
 				increment the fix count to make
 				sure that no state change takes place. */
+				bpage->fix();
+				hash_lock->write_unlock();
+				block = reinterpret_cast<buf_block_t*>(bpage);
 				fix_block = block;
-
-				if (fsp_is_system_temporary(page_id.space())) {
-					/* For temporary tablespace,
-					the mutex is being used for
-					synchronization between user
-					thread and flush thread,
-					instead of block->lock. See
-					buf_flush_page() for the flush
-					thread counterpart. */
-
-					BPageMutex*	fix_mutex
-						= buf_page_get_mutex(
-							&fix_block->page);
-					mutex_enter(fix_mutex);
-					fix_block->fix();
-					mutex_exit(fix_mutex);
-				} else {
-					fix_block->fix();
-				}
-
-				/* Now safe to release page_hash mutex */
-				rw_lock_x_unlock(hash_lock);
 				goto got_block;
 			}
 
-			rw_lock_x_unlock(hash_lock);
+			hash_lock->write_unlock();
 		}
 
 		switch (mode) {
@@ -4275,15 +2937,12 @@ loop:
 		case BUF_GET_IF_IN_POOL_OR_WATCH:
 		case BUF_PEEK_IF_IN_POOL:
 		case BUF_EVICT_IF_IN_POOL:
-			ut_ad(!rw_lock_own_flagged(
-				      hash_lock,
-				      RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
 			return(NULL);
 		}
 
 		/* The call path is buf_read_page() ->
-		buf_read_page_low() (fil_io()) ->
-		buf_page_io_complete() ->
+		buf_read_page_low() (fil_space_t::io()) ->
+		buf_page_read_complete() ->
 		buf_decrypt_after_read(). Here fil_space_t* is used
 		and we decrypt -> buf_page_check_corrupt() where page
 		checksums are compared. Decryption, decompression as
@@ -4336,11 +2995,10 @@ loop:
 			asserting. */
 			if (page_id.space() == TRX_SYS_SPACE) {
 			} else if (page_id.space() == SRV_TMP_SPACE_ID) {
-			} else if (fil_space_t* space
-				   = fil_space_acquire_for_io(
+			} else if (fil_space_t* space= fil_space_t::get(
 					   page_id.space())) {
 				bool set = dict_set_corrupted_by_space(space);
-				space->release_for_io();
+				space->release();
 				if (set) {
 					return NULL;
 				}
@@ -4359,30 +3017,16 @@ loop:
 				" See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
 		}
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		ut_a(++buf_dbg_counter % 5771 || buf_validate());
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+		if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
 		goto loop;
 	} else {
 		fix_block = block;
 	}
 
-	if (fsp_is_system_temporary(page_id.space())) {
-		/* For temporary tablespace, the mutex is being used
-		for synchronization between user thread and flush
-		thread, instead of block->lock. See buf_flush_page()
-		for the flush thread counterpart. */
-		BPageMutex*	fix_mutex = buf_page_get_mutex(
-			&fix_block->page);
-		mutex_enter(fix_mutex);
-		fix_block->fix();
-		mutex_exit(fix_mutex);
-	} else {
-		fix_block->fix();
-	}
-
-	/* Now safe to release page_hash mutex */
-	rw_lock_s_unlock(hash_lock);
+	fix_block->fix();
+	hash_lock->read_unlock();
 
 got_block:
 	switch (mode) {
@@ -4392,30 +3036,19 @@ got_block:
 	case BUF_GET_IF_IN_POOL:
 	case BUF_PEEK_IF_IN_POOL:
 	case BUF_EVICT_IF_IN_POOL:
-		buf_page_t*	fix_page = &fix_block->page;
-		BPageMutex*	fix_mutex = buf_page_get_mutex(fix_page);
-		mutex_enter(fix_mutex);
-		const bool	must_read
-			= (buf_page_get_io_fix(fix_page) == BUF_IO_READ);
-		mutex_exit(fix_mutex);
-
-		if (must_read) {
+		if (fix_block->page.io_fix() == BUF_IO_READ) {
 			/* The page is being read to buffer pool,
 			but we cannot wait around for the read to
 			complete. */
 			fix_block->unfix();
-
 			return(NULL);
 		}
 	}
 
-	switch (buf_block_get_state(fix_block)) {
-		buf_page_t*	bpage;
-
+	switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) {
 	case BUF_BLOCK_FILE_PAGE:
-		bpage = &block->page;
 		if (fsp_is_system_temporary(page_id.space())
-		    && buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+		    && block->page.io_fix() != BUF_IO_NONE) {
 			/* This suggests that the page is being flushed.
 			Avoid returning reference to this page.
 			Instead wait for the flush action to complete. */
@@ -4426,22 +3059,28 @@ got_block:
 
 		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
 evict_from_pool:
-			ut_ad(!fix_block->page.oldest_modification);
-			buf_pool_mutex_enter(buf_pool);
+			ut_ad(!fix_block->page.oldest_modification());
+			mysql_mutex_lock(&buf_pool.mutex);
 			fix_block->unfix();
 
 			if (!buf_LRU_free_page(&fix_block->page, true)) {
 				ut_ad(0);
 			}
 
-			buf_pool_mutex_exit(buf_pool);
+			mysql_mutex_unlock(&buf_pool.mutex);
 			return(NULL);
 		}
 
 		break;
+	default:
+		ut_error;
+		break;
 
 	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
+		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+			goto evict_from_pool;
+		}
+
 		if (mode == BUF_PEEK_IF_IN_POOL) {
 			/* This mode is only used for dropping an
 			adaptive hash index.  There cannot be an
@@ -4452,11 +3091,11 @@ evict_from_pool:
 			return(NULL);
 		}
 
-		bpage = &block->page;
+		buf_page_t* bpage = &block->page;
 
 		/* Note: We have already buffer fixed this block. */
-		if (bpage->buf_fix_count > 1
-		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+		if (bpage->buf_fix_count() > 1
+		    || bpage->io_fix() != BUF_IO_NONE) {
 
 			/* This condition often occurs when the buffer
 			is not buffer-fixed, but I/O-fixed by
@@ -4470,213 +3109,142 @@ evict_from_pool:
 			goto loop;
 		}
 
-		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
-			goto evict_from_pool;
-		}
-
 		/* Buffer-fix the block so that it cannot be evicted
 		or relocated while we are attempting to allocate an
 		uncompressed page. */
 
-		block = buf_LRU_get_free_block(buf_pool);
-
-		buf_pool_mutex_enter(buf_pool);
+		block = buf_LRU_get_free_block(false);
+		buf_block_init_low(block);
 
-		/* If not own buf_pool_mutex, page_hash can be changed. */
-		hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
+		mysql_mutex_lock(&buf_pool.mutex);
+		hash_lock = buf_pool.page_hash.lock_get(fold);
 
-		rw_lock_x_lock(hash_lock);
+		hash_lock->write_lock();
 
 		/* Buffer-fixing prevents the page_hash from changing. */
-		ut_ad(bpage == buf_page_hash_get_low(buf_pool, page_id));
-
-		fix_block->unfix();
-
-		buf_page_mutex_enter(block);
-		mutex_enter(&buf_pool->zip_mutex);
-
-		fix_block = block;
+		ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
 
-		if (bpage->buf_fix_count > 0
-		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+		fix_block->unfix(); /* hash_lock protects us after this */
 
-			mutex_exit(&buf_pool->zip_mutex);
+		if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) {
 			/* The block was buffer-fixed or I/O-fixed while
-			buf_pool->mutex was not held by this thread.
+			buf_pool.mutex was not held by this thread.
 			Free the block that was allocated and retry.
 			This should be extremely unlikely, for example,
 			if buf_page_get_zip() was invoked. */
 
+			hash_lock->write_unlock();
 			buf_LRU_block_free_non_file_page(block);
-			buf_pool_mutex_exit(buf_pool);
-			rw_lock_x_unlock(hash_lock);
-			buf_page_mutex_exit(block);
+			mysql_mutex_unlock(&buf_pool.mutex);
 
 			/* Try again */
 			goto loop;
 		}
 
+		fix_block = block;
+
 		/* Move the compressed page from bpage to block,
 		and uncompress it. */
 
 		/* Note: this is the uncompressed block and it is not
 		accessible by other threads yet because it is not in
 		any list or hash table */
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
 		buf_relocate(bpage, &block->page);
 
-		buf_block_init_low(block);
-
 		/* Set after buf_relocate(). */
-		block->page.buf_fix_count = 1;
-
-		block->lock_hash_val = lock_rec_hash(page_id.space(),
-						     page_id.page_no());
+		block->page.set_buf_fix_count(1);
 
-		if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-			UT_LIST_REMOVE(buf_pool->zip_clean, &block->page);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-			ut_ad(!block->page.in_flush_list);
-		} else {
-			/* Relocate buf_pool->flush_list. */
-			buf_flush_relocate_on_flush_list(bpage, &block->page);
-		}
+		buf_flush_relocate_on_flush_list(bpage, &block->page);
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 
 		/* Buffer-fix, I/O-fix, and X-latch the block
 		for the duration of the decompression.
 		Also add the block to the unzip_LRU list. */
-		block->page.state = BUF_BLOCK_FILE_PAGE;
+		block->page.set_state(BUF_BLOCK_FILE_PAGE);
 
 		/* Insert at the front of unzip_LRU list */
 		buf_unzip_LRU_add_block(block, FALSE);
 
-		buf_block_set_io_fix(block, BUF_IO_READ);
+		block->page.set_io_fix(BUF_IO_READ);
 		rw_lock_x_lock_inline(&block->lock, 0, file, line);
 
 		MEM_UNDEFINED(bpage, sizeof *bpage);
 
-		rw_lock_x_unlock(hash_lock);
-		buf_pool->n_pend_unzip++;
-		mutex_exit(&buf_pool->zip_mutex);
-		buf_pool_mutex_exit(buf_pool);
+		mysql_mutex_unlock(&buf_pool.mutex);
+		hash_lock->write_unlock();
+		buf_pool.n_pend_unzip++;
 
-		access_time = buf_page_is_accessed(&block->page);
+		access_time = block->page.is_accessed();
 
-		buf_page_mutex_exit(block);
+		if (!access_time && !recv_no_ibuf_operations
+		    && ibuf_page_exists(block->page.id(), zip_size)) {
+			block->page.ibuf_exist = true;
+		}
 
 		buf_page_free_descriptor(bpage);
 
 		/* Decompress the page while not holding
-		buf_pool->mutex or block->mutex. */
-
-		{
-			bool	success = buf_zip_decompress(block, false);
+		buf_pool.mutex. */
 
-			if (!success) {
-				buf_pool_mutex_enter(buf_pool);
-				buf_page_mutex_enter(fix_block);
-				buf_block_set_io_fix(fix_block, BUF_IO_NONE);
-				buf_page_mutex_exit(fix_block);
-
-				--buf_pool->n_pend_unzip;
-				fix_block->unfix();
-				buf_pool_mutex_exit(buf_pool);
-				rw_lock_x_unlock(&fix_block->lock);
+		if (!buf_zip_decompress(block, false)) {
+			rw_lock_x_unlock(&fix_block->lock);
+			fix_block->page.io_unfix();
+			fix_block->unfix();
+			--buf_pool.n_pend_unzip;
 
-				if (err) {
-					*err = DB_PAGE_CORRUPTED;
-				}
-				return NULL;
+			if (err) {
+				*err = DB_PAGE_CORRUPTED;
 			}
+			return NULL;
 		}
 
-		if (!access_time && !recv_no_ibuf_operations) {
-			ibuf_merge_or_delete_for_page(
-				block, block->page.id, zip_size);
-		}
-
-		buf_pool_mutex_enter(buf_pool);
-
-		buf_page_mutex_enter(fix_block);
-
-		buf_block_set_io_fix(fix_block, BUF_IO_NONE);
-
-		buf_page_mutex_exit(fix_block);
-
-		--buf_pool->n_pend_unzip;
-
-		buf_pool_mutex_exit(buf_pool);
-
 		rw_lock_x_unlock(&block->lock);
-
-		break;
-
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_error;
+		fix_block->page.io_unfix();
+		--buf_pool.n_pend_unzip;
 		break;
 	}
 
 	ut_ad(block == fix_block);
-	ut_ad(fix_block->page.buf_fix_count > 0);
+	ut_ad(fix_block->page.buf_fix_count());
 
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
-	ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
 
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-
-	if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
-	    && (ibuf_debug || buf_debug_execute_is_force_flush())) {
-
+re_evict:
+	if (mode != BUF_GET_IF_IN_POOL
+	    && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+	} else if (!ibuf_debug) {
+	} else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
 		/* Try to evict the block from the buffer pool, to use the
 		insert buffer (change buffer) as much as possible. */
 
-		buf_pool_mutex_enter(buf_pool);
+		mysql_mutex_lock(&buf_pool.mutex);
 
 		fix_block->unfix();
 
-		/* Now we are only holding the buf_pool->mutex,
-		not block->mutex or hash_lock. Blocks cannot be
-		relocated or enter or exit the buf_pool while we
-		are holding the buf_pool->mutex. */
-
-		if (buf_LRU_free_page(&fix_block->page, true)) {
-
-			buf_pool_mutex_exit(buf_pool);
-
-			/* If not own buf_pool_mutex,
-			page_hash can be changed. */
-			hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-
-			rw_lock_x_lock(hash_lock);
-
-			/* If not own buf_pool_mutex,
-			page_hash can be changed. */
-			hash_lock = buf_page_hash_lock_x_confirm(
-				hash_lock, buf_pool, page_id);
-
-			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
-				/* Set the watch, as it would have
-				been set if the page were not in the
-				buffer pool in the first place. */
-				block = (buf_block_t*) buf_pool_watch_set(
-					page_id, &hash_lock);
-			} else {
-				block = (buf_block_t*) buf_page_hash_get_low(
-					buf_pool, page_id);
-			}
-
-			rw_lock_x_unlock(hash_lock);
+		/* Blocks cannot be relocated or enter or exit the
+		buf_pool while we are holding the buf_pool.mutex. */
+		const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+		space->release();
+
+		if (evicted) {
+			hash_lock = buf_pool.page_hash.lock_get(fold);
+			hash_lock->write_lock();
+			mysql_mutex_unlock(&buf_pool.mutex);
+			/* We may set the watch, as it would have
+			been set if the page were not in the
+			buffer pool in the first place. */
+			block= reinterpret_cast<buf_block_t*>(
+				mode == BUF_GET_IF_IN_POOL_OR_WATCH
+				? buf_pool.watch_set(page_id, &hash_lock)
+				: buf_pool.page_hash_get_low(page_id, fold));
+			hash_lock->write_unlock();
 
 			if (block != NULL) {
 				/* Either the page has been read in or
 				a watch was set on that in the window
-				where we released the buf_pool::mutex
+				where we released the buf_pool.mutex
 				and before we acquire the hash_lock
 				above. Try again. */
 				guess = block;
@@ -4687,25 +3255,20 @@ evict_from_pool:
 			return(NULL);
 		}
 
-		buf_page_mutex_enter(fix_block);
-
-		if (buf_flush_page_try(buf_pool, fix_block)) {
-			guess = fix_block;
+		fix_block->fix();
+		mysql_mutex_unlock(&buf_pool.mutex);
+		buf_flush_sync();
 
-			goto loop;
+		if (fix_block->page.buf_fix_count() == 1
+		    && !fix_block->page.oldest_modification()) {
+			goto re_evict;
 		}
 
-		buf_page_mutex_exit(fix_block);
-
-		fix_block->fix();
-
 		/* Failed to evict the page; change it directly */
-
-		buf_pool_mutex_exit(buf_pool);
 	}
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
-	ut_ad(fix_block->page.buf_fix_count > 0);
+	ut_ad(fix_block->page.buf_fix_count());
 
 #ifdef UNIV_DEBUG
 	/* We have already buffer fixed the page, and we are committed to
@@ -4729,35 +3292,24 @@ evict_from_pool:
 	"btr_search_drop_page_hash_when_freed". */
 	ut_ad(mode == BUF_GET_POSSIBLY_FREED
 	      || mode == BUF_PEEK_IF_IN_POOL
-	      || !fix_block->page.file_page_was_freed);
-
-	/* Check if this is the first access to the page */
-	access_time = buf_page_is_accessed(&fix_block->page);
-
-	/* This is a heuristic and we don't care about ordering issues. */
-	if (access_time == 0) {
-		buf_page_mutex_enter(fix_block);
+	      || fix_block->page.status != buf_page_t::FREED);
 
-		buf_page_set_accessed(&fix_block->page);
-
-		buf_page_mutex_exit(fix_block);
-	}
+	const bool not_first_access = fix_block->page.set_accessed();
 
 	if (mode != BUF_PEEK_IF_IN_POOL) {
 		buf_page_make_young_if_needed(&fix_block->page);
 	}
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(++buf_dbg_counter % 5771 || buf_validate());
-	ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+	if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+	ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
 
 	/* We have to wait here because the IO_READ state was set
-	under the protection of the hash_lock and not the block->mutex
-	and block->lock. */
+	under the protection of the hash_lock and not block->lock. */
 	buf_wait_for_read(fix_block);
 
-	if (fix_block->page.id != page_id) {
+	if (fix_block->page.id() != page_id) {
 		fix_block->unfix();
 
 #ifdef UNIV_DEBUG
@@ -4773,33 +3325,53 @@ evict_from_pool:
 		return NULL;
 	}
 
-	fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr, file, line);
+	if (fix_block->page.status != buf_page_t::FREED
+	    && allow_ibuf_merge
+	    && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX
+	    && page_is_leaf(fix_block->frame)) {
+		rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
+
+		if (fix_block->page.ibuf_exist) {
+			fix_block->page.ibuf_exist = false;
+			ibuf_merge_or_delete_for_page(fix_block, page_id,
+						      zip_size);
+		}
+
+		if (rw_latch == RW_X_LATCH) {
+			mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX);
+		} else {
+			rw_lock_x_unlock(&fix_block->lock);
+			goto get_latch;
+		}
+	} else {
+get_latch:
+		fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr,
+					      file, line);
+	}
 
-	if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
+	if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) {
 		/* In the case of a first access, try to apply linear
 		read-ahead */
 
 		buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
 	}
 
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
 	return(fix_block);
 }
 
-/** This is the general function used to get access to a database page.
-It does page initialization and applies the buffered redo logs.
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in]	guess		guessed block or NULL
-@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in]	file		file name
-@param[in]	line		line where called
-@param[in]	mtr		mini-transaction
-@param[out]	err		DB_SUCCESS or error code
+@param[in]	file			file name
+@param[in]	line			line where called
+@param[in]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the pages from file.
 @return pointer to the block or NULL */
 buf_block_t*
 buf_page_get_gen(
@@ -4811,18 +3383,39 @@ buf_page_get_gen(
 	const char*		file,
 	unsigned		line,
 	mtr_t*			mtr,
-	dberr_t*		err)
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
 {
-  if (buf_block_t *block= recv_recovery_create_page(page_id))
+  if (buf_block_t *block= recv_sys.recover(page_id))
   {
     block->fix();
     ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line));
+    if (err)
+      *err= DB_SUCCESS;
+    const bool must_merge= allow_ibuf_merge &&
+      ibuf_page_exists(page_id, block->zip_size());
+    if (block->page.status == buf_page_t::FREED)
+      ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+    else if (must_merge && fil_page_get_type(block->frame) == FIL_PAGE_INDEX &&
+	     page_is_leaf(block->frame))
+    {
+      rw_lock_x_lock_inline(&block->lock, 0, file, line);
+      block->page.ibuf_exist= false;
+      ibuf_merge_or_delete_for_page(block, page_id, block->zip_size());
+
+      if (rw_latch == RW_X_LATCH)
+      {
+        mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+	return block;
+      }
+      rw_lock_x_unlock(&block->lock);
+    }
     block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
     return block;
   }
 
   return buf_page_get_low(page_id, zip_size, rw_latch,
-                          guess, mode, file, line, mtr, err);
+                          guess, mode, file, line, mtr, err, allow_ibuf_merge);
 }
 
 /********************************************************************//**
@@ -4839,55 +3432,52 @@ buf_page_optimistic_get(
 	unsigned	line,	/*!< in: line where called */
 	mtr_t*		mtr)	/*!< in: mini-transaction */
 {
-	buf_pool_t*	buf_pool;
-	unsigned	access_time;
 	ibool		success;
 
 	ut_ad(block);
 	ut_ad(mtr);
 	ut_ad(mtr->is_active());
-	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+	ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
 
-	buf_page_mutex_enter(block);
+	if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
+			  || block->page.io_fix() != BUF_IO_NONE)) {
+		return FALSE;
+	}
 
-	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+	const page_id_t id(block->page.id());
 
-		buf_page_mutex_exit(block);
+	page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
+	hash_lock->read_lock();
 
+	if (UNIV_UNLIKELY(id != block->page.id()
+			  || block->page.state() != BUF_BLOCK_FILE_PAGE
+			  || block->page.io_fix() != BUF_IO_NONE)) {
+		hash_lock->read_unlock();
 		return(FALSE);
 	}
 
 	buf_block_buf_fix_inc(block, file, line);
+	hash_lock->read_unlock();
 
-	access_time = buf_page_is_accessed(&block->page);
-
-	buf_page_set_accessed(&block->page);
-
-	buf_page_mutex_exit(block);
+	block->page.set_accessed();
 
 	buf_page_make_young_if_needed(&block->page);
 
-	ut_ad(!ibuf_inside(mtr)
-	      || ibuf_page(block->page.id, block->zip_size(), NULL));
+	ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL));
 
 	mtr_memo_type_t	fix_type;
 
-	switch (rw_latch) {
-	case RW_S_LATCH:
-		success = rw_lock_s_lock_nowait(&block->lock, file, line);
-
+	if (rw_latch == RW_S_LATCH) {
 		fix_type = MTR_MEMO_PAGE_S_FIX;
-		break;
-	case RW_X_LATCH:
+		success = rw_lock_s_lock_nowait(&block->lock, file, line);
+	} else {
+		fix_type = MTR_MEMO_PAGE_X_FIX;
 		success = rw_lock_x_lock_func_nowait_inline(
 			&block->lock, file, line);
-
-		fix_type = MTR_MEMO_PAGE_X_FIX;
-		break;
-	default:
-		ut_error; /* RW_SX_LATCH is not implemented yet */
 	}
 
+	ut_ad(id == block->page.id());
+
 	if (!success) {
 		buf_block_buf_fix_dec(block);
 		return(FALSE);
@@ -4909,132 +3499,13 @@ buf_page_optimistic_get(
 
 	mtr_memo_push(mtr, block, fix_type);
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(++buf_dbg_counter % 5771 || buf_validate());
-	ut_a(block->page.buf_fix_count > 0);
-	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	ut_d(buf_page_mutex_enter(block));
-	ut_ad(!block->page.file_page_was_freed);
-	ut_d(buf_page_mutex_exit(block));
-
-	if (!access_time) {
-		/* In the case of a first access, try to apply linear
-		read-ahead */
-		buf_read_ahead_linear(block->page.id, block->zip_size(),
-				      ibuf_inside(mtr));
-	}
-
-	buf_pool = buf_pool_from_block(block);
-	buf_pool->stat.n_page_gets++;
-
-	return(TRUE);
-}
-
-/********************************************************************//**
-This is used to get access to a known database page, when no waiting can be
-done. For example, if a search in an adaptive hash index leads us to this
-frame.
-@return TRUE if success */
-ibool
-buf_page_get_known_nowait(
-/*======================*/
-	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
-	buf_block_t*	block,	/*!< in: the known page */
-	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
-	const char*	file,	/*!< in: file name */
-	unsigned	line,	/*!< in: line where called */
-	mtr_t*		mtr)	/*!< in: mini-transaction */
-{
-	buf_pool_t*	buf_pool;
-	ibool		success;
-
-	ut_ad(mtr->is_active());
-	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
-
-	buf_page_mutex_enter(block);
-
-	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
-		/* Another thread is just freeing the block from the LRU list
-		of the buffer pool: do not try to access this page; this
-		attempt to access the page can only come through the hash
-		index because when the buffer block state is ..._REMOVE_HASH,
-		we have already removed it from the page address hash table
-		of the buffer pool. */
-
-		buf_page_mutex_exit(block);
-
-		return(FALSE);
-	}
-
-	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
-	buf_block_buf_fix_inc(block, file, line);
-
-	buf_page_set_accessed(&block->page);
-
-	buf_page_mutex_exit(block);
-
-	buf_pool = buf_pool_from_block(block);
-
-#ifdef BTR_CUR_HASH_ADAPT
-	if (mode == BUF_MAKE_YOUNG) {
-		buf_page_make_young_if_needed(&block->page);
-	}
-#endif /* BTR_CUR_HASH_ADAPT */
-
-	ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);
-
-	mtr_memo_type_t	fix_type;
-
-	switch (rw_latch) {
-	case RW_S_LATCH:
-		success = rw_lock_s_lock_nowait(&block->lock, file, line);
-		fix_type = MTR_MEMO_PAGE_S_FIX;
-		break;
-	case RW_X_LATCH:
-		success = rw_lock_x_lock_func_nowait_inline(
-			&block->lock, file, line);
-
-		fix_type = MTR_MEMO_PAGE_X_FIX;
-		break;
-	default:
-		ut_error; /* RW_SX_LATCH is not implemented yet */
-	}
-
-	if (!success) {
-		buf_block_buf_fix_dec(block);
-		return(FALSE);
-	}
-
-	mtr_memo_push(mtr, block, fix_type);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(++buf_dbg_counter % 5771 || buf_validate());
-	ut_a(block->page.buf_fix_count > 0);
-	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
 #ifdef UNIV_DEBUG
-	if (mode != BUF_KEEP_OLD) {
-		/* If mode == BUF_KEEP_OLD, we are executing an I/O
-		completion routine.  Avoid a bogus assertion failure
-		when ibuf_merge_or_delete_for_page() is processing a
-		page that was just freed due to DROP INDEX, or
-		deleting a record from SYS_INDEXES. This check will be
-		skipped in recv_recover_page() as well. */
-
-# ifdef BTR_CUR_HASH_ADAPT
-		ut_ad(!block->page.file_page_was_freed
-		      || (block->index && block->index->freed()));
-# else /* BTR_CUR_HASH_ADAPT */
-		ut_ad(!block->page.file_page_was_freed);
-# endif /* BTR_CUR_HASH_ADAPT */
-	}
+	if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
 #endif /* UNIV_DEBUG */
+	ut_ad(block->page.buf_fix_count());
+	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
 
-	buf_pool->stat.n_page_gets++;
+	buf_pool.stat.n_page_gets++;
 
 	return(TRUE);
 }
@@ -5054,650 +3525,265 @@ buf_page_try_get_func(
 	unsigned		line,
 	mtr_t*			mtr)
 {
-	buf_block_t*	block;
-	ibool		success;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	rw_lock_t*	hash_lock;
-
-	ut_ad(mtr);
-	ut_ad(mtr->is_active());
-
-	block = buf_block_hash_get_s_locked(buf_pool, page_id, &hash_lock);
-
-	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
-		if (block) {
-			rw_lock_s_unlock(hash_lock);
-		}
-		return(NULL);
-	}
-
-	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
-
-	buf_page_mutex_enter(block);
-	rw_lock_s_unlock(hash_lock);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_a(page_id == block->page.id);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_block_buf_fix_inc(block, file, line);
-	buf_page_mutex_exit(block);
-
-	mtr_memo_type_t	fix_type = MTR_MEMO_PAGE_S_FIX;
-	success = rw_lock_s_lock_nowait(&block->lock, file, line);
-
-	if (!success) {
-		/* Let us try to get an X-latch. If the current thread
-		is holding an X-latch on the page, we cannot get an
-		S-latch. */
-
-		fix_type = MTR_MEMO_PAGE_X_FIX;
-		success = rw_lock_x_lock_func_nowait_inline(&block->lock,
-							    file, line);
-	}
-
-	if (!success) {
-		buf_block_buf_fix_dec(block);
-		return(NULL);
-	}
-
-	mtr_memo_push(mtr, block, fix_type);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(++buf_dbg_counter % 5771 || buf_validate());
-	ut_a(block->page.buf_fix_count > 0);
-	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	ut_d(buf_page_mutex_enter(block));
-	ut_d(ut_a(!block->page.file_page_was_freed));
-	ut_d(buf_page_mutex_exit(block));
-
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	buf_pool->stat.n_page_gets++;
-
-	return(block);
-}
-
-/********************************************************************//**
-Initialize some fields of a control block. */
-UNIV_INLINE
-void
-buf_page_init_low(
-/*==============*/
-	buf_page_t*	bpage)	/*!< in: block to init */
-{
-	bpage->flush_type = BUF_FLUSH_LRU;
-	bpage->io_fix = BUF_IO_NONE;
-	bpage->buf_fix_count = 0;
-	bpage->old = 0;
-	bpage->freed_page_clock = 0;
-	bpage->access_time = 0;
-	bpage->newest_modification = 0;
-	bpage->oldest_modification = 0;
-	bpage->real_size = 0;
-	bpage->slot = NULL;
-
-	HASH_INVALIDATE(bpage, hash);
-
-	ut_d(bpage->file_page_was_freed = FALSE);
-}
-
-/** Inits a page to the buffer buf_pool.
-@param[in,out]	buf_pool	buffer pool
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out]	block		block to init */
-static
-void
-buf_page_init(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id,
-	ulint			zip_size,
-	buf_block_t*		block)
-{
-	buf_page_t*	hash_page;
-
-	ut_ad(buf_pool == buf_pool_get(page_id));
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	ut_ad(buf_page_mutex_own(block));
-	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
-
-	ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, page_id),
-			  RW_LOCK_X));
-
-	/* Set the state of the block */
-	buf_block_set_file_page(block, page_id);
-
-	buf_block_init_low(block);
-
-	block->lock_hash_val = lock_rec_hash(page_id.space(),
-					     page_id.page_no());
-
-	buf_page_init_low(&block->page);
-
-	/* Insert into the hash table of file pages */
-
-	hash_page = buf_page_hash_get_low(buf_pool, page_id);
-
-	if (hash_page == NULL) {
-		/* Block not found in hash table */
-	} else if (UNIV_LIKELY(buf_pool_watch_is_sentinel(buf_pool,
-							  hash_page))) {
-		/* Preserve the reference count. */
-		ib_uint32_t	buf_fix_count = hash_page->buf_fix_count;
-
-		ut_a(buf_fix_count > 0);
-
-		block->page.buf_fix_count += buf_fix_count;
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+
+  page_hash_latch *hash_lock;
+  buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
+                                                          page_id.fold(),
+                                                          &hash_lock);
+  if (!bpage)
+    return nullptr;
+  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+  {
+    hash_lock->read_unlock();
+    return nullptr;
+  }
 
-		buf_pool_watch_remove(buf_pool, hash_page);
-	} else {
-		ib::fatal() << "Page already foudn in the hash table: "
-			    << page_id;
-	}
+  buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+  buf_block_buf_fix_inc(block, file, line);
+  hash_lock->read_unlock();
 
-	ut_ad(!block->page.in_zip_hash);
-	ut_ad(!block->page.in_page_hash);
-	ut_d(block->page.in_page_hash = TRUE);
+  mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
+  if (!rw_lock_s_lock_nowait(&block->lock, file, line))
+  {
+    /* Let us try to get an X-latch. If the current thread
+    is holding an X-latch on the page, we cannot get an S-latch. */
+    fix_type= MTR_MEMO_PAGE_X_FIX;
+    if (!rw_lock_x_lock_func_nowait_inline(&block->lock, file, line))
+    {
+      buf_block_buf_fix_dec(block);
+      return nullptr;
+    }
+  }
 
-	block->page.id = page_id;
+  mtr_memo_push(mtr, block, fix_type);
 
-	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
-		    page_id.fold(), &block->page);
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  ut_ad(bpage->buf_fix_count());
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+  ut_ad(bpage->id() == page_id);
+  buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
 
-	page_zip_set_size(&block->page.zip, zip_size);
+  buf_pool.stat.n_page_gets++;
+  return block;
 }
 
-/** Initialize a page for read to the buffer buf_pool. If the page is
-(1) already in buf_pool, or
-(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
-(3) if the space is deleted or being deleted,
-then this function does nothing.
-Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
-on the buffer frame. The io-handler must take care that the flag is cleared
-and the lock released later.
-@param[out]	err			DB_SUCCESS or DB_TABLESPACE_DELETED
-@param[in]	mode			BUF_READ_IBUF_PAGES_ONLY, ...
-@param[in]	page_id			page id
-@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	unzip			whether the uncompressed page is
-					requested (for ROW_FORMAT=COMPRESSED)
-@return pointer to the block
-@retval	NULL	in case of an error */
-buf_page_t*
-buf_page_init_for_read(
-	dberr_t*		err,
-	ulint			mode,
-	const page_id_t		page_id,
-	ulint			zip_size,
-	bool			unzip)
+/** Initialize the block.
+@param page_id  page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param fix      initial buf_fix_count() */
+void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
+                             uint32_t fix)
 {
-	buf_block_t*	block;
-	buf_page_t*	bpage	= NULL;
-	buf_page_t*	watch_page;
-	rw_lock_t*	hash_lock;
-	mtr_t		mtr;
-	bool		lru	= false;
-	void*		data;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-	ut_ad(buf_pool);
-
-	*err = DB_SUCCESS;
-
-	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
-		/* It is a read-ahead within an ibuf routine */
-
-		ut_ad(!ibuf_bitmap_page(page_id, zip_size));
-
-		ibuf_mtr_start(&mtr);
-
-		if (!recv_no_ibuf_operations
-		    && !ibuf_page(page_id, zip_size, &mtr)) {
-
-			ibuf_mtr_commit(&mtr);
-
-			return(NULL);
-		}
-	} else {
-		ut_ad(mode == BUF_READ_ANY_PAGE);
-	}
-
-	if (zip_size && !unzip && !recv_recovery_is_on()) {
-		block = NULL;
-	} else {
-		block = buf_LRU_get_free_block(buf_pool);
-		ut_ad(block);
-		ut_ad(buf_pool_from_block(block) == buf_pool);
-	}
-
-	buf_pool_mutex_enter(buf_pool);
-
-	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-	rw_lock_x_lock(hash_lock);
-
-	watch_page = buf_page_hash_get_low(buf_pool, page_id);
-	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
-		/* The page is already in the buffer pool. */
-		watch_page = NULL;
-		rw_lock_x_unlock(hash_lock);
-		if (block) {
-			buf_page_mutex_enter(block);
-			buf_LRU_block_free_non_file_page(block);
-			buf_page_mutex_exit(block);
-		}
-
-		bpage = NULL;
-		goto func_exit;
-	}
-
-	if (block) {
-		bpage = &block->page;
-
-		buf_page_mutex_enter(block);
-
-		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
-
-		buf_page_init(buf_pool, page_id, zip_size, block);
-
-		/* Note: We are using the hash_lock for protection. This is
-		safe because no other thread can lookup the block from the
-		page hashtable yet. */
-
-		buf_page_set_io_fix(bpage, BUF_IO_READ);
-
-		rw_lock_x_unlock(hash_lock);
-
-		/* The block must be put to the LRU list, to the old blocks */
-		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
-
-		/* We set a pass-type x-lock on the frame because then
-		the same thread which called for the read operation
-		(and is running now at this point of code) can wait
-		for the read to complete by waiting for the x-lock on
-		the frame; if the x-lock were recursive, the same
-		thread would illegally get the x-lock before the page
-		read is completed.  The x-lock is cleared by the
-		io-handler thread. */
-
-		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
-
-		if (zip_size) {
-			/* buf_pool->mutex may be released and
-			reacquired by buf_buddy_alloc().  Thus, we
-			must release block->mutex in order not to
-			break the latching order in the reacquisition
-			of buf_pool->mutex.  We also must defer this
-			operation until after the block descriptor has
-			been added to buf_pool->LRU and
-			buf_pool->page_hash. */
-			buf_page_mutex_exit(block);
-			data = buf_buddy_alloc(buf_pool, zip_size, &lru);
-			buf_page_mutex_enter(block);
-			block->page.zip.data = (page_zip_t*) data;
-
-			/* To maintain the invariant
-			block->in_unzip_LRU_list
-			== buf_page_belongs_to_unzip_LRU(&block->page)
-			we have to add this block to unzip_LRU
-			after block->page.zip.data is set. */
-			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
-			buf_unzip_LRU_add_block(block, TRUE);
-		}
-
-		buf_page_mutex_exit(block);
-	} else {
-		rw_lock_x_unlock(hash_lock);
-
-		/* The compressed page must be allocated before the
-		control block (bpage), in order to avoid the
-		invocation of buf_buddy_relocate_block() on
-		uninitialized data. */
-		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
-
-		rw_lock_x_lock(hash_lock);
-
-		/* If buf_buddy_alloc() allocated storage from the LRU list,
-		it released and reacquired buf_pool->mutex.  Thus, we must
-		check the page_hash again, as it may have been modified. */
-		if (UNIV_UNLIKELY(lru)) {
-
-			watch_page = buf_page_hash_get_low(buf_pool, page_id);
-
-			if (UNIV_UNLIKELY(watch_page
-			    && !buf_pool_watch_is_sentinel(buf_pool,
-							   watch_page))) {
-
-				/* The block was added by some other thread. */
-				rw_lock_x_unlock(hash_lock);
-				watch_page = NULL;
-				buf_buddy_free(buf_pool, data, zip_size);
-
-				bpage = NULL;
-				goto func_exit;
-			}
-		}
-
-		bpage = buf_page_alloc_descriptor();
-
-		/* Initialize the buf_pool pointer. */
-		bpage->buf_pool_index = buf_pool_index(buf_pool);
-
-		page_zip_des_init(&bpage->zip);
-		page_zip_set_size(&bpage->zip, zip_size);
-		bpage->zip.data = (page_zip_t*) data;
-
-		mutex_enter(&buf_pool->zip_mutex);
-
-		buf_page_init_low(bpage);
-
-		bpage->state = BUF_BLOCK_ZIP_PAGE;
-		bpage->id = page_id;
-		bpage->flush_observer = NULL;
-
-		ut_d(bpage->in_page_hash = FALSE);
-		ut_d(bpage->in_zip_hash = FALSE);
-		ut_d(bpage->in_flush_list = FALSE);
-		ut_d(bpage->in_free_list = FALSE);
-		ut_d(bpage->in_LRU_list = FALSE);
-
-		ut_d(bpage->in_page_hash = TRUE);
-
-		if (watch_page != NULL) {
-
-			/* Preserve the reference count. */
-			ib_uint32_t	buf_fix_count;
-
-			buf_fix_count = watch_page->buf_fix_count;
-
-			ut_a(buf_fix_count > 0);
-
-			bpage->buf_fix_count += buf_fix_count;
-
-			ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
-			buf_pool_watch_remove(buf_pool, watch_page);
-		}
-
-		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
-			    bpage->id.fold(), bpage);
-
-		rw_lock_x_unlock(hash_lock);
-
-		/* The block must be put to the LRU list, to the old blocks.
-		The zip size is already set into the page zip */
-		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		buf_LRU_insert_zip_clean(bpage);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-		buf_page_set_io_fix(bpage, BUF_IO_READ);
-
-		mutex_exit(&buf_pool->zip_mutex);
-	}
-
-	buf_pool->n_pend_reads++;
-func_exit:
-	buf_pool_mutex_exit(buf_pool);
-
-	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
-
-		ibuf_mtr_commit(&mtr);
-	}
-
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-	ut_ad(!bpage || buf_page_in_file(bpage));
-
-	return(bpage);
+  ut_ad(page.state() != BUF_BLOCK_FILE_PAGE);
+  buf_block_init_low(this);
+  page.init(page_id, fix);
+  page_zip_set_size(&page.zip, zip_size);
 }
 
 /** Initialize a page in the buffer pool. The page is usually not read
 from a file even if it cannot be found in the buffer buf_pool. This is one
 of the functions which perform to a block a state transition NOT_USED =>
 FILE_PAGE (the other is buf_page_get_gen).
-@param[in]	page_id		page id
+@param[in,out]	space		space object
+@param[in]	offset		page number within the tablespace
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in,out]	mtr		mini-transaction
+@param[in,out]	free_block	pre-allocated buffer block
 @return pointer to the block, page bufferfixed */
 buf_block_t*
-buf_page_create(
-	const page_id_t		page_id,
-	ulint			zip_size,
-	mtr_t*			mtr)
+buf_page_create(fil_space_t *space, uint32_t offset,
+                ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
 {
-	buf_frame_t*	frame;
-	buf_block_t*	block;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	rw_lock_t*	hash_lock;
-
-	ut_ad(mtr->is_active());
-	ut_ad(page_id.space() != 0 || !zip_size);
-loop:
-	buf_block_t *free_block = buf_LRU_get_free_block(buf_pool);
+  page_id_t page_id(space->id, offset);
+  ut_ad(mtr->is_active());
+  ut_ad(page_id.space() != 0 || !zip_size);
 
-	buf_pool_mutex_enter(buf_pool);
+  space->free_page(offset, false);
+  free_block->initialise(page_id, zip_size, 1);
 
-	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
-	rw_lock_x_lock(hash_lock);
+  const ulint fold= page_id.fold();
+  mysql_mutex_lock(&buf_pool.mutex);
 
-	block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
+loop:
+  buf_block_t *block= reinterpret_cast<buf_block_t*>
+    (buf_pool.page_hash_get_low(page_id, fold));
 
-	if (UNIV_LIKELY_NULL(block)
-	    && buf_page_in_file(&block->page)
-	    && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
-		ut_d(block->page.file_page_was_freed = FALSE);
-		bool have_x_latch = false;
+  if (block && block->page.in_file() &&
+      !buf_pool.watch_is_sentinel(block->page))
+  {
 #ifdef BTR_CUR_HASH_ADAPT
-		const dict_index_t *drop_hash_entry= nullptr;
+    const dict_index_t *drop_hash_entry= nullptr;
 #endif
-		switch (const auto page_state= buf_block_get_state(block)) {
-		default:
-			ut_ad(0);
-			break;
-		case BUF_BLOCK_ZIP_PAGE:
-		case BUF_BLOCK_ZIP_DIRTY:
-			buf_block_init_low(free_block);
-			mutex_enter(&buf_pool->zip_mutex);
-
-			buf_page_mutex_enter(free_block);
-			if (buf_page_get_io_fix(&block->page) != BUF_IO_NONE) {
-				mutex_exit(&buf_pool->zip_mutex);
-				rw_lock_x_unlock(hash_lock);
-				buf_LRU_block_free_non_file_page(free_block);
-				buf_pool_mutex_exit(buf_pool);
-				buf_page_mutex_exit(free_block);
-
-				goto loop;
-			}
-
-			rw_lock_x_lock(&free_block->lock);
-
-			buf_relocate(&block->page, &free_block->page);
-			if (page_state == BUF_BLOCK_ZIP_DIRTY) {
-				ut_ad(block->page.in_flush_list);
-				ut_ad(block->page.oldest_modification > 0);
-				buf_flush_relocate_on_flush_list(
-					&block->page, &free_block->page);
-			} else {
-				ut_ad(block->page.oldest_modification == 0);
-				ut_ad(!block->page.in_flush_list);
-#ifdef UNIV_DEBUG
-				UT_LIST_REMOVE(
-					buf_pool->zip_clean, &block->page);
+    switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+    default:
+      ut_ad(0);
+      break;
+    case BUF_BLOCK_FILE_PAGE:
+      if (!mtr->have_x_latch(*block))
+      {
+        buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+        while (!rw_lock_x_lock_nowait(&block->lock))
+        {
+          /* Wait for buf_page_write_complete() to release block->lock.
+          We must not hold buf_pool.mutex while waiting. */
+          timespec abstime;
+          set_timespec_nsec(abstime, 1000000);
+          my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+                            &abstime);
+        }
+        mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+      }
+      else
+      {
+        ut_ad(!block->page.ibuf_exist);
+#ifdef BTR_CUR_HASH_ADAPT
+        ut_ad(!block->index);
 #endif
-			}
-
-			free_block->page.state = BUF_BLOCK_FILE_PAGE;
-			mutex_exit(&buf_pool->zip_mutex);
-			free_block->lock_hash_val = lock_rec_hash(
-				page_id.space(), page_id.page_no());
-			buf_unzip_LRU_add_block(free_block, false);
-			buf_page_free_descriptor(&block->page);
-			block = free_block;
-			buf_block_buf_fix_inc(block, __FILE__, __LINE__);
-			buf_page_mutex_exit(free_block);
-			free_block = nullptr;
-			break;
-		case BUF_BLOCK_FILE_PAGE:
-			have_x_latch = mtr->have_x_latch(*block);
-			if (!have_x_latch) {
-				buf_block_buf_fix_inc(block,
-						      __FILE__, __LINE__);
-				while (buf_block_get_io_fix(block)
-				       != BUF_IO_NONE
-				       || block->page.buf_fix_count != 1) {
-					buf_pool_mutex_exit(buf_pool);
-					rw_lock_x_unlock(hash_lock);
-
-					os_thread_sleep(1000);
-
-					buf_pool_mutex_enter(buf_pool);
-					rw_lock_x_lock(hash_lock);
-				}
-
-				rw_lock_x_lock(&block->lock);
-			}
+      }
 #ifdef BTR_CUR_HASH_ADAPT
-			drop_hash_entry = block->index;
+      drop_hash_entry= block->index;
 #endif
-			break;
-		}
-		/* Page can be found in buf_pool */
-		buf_pool_mutex_exit(buf_pool);
-		rw_lock_x_unlock(hash_lock);
-
-		if (free_block) {
-			buf_block_free(free_block);
-		}
+      break;
+    case BUF_BLOCK_ZIP_PAGE:
+      page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+      hash_lock->write_lock();
+      if (block->page.io_fix() != BUF_IO_NONE)
+      {
+        hash_lock->write_unlock();
+        /* Wait for buf_page_write_complete() to release the I/O fix. */
+        timespec abstime;
+        set_timespec_nsec(abstime, 1000000);
+        my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+                          &abstime);
+        goto loop;
+      }
+
+      rw_lock_x_lock(&free_block->lock);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      buf_relocate(&block->page, &free_block->page);
+      buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+      free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
+      buf_unzip_LRU_add_block(free_block, FALSE);
+      hash_lock->write_unlock();
+      buf_page_free_descriptor(&block->page);
+      block= free_block;
+      buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+      mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+      break;
+    }
+
+    mysql_mutex_unlock(&buf_pool.mutex);
 
 #ifdef BTR_CUR_HASH_ADAPT
-		if (UNIV_LIKELY_NULL(drop_hash_entry)) {
-			btr_search_drop_page_hash_index(block);
-		}
+    if (drop_hash_entry)
+      btr_search_drop_page_hash_index(block);
 #endif /* BTR_CUR_HASH_ADAPT */
 
-		if (!have_x_latch) {
-			mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
-		}
-
-		return block;
-	}
-
-	/* If we get here, the page was not in buf_pool: init it there */
-
-	DBUG_PRINT("ib_buf", ("create page %u:%u",
-			      page_id.space(), page_id.page_no()));
+    if (block->page.ibuf_exist)
+    {
+      if (!recv_recovery_is_on())
+        ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+      block->page.ibuf_exist= false;
+    }
 
-	block = free_block;
-
-	buf_page_mutex_enter(block);
-
-	buf_page_init(buf_pool, page_id, zip_size, block);
-
-	rw_lock_x_lock(&block->lock);
-
-	rw_lock_x_unlock(hash_lock);
+    return block;
+  }
 
-	/* The block must be put to the LRU list */
-	buf_LRU_add_block(&block->page, FALSE);
+  /* If we get here, the page was not in buf_pool: init it there */
 
-	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
-	buf_pool->stat.n_pages_created++;
+  DBUG_PRINT("ib_buf", ("create page %u:%u",
+                        page_id.space(), page_id.page_no()));
 
-	if (zip_size) {
-		void*	data;
-		bool	lru;
-
-		/* Prevent race conditions during buf_buddy_alloc(),
-		which may release and reacquire buf_pool->mutex,
-		by IO-fixing and X-latching the block. */
-
-		buf_page_set_io_fix(&block->page, BUF_IO_READ);
-
-		buf_page_mutex_exit(block);
-		/* buf_pool->mutex may be released and reacquired by
-		buf_buddy_alloc().  Thus, we must release block->mutex
-		in order not to break the latching order in
-		the reacquisition of buf_pool->mutex.  We also must
-		defer this operation until after the block descriptor
-		has been added to buf_pool->LRU and buf_pool->page_hash. */
-		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
-		buf_page_mutex_enter(block);
-		block->page.zip.data = (page_zip_t*) data;
-
-		/* To maintain the invariant
-		block->in_unzip_LRU_list
-		== buf_page_belongs_to_unzip_LRU(&block->page)
-		we have to add this block to unzip_LRU after
-		block->page.zip.data is set. */
-		ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
-		buf_unzip_LRU_add_block(block, FALSE);
+  block= free_block;
 
-		buf_page_set_io_fix(&block->page, BUF_IO_NONE);
-	}
+  /* Duplicate buf_block_buf_fix_inc_func(); initialise() set the fix count */
+  ut_ad(block->page.buf_fix_count() == 1);
+  ut_ad(fsp_is_system_temporary(page_id.space()) ||
+        rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
 
-	buf_pool_mutex_exit(buf_pool);
+  /* The block must be put to the LRU list */
+  buf_LRU_add_block(&block->page, false);
+  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+  hash_lock->write_lock();
+  block->page.set_state(BUF_BLOCK_FILE_PAGE);
+  ut_d(block->page.in_page_hash= true);
+  HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
 
-	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+  rw_lock_x_lock(&block->lock);
+  if (UNIV_UNLIKELY(zip_size))
+  {
+    /* Prevent race conditions during buf_buddy_alloc(), which may
+    release and reacquire buf_pool.mutex, by IO-fixing and X-latching
+    the block. */
+    block->page.set_io_fix(BUF_IO_READ);
+    hash_lock->write_unlock();
+
+    /* buf_pool.mutex may be released and reacquired by
+    buf_buddy_alloc(). We must defer this operation until
+    after the block descriptor has been added to
+    buf_pool.LRU and buf_pool.page_hash. */
+    block->page.zip.data= buf_buddy_alloc(zip_size);
+
+    /* To maintain the invariant block->in_unzip_LRU_list ==
+    block->page.belongs_to_unzip_LRU() we have to add this
+    block to unzip_LRU after block->page.zip.data is set. */
+    ut_ad(block->page.belongs_to_unzip_LRU());
+    buf_unzip_LRU_add_block(block, FALSE);
+
+    block->page.set_io_fix(BUF_IO_NONE);
+  }
+  else
+    hash_lock->write_unlock();
 
-	buf_page_set_accessed(&block->page);
+  mysql_mutex_unlock(&buf_pool.mutex);
 
-	buf_page_mutex_exit(block);
+  mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+  block->page.set_accessed();
+  buf_pool.stat.n_pages_created++;
 
-	/* Delete possible entries for the page from the insert buffer:
-	such can exist if the page belonged to an index which was dropped */
-	if (!recv_recovery_is_on()) {
-		ibuf_merge_or_delete_for_page(NULL, page_id, zip_size);
-	}
+  /* Delete possible entries for the page from the insert buffer:
+  such can exist if the page belonged to an index which was dropped */
+  if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+      !srv_is_undo_tablespace(page_id.space()) &&
+      !recv_recovery_is_on())
+    ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
 
-	frame = block->frame;
+  static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+  memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
+  mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
 
-	memset(frame + FIL_PAGE_PREV, 0xff, 4);
-	memset(frame + FIL_PAGE_NEXT, 0xff, 4);
-	mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+  /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+  following pages:
+  (1) The first page of the InnoDB system tablespace (page 0:0)
+  (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+  (3) key_version on encrypted pages (not page 0:0) */
 
-	/* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
-	following pages:
-	(1) The first page of the InnoDB system tablespace (page 0:0)
-	(2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
-	(3) key_version on encrypted pages (not page 0:0) */
+  memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+  memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8);
 
-	memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
-	memset(frame + FIL_PAGE_LSN, 0, 8);
-	/* mark page as just allocated for check in
-	buf_flush_init_for_writing() */
-	ut_d(memset(frame + FIL_PAGE_SPACE_OR_CHKSUM, 0, 4));
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(++buf_dbg_counter % 5771 || buf_validate());
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-	return(block);
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  return block;
 }
 
-/********************************************************************//**
-Monitor the buffer page read/write activity, and increment corresponding
-counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is
-enabled. */
-static
-void
-buf_page_monitor(
-/*=============*/
-	const buf_page_t*	bpage,	/*!< in: pointer to the block */
-	enum buf_io_fix		io_type)/*!< in: io_fix types */
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage   buffer page whose read or write was completed
+@param io_type BUF_IO_READ or BUF_IO_WRITE */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type)
 {
 	const byte*	frame;
 	monitor_id_t	counter;
 
-	/* If the counter module is not turned on, just return */
-	if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
-		return;
-	}
-
-	ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+	ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
 
 	frame = bpage->zip.data
 		? bpage->zip.data
@@ -5803,95 +3889,88 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
 	}
 }
 
-/** Mark a table corrupted.
-@param[in]	bpage	Corrupted page
-@param[in]	space	Corrupted page belongs to tablespace
-Also remove the bpage from LRU list. */
-static
-void
-buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space)
+/** Release and evict a corrupted page.
+@param bpage    page that was being read */
+ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	const ibool	uncompressed = (buf_page_get_state(bpage)
-					== BUF_BLOCK_FILE_PAGE);
-	page_id_t	old_page_id = bpage->id;
-
-	/* First unfix and release lock on the bpage */
-	buf_pool_mutex_enter(buf_pool);
-	mutex_enter(buf_page_get_mutex(bpage));
-	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
-	ut_ad(bpage->id.space() == space->id);
-
-	/* buf_fix_count can be greater than zero. Because other thread
-	can wait in buf_page_wait_read() for the page to be read. */
-
-	bpage->id.set_corrupt_id();
-	/* Set BUF_IO_NONE before we remove the block from LRU list */
-	buf_page_set_io_fix(bpage, BUF_IO_NONE);
-
-	if (uncompressed) {
-		rw_lock_x_unlock_gen(
-			&((buf_block_t*) bpage)->lock,
-			BUF_IO_READ);
-	}
+  const page_id_t id(bpage->id());
+  page_hash_latch *hash_lock= hash_lock_get(id);
 
-	mutex_exit(buf_page_get_mutex(bpage));
+  mysql_mutex_lock(&mutex);
+  hash_lock->write_lock();
 
-	if (!srv_force_recovery) {
-		buf_mark_space_corrupt(bpage, *space);
-	}
+  ut_ad(bpage->io_fix() == BUF_IO_READ);
+  ut_ad(!bpage->oldest_modification());
+  bpage->set_corrupt_id();
 
-	/* After this point bpage can't be referenced. */
-	buf_LRU_free_one_page(bpage, old_page_id);
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+    rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+                         BUF_IO_READ);
 
-	ut_ad(buf_pool->n_pend_reads > 0);
-	buf_pool->n_pend_reads--;
+  bpage->io_unfix();
+
+  /* remove from LRU and page_hash */
+  buf_LRU_free_one_page(bpage, id, hash_lock);
+  mysql_mutex_unlock(&mutex);
+
+  ut_d(auto n=) n_pend_reads--;
+  ut_ad(n > 0);
+}
+
+/** Evict a corrupted page from the buffer pool and, unless
+srv_force_recovery is set, call buf_mark_space_corrupt() for its tablespace.
+@param[in]	bpage	Corrupted page
+@param[in]	node	data file */
+ATTRIBUTE_COLD
+static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node)
+{
+  ut_ad(bpage->id().space() == node.space->id);
+  buf_pool.corrupted_evict(bpage);
 
-	buf_pool_mutex_exit(buf_pool);
+  if (!srv_force_recovery)
+    buf_mark_space_corrupt(bpage, *node.space);
 }
 
 /** Check if the encrypted page is corrupted for the full crc32 format.
 @param[in]	space_id	page belongs to space id
-@param[in]	dst_frame	page
+@param[in]	d		page
 @param[in]	is_compressed	compressed page
 @return true if page is corrupted or false if it isn't */
-static bool buf_page_full_crc32_is_corrupted(
-	ulint		space_id,
-	const byte*	dst_frame,
-	bool		is_compressed)
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+                                             bool is_compressed)
 {
-	if (!is_compressed
-	    && memcmp(dst_frame + FIL_PAGE_LSN + 4,
-		      dst_frame + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4)) {
-		return true;
-	}
+  if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+    return true;
 
-	if (space_id != mach_read_from_4(dst_frame + FIL_PAGE_SPACE_ID)) {
-		return true;
-	}
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
 
-	return false;
+  return !is_compressed &&
+    memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+                      d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
 }
 
 /** Check if page is maybe compressed, encrypted or both when we encounter
 corrupted page. Note that we can't be 100% sure if page is corrupted
 or decrypt/decompress just failed.
 @param[in,out]	bpage		page
-@param[in,out]	space		tablespace from fil_space_acquire_for_io()
+@param[in]	node		data file
 @return	whether the operation succeeded
 @retval	DB_SUCCESS		if page has been read and is not corrupted
 @retval	DB_PAGE_CORRUPTED	if page based on checksum check is corrupted
 @retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
 after decryption normal page checksum does not match.
 @retval	DB_TABLESPACE_DELETED	if accessed tablespace is not found */
-static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+                                      const fil_node_t &node)
 {
-	ut_ad(space->pending_io());
+	ut_ad(node.space->referenced());
 
 	byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
 		((buf_block_t*) bpage)->frame;
 	dberr_t err = DB_SUCCESS;
-	uint key_version = buf_page_get_key_version(dst_frame, space->flags);
+	uint key_version = buf_page_get_key_version(dst_frame,
+						    node.space->flags);
 
 	/* In buf_decrypt_after_read we have either decrypted the page if
 	page post encryption checksum matches and used key_id is found
@@ -5899,33 +3978,35 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
 	not decrypted and it could be either encrypted and corrupted
 	or corrupted or good page. If we decrypted, there page could
 	still be corrupted if used key does not match. */
-	const bool seems_encrypted = !space->full_crc32() && key_version
-		&& space->crypt_data
-		&& space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
-	ut_ad(space->purpose != FIL_TYPE_TEMPORARY || space->full_crc32());
+	const bool seems_encrypted = !node.space->full_crc32() && key_version
+		&& node.space->crypt_data
+		&& node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+	ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+	      node.space->full_crc32());
 
 	/* If traditional checksums match, we assume that page is
 	not anymore encrypted. */
-	if (space->full_crc32()
+	if (node.space->full_crc32()
 	    && !buf_is_zeroes(span<const byte>(dst_frame,
-					       space->physical_size()))
-	    && (key_version || space->is_compressed()
-		|| space->purpose == FIL_TYPE_TEMPORARY)) {
+					       node.space->physical_size()))
+	    && (key_version || node.space->is_compressed()
+		|| node.space->purpose == FIL_TYPE_TEMPORARY)) {
 		if (buf_page_full_crc32_is_corrupted(
-			    space->id, dst_frame, space->is_compressed())) {
+			    bpage->id().space(), dst_frame,
+			    node.space->is_compressed())) {
 			err = DB_PAGE_CORRUPTED;
 		}
-	} else if (buf_page_is_corrupted(true, dst_frame, space->flags)) {
+	} else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
 		err = DB_PAGE_CORRUPTED;
 	}
 
 	if (seems_encrypted && err == DB_PAGE_CORRUPTED
-	    && bpage->id.page_no() != 0) {
+	    && bpage->id().page_no() != 0) {
 		err = DB_DECRYPTION_FAILED;
 
 		ib::error()
-			<< "The page " << bpage->id << " in file '"
-			<< space->chain.start->name
+			<< "The page " << bpage->id()
+			<< " in file '" << node.name
 			<< "' cannot be decrypted.";
 
 		ib::info()
@@ -5934,7 +4015,7 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
 			<< " is not found or"
 			" used encryption algorithm or method does not match.";
 
-		if (bpage->id.space() != TRX_SYS_SPACE) {
+		if (bpage->id().space() != TRX_SYS_SPACE) {
 			ib::info()
 				<< "Marking tablespace as missing."
 				" You may drop this table or"
@@ -5946,660 +4027,326 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
 	return (err);
 }
 
-/** Complete a read or write request of a file page to or from the buffer pool.
-@param[in,out]	bpage	page to complete
-@param[in]	dblwr	whether the doublewrite buffer was used (on write)
-@param[in]	evict	whether or not to evict the page from LRU list
+/** Complete a read request of a file page to buf_pool.
+@param bpage    recently read page
+@param node     data file
 @return whether the operation succeeded
-@retval	DB_SUCCESS		always when writing, or if a read page was OK
-@retval	DB_TABLESPACE_DELETED	if the tablespace does not exist
-@retval	DB_PAGE_CORRUPTED	if the checksum fails on a page read
-@retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
-				after decryption normal page checksum does
-				not match */
-UNIV_INTERN
-dberr_t
-buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict)
+@retval DB_SUCCESS              if the page was read and is not corrupted
+@retval DB_PAGE_CORRUPTED       if the checksum fails on a page read
+@retval DB_DECRYPTION_FAILED    if the page cannot be decrypted */
+dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
 {
-	enum buf_io_fix	io_type;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	const bool	uncompressed = (buf_page_get_state(bpage)
-					== BUF_BLOCK_FILE_PAGE);
-	ut_a(buf_page_in_file(bpage));
-
-	/* We do not need protect io_fix here by mutex to read
-	it because this is the only function where we can change the value
-	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
-	ensures that this is the only thread that handles the i/o for this
-	block. */
-
-	io_type = buf_page_get_io_fix(bpage);
-	ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
-	ut_ad(!!bpage->zip.ssize == (bpage->zip.data != NULL));
-	ut_ad(uncompressed || bpage->zip.data);
-
-	if (io_type == BUF_IO_READ) {
-		ulint	read_page_no = 0;
-		ulint	read_space_id = 0;
-		byte*	frame = bpage->zip.data
-			? bpage->zip.data
-			: reinterpret_cast<buf_block_t*>(bpage)->frame;
-		ut_ad(frame);
-		fil_space_t* space = fil_space_acquire_for_io(
-			bpage->id.space());
-		if (!space) {
-			return DB_TABLESPACE_DELETED;
-		}
-
-		dberr_t	err;
-
-		if (!buf_page_decrypt_after_read(bpage, space)) {
-			err = DB_DECRYPTION_FAILED;
-			goto database_corrupted;
-		}
-
-		if (bpage->zip.data && uncompressed) {
-			buf_pool->n_pend_unzip++;
-			ibool ok = buf_zip_decompress((buf_block_t*) bpage,
-						      FALSE);
-			buf_pool->n_pend_unzip--;
-
-			if (!ok) {
-				ib::info() << "Page "
-					   << bpage->id
-					   << " zip_decompress failure.";
-
-				err = DB_PAGE_CORRUPTED;
-				goto database_corrupted;
-			}
-		}
-
-		/* If this page is not uninitialized and not in the
-		doublewrite buffer, then the page number and space id
-		should be the same as in block. */
-		read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
-		read_space_id = mach_read_from_4(
-			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-
-		if (bpage->id.space() == TRX_SYS_SPACE
-		    && buf_dblwr_page_inside(bpage->id.page_no())) {
-
-			ib::error() << "Reading page " << bpage->id
-				<< ", which is in the doublewrite buffer!";
-
-		} else if (read_space_id == 0 && read_page_no == 0) {
-			/* This is likely an uninitialized page. */
-		} else if (((!space->full_crc32()
-			     || bpage->id.space() != TRX_SYS_SPACE)
-			    && bpage->id.space() != read_space_id)
-			   || bpage->id.page_no() != read_page_no) {
-			/* We do not compare space_id to read_space_id
-			in the system tablespace unless space->full_crc32(),
-			because the field was written as garbage before
-			MySQL 4.1.1, which introduced support for
-			innodb_file_per_table. */
-
-			if (space->full_crc32()
-			    && *reinterpret_cast<uint32_t*>
-			    (&frame[FIL_PAGE_FCRC32_KEY_VERSION])
-			    && space->crypt_data
-			    && space->crypt_data->type
-			    != CRYPT_SCHEME_UNENCRYPTED) {
-				ib::error() << "Cannot decrypt " << bpage->id;
-				err = DB_DECRYPTION_FAILED;
-				goto release_page;
-			}
+  const page_id_t id(bpage->id());
+  ut_ad(bpage->in_file());
+  ut_ad(!buf_dblwr.is_inside(id));
+  ut_ad(id.space() == node.space->id);
+  ut_ad(bpage->zip_size() == node.space->zip_size());
+
+  /* We do not need to protect io_fix by a mutex in order to read it here,
+  because this and buf_page_write_complete() are the only functions that
+  can change the value from BUF_IO_READ or BUF_IO_WRITE to some other
+  value, and our code ensures that this is the only thread that handles
+  the i/o for this block. */
+
+  ut_ad(bpage->io_fix() == BUF_IO_READ);
+  ut_ad(!!bpage->zip.ssize == !!bpage->zip.data);
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data);
+
+  const byte *frame= bpage->zip.data
+    ? bpage->zip.data
+    : reinterpret_cast<buf_block_t*>(bpage)->frame;
+  ut_ad(frame);
+
+  dberr_t err;
+  if (!buf_page_decrypt_after_read(bpage, node))
+  {
+    err= DB_DECRYPTION_FAILED;
+    goto database_corrupted;
+  }
 
-			ib::error() << "Space id and page no stored in "
-				"the page, read in are "
-				<< page_id_t(read_space_id, read_page_no)
-				<< ", should be " << bpage->id;
-		}
+  if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+  {
+    buf_pool.n_pend_unzip++;
+    auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+    buf_pool.n_pend_unzip--;
+
+    if (!ok)
+    {
+      ib::info() << "Page " << id << " zip_decompress failure.";
+      err= DB_PAGE_CORRUPTED;
+      goto database_corrupted;
+    }
+  }
 
-		err = buf_page_check_corrupt(bpage, space);
+  {
+    const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID),
+                            mach_read_from_4(frame + FIL_PAGE_OFFSET));
+
+    if (read_id == id);
+    else if (read_id == page_id_t(0, 0))
+      /* This is likely an uninitialized page. */;
+    else if (!node.space->full_crc32() &&
+             page_id_t(0, read_id.page_no()) == id)
+      /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+      before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+    else if (node.space->full_crc32() &&
+             *reinterpret_cast<const uint32_t*>
+             (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+             node.space->crypt_data &&
+             node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+    {
+      ib::error() << "Cannot decrypt " << id;
+      err= DB_DECRYPTION_FAILED;
+      goto release_page;
+    }
+    else
+      ib::error() << "Space id and page no stored in the page, read in are "
+                  << read_id << ", should be " << id;
+  }
 
-		if (err != DB_SUCCESS) {
+  err= buf_page_check_corrupt(bpage, node);
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+  {
 database_corrupted:
-			/* Not a real corruption if it was triggered by
-			error injection */
-			DBUG_EXECUTE_IF(
-				"buf_page_import_corrupt_failure",
-				if (!is_predefined_tablespace(
-					    bpage->id.space())) {
-					buf_corrupt_page_release(bpage, space);
-					ib::info() << "Simulated IMPORT "
-						"corruption";
-					space->release_for_io();
-					return(err);
-				}
-				err = DB_SUCCESS;
-				goto page_not_corrupt;
-			);
-
-			if (uncompressed && bpage->zip.data) {
-				memset(reinterpret_cast<buf_block_t*>(bpage)
-				       ->frame, 0, srv_page_size);
-			}
-
-			if (err == DB_PAGE_CORRUPTED) {
-				ib::error()
-					<< "Database page corruption on disk"
-					" or a failed file read of tablespace "
-					<< space->name << " page " << bpage->id
-					<< ". You may have to recover from "
-					<< "a backup.";
-
-				buf_page_print(frame, bpage->zip_size());
-
-				ib::info()
-					<< "It is also possible that your"
-					" operating system has corrupted"
-					" its own file cache and rebooting"
-					" your computer removes the error."
-					" If the corrupt page is an index page."
-					" You can also try to fix the"
-					" corruption by dumping, dropping,"
-					" and reimporting the corrupt table."
-					" You can use CHECK TABLE to scan"
-					" your table for corruption. "
-					<< FORCE_RECOVERY_MSG;
-			}
-
-			if (!srv_force_recovery) {
-
-				/* If page space id is larger than TRX_SYS_SPACE
-				(0), we will attempt to mark the corresponding
-				table as corrupted instead of crashing server */
-				if (bpage->id.space() == TRX_SYS_SPACE) {
-					ib::fatal() << "Aborting because of"
-						" a corrupt database page.";
-				}
-
-				buf_corrupt_page_release(bpage, space);
-				space->release_for_io();
-				return(err);
-			}
-		}
+    /* Not a real corruption if it was triggered by error injection */
+    DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+                    if (!is_predefined_tablespace(id.space()))
+                    {
+                      buf_corrupt_page_release(bpage, node);
+                      ib::info() << "Simulated IMPORT corruption";
+                      return err;
+                    }
+                    err= DB_SUCCESS;
+                    goto page_not_corrupt;);
+
+    if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+      memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size);
+
+    if (err == DB_PAGE_CORRUPTED)
+    {
+      ib::error() << "Database page corruption on disk"
+                     " or a failed read of file '"
+                  << node.name << "' page " << id
+                  << ". You may have to recover from a backup.";
+
+      buf_page_print(frame, bpage->zip_size());
+
+      ib::info() << " You can use CHECK TABLE to scan"
+                    " your table for corruption. "
+                 << FORCE_RECOVERY_MSG;
+    }
+
+    if (!srv_force_recovery)
+    {
+      /* If the corruption is in the system tablespace, we will
+      intentionally crash the server. */
+      if (id.space() == TRX_SYS_SPACE)
+        ib::fatal() << "Aborting because of a corrupt database page.";
+      buf_corrupt_page_release(bpage, node);
+      return err;
+    }
+  }
 
-		DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
-				page_not_corrupt: bpage = bpage; );
+  DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+                  page_not_corrupt: bpage= bpage; );
 
-		if (err == DB_PAGE_CORRUPTED
-		    || err == DB_DECRYPTION_FAILED) {
+  if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+  {
 release_page:
-			const page_id_t corrupt_page_id = bpage->id;
-
-			buf_corrupt_page_release(bpage, space);
-
-			if (recv_recovery_is_on()) {
-				recv_recover_corrupt_page(corrupt_page_id);
-			}
-
-			space->release_for_io();
-			return err;
-		}
-
-		if (recv_recovery_is_on()) {
-			recv_recover_page(bpage);
-		}
-
-		if (uncompressed
-		    && !recv_no_ibuf_operations
-		    && (bpage->id.space() == 0
-			|| !is_predefined_tablespace(bpage->id.space()))
-		    && fil_page_get_type(frame) == FIL_PAGE_INDEX
-		    && page_is_leaf(frame)) {
-			ibuf_merge_or_delete_for_page(
-				reinterpret_cast<buf_block_t*>(bpage),
-				bpage->id, bpage->zip_size());
-		}
-
-		space->release_for_io();
-	} else {
-		/* io_type == BUF_IO_WRITE */
-		if (bpage->slot) {
-			/* Mark slot free */
-			bpage->slot->release();
-			bpage->slot = NULL;
-		}
-	}
-
-	BPageMutex* block_mutex = buf_page_get_mutex(bpage);
-	buf_pool_mutex_enter(buf_pool);
-	mutex_enter(block_mutex);
-
-	/* Because this thread which does the unlocking is not the same that
-	did the locking, we use a pass value != 0 in unlock, which simply
-	removes the newest lock debug record, without checking the thread
-	id. */
-
-	buf_page_set_io_fix(bpage, BUF_IO_NONE);
-	buf_page_monitor(bpage, io_type);
-
-	if (io_type == BUF_IO_READ) {
-		/* NOTE that the call to ibuf may have moved the ownership of
-		the x-latch to this OS thread: do not let this confuse you in
-		debugging! */
-
-		ut_ad(buf_pool->n_pend_reads > 0);
-		buf_pool->n_pend_reads--;
-		buf_pool->stat.n_pages_read++;
-
-		if (uncompressed) {
-			rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
-					     BUF_IO_READ);
-		}
-
-		mutex_exit(block_mutex);
-	} else {
-		/* Write means a flush operation: call the completion
-		routine in the flush system */
-
-		buf_flush_write_complete(bpage, dblwr);
-
-		if (uncompressed) {
-			rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock,
-					      BUF_IO_WRITE);
-		}
-
-		buf_pool->stat.n_pages_written++;
+    buf_corrupt_page_release(bpage, node);
+    if (recv_recovery_is_on())
+      recv_sys.free_corrupted_page(id);
+    return err;
+  }
 
-		/* We decide whether or not to evict the page from the
-		LRU list based on the flush_type.
-		* BUF_FLUSH_LIST: don't evict
-		* BUF_FLUSH_LRU: always evict
-		* BUF_FLUSH_SINGLE_PAGE: eviction preference is passed
-		by the caller explicitly. */
-		if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) {
-			evict = true;
-		}
+  if (recv_recovery_is_on())
+    recv_recover_page(node.space, bpage);
 
-		mutex_exit(block_mutex);
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations &&
+      (!id.space() || !is_predefined_tablespace(id.space())) &&
+      fil_page_get_type(frame) == FIL_PAGE_INDEX &&
+      page_is_leaf(frame))
+    bpage->ibuf_exist= true;
 
-		if (evict) {
-			buf_LRU_free_page(bpage, true);
-		}
-	}
+  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+    buf_page_monitor(bpage, BUF_IO_READ);
+  DBUG_PRINT("ib_buf", ("read page %u:%u",
+                        id.space(), id.page_no()));
 
-	DBUG_PRINT("ib_buf", ("%s page %u:%u",
-			      io_type == BUF_IO_READ ? "read" : "wrote",
-			      bpage->id.space(), bpage->id.page_no()));
+  /* Because this thread which does the unlocking might not be the same that
+  did the locking, we use a pass value != 0 in unlock, which simply
+  removes the newest lock debug record, without checking the thread id. */
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+    rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ);
+  bpage->io_unfix();
 
-	buf_pool_mutex_exit(buf_pool);
+  ut_d(auto n=) buf_pool.n_pend_reads--;
+  ut_ad(n > 0);
+  buf_pool.stat.n_pages_read++;
 
-	return DB_SUCCESS;
+  return DB_SUCCESS;
 }
 
-/*********************************************************************//**
-Asserts that all file pages in the buffer are in a replaceable state.
-@return TRUE */
-static
-ibool
-buf_all_freed_instance(
-/*===================*/
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instancce */
+#ifdef UNIV_DEBUG
+/** Assert that all blocks are in a replaceable state.
+Aborts via ib::fatal() if any page is still fixed or dirty;
+otherwise returns normally (there is no return value). */
+void buf_pool_t::assert_all_freed()
 {
-	ulint		i;
-	buf_chunk_t*	chunk;
-
-	ut_ad(buf_pool);
-
-	buf_pool_mutex_enter(buf_pool);
-
-	chunk = buf_pool->chunks;
-
-	for (i = buf_pool->n_chunks; i--; chunk++) {
-
-		if (const buf_block_t* block = buf_chunk_not_freed(chunk)) {
-			ib::fatal() << "Page " << block->page.id
-				<< " still fixed or dirty";
-		}
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-
-	return(TRUE);
+  mysql_mutex_lock(&mutex);
+  const chunk_t *chunk= chunks;
+  for (auto i= n_chunks; i--; chunk++)
+    if (const buf_block_t* block= chunk->not_freed())
+      ib::fatal() << "Page " << block->page.id() << " still fixed or dirty";
+  mysql_mutex_unlock(&mutex);
 }
+#endif /* UNIV_DEBUG */
 
-/** Refreshes the statistics used to print per-second averages.
-@param[in,out]	buf_pool	buffer pool instance */
-static
-void
-buf_refresh_io_stats(
-	buf_pool_t*	buf_pool)
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats()
 {
-	buf_pool->last_printout_time = time(NULL);
-	buf_pool->old_stat = buf_pool->stat;
+	buf_pool.last_printout_time = time(NULL);
+	buf_pool.old_stat = buf_pool.stat;
 }
 
-/*********************************************************************//**
-Invalidates file pages in one buffer pool instance */
-static
-void
-buf_pool_invalidate_instance(
-/*=========================*/
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate()
 {
-	ulint		i;
-
-	buf_pool_mutex_enter(buf_pool);
-
-	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
-
-		/* As this function is called during startup and
-		during redo application phase during recovery, InnoDB
-		is single threaded (apart from IO helper threads) at
-		this stage. No new write batch can be in intialization
-		stage at this point. */
-		ut_ad(buf_pool->init_flush[i] == FALSE);
-
-		/* However, it is possible that a write batch that has
-		been posted earlier is still not complete. For buffer
-		pool invalidation to proceed we must ensure there is NO
-		write activity happening. */
-		if (buf_pool->n_flush[i] > 0) {
-			buf_flush_t	type = static_cast<buf_flush_t>(i);
-
-			buf_pool_mutex_exit(buf_pool);
-			buf_flush_wait_batch_end(buf_pool, type);
-			buf_pool_mutex_enter(buf_pool);
-		}
-	}
-
-	buf_pool_mutex_exit(buf_pool);
+	mysql_mutex_lock(&buf_pool.mutex);
 
-	ut_ad(buf_all_freed_instance(buf_pool));
-
-	buf_pool_mutex_enter(buf_pool);
-
-	while (buf_LRU_scan_and_free_block(buf_pool, true)) {
-	}
+	buf_flush_wait_batch_end(true);
+	buf_flush_wait_batch_end(false);
 
-	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
-	ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
+	/* The waits above are needed because a write batch that was posted
+	earlier may still be incomplete. For buffer pool invalidation to
+	proceed we must ensure there is NO write activity happening. */
 
-	buf_pool->freed_page_clock = 0;
-	buf_pool->LRU_old = NULL;
-	buf_pool->LRU_old_len = 0;
+	ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+	ut_d(buf_pool.assert_all_freed());
+	ut_d(mysql_mutex_lock(&buf_pool.mutex));
 
-	memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
-	buf_refresh_io_stats(buf_pool);
+	while (buf_LRU_scan_and_free_block());
 
-	buf_pool_mutex_exit(buf_pool);
-}
+	ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+	ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
 
-/*********************************************************************//**
-Invalidates the file pages in the buffer pool when an archive recovery is
-completed. All the file pages buffered must be in a replaceable state when
-this function is called: not latched and not modified. */
-void
-buf_pool_invalidate(void)
-/*=====================*/
-{
-	ulint   i;
+	buf_pool.freed_page_clock = 0;
+	buf_pool.LRU_old = NULL;
+	buf_pool.LRU_old_len = 0;
 
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_invalidate_instance(buf_pool_from_array(i));
-	}
+	memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat));
+	buf_refresh_io_stats();
+	mysql_mutex_unlock(&buf_pool.mutex);
 }
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/*********************************************************************//**
-Validates data in one buffer pool instance
-@return TRUE */
-static
-ibool
-buf_pool_validate_instance(
-/*=======================*/
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+#ifdef UNIV_DEBUG
+/** Validate the buffer pool. */
+void buf_pool_t::validate()
 {
-	buf_page_t*	b;
-	buf_chunk_t*	chunk;
-	ulint		i;
-	ulint		n_lru_flush	= 0;
-	ulint		n_page_flush	= 0;
-	ulint		n_list_flush	= 0;
 	ulint		n_lru		= 0;
-	ulint		n_flush		= 0;
+	ulint		n_flushing	= 0;
 	ulint		n_free		= 0;
 	ulint		n_zip		= 0;
 
-	ut_ad(buf_pool);
-
-	buf_pool_mutex_enter(buf_pool);
-	hash_lock_x_all(buf_pool->page_hash);
+	mysql_mutex_lock(&mutex);
 
-	chunk = buf_pool->chunks;
+	chunk_t* chunk = chunks;
 
 	/* Check the uncompressed blocks. */
 
-	for (i = buf_pool->n_chunks; i--; chunk++) {
+	for (auto i = n_chunks; i--; chunk++) {
 
 		ulint		j;
 		buf_block_t*	block = chunk->blocks;
 
 		for (j = chunk->size; j--; block++) {
-
-			buf_page_mutex_enter(block);
-
-			switch (buf_block_get_state(block)) {
-			case BUF_BLOCK_POOL_WATCH:
+			switch (block->page.state()) {
 			case BUF_BLOCK_ZIP_PAGE:
-			case BUF_BLOCK_ZIP_DIRTY:
-				/* These should only occur on
-				zip_clean, zip_free[], or flush_list. */
+				/* This kind of block descriptor should
+				be allocated by malloc() only. */
 				ut_error;
 				break;
 
-			case BUF_BLOCK_FILE_PAGE:
-				ut_a(buf_page_hash_get_low(
-						buf_pool, block->page.id)
-				     == &block->page);
-
-				switch (buf_page_get_io_fix(&block->page)) {
-				case BUF_IO_NONE:
-					break;
-
-				case BUF_IO_WRITE:
-					switch (buf_page_get_flush_type(
-							&block->page)) {
-					case BUF_FLUSH_LRU:
-						n_lru_flush++;
-						goto assert_s_latched;
-					case BUF_FLUSH_SINGLE_PAGE:
-						n_page_flush++;
-assert_s_latched:
-						ut_a(rw_lock_is_locked(
-							     &block->lock,
-								     RW_LOCK_S)
-						     || rw_lock_is_locked(
-								&block->lock,
-								RW_LOCK_SX));
-						break;
-					case BUF_FLUSH_LIST:
-						n_list_flush++;
-						break;
-					default:
-						ut_error;
-					}
-
-					break;
-
-				case BUF_IO_READ:
-
-					ut_a(rw_lock_is_locked(&block->lock,
-							       RW_LOCK_X));
-					break;
-
-				case BUF_IO_PIN:
-					break;
-				}
-
-				n_lru++;
-				break;
-
 			case BUF_BLOCK_NOT_USED:
 				n_free++;
 				break;
 
-			case BUF_BLOCK_READY_FOR_USE:
 			case BUF_BLOCK_MEMORY:
 			case BUF_BLOCK_REMOVE_HASH:
 				/* do nothing */
 				break;
-			}
-
-			buf_page_mutex_exit(block);
-		}
-	}
 
-	mutex_enter(&buf_pool->zip_mutex);
-
-	/* Check clean compressed-only blocks. */
+			case BUF_BLOCK_FILE_PAGE:
+				const page_id_t id = block->page.id();
+				ut_ad(page_hash_get_low(id, id.fold())
+				      == &block->page);
+				n_lru++;
+				break;
 
-	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
-	     b = UT_LIST_GET_NEXT(list, b)) {
-		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
-		switch (buf_page_get_io_fix(b)) {
-		case BUF_IO_NONE:
-		case BUF_IO_PIN:
-			/* All clean blocks should be I/O-unfixed. */
-			break;
-		case BUF_IO_READ:
-			/* In buf_LRU_free_page(), we temporarily set
-			b->io_fix = BUF_IO_READ for a newly allocated
-			control block in order to prevent
-			buf_page_get_gen() from decompressing the block. */
-			break;
-		default:
-			ut_error;
-			break;
+			}
 		}
-
-		/* It is OK to read oldest_modification here because
-		we have acquired buf_pool->zip_mutex above which acts
-		as the 'block->mutex' for these bpages. */
-		ut_a(!b->oldest_modification);
-		ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
-		n_lru++;
-		n_zip++;
 	}
 
 	/* Check dirty blocks. */
 
-	buf_flush_list_mutex_enter(buf_pool);
-	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+	mysql_mutex_lock(&flush_list_mutex);
+	for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
 	     b = UT_LIST_GET_NEXT(list, b)) {
-		ut_ad(b->in_flush_list);
-		ut_a(b->oldest_modification);
-		n_flush++;
+		ut_ad(b->oldest_modification());
+		ut_ad(!fsp_is_system_temporary(b->id().space()));
+		n_flushing++;
 
-		switch (buf_page_get_state(b)) {
-		case BUF_BLOCK_ZIP_DIRTY:
+		switch (b->state()) {
+		case BUF_BLOCK_ZIP_PAGE:
 			n_lru++;
 			n_zip++;
-			switch (buf_page_get_io_fix(b)) {
-			case BUF_IO_NONE:
-			case BUF_IO_READ:
-			case BUF_IO_PIN:
-				break;
-			case BUF_IO_WRITE:
-				switch (buf_page_get_flush_type(b)) {
-				case BUF_FLUSH_LRU:
-					n_lru_flush++;
-					break;
-				case BUF_FLUSH_SINGLE_PAGE:
-					n_page_flush++;
-					break;
-				case BUF_FLUSH_LIST:
-					n_list_flush++;
-					break;
-				default:
-					ut_error;
-				}
-				break;
-			}
 			break;
 		case BUF_BLOCK_FILE_PAGE:
 			/* uncompressed page */
 			break;
-		case BUF_BLOCK_POOL_WATCH:
-		case BUF_BLOCK_ZIP_PAGE:
 		case BUF_BLOCK_NOT_USED:
-		case BUF_BLOCK_READY_FOR_USE:
 		case BUF_BLOCK_MEMORY:
 		case BUF_BLOCK_REMOVE_HASH:
 			ut_error;
 			break;
 		}
-		ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
+		const page_id_t id = b->id();
+		ut_ad(page_hash_get_low(id, id.fold()) == b);
 	}
 
-	ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+	ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
 
-	hash_unlock_x_all(buf_pool->page_hash);
-	buf_flush_list_mutex_exit(buf_pool);
+	mysql_mutex_unlock(&flush_list_mutex);
 
-	mutex_exit(&buf_pool->zip_mutex);
-
-	if (buf_pool->curr_size == buf_pool->old_size
-	    && n_lru + n_free > buf_pool->curr_size + n_zip) {
+	if (curr_size == old_size
+	    && n_lru + n_free > curr_size + n_zip) {
 
 		ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
-			<< ", pool " << buf_pool->curr_size
+			<< ", pool " << curr_size
 			<< " zip " << n_zip << ". Aborting...";
 	}
 
-	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
-	if (buf_pool->curr_size == buf_pool->old_size
-	    && UT_LIST_GET_LEN(buf_pool->free) != n_free) {
+	ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
+
+	if (curr_size == old_size
+	    && UT_LIST_GET_LEN(free) != n_free) {
 
 		ib::fatal() << "Free list len "
-			<< UT_LIST_GET_LEN(buf_pool->free)
+			<< UT_LIST_GET_LEN(free)
 			<< ", free blocks " << n_free << ". Aborting...";
 	}
 
-	ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
-	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
-	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
-
-	buf_pool_mutex_exit(buf_pool);
+	mysql_mutex_unlock(&mutex);
 
-	ut_a(buf_LRU_validate());
-	ut_a(buf_flush_validate(buf_pool));
-
-	return(TRUE);
-}
-
-/*********************************************************************//**
-Validates the buffer buf_pool data structure.
-@return TRUE */
-ibool
-buf_validate(void)
-/*==============*/
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_pool_validate_instance(buf_pool);
-	}
-	return(TRUE);
+	ut_d(buf_LRU_validate());
+	ut_d(buf_flush_validate());
 }
+#endif /* UNIV_DEBUG */
 
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/*********************************************************************//**
-Prints info of the buffer buf_pool data structure for one instance. */
-static
-void
-buf_print_instance(
-/*===============*/
-	buf_pool_t*	buf_pool)
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
 {
 	index_id_t*	index_ids;
 	ulint*		counts;
@@ -6608,32 +4355,44 @@ buf_print_instance(
 	ulint		j;
 	index_id_t	id;
 	ulint		n_found;
-	buf_chunk_t*	chunk;
+	chunk_t*	chunk;
 	dict_index_t*	index;
 
-	ut_ad(buf_pool);
-
-	size = buf_pool->curr_size;
+	size = curr_size;
 
 	index_ids = static_cast<index_id_t*>(
 		ut_malloc_nokey(size * sizeof *index_ids));
 
 	counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
 
-	buf_pool_mutex_enter(buf_pool);
-	buf_flush_list_mutex_enter(buf_pool);
-
-	ib::info() << *buf_pool;
+	mysql_mutex_lock(&mutex);
+	mysql_mutex_lock(&flush_list_mutex);
 
-	buf_flush_list_mutex_exit(buf_pool);
+	ib::info()
+		<< "[buffer pool: size=" << curr_size
+		<< ", database pages=" << UT_LIST_GET_LEN(LRU)
+		<< ", free pages=" << UT_LIST_GET_LEN(free)
+		<< ", modified database pages="
+		<< UT_LIST_GET_LEN(flush_list)
+		<< ", n pending decompressions=" << n_pend_unzip
+		<< ", n pending reads=" << n_pend_reads
+		<< ", n pending flush LRU=" << n_flush_LRU_
+		<< " list=" << n_flush_list_
+		<< ", pages made young=" << stat.n_pages_made_young
+		<< ", not young=" << stat.n_pages_not_made_young
+		<< ", pages read=" << stat.n_pages_read
+		<< ", created=" << stat.n_pages_created
+		<< ", written=" << stat.n_pages_written << "]";
+
+	mysql_mutex_unlock(&flush_list_mutex);
 
 	/* Count the number of blocks belonging to each index in the buffer */
 
 	n_found = 0;
 
-	chunk = buf_pool->chunks;
+	chunk = chunks;
 
-	for (i = buf_pool->n_chunks; i--; chunk++) {
+	for (i = n_chunks; i--; chunk++) {
 		buf_block_t*	block		= chunk->blocks;
 		ulint		n_blocks	= chunk->size;
 
@@ -6666,7 +4425,7 @@ buf_print_instance(
 		}
 	}
 
-	buf_pool_mutex_exit(buf_pool);
+	mysql_mutex_unlock(&mutex);
 
 	for (i = 0; i < n_found; i++) {
 		index = dict_index_get_if_in_cache(index_ids[i]);
@@ -6686,360 +4445,137 @@ buf_print_instance(
 	ut_free(index_ids);
 	ut_free(counts);
 
-	ut_a(buf_pool_validate_instance(buf_pool));
-}
-
-/*********************************************************************//**
-Prints info of the buffer buf_pool data structure. */
-void
-buf_print(void)
-/*===========*/
-{
-	ulint   i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		buf_print_instance(buf_pool);
-	}
+	validate();
 }
-#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
 
 #ifdef UNIV_DEBUG
-/*********************************************************************//**
-Returns the number of latched pages in the buffer pool.
-@return number of latched pages */
-static
-ulint
-buf_get_latched_pages_number_instance(
-/*==================================*/
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
-{
-	buf_page_t*	b;
-	ulint		i;
-	buf_chunk_t*	chunk;
-	ulint		fixed_pages_number = 0;
-
-	buf_pool_mutex_enter(buf_pool);
-
-	chunk = buf_pool->chunks;
-
-	for (i = buf_pool->n_chunks; i--; chunk++) {
-		buf_block_t*	block;
-		ulint		j;
-
-		block = chunk->blocks;
-
-		for (j = chunk->size; j--; block++) {
-			if (buf_block_get_state(block)
-			    != BUF_BLOCK_FILE_PAGE) {
-
-				continue;
-			}
-
-			buf_page_mutex_enter(block);
-
-			if (block->page.buf_fix_count != 0
-			    || buf_page_get_io_fix(&block->page)
-			    != BUF_IO_NONE) {
-				fixed_pages_number++;
-			}
-
-			buf_page_mutex_exit(block);
-		}
-	}
-
-	mutex_enter(&buf_pool->zip_mutex);
-
-	/* Traverse the lists of clean and dirty compressed-only blocks. */
-
-	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
-	     b = UT_LIST_GET_NEXT(list, b)) {
-		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
-		ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
-
-		if (b->buf_fix_count != 0
-		    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
-			fixed_pages_number++;
-		}
-	}
-
-	buf_flush_list_mutex_enter(buf_pool);
-	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
-	     b = UT_LIST_GET_NEXT(list, b)) {
-		ut_ad(b->in_flush_list);
-
-		switch (buf_page_get_state(b)) {
-		case BUF_BLOCK_ZIP_DIRTY:
-			if (b->buf_fix_count != 0
-			    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
-				fixed_pages_number++;
-			}
-			break;
-		case BUF_BLOCK_FILE_PAGE:
-			/* uncompressed page */
-			break;
-		case BUF_BLOCK_POOL_WATCH:
-		case BUF_BLOCK_ZIP_PAGE:
-		case BUF_BLOCK_NOT_USED:
-		case BUF_BLOCK_READY_FOR_USE:
-		case BUF_BLOCK_MEMORY:
-		case BUF_BLOCK_REMOVE_HASH:
-			ut_error;
-			break;
-		}
-	}
-
-	buf_flush_list_mutex_exit(buf_pool);
-	mutex_exit(&buf_pool->zip_mutex);
-	buf_pool_mutex_exit(buf_pool);
-
-	return(fixed_pages_number);
-}
-
-/*********************************************************************//**
-Returns the number of latched pages in all the buffer pools.
-@return number of latched pages */
-ulint
-buf_get_latched_pages_number(void)
-/*==============================*/
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number()
 {
-	ulint	i;
-	ulint	total_latched_pages = 0;
+  ulint fixed_pages_number= 0;
 
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
+  mysql_mutex_lock(&buf_pool.mutex);
 
-		buf_pool = buf_pool_from_array(i);
+  for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+       b= UT_LIST_GET_NEXT(LRU, b))
+    if (b->in_file() && (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE))
+      fixed_pages_number++;
 
-		total_latched_pages += buf_get_latched_pages_number_instance(
-			buf_pool);
-	}
+  mysql_mutex_unlock(&buf_pool.mutex);
 
-	return(total_latched_pages);
+  return fixed_pages_number;
 }
-
 #endif /* UNIV_DEBUG */
 
-/*********************************************************************//**
-Returns the number of pending buf pool read ios.
-@return number of pending read I/O operations */
-ulint
-buf_get_n_pending_read_ios(void)
-/*============================*/
-{
-	ulint	pend_ios = 0;
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		pend_ios += buf_pool_from_array(i)->n_pend_reads;
-	}
-
-	return(pend_ios);
-}
-
-/*********************************************************************//**
-Returns the ratio in percents of modified pages in the buffer pool /
-database pages in the buffer pool.
-@return modified page percentage ratio */
-double
-buf_get_modified_ratio_pct(void)
-/*============================*/
-{
-	double		ratio;
-	ulint		lru_len = 0;
-	ulint		free_len = 0;
-	ulint		flush_list_len = 0;
-
-	buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
-
-	ratio = static_cast<double>(100 * flush_list_len)
-		/ (1 + lru_len + free_len);
-
-	/* 1 + is there to avoid division by zero */
-
-	return(ratio);
-}
-
-/*******************************************************************//**
-Aggregates a pool stats information with the total buffer pool stats  */
-static
-void
-buf_stats_aggregate_pool_info(
-/*==========================*/
-	buf_pool_info_t*	total_info,	/*!< in/out: the buffer pool
-						info to store aggregated
-						result */
-	const buf_pool_info_t*	pool_info)	/*!< in: individual buffer pool
-						stats info */
-{
-	ut_a(total_info && pool_info);
-
-	/* Nothing to copy if total_info is the same as pool_info */
-	if (total_info == pool_info) {
-		return;
-	}
-
-	total_info->pool_size += pool_info->pool_size;
-	total_info->lru_len += pool_info->lru_len;
-	total_info->old_lru_len += pool_info->old_lru_len;
-	total_info->free_list_len += pool_info->free_list_len;
-	total_info->flush_list_len += pool_info->flush_list_len;
-	total_info->n_pend_unzip += pool_info->n_pend_unzip;
-	total_info->n_pend_reads += pool_info->n_pend_reads;
-	total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
-	total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
-	total_info->n_pages_made_young += pool_info->n_pages_made_young;
-	total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
-	total_info->n_pages_read += pool_info->n_pages_read;
-	total_info->n_pages_created += pool_info->n_pages_created;
-	total_info->n_pages_written += pool_info->n_pages_written;
-	total_info->n_page_gets += pool_info->n_page_gets;
-	total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
-	total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
-	total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
-	total_info->page_made_young_rate += pool_info->page_made_young_rate;
-	total_info->page_not_made_young_rate +=
-		pool_info->page_not_made_young_rate;
-	total_info->pages_read_rate += pool_info->pages_read_rate;
-	total_info->pages_created_rate += pool_info->pages_created_rate;
-	total_info->pages_written_rate += pool_info->pages_written_rate;
-	total_info->n_page_get_delta += pool_info->n_page_get_delta;
-	total_info->page_read_delta += pool_info->page_read_delta;
-	total_info->young_making_delta += pool_info->young_making_delta;
-	total_info->not_young_making_delta += pool_info->not_young_making_delta;
-	total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
-	total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
-	total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
-	total_info->unzip_lru_len += pool_info->unzip_lru_len;
-	total_info->io_sum += pool_info->io_sum;
-	total_info->io_cur += pool_info->io_cur;
-	total_info->unzip_sum += pool_info->unzip_sum;
-	total_info->unzip_cur += pool_info->unzip_cur;
-}
-/*******************************************************************//**
-Collect buffer pool stats information for a buffer pool. Also
-record aggregated stats if there are more than one buffer pool
-in the server */
-void
-buf_stats_get_pool_info(
-/*====================*/
-	buf_pool_t*		buf_pool,	/*!< in: buffer pool */
-	ulint			pool_id,	/*!< in: buffer pool ID */
-	buf_pool_info_t*	all_pool_info)	/*!< in/out: buffer pool info
-						to fill */
+/** Collect buffer pool metadata.
+@param[out]	pool_info	buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
 {
-	buf_pool_info_t*	pool_info;
 	time_t			current_time;
 	double			time_elapsed;
 
-	/* Find appropriate pool_info to store stats for this buffer pool */
-	pool_info = &all_pool_info[pool_id];
-
-	buf_pool_mutex_enter(buf_pool);
-	buf_flush_list_mutex_enter(buf_pool);
-
-	pool_info->pool_unique_id = pool_id;
-
-	pool_info->pool_size = buf_pool->curr_size;
+	mysql_mutex_lock(&buf_pool.mutex);
 
-	pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+	pool_info->pool_size = buf_pool.curr_size;
 
-	pool_info->old_lru_len = buf_pool->LRU_old_len;
+	pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
 
-	pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
+	pool_info->old_lru_len = buf_pool.LRU_old_len;
 
-	pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
+	pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
 
-	pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
 
-	pool_info->n_pend_reads = buf_pool->n_pend_reads;
+	pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 
-	pool_info->n_pending_flush_lru =
-		 (buf_pool->n_flush[BUF_FLUSH_LRU]
-		  + buf_pool->init_flush[BUF_FLUSH_LRU]);
+	pool_info->n_pend_reads = buf_pool.n_pend_reads;
 
-	pool_info->n_pending_flush_list =
-		 (buf_pool->n_flush[BUF_FLUSH_LIST]
-		  + buf_pool->init_flush[BUF_FLUSH_LIST]);
+	pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_;
 
-	pool_info->n_pending_flush_single_page =
-		 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
-		  + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
-
-	buf_flush_list_mutex_exit(buf_pool);
+	pool_info->n_pending_flush_list = buf_pool.n_flush_list_;
 
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time,
-					buf_pool->last_printout_time);
+					buf_pool.last_printout_time);
 
-	pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
+	pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
 
 	pool_info->n_pages_not_made_young =
-		buf_pool->stat.n_pages_not_made_young;
+		buf_pool.stat.n_pages_not_made_young;
 
-	pool_info->n_pages_read = buf_pool->stat.n_pages_read;
+	pool_info->n_pages_read = buf_pool.stat.n_pages_read;
 
-	pool_info->n_pages_created = buf_pool->stat.n_pages_created;
+	pool_info->n_pages_created = buf_pool.stat.n_pages_created;
 
-	pool_info->n_pages_written = buf_pool->stat.n_pages_written;
+	pool_info->n_pages_written = buf_pool.stat.n_pages_written;
 
-	pool_info->n_page_gets = buf_pool->stat.n_page_gets;
+	pool_info->n_page_gets = buf_pool.stat.n_page_gets;
 
-	pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
-	pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
+	pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+	pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
 
-	pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
+	pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
 
 	pool_info->page_made_young_rate =
-		 (buf_pool->stat.n_pages_made_young
-		  - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_pages_made_young
+			    - buf_pool.old_stat.n_pages_made_young)
+	/ time_elapsed;
 
 	pool_info->page_not_made_young_rate =
-		 (buf_pool->stat.n_pages_not_made_young
-		  - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_pages_not_made_young
+			    - buf_pool.old_stat.n_pages_not_made_young)
+	/ time_elapsed;
 
 	pool_info->pages_read_rate =
-		(buf_pool->stat.n_pages_read
-		  - buf_pool->old_stat.n_pages_read) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_pages_read
+			    - buf_pool.old_stat.n_pages_read)
+	/ time_elapsed;
 
 	pool_info->pages_created_rate =
-		(buf_pool->stat.n_pages_created
-		 - buf_pool->old_stat.n_pages_created) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_pages_created
+			    - buf_pool.old_stat.n_pages_created)
+	/ time_elapsed;
 
 	pool_info->pages_written_rate =
-		(buf_pool->stat.n_pages_written
-		 - buf_pool->old_stat.n_pages_written) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_pages_written
+			    - buf_pool.old_stat.n_pages_written)
+	/ time_elapsed;
 
-	pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
-				      - buf_pool->old_stat.n_page_gets;
+	pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+				      - buf_pool.old_stat.n_page_gets;
 
 	if (pool_info->n_page_get_delta) {
-		pool_info->page_read_delta = buf_pool->stat.n_pages_read
-					     - buf_pool->old_stat.n_pages_read;
+		pool_info->page_read_delta = buf_pool.stat.n_pages_read
+					     - buf_pool.old_stat.n_pages_read;
 
 		pool_info->young_making_delta =
-			buf_pool->stat.n_pages_made_young
-			- buf_pool->old_stat.n_pages_made_young;
+			buf_pool.stat.n_pages_made_young
+			- buf_pool.old_stat.n_pages_made_young;
 
 		pool_info->not_young_making_delta =
-			buf_pool->stat.n_pages_not_made_young
-			- buf_pool->old_stat.n_pages_not_made_young;
+			buf_pool.stat.n_pages_not_made_young
+			- buf_pool.old_stat.n_pages_not_made_young;
 	}
 	pool_info->pages_readahead_rnd_rate =
-		 (buf_pool->stat.n_ra_pages_read_rnd
-		  - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+			    - buf_pool.old_stat.n_ra_pages_read_rnd)
+	/ time_elapsed;
 
 
 	pool_info->pages_readahead_rate =
-		 (buf_pool->stat.n_ra_pages_read
-		  - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_ra_pages_read
+			    - buf_pool.old_stat.n_ra_pages_read)
+	/ time_elapsed;
 
 	pool_info->pages_evicted_rate =
-		(buf_pool->stat.n_ra_pages_evicted
-		 - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
+	static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+			    - buf_pool.old_stat.n_ra_pages_evicted)
+	/ time_elapsed;
 
-	pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+	pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
 
 	pool_info->io_sum = buf_LRU_stat_sum.io;
 
@@ -7049,8 +4585,8 @@ buf_stats_get_pool_info(
 
 	pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
 
-	buf_refresh_io_stats(buf_pool);
-	buf_pool_mutex_exit(buf_pool);
+	buf_refresh_io_stats();
+	mysql_mutex_unlock(&buf_pool.mutex);
 }
 
 /*********************************************************************//**
@@ -7073,20 +4609,20 @@ buf_print_io_instance(
 		"Percent of dirty pages(LRU & free pages): %.3f\n"
 		"Max dirty pages percent: %.3f\n"
 		"Pending reads " ULINTPF "\n"
-		"Pending writes: LRU " ULINTPF ", flush list " ULINTPF
-		", single page " ULINTPF "\n",
+		"Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
 		pool_info->pool_size,
 		pool_info->free_list_len,
 		pool_info->lru_len,
 		pool_info->old_lru_len,
 		pool_info->flush_list_len,
-		(((double) pool_info->flush_list_len) /
-		  (pool_info->lru_len + pool_info->free_list_len + 1.0)) * 100.0,
+		static_cast<double>(pool_info->flush_list_len)
+		/ (static_cast<double>(pool_info->lru_len
+				       + pool_info->free_list_len) + 1.0)
+		* 100.0,
 		srv_max_buf_pool_modified_pct,
 		pool_info->n_pend_reads,
 		pool_info->n_pending_flush_lru,
-		pool_info->n_pending_flush_list,
-		pool_info->n_pending_flush_single_page);
+		pool_info->n_pending_flush_list);
 
 	fprintf(file,
 		"Pages made young " ULINTPF ", not young " ULINTPF "\n"
@@ -7106,8 +4642,9 @@ buf_print_io_instance(
 		pool_info->pages_written_rate);
 
 	if (pool_info->n_page_get_delta) {
-		double hit_rate = double(pool_info->page_read_delta)
-			/ pool_info->n_page_get_delta;
+		double hit_rate = static_cast<double>(
+			pool_info->page_read_delta)
+			/ static_cast<double>(pool_info->n_page_get_delta);
 
 		if (hit_rate > 1) {
 			hit_rate = 1;
@@ -7118,10 +4655,11 @@ buf_print_io_instance(
 			" young-making rate " ULINTPF " / 1000 not "
 			ULINTPF " / 1000\n",
 			ulint(1000 * (1 - hit_rate)),
-			ulint(1000 * double(pool_info->young_making_delta)
-			      / pool_info->n_page_get_delta),
+			ulint(1000
+			      * double(pool_info->young_making_delta)
+			      / double(pool_info->n_page_get_delta)),
 			ulint(1000 * double(pool_info->not_young_making_delta)
-			      / pool_info->n_page_get_delta));
+			      / double(pool_info->n_page_get_delta)));
 	} else {
 		fputs("No buffer pool page gets since the last printout\n",
 		      file);
@@ -7154,95 +4692,10 @@ buf_print_io(
 /*=========*/
 	FILE*	file)	/*!< in/out: buffer where to print */
 {
-	ulint			i;
-	buf_pool_info_t*	pool_info;
-	buf_pool_info_t*	pool_info_total;
-
-	/* If srv_buf_pool_instances is greater than 1, allocate
-	one extra buf_pool_info_t, the last one stores
-	aggregated/total values from all pools */
-	if (srv_buf_pool_instances > 1) {
-		pool_info = (buf_pool_info_t*) ut_zalloc_nokey((
-			srv_buf_pool_instances + 1) * sizeof *pool_info);
-
-		pool_info_total = &pool_info[srv_buf_pool_instances];
-	} else {
-		ut_a(srv_buf_pool_instances == 1);
-
-		pool_info_total = pool_info =
-			static_cast<buf_pool_info_t*>(
-				ut_zalloc_nokey(sizeof *pool_info));
-	}
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		/* Fetch individual buffer pool info and calculate
-		aggregated stats along the way */
-		buf_stats_get_pool_info(buf_pool, i, pool_info);
-
-		/* If we have more than one buffer pool, store
-		the aggregated stats  */
-		if (srv_buf_pool_instances > 1) {
-			buf_stats_aggregate_pool_info(pool_info_total,
-						      &pool_info[i]);
-		}
-	}
-
-	/* Print the aggreate buffer pool info */
-	buf_print_io_instance(pool_info_total, file);
-
-	/* If there are more than one buffer pool, print each individual pool
-	info */
-	if (srv_buf_pool_instances > 1) {
-		fputs("----------------------\n"
-		"INDIVIDUAL BUFFER POOL INFO\n"
-		"----------------------\n", file);
+	buf_pool_info_t	pool_info;
 
-		for (i = 0; i < srv_buf_pool_instances; i++) {
-			fprintf(file, "---BUFFER POOL " ULINTPF "\n", i);
-			buf_print_io_instance(&pool_info[i], file);
-		}
-	}
-
-	ut_free(pool_info);
-}
-
-/**********************************************************************//**
-Refreshes the statistics used to print per-second averages. */
-void
-buf_refresh_io_stats_all(void)
-/*==========================*/
-{
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_refresh_io_stats(buf_pool);
-	}
-}
-
-/**********************************************************************//**
-Check if all pages in all buffer pools are in a replacable state.
-@return FALSE if not */
-ibool
-buf_all_freed(void)
-/*===============*/
-{
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		if (!buf_all_freed_instance(buf_pool)) {
-			return(FALSE);
-		}
-	}
-
-	return(TRUE);
+	buf_stats_get_pool_info(&pool_info);
+	buf_print_io_instance(&pool_info, file);
 }
 
 /** Verify that post encryption checksum match with the calculated checksum.
@@ -7260,292 +4713,14 @@ bool buf_page_verify_crypt_checksum(const byte* page, ulint fsp_flags)
 	return !buf_page_is_corrupted(true, page, fsp_flags);
 }
 
-/*********************************************************************//**
-Checks that there currently are no pending i/o-operations for the buffer
-pool.
-@return number of pending i/o */
-ulint
-buf_pool_check_no_pending_io(void)
-/*==============================*/
-{
-	ulint		i;
-	ulint		pending_io = 0;
-
-	buf_pool_mutex_enter_all();
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		const buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		pending_io += buf_pool->n_pend_reads
-			      + buf_pool->n_flush[BUF_FLUSH_LRU]
-			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
-			      + buf_pool->n_flush[BUF_FLUSH_LIST];
-
-	}
-
-	buf_pool_mutex_exit_all();
-
-	return(pending_io);
-}
-
 /** Print the given page_id_t object.
 @param[in,out]	out	the output stream
 @param[in]	page_id	the page_id_t object to be printed
 @return the output stream */
-std::ostream&
-operator<<(
-	std::ostream&		out,
-	const page_id_t		page_id)
-{
-	out << "[page id: space=" << page_id.m_space
-		<< ", page number=" << page_id.m_page_no << "]";
-	return(out);
-}
-
-#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/** Print the given buf_pool_t object.
-@param[in,out]	out		the output stream
-@param[in]	buf_pool	the buf_pool_t object to be printed
-@return the output stream */
-std::ostream&
-operator<<(
-	std::ostream&		out,
-	const buf_pool_t&	buf_pool)
-{
-	out << "[buffer pool instance: "
-		<< "buf_pool size=" << buf_pool.curr_size
-		<< ", database pages=" << UT_LIST_GET_LEN(buf_pool.LRU)
-		<< ", free pages=" << UT_LIST_GET_LEN(buf_pool.free)
-		<< ", modified database pages="
-		<< UT_LIST_GET_LEN(buf_pool.flush_list)
-		<< ", n pending decompressions=" << buf_pool.n_pend_unzip
-		<< ", n pending reads=" << buf_pool.n_pend_reads
-		<< ", n pending flush LRU=" << buf_pool.n_flush[BUF_FLUSH_LRU]
-		<< " list=" << buf_pool.n_flush[BUF_FLUSH_LIST]
-		<< " single page=" << buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE]
-		<< ", pages made young=" << buf_pool.stat.n_pages_made_young
-		<< ", not young=" << buf_pool.stat.n_pages_not_made_young
-		<< ", pages read=" << buf_pool.stat.n_pages_read
-		<< ", created=" << buf_pool.stat.n_pages_created
-		<< ", written=" << buf_pool.stat.n_pages_written << "]";
-	return(out);
-}
-#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-/** Encrypt a buffer of temporary tablespace
-@param[in]	offset		Page offset
-@param[in]	src_frame	Page to encrypt
-@param[in,out]	dst_frame	Output buffer
-@return encrypted buffer or NULL */
-static byte* buf_tmp_page_encrypt(
-	ulint	offset,
-	byte*	src_frame,
-	byte*	dst_frame)
-{
-	/* Calculate the start offset in a page */
-	uint srclen = srv_page_size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
-				       + FIL_PAGE_FCRC32_CHECKSUM);
-	const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
-	byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
-
-	memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
-
-	if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size),
-				   true)) {
-		return NULL;
-	}
-
-	const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
-	mach_write_to_4(dst_frame + payload, ut_crc32(dst_frame, payload));
-
-	srv_stats.pages_encrypted.inc();
-	srv_stats.n_temp_blocks_encrypted.inc();
-	return dst_frame;
-}
-
-/** Encryption and page_compression hook that is called just before
-a page is written to disk.
-@param[in,out]	space		tablespace
-@param[in,out]	bpage		buffer page
-@param[in]	src_frame	physical page frame that is being encrypted
-@return	page frame to be written to file
-(may be src_frame or an encrypted/compressed copy of it) */
-UNIV_INTERN
-byte*
-buf_page_encrypt(
-	fil_space_t*	space,
-	buf_page_t*	bpage,
-	byte*		src_frame)
-{
-	ut_ad(space->id == bpage->id.space());
-	bpage->real_size = srv_page_size;
-
-	ut_d(fil_page_type_validate(space, src_frame));
-
-	switch (bpage->id.page_no()) {
-	case 0:
-		/* Page 0 of a tablespace is not encrypted/compressed */
-		return src_frame;
-	case TRX_SYS_PAGE_NO:
-		if (bpage->id.space() == TRX_SYS_SPACE) {
-			/* don't encrypt/compress page as it contains
-			address to dblwr buffer */
-			return src_frame;
-		}
-	}
-
-	fil_space_crypt_t* crypt_data = space->crypt_data;
-
-	bool encrypted, page_compressed;
-
-	if (space->purpose == FIL_TYPE_TEMPORARY) {
-		ut_ad(!crypt_data);
-		encrypted = innodb_encrypt_temporary_tables;
-		page_compressed = false;
-	} else {
-		encrypted = crypt_data
-			&& !crypt_data->not_encrypted()
-			&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
-			&& (!crypt_data->is_default_encryption()
-			    || srv_encrypt_tables);
-		page_compressed = space->is_compressed();
-	}
-
-	if (!encrypted && !page_compressed) {
-		/* No need to encrypt or page compress the page.
-		Clear key-version & crypt-checksum. */
-		if (space->full_crc32()) {
-			memset(src_frame + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
-		} else {
-			memset(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
-			       0, 8);
-		}
-
-		return src_frame;
-	}
-
-	ut_ad(!bpage->zip_size() || !page_compressed);
-	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
-	/* Find free slot from temporary memory array */
-	buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool);
-	slot->out_buf = NULL;
-	bpage->slot = slot;
-
-	buf_tmp_reserve_crypt_buf(slot);
-	byte *dst_frame = slot->crypt_buf;
-	const bool full_crc32 = space->full_crc32();
-
-	if (full_crc32) {
-		/* Write LSN for the full crc32 checksum before
-		encryption. Because lsn is one of the input for encryption. */
-		mach_write_to_8(src_frame + FIL_PAGE_LSN,
-				bpage->newest_modification);
-		if (!page_compressed) {
-			mach_write_to_4(
-				src_frame + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
-				(ulint) bpage->newest_modification);
-		}
-	}
-
-	if (!page_compressed) {
-not_compressed:
-		byte* tmp;
-		if (space->purpose == FIL_TYPE_TEMPORARY) {
-			/* Encrypt temporary tablespace page content */
-			tmp = buf_tmp_page_encrypt(bpage->id.page_no(),
-						   src_frame, dst_frame);
-		} else {
-			/* Encrypt page content */
-			tmp = fil_space_encrypt(
-					space, bpage->id.page_no(),
-					bpage->newest_modification,
-					src_frame, dst_frame);
-		}
-
-		bpage->real_size = srv_page_size;
-		slot->out_buf = dst_frame = tmp;
-
-		ut_d(fil_page_type_validate(space, tmp));
-	} else {
-		ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
-		/* First we compress the page content */
-		buf_tmp_reserve_compression_buf(slot);
-		byte* tmp = slot->comp_buf;
-		ulint out_len = fil_page_compress(
-			src_frame, tmp, space->flags,
-			fil_space_get_block_size(space, bpage->id.page_no()),
-			encrypted);
-
-		if (!out_len) {
-			goto not_compressed;
-		}
-
-		bpage->real_size = out_len;
-
-		if (full_crc32) {
-			ut_d(bool compressed = false);
-			out_len = buf_page_full_crc32_size(tmp,
-#ifdef UNIV_DEBUG
-							   &compressed,
-#else
-							   NULL,
-#endif
-							   NULL);
-			ut_ad(compressed);
-		}
-
-		/* Workaround for MDEV-15527. */
-		memset(tmp + out_len, 0 , srv_page_size - out_len);
-		ut_d(fil_page_type_validate(space, tmp));
-
-		if (encrypted) {
-			/* And then we encrypt the page content */
-			tmp = fil_space_encrypt(space,
-						bpage->id.page_no(),
-						bpage->newest_modification,
-						tmp,
-						dst_frame);
-		}
-
-		if (full_crc32) {
-			compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4);
-			mach_write_to_4(tmp + out_len - 4,
-					ut_crc32(tmp, out_len - 4));
-			ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
-		}
-
-		slot->out_buf = dst_frame = tmp;
-	}
-
-	ut_d(fil_page_type_validate(space, dst_frame));
-
-	// return dst_frame which will be written
-	return dst_frame;
-}
-
-/**
-Should we punch hole to deallocate unused portion of the page.
-@param[in]	bpage		Page control block
-@return true if punch hole should be used, false if not */
-bool
-buf_page_should_punch_hole(
-	const buf_page_t* bpage)
-{
-	return bpage->real_size != bpage->physical_size();
-}
-
-/**
-Calculate the length of trim (punch_hole) operation.
-@param[in]	bpage		Page control block
-@param[in]	write_length	Write length
-@return length of the trim or zero. */
-ulint
-buf_page_get_trim_length(
-	const buf_page_t*	bpage,
-	ulint			write_length)
+std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
 {
-	return bpage->physical_size() - write_length;
+  out << "[page id: space=" << page_id.space()
+      << ", page number=" << page_id.page_no() << "]";
+  return out;
 }
 #endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
index 2b2a74dd736..e98dc18452e 100644
--- a/storage/innobase/buf/buf0checksum.cc
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -33,11 +33,8 @@ Created Aug 11, 2011 Vasil Dimov
 #include "srv0srv.h"
 #endif /* !UNIV_INNOCHECKSUM */
 
-/** the macro MYSQL_SYSVAR_ENUM() requires "long unsigned int" and if we
-use srv_checksum_algorithm_t here then we get a compiler error:
-ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to
-  'long unsigned int*' in initialization */
-ulong	srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB;
+/** the value of innodb_checksum_algorithm */
+ulong	srv_checksum_algorithm;
 
 /** Calculate the CRC32 checksum of a page. The value is stored to the page
 when it is written to a file and also checked for a match when reading from
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index f6da4d2d8e4..1d582b6cfbf 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -29,6 +29,7 @@ Created 2011/12/19
 #include "buf0checksum.h"
 #include "srv0start.h"
 #include "srv0srv.h"
+#include "sync0sync.h"
 #include "page0zip.h"
 #include "trx0sys.h"
 #include "fil0crypt.h"
@@ -37,1248 +38,726 @@ Created 2011/12/19
 using st_::span;
 
 /** The doublewrite buffer */
-buf_dblwr_t*	buf_dblwr = NULL;
+buf_dblwr_t buf_dblwr;
 
-/** Set to TRUE when the doublewrite buffer is being created */
-ibool	buf_dblwr_being_created = FALSE;
-
-#define TRX_SYS_DOUBLEWRITE_BLOCKS 2
-
-/****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-ibool
-buf_dblwr_page_inside(
-/*==================*/
-	ulint	page_no)	/*!< in: page number */
+/** @return the TRX_SYS page */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
 {
-	if (buf_dblwr == NULL) {
-
-		return(FALSE);
-	}
-
-	if (page_no >= buf_dblwr->block1
-	    && page_no < buf_dblwr->block1
-	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		return(TRUE);
-	}
-
-	if (page_no >= buf_dblwr->block2
-	    && page_no < buf_dblwr->block2
-	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		return(TRUE);
-	}
-
-	return(FALSE);
+  buf_block_t *block= buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+                                   0, RW_X_LATCH, mtr);
+  buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+  return block;
 }
 
-/****************************************************************//**
-Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
-doublewrite buffer within it.
-@return pointer to the doublewrite buffer within the filespace header
-page. */
-UNIV_INLINE
-byte*
-buf_dblwr_get(
-/*==========*/
-	mtr_t*	mtr)	/*!< in/out: MTR to hold the page latch */
+/** Initialize the doublewrite buffer data structure.
+@param header   doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
 {
-	buf_block_t*	block;
-
-	block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
-			     0, RW_X_LATCH, mtr);
-
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
+  ut_ad(!active_slot->first_free);
+  ut_ad(!active_slot->reserved);
+  ut_ad(!batch_running);
+
+  mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+  pthread_cond_init(&cond, nullptr);
+  block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+  block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+
+  const uint32_t buf_size= 2 * block_size();
+  for (int i= 0; i < 2; i++)
+  {
+    slots[i].write_buf= static_cast<byte*>
+      (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+    slots[i].buf_block_arr= static_cast<element*>
+      (ut_zalloc_nokey(buf_size * sizeof(element)));
+  }
+  active_slot= &slots[0];
 }
 
-/********************************************************************//**
-Flush a batch of writes to the datafiles that have already been
-written to the dblwr buffer on disk. */
-void
-buf_dblwr_sync_datafiles()
-/*======================*/
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
 {
-	/* Wake possible simulated aio thread to actually post the
-	writes to the operating system */
-	os_aio_simulated_wake_handler_threads();
-
-	/* Wait that all async writes to tablespaces have been posted to
-	the OS */
-	os_aio_wait_until_no_pending_writes();
-}
-
-/****************************************************************//**
-Creates or initialializes the doublewrite buffer at a database start. */
-static
-void
-buf_dblwr_init(
-/*===========*/
-	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
-				header on trx sys page */
-{
-	ulint	buf_size;
-
-	buf_dblwr = static_cast<buf_dblwr_t*>(
-		ut_zalloc_nokey(sizeof(buf_dblwr_t)));
-
-	/* There are two blocks of same size in the doublewrite
-	buffer. */
-	buf_size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+  if (is_initialised())
+    return true;
 
-	/* There must be atleast one buffer for single page writes
-	and one buffer for batch writes. */
-	ut_a(srv_doublewrite_batch_size > 0
-	     && srv_doublewrite_batch_size < buf_size);
-
-	mutex_create(LATCH_ID_BUF_DBLWR, &buf_dblwr->mutex);
-
-	buf_dblwr->b_event = os_event_create("dblwr_batch_event");
-	buf_dblwr->s_event = os_event_create("dblwr_single_event");
-	buf_dblwr->first_free = 0;
-	buf_dblwr->s_reserved = 0;
-	buf_dblwr->b_reserved = 0;
-
-	buf_dblwr->block1 = mach_read_from_4(
-		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
-	buf_dblwr->block2 = mach_read_from_4(
-		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
-
-	buf_dblwr->in_use = static_cast<bool*>(
-		ut_zalloc_nokey(buf_size * sizeof(bool)));
-
-	buf_dblwr->write_buf_unaligned = static_cast<byte*>(
-		ut_malloc_nokey((1 + buf_size) << srv_page_size_shift));
-
-	buf_dblwr->write_buf = static_cast<byte*>(
-		ut_align(buf_dblwr->write_buf_unaligned,
-			 srv_page_size));
-
-	buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
-		ut_zalloc_nokey(buf_size * sizeof(void*)));
-}
-
-/** Create the doublewrite buffer if the doublewrite buffer header
-is not present in the TRX_SYS page.
-@return	whether the operation succeeded
-@retval	true	if the doublewrite buffer exists or was created
-@retval	false	if the creation failed (too small first data file) */
-bool
-buf_dblwr_create()
-{
-	buf_block_t*	block2;
-	buf_block_t*	new_block;
-	buf_block_t*	trx_sys_block;
-	byte*	doublewrite;
-	byte*	fseg_header;
-	ulint	page_no;
-	ulint	prev_page_no;
-	ulint	i;
-	mtr_t	mtr;
-
-	if (buf_dblwr) {
-		/* Already inited */
-		return(true);
-	}
+  mtr_t mtr;
+  const ulint size= block_size();
 
 start_again:
-	mtr.start();
-	buf_dblwr_being_created = TRUE;
-
-	doublewrite = buf_dblwr_get(&mtr);
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has already been created:
-		just read in some numbers */
-
-		buf_dblwr_init(doublewrite);
-
-		mtr.commit();
-		buf_dblwr_being_created = FALSE;
-		return(true);
-	} else {
-		if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size
-		    < 3 * FSP_EXTENT_SIZE) {
-			goto too_small;
-		}
-	}
-
-	trx_sys_block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
-				     0, RW_X_LATCH, &mtr);
-
-	block2 = fseg_create(fil_system.sys_space,
-			     TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
-			     &mtr, false, trx_sys_block);
-
-	if (block2 == NULL) {
+  mtr.start();
+
+  buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+                       trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* The doublewrite buffer has already been created: just read in
+    some numbers */
+    init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
+    mtr.commit();
+    return true;
+  }
+
+  if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+  {
 too_small:
-		ib::error()
-			<< "Cannot create doublewrite buffer: "
-			"the first file in innodb_data_file_path"
-			" must be at least "
-			<< (3 * (FSP_EXTENT_SIZE
-				 >> (20U - srv_page_size_shift)))
-			<< "M.";
-		mtr.commit();
-		return(false);
-	}
-
-	ib::info() << "Doublewrite buffer not found: creating new";
-
-	/* FIXME: After this point, the doublewrite buffer creation
-	is not atomic. The doublewrite buffer should not exist in
-	the InnoDB system tablespace file in the first place.
-	It could be located in separate optional file(s) in a
-	user-specified location. */
-
-	/* fseg_create acquires a second latch on the page,
-	therefore we must declare it: */
-
-	buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
-
-	fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
-	prev_page_no = 0;
-
-	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-		     + FSP_EXTENT_SIZE / 2; i++) {
-		new_block = fseg_alloc_free_page(
-			fseg_header, prev_page_no + 1, FSP_UP, &mtr);
-		if (new_block == NULL) {
-			ib::error() << "Cannot create doublewrite buffer: "
-				" you must increase your tablespace size."
-				" Cannot continue operation.";
-			/* This may essentially corrupt the doublewrite
-			buffer. However, usually the doublewrite buffer
-			is created at database initialization, and it
-			should not matter (just remove all newly created
-			InnoDB files and restart). */
-			mtr.commit();
-			return(false);
-		}
-
-		/* We read the allocated pages to the buffer pool;
-		when they are written to disk in a flush, the space
-		id and page number fields are also written to the
-		pages. When we at database startup read pages
-		from the doublewrite buffer, we know that if the
-		space id and page number in them are the same as
-		the page position in the tablespace, then the page
-		has not been written to in doublewrite. */
-
-		ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
-		page_no = new_block->page.id.page_no();
-		/* We only do this in the debug build, to ensure that
-		both the check in buf_flush_init_for_writing() and
-		recv_parse_or_apply_log_rec_body() will see a valid
-		page type. The flushes of new_block are actually
-		unnecessary here.  */
-		ut_d(mlog_write_ulint(FIL_PAGE_TYPE + new_block->frame,
-				      FIL_PAGE_TYPE_SYS, MLOG_2BYTES, &mtr));
-
-		if (i == FSP_EXTENT_SIZE / 2) {
-			ut_a(page_no == FSP_EXTENT_SIZE);
-			mlog_write_ulint(doublewrite
-					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-					 page_no, MLOG_4BYTES, &mtr);
-			mlog_write_ulint(doublewrite
-					 + TRX_SYS_DOUBLEWRITE_REPEAT
-					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-					 page_no, MLOG_4BYTES, &mtr);
-
-		} else if (i == FSP_EXTENT_SIZE / 2
-			   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-			ut_a(page_no == 2 * FSP_EXTENT_SIZE);
-			mlog_write_ulint(doublewrite
-					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-					 page_no, MLOG_4BYTES, &mtr);
-			mlog_write_ulint(doublewrite
-					 + TRX_SYS_DOUBLEWRITE_REPEAT
-					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-					 page_no, MLOG_4BYTES, &mtr);
-
-		} else if (i > FSP_EXTENT_SIZE / 2) {
-			ut_a(page_no == prev_page_no + 1);
-		}
-
-		if (((i + 1) & 15) == 0) {
-			/* rw_locks can only be recursively x-locked
-			2048 times. (on 32 bit platforms,
-			(lint) 0 - (X_LOCK_DECR * 2049)
-			is no longer a negative number, and thus
-			lock_word becomes like a shared lock).
-			For 4k page size this loop will
-			lock the fseg header too many times. Since
-			this code is not done while any other threads
-			are active, restart the MTR occasionally. */
-			mtr_commit(&mtr);
-			mtr_start(&mtr);
-			doublewrite = buf_dblwr_get(&mtr);
-			fseg_header = doublewrite
-				      + TRX_SYS_DOUBLEWRITE_FSEG;
-		}
-
-		prev_page_no = page_no;
-	}
-
-	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
-			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-			 MLOG_4BYTES, &mtr);
-	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
-			 + TRX_SYS_DOUBLEWRITE_REPEAT,
-			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-			 MLOG_4BYTES, &mtr);
-
-	mlog_write_ulint(doublewrite
-			 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
-			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
-			 MLOG_4BYTES, &mtr);
-	mtr_commit(&mtr);
-
-	/* Flush the modified pages to disk and make a checkpoint */
-	log_make_checkpoint();
-	buf_dblwr_being_created = FALSE;
-
-	/* Remove doublewrite pages from LRU */
-	buf_pool_invalidate();
-
-	ib::info() <<  "Doublewrite buffer created";
-
-	goto start_again;
+    ib::error() << "Cannot create doublewrite buffer: "
+                   "the first file in innodb_data_file_path must be at least "
+                << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+    mtr.commit();
+    return false;
+  }
+  else
+  {
+    buf_block_t *b= fseg_create(fil_system.sys_space,
+                                TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+                                &mtr, false, trx_sys_block);
+    if (!b)
+      goto too_small;
+    ib::info() << "Doublewrite buffer not found: creating new";
+
+    /* FIXME: After this point, the doublewrite buffer creation
+    is not atomic. The doublewrite buffer should not exist in
+    the InnoDB system tablespace file in the first place.
+    It could be located in separate optional file(s) in a
+    user-specified location. */
+
+    /* fseg_create acquires a second latch on the page,
+    therefore we must declare it: */
+    buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK);
+  }
+
+  byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+    trx_sys_block->frame;
+  for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+       i < 2 * size + extent_size / 2; i++)
+  {
+    buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
+                                                 FSP_UP, &mtr);
+    if (!new_block)
+    {
+      ib::error() << "Cannot create doublewrite buffer: "
+                     " you must increase your tablespace size."
+                     " Cannot continue operation.";
+      /* This may essentially corrupt the doublewrite
+      buffer. However, usually the doublewrite buffer
+      is created at database initialization, and it
+      should not matter (just remove all newly created
+      InnoDB files and restart). */
+      mtr.commit();
+      return false;
+    }
+
+    /* We read the allocated pages into the buffer pool; when they are
+    written to disk in a flush, the space id and page number fields
+    are also written to the pages. When we read pages from the
+    doublewrite buffer at database startup, we know that if the space
+    id and page number in a page are the same as its position in the
+    tablespace, then that page has not been written to via the
+    doublewrite buffer. */
+
+    ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+    const page_id_t id= new_block->page.id();
+    /* We only do this in the debug build, to ensure that the check in
+    buf_flush_init_for_writing() will see a valid page type. The
+    flushes of new_block are actually unnecessary here.  */
+    ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame,
+                      FIL_PAGE_TYPE_SYS));
+
+    if (i == size / 2)
+    {
+      ut_a(id.page_no() == size);
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
+                   trx_sys_block->frame, id.page_no());
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+                   TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame,
+                   id.page_no());
+    }
+    else if (i == size / 2 + size)
+    {
+      ut_a(id.page_no() == 2 * size);
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
+                   trx_sys_block->frame, id.page_no());
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+                   TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame,
+                   id.page_no());
+    }
+    else if (i > size / 2)
+      ut_a(id.page_no() == prev_page_no + 1);
+
+    if (((i + 1) & 15) == 0) {
+      /* rw_locks can only be recursively x-locked 2048 times. (On
+      32-bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
+      negative number, and thus lock_word becomes like a shared lock.)
+      For 4k page size this loop will lock the fseg header too many
+      times. Since this code is not executed while any other threads
+      are active, restart the MTR occasionally. */
+      mtr.commit();
+      mtr.start();
+      trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+      fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+        trx_sys_block->frame;
+    }
+
+    prev_page_no= id.page_no();
+  }
+
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+               trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+               TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame,
+               TRX_SYS_DOUBLEWRITE_MAGIC_N);
+
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+               trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+  mtr.commit();
+
+  buf_flush_wait_flushed(mtr.commit_lsn());
+
+  /* Remove doublewrite pages from LRU */
+  buf_pool_invalidate();
+
+  ib::info() << "Doublewrite buffer created";
+  goto start_again;
 }
 
-/**
-At database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function loads the pages from double write buffer into memory.
-@param[in]	file		File handle
-@param[in]	path		Path name of file
+/** Initialize the doublewrite buffer memory structure on recovery.
+If we are upgrading from a version before MySQL 4.1, then this
+function performs the necessary update operations to support
+innodb_file_per_table. If we are in crash recovery, this function
+loads the pages from the doublewrite buffer into memory.
+@param file File handle
+@param path Path name of file
 @return DB_SUCCESS or error code */
-dberr_t
-buf_dblwr_init_or_load_pages(
-	pfs_os_file_t	file,
-	const char*	path)
+dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
 {
-	byte*		buf;
-	byte*		page;
-	ulint		block1;
-	ulint		block2;
-	ulint		space_id;
-	byte*		read_buf;
-	byte*		doublewrite;
-	byte*		unaligned_read_buf;
-	ibool		reset_space_ids = FALSE;
-	recv_dblwr_t&	recv_dblwr = recv_sys.dblwr;
-
-	/* We do the file i/o past the buffer pool */
-
-	unaligned_read_buf = static_cast<byte*>(
-		ut_malloc_nokey(3U << srv_page_size_shift));
-
-	read_buf = static_cast<byte*>(
-		ut_align(unaligned_read_buf, srv_page_size));
-
-	/* Read the trx sys header to check if we are using the doublewrite
-	buffer */
-	dberr_t		err;
-
-	IORequest	read_request(IORequest::READ);
-
-	err = os_file_read(
-		read_request,
-		file, read_buf, TRX_SYS_PAGE_NO << srv_page_size_shift,
-		srv_page_size);
-
-	if (err != DB_SUCCESS) {
-
-		ib::error()
-			<< "Failed to read the system tablespace header page";
-
-		ut_free(unaligned_read_buf);
-
-		return(err);
-	}
-
-	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
-
-	/* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has been created */
-
-		buf_dblwr_init(doublewrite);
-
-		block1 = buf_dblwr->block1;
-		block2 = buf_dblwr->block2;
-
-		buf = buf_dblwr->write_buf;
-	} else {
-		ut_free(unaligned_read_buf);
-		return(DB_SUCCESS);
-	}
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
-	    != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
-
-		/* We are upgrading from a version < 4.1.x to a version where
-		multiple tablespaces are supported. We must reset the space id
-		field in the pages in the doublewrite buffer because starting
-		from this version the space id is stored to
-		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
-
-		reset_space_ids = TRUE;
-
-		ib::info() << "Resetting space id's in the doublewrite buffer";
-	}
-
-	/* Read the pages from the doublewrite buffer to memory */
-	err = os_file_read(
-		read_request,
-		file, buf, block1 << srv_page_size_shift,
-		TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift);
-
-	if (err != DB_SUCCESS) {
-
-		ib::error()
-			<< "Failed to read the first double write buffer "
-			"extent";
-
-		ut_free(unaligned_read_buf);
-
-		return(err);
-	}
-
-	err = os_file_read(
-		read_request,
-		file,
-		buf + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift),
-		block2 << srv_page_size_shift,
-		TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift);
-
-	if (err != DB_SUCCESS) {
-
-		ib::error()
-			<< "Failed to read the second double write buffer "
-			"extent";
-
-		ut_free(unaligned_read_buf);
-
-		return(err);
-	}
-
-	/* Check if any of these pages is half-written in data files, in the
-	intended position */
-
-	page = buf;
-
-	for (ulint i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
-
-		if (reset_space_ids) {
-			ulint source_page_no;
-
-			space_id = 0;
-			mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
-					space_id);
-			/* We do not need to calculate new checksums for the
-			pages because the field .._SPACE_ID does not affect
-			them. Write the page back to where we read it from. */
-
-			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-				source_page_no = block1 + i;
-			} else {
-				source_page_no = block2
-					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
-			}
-
-			IORequest	write_request(IORequest::WRITE);
-
-			err = os_file_write(
-				write_request, path, file, page,
-				source_page_no << srv_page_size_shift,
-				srv_page_size);
-			if (err != DB_SUCCESS) {
-
-				ib::error()
-					<< "Failed to write to the double write"
-					" buffer";
-
-				ut_free(unaligned_read_buf);
-
-				return(err);
-			}
-
-		} else if (memcmp(field_ref_zero, page + FIL_PAGE_LSN, 8)) {
-			/* Each valid page header must contain
-			a nonzero FIL_PAGE_LSN field. */
-			recv_dblwr.add(page);
-		}
-
-		page += srv_page_size;
-	}
-
-	if (reset_space_ids) {
-		os_file_flush(file);
-	}
-
-	ut_free(unaligned_read_buf);
-
-	return(DB_SUCCESS);
+  ut_ad(this == &buf_dblwr);
+  const uint32_t size= block_size();
+
+  /* We do the file I/O directly, bypassing the buffer pool */
+  byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
+                                                    srv_page_size));
+  /* Read the TRX_SYS header to check if we are using the doublewrite buffer */
+  dberr_t err= os_file_read(IORequestRead, file, read_buf,
+                            TRX_SYS_PAGE_NO << srv_page_size_shift,
+                            srv_page_size);
+
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the system tablespace header page";
+func_exit:
+    aligned_free(read_buf);
+    return err;
+  }
+
+  /* TRX_SYS_PAGE_NO is not encrypted; see fil_crypt_rotate_page() */
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
+                       read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* There is no doublewrite buffer initialized in the TRX_SYS page.
+    This should normally not be possible; the doublewrite buffer should
+    be initialized when creating the database. */
+    err= DB_SUCCESS;
+    goto func_exit;
+  }
+
+  init(TRX_SYS_DOUBLEWRITE + read_buf);
+
+  const bool upgrade_to_innodb_file_per_table=
+    mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+                     TRX_SYS_DOUBLEWRITE + read_buf) !=
+    TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
+
+  auto write_buf= active_slot->write_buf;
+  /* Read the pages from the doublewrite buffer to memory */
+  err= os_file_read(IORequestRead, file, write_buf,
+                    block1.page_no() << srv_page_size_shift,
+                    size << srv_page_size_shift);
+
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the first double write buffer extent";
+    goto func_exit;
+  }
+
+  err= os_file_read(IORequestRead, file,
+                    write_buf + (size << srv_page_size_shift),
+                    block2.page_no() << srv_page_size_shift,
+                    size << srv_page_size_shift);
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the second double write buffer extent";
+    goto func_exit;
+  }
+
+  byte *page= write_buf;
+
+  if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
+  {
+    ib::info() << "Resetting space id's in the doublewrite buffer";
+
+    for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+    {
+      memset(page + FIL_PAGE_SPACE_ID, 0, 4);
+      /* For innodb_checksum_algorithm=innodb, we do not need to
+      calculate new checksums for the pages because the field
+      .._SPACE_ID does not affect them. Write the page back to where
+      we read it from. */
+      const ulint source_page_no= i < size
+        ? block1.page_no() + i
+        : block2.page_no() + i - size;
+      err= os_file_write(IORequestWrite, path, file, page,
+                         source_page_no << srv_page_size_shift, srv_page_size);
+      if (err != DB_SUCCESS)
+      {
+        ib::error() << "Failed to upgrade the double write buffer";
+        goto func_exit;
+      }
+    }
+    os_file_flush(file);
+  }
+  else
+    for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+      if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
+        /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+        recv_sys.dblwr.add(page);
+
+  err= DB_SUCCESS;
+  goto func_exit;
 }
 
 /** Process and remove the double write buffer pages for all tablespaces. */
-void
-buf_dblwr_process()
+void buf_dblwr_t::recover()
 {
-	ut_ad(recv_sys.parse_start_lsn);
-
-	ulint		page_no_dblwr	= 0;
-	byte*		read_buf;
-	recv_dblwr_t&	recv_dblwr	= recv_sys.dblwr;
-
-	if (!buf_dblwr) {
-		return;
-	}
-
-	read_buf = static_cast<byte*>(
-		aligned_malloc(3 * srv_page_size, srv_page_size));
-	byte* const buf = read_buf + srv_page_size;
-
-	for (recv_dblwr_t::list::iterator i = recv_dblwr.pages.begin();
-	     i != recv_dblwr.pages.end();
-	     ++i, ++page_no_dblwr) {
-		byte* page = *i;
-		const ulint page_no = page_get_page_no(page);
-
-		if (!page_no) {
-			/* page 0 should have been recovered
-			already via Datafile::restore_from_doublewrite() */
-			continue;
-		}
-
-		const ulint space_id = page_get_space_id(page);
-		const lsn_t lsn = mach_read_from_8(page + FIL_PAGE_LSN);
-
-		if (recv_sys.parse_start_lsn > lsn) {
-			/* Pages written before the checkpoint are
-			not useful for recovery. */
-			continue;
-		}
-
-		const page_id_t page_id(space_id, page_no);
-
-		if (recv_sys.scanned_lsn < lsn) {
-			ib::warn() << "Ignoring a doublewrite copy of page "
-				   << page_id
-				   << " with future log sequence number "
-				   << lsn;
-			continue;
-		}
-
-		fil_space_t* space = fil_space_acquire_for_io(space_id);
-
-		if (!space) {
-			/* Maybe we have dropped the tablespace
-			and this page once belonged to it: do nothing */
-			continue;
-		}
-
-		fil_space_open_if_needed(space);
-
-		if (UNIV_UNLIKELY(page_no >= space->size)) {
-
-			/* Do not report the warning for undo
-			tablespaces, because they can be truncated in place. */
-			if (!srv_is_undo_tablespace(space_id)) {
-				ib::warn() << "A copy of page " << page_no
-					<< " in the doublewrite buffer slot "
-					<< page_no_dblwr
-					<< " is beyond the end of tablespace "
-					<< space->name
-					<< " (" << space->size << " pages)";
-			}
+  ut_ad(recv_sys.parse_start_lsn);
+  if (!is_initialised())
+    return;
+
+  uint32_t page_no_dblwr= 0;
+  byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+                                                    srv_page_size));
+  byte *const buf= read_buf + srv_page_size;
+
+  for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+       i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+  {
+    byte *page= *i;
+    const uint32_t page_no= page_get_page_no(page);
+    if (!page_no) /* recovered via Datafile::restore_from_doublewrite() */
+      continue;
+
+    const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+    if (recv_sys.parse_start_lsn > lsn)
+      /* Pages written before the checkpoint are not useful for recovery. */
+      continue;
+    const ulint space_id= page_get_space_id(page);
+    const page_id_t page_id(space_id, page_no);
+
+    if (recv_sys.scanned_lsn < lsn)
+    {
+      ib::info() << "Ignoring a doublewrite copy of page " << page_id
+                 << " with future log sequence number " << lsn;
+      continue;
+    }
+
+    fil_space_t *space= fil_space_t::get(space_id);
+
+    if (!space)
+      /* The tablespace that this page once belonged to does not exist */
+      continue;
+
+    if (UNIV_UNLIKELY(page_no >= space->get_size()))
+    {
+      /* Do not report the warning for undo tablespaces, because they
+      can be truncated in place. */
+      if (!srv_is_undo_tablespace(space_id))
+        ib::warn() << "A copy of page " << page_no
+                   << " in the doublewrite buffer slot " << page_no_dblwr
+                   << " is beyond the end of tablespace " << space->name
+                   << " (" << space->size << " pages)";
 next_page:
-			space->release_for_io();
-			continue;
-		}
-
-		const ulint physical_size = space->physical_size();
-		const ulint zip_size = space->zip_size();
-		ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
-
-		/* We want to ensure that for partial reads the
-		unread portion of the page is NUL. */
-		memset(read_buf, 0x0, physical_size);
-
-		IORequest	request;
-
-		request.dblwr_recover();
-
-		/* Read in the actual page from the file */
-		dberr_t	err = fil_io(
-			request, true,
-			page_id, zip_size,
-			0, physical_size, read_buf, NULL);
-
-		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-			ib::warn()
-				<< "Double write buffer recovery: "
-				<< page_id << " read failed with "
-				<< "error: " << err;
-		}
-
-		if (buf_is_zeroes(span<const byte>(read_buf, physical_size))) {
-			/* We will check if the copy in the
-			doublewrite buffer is valid. If not, we will
-			ignore this page (there should be redo log
-			records to initialize it). */
-		} else if (recv_dblwr.validate_page(
-				page_id, read_buf, space, buf)) {
-			goto next_page;
-		} else {
-			/* We intentionally skip this message for
-			all-zero pages. */
-			ib::info()
-				<< "Trying to recover page " << page_id
-				<< " from the doublewrite buffer.";
-		}
-
-		page = recv_dblwr.find_page(page_id, space, buf);
-
-		if (!page) {
-			goto next_page;
-		}
-
-		/* Write the good page from the doublewrite buffer to
-		the intended position. */
-
-		IORequest	write_request(IORequest::WRITE);
-
-		fil_io(write_request, true, page_id, zip_size,
-		       0, physical_size, page, nullptr);
-
-		ib::info() << "Recovered page " << page_id
-			<< " from the doublewrite buffer.";
-
-		goto next_page;
-	}
-
-	recv_dblwr.pages.clear();
-
-	fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
-	aligned_free(read_buf);
+      space->release();
+      continue;
+    }
+
+    const ulint physical_size= space->physical_size();
+    ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+    /* We want to ensure that for partial reads the unread portion of
+    the page is NUL. */
+    memset(read_buf, 0x0, physical_size);
+
+    /* Read in the actual page from the file */
+    fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+                            os_offset_t{page_no} * physical_size,
+                            physical_size, read_buf);
+
+    if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+       ib::warn() << "Double write buffer recovery: " << page_id
+                  << " (tablespace '" << space->name
+                  << "') read failed with error: " << fio.err;
+
+    if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+    {
+      /* We will check if the copy in the doublewrite buffer is
+      valid. If not, we will ignore this page (there should be redo
+      log records to initialize it). */
+    }
+    else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+      goto next_page;
+    else
+      /* We intentionally skip this message for all-zero pages. */
+      ib::info() << "Trying to recover page " << page_id
+                 << " from the doublewrite buffer.";
+
+    page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+    if (!page)
+      goto next_page;
+
+    /* Write the good page from the doublewrite buffer to the intended
+    position. */
+    space->reacquire();
+    fio= space->io(IORequestWrite,
+                   os_offset_t{page_id.page_no()} * physical_size,
+                   physical_size, page);
+
+    if (fio.err == DB_SUCCESS)
+      ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+                 << "' from the doublewrite buffer.";
+    goto next_page;
+  }
+
+  recv_sys.dblwr.pages.clear();
+  fil_flush_file_spaces();
+  aligned_free(read_buf);
 }
 
-/****************************************************************//**
-Frees doublewrite buffer. */
-void
-buf_dblwr_free()
+/** Free the doublewrite buffer. */
+void buf_dblwr_t::close()
 {
-	/* Free the double write data structures. */
-	ut_a(buf_dblwr != NULL);
-	ut_ad(buf_dblwr->s_reserved == 0);
-	ut_ad(buf_dblwr->b_reserved == 0);
-
-	os_event_destroy(buf_dblwr->b_event);
-	os_event_destroy(buf_dblwr->s_event);
-	ut_free(buf_dblwr->write_buf_unaligned);
-	buf_dblwr->write_buf_unaligned = NULL;
-
-	ut_free(buf_dblwr->buf_block_arr);
-	buf_dblwr->buf_block_arr = NULL;
-
-	ut_free(buf_dblwr->in_use);
-	buf_dblwr->in_use = NULL;
-
-	mutex_free(&buf_dblwr->mutex);
-	ut_free(buf_dblwr);
-	buf_dblwr = NULL;
+  if (!is_initialised())
+    return;
+
+  /* Free the double write data structures. */
+  ut_ad(!active_slot->reserved);
+  ut_ad(!active_slot->first_free);
+  ut_ad(!batch_running);
+
+  pthread_cond_destroy(&cond);
+  for (int i= 0; i < 2; i++)
+  {
+    aligned_free(slots[i].write_buf);
+    ut_free(slots[i].buf_block_arr);
+  }
+  mysql_mutex_destroy(&mutex);
+
+  memset((void*) this, 0, sizeof *this);
+  active_slot= &slots[0];
 }
 
-/********************************************************************//**
-Updates the doublewrite buffer when an IO request is completed. */
-void
-buf_dblwr_update(
-/*=============*/
-	const buf_page_t*	bpage,	/*!< in: buffer block descriptor */
-	buf_flush_t		flush_type)/*!< in: flush type */
+/** Update the doublewrite buffer on write completion. */
+void buf_dblwr_t::write_completed()
 {
-	ut_ad(srv_use_doublewrite_buf);
-	ut_ad(buf_dblwr);
-	ut_ad(!fsp_is_system_temporary(bpage->id.space()));
-	ut_ad(!srv_read_only_mode);
-
-	switch (flush_type) {
-	case BUF_FLUSH_LIST:
-	case BUF_FLUSH_LRU:
-		mutex_enter(&buf_dblwr->mutex);
-
-		ut_ad(buf_dblwr->batch_running);
-		ut_ad(buf_dblwr->b_reserved > 0);
-		ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
-
-		buf_dblwr->b_reserved--;
-
-		if (buf_dblwr->b_reserved == 0) {
-			mutex_exit(&buf_dblwr->mutex);
-			/* This will finish the batch. Sync data files
-			to the disk. */
-			fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
-			mutex_enter(&buf_dblwr->mutex);
-
-			/* We can now reuse the doublewrite memory buffer: */
-			buf_dblwr->first_free = 0;
-			buf_dblwr->batch_running = false;
-			os_event_set(buf_dblwr->b_event);
-		}
-
-		mutex_exit(&buf_dblwr->mutex);
-		break;
-	case BUF_FLUSH_SINGLE_PAGE:
-		{
-			const ulint size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
-			ulint i;
-			mutex_enter(&buf_dblwr->mutex);
-			for (i = srv_doublewrite_batch_size; i < size; ++i) {
-				if (buf_dblwr->buf_block_arr[i] == bpage) {
-					buf_dblwr->s_reserved--;
-					buf_dblwr->buf_block_arr[i] = NULL;
-					buf_dblwr->in_use[i] = false;
-					break;
-				}
-			}
-
-			/* The block we are looking for must exist as a
-			reserved block. */
-			ut_a(i < size);
-		}
-		os_event_set(buf_dblwr->s_event);
-		mutex_exit(&buf_dblwr->mutex);
-		break;
-	case BUF_FLUSH_N_TYPES:
-		ut_error;
-	}
+  ut_ad(this == &buf_dblwr);
+  ut_ad(srv_use_doublewrite_buf);
+  ut_ad(is_initialised());
+  ut_ad(!srv_read_only_mode);
+
+  mysql_mutex_lock(&mutex);
+
+  ut_ad(batch_running);
+  slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_ad(flush_slot->reserved);
+  ut_ad(flush_slot->reserved <= flush_slot->first_free);
+
+  if (!--flush_slot->reserved)
+  {
+    mysql_mutex_unlock(&mutex);
+    /* This will finish the batch. Sync data files to the disk. */
+    fil_flush_file_spaces();
+    mysql_mutex_lock(&mutex);
+
+    /* We can now reuse the doublewrite memory buffer: */
+    flush_slot->first_free= 0;
+    batch_running= false;
+    pthread_cond_broadcast(&cond);
+  }
+
+  mysql_mutex_unlock(&mutex);
 }
 
 #ifdef UNIV_DEBUG
 /** Check the LSN values on the page.
-@param[in]	page	page to check
-@param[in]	s	tablespace */
+@param[in] page  page to check
+@param[in] s     tablespace */
 static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
 {
-	/* Ignore page compressed or encrypted pages */
-	if (s.is_compressed()
-	    || buf_page_get_key_version(page, s.flags)) {
-		return;
-	}
-
-	const unsigned lsn1 = mach_read_from_4(page + FIL_PAGE_LSN + 4),
-		lsn2 = mach_read_from_4(page + srv_page_size
-					- (s.full_crc32()
-					   ? FIL_PAGE_FCRC32_END_LSN
-					   : FIL_PAGE_END_LSN_OLD_CHKSUM - 4));
-	if (UNIV_UNLIKELY(lsn1 != lsn2)) {
-		ib::error() << "The page to be written to "
-			    << s.chain.start->name <<
-			" seems corrupt!"
-			" The low 4 bytes of LSN fields do not match"
-			" (" << lsn1 << " != " << lsn2 << ")!"
-			" Noticed in the buffer pool.";
-	}
+  /* Ignore page_compressed or encrypted pages */
+  if (s.is_compressed() || buf_page_get_key_version(page, s.flags))
+    return;
+  const byte* lsn_start= FIL_PAGE_LSN + 4 + page;
+  const byte* lsn_end= page + srv_page_size -
+    (s.full_crc32()
+     ? FIL_PAGE_FCRC32_END_LSN
+     : FIL_PAGE_END_LSN_OLD_CHKSUM - 4);
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4));
 }
 
-static void buf_dblwr_check_page_lsn(const buf_page_t& b, const byte* page)
+static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
 {
-	if (fil_space_t* space = fil_space_acquire_for_io(b.id.space())) {
-		buf_dblwr_check_page_lsn(page, *space);
-		space->release_for_io();
-	}
+  if (fil_space_t *space= fil_space_t::get(b.id().space()))
+  {
+    buf_dblwr_check_page_lsn(page, *space);
+    space->release();
+  }
 }
-#endif /* UNIV_DEBUG */
 
-/********************************************************************//**
-Asserts when a corrupt block is find during writing out data to the
-disk. */
-static
-void
-buf_dblwr_assert_on_corrupt_block(
-/*==============================*/
-	const buf_block_t*	block)	/*!< in: block to check */
+/** Validate an index page that is about to be written; crash on corruption. */
+static void buf_dblwr_check_block(const buf_page_t *bpage)
 {
-	buf_page_print(block->frame);
-
-	ib::fatal() << "Apparent corruption of an index page "
-		<< block->page.id
-		<< " to be written to data file. We intentionally crash"
-		" the server to prevent corrupt data from ending up in"
-		" data files.";
-}
-
-/********************************************************************//**
-Check the LSN values on the page with which this block is associated.
-Also validate the page if the option is set. */
-static
-void
-buf_dblwr_check_block(
-/*==================*/
-	const buf_block_t*	block)	/*!< in: block to check */
-{
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
-	switch (fil_page_get_type(block->frame)) {
-	case FIL_PAGE_INDEX:
-	case FIL_PAGE_TYPE_INSTANT:
-	case FIL_PAGE_RTREE:
-		if (page_is_comp(block->frame)) {
-			if (page_simple_validate_new(block->frame)) {
-				return;
-			}
-		} else if (page_simple_validate_old(block->frame)) {
-			return;
-		}
-		/* While it is possible that this is not an index page
-		but just happens to have wrongly set FIL_PAGE_TYPE,
-		such pages should never be modified to without also
-		adjusting the page type during page allocation or
-		buf_flush_init_for_writing() or fil_block_reset_type(). */
-		break;
-	case FIL_PAGE_TYPE_FSP_HDR:
-	case FIL_PAGE_IBUF_BITMAP:
-	case FIL_PAGE_TYPE_UNKNOWN:
-		/* Do not complain again, we already reset this field. */
-	case FIL_PAGE_UNDO_LOG:
-	case FIL_PAGE_INODE:
-	case FIL_PAGE_IBUF_FREE_LIST:
-	case FIL_PAGE_TYPE_SYS:
-	case FIL_PAGE_TYPE_TRX_SYS:
-	case FIL_PAGE_TYPE_XDES:
-	case FIL_PAGE_TYPE_BLOB:
-	case FIL_PAGE_TYPE_ZBLOB:
-	case FIL_PAGE_TYPE_ZBLOB2:
-		/* TODO: validate also non-index pages */
-		return;
-	case FIL_PAGE_TYPE_ALLOCATED:
-		/* empty pages should never be flushed */
-		return;
-	}
-
-	buf_dblwr_assert_on_corrupt_block(block);
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+  const page_t *page= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+
+  switch (fil_page_get_type(page)) {
+  case FIL_PAGE_INDEX:
+  case FIL_PAGE_TYPE_INSTANT:
+  case FIL_PAGE_RTREE:
+    if (page_is_comp(page))
+    {
+      if (page_simple_validate_new(page))
+        return;
+    }
+    else if (page_simple_validate_old(page))
+      return;
+    /* While it is possible that this is not an index page but just
+    happens to have wrongly set FIL_PAGE_TYPE, such pages should never
+    be modified without also adjusting the page type during page
+    allocation or buf_flush_init_for_writing() or
+    fil_block_reset_type(). */
+    buf_page_print(page);
+
+    ib::fatal() << "Apparent corruption of an index page " << bpage->id()
+                << " to be written to data file. We intentionally crash"
+                " the server to prevent corrupt data from ending up in"
+                " data files.";
+  }
 }
+#endif /* UNIV_DEBUG */
 
-/********************************************************************//**
-Writes a page that has already been written to the doublewrite buffer
-to the datafile. It is the job of the caller to sync the datafile. */
-static
-void
-buf_dblwr_write_block_to_datafile(
-/*==============================*/
-	const buf_page_t*	bpage,	/*!< in: page to write */
-	bool			sync)	/*!< in: true if sync IO
-					is requested */
+bool buf_dblwr_t::flush_buffered_writes(const ulint size)
 {
-	ut_a(buf_page_in_file(bpage));
-
-	ulint	type = IORequest::WRITE;
-
-	if (sync) {
-		type |= IORequest::DO_NOT_WAKE;
-	}
-
-	IORequest	request(type, const_cast<buf_page_t*>(bpage));
-
-	/* We request frame here to get correct buffer in case of
-	encryption and/or page compression */
-	void * frame = buf_page_get_frame(bpage);
-
-	if (bpage->zip.data != NULL) {
-		ut_ad(bpage->zip_size());
-
-		fil_io(request, sync, bpage->id, bpage->zip_size(), 0,
-		       bpage->zip_size(),
-		       (void*) frame,
-		       (void*) bpage);
-	} else {
-		ut_ad(!bpage->zip_size());
-
-		/* Our IO API is common for both reads and writes and is
-		therefore geared towards a non-const parameter. */
-
-		buf_block_t*	block = reinterpret_cast<buf_block_t*>(
-			const_cast<buf_page_t*>(bpage));
-
-		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-		ut_d(buf_dblwr_check_page_lsn(block->page, block->frame));
-		fil_io(request,
-		       sync, bpage->id, bpage->zip_size(), 0, bpage->real_size,
-		       frame, block);
-	}
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(size == block_size());
+
+  for (;;)
+  {
+    if (!active_slot->first_free)
+      return false;
+    if (!batch_running)
+      break;
+    my_cond_wait(&cond, &mutex.m_mutex);
+  }
+
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(!flushing_buffered_writes);
+
+  /* Prevent anyone else from starting another batch of flushing. */
+  slot *flush_slot= active_slot;
+  /* Switch the active slot */
+  active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_a(active_slot->first_free == 0);
+  batch_running= true;
+  const ulint old_first_free= flush_slot->first_free;
+  auto write_buf= flush_slot->write_buf;
+  const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
+    old_first_free > size;
+  flushing_buffered_writes= 1 + multi_batch;
+  pages_submitted+= old_first_free;
+  /* Now safe to release the mutex. */
+  mysql_mutex_unlock(&mutex);
+#ifdef UNIV_DEBUG
+  for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
+  {
+    buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
+
+    if (bpage->zip.data)
+      /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
+      continue;
+
+    /* Check that the actual page in the buffer pool is not corrupt
+    and the LSN values are sane. */
+    buf_dblwr_check_block(bpage);
+    ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
+  }
+#endif /* UNIV_DEBUG */
+  const IORequest request(nullptr, fil_system.sys_space->chain.start,
+                          IORequest::DBLWR_BATCH);
+  ut_a(fil_system.sys_space->acquire());
+  if (multi_batch)
+  {
+    fil_system.sys_space->reacquire();
+    os_aio(request, write_buf,
+           os_offset_t{block1.page_no()} << srv_page_size_shift,
+           size << srv_page_size_shift);
+    os_aio(request, write_buf + (size << srv_page_size_shift),
+           os_offset_t{block2.page_no()} << srv_page_size_shift,
+           (old_first_free - size) << srv_page_size_shift);
+  }
+  else
+    os_aio(request, write_buf,
+           os_offset_t{block1.page_no()} << srv_page_size_shift,
+           old_first_free << srv_page_size_shift);
+  return true;
 }
 
-/********************************************************************//**
-Flushes possible buffered writes from the doublewrite memory buffer to disk,
-and also wakes up the aio thread if simulated aio is used. It is very
-important to call this function after a batch of writes has been posted,
-and also when we may have to wait for a page latch! Otherwise a deadlock
-of threads can occur. */
-void
-buf_dblwr_flush_buffered_writes()
+void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
 {
-	byte*		write_buf;
-	ulint		first_free;
-	ulint		len;
-
-	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
-		/* Sync the writes to the disk. */
-		buf_dblwr_sync_datafiles();
-		/* Now we flush the data to disk (for example, with fsync) */
-		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
-		return;
-	}
-
-	ut_ad(!srv_read_only_mode);
-
-try_again:
-	mutex_enter(&buf_dblwr->mutex);
-
-	/* Write first to doublewrite buffer blocks. We use synchronous
-	aio and thus know that file write has been completed when the
-	control returns. */
-
-	if (buf_dblwr->first_free == 0) {
-
-		mutex_exit(&buf_dblwr->mutex);
-
-		/* Wake possible simulated aio thread as there could be
-		system temporary tablespace pages active for flushing.
-		Note: system temporary tablespace pages are not scheduled
-		for doublewrite. */
-		os_aio_simulated_wake_handler_threads();
-
-		return;
-	}
-
-	if (buf_dblwr->batch_running) {
-		/* Another thread is running the batch right now. Wait
-		for it to finish. */
-		int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
-		mutex_exit(&buf_dblwr->mutex);
-
-		os_aio_simulated_wake_handler_threads();
-		os_event_wait_low(buf_dblwr->b_event, sig_count);
-		goto try_again;
-	}
-
-	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
-
-	/* Disallow anyone else to post to doublewrite buffer or to
-	start another batch of flushing. */
-	buf_dblwr->batch_running = true;
-	first_free = buf_dblwr->first_free;
-
-	/* Now safe to release the mutex. Note that though no other
-	thread is allowed to post to the doublewrite batch flushing
-	but any threads working on single page flushes are allowed
-	to proceed. */
-	mutex_exit(&buf_dblwr->mutex);
-
-	write_buf = buf_dblwr->write_buf;
-
-	for (ulint len2 = 0, i = 0;
-	     i < buf_dblwr->first_free;
-	     len2 += srv_page_size, i++) {
-
-		const buf_block_t*	block;
-
-		block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
-
-		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
-		    || block->page.zip.data) {
-			/* No simple validate for compressed
-			pages exists. */
-			continue;
-		}
-
-		/* Check that the actual page in the buffer pool is
-		not corrupt and the LSN values are sane. */
-		buf_dblwr_check_block(block);
-		ut_d(buf_dblwr_check_page_lsn(block->page, write_buf + len2));
-	}
-
-	/* Write out the first block of the doublewrite buffer */
-	len = std::min<ulint>(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
-			      buf_dblwr->first_free) << srv_page_size_shift;
-
-	fil_io(IORequestWrite, true,
-	       page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), 0,
-	       0, len, (void*) write_buf, NULL);
-
-	if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		/* No unwritten pages in the second block. */
-		goto flush;
-	}
-
-	/* Write out the second block of the doublewrite buffer. */
-	len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-	       << srv_page_size_shift;
-
-	write_buf = buf_dblwr->write_buf
-		+ (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift);
-
-	fil_io(IORequestWrite, true,
-	       page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), 0,
-	       0, len, (void*) write_buf, NULL);
-
-flush:
-	/* increment the doublewrite flushed pages counter */
-	srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
-	srv_stats.dblwr_writes.inc();
-
-	/* Now flush the doublewrite buffer data to disk */
-	fil_flush(TRX_SYS_SPACE);
-
-	/* We know that the writes have been flushed to disk now
-	and in recovery we will find them in the doublewrite buffer
-	blocks. Next do the writes to the intended positions. */
-
-	/* Up to this point first_free and buf_dblwr->first_free are
-	same because we have set the buf_dblwr->batch_running flag
-	disallowing any other thread to post any request but we
-	can't safely access buf_dblwr->first_free in the loop below.
-	This is so because it is possible that after we are done with
-	the last iteration and before we terminate the loop, the batch
-	gets finished in the IO helper thread and another thread posts
-	a new batch setting buf_dblwr->first_free to a higher value.
-	If this happens and we are using buf_dblwr->first_free in the
-	loop termination condition then we'll end up dispatching
-	the same block twice from two different threads. */
-	ut_ad(first_free == buf_dblwr->first_free);
-	for (ulint i = 0; i < first_free; i++) {
-		buf_dblwr_write_block_to_datafile(
-			buf_dblwr->buf_block_arr[i], false);
-	}
-
-	/* Wake possible simulated aio thread to actually post the
-	writes to the operating system. We don't flush the files
-	at this point. We leave it to the IO helper thread to flush
-	datafiles when the whole batch has been processed. */
-	os_aio_simulated_wake_handler_threads();
+  ut_ad(this == &buf_dblwr);
+  ut_ad(srv_use_doublewrite_buf);
+  ut_ad(is_initialised());
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!request.bpage);
+  ut_ad(request.node == fil_system.sys_space->chain.start);
+  ut_ad(request.type == IORequest::DBLWR_BATCH);
+  mysql_mutex_lock(&mutex);
+  ut_ad(batch_running);
+  ut_ad(flushing_buffered_writes);
+  ut_ad(flushing_buffered_writes <= 2);
+  writes_completed++;
+  if (UNIV_UNLIKELY(--flushing_buffered_writes))
+  {
+    mysql_mutex_unlock(&mutex);
+    return;
+  }
+
+  slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_ad(flush_slot->reserved == flush_slot->first_free);
+  /* increment the doublewrite flushed pages counter */
+  pages_written+= flush_slot->first_free;
+  mysql_mutex_unlock(&mutex);
+
+  /* Now flush the doublewrite buffer data to disk */
+  fil_system.sys_space->flush<false>();
+
+  /* The writes have been flushed to disk now and in recovery we will
+  find them in the doublewrite buffer blocks. Next, write the data pages. */
+  for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+  {
+    auto e= flush_slot->buf_block_arr[i];
+    buf_page_t* bpage= e.request.bpage;
+    ut_ad(bpage->in_file());
+
+    /* We request frame here to get correct buffer in case of
+    encryption and/or page compression */
+    void *frame= buf_page_get_frame(bpage);
+
+    auto e_size= e.size;
+
+    if (UNIV_LIKELY_NULL(bpage->zip.data))
+    {
+      e_size= bpage->zip_size();
+      ut_ad(e_size);
+    }
+    else
+    {
+      ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+      ut_ad(!bpage->zip_size());
+      ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+    }
+
+    const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                      (FIL_PAGE_LSN +
+                                       static_cast<const byte*>(frame)));
+    ut_ad(lsn);
+    ut_ad(lsn >= bpage->oldest_modification());
+    log_write_up_to(lsn, true);
+    e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+                              frame, bpage);
+  }
 }
 
-/********************************************************************//**
-Posts a buffer page for writing. If the doublewrite memory buffer is
-full, calls buf_dblwr_flush_buffered_writes and waits for for free
-space to appear. */
-void
-buf_dblwr_add_to_batch(
-/*====================*/
-	buf_page_t*	bpage)	/*!< in: buffer block to write */
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
 {
-	ut_a(buf_page_in_file(bpage));
-
-try_again:
-	mutex_enter(&buf_dblwr->mutex);
-
-	ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
-
-	if (buf_dblwr->batch_running) {
-
-		/* This not nearly as bad as it looks. There is only
-		page_cleaner thread which does background flushing
-		in batches therefore it is unlikely to be a contention
-		point. The only exception is when a user thread is
-		forced to do a flush batch because of a sync
-		checkpoint. */
-		int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
-		mutex_exit(&buf_dblwr->mutex);
-		os_aio_simulated_wake_handler_threads();
-
-		os_event_wait_low(buf_dblwr->b_event, sig_count);
-		goto try_again;
-	}
-
-	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
-		mutex_exit(&(buf_dblwr->mutex));
-
-		buf_dblwr_flush_buffered_writes();
-
-		goto try_again;
-	}
-
-	byte*	p = buf_dblwr->write_buf
-		+ srv_page_size * buf_dblwr->first_free;
-
-	/* We request frame here to get correct buffer in case of
-	encryption and/or page compression */
-	void * frame = buf_page_get_frame(bpage);
-
-	if (auto zip_size = bpage->zip_size()) {
-		MEM_CHECK_DEFINED(bpage->zip.data, zip_size);
-		/* Copy the compressed page and clear the rest. */
-		memcpy(p, frame, zip_size);
-		memset(p + zip_size, 0x0, srv_page_size - zip_size);
-	} else {
-		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
-		MEM_CHECK_DEFINED(frame, srv_page_size);
-		memcpy(p, frame, srv_page_size);
-	}
-
-	buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
-
-	buf_dblwr->first_free++;
-	buf_dblwr->b_reserved++;
-
-	ut_ad(!buf_dblwr->batch_running);
-	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
-	ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
-
-	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
-		mutex_exit(&(buf_dblwr->mutex));
-
-		buf_dblwr_flush_buffered_writes();
-
-		return;
-	}
-
-	mutex_exit(&(buf_dblwr->mutex));
+  if (!is_initialised() || !srv_use_doublewrite_buf)
+  {
+    fil_flush_file_spaces();
+    return;
+  }
+
+  ut_ad(!srv_read_only_mode);
+  const ulint size= block_size();
+
+  mysql_mutex_lock(&mutex);
+  if (!flush_buffered_writes(size))
+    mysql_mutex_unlock(&mutex);
 }
 
-/********************************************************************//**
-Writes a page to the doublewrite buffer on disk, sync it, then write
-the page to the datafile and sync the datafile. This function is used
-for single page flushes. If all the buffers allocated for single page
-flushes in the doublewrite buffer are in use we wait here for one to
-become free. We are guaranteed that a slot will become free because any
-thread that is using a slot must also release the slot before leaving
-this function. */
-void
-buf_dblwr_write_single_page(
-/*========================*/
-	buf_page_t*	bpage,	/*!< in: buffer block to write */
-	bool		sync)	/*!< in: true if sync IO requested */
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request    asynchronous write request
+@param size       payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
 {
-	ulint		n_slots;
-	ulint		size;
-	ulint		offset;
-	ulint		i;
-
-	ut_a(buf_page_in_file(bpage));
-	ut_a(srv_use_doublewrite_buf);
-	ut_a(buf_dblwr != NULL);
-
-	/* total number of slots available for single page flushes
-	starts from srv_doublewrite_batch_size to the end of the
-	buffer. */
-	size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
-	ut_a(size > srv_doublewrite_batch_size);
-	n_slots = size - srv_doublewrite_batch_size;
-
-	if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
-
-		/* Check that the actual page in the buffer pool is
-		not corrupt and the LSN values are sane. */
-		buf_dblwr_check_block((buf_block_t*) bpage);
-
-		/* Check that the page as written to the doublewrite
-		buffer has sane LSN values. */
-		if (!bpage->zip.data) {
-			ut_d(buf_dblwr_check_page_lsn(
-				     *bpage, ((buf_block_t*) bpage)->frame));
-		}
-	}
-
-retry:
-	mutex_enter(&buf_dblwr->mutex);
-	if (buf_dblwr->s_reserved == n_slots) {
-
-		/* All slots are reserved. */
-		int64_t	sig_count = os_event_reset(buf_dblwr->s_event);
-		mutex_exit(&buf_dblwr->mutex);
-		os_event_wait_low(buf_dblwr->s_event, sig_count);
-
-		goto retry;
-	}
-
-	for (i = srv_doublewrite_batch_size; i < size; ++i) {
-
-		if (!buf_dblwr->in_use[i]) {
-			break;
-		}
-	}
-
-	/* We are guaranteed to find a slot. */
-	ut_a(i < size);
-	buf_dblwr->in_use[i] = true;
-	buf_dblwr->s_reserved++;
-	buf_dblwr->buf_block_arr[i] = bpage;
-
-	/* increment the doublewrite flushed pages counter */
-	srv_stats.dblwr_pages_written.inc();
-	srv_stats.dblwr_writes.inc();
-
-	mutex_exit(&buf_dblwr->mutex);
-
-	/* Lets see if we are going to write in the first or second
-	block of the doublewrite buffer. */
-	if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		offset = buf_dblwr->block1 + i;
-	} else {
-		offset = buf_dblwr->block2 + i
-			 - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
-	}
-
-	/* We deal with compressed and uncompressed pages a little
-	differently here. In case of uncompressed pages we can
-	directly write the block to the allocated slot in the
-	doublewrite buffer in the system tablespace and then after
-	syncing the system table space we can proceed to write the page
-	in the datafile.
-	In case of compressed page we first do a memcpy of the block
-	to the in-memory buffer of doublewrite before proceeding to
-	write it. This is so because we want to pad the remaining
-	bytes in the doublewrite page with zeros. */
-
-	/* We request frame here to get correct buffer in case of
-	encryption and/or page compression */
-	void * frame = buf_page_get_frame(bpage);
-
-	if (auto zip_size = bpage->zip_size()) {
-		memcpy(buf_dblwr->write_buf + srv_page_size * i,
-		       frame, zip_size);
-
-		memset(buf_dblwr->write_buf + srv_page_size * i
-		       + zip_size, 0x0,
-		       srv_page_size - zip_size);
-
-		fil_io(IORequestWrite,
-		       true,
-		       page_id_t(TRX_SYS_SPACE, offset),
-		       0,
-		       0,
-		       srv_page_size,
-		       (void *)(buf_dblwr->write_buf + srv_page_size * i),
-		       NULL);
-	} else {
-		/* It is a regular page. Write it directly to the
-		doublewrite buffer */
-		fil_io(IORequestWrite,
-		       true,
-		       page_id_t(TRX_SYS_SPACE, offset),
-		       0,
-		       0,
-		       srv_page_size,
-		       (void*) frame,
-		       NULL);
-	}
-
-	/* Now flush the doublewrite buffer data to disk */
-	fil_flush(TRX_SYS_SPACE);
-
-	/* We know that the write has been flushed to disk now
-	and during recovery we will find it in the doublewrite buffer
-	blocks. Next do the write to the intended position. */
-	buf_dblwr_write_block_to_datafile(bpage, sync);
+  ut_ad(request.is_async());
+  ut_ad(request.is_write());
+  ut_ad(request.bpage);
+  ut_ad(request.bpage->in_file());
+  ut_ad(request.node);
+  ut_ad(request.node->space->id == request.bpage->id().space());
+  ut_ad(request.node->space->referenced());
+  ut_ad(!srv_read_only_mode);
+
+  const ulint buf_size= 2 * block_size();
+
+  mysql_mutex_lock(&mutex);
+
+  for (;;)
+  {
+    ut_ad(active_slot->first_free <= buf_size);
+    if (active_slot->first_free != buf_size)
+      break;
+
+    if (flush_buffered_writes(buf_size / 2))
+      mysql_mutex_lock(&mutex);
+  }
+
+  byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+  /* We request frame here to get correct buffer in case of
+  encryption and/or page compression */
+  void *frame= buf_page_get_frame(request.bpage);
+
+  /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+  and at least srv_page_size (4096-byte) for everything else. */
+  memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size);
+  /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+  memset_aligned<256>(p + size, 0, srv_page_size - size);
+  /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+  are integer multiples of 256, so the above can translate into simple
+  SIMD instructions. Currently, we make no such assumptions about the
+  non-pointer parameters that are passed to the _aligned templates. */
+  ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(active_slot->reserved < buf_size);
+  new (active_slot->buf_block_arr + active_slot->first_free++)
+    element{request, size};
+  active_slot->reserved= active_slot->first_free;
+
+  if (active_slot->first_free != buf_size ||
+      !flush_buffered_writes(buf_size / 2))
+    mysql_mutex_unlock(&mutex);
 }
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
index 234fb8ef5f3..aaf07dd17eb 100644
--- a/storage/innobase/buf/buf0dump.cc
+++ b/storage/innobase/buf/buf0dump.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -46,6 +46,8 @@ Created April 08, 2011 Vasil Dimov
 #include "mysql/service_wsrep.h" /* wsrep_recovery */
 #include <my_service_manager.h>
 
+static void buf_do_load_dump();
+
 enum status_severity {
 	STATUS_INFO,
 	STATUS_ERR
@@ -58,43 +60,20 @@ take after being waked up. */
 static volatile bool	buf_dump_should_start;
 static volatile bool	buf_load_should_start;
 
-static ibool	buf_load_abort_flag = FALSE;
-
-/* Used to temporary store dump info in order to avoid IO while holding
-buffer pool mutex during dump and also to sort the contents of the dump
-before reading the pages from disk during load.
-We store the space id in the high 32 bits and page no in low 32 bits. */
-typedef ib_uint64_t	buf_dump_t;
-
-/* Aux macros to create buf_dump_t and to extract space and page from it */
-#define BUF_DUMP_CREATE(space, page)	ut_ull_create(space, page)
-#define BUF_DUMP_SPACE(a)		((ulint) ((a) >> 32))
-#define BUF_DUMP_PAGE(a)		((ulint) ((a) & 0xFFFFFFFFUL))
+static bool	buf_load_abort_flag;
 
-/*****************************************************************//**
-Wakes up the buffer pool dump/load thread and instructs it to start
-a dump. This function is called by MySQL code via buffer_pool_dump_now()
-and it should return immediately because the whole MySQL is frozen during
-its execution. */
-void
-buf_dump_start()
-/*============*/
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start()
 {
-	buf_dump_should_start = true;
-	os_event_set(srv_buf_dump_event);
+  buf_dump_should_start= true;
+  buf_do_load_dump();
 }
 
-/*****************************************************************//**
-Wakes up the buffer pool dump/load thread and instructs it to start
-a load. This function is called by MySQL code via buffer_pool_load_now()
-and it should return immediately because the whole MySQL is frozen during
-its execution. */
-void
-buf_load_start()
-/*============*/
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start()
 {
-	buf_load_should_start = true;
-	os_event_set(srv_buf_dump_event);
+  buf_load_should_start= true;
+  buf_do_load_dump();
 }
 
 /*****************************************************************//**
@@ -262,7 +241,6 @@ buf_dump(
 	char	tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
 	char	now[32];
 	FILE*	f;
-	ulint	i;
 	int	ret;
 
 	buf_dump_generate_path(full_filename, sizeof(full_filename));
@@ -293,114 +271,102 @@ buf_dump(
 				tmp_filename, strerror(errno));
 		return;
 	}
-	/* else */
+	const buf_page_t*	bpage;
+	page_id_t*		dump;
+	ulint			n_pages;
+	ulint			j;
 
-	/* walk through each buffer pool */
-	for (i = 0; i < srv_buf_pool_instances && !SHOULD_QUIT(); i++) {
-		buf_pool_t*		buf_pool;
-		const buf_page_t*	bpage;
-		buf_dump_t*		dump;
-		ulint			n_pages;
-		ulint			j;
+	mysql_mutex_lock(&buf_pool.mutex);
 
-		buf_pool = buf_pool_from_array(i);
+	n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
 
-		/* obtain buf_pool mutex before allocate, since
-		UT_LIST_GET_LEN(buf_pool->LRU) could change */
-		buf_pool_mutex_enter(buf_pool);
+	/* Nothing to dump when the buffer pool LRU list is empty. */
+	if (n_pages == 0) {
+		mysql_mutex_unlock(&buf_pool.mutex);
+		goto done;
+	}
 
-		n_pages = UT_LIST_GET_LEN(buf_pool->LRU);
+	if (srv_buf_pool_dump_pct != 100) {
+		ulint		t_pages;
+
+		/* limit the number of total pages dumped to X% of the
+		total number of pages */
+		t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
+		if (n_pages > t_pages) {
+			buf_dump_status(STATUS_INFO,
+					"Restricted to " ULINTPF
+					" pages due to "
+					"innodb_buf_pool_dump_pct=%lu",
+					t_pages, srv_buf_pool_dump_pct);
+			n_pages = t_pages;
+		}
 
-		/* skip empty buffer pools */
 		if (n_pages == 0) {
-			buf_pool_mutex_exit(buf_pool);
-			continue;
+			n_pages = 1;
 		}
+	}
 
-		if (srv_buf_pool_dump_pct != 100) {
-			ulint		t_pages;
-
-			ut_ad(srv_buf_pool_dump_pct < 100);
-
-			/* limit the number of total pages dumped to X% of the
-			 * total number of pages */
-			t_pages = buf_pool->curr_size
-					*  srv_buf_pool_dump_pct / 100;
-			if (n_pages > t_pages) {
-				buf_dump_status(STATUS_INFO,
-						"Instance " ULINTPF
-						", restricted to " ULINTPF
-						" pages due to "
-						"innodb_buf_pool_dump_pct=%lu",
-						i, t_pages,
-						srv_buf_pool_dump_pct);
-				n_pages = t_pages;
-			}
+	dump = static_cast<page_id_t*>(ut_malloc_nokey(
+					       n_pages * sizeof(*dump)));
 
-			if (n_pages == 0) {
-				n_pages = 1;
-			}
+	if (dump == NULL) {
+		mysql_mutex_unlock(&buf_pool.mutex);
+		fclose(f);
+		buf_dump_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				(ulint) (n_pages * sizeof(*dump)),
+				strerror(errno));
+		/* leave tmp_filename to exist */
+		return;
+	}
+
+	for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
+	     bpage != NULL && j < n_pages;
+	     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+		ut_a(bpage->in_file());
+		const page_id_t id(bpage->id());
+
+		if (id.space() == SRV_TMP_SPACE_ID) {
+			/* Ignore the innodb_temporary tablespace. */
+			continue;
 		}
 
-		dump = static_cast<buf_dump_t*>(ut_malloc_nokey(
-				n_pages * sizeof(*dump)));
+		if (bpage->status == buf_page_t::FREED) {
+			continue;
+		}
+
+		dump[j++] = id;
+	}
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+
+	ut_a(j <= n_pages);
+	n_pages = j;
 
-		if (dump == NULL) {
-			buf_pool_mutex_exit(buf_pool);
+	for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+		ret = fprintf(f, "%u,%u\n",
+			      dump[j].space(), dump[j].page_no());
+		if (ret < 0) {
+			ut_free(dump);
 			fclose(f);
 			buf_dump_status(STATUS_ERR,
-					"Cannot allocate " ULINTPF " bytes: %s",
-					(ulint) (n_pages * sizeof(*dump)),
-					strerror(errno));
+					"Cannot write to '%s': %s",
+					tmp_filename, strerror(errno));
 			/* leave tmp_filename to exist */
 			return;
 		}
-
-		for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU), j = 0;
-		     bpage != NULL && j < n_pages;
-		     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
-
-			ut_a(buf_page_in_file(bpage));
-			if (bpage->id.space() >= SRV_LOG_SPACE_FIRST_ID) {
-				/* Ignore the innodb_temporary tablespace. */
-				continue;
-			}
-
-			dump[j++] = BUF_DUMP_CREATE(bpage->id.space(),
-						    bpage->id.page_no());
-		}
-
-		buf_pool_mutex_exit(buf_pool);
-
-		ut_a(j <= n_pages);
-		n_pages = j;
-
-		for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
-			ret = fprintf(f, ULINTPF "," ULINTPF "\n",
-				      BUF_DUMP_SPACE(dump[j]),
-				      BUF_DUMP_PAGE(dump[j]));
-			if (ret < 0) {
-				ut_free(dump);
-				fclose(f);
-				buf_dump_status(STATUS_ERR,
-						"Cannot write to '%s': %s",
-						tmp_filename, strerror(errno));
-				/* leave tmp_filename to exist */
-				return;
-			}
-			if (SHUTTING_DOWN() && !(j % 1024)) {
-				service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
-					"Dumping buffer pool "
-					ULINTPF "/%lu, "
-					"page " ULINTPF "/" ULINTPF,
-					i + 1, srv_buf_pool_instances,
-					j + 1, n_pages);
-			}
+		if (SHUTTING_DOWN() && !(j & 1023)) {
+			service_manager_extend_timeout(
+				INNODB_EXTEND_TIMEOUT_INTERVAL,
+				"Dumping buffer pool page "
+				ULINTPF "/" ULINTPF, j + 1, n_pages);
 		}
-
-		ut_free(dump);
 	}
 
+	ut_free(dump);
+
+done:
 	ret = fclose(f);
 	if (ret != 0) {
 		buf_dump_status(STATUS_ERR,
@@ -524,16 +490,15 @@ buf_load()
 	char		full_filename[OS_FILE_MAX_PATH];
 	char		now[32];
 	FILE*		f;
-	buf_dump_t*	dump;
+	page_id_t*	dump;
 	ulint		dump_n;
-	ulint		total_buffer_pools_pages;
 	ulint		i;
-	ulint		space_id;
-	ulint		page_no;
+	uint32_t	space_id;
+	uint32_t	page_no;
 	int		fscanf_ret;
 
 	/* Ignore any leftovers from before */
-	buf_load_abort_flag = FALSE;
+	buf_load_abort_flag = false;
 
 	buf_dump_generate_path(full_filename, sizeof(full_filename));
 
@@ -553,7 +518,7 @@ buf_load()
 	This file is tiny (approx 500KB per 1GB buffer pool), reading it
 	two times is fine. */
 	dump_n = 0;
-	while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+	while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
 	       && !SHUTTING_DOWN()) {
 		dump_n++;
 	}
@@ -576,14 +541,10 @@ buf_load()
 	/* If dump is larger than the buffer pool(s), then we ignore the
 	extra trailing. This could happen if a dump is made, then buffer
 	pool is shrunk and then load is attempted. */
-	total_buffer_pools_pages = buf_pool_get_n_pages()
-		* srv_buf_pool_instances;
-	if (dump_n > total_buffer_pools_pages) {
-		dump_n = total_buffer_pools_pages;
-	}
+	dump_n = std::min(dump_n, buf_pool.get_n_pages());
 
-	if(dump_n != 0) {
-		dump = static_cast<buf_dump_t*>(ut_malloc_nokey(
+	if (dump_n != 0) {
+		dump = static_cast<page_id_t*>(ut_malloc_nokey(
 				dump_n * sizeof(*dump)));
 	} else {
 		fclose(f);
@@ -608,8 +569,7 @@ buf_load()
 	export_vars.innodb_buffer_pool_load_incomplete = 1;
 
 	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
-		fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
-				    &space_id, &page_no);
+		fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
 
 		if (fscanf_ret != 2) {
 			if (feof(f)) {
@@ -631,16 +591,15 @@ buf_load()
 			fclose(f);
 			buf_load_status(STATUS_ERR,
 					"Error parsing '%s': bogus"
-					" space,page " ULINTPF "," ULINTPF
-					" at line " ULINTPF ","
-					" unable to load buffer pool",
+					" space,page %u,%u at line " ULINTPF
+					", unable to load buffer pool",
 					full_filename,
 					space_id, page_no,
 					i);
 			return;
 		}
 
-		dump[i] = BUF_DUMP_CREATE(space_id, page_no);
+		dump[i] = page_id_t(space_id, page_no);
 	}
 
 	/* Set dump_n to the actual number of initialized elements,
@@ -666,82 +625,77 @@ buf_load()
 	ulint		last_check_time = 0;
 	ulint		last_activity_cnt = 0;
 
-	/* Avoid calling the expensive fil_space_acquire_silent() for each
+	/* Avoid calling the expensive fil_space_t::get() for each
 	page within the same tablespace. dump[] is sorted by (space, page),
 	so all pages from a given tablespace are consecutive. */
-	ulint		cur_space_id = BUF_DUMP_SPACE(dump[0]);
-	fil_space_t*	space = fil_space_acquire_silent(cur_space_id);
+	ulint		cur_space_id = dump[0].space();
+	fil_space_t*	space = fil_space_t::get(cur_space_id);
 	ulint		zip_size = space ? space->zip_size() : 0;
 
-	/* JAN: TODO: MySQL 5.7 PSI
-#ifdef HAVE_PSI_STAGE_INTERFACE
-	PSI_stage_progress*	pfs_stage_progress
+	PSI_stage_progress*	pfs_stage_progress __attribute__((unused))
 		= mysql_set_stage(srv_stage_buffer_pool_load.m_key);
-	#endif*/ /* HAVE_PSI_STAGE_INTERFACE */
-	/*
 	mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
 	mysql_stage_set_work_completed(pfs_stage_progress, 0);
-	*/
 
 	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
 
 		/* space_id for this iteration of the loop */
-		const ulint	this_space_id = BUF_DUMP_SPACE(dump[i]);
+		const ulint	this_space_id = dump[i].space();
 
-		if (this_space_id >= SRV_LOG_SPACE_FIRST_ID) {
+		if (this_space_id == SRV_TMP_SPACE_ID) {
 			/* Ignore the innodb_temporary tablespace. */
 			continue;
 		}
 
 		if (this_space_id != cur_space_id) {
-			if (space != NULL) {
+			if (space) {
 				space->release();
 			}
 
 			cur_space_id = this_space_id;
-			space = fil_space_acquire_silent(cur_space_id);
+			space = fil_space_t::get(cur_space_id);
 
-			if (space != NULL) {
-				zip_size = space->zip_size();
+			if (!space) {
+				continue;
 			}
+
+			zip_size = space->zip_size();
 		}
 
 		/* JAN: TODO: As we use background page read below,
 		if tablespace is encrypted we cant use it. */
-		if (space == NULL ||
-		   (space && space->crypt_data &&
-		    space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
-		    space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
+		if (!space || dump[i].page_no() >= space->get_size() ||
+		    (space->crypt_data &&
+		     space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
+		     space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
 			continue;
 		}
 
-		buf_read_page_background(
-			page_id_t(this_space_id, BUF_DUMP_PAGE(dump[i])),
-			zip_size, true);
-
-		if (i % 64 == 63) {
-			os_aio_simulated_wake_handler_threads();
+		if (space->is_stopping()) {
+			space->release();
+			space = nullptr;
+			continue;
 		}
 
+		space->reacquire();
+		buf_read_page_background(space, dump[i], zip_size);
+
 		if (buf_load_abort_flag) {
-			if (space != NULL) {
+			if (space) {
 				space->release();
 			}
-			buf_load_abort_flag = FALSE;
+			buf_load_abort_flag = false;
 			ut_free(dump);
 			buf_load_status(
 				STATUS_INFO,
 				"Buffer pool(s) load aborted on request");
 			/* Premature end, set estimated = completed = i and
 			end the current stage event. */
-			/*
+
 			mysql_stage_set_work_estimated(pfs_stage_progress, i);
-			mysql_stage_set_work_completed(pfs_stage_progress,
-			i);
-			*/
-#ifdef HAVE_PSI_STAGE_INTERFACE
-			/* mysql_end_stage(); */
-#endif /* HAVE_PSI_STAGE_INTERFACE */
+			mysql_stage_set_work_completed(pfs_stage_progress, i);
+
+			mysql_end_stage();
 			return;
 		}
 
@@ -750,17 +704,21 @@ buf_load()
 
 #ifdef UNIV_DEBUG
 		if ((i+1) >= srv_buf_pool_load_pages_abort) {
-			buf_load_abort_flag = 1;
+			buf_load_abort_flag = true;
 		}
 #endif
 	}
 
-	if (space != NULL) {
+	if (space) {
 		space->release();
 	}
 
 	ut_free(dump);
 
+	if (i == dump_n) {
+		os_aio_wait_until_no_pending_reads();
+	}
+
 	ut_sprintf_timestamp(now);
 
 	if (i == dump_n) {
@@ -782,41 +740,25 @@ buf_load()
 	}
 
 	/* Make sure that estimated = completed when we end. */
-	/* mysql_stage_set_work_completed(pfs_stage_progress, dump_n); */
+	mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
 	/* End the stage progress event. */
-#ifdef HAVE_PSI_STAGE_INTERFACE
-	/* mysql_end_stage(); */
-#endif /* HAVE_PSI_STAGE_INTERFACE */
+	mysql_end_stage();
 }
 
-/*****************************************************************//**
-Aborts a currently running buffer pool load. This function is called by
-MySQL code via buffer_pool_load_abort() and it should return immediately
-because the whole MySQL is frozen during its execution. */
-void
-buf_load_abort()
-/*============*/
+/** Abort a currently running buffer pool load. */
+void buf_load_abort()
 {
-	buf_load_abort_flag = TRUE;
+  buf_load_abort_flag= true;
 }
 
 /*****************************************************************//**
-This is the main thread for buffer pool dump/load. It waits for an
-event and when waked up either performs a dump or load and sleeps
-again.
-@return this function does not return, it calls os_thread_exit() */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_dump_thread)(void*)
+This is the main task for buffer pool dump/load. When scheduled, it
+either performs a dump or a load, depending on server state, state of the variables, etc. */
+static void buf_dump_load_func(void *)
 {
-	my_thread_init();
 	ut_ad(!srv_read_only_mode);
-	/* JAN: TODO: MySQL 5.7 PSI
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(buf_dump_thread_key);
-	#endif */ /* UNIV_PFS_THREAD */
-
-	if (srv_buffer_pool_load_at_startup) {
+	static bool first_time = true;
+	if (first_time && srv_buffer_pool_load_at_startup) {
 
 #ifdef WITH_WSREP
 		if (!get_wsrep_recovery()) {
@@ -826,27 +768,24 @@ DECLARE_THREAD(buf_dump_thread)(void*)
 		}
 #endif /* WITH_WSREP */
 	}
+	first_time = false;
 
 	while (!SHUTTING_DOWN()) {
-
-		os_event_wait(srv_buf_dump_event);
-
 		if (buf_dump_should_start) {
 			buf_dump_should_start = false;
-			buf_dump(TRUE /* quit on shutdown */);
+			buf_dump(true);
 		}
-
 		if (buf_load_should_start) {
 			buf_load_should_start = false;
 			buf_load();
 		}
 
-		if (buf_dump_should_start || buf_load_should_start) {
-			continue;
+		if (!buf_dump_should_start && !buf_load_should_start) {
+			return;
 		}
-		os_event_reset(srv_buf_dump_event);
 	}
 
+	/* In shutdown */
 	if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
 		if (export_vars.innodb_buffer_pool_load_incomplete) {
 			buf_dump_status(STATUS_INFO,
@@ -856,16 +795,34 @@ DECLARE_THREAD(buf_dump_thread)(void*)
 		} else if (get_wsrep_recovery()) {
 #endif /* WITH_WSREP */
 		} else {
-			buf_dump(FALSE/* do complete dump at shutdown */);
+			buf_dump(false/* do complete dump at shutdown */);
 		}
 	}
+}
+
 
-	srv_buf_dump_thread_active = false;
+/* Execute the task with a maximum concurrency of 1 */
+static tpool::task_group tpool_group(1);
+static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
+static bool load_dump_enabled;
 
-	my_thread_end();
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit();
+/** Start an async buffer pool load if srv_buffer_pool_load_at_startup is set. */
+void buf_load_at_startup()
+{
+  load_dump_enabled= true;
+  if (srv_buffer_pool_load_at_startup)
+    buf_do_load_dump();
+}
 
-	OS_THREAD_DUMMY_RETURN;
+static void buf_do_load_dump()
+{
+  if (load_dump_enabled && !buf_dump_load_task.is_running())
+    srv_thread_pool->submit_task(&buf_dump_load_task);
+}
+
+/** Wait for any currently running load or dump to finish. */
+void buf_load_dump_end()
+{
+  ut_ad(SHUTTING_DOWN());
+  buf_dump_load_task.wait();
 }
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 864311a0abf..90ead952d63 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
 Copyright (c) 2013, 2014, Fusion-io
 
 This program is free software; you can redistribute it and/or modify it under
@@ -26,167 +26,62 @@ Created 11/11/1995 Heikki Tuuri
 *******************************************************/
 
 #include "univ.i"
+#include <my_service_manager.h>
 #include <mysql/service_thd_wait.h>
 #include <sql_class.h>
 
 #include "buf0flu.h"
 #include "buf0buf.h"
 #include "buf0checksum.h"
+#include "buf0dblwr.h"
 #include "srv0start.h"
-#include "srv0srv.h"
 #include "page0zip.h"
-#include "ut0byte.h"
-#include "page0page.h"
 #include "fil0fil.h"
-#include "buf0lru.h"
-#include "buf0rea.h"
-#include "ibuf0ibuf.h"
-#include "log0log.h"
-#include "os0file.h"
-#include "trx0sys.h"
+#include "log0crypt.h"
 #include "srv0mon.h"
-#include "ut0stage.h"
 #include "fil0pagecompress.h"
-#ifdef UNIV_LINUX
-/* include defs for CPU time priority settings */
-#include <unistd.h>
-#include <sys/syscall.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-static const int buf_flush_page_cleaner_priority = -20;
-#endif /* UNIV_LINUX */
-
-/** Sleep time in microseconds for loop waiting for the oldest
-modification lsn */
-static const ulint buf_flush_wait_flushed_sleep_time = 10000;
+#ifdef HAVE_LZO
+# include "lzo/lzo1x.h"
+#elif defined HAVE_SNAPPY
+# include "snappy-c.h"
+#endif
 
-#include <my_service_manager.h>
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_flush_page_count. */
+ulint buf_lru_flush_page_count;
 
-/** Number of pages flushed through non flush_list flushes. */
-static ulint buf_lru_flush_page_count = 0;
+/** Number of pages flushed. Protected by buf_pool.mutex. */
+ulint buf_flush_page_count;
 
-/** Flag indicating if the page_cleaner is in active state. This flag
-is set to TRUE by the page_cleaner thread when it is spawned and is set
-back to FALSE at shutdown by the page_cleaner as well. Therefore no
-need to protect it by a mutex. It is only ever read by the thread
-doing the shutdown */
+/** Flag indicating if the page_cleaner is in active state. */
 bool buf_page_cleaner_is_active;
 
 /** Factor for scan length to determine n_pages for intended oldest LSN
 progress */
-static ulint buf_flush_lsn_scan_factor = 3;
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
 
 /** Average redo generation rate */
 static lsn_t lsn_avg_rate = 0;
 
-/** Target oldest LSN for the requested flush_sync */
-static lsn_t buf_flush_sync_lsn = 0;
+/** Target oldest_modification for the page cleaner background flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
+/** Target oldest_modification for the page cleaner furious flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
 
 #ifdef UNIV_PFS_THREAD
 mysql_pfs_key_t page_cleaner_thread_key;
 #endif /* UNIV_PFS_THREAD */
 
-/** Event to synchronise with the flushing. */
-os_event_t	buf_flush_event;
-
-/** State for page cleaner array slot */
-enum page_cleaner_state_t {
-	/** Not requested any yet.
-	Moved from FINISHED by the coordinator. */
-	PAGE_CLEANER_STATE_NONE = 0,
-	/** Requested but not started flushing.
-	Moved from NONE by the coordinator. */
-	PAGE_CLEANER_STATE_REQUESTED,
-	/** Flushing is on going.
-	Moved from REQUESTED by the worker. */
-	PAGE_CLEANER_STATE_FLUSHING,
-	/** Flushing was finished.
-	Moved from FLUSHING by the worker. */
-	PAGE_CLEANER_STATE_FINISHED
-};
-
-/** Page cleaner request state for each buffer pool instance */
-struct page_cleaner_slot_t {
-	page_cleaner_state_t	state;	/*!< state of the request.
-					protected by page_cleaner_t::mutex
-					if the worker thread got the slot and
-					set to PAGE_CLEANER_STATE_FLUSHING,
-					n_flushed_lru and n_flushed_list can be
-					updated only by the worker thread */
-	/* This value is set during state==PAGE_CLEANER_STATE_NONE */
-	ulint			n_pages_requested;
-					/*!< number of requested pages
-					for the slot */
-	/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
-	and commited with state==PAGE_CLEANER_STATE_FINISHED.
-	The consistency is protected by the 'state' */
-	ulint			n_flushed_lru;
-					/*!< number of flushed pages
-					by LRU scan flushing */
-	ulint			n_flushed_list;
-					/*!< number of flushed pages
-					by flush_list flushing */
-	bool			succeeded_list;
-					/*!< true if flush_list flushing
-					succeeded. */
-	ulint			flush_lru_time;
-					/*!< elapsed time for LRU flushing */
-	ulint			flush_list_time;
-					/*!< elapsed time for flush_list
-					flushing */
-	ulint			flush_lru_pass;
-					/*!< count to attempt LRU flushing */
-	ulint			flush_list_pass;
-					/*!< count to attempt flush_list
-					flushing */
-};
-
-/** Page cleaner structure common for all threads */
-struct page_cleaner_t {
-	ib_mutex_t		mutex;		/*!< mutex to protect whole of
-						page_cleaner_t struct and
-						page_cleaner_slot_t slots. */
-	os_event_t		is_requested;	/*!< event to activate worker
-						threads. */
-	os_event_t		is_finished;	/*!< event to signal that all
-						slots were finished. */
-	os_event_t		is_started;	/*!< event to signal that
-						thread is started/exiting */
-	volatile ulint		n_workers;	/*!< number of worker threads
-						in existence */
-	bool			requested;	/*!< true if requested pages
-						to flush */
-	lsn_t			lsn_limit;	/*!< upper limit of LSN to be
-						flushed */
-	ulint			n_slots;	/*!< total number of slots */
-	ulint			n_slots_requested;
-						/*!< number of slots
-						in the state
-						PAGE_CLEANER_STATE_REQUESTED */
-	ulint			n_slots_flushing;
-						/*!< number of slots
-						in the state
-						PAGE_CLEANER_STATE_FLUSHING */
-	ulint			n_slots_finished;
-						/*!< number of slots
-						in the state
-						PAGE_CLEANER_STATE_FINISHED */
-	ulint			flush_time;	/*!< elapsed time to flush
-						requests for all slots */
-	ulint			flush_pass;	/*!< count to finish to flush
-						requests for all slots */
-	page_cleaner_slot_t	slots[MAX_BUFFER_POOLS];
-	bool			is_running;	/*!< false if attempt
-						to shutdown */
-
-#ifdef UNIV_DEBUG
-	ulint			n_disabled_debug;
-						/*<! how many of pc threads
-						have been disabled */
-#endif /* UNIV_DEBUG */
-};
-
-static page_cleaner_t	page_cleaner;
+/** Page cleaner structure */
+static struct
+{
+  /** total elapsed time in adaptive flushing, in seconds */
+  ulint flush_time;
+  /** number of adaptive flushing passes */
+  ulint flush_pass;
+} page_cleaner;
 
 #ifdef UNIV_DEBUG
 my_bool innodb_page_cleaner_disabled_debug;
@@ -194,40 +89,12 @@ my_bool innodb_page_cleaner_disabled_debug;
 
 /* @} */
 
-/******************************************************************//**
-Increases flush_list size in bytes with the page size in inline function */
-static inline
-void
-incr_flush_list_size_in_bytes(
-/*==========================*/
-	buf_block_t*	block,		/*!< in: control block */
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
-{
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	buf_pool->stat.flush_list_bytes += block->physical_size();
-
-	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
-}
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-static
-ibool
-buf_flush_validate_low(
-/*===================*/
-	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+static void buf_flush_validate_low();
 
-/******************************************************************//**
-Validates the flush list some of the time.
-@return TRUE if ok or the check was skipped */
-static
-ibool
-buf_flush_validate_skip(
-/*====================*/
-	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
+/** Validates the flush list some of the time. */
+static void buf_flush_validate_skip()
 {
 /** Try buf_flush_validate_low() every this many times */
 # define BUF_FLUSH_VALIDATE_SKIP	23
@@ -241,370 +108,156 @@ buf_flush_validate_skip(
 	reduce the call frequency of the costly buf_flush_validate_low()
 	check in debug builds. */
 	if (--buf_flush_validate_count > 0) {
-		return(TRUE);
+		return;
 	}
 
 	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
-	return(buf_flush_validate_low(buf_pool));
-}
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-/******************************************************************//**
-Insert a block in the flush_rbt and returns a pointer to its
-predecessor or NULL if no predecessor. The ordering is maintained
-on the basis of the <oldest_modification, space, offset> key.
-@return pointer to the predecessor or NULL if no predecessor. */
-static
-buf_page_t*
-buf_flush_insert_in_flush_rbt(
-/*==========================*/
-	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
-{
-	const ib_rbt_node_t*	c_node;
-	const ib_rbt_node_t*	p_node;
-	buf_page_t*		prev = NULL;
-	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	/* Insert this buffer into the rbt. */
-	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
-	ut_a(c_node != NULL);
-
-	/* Get the predecessor. */
-	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
-
-	if (p_node != NULL) {
-		buf_page_t**	value;
-		value = rbt_value(buf_page_t*, p_node);
-		prev = *value;
-		ut_a(prev != NULL);
-	}
-
-	return(prev);
-}
-
-/*********************************************************//**
-Delete a bpage from the flush_rbt. */
-static
-void
-buf_flush_delete_from_flush_rbt(
-/*============================*/
-	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
-{
-#ifdef UNIV_DEBUG
-	ibool		ret = FALSE;
-#endif /* UNIV_DEBUG */
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-#ifdef UNIV_DEBUG
-	ret =
-#endif /* UNIV_DEBUG */
-	rbt_delete(buf_pool->flush_rbt, &bpage);
-
-	ut_ad(ret);
+	buf_flush_validate_low();
 }
-
-/*****************************************************************//**
-Compare two modified blocks in the buffer pool. The key for comparison
-is:
-key = <oldest_modification, space, offset>
-This comparison is used to maintian ordering of blocks in the
-buf_pool->flush_rbt.
-Note that for the purpose of flush_rbt, we only need to order blocks
-on the oldest_modification. The other two fields are used to uniquely
-identify the blocks.
-@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
-static
-int
-buf_flush_block_cmp(
-/*================*/
-	const void*	p1,		/*!< in: block1 */
-	const void*	p2)		/*!< in: block2 */
-{
-	int			ret;
-	const buf_page_t*	b1 = *(const buf_page_t**) p1;
-	const buf_page_t*	b2 = *(const buf_page_t**) p2;
-
-	ut_ad(b1 != NULL);
-	ut_ad(b2 != NULL);
-
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(b1);
 #endif /* UNIV_DEBUG */
 
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	ut_ad(b1->in_flush_list);
-	ut_ad(b2->in_flush_list);
-
-	if (b2->oldest_modification > b1->oldest_modification) {
-		return(1);
-	} else if (b2->oldest_modification < b1->oldest_modification) {
-		return(-1);
-	}
-
-	/* If oldest_modification is same then decide on the space. */
-	ret = (int)(b2->id.space() - b1->id.space());
-
-	/* Or else decide ordering on the page number. */
-	return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
-}
-
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void)
-/*==========================*/
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		ut_ad(buf_pool->flush_rbt == NULL);
-
-		/* Create red black tree for speedy insertions in flush list. */
-		buf_pool->flush_rbt = rbt_create(
-			sizeof(buf_page_t*), buf_flush_block_cmp);
-
-		buf_flush_list_mutex_exit(buf_pool);
-	}
-}
-
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void)
-/*==========================*/
+/** Wake up the page cleaner if needed */
+void buf_pool_t::page_cleaner_wakeup()
 {
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-		rbt_free(buf_pool->flush_rbt);
-		buf_pool->flush_rbt = NULL;
-
-		buf_flush_list_mutex_exit(buf_pool);
-	}
+  if (!page_cleaner_idle())
+    return;
+  double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
+    double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+  double pct_lwm= srv_max_dirty_pages_pct_lwm;
+
+  /* If pct_lwm != 0.0, adaptive flushing is enabled.
+  Signal the buf page cleaner thread:
+  - if pct_lwm <= dirty_pct then it will invoke the adaptive flushing flow
+  - if pct_lwm > dirty_pct then it will invoke the idle flushing flow.
+
+  idle_flushing:
+  dirty_pct < innodb_max_dirty_pages_pct_lwm so it could be an
+  idle flushing use-case.
+
+  Why is last_activity_count not always updated?
+  - Let's first understand when the server activity count is updated.
+  - It is updated on commit of a transaction, trx_t::commit(), and not
+    on adding a page to the flush list.
+  - page_cleaner_wakeup is called when a page is added to the flush list.
+
+  - Now let's say the first user thread updates the count from X -> Y but
+    is yet to commit the transaction (so activity count is still Y).
+    Follow-up user threads will see the updated count as (Y), which matches
+    the universal server activity count (Y), giving a false impression that
+    the server is idle.
+
+  How to avoid this?
+  - By allowing last_activity_count to be updated when the page cleaner is
+    made active and has work to do. This ensures that the last_activity signal
+    is consumed by the page cleaner before the next one is generated. */
+  if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) ||
+      (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) ||
+      srv_max_buf_pool_modified_pct <= dirty_pct)
+  {
+    page_cleaner_is_idle= false;
+    pthread_cond_signal(&do_flush_list);
+  }
 }
 
-/********************************************************************//**
-Inserts a modified block into the flush list. */
-void
-buf_flush_insert_into_flush_list(
-/*=============================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	buf_block_t*	block,		/*!< in/out: block which is modified */
-	lsn_t		lsn)		/*!< in: oldest modification */
+inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
 {
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-	ut_ad(log_flush_order_mutex_own());
-	ut_ad(buf_page_mutex_own(block));
-
-	buf_flush_list_mutex_enter(buf_pool);
-	ut_ad(!block->page.in_flush_list);
-	ut_d(block->page.in_flush_list = TRUE);
-	ut_ad(!block->page.oldest_modification);
-	block->page.oldest_modification = lsn;
-	MEM_CHECK_DEFINED(block->page.zip.data
-			  ? block->page.zip.data : block->frame,
-			  block->physical_size());
-	incr_flush_list_size_in_bytes(block, buf_pool);
-
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
-		/* The field in_LRU_list is protected by buf_pool->mutex, which
-		we are not holding.  However, while a block is in the flush
-		list, it is dirty and cannot be discarded, not from the
-		page_hash or from the LRU list.  At most, the uncompressed
-		page frame of a compressed block may be discarded or created
-		(copying the block->page to or from a buf_page_t that is
-		dynamically allocated from buf_buddy_alloc()).  Because those
-		transitions hold block->mutex and the flush list mutex (via
-		buf_flush_relocate_on_flush_list()), there is no possibility
-		of a race condition in the assertions below. */
-		ut_ad(block->page.in_LRU_list);
-		ut_ad(block->page.in_page_hash);
-		/* buf_buddy_block_register() will take a block in the
-		BUF_BLOCK_MEMORY state, not a file page. */
-		ut_ad(!block->page.in_zip_hash);
-
-		if (buf_page_t* prev_b =
-		    buf_flush_insert_in_flush_rbt(&block->page)) {
-			UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
-			goto func_exit;
-		}
-	}
-
-	UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
-func_exit:
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_skip(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_flush_list_mutex_exit(buf_pool);
+  ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+  mysql_mutex_assert_owner(&flush_list_mutex);
+  flush_hp.adjust(bpage);
+  UT_LIST_REMOVE(flush_list, bpage);
 }
 
-/********************************************************************//**
-Returns TRUE if the file page block is immediately suitable for replacement,
-i.e., the transition FILE_PAGE => NOT_USED allowed.
-@return TRUE if can replace immediately */
-ibool
-buf_flush_ready_for_replace(
-/*========================*/
-	buf_page_t*	bpage)	/*!< in: buffer control block, must be
-				buf_page_in_file(bpage) and in the LRU list */
+/** Insert a modified block into the flush list.
+@param block    modified block
+@param lsn      start LSN of the mini-transaction that modified the block */
+void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(bpage->in_LRU_list);
-	ut_a(buf_page_in_file(bpage));
-
-	return bpage->oldest_modification == 0
-		&& bpage->buf_fix_count == 0
-		&& buf_page_get_io_fix(bpage) == BUF_IO_NONE;
+  mysql_mutex_assert_not_owner(&mutex);
+  mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
+  ut_ad(lsn > 2);
+  ut_ad(!fsp_is_system_temporary(block->page.id().space()));
+
+  mysql_mutex_lock(&flush_list_mutex);
+  if (ut_d(const lsn_t old=) block->page.oldest_modification())
+  {
+    ut_ad(old == 1);
+    delete_from_flush_list_low(&block->page);
+  }
+  else
+    stat.flush_list_bytes+= block->physical_size();
+  ut_ad(stat.flush_list_bytes <= curr_pool_size);
+
+  block->page.set_oldest_modification(lsn);
+  MEM_CHECK_DEFINED(block->page.zip.data
+                    ? block->page.zip.data : block->frame,
+                    block->physical_size());
+  UT_LIST_ADD_FIRST(flush_list, &block->page);
+  ut_d(buf_flush_validate_skip());
+  page_cleaner_wakeup();
+  mysql_mutex_unlock(&flush_list_mutex);
 }
 
-/********************************************************************//**
-Returns true if the block is modified and ready for flushing.
-@return true if can flush immediately */
-bool
-buf_flush_ready_for_flush(
-/*======================*/
-	buf_page_t*	bpage,	/*!< in: buffer control block, must be
-				buf_page_in_file(bpage) */
-	buf_flush_t	flush_type)/*!< in: type of flush */
+/** Remove a block from flush_list.
+@param bpage   buffer pool page
+@param clear   whether to invoke buf_page_t::clear_oldest_modification() */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
 {
+  delete_from_flush_list_low(bpage);
+  stat.flush_list_bytes-= bpage->physical_size();
+  if (clear)
+    bpage->clear_oldest_modification();
 #ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
+  buf_flush_validate_skip();
 #endif /* UNIV_DEBUG */
-
-	ut_a(buf_page_in_file(bpage));
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
-
-	if (bpage->oldest_modification == 0
-	    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
-		return(false);
-	}
-
-	ut_ad(bpage->in_flush_list);
-
-	switch (flush_type) {
-	case BUF_FLUSH_LIST:
-	case BUF_FLUSH_LRU:
-	case BUF_FLUSH_SINGLE_PAGE:
-		return(true);
-
-	case BUF_FLUSH_N_TYPES:
-		break;
-	}
-
-	ut_error;
-	return(false);
 }
 
-/********************************************************************//**
-Remove a block from the flush list of modified blocks. */
-void
-buf_flush_remove(
-/*=============*/
-	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id    tablespace identifier */
+void buf_flush_remove_pages(ulint id)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-#if 0 // FIXME: Rate-limit the output. Move this to the page cleaner?
-	if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) {
-		service_manager_extend_timeout(
-			INNODB_EXTEND_TIMEOUT_INTERVAL,
-			"Flush and remove page with tablespace id %u"
-			", Poolid " ULINTPF ", flush list length " ULINTPF,
-			bpage->space, buf_pool->instance_no,
-			UT_LIST_GET_LEN(buf_pool->flush_list));
-	}
-#endif
+  const page_id_t first(id, 0), end(id + 1, 0);
+  ut_ad(id);
+  mysql_mutex_lock(&buf_pool.mutex);
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(bpage->in_flush_list);
+  for (;;)
+  {
+    bool deferred= false;
 
-	buf_flush_list_mutex_enter(buf_pool);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
 
-	/* Important that we adjust the hazard pointer before removing
-	the bpage from flush list. */
-	buf_pool->flush_hp.adjust(bpage);
-
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_ZIP_PAGE:
-		/* Clean compressed pages should not be on the flush list */
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_error;
-		return;
-	case BUF_BLOCK_ZIP_DIRTY:
-		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
-		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		buf_LRU_insert_zip_clean(bpage);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-		break;
-	case BUF_BLOCK_FILE_PAGE:
-		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
-		break;
-	}
-
-	/* If the flush_rbt is active then delete from there as well. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		buf_flush_delete_from_flush_rbt(bpage);
-	}
+    for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+    {
+      ut_d(const auto s= bpage->state());
+      ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+            s == BUF_BLOCK_REMOVE_HASH);
+      buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
 
-	/* Must be done after we have removed it from the flush_rbt
-	because we assert on in_flush_list in comparison function. */
-	ut_d(bpage->in_flush_list = FALSE);
+      const page_id_t bpage_id(bpage->id());
 
-	buf_pool->stat.flush_list_bytes -= bpage->physical_size();
+      if (bpage_id < first || bpage_id >= end);
+      else if (bpage->io_fix() != BUF_IO_NONE)
+        deferred= true;
+      else
+        buf_pool.delete_from_flush_list(bpage);
 
-	bpage->oldest_modification = 0;
+      bpage= prev;
+    }
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_skip(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 
-	/* If there is an observer that want to know if the asynchronous
-	flushing was done then notify it. */
-	if (bpage->flush_observer != NULL) {
-		bpage->flush_observer->notify_remove(buf_pool, bpage);
+    if (!deferred)
+      break;
 
-		bpage->flush_observer = NULL;
-	}
+    mysql_mutex_unlock(&buf_pool.mutex);
+    os_thread_yield();
+    mysql_mutex_lock(&buf_pool.mutex);
+    buf_flush_wait_batch_end(false);
+  }
 
-	buf_flush_list_mutex_exit(buf_pool);
+  mysql_mutex_unlock(&buf_pool.mutex);
 }
 
 /*******************************************************************//**
@@ -618,6 +271,7 @@ use the current list node (bpage) to do the list manipulation because
 the list pointers could have changed between the time that we copied
 the contents of bpage to the dpage and the flush list manipulation
 below. */
+ATTRIBUTE_COLD
 void
 buf_flush_relocate_on_flush_list(
 /*=============================*/
@@ -625,113 +279,132 @@ buf_flush_relocate_on_flush_list(
 	buf_page_t*	dpage)	/*!< in/out: destination block */
 {
 	buf_page_t*	prev;
-	buf_page_t*	prev_b = NULL;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	/* Must reside in the same buffer pool. */
-	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
-
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	/* FIXME: At this point we have both buf_pool and flush_list
-	mutexes. Theoretically removal of a block from flush list is
-	only covered by flush_list mutex but currently we do
-	have buf_pool mutex in buf_flush_remove() therefore this block
-	is guaranteed to be in the flush list. We need to check if
-	this will work without the assumption of block removing code
-	having the buf_pool mutex. */
-	ut_ad(bpage->in_flush_list);
-	ut_ad(dpage->in_flush_list);
-
-	/* If recovery is active we must swap the control blocks in
-	the flush_rbt as well. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		buf_flush_delete_from_flush_rbt(bpage);
-		prev_b = buf_flush_insert_in_flush_rbt(dpage);
+
+	mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+	ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+
+	const lsn_t lsn = bpage->oldest_modification();
+
+	if (!lsn) {
+		return;
 	}
 
+	ut_ad(lsn == 1 || lsn > 2);
+	ut_ad(dpage->oldest_modification() == lsn);
+
 	/* Important that we adjust the hazard pointer before removing
 	the bpage from the flush list. */
-	buf_pool->flush_hp.adjust(bpage);
-
-	/* Must be done after we have removed it from the flush_rbt
-	because we assert on in_flush_list in comparison function. */
-	ut_d(bpage->in_flush_list = FALSE);
+	buf_pool.flush_hp.adjust(bpage);
 
 	prev = UT_LIST_GET_PREV(list, bpage);
-	UT_LIST_REMOVE(buf_pool->flush_list, bpage);
-
-	if (prev) {
-		ut_ad(prev->in_flush_list);
-		UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
+	UT_LIST_REMOVE(buf_pool.flush_list, bpage);
+
+	bpage->clear_oldest_modification();
+
+	if (lsn == 1) {
+		buf_pool.stat.flush_list_bytes -= dpage->physical_size();
+		dpage->list.prev = nullptr;
+		dpage->list.next = nullptr;
+		dpage->clear_oldest_modification();
+	} else if (prev) {
+		ut_ad(prev->oldest_modification());
+		UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
 	} else {
-		UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
+		UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
 	}
 
-	/* Just an extra check. Previous in flush_list
-	should be the same control block as in flush_rbt. */
-	ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_flush_list_mutex_exit(buf_pool);
+	ut_d(buf_flush_validate_low());
 }
 
-/** Update the flush system data structures when a write is completed.
-@param[in,out]	bpage	flushed page
-@param[in]	dblwr	whether the doublewrite buffer was used */
-void buf_flush_write_complete(buf_page_t* bpage, bool dblwr)
+/** Complete write of a file page from buf_pool.
+@param request write request */
+void buf_page_write_complete(const IORequest &request)
 {
-	buf_flush_t	flush_type;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(bpage);
-
-	buf_flush_remove(bpage);
-
-	flush_type = buf_page_get_flush_type(bpage);
-	buf_pool->n_flush[flush_type]--;
-	ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	if (buf_pool->n_flush[flush_type] == 0
-	    && buf_pool->init_flush[flush_type] == FALSE) {
-
-		/* The running flush batch has ended */
-
-		os_event_set(buf_pool->no_flush[flush_type]);
-	}
-
-	if (dblwr) {
-		buf_dblwr_update(bpage, flush_type);
-	}
+  ut_ad(request.is_write());
+  ut_ad(!srv_read_only_mode/* ||
+        request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
+  buf_page_t *bpage= request.bpage;
+  ut_ad(bpage);
+  ut_ad(bpage->in_file());
+  /* bpage->io_fix() can only be changed by buf_page_write_complete()
+  and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */
+  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+  ut_ad(!buf_dblwr.is_inside(bpage->id()));
+  ut_ad(request.node->space->id == bpage->id().space());
+
+  if (bpage->status == buf_page_t::INIT_ON_FLUSH)
+    bpage->status= buf_page_t::NORMAL;
+  else
+  {
+    ut_ad(bpage->status == buf_page_t::NORMAL);
+    if (request.node->space->use_doublewrite())
+    {
+      ut_ad(request.node->space != fil_system.temp_space);
+      buf_dblwr.write_completed();
+    }
+  }
+
+  if (bpage->slot)
+  {
+    bpage->slot->release();
+    bpage->slot= nullptr;
+  }
+
+  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+    buf_page_monitor(bpage, BUF_IO_WRITE);
+  DBUG_PRINT("ib_buf", ("write page %u:%u",
+                        bpage->id().space(), bpage->id().page_no()));
+  const bool temp= fsp_is_system_temporary(bpage->id().space());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+  buf_pool.stat.n_pages_written++;
+  /* While we do not need any mutex for clearing oldest_modification
+  here, we hope that it will be in the same cache line with io_fix,
+  whose changes must be protected by buf_pool.mutex. */
+  ut_ad(temp || bpage->oldest_modification() > 2);
+  bpage->clear_oldest_modification(temp);
+  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+  bpage->set_io_fix(BUF_IO_NONE);
+
+  /* Because the thread that does the unlocking might not be the same one
+  that did the locking, we use a pass value != 0 in unlock, which simply
+  removes the newest lock debug record, without checking the thread id. */
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+    rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
+
+  if (request.is_LRU())
+  {
+    buf_LRU_free_page(bpage, true);
+
+    ut_ad(buf_pool.n_flush_LRU_);
+    if (!--buf_pool.n_flush_LRU_)
+    {
+      pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+      pthread_cond_signal(&buf_pool.done_free);
+    }
+  }
+  else
+  {
+    ut_ad(!temp);
+    ut_ad(buf_pool.n_flush_list_);
+    if (!--buf_pool.n_flush_list_)
+      pthread_cond_broadcast(&buf_pool.done_flush_list);
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
 }
 
-/** Calculate the checksum of a page from compressed table and update
-the page.
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
 @param[in,out]	page		page to update
-@param[in]	size		compressed page size
-@param[in]	lsn		LSN to stamp on the page */
-void
-buf_flush_update_zip_checksum(
-	buf_frame_t*	page,
-	ulint		size,
-	lsn_t		lsn)
+@param[in]	size		compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
 {
-	ut_a(size > 0);
-
-	const uint32_t	checksum = page_zip_calc_checksum(
-		page, size,
-		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
-
-	mach_write_to_8(page + FIL_PAGE_LSN, lsn);
-	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+  ut_ad(size > 0);
+  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+                  page_zip_calc_checksum(page, size,
+                                         static_cast<srv_checksum_algorithm_t>
+                                         (srv_checksum_algorithm)));
 }
 
 /** Assign the full crc32 checksum for non-compressed page.
@@ -755,14 +428,12 @@ void buf_flush_assign_full_crc32_checksum(byte* page)
 @param[in,out]	page			page frame
 @param[in,out]	page_zip_		compressed page, or NULL if
 					uncompressed
-@param[in]	newest_lsn		newest modification LSN to the page
 @param[in]	use_full_checksum	whether tablespace uses full checksum */
 void
 buf_flush_init_for_writing(
 	const buf_block_t*	block,
 	byte*			page,
 	void*			page_zip_,
-	lsn_t			newest_lsn,
 	bool			use_full_checksum)
 {
 	if (block != NULL && block->frame != page) {
@@ -775,19 +446,7 @@ buf_flush_init_for_writing(
 	ut_ad(block == NULL || block->frame == page);
 	ut_ad(block == NULL || page_zip_ == NULL
 	      || &block->page.zip == page_zip_);
-	ut_ad(!block || newest_lsn);
 	ut_ad(page);
-	/* Encryption key rotation procedure can write dummy log records to
-	update page's space id, what causes page LSN update, and we need some
-	additional check during recovery to be sure the page is freshly
-	allocated, see buf_page_create() to find such patterns */
-	ut_ad(fil_page_get_type(page)
-	      || (!newest_lsn
-		  || (mach_read_from_4(page + FIL_PAGE_SPACE_ID)
-			      == block->page.id.space()
-		      && mach_read_from_4(page + FIL_PAGE_PREV) == 0xffffffff
-		      && mach_read_from_4(page + FIL_PAGE_NEXT) == 0xffffffff
-		      && !mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM))));
 
 	if (page_zip_) {
 		page_zip_des_t*	page_zip;
@@ -813,10 +472,7 @@ buf_flush_init_for_writing(
 		case FIL_PAGE_TYPE_ZBLOB2:
 		case FIL_PAGE_INDEX:
 		case FIL_PAGE_RTREE:
-
-			buf_flush_update_zip_checksum(
-				page_zip->data, size, newest_lsn);
-
+			buf_flush_update_zip_checksum(page_zip->data, size);
 			return;
 		}
 
@@ -829,18 +485,20 @@ buf_flush_init_for_writing(
 		ut_error;
 	}
 
-	/* Write the newest modification lsn to the page header and trailer */
-	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
-
 	if (use_full_checksum) {
-		mach_write_to_4(page + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
-				static_cast<uint32_t>(newest_lsn));
+		static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned");
+		static_assert(FIL_PAGE_LSN % 4 == 0, "aligned");
+		memcpy_aligned<4>(page + srv_page_size
+				  - FIL_PAGE_FCRC32_END_LSN,
+				  FIL_PAGE_LSN + 4 + page, 4);
 		return buf_flush_assign_full_crc32_checksum(page);
-	} else {
-		mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
-				newest_lsn);
 	}
 
+	static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned");
+	static_assert(FIL_PAGE_LSN % 8 == 0, "aligned");
+	memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+			  FIL_PAGE_LSN + page, 8);
+
 	if (block && srv_page_size == 16384) {
 		/* The page type could be garbage in old files
 		created before MySQL 5.5. Such files always
@@ -848,9 +506,9 @@ buf_flush_init_for_writing(
 		ulint	page_type = fil_page_get_type(page);
 		ulint	reset_type = page_type;
 
-		switch (block->page.id.page_no() % 16384) {
+		switch (block->page.id().page_no() % 16384) {
 		case 0:
-			reset_type = block->page.id.page_no() == 0
+			reset_type = block->page.id().page_no() == 0
 				? FIL_PAGE_TYPE_FSP_HDR
 				: FIL_PAGE_TYPE_XDES;
 			break;
@@ -858,10 +516,8 @@ buf_flush_init_for_writing(
 			reset_type = FIL_PAGE_IBUF_BITMAP;
 			break;
 		case FSP_TRX_SYS_PAGE_NO:
-			if (block->page.id.page_no()
-			    == TRX_SYS_PAGE_NO
-			    && block->page.id.space()
-			    == TRX_SYS_SPACE) {
+			if (block->page.id()
+			    == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) {
 				reset_type = FIL_PAGE_TYPE_TRX_SYS;
 				break;
 			}
@@ -896,7 +552,7 @@ buf_flush_init_for_writing(
 		if (UNIV_UNLIKELY(page_type != reset_type)) {
 			ib::info()
 				<< "Resetting invalid page "
-				<< block->page.id << " type "
+				<< block->page.id() << " type "
 				<< page_type << " to "
 				<< reset_type << " when flushing.";
 			fil_page_set_type(page, reset_type);
@@ -939,577 +595,583 @@ buf_flush_init_for_writing(
 			checksum);
 }
 
-/********************************************************************//**
-Does an asynchronous write of a buffer page. NOTE: in simulated aio and
-also when the doublewrite buffer is used, we must call
-buf_dblwr_flush_buffered_writes after we have posted a batch of
-writes! */
-static
-void
-buf_flush_write_block_low(
-/*======================*/
-	buf_page_t*	bpage,		/*!< in: buffer block to write */
-	buf_flush_t	flush_type,	/*!< in: type of flush */
-	bool		sync)		/*!< in: true if sync IO request */
+/** Reserve a buffer for compression.
+@param[in,out]  slot    reserved slot */
+static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
 {
-	fil_space_t* space = fil_space_acquire_for_io(bpage->id.space());
-	if (!space) {
-		return;
-	}
-	ut_ad(space->purpose == FIL_TYPE_TEMPORARY
-	      || space->purpose == FIL_TYPE_IMPORT
-	      || space->purpose == FIL_TYPE_TABLESPACE);
-	ut_ad((space->purpose == FIL_TYPE_TEMPORARY)
-	      == (space == fil_system.temp_space));
-
-	page_t*	frame = NULL;
-	const bool full_crc32 = space->full_crc32();
-
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-
-	DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
-			      sync ? "sync" : "async", (unsigned) flush_type,
-			      bpage->id.space(), bpage->id.page_no()));
-
-	ut_ad(buf_page_in_file(bpage));
-
-	/* We are not holding buf_pool->mutex or block_mutex here.
-	Nevertheless, it is safe to access bpage, because it is
-	io_fixed and oldest_modification != 0.  Thus, it cannot be
-	relocated in the buffer pool or removed from flush_list or
-	LRU_list. */
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-	ut_ad(!buf_flush_list_mutex_own(buf_pool));
-	ut_ad(!buf_page_get_mutex(bpage)->is_owned());
-	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
-	ut_ad(bpage->oldest_modification != 0);
-	ut_ad(bpage->newest_modification != 0);
-
-	/* Force the log to the disk before writing the modified block */
-	if (!srv_read_only_mode) {
-		log_write_up_to(bpage->newest_modification, true);
-	}
-
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_error;
-		break;
-	case BUF_BLOCK_ZIP_DIRTY:
-		frame = bpage->zip.data;
-		buf_flush_update_zip_checksum(frame, bpage->zip_size(),
-					      bpage->newest_modification);
-		break;
-	case BUF_BLOCK_FILE_PAGE:
-		frame = bpage->zip.data;
-		if (!frame) {
-			frame = ((buf_block_t*) bpage)->frame;
-		}
-
-		byte* page = reinterpret_cast<const buf_block_t*>(bpage)->frame;
-
-		if (full_crc32) {
-			page = buf_page_encrypt(space, bpage, page);
-			frame = page;
-		}
-
-		buf_flush_init_for_writing(
-			reinterpret_cast<const buf_block_t*>(bpage), page,
-			bpage->zip.data ? &bpage->zip : NULL,
-			bpage->newest_modification, full_crc32);
-		break;
-	}
-
-	if (!full_crc32) {
-		frame = buf_page_encrypt(space, bpage, frame);
-	}
-
-	ut_ad(space->purpose == FIL_TYPE_TABLESPACE
-	      || space->atomic_write_supported);
-	if (!space->use_doublewrite()) {
-		ulint	type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
-
-		IORequest	request(type, bpage);
-
-		/* TODO: pass the tablespace to fil_io() */
-		fil_io(request,
-		       sync, bpage->id, bpage->zip_size(), 0,
-		       bpage->physical_size(),
-		       frame, bpage);
-	} else {
-		ut_ad(!srv_read_only_mode);
-
-		if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
-			buf_dblwr_write_single_page(bpage, sync);
-		} else {
-			ut_ad(!sync);
-			buf_dblwr_add_to_batch(bpage);
-		}
-	}
-
-	/* When doing single page flushing the IO is done synchronously
-	and we flush the changes to disk only for the tablespace we
-	are working on. */
-	if (sync) {
-		ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
-		if (space->purpose != FIL_TYPE_TEMPORARY) {
-			fil_flush(space);
-		}
-
-		/* The tablespace could already have been dropped,
-		because fil_io(request, sync) would already have
-		decremented the node->n_pending. However,
-		buf_page_io_complete() only needs to look up the
-		tablespace during read requests, not during writes. */
-		ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
-#ifdef UNIV_DEBUG
-		dberr_t err =
+  if (slot->comp_buf)
+    return;
+  /* Both Snappy and LZO compression methods require that the output
+  buffer be bigger than the input buffer. Adjust the allocated size. */
+  ulint size= srv_page_size;
+#ifdef HAVE_LZO
+  size+= LZO1X_1_15_MEM_COMPRESS;
+#elif defined HAVE_SNAPPY
+  size= snappy_max_compressed_length(size);
 #endif
-		/* true means we want to evict this page from the
-		LRU list as well. */
-		buf_page_io_complete(bpage, space->use_doublewrite(), true);
-
-		ut_ad(err == DB_SUCCESS);
-	}
-
-	space->release_for_io();
-
-	/* Increment the counter of I/O operations used
-	for selecting LRU policy. */
-	buf_LRU_stat_inc_io();
+  slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size));
 }
 
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: in simulated aio we must call
-os_aio_simulated_wake_handler_threads after we have posted a batch of
-writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
-held upon entering this function, and they will be released by this
-function if it returns true.
-@return TRUE if the page was flushed */
-ibool
-buf_flush_page(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in: buffer control block */
-	buf_flush_t	flush_type,	/*!< in: type of flush */
-	bool		sync)		/*!< in: true if sync IO request */
+/** Encrypt a page of the temporary tablespace.
+@param[in]      offset  page number (scaled by srv_page_size before use)
+@param[in]      s       page to encrypt
+@param[in,out]  d       output buffer
+@return the output buffer d, or NULL if log_tmp_block_encrypt() fails */
+static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d)
 {
-	BPageMutex*	block_mutex;
-
-	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_in_file(bpage));
-	ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
-
-	block_mutex = buf_page_get_mutex(bpage);
-	ut_ad(mutex_own(block_mutex));
-
-	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
-
-	bool	is_uncompressed;
-
-	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
-
-	ibool		flush;
-	rw_lock_t*	rw_lock;
-	bool		no_fix_count = bpage->buf_fix_count == 0;
-
-	if (!is_uncompressed) {
-		flush = TRUE;
-		rw_lock = NULL;
-	} else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
-		   || (!no_fix_count
-		       && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
-		       && fsp_is_system_temporary(bpage->id.space()))) {
-		/* This is a heuristic, to avoid expensive SX attempts. */
-		/* For table residing in temporary tablespace sync is done
-		using IO_FIX and so before scheduling for flush ensure that
-		page is not fixed. */
-		flush = FALSE;
-	} else {
-		rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
-		if (flush_type != BUF_FLUSH_LIST) {
-			flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
-		} else {
-			/* Will SX lock later */
-			flush = TRUE;
-		}
-	}
-
-	if (flush) {
-
-		/* We are committed to flushing by the time we get here */
-
-		buf_page_set_io_fix(bpage, BUF_IO_WRITE);
-
-		buf_page_set_flush_type(bpage, flush_type);
-
-		if (buf_pool->n_flush[flush_type] == 0) {
-			os_event_reset(buf_pool->no_flush[flush_type]);
-		}
-
-		++buf_pool->n_flush[flush_type];
-		ut_ad(buf_pool->n_flush[flush_type] != 0);
+  /* Calculate the start offset in a page */
+  uint srclen= static_cast<uint>(srv_page_size) -
+    (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +
+     FIL_PAGE_FCRC32_CHECKSUM);
+  const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+  byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
 
-		mutex_exit(block_mutex);
+  memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
 
-		buf_pool_mutex_exit(buf_pool);
+  if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true))
+    return NULL;
 
-		if (flush_type == BUF_FLUSH_LIST
-		    && is_uncompressed
-		    && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
-
-			if (!fsp_is_system_temporary(bpage->id.space())) {
-				/* avoiding deadlock possibility involves
-				doublewrite buffer, should flush it, because
-				it might hold the another block->lock. */
-				buf_dblwr_flush_buffered_writes();
-			} else {
-				buf_dblwr_sync_datafiles();
-			}
+  const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+  mach_write_to_4(d + payload, ut_crc32(d, payload));
 
-			rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
-		}
+  srv_stats.pages_encrypted.inc();
+  srv_stats.n_temp_blocks_encrypted.inc();
+  return d;
+}
 
-		/* If there is an observer that want to know if the asynchronous
-		flushing was sent then notify it.
-		Note: we set flush observer to a page with x-latch, so we can
-		guarantee that notify_flush and notify_remove are called in pair
-		with s-latch on a uncompressed page. */
-		if (bpage->flush_observer != NULL) {
-			buf_pool_mutex_enter(buf_pool);
+/** Encryption and page_compression hook that is called just before
+a page is written to disk.
+@param[in,out]  space   tablespace
+@param[in,out]  bpage   buffer page
+@param[in]      s       physical page frame that is being encrypted
+@param[in,out]  size    payload size in bytes
+@return page frame to be written to file
+(may be s itself, or an encrypted and/or compressed copy of it) */
+static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
+                              size_t *size)
+{
+  ut_ad(bpage->status != buf_page_t::FREED);
+  ut_ad(space->id == bpage->id().space());
+
+  ut_d(fil_page_type_validate(space, s));
+  const uint32_t page_no= bpage->id().page_no();
+
+  switch (page_no) {
+  case TRX_SYS_PAGE_NO:
+    if (bpage->id().space() != TRX_SYS_SPACE)
+      break;
+    /* The TRX_SYS page is neither encrypted nor compressed, because
+    it contains the address of the doublewrite buffer. */
+    /* fall through */
+  case 0:
+    /* Page 0 of a tablespace is not encrypted/compressed */
+    return s;
+  }
+
+  fil_space_crypt_t *crypt_data= space->crypt_data;
+  bool encrypted, page_compressed;
+  if (space->purpose == FIL_TYPE_TEMPORARY)
+  {
+    ut_ad(!crypt_data);
+    encrypted= innodb_encrypt_temporary_tables;
+    page_compressed= false;
+  }
+  else
+  {
+    encrypted= crypt_data && !crypt_data->not_encrypted() &&
+      crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
+      (!crypt_data->is_default_encryption() || srv_encrypt_tables);
+    page_compressed= space->is_compressed();
+  }
+
+  const bool full_crc32= space->full_crc32();
+
+  if (!encrypted && !page_compressed)
+  {
+    /* No need to encrypt or compress. Clear key-version & crypt-checksum. */
+    static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment");
+    static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2,
+                  "not perfect alignment");
+    if (full_crc32)
+      memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
+    else
+      memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+    return s;
+  }
+
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+  if (full_crc32)
+    memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
+                      FIL_PAGE_LSN + 4 + s, 4);
+
+  ut_ad(!bpage->zip_size() || !page_compressed);
+  /* Reserve a free slot from the temporary memory array */
+  buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve();
+  ut_a(slot);
+  slot->allocate();
+  slot->out_buf= NULL;
+  bpage->slot= slot;
+
+  byte *d= slot->crypt_buf;
+
+  if (!page_compressed)
+  {
+not_compressed:
+    byte *tmp= space->purpose == FIL_TYPE_TEMPORARY
+      ? buf_tmp_page_encrypt(page_no, s, d)
+      : fil_space_encrypt(space, page_no, s, d);
+
+    slot->out_buf= d= tmp;
+
+    ut_d(fil_page_type_validate(space, tmp));
+  }
+  else
+  {
+    ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
+    /* First we compress the page content */
+    buf_tmp_reserve_compression_buf(slot);
+    byte *tmp= slot->comp_buf;
+    ulint len= fil_page_compress(s, tmp, space->flags,
+                                 fil_space_get_block_size(space, page_no),
+                                 encrypted);
+
+    if (!len)
+      goto not_compressed;
+
+    *size= len;
+
+    if (full_crc32)
+    {
+      ut_d(bool compressed = false);
+      len= buf_page_full_crc32_size(tmp,
+#ifdef UNIV_DEBUG
+                                    &compressed,
+#else
+                                    NULL,
+#endif
+                                    NULL);
+      ut_ad(compressed);
+    }
 
-			bpage->flush_observer->notify_flush(buf_pool, bpage);
+    /* Workaround for MDEV-15527. */
+    memset(tmp + len, 0 , srv_page_size - len);
+    ut_d(fil_page_type_validate(space, tmp));
 
-			buf_pool_mutex_exit(buf_pool);
-		}
+    if (encrypted)
+      tmp = fil_space_encrypt(space, page_no, tmp, d);
 
-		/* Even though bpage is not protected by any mutex at this
-		point, it is safe to access bpage, because it is io_fixed and
-		oldest_modification != 0.  Thus, it cannot be relocated in the
-		buffer pool or removed from flush_list or LRU_list. */
+    if (full_crc32)
+    {
+      static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+      mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4));
+      ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
+    }
 
-		buf_flush_write_block_low(bpage, flush_type, sync);
-	}
+    slot->out_buf= d= tmp;
+  }
 
-	return(flush);
+  ut_d(fil_page_type_validate(space, d));
+  return d;
 }
 
-# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: buf_pool->mutex and block->mutex must be held upon entering this
-function, and they will be released by this function after flushing.
-This is loosely based on buf_flush_batch() and buf_flush_page().
-@return TRUE if the page was flushed and the mutexes released */
-ibool
-buf_flush_page_try(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_block_t*	block)		/*!< in/out: buffer control block */
+/** Free a page whose underlying file page has been freed. */
+inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(buf_page_mutex_own(block));
-
-	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
-		return(FALSE);
-	}
-
-	/* The following call will release the buffer pool and
-	block mutex. */
-	return(buf_flush_page(
-			buf_pool, &block->page,
-			BUF_FLUSH_SINGLE_PAGE, true));
+  ut_ad(bpage->in_file());
+  const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
+  mysql_mutex_lock(&mutex);
+  bpage->set_io_fix(BUF_IO_NONE);
+  bpage->status= buf_page_t::NORMAL;
+  mysql_mutex_lock(&flush_list_mutex);
+  ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
+  if (fsp_is_system_temporary(bpage->id().space()))
+  {
+    ut_ad(uncompressed);
+    ut_ad(oldest_modification == 2);
+  }
+  else
+  {
+    ut_ad(oldest_modification > 2);
+    delete_from_flush_list(bpage, false);
+  }
+  bpage->clear_oldest_modification();
+  mysql_mutex_unlock(&flush_list_mutex);
+
+  if (uncompressed)
+    rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+                          BUF_IO_WRITE);
+
+  buf_LRU_free_page(bpage, true);
+  mysql_mutex_unlock(&mutex);
 }
-# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
-/** Check the page is in buffer pool and can be flushed.
-@param[in]	page_id		page id
-@param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
-@return true if the page can be flushed. */
-static
-bool
-buf_flush_check_neighbor(
-	const page_id_t		page_id,
-	buf_flush_t		flush_type)
+/** Write a flushable page from buf_pool to a file.
+buf_pool.mutex must be held.
+@param bpage       buffer control block
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@param space       tablespace
+@return whether the page was flushed and buf_pool.mutex was released */
+static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
 {
-	buf_page_t*	bpage;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	bool		ret;
-
-	ut_ad(flush_type == BUF_FLUSH_LRU
-	      || flush_type == BUF_FLUSH_LIST);
-
-	buf_pool_mutex_enter(buf_pool);
-
-	/* We only want to flush pages from this buffer pool. */
-	bpage = buf_page_hash_get(buf_pool, page_id);
-
-	if (!bpage) {
-
-		buf_pool_mutex_exit(buf_pool);
-		return(false);
-	}
-
-	ut_a(buf_page_in_file(bpage));
-
-	/* We avoid flushing 'non-old' blocks in an LRU flush,
-	because the flushed blocks are soon freed */
-
-	ret = false;
-	if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
-		BPageMutex* block_mutex = buf_page_get_mutex(bpage);
-
-		mutex_enter(block_mutex);
-		if (buf_flush_ready_for_flush(bpage, flush_type)) {
-			ret = true;
-		}
-		mutex_exit(block_mutex);
-	}
-	buf_pool_mutex_exit(buf_pool);
+  ut_ad(bpage->in_file());
+  ut_ad(bpage->ready_for_flush());
+  ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
+        (space == fil_system.temp_space));
+  ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
+        space->atomic_write_supported);
+  ut_ad(space->referenced());
+  ut_ad(lru || space != fil_system.temp_space);
+
+  rw_lock_t *rw_lock;
+
+  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+    rw_lock= nullptr;
+  else
+  {
+    rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+    if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
+      return false;
+  }
+
+  bpage->set_io_fix(BUF_IO_WRITE);
+  /* Because bpage->status can only be changed while buf_block_t
+  exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
+  without first allocating the uncompressed page frame. Such
+  allocation cannot be completed due to our io_fix. So, bpage->status
+  is protected even if !rw_lock. */
+  const auto status= bpage->status;
+
+  if (status != buf_page_t::FREED)
+  {
+    if (lru)
+      buf_pool.n_flush_LRU_++;
+    else
+      buf_pool.n_flush_list_++;
+    buf_flush_page_count++;
+  }
+
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+  /* We are holding rw_lock = buf_block_t::lock in SX mode except if
+  this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
+  has been evicted from the buffer pool.
+
+  Apart from possible rw_lock protection, bpage is also protected by
+  io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
+  the buffer pool or removed from flush_list or LRU_list. */
+
+  DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
+                        lru ? "LRU" : "flush_list",
+                        bpage->id().space(), bpage->id().page_no()));
+  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+  ut_d(const lsn_t oldest_modification= bpage->oldest_modification());
+  ut_ad(space == fil_system.temp_space
+        ? oldest_modification == 2
+        : oldest_modification > 2);
+  ut_ad(bpage->state() ==
+        (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
+  ut_ad(ULINT_UNDEFINED >
+        (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+  page_t *frame= bpage->zip.data;
+
+  if (status == buf_page_t::FREED)
+    buf_pool.release_freed_page(&block->page);
+  else
+  {
+    space->reacquire();
+    ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
+    size_t size;
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+    size_t orig_size;
+#endif
+    IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
+
+    if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
+    {
+      ut_ad(!space->full_crc32());
+      ut_ad(!space->is_compressed()); /* not page_compressed */
+      size= bpage->zip_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      orig_size= size;
+#endif
+      buf_flush_update_zip_checksum(frame, size);
+      frame= buf_page_encrypt(space, bpage, frame, &size);
+      ut_ad(size == bpage->zip_size());
+    }
+    else
+    {
+      byte *page= block->frame;
+      size= block->physical_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      orig_size= size;
+#endif
 
-	return(ret);
+      if (space->full_crc32())
+      {
+        /* innodb_checksum_algorithm=full_crc32 is not implemented for
+        ROW_FORMAT=COMPRESSED pages. */
+        ut_ad(!frame);
+        page= buf_page_encrypt(space, bpage, page, &size);
+        buf_flush_init_for_writing(block, page, nullptr, true);
+      }
+      else
+      {
+        buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
+                                   false);
+        page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
+      }
+
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      if (size != orig_size && space->punch_hole)
+        type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
+#endif
+      frame=page;
+    }
+
+    ut_ad(status == bpage->status);
+    ut_ad(oldest_modification == bpage->oldest_modification());
+
+    if (status != buf_page_t::NORMAL || !space->use_doublewrite())
+    {
+      if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
+      {
+        const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                          (FIL_PAGE_LSN + (frame ? frame
+                                                           : block->frame)));
+        ut_ad(lsn >= oldest_modification);
+        if (lsn > log_sys.get_flushed_lsn())
+          log_write_up_to(lsn, true);
+      }
+      space->io(IORequest(type, bpage),
+                bpage->physical_offset(), size, frame, bpage);
+    }
+    else
+      buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
+  }
+
+  /* Increment the I/O operation count used for selecting LRU policy. */
+  buf_LRU_stat_inc_io();
+  return true;
 }
 
-/** Flushes to disk all flushable pages within the flush area.
-@param[in]	page_id		page id
-@param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
-@param[in]	n_flushed	number of pages flushed so far in this batch
-@param[in]	n_to_flush	maximum number of pages we are allowed to flush
-@return number of pages flushed */
-static
-ulint
-buf_flush_try_neighbors(
-	const page_id_t		page_id,
-	buf_flush_t		flush_type,
-	ulint			n_flushed,
-	ulint			n_to_flush)
+/** Check whether a page can be flushed from the buf_pool.
+@param id          page identifier
+@param fold        id.fold()
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@return whether the page can be flushed */
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
 {
-	ulint		i;
-	ulint		low;
-	ulint		high;
-	ulint		count = 0;
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-	fil_space_t* space = fil_space_acquire_for_io(page_id.space());
-	if (!space) {
-		return 0;
-	}
-
-	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
-	    || !srv_flush_neighbors || !space->is_rotational()) {
-		/* If there is little space or neighbor flushing is
-		not enabled then just flush the victim. */
-		low = page_id.page_no();
-		high = page_id.page_no() + 1;
-	} else {
-		/* When flushed, dirty blocks are searched in
-		neighborhoods of this size, and flushed along with the
-		original page. */
-
-		ulint	buf_flush_area;
-
-		buf_flush_area	= ut_min(
-			BUF_READ_AHEAD_AREA(buf_pool),
-			buf_pool->curr_size / 16);
-
-		low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
-		high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
-
-		if (srv_flush_neighbors == 1) {
-			/* adjust 'low' and 'high' to limit
-			   for contiguous dirty area */
-			if (page_id.page_no() > low) {
-				for (i = page_id.page_no() - 1; i >= low; i--) {
-					if (!buf_flush_check_neighbor(
-						page_id_t(page_id.space(), i),
-						flush_type)) {
-
-						break;
-					}
-
-					if (i == low) {
-						/* Avoid overwrap when low == 0
-						and calling
-						buf_flush_check_neighbor() with
-						i == (ulint) -1 */
-						i--;
-						break;
-					}
-				}
-				low = i + 1;
-			}
-
-			for (i = page_id.page_no() + 1;
-			     i < high
-			     && buf_flush_check_neighbor(
-				     page_id_t(page_id.space(), i),
-				     flush_type);
-			     i++) {
-				/* do nothing */
-			}
-			high = i;
-		}
-	}
-
-	high = space->max_page_number_for_io(high);
-
-	DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
-			      page_id.space(),
-			      (unsigned) low, (unsigned) high));
-
-	for (ulint i = low; i < high; i++) {
-		buf_page_t*	bpage;
-
-		if ((count + n_flushed) >= n_to_flush) {
-
-			/* We have already flushed enough pages and
-			should call it a day. There is, however, one
-			exception. If the page whose neighbors we
-			are flushing has not been flushed yet then
-			we'll try to flush the victim that we
-			selected originally. */
-			if (i <= page_id.page_no()) {
-				i = page_id.page_no();
-			} else {
-				break;
-			}
-		}
-
-		const page_id_t	cur_page_id(page_id.space(), i);
-
-		buf_pool = buf_pool_get(cur_page_id);
-
-		buf_pool_mutex_enter(buf_pool);
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(fold == id.fold());
 
-		/* We only want to flush pages from this buffer pool. */
-		bpage = buf_page_hash_get(buf_pool, cur_page_id);
-
-		if (bpage == NULL) {
-
-			buf_pool_mutex_exit(buf_pool);
-			continue;
-		}
-
-		ut_a(buf_page_in_file(bpage));
-
-		/* We avoid flushing 'non-old' blocks in an LRU flush,
-		because the flushed blocks are soon freed */
-
-		if (flush_type != BUF_FLUSH_LRU
-		    || i == page_id.page_no()
-		    || buf_page_is_old(bpage)) {
-
-			BPageMutex* block_mutex = buf_page_get_mutex(bpage);
-
-			mutex_enter(block_mutex);
-
-			if (buf_flush_ready_for_flush(bpage, flush_type)
-			    && (i == page_id.page_no()
-				|| bpage->buf_fix_count == 0)) {
-
-				/* We also try to flush those
-				neighbors != offset */
-
-				if (buf_flush_page(
-					buf_pool, bpage, flush_type, false)) {
-
-					++count;
-				} else {
-					mutex_exit(block_mutex);
-					buf_pool_mutex_exit(buf_pool);
-				}
-
-				continue;
-			} else {
-				mutex_exit(block_mutex);
-			}
-		}
-		buf_pool_mutex_exit(buf_pool);
-	}
+  buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);
 
-	space->release_for_io();
+  if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+    return false;
 
-	if (count > 1) {
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
-			MONITOR_FLUSH_NEIGHBOR_COUNT,
-			MONITOR_FLUSH_NEIGHBOR_PAGES,
-			(count - 1));
-	}
+  /* We avoid flushing 'non-old' blocks in an LRU flush, because the
+  flushed blocks are soon freed */
+  if (lru && !bpage->is_old())
+    return false;
 
-	return(count);
+  return bpage->oldest_modification() > 1 && bpage->ready_for_flush();
 }
 
-/** Check if the block is modified and ready for flushing.
-If the the block is ready to flush then flush the page and try o flush
-its neighbors.
-@param[in]	bpage		buffer control block,
-must be buf_page_in_file(bpage)
-@param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
-@param[in]	n_to_flush	number of pages to flush
-@param[in,out]	count		number of pages flushed
-@return TRUE if buf_pool mutex was released during this function.
-This does not guarantee that some pages were written as well.
-Number of pages written are incremented to the count. */
-static
-bool
-buf_flush_page_and_try_neighbors(
-	buf_page_t*		bpage,
-	buf_flush_t		flush_type,
-	ulint			n_to_flush,
-	ulint*			count)
+/** Check which neighbors of a page can be flushed from the buf_pool.
+@param space       tablespace
+@param id          in: a dirty page; out: first page of the range to flush
+@param contiguous  whether to consider contiguous areas of pages
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@return end of the range to flush (one past the last candidate page) */
+static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
+                                           page_id_t &id, bool contiguous,
+                                           bool lru)
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-
-	bool		flushed;
-	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-
-	mutex_enter(block_mutex);
-
-	ut_a(buf_page_in_file(bpage));
-
-	if (buf_flush_ready_for_flush(bpage, flush_type)) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_bpage(bpage);
-
-		const page_id_t	page_id = bpage->id;
-
-		mutex_exit(block_mutex);
-
-		buf_pool_mutex_exit(buf_pool);
-
-		/* Try to flush also all the neighbors */
-		*count += buf_flush_try_neighbors(
-			page_id, flush_type, *count, n_to_flush);
-
-		buf_pool_mutex_enter(buf_pool);
-		flushed = TRUE;
-	} else {
-		mutex_exit(block_mutex);
-
-		flushed = false;
-	}
+  ut_ad(id.page_no() < space.size +
+        (space.physical_size() == 2048 ? 1
+         : space.physical_size() == 1024 ? 3 : 0));
+  /* When flushed, dirty blocks are searched in neighborhoods of this
+  size, and flushed along with the original page. */
+  const ulint s= buf_pool.curr_size / 16;
+  const uint32_t read_ahead= buf_pool.read_ahead_area;
+  const uint32_t buf_flush_area= read_ahead > s
+    ? static_cast<uint32_t>(s) : read_ahead;
+  page_id_t low= id - (id.page_no() % buf_flush_area);
+  page_id_t high= low + buf_flush_area;
+  high.set_page_no(std::min(high.page_no(), space.last_page_number()));
+
+  if (!contiguous)
+  {
+    high= std::max(id + 1, high);
+    id= low;
+    return high;
+  }
+
+  /* Determine the contiguous dirty area around id. */
+  const ulint id_fold= id.fold();
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  if (id > low)
+  {
+    ulint fold= id_fold;
+    for (page_id_t i= id - 1;; --i)
+    {
+      fold--;
+      if (!buf_flush_check_neighbor(i, fold, lru))
+      {
+        low= i + 1;
+        break;
+      }
+      if (i == low)
+        break;
+    }
+  }
+
+  page_id_t i= id;
+  id= low;
+  ulint fold= id_fold;
+  while (++i < high)
+  {
+    ++fold;
+    if (!buf_flush_check_neighbor(i, fold, lru))
+      break;
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+  return i;
+}
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Punch holes for the freed page ranges of a tablespace, or overwrite
+them with zeroes when innodb_immediate_scrub_data_uncompressed is set.
+@param space    tablespace which may contain ranges of freed pages
+@param writable whether the tablespace is writable (if not, nothing is written)
+@return number of pages written or hole-punched */
+static uint32_t buf_flush_freed_pages(fil_space_t *space, bool writable)
+{
+  const bool punch_hole= space->punch_hole;
+  if (!punch_hole && !srv_immediate_scrub_data_uncompressed)
+    return 0; /* neither hole-punching nor scrubbing is enabled */
+
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+  mysql_mutex_assert_not_owner(&buf_pool.mutex);
+
+  space->freed_range_mutex.lock();
+  if (space->freed_ranges.empty() ||
+      log_sys.get_flushed_lsn() < space->get_last_freed_lsn())
+  {
+    /* Nothing to do, or the log is not yet flushed up to the last freed LSN */
+    space->freed_range_mutex.unlock();
+    return 0;
+  }
+
+  const unsigned physical_size{space->physical_size()};
+
+  range_set freed_ranges= std::move(space->freed_ranges); /* take them over */
+  uint32_t written= 0;
+
+  if (!writable); /* deliberately empty: nothing is written or counted */
+  else if (punch_hole)
+  {
+    for (const auto &range : freed_ranges)
+    {
+      written+= range.last - range.first + 1; /* the range is inclusive */
+      space->reacquire();
+      space->io(IORequest(IORequest::PUNCH_RANGE),
+                          os_offset_t{range.first} * physical_size,
+                          (range.last - range.first + 1) * physical_size,
+                          nullptr); /* NOTE(review): confirm that the length product cannot overflow 32 bits */
+    }
+  }
+  else
+  {
+    for (const auto &range : freed_ranges)
+    {
+      written+= range.last - range.first + 1;
+      for (os_offset_t i= range.first; i <= range.last; i++)
+      {
+        space->reacquire(); /* one reacquire() per submitted request */
+        space->io(IORequest(IORequest::WRITE_ASYNC),
+                  i * physical_size, physical_size,
+                  const_cast<byte*>(field_ref_zero)); /* presumably an all-zero buffer; confirm its size */
+      }
+    }
+  }
+
+  space->freed_range_mutex.unlock();
+  return written;
+}
 
-	return(flushed);
+/** Flush the selected page and the flushable pages within its flush area.
+buf_pool.mutex is acquired here for each candidate page.
+@param space       tablespace
+@param page_id     page identifier
+@param contiguous  whether to consider contiguous areas of pages
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@param n_flushed   number of pages flushed so far in this batch
+@param n_to_flush  maximum number of pages we are allowed to flush
+@return number of pages flushed */
+static ulint buf_flush_try_neighbors(fil_space_t *space,
+                                     const page_id_t page_id,
+                                     bool contiguous, bool lru,
+                                     ulint n_flushed, ulint n_to_flush)
+{
+  ut_ad(space->id == page_id.space());
+
+  ulint count= 0;
+  page_id_t id= page_id;
+  page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);
+
+  ut_ad(page_id >= id);
+  ut_ad(page_id < high);
+
+  for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
+       ++id, ++id_fold)
+  {
+    if (count + n_flushed >= n_to_flush)
+    {
+      if (id > page_id)
+        break;
+      /* If the page whose neighbors we are flushing has not been
+      flushed yet, we must flush the page that we selected originally. */
+      id= page_id;
+      id_fold= id.fold();
+    }
+
+    mysql_mutex_lock(&buf_pool.mutex);
+
+    if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
+    {
+      ut_ad(bpage->in_file());
+      /* We avoid flushing 'non-old' blocks in an LRU flush,
+      because the flushed blocks are soon freed */
+      if (!lru || id == page_id || bpage->is_old())
+      {
+        if (!buf_pool.watch_is_sentinel(*bpage) &&
+            bpage->oldest_modification() > 1 &&
+            bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
+        {
+          ++count;
+          continue; /* buf_flush_page() released buf_pool.mutex */
+        }
+      }
+    }
+
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+
+  if (count > 1) /* count may be 0; count - 1 would then wrap around */
+  {
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+                                 MONITOR_FLUSH_NEIGHBOR_COUNT,
+                                 MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1);
+  }
+
+  return count;
 }
 
 /*******************************************************************//**
@@ -1520,46 +1182,35 @@ tail of the unzip_LRU and puts those freed frames in the free list.
 Note that it is a best effort attempt and it is not guaranteed that
 after a call to this function there will be 'max' blocks in the free
 list.
+@param[in]	max		desired number of blocks in the free_list
 @return number of blocks moved to the free list. */
-static
-ulint
-buf_free_from_unzip_LRU_list_batch(
-/*===============================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		max)		/*!< in: desired number of
-					blocks in the free_list */
+static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
 {
 	ulint		scanned = 0;
 	ulint		count = 0;
-	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
-	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
-	buf_block_t*	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+	buf_block_t*	block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
 
-	while (block != NULL
+	while (block
 	       && count < max
-	       && free_len < srv_LRU_scan_depth
-	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+	       && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
+	       && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+	       > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
 
 		++scanned;
 		if (buf_LRU_free_page(&block->page, false)) {
-			/* Block was freed. buf_pool->mutex potentially
+			/* Block was freed. buf_pool.mutex potentially
 			released and reacquired */
 			++count;
-			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
-
+			block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
 		} else {
-
 			block = UT_LIST_GET_PREV(unzip_LRU, block);
 		}
-
-		free_len = UT_LIST_GET_LEN(buf_pool->free);
-		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
 	}
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
 	if (scanned) {
 		MONITOR_INC_VALUE_CUMULATIVE(
@@ -1572,744 +1223,858 @@ buf_free_from_unzip_LRU_list_batch(
 	return(count);
 }
 
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-The calling thread is not allowed to own any latches on pages!
-It attempts to make 'max' blocks available in the free list. Note that
-it is a best effort attempt and it is not guaranteed that after a call
-to this function there will be 'max' blocks in the free list.*/
-
-void
-buf_flush_LRU_list_batch(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		max,		/*!< in: desired number of
-					blocks in the free_list */
-	flush_counters_t*	n)	/*!< out: flushed/evicted page
-					counts */
+/** Start writing out pages for a tablespace.
+@param id   tablespace identifier
+@return tablespace and number of pages written */
+static std::pair<fil_space_t*, uint32_t> buf_flush_space(const uint32_t id)
 {
-	buf_page_t*	bpage;
-	ulint		scanned = 0;
-	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
-	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
-	ulint		withdraw_depth = 0;
-
-	n->flushed = 0;
-	n->evicted = 0;
-	n->unzip_LRU_evicted = 0;
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	if (buf_pool->curr_size < buf_pool->old_size
-	    && buf_pool->withdraw_target > 0) {
-		withdraw_depth = buf_pool->withdraw_target
-				 - UT_LIST_GET_LEN(buf_pool->withdraw);
-	}
-
-	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-	     bpage != NULL && n->flushed + n->evicted < max
-	     && free_len < srv_LRU_scan_depth + withdraw_depth
-	     && lru_len > BUF_LRU_MIN_LEN;
-	     ++scanned,
-	     bpage = buf_pool->lru_hp.get()) {
-
-		buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
-		buf_pool->lru_hp.set(prev);
-
-		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-
-		mutex_enter(block_mutex);
-
-		if (buf_flush_ready_for_replace(bpage)) {
-			/* block is ready for eviction i.e., it is
-			clean and is not IO-fixed or buffer fixed. */
-			mutex_exit(block_mutex);
-			if (buf_LRU_free_page(bpage, true)) {
-				++n->evicted;
-			}
-		} else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
-			/* Block is ready for flush. Dispatch an IO
-			request. The IO helper thread will put it on
-			free list in IO completion routine. */
-			mutex_exit(block_mutex);
-			buf_flush_page_and_try_neighbors(
-				bpage, BUF_FLUSH_LRU, max, &n->flushed);
-		} else {
-			/* Can't evict or dispatch this block. Go to
-			previous. */
-			ut_ad(buf_pool->lru_hp.is_hp(prev));
-			mutex_exit(block_mutex);
-		}
-
-		ut_ad(!mutex_own(block_mutex));
-		ut_ad(buf_pool_mutex_own(buf_pool));
-
-		free_len = UT_LIST_GET_LEN(buf_pool->free);
-		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
-	}
-
-	buf_pool->lru_hp.set(NULL);
-
-	/* We keep track of all flushes happening as part of LRU
-	flush. When estimating the desired rate at which flush_list
-	should be flushed, we factor in this value. */
-	buf_lru_flush_page_count += n->flushed;
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	if (n->evicted) {
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
-			MONITOR_LRU_BATCH_EVICT_COUNT,
-			MONITOR_LRU_BATCH_EVICT_PAGES,
-			n->evicted);
-	}
-
-	if (scanned) {
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_LRU_BATCH_SCANNED,
-			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
-			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
-			scanned);
-	}
+  if (fil_space_t *space= fil_space_t::get(id))
+    return {space, buf_flush_freed_pages(space, true)};
+  return {nullptr, 0};
 }
 
-/*******************************************************************//**
-Flush and move pages from LRU or unzip_LRU list to the free list.
-Whether LRU or unzip_LRU is used depends on the state of the system.*/
+struct flush_counters_t /* counters updated by buf_flush_LRU_list_batch() */
+{
+  /** number of dirty pages flushed */
+  ulint flushed;
+  /** number of clean pages evicted */
+  ulint evicted;
+};
 
-static
-void
-buf_do_LRU_batch(
-/*=============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		max,		/*!< in: desired number of
-					blocks in the free_list */
-	flush_counters_t*	n)	/*!< out: flushed/evicted page
-					counts */
+/** Try to discard a dirty page.
+@param bpage      dirty page whose tablespace is not accessible */
+static void buf_flush_discard_page(buf_page_t *bpage)
 {
-	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
-		n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
-	} else {
-		n->unzip_LRU_evicted = 0;
-	}
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+  ut_ad(bpage->in_file());
+  ut_ad(bpage->oldest_modification());
+
+  rw_lock_t *rw_lock;
+
+  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+    rw_lock= nullptr;
+  else
+  {
+    rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+    if (!rw_lock_sx_lock_nowait(rw_lock, 0))
+      return;
+  }
+
+  bpage->status= buf_page_t::NORMAL;
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_pool.delete_from_flush_list(bpage);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (rw_lock)
+    rw_lock_sx_unlock(rw_lock);
+
+  buf_LRU_free_page(bpage, true);
+}
 
-	if (max > n->unzip_LRU_evicted) {
-		buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n);
-	} else {
-		n->evicted = 0;
-		n->flushed = 0;
-	}
+/** Flush dirty blocks from the end of the LRU list, and evict clean ones.
+@param max   maximum number of blocks to make available in buf_pool.free
+@param n     in/out: counts of flushed and evicted pages (incremented) */
+static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
+{
+  ulint scanned= 0;
+  ulint free_limit= srv_LRU_scan_depth;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
+    free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
+
+  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+    ? 0 : srv_flush_neighbors;
+  fil_space_t *space= nullptr;
+  uint32_t last_space_id= FIL_NULL; /* id last passed to buf_flush_space() */
+  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
+       bpage &&
+       ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
+         UT_LIST_GET_LEN(buf_pool.free) < free_limit &&
+         n->flushed + n->evicted < max) ||
+        recv_recovery_is_on()); ++scanned)
+  {
+  retry:
+    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+    const lsn_t oldest_modification= bpage->oldest_modification();
+    buf_pool.lru_hp.set(prev);
+
+    if (oldest_modification <= 1 && bpage->can_relocate())
+    {
+      /* block is ready for eviction i.e., it is clean and is not
+      IO-fixed or buffer fixed. */
+      if (buf_LRU_free_page(bpage, true))
+        ++n->evicted;
+    }
+    else if (oldest_modification > 1 && bpage->ready_for_flush())
+    {
+      /* Block is ready for flush. Dispatch an IO request. The IO
+      helper thread will put it on free list in IO completion routine. */
+      const page_id_t page_id(bpage->id());
+      const uint32_t space_id= page_id.space();
+      if (!space || space->id != space_id)
+      {
+        if (last_space_id != space_id)
+        {
+          buf_pool.lru_hp.set(bpage);
+          mysql_mutex_unlock(&buf_pool.mutex);
+          if (space)
+            space->release();
+          auto p= buf_flush_space(space_id);
+          space= p.first;
+          last_space_id= space_id;
+          mysql_mutex_lock(&buf_pool.mutex);
+          if (p.second)
+            buf_pool.stat.n_pages_written+= p.second;
+          if (!(bpage= buf_pool.lru_hp.get())) break; /* nothing left to scan */
+          goto retry;
+        }
+        else
+          ut_ad(!space);
+      }
+      else if (space->is_stopping())
+      {
+        space->release();
+        space= nullptr;
+      }
+
+      if (!space)
+        buf_flush_discard_page(bpage);
+      else if (neighbors && space->is_rotational())
+      {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+                                             true, n->flushed, max);
+reacquire_mutex:
+        mysql_mutex_lock(&buf_pool.mutex);
+      }
+      else if (buf_flush_page(bpage, true, space))
+      {
+        ++n->flushed;
+        goto reacquire_mutex;
+      }
+    }
+    else
+      /* Can't evict or dispatch this block. Go to previous. */
+      ut_ad(buf_pool.lru_hp.is_hp(prev));
+    bpage= buf_pool.lru_hp.get();
+  }
+
+  buf_pool.lru_hp.set(nullptr);
+
+  if (space)
+    space->release();
+
+  /* We keep track of all flushes happening as part of LRU flush. When
+  estimating the desired rate at which flush_list should be flushed,
+  we factor in this value. */
+  buf_lru_flush_page_count+= n->flushed;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  if (scanned) /* NOTE(review): braced in case the macro is multi-statement */
+  {
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
+                                 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+                                 MONITOR_LRU_BATCH_SCANNED_PER_CALL, scanned);
+  }
+}
 
-	/* Add evicted pages from unzip_LRU to the evicted pages from
-	the simple LRU. */
-	n->evicted += n->unzip_LRU_evicted;
+/** Free blocks from unzip_LRU when buf_LRU_evict_from_unzip_LRU() holds, then
+always run buf_flush_LRU_list_batch() to evict or flush pages from the LRU.
+@param max   maximum number of blocks to make available in buf_pool.free
+@return number of flushed pages (n.flushed); evictions are not included */
+static ulint buf_do_LRU_batch(ulint max)
+{
+  const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU()
+    ? buf_free_from_unzip_LRU_list_batch(max)
+    : 0;
+  flush_counters_t n;
+  n.flushed= 0;
+  n.evicted= n_unzip_LRU_evicted; /* seed: counts toward max in the LRU batch */
+  buf_flush_LRU_list_batch(max, &n);
+
+  if (const ulint evicted= n.evicted - n_unzip_LRU_evicted) /* LRU part only */
+  {
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+                                 MONITOR_LRU_BATCH_EVICT_COUNT,
+                                 MONITOR_LRU_BATCH_EVICT_PAGES,
+                                 evicted);
+  }
+
+  return n.flushed;
 }
 
 /** This utility flushes dirty blocks from the end of the flush_list.
 The calling thread is not allowed to own any latches on pages!
-@param[in]	buf_pool	buffer pool instance
-@param[in]	min_n		wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in]	lsn_limit	all blocks whose oldest_modification is smaller
-than this should be flushed (if their number does not exceed min_n)
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already
-running */
-static
-ulint
-buf_do_flush_list_batch(
-	buf_pool_t*		buf_pool,
-	ulint			min_n,
-	lsn_t			lsn_limit)
+@param max_n    maximum number of blocks to flush
+@param lsn      once an oldest_modification>=lsn is found, terminate the batch
+@return number of blocks for which the write request was queued */
+static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
 {
-	ulint		count = 0;
-	ulint		scanned = 0;
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	/* Start from the end of the list looking for a suitable
-	block to be flushed. */
-	buf_flush_list_mutex_enter(buf_pool);
-	ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
-
-	/* In order not to degenerate this scan to O(n*n) we attempt
-	to preserve pointer of previous block in the flush list. To do
-	so we declare it a hazard pointer. Any thread working on the
-	flush list must check the hazard pointer and if it is removing
-	the same block then it must reset it. */
-	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-	     count < min_n && bpage != NULL && len > 0
-	     && bpage->oldest_modification < lsn_limit;
-	     bpage = buf_pool->flush_hp.get(),
-	     ++scanned) {
-
-		buf_page_t*	prev;
-
-		ut_a(bpage->oldest_modification > 0);
-		ut_ad(bpage->in_flush_list);
-
-		prev = UT_LIST_GET_PREV(list, bpage);
-		buf_pool->flush_hp.set(prev);
-		buf_flush_list_mutex_exit(buf_pool);
-
-#ifdef UNIV_DEBUG
-		bool flushed =
-#endif /* UNIV_DEBUG */
-		buf_flush_page_and_try_neighbors(
-			bpage, BUF_FLUSH_LIST, min_n, &count);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
-
-		--len;
-	}
+  ulint count= 0;
+  ulint scanned= 0;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+    ? 0 : srv_flush_neighbors;
+  fil_space_t *space= nullptr;
+  uint32_t last_space_id= FIL_NULL;
+  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+  /* Start from the end of the list looking for a suitable block to be
+  flushed. */
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+       bpage && len && count < max_n; ++scanned, len--)
+  {
+    const lsn_t oldest_modification= bpage->oldest_modification();
+    if (oldest_modification >= lsn)
+      break;
+    ut_ad(bpage->in_file());
+
+    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+    if (oldest_modification == 1)
+    {
+      buf_pool.delete_from_flush_list(bpage);
+    skip:
+      bpage= prev;
+      continue;
+    }
+
+    ut_ad(oldest_modification > 2);
+    ut_ad(bpage->in_file());
+
+    if (!bpage->ready_for_flush())
+      goto skip;
+
+    /* In order not to degenerate this scan to O(n*n) we attempt to
+    preserve the pointer position. Any thread that would remove 'prev'
+    from buf_pool.flush_list must adjust the hazard pointer.
+
+    Note: A concurrent execution of buf_flush_list_space() may
+    terminate this scan prematurely. The buf_pool.n_flush_list()
+    should prevent multiple threads from executing
+    buf_do_flush_list_batch() concurrently,
+    but buf_flush_list_space() is ignoring that. */
+    buf_pool.flush_hp.set(prev);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    const page_id_t page_id(bpage->id());
+    const uint32_t space_id= page_id.space();
+    if (!space || space->id != space_id)
+    {
+      if (last_space_id != space_id)
+      {
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        buf_pool.flush_hp.set(bpage);
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+        mysql_mutex_unlock(&buf_pool.mutex);
+        if (space)
+          space->release();
+        auto p= buf_flush_space(space_id);
+        space= p.first;
+        last_space_id= space_id;
+        mysql_mutex_lock(&buf_pool.mutex);
+        if (p.second)
+          buf_pool.stat.n_pages_written+= p.second;
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        bpage= buf_pool.flush_hp.get();
+        if (!bpage)
+          break;
+        if (bpage->id() != page_id)
+          continue;
+        buf_pool.flush_hp.set(UT_LIST_GET_PREV(list, bpage));
+        if (bpage->oldest_modification() <= 1 || !bpage->ready_for_flush())
+          goto next;
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+      }
+      else
+        ut_ad(!space);
+    }
+    else if (space->is_stopping())
+    {
+      space->release();
+      space= nullptr;
+    }
+
+    if (!space)
+      buf_flush_discard_page(bpage);
+    else if (neighbors && space->is_rotational())
+    {
+      mysql_mutex_unlock(&buf_pool.mutex);
+      count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+                                      false, count, max_n);
+    reacquire_mutex:
+      mysql_mutex_lock(&buf_pool.mutex);
+    }
+    else if (buf_flush_page(bpage, false, space))
+    {
+      ++count;
+      goto reacquire_mutex;
+    }
+
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  next:
+    bpage= buf_pool.flush_hp.get();
+  }
+
+  buf_pool.flush_hp.set(nullptr);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (space)
+    space->release();
+
+  if (scanned)
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+                                 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+                                 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+                                 scanned);
+  if (count)
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+                                 MONITOR_FLUSH_BATCH_COUNT,
+                                 MONITOR_FLUSH_BATCH_PAGES,
+                                 count);
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  return count;
+}
 
-	buf_pool->flush_hp.set(NULL);
-	buf_flush_list_mutex_exit(buf_pool);
+/** Wait until a flush batch ends (NOTE: presumably buf_pool.mutex is held).
+@param lru    true=buf_pool.LRU; false=buf_pool.flush_list */
+void buf_flush_wait_batch_end(bool lru)
+{
+  const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;
+
+  if (n_flush) /* a batch of the requested kind is in progress */
+  {
+    auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
+    tpool::tpool_wait_begin();
+    thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+    do
+      my_cond_wait(cond, &buf_pool.mutex.m_mutex);
+    while (n_flush); /* n_flush is a reference: the live counter is re-read */
+    tpool::tpool_wait_end();
+    thd_wait_end(nullptr);
+    pthread_cond_broadcast(cond); /* presumably to wake any other waiters */
+  }
+}
 
-	if (scanned) {
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_FLUSH_BATCH_SCANNED,
-			MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
-			MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
-			scanned);
-	}
+/** Write out dirty blocks from buf_pool.flush_list.
+@param max_n    wished maximum number of blocks flushed
+@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX)
+{
+  ut_ad(lsn);
 
-	if (count) {
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_FLUSH_BATCH_TOTAL_PAGE,
-			MONITOR_FLUSH_BATCH_COUNT,
-			MONITOR_FLUSH_BATCH_PAGES,
-			count);
-	}
+  if (buf_pool.n_flush_list())
+    return 0;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+  mysql_mutex_lock(&buf_pool.mutex);
+  const bool running= buf_pool.n_flush_list_ != 0;
+  /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
+  while not holding buf_pool.flush_list_mutex */
+  if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
+  {
+    if (!running)
+      pthread_cond_broadcast(&buf_pool.done_flush_list);
+    mysql_mutex_unlock(&buf_pool.mutex);
+    return 0;
+  }
 
-	return(count);
-}
+  buf_pool.n_flush_list_++;
+  const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
+  const ulint n_flushing= --buf_pool.n_flush_list_;
 
-/** This utility flushes dirty blocks from the end of the LRU list or
-flush_list.
-NOTE 1: in the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it cannot
-end up waiting for these latches! NOTE 2: in the case of a flush list flush,
-the calling thread is not allowed to own any latches on pages!
-@param[in]	buf_pool	buffer pool instance
-@param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
-BUF_FLUSH_LIST, then the caller must not own any latches on pages
-@param[in]	min_n		wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in]	lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored */
-static
-void
-buf_flush_batch(
-	buf_pool_t*		buf_pool,
-	buf_flush_t		flush_type,
-	ulint			min_n,
-	lsn_t			lsn_limit,
-	flush_counters_t*	n)	/*!< out: flushed/evicted page
-					counts  */
-{
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-	ut_ad(flush_type == BUF_FLUSH_LRU
-	      || !sync_check_iterate(dict_sync_check()));
+  buf_pool.try_LRU_scan= true;
 
-	buf_pool_mutex_enter(buf_pool);
+  mysql_mutex_unlock(&buf_pool.mutex);
 
-	/* Note: The buffer pool mutex is released and reacquired within
-	the flush functions. */
-	switch (flush_type) {
-	case BUF_FLUSH_LRU:
-		buf_do_LRU_batch(buf_pool, min_n, n);
-		break;
-	case BUF_FLUSH_LIST:
-		n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
-		n->evicted = 0;
-		break;
-	default:
-		ut_error;
-	}
+  if (!n_flushing)
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
 
-	buf_pool_mutex_exit(buf_pool);
+  buf_dblwr.flush_buffered_writes();
 
-	DBUG_LOG("ib_buf", "flush " << flush_type << " completed");
+  DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
+  return n_flushed;
 }
 
-/******************************************************************//**
-Gather the aggregated stats for both flush list and LRU list flushing.
-@param page_count_flush	number of pages flushed from the end of the flush_list
-@param page_count_LRU	number of pages flushed from the end of the LRU list
-*/
-static
-void
-buf_flush_stats(
-/*============*/
-	ulint		page_count_flush,
-	ulint		page_count_LRU)
+/** Try to flush all the dirty pages that belong to a given tablespace.
+@param space       tablespace
+@param n_flushed   number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
 {
-	DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
-			      "from LRU_list %u pages",
-			      unsigned(page_count_flush),
-			      unsigned(page_count_LRU)));
-
-	srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
+  const auto space_id= space->id;
+  ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
+
+  bool may_have_skipped= false;
+  ulint max_n_flush= srv_io_capacity;
+
+  bool acquired= space->acquire();
+  {
+    const uint32_t written{buf_flush_freed_pages(space, acquired)};
+    mysql_mutex_lock(&buf_pool.mutex);
+    if (written)
+      buf_pool.stat.n_pages_written+= written;
+  }
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+  {
+    ut_d(const auto s= bpage->state());
+    ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+          s == BUF_BLOCK_REMOVE_HASH);
+    ut_ad(bpage->oldest_modification());
+    ut_ad(bpage->in_file());
+
+    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+    if (bpage->id().space() != space_id);
+    else if (bpage->oldest_modification() == 1)
+      buf_pool.delete_from_flush_list(bpage);
+    else if (!bpage->ready_for_flush())
+      may_have_skipped= true;
+    else
+    {
+      /* In order not to degenerate this scan to O(n*n) we attempt to
+      preserve the pointer position. Any thread that would remove 'prev'
+      from buf_pool.flush_list must adjust the hazard pointer.
+
+      Note: Multiple executions of buf_flush_list_space() may be
+      interleaved, and also buf_do_flush_list_batch() may be running
+      concurrently. This may terminate our iteration prematurely,
+      leading us to return may_have_skipped=true. */
+      buf_pool.flush_hp.set(prev);
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+      if (!acquired)
+      {
+      was_freed:
+        buf_flush_discard_page(bpage);
+      }
+      else
+      {
+        if (space->is_stopping())
+        {
+          space->release();
+          acquired= false;
+          goto was_freed;
+        }
+        if (!buf_flush_page(bpage, false, space))
+        {
+          may_have_skipped= true;
+          mysql_mutex_lock(&buf_pool.flush_list_mutex);
+          goto next_after_skip;
+        }
+        if (n_flushed)
+          ++*n_flushed;
+        if (!--max_n_flush)
+        {
+          mysql_mutex_lock(&buf_pool.mutex);
+          mysql_mutex_lock(&buf_pool.flush_list_mutex);
+          may_have_skipped= true;
+          break;
+        }
+        mysql_mutex_lock(&buf_pool.mutex);
+      }
+
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      if (!buf_pool.flush_hp.is_hp(prev))
+        may_have_skipped= true;
+    next_after_skip:
+      bpage= buf_pool.flush_hp.get();
+      continue;
+    }
+
+    bpage= prev;
+  }
+
+  /* Note: this loop may have been executed concurrently with
+  buf_do_flush_list_batch() as well as other threads executing
+  buf_flush_list_space(). We should always return true from
+  buf_flush_list_space() if that should be the case; in
+  buf_do_flush_list_batch() we will simply perform less work. */
+
+  buf_pool.flush_hp.set(nullptr);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  buf_pool.try_LRU_scan= true;
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (acquired)
+    space->release();
+
+  if (space->purpose == FIL_TYPE_IMPORT)
+    os_aio_wait_until_no_pending_writes();
+  else
+    buf_dblwr.flush_buffered_writes();
+
+  return may_have_skipped;
 }
 
-/******************************************************************//**
-Start a buffer flush batch for LRU or flush list */
-static
-ibool
-buf_flush_start(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
+/** Write out dirty blocks from buf_pool.LRU.
+@param max_n    wished maximum number of blocks flushed
+@return the number of processed pages
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n)
 {
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+  if (buf_pool.n_flush_LRU())
+    return 0;
 
-	buf_pool_mutex_enter(buf_pool);
+  log_buffer_flush_to_disk(true);
 
-	if (buf_pool->n_flush[flush_type] > 0
-	   || buf_pool->init_flush[flush_type] == TRUE) {
+  mysql_mutex_lock(&buf_pool.mutex);
+  if (buf_pool.n_flush_LRU_)
+  {
+    mysql_mutex_unlock(&buf_pool.mutex);
+    return 0;
+  }
+  buf_pool.n_flush_LRU_++;
 
-		/* There is already a flush batch of the same type running */
+  ulint n_flushed= buf_do_LRU_batch(max_n);
 
-		buf_pool_mutex_exit(buf_pool);
+  const ulint n_flushing= --buf_pool.n_flush_LRU_;
 
-		return(FALSE);
-	}
+  buf_pool.try_LRU_scan= true;
 
-	buf_pool->init_flush[flush_type] = TRUE;
+  mysql_mutex_unlock(&buf_pool.mutex);
 
-	os_event_reset(buf_pool->no_flush[flush_type]);
+  if (!n_flushing)
+  {
+    pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+    pthread_cond_signal(&buf_pool.done_free);
+  }
 
-	buf_pool_mutex_exit(buf_pool);
+  buf_dblwr.flush_buffered_writes();
 
-	return(TRUE);
+  DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
+  return n_flushed;
 }
 
-/******************************************************************//**
-End a buffer flush batch for LRU or flush list */
-static
-void
-buf_flush_end(
-/*==========*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
+/** Initiate a log checkpoint, discarding the start of the log.
+@param oldest_lsn   the checkpoint LSN
+@param end_lsn      log_sys.get_lsn()
+@return true if success, false if a checkpoint write was already running */
+static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
 {
-	buf_pool_mutex_enter(buf_pool);
-
-	buf_pool->init_flush[flush_type] = FALSE;
-
-	buf_pool->try_LRU_scan = TRUE;
-
-	if (buf_pool->n_flush[flush_type] == 0) {
-
-		/* The running flush batch has ended */
-
-		os_event_set(buf_pool->no_flush[flush_type]);
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-
-	if (!srv_read_only_mode) {
-		buf_dblwr_flush_buffered_writes();
-	} else {
-		os_aio_simulated_wake_handler_threads();
-	}
+  ut_ad(!srv_read_only_mode);
+  mysql_mutex_assert_owner(&log_sys.mutex);
+  ut_ad(oldest_lsn <= end_lsn);
+  ut_ad(end_lsn == log_sys.get_lsn());
+
+  ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+  const lsn_t age= oldest_lsn - log_sys.last_checkpoint_lsn;
+
+  if (age > SIZE_OF_FILE_CHECKPOINT + log_sys.framing_size())
+    /* Some log has been written since the previous checkpoint. */;
+  else if (age > SIZE_OF_FILE_CHECKPOINT &&
+           !((log_sys.log.calc_lsn_offset(oldest_lsn) ^
+              log_sys.log.calc_lsn_offset(log_sys.last_checkpoint_lsn)) &
+             ~lsn_t{OS_FILE_LOG_BLOCK_SIZE - 1}))
+    /* Some log has been written to the same log block. */;
+  else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+    /* MariaDB startup expects the redo log file to be logically empty
+    (not even containing a FILE_CHECKPOINT record) after a clean shutdown.
+    Perform an extra checkpoint at shutdown. */;
+  else
+  {
+    /* Do nothing, because nothing was logged (other than a
+    FILE_CHECKPOINT record) since the previous checkpoint. */
+    mysql_mutex_unlock(&log_sys.mutex);
+    return true;
+  }
+
+  ut_ad(!recv_no_log_write);
+
+  /* Repeat the FILE_MODIFY records after the checkpoint, in case some
+  log records between the checkpoint and log_sys.lsn need them.
+  Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
+  see a FILE_CHECKPOINT after the checkpoint, except on clean
+  shutdown, where the log will be empty after the checkpoint.
+
+  It is important that we write out the redo log before any further
+  dirty pages are flushed to the tablespace files.  At this point,
+  because we hold log_sys.mutex, mtr_t::commit() in other threads will
+  be blocked, and no pages can be added to the flush lists. */
+  lsn_t flush_lsn= oldest_lsn;
+
+  if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn ||
+                      srv_shutdown_state <= SRV_SHUTDOWN_INITIATED))
+  {
+    flush_lsn= log_sys.get_lsn();
+    ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
+    mysql_mutex_unlock(&log_sys.mutex);
+    log_write_up_to(flush_lsn, true, true);
+    mysql_mutex_lock(&log_sys.mutex);
+    if (log_sys.last_checkpoint_lsn >= oldest_lsn)
+    {
+      mysql_mutex_unlock(&log_sys.mutex);
+      return true;
+    }
+  }
+  else
+    ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+
+  ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);
+
+  if (log_sys.checkpoint_pending)
+  {
+    /* A checkpoint write is running */
+    mysql_mutex_unlock(&log_sys.mutex);
+    return false;
+  }
+
+  log_sys.next_checkpoint_lsn= oldest_lsn;
+  log_write_checkpoint_info(end_lsn);
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+  return true;
 }
 
-/******************************************************************//**
-Waits until a flush batch of the given type ends */
-void
-buf_flush_wait_batch_end(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	buf_flush_t	type)		/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
+/** Make a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only determines the LSN of the oldest
+modification in the pool, and writes information about that LSN to the
+log file. Use log_make_checkpoint() to also flush the pool.
+@retval true if the checkpoint was or had been made
+@retval false if a checkpoint write was already running */
+static bool log_checkpoint()
 {
-	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
-
-	if (buf_pool == NULL) {
-		ulint	i;
-
-		for (i = 0; i < srv_buf_pool_instances; ++i) {
-			buf_pool_t*	buf_pool;
-
-			buf_pool = buf_pool_from_array(i);
-
-			thd_wait_begin(NULL, THD_WAIT_DISKIO);
-			os_event_wait(buf_pool->no_flush[type]);
-			thd_wait_end(NULL);
-		}
-	} else {
-		thd_wait_begin(NULL, THD_WAIT_DISKIO);
-		os_event_wait(buf_pool->no_flush[type]);
-		thd_wait_end(NULL);
-	}
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  switch (srv_file_flush_method) {
+  case SRV_NOSYNC:
+  case SRV_O_DIRECT_NO_FSYNC:
+    break;
+  default:
+    fil_flush_file_spaces();
+  }
+
+  mysql_mutex_lock(&log_sys.mutex);
+  const lsn_t end_lsn= log_sys.get_lsn();
+  mysql_mutex_lock(&log_sys.flush_order_mutex);
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  mysql_mutex_unlock(&log_sys.flush_order_mutex);
+  return log_checkpoint_low(oldest_lsn, end_lsn);
 }
 
-/** Do flushing batch of a given type.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@param[in,out]	buf_pool	buffer pool instance
-@param[in]	type		flush type
-@param[in]	min_n		wished minimum mumber of blocks flushed
-(it is not guaranteed that the actual number is that big, though)
-@param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored
-@param[out]	n_processed	the number of pages which were processed is
-passed back to caller. Ignored if NULL
-@retval true	if a batch was queued successfully.
-@retval false	if another batch of same type was already running. */
-bool
-buf_flush_do_batch(
-	buf_pool_t*		buf_pool,
-	buf_flush_t		type,
-	ulint			min_n,
-	lsn_t			lsn_limit,
-	flush_counters_t*	n)
+/** Make a checkpoint. */
+ATTRIBUTE_COLD void log_make_checkpoint()
 {
-	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
-
-	if (n != NULL) {
-		n->flushed = 0;
-	}
-
-	if (!buf_flush_start(buf_pool, type)) {
-		return(false);
-	}
-
-	buf_flush_batch(buf_pool, type, min_n, lsn_limit, n);
-
-	buf_flush_end(buf_pool, type);
-
-	return(true);
+  buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
+  while (!log_checkpoint());
 }
-/**
-Waits until a flush batch of the given lsn ends
-@param[in]	new_oldest	target oldest_modified_lsn to wait for */
 
-void
-buf_flush_wait_flushed(
-	lsn_t		new_oldest)
+/** Wait for all dirty pages up to an LSN to be written out.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+static void buf_flush_wait(lsn_t lsn)
 {
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool;
-		lsn_t		oldest;
-
-		buf_pool = buf_pool_from_array(i);
-
-		for (;;) {
-			/* We don't need to wait for fsync of the flushed
-			blocks, because anyway we need fsync to make chekpoint.
-			So, we don't need to wait for the batch end here. */
-
-			buf_flush_list_mutex_enter(buf_pool);
-
-			buf_page_t*	bpage;
-
-			/* We don't need to wait for system temporary pages */
-			for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-			     bpage != NULL
-				&& fsp_is_system_temporary(bpage->id.space());
-			     bpage = UT_LIST_GET_PREV(list, bpage)) {
-				/* Do nothing. */
-			}
-
-			if (bpage != NULL) {
-				ut_ad(bpage->in_flush_list);
-				oldest = bpage->oldest_modification;
-			} else {
-				oldest = 0;
-			}
-
-			buf_flush_list_mutex_exit(buf_pool);
-
-			if (oldest == 0 || oldest >= new_oldest) {
-				break;
-			}
-
-			/* sleep and retry */
-			os_thread_sleep(buf_flush_wait_flushed_sleep_time);
-
-			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
-		}
-	}
+  ut_ad(lsn <= log_sys.get_lsn());
+
+  while (buf_pool.get_oldest_modification(lsn) < lsn)
+  {
+    if (buf_flush_sync_lsn < lsn)
+    {
+      buf_flush_sync_lsn= lsn;
+      buf_pool.page_cleaner_set_idle(false);
+      pthread_cond_signal(&buf_pool.do_flush_list);
+    }
+    my_cond_wait(&buf_pool.done_flush_list,
+                 &buf_pool.flush_list_mutex.m_mutex);
+  }
 }
 
-/** This utility flushes dirty blocks from the end of the flush list of all
-buffer pool instances.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@param[in]	min_n		wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored
-@param[out]	n_processed	the number of pages which were processed is
-passed back to caller. Ignored if NULL.
-@return true if a batch was queued successfully for each buffer pool
-instance. false if another batch of same type was already running in
-at least one of the buffer pool instance */
-bool
-buf_flush_lists(
-	ulint			min_n,
-	lsn_t			lsn_limit,
-	ulint*			n_processed)
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn   buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
 {
-	ulint		i;
-	ulint		n_flushed = 0;
-	bool		success = true;
-
-	if (n_processed) {
-		*n_processed = 0;
-	}
-
-	if (min_n != ULINT_MAX) {
-		/* Ensure that flushing is spread evenly amongst the
-		buffer pool instances. When min_n is ULINT_MAX
-		we need to flush everything up to the lsn limit
-		so no limit here. */
-		min_n = (min_n + srv_buf_pool_instances - 1)
-			 / srv_buf_pool_instances;
-	}
-
-	/* Flush to lsn_limit in all buffer pool instances */
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*		buf_pool;
-		flush_counters_t	n;
-
-		memset(&n, 0, sizeof(flush_counters_t));
-		buf_pool = buf_pool_from_array(i);
-
-		if (!buf_flush_do_batch(buf_pool,
-					BUF_FLUSH_LIST,
-					min_n,
-					lsn_limit,
-					&n)) {
-			/* We have two choices here. If lsn_limit was
-			specified then skipping an instance of buffer
-			pool means we cannot guarantee that all pages
-			up to lsn_limit has been flushed. We can
-			return right now with failure or we can try
-			to flush remaining buffer pools up to the
-			lsn_limit. We attempt to flush other buffer
-			pools based on the assumption that it will
-			help in the retry which will follow the
-			failure. */
-			success = false;
-
-		}
-
-		n_flushed += n.flushed;
-	}
-
-	if (n_flushed) {
-		buf_flush_stats(n_flushed, 0);
-		if (n_processed) {
-			*n_processed = n_flushed;
-		}
-	}
-
-	return(success);
+  ut_ad(sync_lsn);
+  ut_ad(sync_lsn < LSN_MAX);
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+  ut_ad(!srv_read_only_mode);
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
+  {
+    MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
+    if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
+    {
+      do
+      {
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+        ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
+        buf_flush_wait_batch_end_acquiring_mutex(false);
+        if (n_pages)
+        {
+          MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+                                       MONITOR_FLUSH_SYNC_COUNT,
+                                       MONITOR_FLUSH_SYNC_PAGES, n_pages);
+        }
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      }
+      while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+    }
+    else
+#endif
+    {
+      thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+      tpool::tpool_wait_begin();
+      buf_flush_wait(sync_lsn);
+      tpool::tpool_wait_end();
+      thd_wait_end(nullptr);
+    }
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
+  {
+    /* If the buffer pool was clean, no log write was guaranteed
+    to happen until now. There could be an outstanding FILE_CHECKPOINT
+    record from a previous fil_names_clear() call, which we must
+    write out before we can advance the checkpoint. */
+    if (sync_lsn > log_sys.get_flushed_lsn())
+      log_write_up_to(sync_lsn, true);
+    log_checkpoint();
+  }
 }
 
-/******************************************************************//**
-This function picks up a single page from the tail of the LRU
-list, flushes it (if it is dirty), removes it from page_hash and LRU
-list and puts it on the free list. It is called from user threads when
-they are unable to find a replaceable page at the tail of the LRU
-list i.e.: when the background LRU flushing in the page_cleaner thread
-is not fast enough to keep pace with the workload.
-@return true if success. */
-bool
-buf_flush_single_page_from_LRU(
-/*===========================*/
-	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+/** Initiate more eager page flushing if the log checkpoint is too old.
+@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious  true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
 {
-	ulint		scanned;
-	buf_page_t*	bpage;
-	ibool		freed;
-
-	buf_pool_mutex_enter(buf_pool);
-
-	for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
-	     freed = false;
-	     bpage != NULL;
-	     ++scanned, bpage = buf_pool->single_scan_itr.get()) {
-
-		ut_ad(buf_pool_mutex_own(buf_pool));
-
-		buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
-		buf_pool->single_scan_itr.set(prev);
-		BPageMutex*	block_mutex;
-
-		block_mutex = buf_page_get_mutex(bpage);
-
-		mutex_enter(block_mutex);
-
-		if (buf_flush_ready_for_replace(bpage)) {
-			/* block is ready for eviction i.e., it is
-			clean and is not IO-fixed or buffer fixed. */
-			mutex_exit(block_mutex);
-
-			if (buf_LRU_free_page(bpage, true)) {
-				buf_pool_mutex_exit(buf_pool);
-				freed = true;
-				break;
-			}
-
-		} else if (buf_flush_ready_for_flush(
-				   bpage, BUF_FLUSH_SINGLE_PAGE)) {
-
-			/* Block is ready for flush. Try and dispatch an IO
-			request. We'll put it on free list in IO completion
-			routine if it is not buffer fixed. The following call
-			will release the buffer pool and block mutex.
-
-			Note: There is no guarantee that this page has actually
-			been freed, only that it has been flushed to disk */
-
-			freed = buf_flush_page(
-				buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
-
-			if (freed) {
-				break;
-			}
-
-			mutex_exit(block_mutex);
-		} else {
-			mutex_exit(block_mutex);
-		}
-		ut_ad(!mutex_own(block_mutex));
-	}
-	if (!freed) {
-		/* Can't find a single flushable page. */
-		ut_ad(!bpage);
-		buf_pool_mutex_exit(buf_pool);
-	}
-
-	if (scanned) {
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_LRU_SINGLE_FLUSH_SCANNED,
-			MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
-			MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
-			scanned);
-	}
-
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-	return(freed);
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+  ut_ad(!srv_read_only_mode);
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  Atomic_relaxed<lsn_t> &limit= furious
+    ? buf_flush_sync_lsn : buf_flush_async_lsn;
+
+  if (limit < lsn)
+  {
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    if (limit < lsn)
+    {
+      limit= lsn;
+      buf_pool.page_cleaner_set_idle(false);
+      pthread_cond_signal(&buf_pool.do_flush_list);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  }
 }
 
-/**
-Clears up tail of the LRU list of a given buffer pool instance:
-* Put replaceable pages at the tail of LRU to the free list
-* Flush dirty pages at the tail of LRU to the disk
-The depth to which we scan each buffer pool is controlled by dynamic
-config parameter innodb_LRU_scan_depth.
-@param buf_pool buffer pool instance
-@return total pages flushed */
-static
-ulint
-buf_flush_LRU_list(
-	buf_pool_t*	buf_pool)
+/** Wait for pending flushes (LRU if lru, else flush_list) to complete. */
+void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
 {
-	ulint	scan_depth, withdraw_depth;
-	flush_counters_t	n;
-
-	memset(&n, 0, sizeof(flush_counters_t));
-
-	ut_ad(buf_pool);
-	/* srv_LRU_scan_depth can be arbitrarily large value.
-	We cap it with current LRU size. */
-	buf_pool_mutex_enter(buf_pool);
-	scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
-	if (buf_pool->curr_size < buf_pool->old_size
-	    && buf_pool->withdraw_target > 0) {
-		withdraw_depth = buf_pool->withdraw_target
-				 - UT_LIST_GET_LEN(buf_pool->withdraw);
-	} else {
-		withdraw_depth = 0;
-	}
-	buf_pool_mutex_exit(buf_pool);
-	if (withdraw_depth > srv_LRU_scan_depth) {
-		scan_depth = ut_min(withdraw_depth, scan_depth);
-	} else {
-		scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
-				    scan_depth);
-	}
-	/* Currently one of page_cleaners is the only thread
-	that can trigger an LRU flush at the same time.
-	So, it is not possible that a batch triggered during
-	last iteration is still running, */
-	buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
-			   0, &n);
-
-	return(n.flushed);
+  if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
+  {
+    mysql_mutex_lock(&buf_pool.mutex);
+    buf_flush_wait_batch_end(lru);
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
 }
 
-/*********************************************************************//**
-Wait for any possible LRU flushes that are in progress to end. */
-void
-buf_flush_wait_LRU_batch_end(void)
-/*==============================*/
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn   minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
 {
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-
-		if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
-		   || buf_pool->init_flush[BUF_FLUSH_LRU]) {
-
-			buf_pool_mutex_exit(buf_pool);
-			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
-		} else {
-			buf_pool_mutex_exit(buf_pool);
-		}
-	}
+  ut_ad(!srv_read_only_mode);
+
+  for (;;)
+  {
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+    {
+      MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+                                   MONITOR_FLUSH_SYNC_COUNT,
+                                   MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+    }
+
+    switch (srv_file_flush_method) {
+    case SRV_NOSYNC:
+    case SRV_O_DIRECT_NO_FSYNC:
+      break;
+    default:
+      fil_flush_file_spaces();
+    }
+
+    mysql_mutex_lock(&log_sys.mutex);
+    const lsn_t newest_lsn= log_sys.get_lsn();
+    mysql_mutex_lock(&log_sys.flush_order_mutex);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    lsn_t measure= buf_pool.get_oldest_modification(0);
+    mysql_mutex_unlock(&log_sys.flush_order_mutex);
+    const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+    if (!recv_recovery_is_on() &&
+        checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+    {
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+      log_checkpoint_low(checkpoint_lsn, newest_lsn);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      measure= buf_pool.get_oldest_modification(LSN_MAX);
+    }
+    else
+    {
+      mysql_mutex_unlock(&log_sys.mutex);
+      if (!measure)
+        measure= LSN_MAX;
+    }
+
+    mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+    /* After attempting log checkpoint, check if we have reached our target. */
+    const lsn_t target= buf_flush_sync_lsn;
+
+    if (measure >= target)
+      buf_flush_sync_lsn= 0;
+    else if (measure >= buf_flush_async_lsn)
+      buf_flush_async_lsn= 0;
+
+    /* wake up buf_flush_wait() */
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
+
+    lsn= std::max(lsn, target);
+
+    if (measure >= lsn)
+      return;
+  }
 }
 
-/*********************************************************************//**
-Calculates if flushing is required based on number of dirty pages in
-the buffer pool.
-@return percent of io_capacity to flush to manage dirty page ratio */
-static
-ulint
-af_get_pct_for_dirty()
-/*==================*/
+/** Check if adaptive flushing is recommended based on
+the redo log capacity filled threshold.
+@param oldest_lsn     buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
 {
-	double	dirty_pct = buf_get_modified_ratio_pct();
-
-	if (dirty_pct == 0.0) {
-		/* No pages modified */
-		return(0);
-	}
-
-	ut_a(srv_max_dirty_pages_pct_lwm
-	     <= srv_max_buf_pool_modified_pct);
-
-	if (srv_max_dirty_pages_pct_lwm == 0) {
-		/* The user has not set the option to preflush dirty
-		pages as we approach the high water mark. */
-		if (dirty_pct >= srv_max_buf_pool_modified_pct) {
-			/* We have crossed the high water mark of dirty
-			pages In this case we start flushing at 100% of
-			innodb_io_capacity. */
-			return(100);
-		}
-	} else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
-		/* We should start flushing pages gradually. */
-		return(static_cast<ulint>((dirty_pct * 100)
-		       / (srv_max_buf_pool_modified_pct + 1)));
-	}
+  lsn_t age= (log_sys.get_lsn() - oldest_lsn);
+  lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
+    static_cast<double>(log_sys.log_capacity) / 100);
 
-	return(0);
+  /* if age > af_lwm adaptive flushing is recommended */
+  return (age > af_lwm);
 }
 
 /*********************************************************************//**
@@ -2321,95 +2086,75 @@ af_get_pct_for_lsn(
 /*===============*/
 	lsn_t	age)	/*!< in: current age of LSN. */
 {
-	lsn_t	max_async_age;
-	lsn_t	lsn_age_factor;
-	lsn_t	af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm
-			* log_get_capacity()) / 100);
+	lsn_t	af_lwm = static_cast<lsn_t>(
+		srv_adaptive_flushing_lwm
+		* static_cast<double>(log_sys.log_capacity) / 100);
 
 	if (age < af_lwm) {
 		/* No adaptive flushing. */
 		return(0);
 	}
 
-	max_async_age = log_get_max_modified_age_async();
-
-	if (age < max_async_age && !srv_adaptive_flushing) {
-		/* We have still not reached the max_async point and
-		the user has disabled adaptive flushing. */
-		return(0);
-	}
-
-	/* If we are here then we know that either:
-	1) User has enabled adaptive flushing
-	2) User may have disabled adaptive flushing but we have reached
-	max_async_age. */
-	lsn_age_factor = (age * 100) / max_async_age;
+	lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
 
 	ut_ad(srv_max_io_capacity >= srv_io_capacity);
-	return(static_cast<ulint>(
-		((srv_max_io_capacity / srv_io_capacity)
-		* (lsn_age_factor * sqrt((double)lsn_age_factor)))
-		/ 7.5));
+	return static_cast<ulint>(
+		(static_cast<double>(srv_max_io_capacity / srv_io_capacity
+				     * lsn_age_factor)
+		 * sqrt(static_cast<double>(lsn_age_factor))
+		 / 7.5));
 }
 
-/*********************************************************************//**
-This function is called approximately once every second by the
-page_cleaner thread. Based on various factors it decides if there is a
-need to do flushing.
+/** This function is called approximately once every second by the
+page_cleaner thread if innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing.
 @return number of pages recommended to be flushed
-@param lsn_limit	pointer to return LSN up to which flushing must happen
-@param last_pages_in	the number of pages flushed by the last flush_list
-			flushing. */
-static
-ulint
-page_cleaner_flush_pages_recommendation(
-/*====================================*/
-	lsn_t*	lsn_limit,
-	ulint	last_pages_in)
+@param last_pages_in  number of pages flushed in previous batch
+@param oldest_lsn     buf_pool.get_oldest_modification(0)
+@param dirty_blocks   UT_LIST_GET_LEN(buf_pool.flush_list)
+@param dirty_pct      100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+                                                     lsn_t oldest_lsn,
+                                                     ulint dirty_blocks,
+                                                     double dirty_pct)
 {
 	static	lsn_t		prev_lsn = 0;
 	static	ulint		sum_pages = 0;
 	static	ulint		avg_page_rate = 0;
 	static	ulint		n_iterations = 0;
 	static	time_t		prev_time;
-	lsn_t			oldest_lsn;
-	lsn_t			cur_lsn;
-	lsn_t			age;
 	lsn_t			lsn_rate;
 	ulint			n_pages = 0;
-	ulint			pct_for_dirty = 0;
-	ulint			pct_for_lsn = 0;
-	ulint			pct_total = 0;
-
-	cur_lsn = log_get_lsn_nowait();
 
-	/* log_get_lsn_nowait tries to get log_sys.mutex with
-	mutex_enter_nowait, if this does not succeed function
-	returns 0, do not use that value to update stats. */
-	if (cur_lsn == 0) {
-		return(0);
-	}
+	const lsn_t cur_lsn = log_sys.get_lsn();
+	ut_ad(oldest_lsn <= cur_lsn);
+	ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+	time_t curr_time = time(nullptr);
+	const double max_pct = srv_max_buf_pool_modified_pct;
 
-	if (prev_lsn == 0) {
-		/* First time around. */
+	if (!prev_lsn || !pct_for_lsn) {
+		prev_time = curr_time;
 		prev_lsn = cur_lsn;
-		prev_time = time(NULL);
-		return(0);
-	}
+		if (max_pct > 0.0) {
+			dirty_pct /= max_pct;
+		}
 
-	if (prev_lsn == cur_lsn) {
-		return(0);
+		n_pages = ulint(dirty_pct * double(srv_io_capacity));
+		if (n_pages < dirty_blocks) {
+			n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
+		}
+
+		return n_pages;
 	}
 
 	sum_pages += last_pages_in;
 
-	time_t	curr_time = time(NULL);
 	double	time_elapsed = difftime(curr_time, prev_time);
 
 	/* We update our variables every srv_flushing_avg_loops
 	iterations to smooth out transition in workload. */
 	if (++n_iterations >= srv_flushing_avg_loops
-	    || time_elapsed >= srv_flushing_avg_loops) {
+	    || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) {
 
 		if (time_elapsed < 1) {
 			time_elapsed = 1;
@@ -2418,7 +2163,7 @@ page_cleaner_flush_pages_recommendation(
 		avg_page_rate = static_cast<ulint>(
 			((static_cast<double>(sum_pages)
 			  / time_elapsed)
-			 + avg_page_rate) / 2);
+			 + static_cast<double>(avg_page_rate)) / 2);
 
 		/* How much LSN we have generated since last call. */
 		lsn_rate = static_cast<lsn_t>(
@@ -2427,81 +2172,18 @@ page_cleaner_flush_pages_recommendation(
 
 		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
 
-		/* aggregate stats of all slots */
-		mutex_enter(&page_cleaner.mutex);
-
 		ulint	flush_tm = page_cleaner.flush_time;
 		ulint	flush_pass = page_cleaner.flush_pass;
 
 		page_cleaner.flush_time = 0;
 		page_cleaner.flush_pass = 0;
 
-		ulint	lru_tm = 0;
-		ulint	list_tm = 0;
-		ulint	lru_pass = 0;
-		ulint	list_pass = 0;
-
-		for (ulint i = 0; i < page_cleaner.n_slots; i++) {
-			page_cleaner_slot_t*	slot;
-
-			slot = &page_cleaner.slots[i];
-
-			lru_tm    += slot->flush_lru_time;
-			lru_pass  += slot->flush_lru_pass;
-			list_tm   += slot->flush_list_time;
-			list_pass += slot->flush_list_pass;
-
-			slot->flush_lru_time  = 0;
-			slot->flush_lru_pass  = 0;
-			slot->flush_list_time = 0;
-			slot->flush_list_pass = 0;
-		}
-
-		mutex_exit(&page_cleaner.mutex);
-
-		/* minimum values are 1, to avoid dividing by zero. */
-		if (lru_tm < 1) {
-			lru_tm = 1;
-		}
-		if (list_tm < 1) {
-			list_tm = 1;
-		}
-		if (flush_tm < 1) {
-			flush_tm = 1;
+		if (flush_pass) {
+			flush_tm /= flush_pass;
 		}
 
-		if (lru_pass < 1) {
-			lru_pass = 1;
-		}
-		if (list_pass < 1) {
-			list_pass = 1;
-		}
-		if (flush_pass < 1) {
-			flush_pass = 1;
-		}
-
-		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
-			    list_tm / list_pass);
-		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
-			    lru_tm  / lru_pass);
-
-		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
-			    list_tm / (srv_n_page_cleaners * flush_pass));
-		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
-			    lru_tm / (srv_n_page_cleaners * flush_pass));
-		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
-			    flush_tm * list_tm / flush_pass
-			    / (list_tm + lru_tm));
-		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
-			    flush_tm * lru_tm / flush_pass
-			    / (list_tm + lru_tm));
-		MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
-
-		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
-			    list_pass / page_cleaner.n_slots);
-		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
-			    lru_pass / page_cleaner.n_slots);
-		MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
+		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
+		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
 
 		prev_lsn = cur_lsn;
 		prev_time = curr_time;
@@ -2511,1268 +2193,430 @@ page_cleaner_flush_pages_recommendation(
 		sum_pages = 0;
 	}
 
-	oldest_lsn = buf_pool_get_oldest_modification();
-
-	ut_ad(oldest_lsn <= log_get_lsn());
-
-	age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
-
-	pct_for_dirty = af_get_pct_for_dirty();
-	pct_for_lsn = af_get_pct_for_lsn(age);
-
-	pct_total = ut_max(pct_for_dirty, pct_for_lsn);
+	const ulint pct_for_dirty = srv_max_dirty_pages_pct_lwm == 0
+		? (dirty_pct >= max_pct ? 100 : 0)
+		: static_cast<ulint>
+		(max_pct > 0.0 ? dirty_pct / max_pct : dirty_pct);
+	ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);
 
 	/* Estimate pages to be flushed for the lsn progress */
-	ulint	sum_pages_for_lsn = 0;
 	lsn_t	target_lsn = oldest_lsn
-			     + lsn_avg_rate * buf_flush_lsn_scan_factor;
+		+ lsn_avg_rate * buf_flush_lsn_scan_factor;
+	ulint	pages_for_lsn = 0;
 
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
-		ulint		pages_for_lsn = 0;
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
 
-		buf_flush_list_mutex_enter(buf_pool);
-		for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
-		     b != NULL;
-		     b = UT_LIST_GET_PREV(list, b)) {
-			if (b->oldest_modification > target_lsn) {
-				break;
-			}
-			++pages_for_lsn;
+	for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
+	     b != NULL;
+	     b = UT_LIST_GET_PREV(list, b)) {
+		if (b->oldest_modification() > target_lsn) {
+			break;
+		}
+		if (++pages_for_lsn >= srv_max_io_capacity) {
+			break;
 		}
-		buf_flush_list_mutex_exit(buf_pool);
-
-		sum_pages_for_lsn += pages_for_lsn;
-
-		mutex_enter(&page_cleaner.mutex);
-		ut_ad(page_cleaner.slots[i].state
-		      == PAGE_CLEANER_STATE_NONE);
-		page_cleaner.slots[i].n_pages_requested
-			= pages_for_lsn / buf_flush_lsn_scan_factor + 1;
-		mutex_exit(&page_cleaner.mutex);
 	}
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 
-	sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
-	if(sum_pages_for_lsn < 1) {
-		sum_pages_for_lsn = 1;
+	pages_for_lsn /= buf_flush_lsn_scan_factor;
+	if (pages_for_lsn < 1) {
+		pages_for_lsn = 1;
 	}
 
-	/* Cap the maximum IO capacity that we are going to use by
-	max_io_capacity. Limit the value to avoid too quick increase */
-	ulint	pages_for_lsn =
-		std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
-
-	n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
+	n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
+		   + avg_page_rate + pages_for_lsn) / 3;
 
 	if (n_pages > srv_max_io_capacity) {
 		n_pages = srv_max_io_capacity;
 	}
 
-	/* Normalize request for each instance */
-	mutex_enter(&page_cleaner.mutex);
-	ut_ad(page_cleaner.n_slots_requested == 0);
-	ut_ad(page_cleaner.n_slots_flushing == 0);
-	ut_ad(page_cleaner.n_slots_finished == 0);
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		/* if REDO has enough of free space,
-		don't care about age distribution of pages */
-		page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ?
-			page_cleaner.slots[i].n_pages_requested
-			* n_pages / sum_pages_for_lsn + 1
-			: n_pages / srv_buf_pool_instances;
-	}
-	mutex_exit(&page_cleaner.mutex);
-
 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
 
-	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
+	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
 
 	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
 	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
 
-	*lsn_limit = LSN_MAX;
-
 	return(n_pages);
 }
 
-/*********************************************************************//**
-Puts the page_cleaner thread to sleep if it has finished work in less
-than a second
-@retval 0 wake up by event set,
-@retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
-@param next_loop_time	time when next loop iteration should start
-@param sig_count	zero or the value returned by previous call of
-			os_event_reset()
-@param cur_time		current time as in ut_time_ms() */
-static
-ulint
-pc_sleep_if_needed(
-/*===============*/
-	ulint		next_loop_time,
-	int64_t		sig_count,
-	ulint		cur_time)
-{
-	/* No sleep if we are cleaning the buffer pool during the shutdown
-	with everything else finished */
-	if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
-		return OS_SYNC_TIME_EXCEEDED;
-
-	if (next_loop_time > cur_time) {
-		/* Get sleep interval in micro seconds. We use
-		ut_min() to avoid long sleep in case of wrap around. */
-		ulint	sleep_us;
-
-		sleep_us = ut_min(static_cast<ulint>(1000000),
-				  (next_loop_time - cur_time) * 1000);
-
-		return(os_event_wait_time_low(buf_flush_event,
-					      sleep_us, sig_count));
-	}
-
-	return(OS_SYNC_TIME_EXCEEDED);
-}
-
-/******************************************************************//**
-Initialize page_cleaner. */
-void
-buf_flush_page_cleaner_init(void)
-/*=============================*/
-{
-	ut_ad(!page_cleaner.is_running);
-
-	mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex);
-
-	page_cleaner.is_requested = os_event_create("pc_is_requested");
-	page_cleaner.is_finished = os_event_create("pc_is_finished");
-	page_cleaner.is_started = os_event_create("pc_is_started");
-	page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances);
-
-	ut_d(page_cleaner.n_disabled_debug = 0);
-
-	page_cleaner.is_running = true;
-}
-
-/**
-Requests for all slots to flush all buffer pool instances.
-@param min_n	wished minimum mumber of blocks flushed
-		(it is not guaranteed that the actual number is that big)
-@param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
-		oldest_modification is smaller than this should be flushed
-		(if their number does not exceed min_n), otherwise ignored
-*/
-static
-void
-pc_request(
-	ulint		min_n,
-	lsn_t		lsn_limit)
-{
-	if (min_n != ULINT_MAX) {
-		/* Ensure that flushing is spread evenly amongst the
-		buffer pool instances. When min_n is ULINT_MAX
-		we need to flush everything up to the lsn limit
-		so no limit here. */
-		min_n = (min_n + srv_buf_pool_instances - 1)
-			/ srv_buf_pool_instances;
-	}
-
-	mutex_enter(&page_cleaner.mutex);
-
-	ut_ad(page_cleaner.n_slots_requested == 0);
-	ut_ad(page_cleaner.n_slots_flushing == 0);
-	ut_ad(page_cleaner.n_slots_finished == 0);
-
-	page_cleaner.requested = (min_n > 0);
-	page_cleaner.lsn_limit = lsn_limit;
-
-	for (ulint i = 0; i < page_cleaner.n_slots; i++) {
-		page_cleaner_slot_t* slot = &page_cleaner.slots[i];
-
-		ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
-
-		if (min_n == ULINT_MAX) {
-			slot->n_pages_requested = ULINT_MAX;
-		} else if (min_n == 0) {
-			slot->n_pages_requested = 0;
-		}
-
-		/* slot->n_pages_requested was already set by
-		page_cleaner_flush_pages_recommendation() */
-
-		slot->state = PAGE_CLEANER_STATE_REQUESTED;
-	}
-
-	page_cleaner.n_slots_requested = page_cleaner.n_slots;
-	page_cleaner.n_slots_flushing = 0;
-	page_cleaner.n_slots_finished = 0;
-
-	os_event_set(page_cleaner.is_requested);
-
-	mutex_exit(&page_cleaner.mutex);
-}
-
-/**
-Do flush for one slot.
-@return	the number of the slots which has not been treated yet. */
-static
-ulint
-pc_flush_slot(void)
-{
-	ulint	lru_tm = 0;
-	ulint	list_tm = 0;
-	ulint	lru_pass = 0;
-	ulint	list_pass = 0;
-
-	mutex_enter(&page_cleaner.mutex);
-
-	if (!page_cleaner.n_slots_requested) {
-		os_event_reset(page_cleaner.is_requested);
-	} else {
-		page_cleaner_slot_t*	slot = NULL;
-		ulint			i;
-
-		for (i = 0; i < page_cleaner.n_slots; i++) {
-			slot = &page_cleaner.slots[i];
-
-			if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
-				break;
-			}
-		}
-
-		/* slot should be found because
-		page_cleaner.n_slots_requested > 0 */
-		ut_a(i < page_cleaner.n_slots);
-
-		buf_pool_t* buf_pool = buf_pool_from_array(i);
-
-		page_cleaner.n_slots_requested--;
-		page_cleaner.n_slots_flushing++;
-		slot->state = PAGE_CLEANER_STATE_FLUSHING;
-
-		if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
-			slot->n_flushed_lru = 0;
-			slot->n_flushed_list = 0;
-			goto finish_mutex;
-		}
-
-		if (page_cleaner.n_slots_requested == 0) {
-			os_event_reset(page_cleaner.is_requested);
-		}
-
-		mutex_exit(&page_cleaner.mutex);
-
-		lru_tm = ut_time_ms();
-
-		/* Flush pages from end of LRU if required */
-		slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
-
-		lru_tm = ut_time_ms() - lru_tm;
-		lru_pass++;
-
-		if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
-			slot->n_flushed_list = 0;
-			goto finish;
-		}
-
-		/* Flush pages from flush_list if required */
-		if (page_cleaner.requested) {
-			flush_counters_t n;
-			memset(&n, 0, sizeof(flush_counters_t));
-			list_tm = ut_time_ms();
-
-			slot->succeeded_list = buf_flush_do_batch(
-				buf_pool, BUF_FLUSH_LIST,
-				slot->n_pages_requested,
-				page_cleaner.lsn_limit,
-				&n);
-
-			slot->n_flushed_list = n.flushed;
-
-			list_tm = ut_time_ms() - list_tm;
-			list_pass++;
-		} else {
-			slot->n_flushed_list = 0;
-			slot->succeeded_list = true;
-		}
-finish:
-		mutex_enter(&page_cleaner.mutex);
-finish_mutex:
-		page_cleaner.n_slots_flushing--;
-		page_cleaner.n_slots_finished++;
-		slot->state = PAGE_CLEANER_STATE_FINISHED;
-
-		slot->flush_lru_time += lru_tm;
-		slot->flush_list_time += list_tm;
-		slot->flush_lru_pass += lru_pass;
-		slot->flush_list_pass += list_pass;
-
-		if (page_cleaner.n_slots_requested == 0
-		    && page_cleaner.n_slots_flushing == 0) {
-			os_event_set(page_cleaner.is_finished);
-		}
-	}
-
-	ulint	ret = page_cleaner.n_slots_requested;
-
-	mutex_exit(&page_cleaner.mutex);
-
-	return(ret);
-}
-
-/**
-Wait until all flush requests are finished.
-@param n_flushed_lru	number of pages flushed from the end of the LRU list.
-@param n_flushed_list	number of pages flushed from the end of the
-			flush_list.
-@return			true if all flush_list flushing batch were success. */
-static
-bool
-pc_wait_finished(
-	ulint*	n_flushed_lru,
-	ulint*	n_flushed_list)
-{
-	bool	all_succeeded = true;
-
-	*n_flushed_lru = 0;
-	*n_flushed_list = 0;
-
-	os_event_wait(page_cleaner.is_finished);
-
-	mutex_enter(&page_cleaner.mutex);
-
-	ut_ad(page_cleaner.n_slots_requested == 0);
-	ut_ad(page_cleaner.n_slots_flushing == 0);
-	ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots);
-
-	for (ulint i = 0; i < page_cleaner.n_slots; i++) {
-		page_cleaner_slot_t* slot = &page_cleaner.slots[i];
-
-		ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
-
-		*n_flushed_lru += slot->n_flushed_lru;
-		*n_flushed_list += slot->n_flushed_list;
-		all_succeeded &= slot->succeeded_list;
-
-		slot->state = PAGE_CLEANER_STATE_NONE;
-
-		slot->n_pages_requested = 0;
-	}
-
-	page_cleaner.n_slots_finished = 0;
-
-	os_event_reset(page_cleaner.is_finished);
-
-	mutex_exit(&page_cleaner.mutex);
-
-	return(all_succeeded);
-}
-
-#ifdef UNIV_LINUX
-/**
-Set priority for page_cleaner threads.
-@param[in]	priority	priority intended to set
-@return	true if set as intended */
-static
-bool
-buf_flush_page_cleaner_set_priority(
-	int	priority)
-{
-	setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
-		    priority);
-	return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
-	       == priority);
-}
-#endif /* UNIV_LINUX */
-
-#ifdef UNIV_DEBUG
-/** Loop used to disable page cleaner threads. */
-static
-void
-buf_flush_page_cleaner_disabled_loop(void)
-{
-	if (!innodb_page_cleaner_disabled_debug) {
-		/* We return to avoid entering and exiting mutex. */
-		return;
-	}
-
-	mutex_enter(&page_cleaner.mutex);
-	page_cleaner.n_disabled_debug++;
-	mutex_exit(&page_cleaner.mutex);
-
-	while (innodb_page_cleaner_disabled_debug
-	       && srv_shutdown_state == SRV_SHUTDOWN_NONE
-	       && page_cleaner.is_running) {
-
-		os_thread_sleep(100000); /* [A] */
-	}
-
-	/* We need to wait for threads exiting here, otherwise we would
-	encounter problem when we quickly perform following steps:
-		1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
-		2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
-		3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
-	That's because after step 1 this thread could still be sleeping
-	inside the loop above at [A] and steps 2, 3 could happen before
-	this thread wakes up from [A]. In such case this thread would
-	not re-increment n_disabled_debug and we would be waiting for
-	him forever in buf_flush_page_cleaner_disabled_debug_update(...).
-
-	Therefore we are waiting in step 2 for this thread exiting here. */
-
-	mutex_enter(&page_cleaner.mutex);
-	page_cleaner.n_disabled_debug--;
-	mutex_exit(&page_cleaner.mutex);
-}
-
-/** Disables page cleaner threads (coordinator and workers).
-@param[in]	save		immediate result from check function */
-void buf_flush_page_cleaner_disabled_debug_update(THD*,
-						  st_mysql_sys_var*, void*,
-						  const void* save)
-{
-	if (!page_cleaner.is_running) {
-		return;
-	}
-
-	if (!*static_cast<const my_bool*>(save)) {
-		if (!innodb_page_cleaner_disabled_debug) {
-			return;
-		}
-
-		innodb_page_cleaner_disabled_debug = false;
-
-		/* Enable page cleaner threads. */
-		while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-			mutex_enter(&page_cleaner.mutex);
-			const ulint n = page_cleaner.n_disabled_debug;
-			mutex_exit(&page_cleaner.mutex);
-			/* Check if all threads have been enabled, to avoid
-			problem when we decide to re-disable them soon. */
-			if (n == 0) {
-				break;
-			}
-		}
-		return;
-	}
-
-	if (innodb_page_cleaner_disabled_debug) {
-		return;
-	}
-
-	innodb_page_cleaner_disabled_debug = true;
-
-	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-		/* Workers are possibly sleeping on is_requested.
-
-		We have to wake them, otherwise they could possibly
-		have never noticed, that they should be disabled,
-		and we would wait for them here forever.
-
-		That's why we have sleep-loop instead of simply
-		waiting on some disabled_debug_event. */
-		os_event_set(page_cleaner.is_requested);
-
-		mutex_enter(&page_cleaner.mutex);
-
-		ut_ad(page_cleaner.n_disabled_debug
-		      <= srv_n_page_cleaners);
-
-		if (page_cleaner.n_disabled_debug
-		    == srv_n_page_cleaners) {
-
-			mutex_exit(&page_cleaner.mutex);
-			break;
-		}
-
-		mutex_exit(&page_cleaner.mutex);
-
-		os_thread_sleep(100000);
-	}
-}
-#endif /* UNIV_DEBUG */
-
 /******************************************************************//**
 page_cleaner thread tasked with flushing dirty pages from the buffer
 pools. As of now we'll have only one coordinator.
 @return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*)
+static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
 {
-	my_thread_init();
+  my_thread_init();
 #ifdef UNIV_PFS_THREAD
-	pfs_register_thread(page_cleaner_thread_key);
+  pfs_register_thread(page_cleaner_thread_key);
 #endif /* UNIV_PFS_THREAD */
-	ut_ad(!srv_read_only_mode);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "page_cleaner thread running, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-#ifdef UNIV_LINUX
-	/* linux might be able to set different setting for each thread.
-	worth to try to set high priority for page cleaner threads */
-	if (buf_flush_page_cleaner_set_priority(
-		buf_flush_page_cleaner_priority)) {
-
-		ib::info() << "page_cleaner coordinator priority: "
-			<< buf_flush_page_cleaner_priority;
-	} else {
-		ib::info() << "If the mysqld execution user is authorized,"
-		" page cleaner thread priority can be changed."
-		" See the man page of setpriority().";
-	}
-	/* Signal that setpriority() has been attempted. */
-	os_event_set(recv_sys.flush_end);
-#endif /* UNIV_LINUX */
-
-	do {
-		/* treat flushing requests during recovery. */
-		ulint	n_flushed_lru = 0;
-		ulint	n_flushed_list = 0;
-
-		os_event_wait(recv_sys.flush_start);
-
-		if (!recv_writer_thread_active) {
-			break;
-		}
-
-		switch (recv_sys.flush_type) {
-		case BUF_FLUSH_LRU:
-			/* Flush pages from end of LRU if required */
-			pc_request(0, LSN_MAX);
-			while (pc_flush_slot() > 0) {}
-			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-			break;
-
-		case BUF_FLUSH_LIST:
-			/* Flush all pages */
-			do {
-				pc_request(ULINT_MAX, LSN_MAX);
-				while (pc_flush_slot() > 0) {}
-			} while (!pc_wait_finished(&n_flushed_lru,
-						   &n_flushed_list));
-			break;
-
-		default:
-			ut_ad(0);
-		}
-
-		os_event_reset(recv_sys.flush_start);
-		os_event_set(recv_sys.flush_end);
-	} while (recv_writer_thread_active);
-
-	os_event_wait(buf_flush_event);
-
-	ulint	ret_sleep = 0;
-	ulint	n_evicted = 0;
-	ulint	n_flushed_last = 0;
-	ulint	warn_interval = 1;
-	ulint	warn_count = 0;
-	int64_t	sig_count = os_event_reset(buf_flush_event);
-	ulint	next_loop_time = ut_time_ms() + 1000;
-	ulint	n_flushed = 0;
-	ulint	last_activity = srv_get_activity_count();
-	ulint	last_pages = 0;
-
-	while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
-		ulint	curr_time = ut_time_ms();
-
-		/* The page_cleaner skips sleep if the server is
-		idle and there are no pending IOs in the buffer pool
-		and there is work to do. */
-		if (srv_check_activity(last_activity)
-		    || buf_get_n_pending_read_ios()
-		    || n_flushed == 0) {
-
-			ret_sleep = pc_sleep_if_needed(
-				next_loop_time, sig_count, curr_time);
-		} else if (curr_time > next_loop_time) {
-			ret_sleep = OS_SYNC_TIME_EXCEEDED;
-		} else {
-			ret_sleep = 0;
-		}
-
-		if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
-			break;
-		}
-
-		sig_count = os_event_reset(buf_flush_event);
-
-		if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
-			if (global_system_variables.log_warnings > 2
-			    && curr_time > next_loop_time + 3000
-			    && !(test_flags & TEST_SIGINT)) {
-				if (warn_count == 0) {
-					ib::info() << "page_cleaner: 1000ms"
-						" intended loop took "
-						<< 1000 + curr_time
-						   - next_loop_time
-						<< "ms. The settings might not"
-						" be optimal. (flushed="
-						<< n_flushed_last
-						<< " and evicted="
-						<< n_evicted
-						<< ", during the time.)";
-					if (warn_interval > 300) {
-						warn_interval = 600;
-					} else {
-						warn_interval *= 2;
-					}
-
-					warn_count = warn_interval;
-				} else {
-					--warn_count;
-				}
-			} else {
-				/* reset counter */
-				warn_interval = 1;
-				warn_count = 0;
-			}
-
-			next_loop_time = curr_time + 1000;
-			n_flushed_last = n_evicted = 0;
-		}
-
-		if (ret_sleep != OS_SYNC_TIME_EXCEEDED
-		    && srv_flush_sync
-		    && buf_flush_sync_lsn > 0) {
-			/* woke up for flush_sync */
-			mutex_enter(&page_cleaner.mutex);
-			lsn_t	lsn_limit = buf_flush_sync_lsn;
-			buf_flush_sync_lsn = 0;
-			mutex_exit(&page_cleaner.mutex);
-
-			/* Request flushing for threads */
-			pc_request(ULINT_MAX, lsn_limit);
-
-			ulint tm = ut_time_ms();
-
-			/* Coordinator also treats requests */
-			while (pc_flush_slot() > 0) {}
-
-			/* only coordinator is using these counters,
-			so no need to protect by lock. */
-			page_cleaner.flush_time += ut_time_ms() - tm;
-			page_cleaner.flush_pass++;
-
-			/* Wait for all slots to be finished */
-			ulint	n_flushed_lru = 0;
-			ulint	n_flushed_list = 0;
-			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-
-			if (n_flushed_list > 0 || n_flushed_lru > 0) {
-				buf_flush_stats(n_flushed_list, n_flushed_lru);
-
-				MONITOR_INC_VALUE_CUMULATIVE(
-					MONITOR_FLUSH_SYNC_TOTAL_PAGE,
-					MONITOR_FLUSH_SYNC_COUNT,
-					MONITOR_FLUSH_SYNC_PAGES,
-					n_flushed_lru + n_flushed_list);
-			}
-
-			n_flushed = n_flushed_lru + n_flushed_list;
-
-		} else if (srv_check_activity(last_activity)) {
-			ulint	n_to_flush;
-			lsn_t	lsn_limit = 0;
-
-			/* Estimate pages from flush_list to be flushed */
-			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
-				last_activity = srv_get_activity_count();
-				n_to_flush =
-					page_cleaner_flush_pages_recommendation(
-						&lsn_limit, last_pages);
-			} else {
-				n_to_flush = 0;
-			}
-
-			/* Request flushing for threads */
-			pc_request(n_to_flush, lsn_limit);
-
-			ulint tm = ut_time_ms();
-
-			/* Coordinator also treats requests */
-			while (pc_flush_slot() > 0) {
-				/* No op */
-			}
-
-			/* only coordinator is using these counters,
-			so no need to protect by lock. */
-			page_cleaner.flush_time += ut_time_ms() - tm;
-			page_cleaner.flush_pass++ ;
-
-			/* Wait for all slots to be finished */
-			ulint	n_flushed_lru = 0;
-			ulint	n_flushed_list = 0;
-
-			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-
-			if (n_flushed_list > 0 || n_flushed_lru > 0) {
-				buf_flush_stats(n_flushed_list, n_flushed_lru);
-			}
-
-			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
-				last_pages = n_flushed_list;
-			}
-
-			n_evicted += n_flushed_lru;
-			n_flushed_last += n_flushed_list;
-
-			n_flushed = n_flushed_lru + n_flushed_list;
-
-			if (n_flushed_lru) {
-				MONITOR_INC_VALUE_CUMULATIVE(
-					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
-					MONITOR_LRU_BATCH_FLUSH_COUNT,
-					MONITOR_LRU_BATCH_FLUSH_PAGES,
-					n_flushed_lru);
-			}
-
-			if (n_flushed_list) {
-				MONITOR_INC_VALUE_CUMULATIVE(
-					MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
-					MONITOR_FLUSH_ADAPTIVE_COUNT,
-					MONITOR_FLUSH_ADAPTIVE_PAGES,
-					n_flushed_list);
-			}
-
-		} else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
-			/* no activity, slept enough */
-			buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
-
-			n_flushed_last += n_flushed;
-
-			if (n_flushed) {
-				MONITOR_INC_VALUE_CUMULATIVE(
-					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
-					MONITOR_FLUSH_BACKGROUND_COUNT,
-					MONITOR_FLUSH_BACKGROUND_PAGES,
-					n_flushed);
-
-			}
-
-		} else {
-			/* no activity, but woken up by event */
-			n_flushed = 0;
-		}
-
-		ut_d(buf_flush_page_cleaner_disabled_loop());
-	}
-
-	ut_ad(srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
-	if (srv_fast_shutdown == 2
-	    || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
-		/* In very fast shutdown or when innodb failed to start, we
-		simulate a crash of the buffer pool. We are not required to do
-		any flushing. */
-		goto thread_exit;
-	}
-
-	/* In case of normal and slow shutdown the page_cleaner thread
-	must wait for all other activity in the server to die down.
-	Note that we can start flushing the buffer pool as soon as the
-	server enters shutdown phase but we must stay alive long enough
-	to ensure that any work done by the master or purge threads is
-	also flushed.
-	During shutdown we pass through two stages. In the first stage,
-	when SRV_SHUTDOWN_CLEANUP is set other threads like the master
-	and the purge threads may be working as well. We start flushing
-	the buffer pool but can't be sure that no new pages are being
-	dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
-
-	do {
-		pc_request(ULINT_MAX, LSN_MAX);
-
-		while (pc_flush_slot() > 0) {}
-
-		ulint	n_flushed_lru = 0;
-		ulint	n_flushed_list = 0;
-		pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-
-		n_flushed = n_flushed_lru + n_flushed_list;
-
-		/* We sleep only if there are no pages to flush */
-		if (n_flushed == 0) {
-			os_thread_sleep(100000);
-		}
-	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
-
-	/* At this point all threads including the master and the purge
-	thread must have been suspended. */
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
-	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
-
-	/* We can now make a final sweep on flushing the buffer pool
-	and exit after we have cleaned the whole buffer pool.
-	It is important that we wait for any running batch that has
-	been triggered by us to finish. Otherwise we can end up
-	considering end of that batch as a finish of our final
-	sweep and we'll come out of the loop leaving behind dirty pages
-	in the flush_list */
-	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-	buf_flush_wait_LRU_batch_end();
-
-	bool	success;
-
-	do {
-		pc_request(ULINT_MAX, LSN_MAX);
-
-		while (pc_flush_slot() > 0) {}
-
-		ulint	n_flushed_lru = 0;
-		ulint	n_flushed_list = 0;
-		success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
-
-		n_flushed = n_flushed_lru + n_flushed_list;
-
-		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-		buf_flush_wait_LRU_batch_end();
-
-	} while (!success || n_flushed > 0);
-
-	/* Some sanity checks */
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
-	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t* buf_pool = buf_pool_from_array(i);
-		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
-	}
-
-	/* We have lived our life. Time to die. */
-
-thread_exit:
-	/* All worker threads are waiting for the event here,
-	and no more access to page_cleaner structure by them.
-	Wakes worker threads up just to make them exit. */
-	page_cleaner.is_running = false;
-
-	/* waiting for all worker threads exit */
-	while (page_cleaner.n_workers) {
-		os_event_set(page_cleaner.is_requested);
-		os_thread_sleep(10000);
-	}
-
-	mutex_destroy(&page_cleaner.mutex);
-
-	os_event_destroy(page_cleaner.is_finished);
-	os_event_destroy(page_cleaner.is_requested);
-	os_event_destroy(page_cleaner.is_started);
-
-	buf_page_cleaner_is_active = false;
+  ut_ad(!srv_read_only_mode);
+  ut_ad(buf_page_cleaner_is_active);
+
+  ulint last_pages= 0;
+  timespec abstime;
+  set_timespec(abstime, 1);
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  lsn_t lsn_limit;
+  ulint last_activity_count= srv_get_activity_count();
+
+  for (;;)
+  {
+    lsn_limit= buf_flush_sync_lsn;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0))
+    {
+furious_flush:
+      if (UNIV_LIKELY(srv_flush_sync))
+      {
+        buf_flush_sync_for_checkpoint(lsn_limit);
+        last_pages= 0;
+        set_timespec(abstime, 1);
+        continue;
+      }
+    }
+    else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+      break;
+
+    /* If the buf page cleaner is idle and there is no work
+    (either dirty pages are all flushed or adaptive flushing
+    is not enabled) then opt for non-timed wait */
+    if (buf_pool.page_cleaner_idle() &&
+        (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+         srv_max_dirty_pages_pct_lwm == 0.0))
+      my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
+    else
+      my_cond_timedwait(&buf_pool.do_flush_list,
+                        &buf_pool.flush_list_mutex.m_mutex, &abstime);
+
+    set_timespec(abstime, 1);
+
+    lsn_t soft_lsn_limit= buf_flush_async_lsn;
+    lsn_limit= buf_flush_sync_lsn;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0))
+    {
+      if (UNIV_LIKELY(srv_flush_sync))
+        goto furious_flush;
+    }
+    else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+      break;
+
+    const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
+
+    if (!oldest_lsn)
+    {
+      if (UNIV_UNLIKELY(lsn_limit != 0))
+      {
+        buf_flush_sync_lsn= 0;
+        /* wake up buf_flush_wait() */
+        pthread_cond_broadcast(&buf_pool.done_flush_list);
+      }
+unemployed:
+      buf_flush_async_lsn= 0;
+      buf_pool.page_cleaner_set_idle(true);
+
+      DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
+
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+      if (!recv_recovery_is_on() &&
+          !srv_startup_is_before_trx_rollback_phase &&
+          srv_operation == SRV_OPERATION_NORMAL)
+        log_checkpoint();
+
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      continue;
+    }
+
+    const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
+    ut_ad(dirty_blocks);
+    /* We perform dirty reads of the LRU+free list lengths here.
+    Division by zero is not possible, because buf_pool.flush_list is
+    guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
+    const double dirty_pct= double(dirty_blocks) * 100.0 /
+      double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+
+    bool idle_flush= false;
+
+    if (lsn_limit || soft_lsn_limit);
+    else if (af_needed_for_redo(oldest_lsn));
+    else if (srv_max_dirty_pages_pct_lwm != 0.0)
+    {
+      const ulint activity_count= srv_get_activity_count();
+      if (activity_count != last_activity_count)
+        last_activity_count= activity_count;
+      else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0)
+      {
+         /* reaching here means 3 things:
+         - last_activity_count == activity_count: suggesting server is idle
+           (no trx_t::commit activity)
+         - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
+         - there are no pending reads but there are dirty pages to flush */
+        idle_flush= true;
+        buf_pool.update_last_activity_count(activity_count);
+      }
+
+      if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm)
+        goto unemployed;
+    }
+    else if (dirty_pct < srv_max_buf_pool_modified_pct)
+      goto unemployed;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit)
+      lsn_limit= buf_flush_sync_lsn= 0;
+    if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit)
+      soft_lsn_limit= buf_flush_async_lsn= 0;
+
+    buf_pool.page_cleaner_set_idle(false);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    if (!lsn_limit)
+      lsn_limit= soft_lsn_limit;
+
+    ulint n_flushed;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0))
+    {
+      n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
+      /* wake up buf_flush_wait() */
+      pthread_cond_broadcast(&buf_pool.done_flush_list);
+      goto try_checkpoint;
+    }
+    else if (idle_flush || !srv_adaptive_flushing)
+    {
+      n_flushed= buf_flush_list(srv_io_capacity);
+try_checkpoint:
+      if (n_flushed)
+      {
+        MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+                                     MONITOR_FLUSH_BACKGROUND_COUNT,
+                                     MONITOR_FLUSH_BACKGROUND_PAGES,
+                                     n_flushed);
+do_checkpoint:
+        /* The periodic log_checkpoint() call here makes it harder to
+        reproduce bugs in crash recovery or mariabackup --prepare, or
+        in code that writes the redo log records. Omitting the call
+        here should not affect correctness, because log_free_check()
+        should still be invoking checkpoints when needed. */
+        DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
+
+        if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
+          log_checkpoint();
+      }
+    }
+    else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
+                                                              oldest_lsn,
+                                                              dirty_blocks,
+                                                              dirty_pct))
+    {
+      page_cleaner.flush_pass++;
+      const ulint tm= ut_time_ms();
+      last_pages= n_flushed= buf_flush_list(n);
+      page_cleaner.flush_time+= ut_time_ms() - tm;
+
+      if (n_flushed)
+      {
+        MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+                                     MONITOR_FLUSH_ADAPTIVE_COUNT,
+                                     MONITOR_FLUSH_ADAPTIVE_PAGES,
+                                     n_flushed);
+        goto do_checkpoint;
+      }
+    }
+    else if (buf_flush_async_lsn <= oldest_lsn)
+    {
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      goto unemployed;
+    }
 
-	my_thread_end();
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit();
+#ifdef UNIV_DEBUG
+    while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn &&
+           srv_shutdown_state == SRV_SHUTDOWN_NONE)
+      os_thread_sleep(100000);
+#endif /* UNIV_DEBUG */
 
-	OS_THREAD_DUMMY_RETURN;
+#ifndef DBUG_OFF
+next:
+#endif /* !DBUG_OFF */
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+    /* When idle flushing kicks in, page_cleaner is marked active.
+    Reset it back to idle, since it was made active as part of
+    the idle flushing stage. */
+    if (idle_flush)
+      buf_pool.page_cleaner_set_idle(true);
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (srv_fast_shutdown != 2)
+  {
+    buf_flush_wait_batch_end_acquiring_mutex(true);
+    buf_flush_wait_batch_end_acquiring_mutex(false);
+  }
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  lsn_limit= buf_flush_sync_lsn;
+  if (UNIV_UNLIKELY(lsn_limit != 0))
+    goto furious_flush;
+  buf_page_cleaner_is_active= false;
+  pthread_cond_broadcast(&buf_pool.done_flush_list);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  my_thread_end();
+  /* We count the number of threads in os_thread_exit(). A created
+  thread should always use that to exit and not use return() to exit. */
+  os_thread_exit();
+
+  OS_THREAD_DUMMY_RETURN;
 }
 
-/** Adjust thread count for page cleaner workers.
-@param[in]	new_cnt		Number of threads to be used */
-void
-buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
 {
-	mutex_enter(&page_cleaner.mutex);
-
-	srv_n_page_cleaners = new_cnt;
-	if (new_cnt > page_cleaner.n_workers) {
-		/* User has increased the number of page
-		cleaner threads. */
-		ulint add = new_cnt - page_cleaner.n_workers;
-		for (ulint i = 0; i < add; i++) {
-			os_thread_id_t cleaner_thread_id;
-			os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id);
-		}
-	}
-
-	mutex_exit(&page_cleaner.mutex);
-
-	/* Wait until defined number of workers has started. */
-	while (page_cleaner.is_running &&
-	       page_cleaner.n_workers != (srv_n_page_cleaners - 1)) {
-		os_event_set(page_cleaner.is_requested);
-		os_event_reset(page_cleaner.is_started);
-		os_event_wait_time(page_cleaner.is_started, 1000000);
-	}
+  ut_ad(!buf_page_cleaner_is_active);
+  ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
+        srv_operation == SRV_OPERATION_RESTORE ||
+        srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+  buf_flush_async_lsn= 0;
+  buf_flush_sync_lsn= 0;
+  buf_page_cleaner_is_active= true;
+  os_thread_create(buf_flush_page_cleaner);
 }
 
-/******************************************************************//**
-Worker thread of page_cleaner.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_flush_page_cleaner_worker)(
-/*==========================================*/
-	void*	arg MY_ATTRIBUTE((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
+#if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY)
+/** @return the number of dirty pages in the buffer pool */
+static ulint buf_flush_list_length()
 {
-	my_thread_init();
-#ifndef DBUG_OFF
-	os_thread_id_t cleaner_thread_id = os_thread_get_curr_id();
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  return len;
+}
 #endif
 
-	mutex_enter(&page_cleaner.mutex);
-	ulint thread_no = page_cleaner.n_workers++;
-
-	DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
-		 << " started; n_workers=" << page_cleaner.n_workers);
-
-	/* Signal that we have started */
-	os_event_set(page_cleaner.is_started);
-	mutex_exit(&page_cleaner.mutex);
-
-#ifdef UNIV_LINUX
-	/* linux might be able to set different setting for each thread
-	worth to try to set high priority for page cleaner threads */
-	if (buf_flush_page_cleaner_set_priority(
-		buf_flush_page_cleaner_priority)) {
-
-		ib::info() << "page_cleaner worker priority: "
-			<< buf_flush_page_cleaner_priority;
-	}
-#endif /* UNIV_LINUX */
-
-	while (true) {
-		os_event_wait(page_cleaner.is_requested);
-
-		ut_d(buf_flush_page_cleaner_disabled_loop());
-
-		if (!page_cleaner.is_running) {
-			break;
-		}
-
-		ut_ad(srv_n_page_cleaners >= 1);
-
-		/* If number of page cleaner threads is decreased
-		exit those that are not anymore needed. */
-		if (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
-		    thread_no >= (srv_n_page_cleaners - 1)) {
-			DBUG_LOG("ib_buf", "Exiting "
-				<< thread_no
-				<< " page cleaner worker thread_id "
-				<< os_thread_pf(cleaner_thread_id)
-				<< " total threads " << srv_n_page_cleaners << ".");
-			break;
-		}
-
-		pc_flush_slot();
-	}
-
-	mutex_enter(&page_cleaner.mutex);
-	page_cleaner.n_workers--;
-
-	DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
-		 << " exiting; n_workers=" << page_cleaner.n_workers);
-
-	/* Signal that we have stopped */
-	os_event_set(page_cleaner.is_started);
-	mutex_exit(&page_cleaner.mutex);
-
-	my_thread_end();
-
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool()
+{
+  ut_ad(!buf_page_cleaner_is_active);
+  ut_ad(!buf_flush_sync_lsn);
+
+  service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                 "Waiting to flush the buffer pool");
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  while (buf_pool.get_oldest_modification(0))
+  {
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    buf_flush_list(srv_max_io_capacity);
+    if (buf_pool.n_flush_list())
+    {
+      timespec abstime;
+      service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                     "Waiting to flush " ULINTPF " pages",
+                                     buf_flush_list_length());
+      set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
+      mysql_mutex_lock(&buf_pool.mutex);
+      while (buf_pool.n_flush_list_)
+        my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+                          &abstime);
+      mysql_mutex_unlock(&buf_pool.mutex);
+    }
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  ut_ad(!buf_pool.any_io_pending());
 }
 
-/*******************************************************************//**
-Synchronously flush dirty blocks from the end of the flush list of all buffer
-pool instances.
-NOTE: The calling thread is not allowed to own any latches on pages! */
-void
-buf_flush_sync_all_buf_pools(void)
-/*==============================*/
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn)
 {
-	bool success;
-	do {
-		success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
-		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-	} while (!success);
-
-	ut_a(success);
+  thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+  tpool::tpool_wait_begin();
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_flush_wait(lsn);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  tpool::tpool_wait_end();
+  thd_wait_end(nullptr);
 }
 
-/** Request IO burst and wake page_cleaner up.
-@param[in]	lsn_limit	upper limit of LSN to be flushed */
-void
-buf_flush_request_force(
-	lsn_t	lsn_limit)
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync()
 {
-	/* adjust based on lsn_avg_rate not to get old */
-	lsn_t	lsn_target = lsn_limit + lsn_avg_rate * 3;
-
-	mutex_enter(&page_cleaner.mutex);
-	if (lsn_target > buf_flush_sync_lsn) {
-		buf_flush_sync_lsn = lsn_target;
-	}
-	mutex_exit(&page_cleaner.mutex);
-
-	os_event_set(buf_flush_event);
+  ut_ad(!sync_check_iterate(dict_sync_check()));
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+  tpool::tpool_wait_begin();
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  for (;;)
+  {
+    const lsn_t lsn= log_sys.get_lsn();
+    buf_flush_wait(lsn);
+    if (lsn == log_sys.get_lsn())
+      break;
+  }
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  tpool::tpool_wait_end();
+  thd_wait_end(nullptr);
 }
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 
+#ifdef UNIV_DEBUG
 /** Functor to validate the flush list. */
 struct	Check {
 	void operator()(const buf_page_t* elem) const
 	{
-		ut_a(elem->in_flush_list);
+		ut_ad(elem->oldest_modification());
+		ut_ad(!fsp_is_system_temporary(elem->id().space()));
 	}
 };
 
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-static
-ibool
-buf_flush_validate_low(
-/*===================*/
-	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
+/** Validate the flush list. */
+static void buf_flush_validate_low()
 {
 	buf_page_t*		bpage;
-	const ib_rbt_node_t*	rnode = NULL;
 
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
 
-	ut_list_validate(buf_pool->flush_list, Check());
+	ut_list_validate(buf_pool.flush_list, Check());
 
-	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-	/* If we are in recovery mode i.e.: flush_rbt != NULL
-	then each block in the flush_list must also be present
-	in the flush_rbt. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		rnode = rbt_first(buf_pool->flush_rbt);
-	}
+	bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
 
 	while (bpage != NULL) {
-		const lsn_t	om = bpage->oldest_modification;
-
-		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
-
-		ut_ad(bpage->in_flush_list);
-
-		/* A page in buf_pool->flush_list can be in
+		const lsn_t	om = bpage->oldest_modification();
+		/* A page in buf_pool.flush_list can be in
 		BUF_BLOCK_REMOVE_HASH state. This happens when a page
 		is in the middle of being relocated. In that case the
 		original descriptor can have this state and still be
 		in the flush list waiting to acquire the
-		buf_pool->flush_list_mutex to complete the relocation. */
-		ut_a(buf_page_in_file(bpage)
-		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
-		ut_a(om > 0);
-
-		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-			buf_page_t**	prpage;
-
-			ut_a(rnode != NULL);
-			prpage = rbt_value(buf_page_t*, rnode);
-
-			ut_a(*prpage != NULL);
-			ut_a(*prpage == bpage);
-			rnode = rbt_next(buf_pool->flush_rbt, rnode);
-		}
+		buf_pool.flush_list_mutex to complete the relocation. */
+		ut_d(const auto s= bpage->state());
+		ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
+		      || s == BUF_BLOCK_REMOVE_HASH);
+		ut_ad(om == 1 || om > 2);
 
 		bpage = UT_LIST_GET_NEXT(list, bpage);
-
-		ut_a(bpage == NULL || om >= bpage->oldest_modification);
-	}
-
-	/* By this time we must have exhausted the traversal of
-	flush_rbt (if active) as well. */
-	ut_a(rnode == NULL);
-
-	return(TRUE);
-}
-
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-ibool
-buf_flush_validate(
-/*===============*/
-	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
-{
-	ibool	ret;
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	ret = buf_flush_validate_low(buf_pool);
-
-	buf_flush_list_mutex_exit(buf_pool);
-
-	return(ret);
-}
-
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-/******************************************************************//**
-Check if there are any dirty pages that belong to a space id in the flush
-list in a particular buffer pool.
-@return number of dirty pages present in a single buffer pool */
-ulint
-buf_pool_get_dirty_pages_count(
-/*===========================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
-	ulint		id,		/*!< in: space id to check */
-	FlushObserver*	observer)	/*!< in: flush observer to check */
-
-{
-	ulint		count = 0;
-
-	buf_pool_mutex_enter(buf_pool);
-	buf_flush_list_mutex_enter(buf_pool);
-
-	buf_page_t*	bpage;
-
-	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-	     bpage != 0;
-	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
-
-		ut_ad(buf_page_in_file(bpage));
-		ut_ad(bpage->in_flush_list);
-		ut_ad(bpage->oldest_modification > 0);
-
-		if ((observer != NULL
-		     && observer == bpage->flush_observer)
-		    || (observer == NULL
-			&& id == bpage->id.space())) {
-			++count;
-		}
-	}
-
-	buf_flush_list_mutex_exit(buf_pool);
-	buf_pool_mutex_exit(buf_pool);
-
-	return(count);
-}
-
-/******************************************************************//**
-Check if there are any dirty pages that belong to a space id in the flush list.
-@return number of dirty pages present in all the buffer pools */
-static
-ulint
-buf_flush_get_dirty_pages_count(
-/*============================*/
-	ulint		id,		/*!< in: space id to check */
-	FlushObserver*	observer)	/*!< in: flush observer to check */
-{
-	ulint		count = 0;
-
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
-	}
-
-	return(count);
-}
-
-/** FlushObserver constructor
-@param[in]	space		tablespace
-@param[in]	trx		trx instance
-@param[in]	stage		performance schema accounting object,
-used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
-for accounting. */
-FlushObserver::FlushObserver(
-	fil_space_t*		space,
-	trx_t*			trx,
-	ut_stage_alter_t*	stage)
-	:
-	m_space(space),
-	m_trx(trx),
-	m_stage(stage),
-	m_interrupted(false)
-{
-	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
-	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		m_flushed->at(i) = 0;
-		m_removed->at(i) = 0;
-	}
-
-	DBUG_LOG("flush", "FlushObserver(): trx->id=" << m_trx->id);
-}
-
-/** FlushObserver deconstructor */
-FlushObserver::~FlushObserver()
-{
-	ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0);
-
-	UT_DELETE(m_flushed);
-	UT_DELETE(m_removed);
-
-	DBUG_LOG("flush", "~FlushObserver(): trx->id=" << m_trx->id);
-}
-
-/** Check whether the operation has been interrupted */
-void FlushObserver::check_interrupted()
-{
-	if (trx_is_interrupted(m_trx)) {
-		interrupted();
-	}
-}
-
-/** Notify observer of a flush
-@param[in]	buf_pool	buffer pool instance
-@param[in]	bpage		buffer page to flush */
-void
-FlushObserver::notify_flush(
-	buf_pool_t*	buf_pool,
-	buf_page_t*	bpage)
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	m_flushed->at(buf_pool->instance_no)++;
-
-	if (m_stage != NULL) {
-		m_stage->inc();
+		ut_ad(om == 1 || !bpage || recv_recovery_is_on()
+		      || om >= bpage->oldest_modification());
 	}
-
-	DBUG_LOG("flush", "Flush " << bpage->id);
-}
-
-/** Notify observer of a remove
-@param[in]	buf_pool	buffer pool instance
-@param[in]	bpage		buffer page flushed */
-void
-FlushObserver::notify_remove(
-	buf_pool_t*	buf_pool,
-	buf_page_t*	bpage)
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	m_removed->at(buf_pool->instance_no)++;
-
-	DBUG_LOG("flush", "Remove " << bpage->id);
 }
 
-/** Flush dirty pages and wait. */
-void
-FlushObserver::flush()
+/** Validate the flush list. */
+void buf_flush_validate()
 {
-	ut_ad(m_trx);
-
-	if (!m_interrupted && m_stage) {
-		m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count(
-						   m_space->id, this));
-	}
-
-	buf_LRU_flush_or_remove_pages(m_space->id, this);
-
-	/* Wait for all dirty pages were flushed. */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		while (!is_complete(i)) {
-
-			os_thread_sleep(2000);
-		}
-	}
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_flush_validate_low();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 }
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index c85f9331580..b282eb17dae 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -25,55 +25,37 @@ Created 11/5/1995 Heikki Tuuri
 *******************************************************/
 
 #include "buf0lru.h"
-#include "ut0byte.h"
-#include "ut0rnd.h"
 #include "sync0rw.h"
-#include "hash0hash.h"
-#include "os0event.h"
 #include "fil0fil.h"
 #include "btr0btr.h"
 #include "buf0buddy.h"
 #include "buf0buf.h"
-#include "buf0dblwr.h"
 #include "buf0flu.h"
 #include "buf0rea.h"
 #include "btr0sea.h"
-#include "ibuf0ibuf.h"
 #include "os0file.h"
 #include "page0zip.h"
 #include "log0recv.h"
 #include "srv0srv.h"
 #include "srv0mon.h"
 
+/** Flush this many pages in buf_LRU_get_free_block() */
+size_t innodb_lru_flush_size;
+
 /** The number of blocks from the LRU_old pointer onward, including
-the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
 of the whole LRU list length, except that the tolerance defined below
 is allowed. Note that the tolerance must be small enough such that for
 even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
 allowed to point to either end of the LRU list. */
 
-static const ulint BUF_LRU_OLD_TOLERANCE = 20;
+static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
 
 /** The minimum amount of non-old blocks when the LRU_old list exists
 (that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
 @see buf_LRU_old_adjust_len */
 #define BUF_LRU_NON_OLD_MIN_LEN	5
 
-#ifdef BTR_CUR_HASH_ADAPT
-/** When dropping the search hash index entries before deleting an ibd
-file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array.
-We also release buf_pool->mutex after scanning this many pages of the
-flush_list when dropping a table. This is to ensure that other threads
-are not blocked for extended period of time when using very large
-buffer pools. */
-static const ulint BUF_LRU_DROP_SEARCH_SIZE = 1024;
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/** We scan these many blocks when looking for a clean page to evict
-during LRU eviction. */
-static const ulint BUF_LRU_SEARCH_SCAN_THRESHOLD = 100;
-
 /** If we switch on the InnoDB monitor because there are too few available
 frames in the buffer pool, we set this to TRUE */
 static bool buf_lru_switched_on_innodb_mon = false;
@@ -94,13 +76,12 @@ uncompressed and compressed data), which must be clean. */
 /* @{ */
 
 /** Number of intervals for which we keep the history of these stats.
-Each interval is 1 second, defined by the rate at which
-srv_error_monitor_thread() calls buf_LRU_stat_update(). */
-static const ulint BUF_LRU_STAT_N_INTERVAL = 50;
+Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */
+static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4;
 
 /** Co-efficient with which we multiply I/O operations to equate them
 with page_zip_decompress() operations. */
-static const ulint BUF_LRU_IO_TO_UNZIP_FACTOR = 50;
+static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50;
 
 /** Sampled values buf_LRU_stat_cur.
 Not protected by any mutex.  Updated by buf_LRU_stat_update(). */
@@ -125,82 +106,66 @@ least this many milliseconds ago.  Not protected by any mutex or latch. */
 uint	buf_LRU_old_threshold_ms;
 /* @} */
 
-/******************************************************************//**
-Takes a block out of the LRU list and page hash table.
-If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If bpage->state() == BUF_BLOCK_ZIP_PAGE && bpage->oldest_modification() <= 1,
 the object will be freed.
 
-The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
-and the appropriate hash_lock. This function will release the
-buf_page_get_mutex() and the hash_lock.
+@param bpage      buffer block
+@param id         page identifier
+@param hash_lock  buf_pool.page_hash latch (will be released here)
+@param zip        whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
 
 If a compressed page is freed other compressed pages may be relocated.
 @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
 caller needs to free the page to the free list
 @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
 this case the block is already returned to the buddy allocator. */
-static MY_ATTRIBUTE((warn_unused_result))
-bool
-buf_LRU_block_remove_hashed(
-/*========================*/
-	buf_page_t*	bpage,	/*!< in: block, must contain a file page and
-				be in a state where it can be freed; there
-				may or may not be a hash index to the page */
-	bool		zip);	/*!< in: true if should remove also the
-				compressed page of an uncompressed page */
-/******************************************************************//**
-Puts a file page whose has no hash index to the free list. */
-static
-void
-buf_LRU_block_free_hashed_page(
-/*===========================*/
-	buf_block_t*	block);	/*!< in: block, must contain a file page and
-				be in a state where it can be freed */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+                                        page_hash_latch *hash_lock, bool zip);
 
-/******************************************************************//**
-Increases LRU size in bytes with page size inline function */
-static inline
-void
-incr_LRU_size_in_bytes(
-/*===================*/
-	buf_page_t*	bpage,		/*!< in: control block */
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+/** Free a block to buf_pool */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
+  block->page.free_file_page();
+  buf_LRU_block_free_non_file_page(block);
+}
 
-	buf_pool->stat.LRU_bytes += bpage->physical_size();
+/** Increase LRU size in bytes by the page size.
+@param[in]	bpage		control block */
+static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
+{
+	/* FIXME: use atomics, not mutex */
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	buf_pool.stat.LRU_bytes += bpage->physical_size();
 
-	ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size);
+	ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size);
 }
 
-/******************************************************************//**
-Determines if the unzip_LRU list should be used for evicting a victim
-instead of the general LRU list.
-@return TRUE if should use unzip_LRU */
-ibool
-buf_LRU_evict_from_unzip_LRU(
-/*=========================*/
-	buf_pool_t*	buf_pool)
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU()
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
 	/* If the unzip_LRU list is empty, we can only use the LRU. */
-	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
-		return(FALSE);
+	if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) {
+		return false;
 	}
 
 	/* If unzip_LRU is at most 10% of the size of the LRU list,
 	then use the LRU.  This slack allows us to keep hot
 	decompressed pages in the buffer pool. */
-	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
-	    <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
-		return(FALSE);
+	if (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+	    <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+		return false;
 	}
 
 	/* If eviction hasn't started yet, we assume by default
 	that a workload is disk bound. */
-	if (buf_pool->freed_page_clock == 0) {
-		return(TRUE);
+	if (buf_pool.freed_page_clock == 0) {
+		return true;
 	}
 
 	/* Calculate the average over past intervals, and add the values
@@ -218,428 +183,33 @@ buf_LRU_evict_from_unzip_LRU(
 	return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
 }
 
-#ifdef BTR_CUR_HASH_ADAPT
-/******************************************************************//**
-While flushing (or removing dirty) pages from a tablespace we don't
-want to hog the CPU and resources. Release the buffer pool and block
-mutex and try to force a context switch. Then reacquire the same mutexes.
-The current page is "fixed" before the release of the mutexes and then
-"unfixed" again once we have reacquired the mutexes. */
-static
-void
-buf_flush_yield(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_page_t*	bpage)		/*!< in/out: current page */
-{
-	BPageMutex*	block_mutex;
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_in_file(bpage));
-
-	block_mutex = buf_page_get_mutex(bpage);
-
-	mutex_enter(block_mutex);
-
-	/* "Fix" the block so that the position cannot be
-	changed after we release the buffer pool and
-	block mutexes. */
-	buf_page_set_sticky(bpage);
-
-	/* Now it is safe to release the buf_pool->mutex. */
-	buf_pool_mutex_exit(buf_pool);
-
-	mutex_exit(block_mutex);
-	/* Try and force a context switch. */
-	os_thread_yield();
-
-	buf_pool_mutex_enter(buf_pool);
-
-	mutex_enter(block_mutex);
-
-	/* "Unfix" the block now that we have both the
-	buffer pool and block mutex again. */
-	buf_page_unset_sticky(bpage);
-	mutex_exit(block_mutex);
-}
-
-/******************************************************************//**
-If we have hogged the resources for too long then release the buffer
-pool and flush list mutex and do a thread yield. Set the current page
-to "sticky" so that it is not relocated during the yield.
-@return true if yielded */
-static	MY_ATTRIBUTE((warn_unused_result))
-bool
-buf_flush_try_yield(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in/out: bpage to remove */
-	ulint		processed)	/*!< in: number of pages processed */
-{
-	/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
-	loop we release buf_pool->mutex to let other threads
-	do their job but only if the block is not IO fixed. This
-	ensures that the block stays in its position in the
-	flush_list. */
-
-	if (bpage != NULL
-	    && processed >= BUF_LRU_DROP_SEARCH_SIZE
-	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
-
-		buf_flush_list_mutex_exit(buf_pool);
-
-		/* Release the buffer pool and block mutex
-		to give the other threads a go. */
-
-		buf_flush_yield(buf_pool, bpage);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		/* Should not have been removed from the flush
-		list during the yield. However, this check is
-		not sufficient to catch a remove -> add. */
-
-		ut_ad(bpage->in_flush_list);
-
-		return(true);
-	}
-
-	return(false);
-}
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/******************************************************************//**
-Removes a single page from a given tablespace inside a specific
-buffer pool instance.
-@return true if page was removed. */
-static	MY_ATTRIBUTE((warn_unused_result))
-bool
-buf_flush_or_remove_page(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in/out: bpage to remove */
-	bool		flush)		/*!< in: flush to disk if true but
-					don't remove else remove without
-					flushing to disk */
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	/* bpage->space and bpage->io_fix are protected by
-	buf_pool->mutex and block_mutex. It is safe to check
-	them while holding buf_pool->mutex only. */
-
-	if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
-
-		/* We cannot remove this page during this scan
-		yet; maybe the system is currently reading it
-		in, or flushing the modifications to the file */
-		return(false);
-
-	}
-
-	BPageMutex*	block_mutex;
-	bool		processed = false;
-
-	block_mutex = buf_page_get_mutex(bpage);
-
-	/* We have to release the flush_list_mutex to obey the
-	latching order. We are however guaranteed that the page
-	will stay in the flush_list and won't be relocated because
-	buf_flush_remove() and buf_flush_relocate_on_flush_list()
-	need buf_pool->mutex as well. */
-
-	buf_flush_list_mutex_exit(buf_pool);
-
-	mutex_enter(block_mutex);
-
-	ut_ad(bpage->oldest_modification != 0);
-
-	if (!flush) {
-
-		buf_flush_remove(bpage);
-
-		mutex_exit(block_mutex);
-
-		processed = true;
-
-	} else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
-
-		/* The following call will release the buffer pool
-		and block mutex. */
-		processed = buf_flush_page(
-			buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false);
-
-		if (processed) {
-			/* Wake possible simulated aio thread to actually
-			post the writes to the operating system */
-			os_aio_simulated_wake_handler_threads();
-			buf_pool_mutex_enter(buf_pool);
-		} else {
-			mutex_exit(block_mutex);
-		}
-	} else {
-		mutex_exit(block_mutex);
-	}
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	ut_ad(!mutex_own(block_mutex));
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	return(processed);
-}
-
-/** Remove all dirty pages belonging to a given tablespace inside a specific
-buffer pool instance when we are deleting the data file(s) of that
-tablespace. The pages still remain a part of LRU and are evicted from
-the list as they age towards the tail of the LRU.
-@param[in,out]	buf_pool	buffer pool
-@param[in]	id		tablespace identifier
-@param[in]	observer	flush observer (to check for interrupt),
-				or NULL if the files should not be written to
-@param[in]	first		first page to be flushed or evicted
-@return	whether all matching dirty pages were removed */
-static	MY_ATTRIBUTE((warn_unused_result))
-bool
-buf_flush_or_remove_pages(
-	buf_pool_t*	buf_pool,
-	ulint		id,
-	FlushObserver*	observer,
-	ulint		first)
-{
-	buf_page_t*	prev;
-	buf_page_t*	bpage;
-	ulint		processed = 0;
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-rescan:
-	bool	all_freed = true;
-
-	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-	     bpage != NULL;
-	     bpage = prev) {
-
-		ut_a(buf_page_in_file(bpage));
-
-		/* Save the previous link because once we free the
-		page we can't rely on the links. */
-
-		prev = UT_LIST_GET_PREV(list, bpage);
-
-		/* Flush the pages matching space id,
-		or pages matching the flush observer. */
-		if (observer && observer->is_partial_flush()) {
-			if (observer != bpage->flush_observer) {
-				/* Skip this block. */
-			} else if (!buf_flush_or_remove_page(
-					   buf_pool, bpage,
-					   !observer->is_interrupted())) {
-				all_freed = false;
-			} else if (!observer->is_interrupted()) {
-				/* The processing was successful. And during the
-				processing we have released the buf_pool mutex
-				when calling buf_page_flush(). We cannot trust
-				prev pointer. */
-				goto rescan;
-			}
-		} else if (id != bpage->id.space()) {
-			/* Skip this block, because it is for a
-			different tablespace. */
-		} else if (bpage->id.page_no() < first) {
-			/* Skip this block, because it is below the limit. */
-		} else if (!buf_flush_or_remove_page(
-				   buf_pool, bpage, observer != NULL)) {
-
-			/* Remove was unsuccessful, we have to try again
-			by scanning the entire list from the end.
-			This also means that we never released the
-			buf_pool mutex. Therefore we can trust the prev
-			pointer.
-			buf_flush_or_remove_page() released the
-			flush list mutex but not the buf_pool mutex.
-			Therefore it is possible that a new page was
-			added to the flush list. For example, in case
-			where we are at the head of the flush list and
-			prev == NULL. That is OK because we have the
-			tablespace quiesced and no new pages for this
-			space-id should enter flush_list. This is
-			because the only callers of this function are
-			DROP TABLE and FLUSH TABLE FOR EXPORT.
-			We know that we'll have to do at least one more
-			scan but we don't break out of loop here and
-			try to do as much work as we can in this
-			iteration. */
-
-			all_freed = false;
-		} else if (observer) {
-
-			/* The processing was successful. And during the
-			processing we have released the buf_pool mutex
-			when calling buf_page_flush(). We cannot trust
-			prev pointer. */
-			goto rescan;
-		}
-
-#ifdef BTR_CUR_HASH_ADAPT
-		++processed;
-
-		/* Yield if we have hogged the CPU and mutexes for too long. */
-		if (buf_flush_try_yield(buf_pool, prev, processed)) {
-
-			/* Reset the batch size counter if we had to yield. */
-
-			processed = 0;
-		}
-#endif /* BTR_CUR_HASH_ADAPT */
-
-		/* The check for trx is interrupted is expensive, we want
-		to check every N iterations. */
-		if (!processed && observer) {
-			observer->check_interrupted();
-		}
-	}
-
-	buf_flush_list_mutex_exit(buf_pool);
-
-	return(all_freed);
-}
-
-/** Remove or flush all the dirty pages that belong to a given tablespace
-inside a specific buffer pool instance. The pages will remain in the LRU
-list and will be evicted from the LRU list as they age and move towards
-the tail of the LRU list.
-@param[in,out]	buf_pool	buffer pool
-@param[in]	id		tablespace identifier
-@param[in]	observer	flush observer,
-				or NULL if the files should not be written to
-@param[in]	first		first page to be flushed or evicted */
-static
-void
-buf_flush_dirty_pages(
-	buf_pool_t*	buf_pool,
-	ulint		id,
-	FlushObserver*	observer,
-	ulint		first)
-{
-	for (;;) {
-		buf_pool_mutex_enter(buf_pool);
-
-		bool freed = buf_flush_or_remove_pages(buf_pool, id, observer,
-						       first);
-
-		buf_pool_mutex_exit(buf_pool);
-
-		ut_ad(buf_flush_validate(buf_pool));
-
-		if (freed) {
-			break;
-		}
-
-		os_thread_sleep(2000);
-		ut_ad(buf_flush_validate(buf_pool));
-	}
-
-	ut_ad((observer && observer->is_interrupted())
-	      || first
-	      || buf_pool_get_dirty_pages_count(buf_pool, id, observer) == 0);
-}
-
-/** Empty the flush list for all pages belonging to a tablespace.
-@param[in]	id		tablespace identifier
-@param[in]	observer	flush observer,
-				or NULL if nothing is to be written
-@param[in]	first		first page to be flushed or evicted */
-void buf_LRU_flush_or_remove_pages(ulint id, FlushObserver* observer,
-				   ulint first)
-{
-	/* Pages in the system tablespace must never be discarded. */
-	ut_ad(id || observer);
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_flush_dirty_pages(buf_pool_from_array(i), id, observer,
-				      first);
-	}
-
-	if (observer && !observer->is_interrupted()) {
-		/* Ensure that all asynchronous IO is completed. */
-		os_aio_wait_until_no_pending_writes();
-		fil_flush(id);
-	}
-}
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/********************************************************************//**
-Insert a compressed block into buf_pool->zip_clean in the LRU order. */
-void
-buf_LRU_insert_zip_clean(
-/*=====================*/
-	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
-{
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
-
-	/* Find the first successor of bpage in the LRU list
-	that is in the zip_clean list. */
-	buf_page_t*	b = bpage;
-
-	do {
-		b = UT_LIST_GET_NEXT(LRU, b);
-	} while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE);
-
-	/* Insert bpage before b, i.e., after the predecessor of b. */
-	if (b != NULL) {
-		b = UT_LIST_GET_PREV(list, b);
-	}
-
-	if (b != NULL) {
-		UT_LIST_INSERT_AFTER(buf_pool->zip_clean, b, bpage);
-	} else {
-		UT_LIST_ADD_FIRST(buf_pool->zip_clean, bpage);
-	}
-}
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-/******************************************************************//**
-Try to free an uncompressed page of a compressed block from the unzip
+/** Try to free an uncompressed page of a compressed block from the unzip
 LRU list.  The compressed page is preserved, and it need not be clean.
+@param limit  maximum number of blocks to scan
 @return true if freed */
-static
-bool
-buf_LRU_free_from_unzip_LRU_list(
-/*=============================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	bool		scan_all)	/*!< in: scan whole LRU list
-					if true, otherwise scan only
-					srv_LRU_scan_depth / 2 blocks. */
+static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
-	if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+	if (!buf_LRU_evict_from_unzip_LRU()) {
 		return(false);
 	}
 
 	ulint	scanned = 0;
 	bool	freed = false;
 
-	for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
-	     block != NULL
-	     && !freed
-	     && (scan_all || scanned < srv_LRU_scan_depth);
-	     ++scanned) {
-
-		buf_block_t*	prev_block;
+	for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+	     block && scanned < limit; ++scanned) {
+		buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
 
-		prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
-
-		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+		ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
 		ut_ad(block->in_unzip_LRU_list);
 		ut_ad(block->page.in_LRU_list);
 
 		freed = buf_LRU_free_page(&block->page, false);
+		if (freed) {
+			break;
+		}
 
 		block = prev_block;
 	}
@@ -655,57 +225,35 @@ buf_LRU_free_from_unzip_LRU_list(
 	return(freed);
 }
 
-/******************************************************************//**
-Try to free a clean page from the common LRU list.
-@return true if freed */
-static
-bool
-buf_LRU_free_from_common_LRU_list(
-/*==============================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	bool		scan_all)	/*!< in: scan whole LRU list
-					if true, otherwise scan only
-					up to BUF_LRU_SEARCH_SCAN_THRESHOLD */
+/** Try to free a clean page from the common LRU list.
+@param limit  maximum number of blocks to scan
+@return whether a page was freed */
+static bool buf_LRU_free_from_common_LRU_list(ulint limit)
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
 	ulint		scanned = 0;
 	bool		freed = false;
 
-	for (buf_page_t* bpage = buf_pool->lru_scan_itr.start();
-	     bpage != NULL
-	     && !freed
-	     && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD);
-	     ++scanned, bpage = buf_pool->lru_scan_itr.get()) {
-
+	for (buf_page_t* bpage = buf_pool.lru_scan_itr.start();
+	     bpage && scanned < limit;
+	     ++scanned, bpage = buf_pool.lru_scan_itr.get()) {
 		buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
-		BPageMutex*	mutex = buf_page_get_mutex(bpage);
-
-		buf_pool->lru_scan_itr.set(prev);
+		buf_pool.lru_scan_itr.set(prev);
 
-		mutex_enter(mutex);
+		const auto accessed = bpage->is_accessed();
 
-		ut_ad(buf_page_in_file(bpage));
-		ut_ad(bpage->in_LRU_list);
-
-		unsigned	accessed = buf_page_is_accessed(bpage);
-
-		if (buf_flush_ready_for_replace(bpage)) {
-			mutex_exit(mutex);
-			freed = buf_LRU_free_page(bpage, true);
-		} else {
-			mutex_exit(mutex);
-		}
+		if (buf_LRU_free_page(bpage, true)) {
+			if (!accessed) {
+				/* Keep track of pages that are evicted without
+				ever being accessed. This gives us a measure of
+				the effectiveness of readahead */
+				++buf_pool.stat.n_ra_pages_evicted;
+			}
 
-		if (freed && !accessed) {
-			/* Keep track of pages that are evicted without
-			ever being accessed. This gives us a measure of
-			the effectiveness of readahead */
-			++buf_pool->stat.n_ra_pages_evicted;
+			freed = true;
+			break;
 		}
-
-		ut_ad(buf_pool_mutex_own(buf_pool));
-		ut_ad(!mutex_own(mutex));
 	}
 
 	if (scanned) {
@@ -719,109 +267,57 @@ buf_LRU_free_from_common_LRU_list(
 	return(freed);
 }
 
-/******************************************************************//**
-Try to free a replaceable block.
+/** Try to free a replaceable block.
+@param limit  maximum number of blocks to scan
 @return true if found and freed */
-bool
-buf_LRU_scan_and_free_block(
-/*========================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	bool		scan_all)	/*!< in: scan whole LRU list
-					if true, otherwise scan only
-					BUF_LRU_SEARCH_SCAN_THRESHOLD
-					blocks. */
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	return(buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all)
-	       || buf_LRU_free_from_common_LRU_list(buf_pool, scan_all));
-}
-
-/******************************************************************//**
-Returns TRUE if less than 25 % of the buffer pool in any instance is
-available. This can be used in heuristics to prevent huge transactions
-eating up the whole buffer pool for their locks.
-@return TRUE if less than 25 % of buffer pool left */
-ibool
-buf_LRU_buf_pool_running_out(void)
-/*==============================*/
+bool buf_LRU_scan_and_free_block(ulint limit)
 {
-	ibool	ret = FALSE;
-
-	for (ulint i = 0; i < srv_buf_pool_instances && !ret; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-
-		if (!recv_recovery_is_on()
-		    && UT_LIST_GET_LEN(buf_pool->free)
-		       + UT_LIST_GET_LEN(buf_pool->LRU)
-		       < ut_min(buf_pool->curr_size,
-				buf_pool->old_size) / 4) {
-
-			ret = TRUE;
-		}
-
-		buf_pool_mutex_exit(buf_pool);
-	}
+  mysql_mutex_assert_owner(&buf_pool.mutex);
 
-	return(ret);
+  return buf_LRU_free_from_unzip_LRU_list(limit) ||
+    buf_LRU_free_from_common_LRU_list(limit);
 }
 
-/******************************************************************//**
-Returns a free block from the buf_pool.  The block is taken off the
-free list.  If it is empty, returns NULL.
-@return a free control block, or NULL if the buf_block->free list is empty */
-buf_block_t*
-buf_LRU_get_free_only(
-/*==================*/
-	buf_pool_t*	buf_pool)
+/** @return a buffer block from the buf_pool.free list
+@retval	NULL	if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
 {
 	buf_block_t*	block;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
 	block = reinterpret_cast<buf_block_t*>(
-		UT_LIST_GET_FIRST(buf_pool->free));
+		UT_LIST_GET_FIRST(buf_pool.free));
 
 	while (block != NULL) {
-
 		ut_ad(block->page.in_free_list);
 		ut_d(block->page.in_free_list = FALSE);
-		ut_ad(!block->page.in_flush_list);
+		ut_ad(!block->page.oldest_modification());
 		ut_ad(!block->page.in_LRU_list);
-		ut_a(!buf_page_in_file(&block->page));
-		UT_LIST_REMOVE(buf_pool->free, &block->page);
-
-		if (buf_pool->curr_size >= buf_pool->old_size
-		    || UT_LIST_GET_LEN(buf_pool->withdraw)
-			>= buf_pool->withdraw_target
-		    || !buf_block_will_withdrawn(buf_pool, block)) {
-			/* found valid free block */
-			buf_page_mutex_enter(block);
+		ut_a(!block->page.in_file());
+		UT_LIST_REMOVE(buf_pool.free, &block->page);
+
+		if (buf_pool.curr_size >= buf_pool.old_size
+		    || UT_LIST_GET_LEN(buf_pool.withdraw)
+			>= buf_pool.withdraw_target
+		    || !buf_pool.will_be_withdrawn(block->page)) {
 			/* No adaptive hash index entries may point to
 			a free block. */
 			assert_block_ahi_empty(block);
 
-			buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
+			block->page.set_state(BUF_BLOCK_MEMORY);
 			MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
-
-			ut_ad(buf_pool_from_block(block) == buf_pool);
-
-			buf_page_mutex_exit(block);
 			break;
 		}
 
 		/* This should be withdrawn */
 		UT_LIST_ADD_LAST(
-			buf_pool->withdraw,
+			buf_pool.withdraw,
 			&block->page);
-		ut_d(block->in_withdraw_list = TRUE);
+		ut_d(block->in_withdraw_list = true);
 
 		block = reinterpret_cast<buf_block_t*>(
-			UT_LIST_GET_FIRST(buf_pool->free));
+			UT_LIST_GET_FIRST(buf_pool.free));
 	}
 
 	return(block);
@@ -832,162 +328,140 @@ Checks how much of buf_pool is occupied by non-data objects like
 AHI, lock heaps etc. Depending on the size of non-data objects this
 function will either assert or issue a warning and switch on the
 status monitor. */
-static
-void
-buf_LRU_check_size_of_non_data_objects(
-/*===================================*/
-	const buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+static void buf_LRU_check_size_of_non_data_objects()
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
+  mysql_mutex_assert_owner(&buf_pool.mutex);
 
-	if (!recv_recovery_is_on()
-	    && buf_pool->curr_size == buf_pool->old_size
-	    && UT_LIST_GET_LEN(buf_pool->free)
-	    + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
+  if (recv_recovery_is_on() || buf_pool.curr_size != buf_pool.old_size)
+    return;
 
-		ib::fatal() << "Over 95 percent of the buffer pool is"
-			" occupied by lock heaps"
+  const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+  if (s < buf_pool.curr_size / 20)
+    ib::fatal() << "Over 95 percent of the buffer pool is"
+            " occupied by lock heaps"
 #ifdef BTR_CUR_HASH_ADAPT
-			" or the adaptive hash index!"
+            " or the adaptive hash index"
 #endif /* BTR_CUR_HASH_ADAPT */
-			" Check that your transactions do not set too many"
-			" row locks, or review if"
-			" innodb_buffer_pool_size="
-			<< (buf_pool->curr_size >> (20U - srv_page_size_shift))
-			<< "M could be bigger.";
-	} else if (!recv_recovery_is_on()
-		   && buf_pool->curr_size == buf_pool->old_size
-		   && (UT_LIST_GET_LEN(buf_pool->free)
-		       + UT_LIST_GET_LEN(buf_pool->LRU))
-		   < buf_pool->curr_size / 3) {
-
-		if (!buf_lru_switched_on_innodb_mon && srv_monitor_event) {
-
-			/* Over 67 % of the buffer pool is occupied by lock
-			heaps or the adaptive hash index. This may be a memory
-			leak! */
-
-			ib::warn() << "Over 67 percent of the buffer pool is"
-				" occupied by lock heaps"
+            "! Check that your transactions do not set too many"
+            " row locks, or review if innodb_buffer_pool_size="
+                << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+                << "M could be bigger.";
+
+  if (s < buf_pool.curr_size / 3)
+  {
+    if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+    {
+      /* Over 67 % of the buffer pool is occupied by lock heaps or
+      the adaptive hash index. This may be a memory leak! */
+      ib::warn() << "Over 67 percent of the buffer pool is"
+              " occupied by lock heaps"
 #ifdef BTR_CUR_HASH_ADAPT
-				" or the adaptive hash index!"
+              " or the adaptive hash index"
 #endif /* BTR_CUR_HASH_ADAPT */
-				" Check that your transactions do not"
-				" set too many row locks."
-				" innodb_buffer_pool_size="
-				<< (buf_pool->curr_size >>
-				    (20U - srv_page_size_shift)) << "M."
-				" Starting the InnoDB Monitor to print"
-				" diagnostics.";
-
-			buf_lru_switched_on_innodb_mon = true;
-			srv_print_innodb_monitor = TRUE;
-			os_event_set(srv_monitor_event);
-		}
-
-	} else if (buf_lru_switched_on_innodb_mon) {
-
-		/* Switch off the InnoDB Monitor; this is a simple way
-		to stop the monitor if the situation becomes less urgent,
-		but may also surprise users if the user also switched on the
-		monitor! */
-
-		buf_lru_switched_on_innodb_mon = false;
-		srv_print_innodb_monitor = FALSE;
-	}
+              "! Check that your transactions do not set too many row locks."
+              " innodb_buffer_pool_size="
+                 << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+                 << "M. Starting the InnoDB Monitor to print diagnostics.";
+      buf_lru_switched_on_innodb_mon= true;
+      srv_print_innodb_monitor= TRUE;
+      srv_monitor_timer_schedule_now();
+    }
+  }
+  else if (buf_lru_switched_on_innodb_mon)
+  {
+    /* Switch off the InnoDB Monitor; this is a simple way to stop the
+    monitor if the situation becomes less urgent, but may also
+    surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+    buf_lru_switched_on_innodb_mon= false;
+    srv_print_innodb_monitor= FALSE;
+  }
 }
 
-/******************************************************************//**
-Returns a free block from the buf_pool. The block is taken off the
-free list. If free list is empty, blocks are moved from the end of the
-LRU list to the free list.
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
 This function is called from a user thread when it needs a clean
 block to read in a page. Note that we only ever get a block from
 the free list. Even when we flush a page or find a page in LRU scan
 we put it to free list to be used.
 * iteration 0:
-  * get a block from free list, success:done
-  * if buf_pool->try_LRU_scan is set
-    * scan LRU up to srv_LRU_scan_depth to find a clean block
-    * the above will put the block on free list
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
     * success:retry the free list
-  * flush one dirty page from tail of LRU to disk
-    * the above will put the block on free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on the buf_pool.free list
     * success: retry the free list
-* iteration 1:
-  * same as iteration 0 except:
-    * scan whole LRU list
-    * scan LRU list even if buf_pool->try_LRU_scan is not set
-* iteration > 1:
-  * same as iteration 1 but sleep 10ms
-@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
-buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
-	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex  whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
 {
-	buf_block_t*	block		= NULL;
-	bool		freed		= false;
 	ulint		n_iterations	= 0;
 	ulint		flush_failures	= 0;
-
 	MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
-loop:
-	buf_pool_mutex_enter(buf_pool);
-
-	buf_LRU_check_size_of_non_data_objects(buf_pool);
+	if (have_mutex) {
+		mysql_mutex_assert_owner(&buf_pool.mutex);
+		goto got_mutex;
+	}
+	mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+	buf_LRU_check_size_of_non_data_objects();
+	buf_block_t* block;
 
 	DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
 		if (!buf_lru_free_blocks_error_printed) {
 			n_iterations = 21;
 			goto not_found;});
 
+retry:
 	/* If there is a block in the free list, take it */
-	block = buf_LRU_get_free_only(buf_pool);
-
-	if (block != NULL) {
-
-		buf_pool_mutex_exit(buf_pool);
-		ut_ad(buf_pool_from_block(block) == buf_pool);
+	if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+		if (!have_mutex) {
+			mysql_mutex_unlock(&buf_pool.mutex);
+		}
 		memset(&block->page.zip, 0, sizeof block->page.zip);
-
-		block->page.flush_observer = NULL;
-		return(block);
+		return block;
 	}
 
 	MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
-	freed = false;
-	if (buf_pool->try_LRU_scan || n_iterations > 0) {
+	if (n_iterations || buf_pool.try_LRU_scan) {
 		/* If no block was in the free list, search from the
 		end of the LRU list and try to free a block there.
 		If we are doing for the first time we'll scan only
 		tail of the LRU list otherwise we scan the whole LRU
 		list. */
-		freed = buf_LRU_scan_and_free_block(
-			buf_pool, n_iterations > 0);
-
-		if (!freed && n_iterations == 0) {
-			/* Tell other threads that there is no point
-			in scanning the LRU list. This flag is set to
-			TRUE again when we flush a batch from this
-			buffer pool. */
-			buf_pool->try_LRU_scan = FALSE;
-
-			/* Also tell the page_cleaner thread that
-			there is work for it to do. */
-			os_event_set(buf_flush_event);
+		if (buf_LRU_scan_and_free_block(n_iterations
+						? ULINT_UNDEFINED : 100)) {
+			goto retry;
 		}
+
+		/* Tell other threads that there is no point
+		in scanning the LRU list. */
+		buf_pool.try_LRU_scan = false;
+	}
+
+	for (;;) {
+		if ((block = buf_LRU_get_free_only()) != nullptr) {
+			goto got_block;
+		}
+		if (!buf_pool.n_flush_LRU_) {
+			break;
+		}
+		my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex);
 	}
 
 #ifndef DBUG_OFF
 not_found:
 #endif
-
-	buf_pool_mutex_exit(buf_pool);
-
-	if (freed) {
-		goto loop;
-	}
+	mysql_mutex_unlock(&buf_pool.mutex);
 
 	if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
 	    && srv_buf_pool_old_size == srv_buf_pool_size) {
@@ -998,7 +472,7 @@ not_found:
 			" flush a page!"
 			" Consider increasing innodb_buffer_pool_size."
 			" Pending flushes (fsync) log: "
-			<< fil_n_pending_log_flushes
+			<< log_sys.get_pending_flushes()
 			<< "; buffer pool: "
 			<< fil_n_pending_tablespace_flushes
 			<< ". " << os_n_file_reads << " OS file reads, "
@@ -1009,83 +483,65 @@ not_found:
 		buf_lru_free_blocks_error_printed = true;
 	}
 
-	/* If we have scanned the whole LRU and still are unable to
-	find a free block then we should sleep here to let the
-	page_cleaner do an LRU batch for us. */
-
-	if (!srv_read_only_mode) {
-		os_event_set(buf_flush_event);
-	}
-
 	if (n_iterations > 1) {
-
 		MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
-		os_thread_sleep(10000);
 	}
 
 	/* No free block was found: try to flush the LRU list.
-	This call will flush one page from the LRU and put it on the
-	free list. That means that the free block is up for grabs for
-	all user threads.
+	The freed blocks will be up for grabs for all threads.
 
-	TODO: A more elegant way would have been to return the freed
+	TODO: A more elegant way would have been to return one freed
 	up block to the caller here but the code that deals with
-	removing the block from page_hash and LRU_list is fairly
-	involved (particularly in case of compressed pages). We
+	removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
+	involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
 	can do that in a separate patch sometime in future. */
 
-	if (!buf_flush_single_page_from_LRU(buf_pool)) {
+	if (!buf_flush_LRU(innodb_lru_flush_size)) {
 		MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
 		++flush_failures;
 	}
 
-	srv_stats.buf_pool_wait_free.inc();
-
 	n_iterations++;
-
-	goto loop;
+	mysql_mutex_lock(&buf_pool.mutex);
+	buf_pool.stat.LRU_waits++;
+	goto got_mutex;
 }
 
-/*******************************************************************//**
-Moves the LRU_old pointer so that the length of the old blocks list
+/** Move the LRU_old pointer so that the length of the old blocks list
 is inside the allowed limits. */
-UNIV_INLINE
-void
-buf_LRU_old_adjust_len(
-/*===================*/
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+static void buf_LRU_old_adjust_len()
 {
 	ulint	old_len;
 	ulint	new_len;
 
-	ut_a(buf_pool->LRU_old);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
-	ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+	ut_a(buf_pool.LRU_old);
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+	ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
 	compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
 			    > BUF_LRU_OLD_RATIO_DIV
 			    * (BUF_LRU_OLD_TOLERANCE + 5));
 	compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN);
 
 #ifdef UNIV_LRU_DEBUG
-	/* buf_pool->LRU_old must be the first item in the LRU list
+	/* buf_pool.LRU_old must be the first item in the LRU list
 	whose "old" flag is set. */
-	ut_a(buf_pool->LRU_old->old);
-	ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
-	     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
-	ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
-	     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+	ut_a(buf_pool.LRU_old->old);
+	ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+	     || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+	ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+	     || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
 #endif /* UNIV_LRU_DEBUG */
 
-	old_len = buf_pool->LRU_old_len;
-	new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
-			 * buf_pool->LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
-			 UT_LIST_GET_LEN(buf_pool->LRU)
+	old_len = buf_pool.LRU_old_len;
+	new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+			 * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+			 UT_LIST_GET_LEN(buf_pool.LRU)
 			 - (BUF_LRU_OLD_TOLERANCE
 			    + BUF_LRU_NON_OLD_MIN_LEN));
 
 	for (;;) {
-		buf_page_t*	LRU_old = buf_pool->LRU_old;
+		buf_page_t*	LRU_old = buf_pool.LRU_old;
 
 		ut_a(LRU_old);
 		ut_ad(LRU_old->in_LRU_list);
@@ -1097,174 +553,130 @@ buf_LRU_old_adjust_len(
 
 		if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
 
-			buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV(
+			buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV(
 				LRU, LRU_old);
 #ifdef UNIV_LRU_DEBUG
 			ut_a(!LRU_old->old);
 #endif /* UNIV_LRU_DEBUG */
-			old_len = ++buf_pool->LRU_old_len;
-			buf_page_set_old(LRU_old, TRUE);
+			old_len = ++buf_pool.LRU_old_len;
+			LRU_old->set_old(true);
 
 		} else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
 
-			buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
-			old_len = --buf_pool->LRU_old_len;
-			buf_page_set_old(LRU_old, FALSE);
+			buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+			old_len = --buf_pool.LRU_old_len;
+			LRU_old->set_old(false);
 		} else {
 			return;
 		}
 	}
 }
 
-/*******************************************************************//**
-Initializes the old blocks pointer in the LRU list. This function should be
+/** Initialize the old blocks pointer in the LRU list. This function should be
 called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
-static
-void
-buf_LRU_old_init(
-/*=============*/
-	buf_pool_t*	buf_pool)
+static void buf_LRU_old_init()
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN);
 
 	/* We first initialize all blocks in the LRU list as old and then use
 	the adjust function to move the LRU_old pointer to the right
 	position */
 
-	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU);
 	     bpage != NULL;
 	     bpage = UT_LIST_GET_PREV(LRU, bpage)) {
 
 		ut_ad(bpage->in_LRU_list);
-		ut_ad(buf_page_in_file(bpage));
 
 		/* This loop temporarily violates the
-		assertions of buf_page_set_old(). */
-		bpage->old = TRUE;
+		assertions of buf_page_t::set_old(). */
+		bpage->old = true;
 	}
 
-	buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU);
-	buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU);
+	buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU);
+	buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU);
 
-	buf_LRU_old_adjust_len(buf_pool);
+	buf_LRU_old_adjust_len();
 }
 
-/******************************************************************//**
-Remove a block from the unzip_LRU list if it belonged to the list. */
-static
-void
-buf_unzip_LRU_remove_block_if_needed(
-/*=================================*/
-	buf_page_t*	bpage)	/*!< in/out: control block */
+/** Remove a block from the unzip_LRU list if it belonged to the list.
+@param[in]	bpage	control block */
+static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(bpage->in_file());
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 
-	ut_ad(buf_page_in_file(bpage));
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	if (buf_page_belongs_to_unzip_LRU(bpage)) {
+	if (bpage->belongs_to_unzip_LRU()) {
 		buf_block_t*	block = reinterpret_cast<buf_block_t*>(bpage);
 
 		ut_ad(block->in_unzip_LRU_list);
-		ut_d(block->in_unzip_LRU_list = FALSE);
+		ut_d(block->in_unzip_LRU_list = false);
 
-		UT_LIST_REMOVE(buf_pool->unzip_LRU, block);
+		UT_LIST_REMOVE(buf_pool.unzip_LRU, block);
 	}
 }
 
-/******************************************************************//**
-Adjust LRU hazard pointers if needed. */
-void
-buf_LRU_adjust_hp(
-/*==============*/
-	buf_pool_t*		buf_pool,/*!< in: buffer pool instance */
-	const buf_page_t*	bpage)	/*!< in: control block */
-{
-	buf_pool->lru_hp.adjust(bpage);
-	buf_pool->lru_scan_itr.adjust(bpage);
-	buf_pool->single_scan_itr.adjust(bpage);
-}
-
-/******************************************************************//**
-Removes a block from the LRU list. */
-UNIV_INLINE
-void
-buf_LRU_remove_block(
-/*=================*/
-	buf_page_t*	bpage)	/*!< in: control block */
+/** Removes a block from the LRU list.
+@param[in]	bpage	control block */
+static inline void buf_LRU_remove_block(buf_page_t* bpage)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	ut_a(buf_page_in_file(bpage));
-
-	ut_ad(bpage->in_LRU_list);
-
 	/* Important that we adjust the hazard pointers before removing
 	bpage from the LRU list. */
-	buf_LRU_adjust_hp(buf_pool, bpage);
+	buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage);
 
 	/* If the LRU_old pointer is defined and points to just this block,
 	move it backward one step */
 
-	if (bpage == buf_pool->LRU_old) {
+	if (bpage == buf_pool.LRU_old) {
 
 		/* Below: the previous block is guaranteed to exist,
 		because the LRU_old pointer is only allowed to differ
 		by BUF_LRU_OLD_TOLERANCE from strict
-		buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+		buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
 		list length. */
-		buf_page_t*	prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
-
 		ut_a(prev_bpage);
 #ifdef UNIV_LRU_DEBUG
 		ut_a(!prev_bpage->old);
 #endif /* UNIV_LRU_DEBUG */
-		buf_pool->LRU_old = prev_bpage;
-		buf_page_set_old(prev_bpage, TRUE);
+		buf_pool.LRU_old = prev_bpage;
+		prev_bpage->set_old(true);
 
-		buf_pool->LRU_old_len++;
+		buf_pool.LRU_old_len++;
 	}
 
-	/* Remove the block from the LRU list */
-	UT_LIST_REMOVE(buf_pool->LRU, bpage);
-	ut_d(bpage->in_LRU_list = FALSE);
-
-	buf_pool->stat.LRU_bytes -= bpage->physical_size();
+	buf_pool.stat.LRU_bytes -= bpage->physical_size();
 
 	buf_unzip_LRU_remove_block_if_needed(bpage);
 
 	/* If the LRU list is so short that LRU_old is not defined,
 	clear the "old" flags and return */
-	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
+	if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) {
 
-		for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+		for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
 		     bpage != NULL;
 		     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
 
 			/* This loop temporarily violates the
-			assertions of buf_page_set_old(). */
-			bpage->old = FALSE;
+			assertions of buf_page_t::set_old(). */
+			bpage->old = false;
 		}
 
-		buf_pool->LRU_old = NULL;
-		buf_pool->LRU_old_len = 0;
+		buf_pool.LRU_old = NULL;
+		buf_pool.LRU_old_len = 0;
 
 		return;
 	}
 
-	ut_ad(buf_pool->LRU_old);
+	ut_ad(buf_pool.LRU_old);
 
 	/* Update the LRU_old_len field if necessary */
-	if (buf_page_is_old(bpage)) {
-
-		buf_pool->LRU_old_len--;
+	if (bpage->old) {
+		buf_pool.LRU_old_len--;
 	}
 
 	/* Adjust the length of the old block list if necessary */
-	buf_LRU_old_adjust_len(buf_pool);
+	buf_LRU_old_adjust_len();
 }
 
 /******************************************************************//**
@@ -1276,19 +688,15 @@ buf_unzip_LRU_add_block(
 	ibool		old)	/*!< in: TRUE if should be put to the end
 				of the list, else put to the start */
 {
-	buf_pool_t*	buf_pool = buf_pool_from_block(block);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
-
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_a(block->page.belongs_to_unzip_LRU());
 	ut_ad(!block->in_unzip_LRU_list);
-	ut_d(block->in_unzip_LRU_list = TRUE);
+	ut_d(block->in_unzip_LRU_list = true);
 
 	if (old) {
-		UT_LIST_ADD_LAST(buf_pool->unzip_LRU, block);
+		UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block);
 	} else {
-		UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, block);
+		UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block);
 	}
 }
 
@@ -1296,326 +704,285 @@ buf_unzip_LRU_add_block(
 Adds a block to the LRU list. Please make sure that the page_size is
 already set when invoking the function, so that we can get correct
 page_size from the buffer page when adding a block into LRU */
-UNIV_INLINE
 void
-buf_LRU_add_block_low(
-/*==================*/
+buf_LRU_add_block(
 	buf_page_t*	bpage,	/*!< in: control block */
-	ibool		old)	/*!< in: TRUE if should be put to the old blocks
+	bool		old)	/*!< in: true if should be put to the old blocks
 				in the LRU list, else put to the start; if the
 				LRU list is very short, the block is added to
 				the start, regardless of this parameter */
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	ut_a(buf_page_in_file(bpage));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
 	ut_ad(!bpage->in_LRU_list);
 
-	if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
+	if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) {
 
-		UT_LIST_ADD_FIRST(buf_pool->LRU, bpage);
+		UT_LIST_ADD_FIRST(buf_pool.LRU, bpage);
 
-		bpage->freed_page_clock = buf_pool->freed_page_clock;
+		bpage->freed_page_clock = buf_pool.freed_page_clock
+			& ((1U << 31) - 1);
 	} else {
 #ifdef UNIV_LRU_DEBUG
-		/* buf_pool->LRU_old must be the first item in the LRU list
+		/* buf_pool.LRU_old must be the first item in the LRU list
 		whose "old" flag is set. */
-		ut_a(buf_pool->LRU_old->old);
-		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
-		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
-		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
-		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+		ut_a(buf_pool.LRU_old->old);
+		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+		     || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+		     || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
 #endif /* UNIV_LRU_DEBUG */
-		UT_LIST_INSERT_AFTER(buf_pool->LRU, buf_pool->LRU_old,
+		UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old,
 			bpage);
 
-		buf_pool->LRU_old_len++;
+		buf_pool.LRU_old_len++;
 	}
 
 	ut_d(bpage->in_LRU_list = TRUE);
 
-	incr_LRU_size_in_bytes(bpage, buf_pool);
+	incr_LRU_size_in_bytes(bpage);
 
-	if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
+	if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) {
 
-		ut_ad(buf_pool->LRU_old);
+		ut_ad(buf_pool.LRU_old);
 
 		/* Adjust the length of the old block list if necessary */
 
-		buf_page_set_old(bpage, old);
-		buf_LRU_old_adjust_len(buf_pool);
+		bpage->set_old(old);
+		buf_LRU_old_adjust_len();
 
-	} else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
+	} else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) {
 
 		/* The LRU list is now long enough for LRU_old to become
 		defined: init it */
 
-		buf_LRU_old_init(buf_pool);
+		buf_LRU_old_init();
 	} else {
-		buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+		bpage->set_old(buf_pool.LRU_old != NULL);
 	}
 
 	/* If this is a zipped block with decompressed frame as well
 	then put it on the unzip_LRU list */
-	if (buf_page_belongs_to_unzip_LRU(bpage)) {
+	if (bpage->belongs_to_unzip_LRU()) {
 		buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
 	}
 }
 
-/******************************************************************//**
-Adds a block to the LRU list. Please make sure that the page_size is
-already set when invoking the function, so that we can get correct
-page_size from the buffer page when adding a block into LRU */
-void
-buf_LRU_add_block(
-/*==============*/
-	buf_page_t*	bpage,	/*!< in: control block */
-	ibool		old)	/*!< in: TRUE if should be put to the old
-				blocks in the LRU list, else put to the start;
-				if the LRU list is very short, the block is
-				added to the start, regardless of this
-				parameter */
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage)
 {
-	buf_LRU_add_block_low(bpage, old);
-}
+  ut_ad(bpage->in_file());
 
-/******************************************************************//**
-Moves a block to the start of the LRU list. */
-void
-buf_LRU_make_block_young(
-/*=====================*/
-	buf_page_t*	bpage)	/*!< in: control block */
-{
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+  mysql_mutex_lock(&buf_pool.mutex);
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+  if (UNIV_UNLIKELY(bpage->old))
+    buf_pool.stat.n_pages_made_young++;
 
-	if (bpage->old) {
-		buf_pool->stat.n_pages_made_young++;
-	}
+  buf_LRU_remove_block(bpage);
+  buf_LRU_add_block(bpage, false);
 
-	buf_LRU_remove_block(bpage);
-	buf_LRU_add_block_low(bpage, FALSE);
+  mysql_mutex_unlock(&buf_pool.mutex);
 }
 
-/******************************************************************//**
-Try to free a block.  If bpage is a descriptor of a compressed-only
-page, the descriptor object will be freed as well.
-
-NOTE: If this function returns true, it will temporarily
-release buf_pool->mutex.  Furthermore, the page frame will no longer be
-accessible via bpage.
-
-The caller must hold buf_pool->mutex and must not hold any
-buf_page_get_mutex() when calling this function.
-@return true if freed, false otherwise. */
-bool
-buf_LRU_free_page(
-/*===============*/
-	buf_page_t*	bpage,	/*!< in: block to be freed */
-	bool		zip)	/*!< in: true if should remove also the
-				compressed page of an uncompressed page */
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage      block to be freed
+@param zip        whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
 {
-	buf_page_t*	b = NULL;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	const page_id_t id(bpage->id());
+	buf_page_t*	b = nullptr;
 
-	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id);
-
-	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_in_file(bpage));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(bpage->in_file());
 	ut_ad(bpage->in_LRU_list);
 
-	rw_lock_x_lock(hash_lock);
-	mutex_enter(block_mutex);
+	/* First, perform a quick check before we acquire hash_lock. */
+	if (!bpage->can_relocate()) {
+		return false;
+	}
 
-	if (!buf_page_can_relocate(bpage)) {
+	/* We must hold an exclusive hash_lock to prevent
+	bpage->can_relocate() from changing due to a concurrent
+	execution of buf_page_get_low(). */
+	const ulint fold = id.fold();
+	page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
+	hash_lock->write_lock();
+	lsn_t oldest_modification = bpage->oldest_modification_acquire();
 
+	if (UNIV_UNLIKELY(!bpage->can_relocate())) {
 		/* Do not free buffer fixed and I/O-fixed blocks. */
 		goto func_exit;
 	}
 
+	if (oldest_modification == 1) {
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		oldest_modification = bpage->oldest_modification();
+		if (oldest_modification) {
+			ut_ad(oldest_modification == 1);
+			buf_pool.delete_from_flush_list(bpage);
+		}
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+		ut_ad(!bpage->oldest_modification());
+		oldest_modification = 0;
+	}
+
 	if (zip || !bpage->zip.data) {
 		/* This would completely free the block. */
 		/* Do not completely free dirty blocks. */
 
-		if (bpage->oldest_modification) {
+		if (oldest_modification) {
 			goto func_exit;
 		}
-	} else if (bpage->oldest_modification > 0
-		   && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-
-		ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
-
+	} else if (oldest_modification
+		   && bpage->state() != BUF_BLOCK_FILE_PAGE) {
 func_exit:
-		rw_lock_x_unlock(hash_lock);
-		mutex_exit(block_mutex);
+		hash_lock->write_unlock();
 		return(false);
 
-	} else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+	} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
 		b = buf_page_alloc_descriptor();
 		ut_a(b);
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
 		new (b) buf_page_t(*bpage);
+		b->set_state(BUF_BLOCK_ZIP_PAGE);
 	}
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_in_file(bpage));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(bpage->in_file());
 	ut_ad(bpage->in_LRU_list);
-	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
 
 	DBUG_PRINT("ib_buf", ("free page %u:%u",
-			      bpage->id.space(), bpage->id.page_no()));
+			      id.space(), id.page_no()));
 
-	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
-	ut_ad(buf_page_can_relocate(bpage));
+	ut_ad(bpage->can_relocate());
 
-	if (!buf_LRU_block_remove_hashed(bpage, zip)) {
+	if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
+		ut_ad(!b);
+		mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
 		return(true);
 	}
 
-	/* buf_LRU_block_remove_hashed() releases the hash_lock */
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
-	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
+	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
 	then it was a compressed page with an uncompressed frame and
 	we are interested in freeing only the uncompressed frame.
 	Therefore we have to reinsert the compressed page descriptor
 	into the LRU and page_hash (and possibly flush_list).
-	if b == NULL then it was a regular page that has been freed */
+	if !b then it was a regular page that has been freed */
 
-	if (b != NULL) {
+	if (UNIV_LIKELY_NULL(b)) {
 		buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
 
-		rw_lock_x_lock(hash_lock);
-
-		mutex_enter(block_mutex);
-
-		ut_a(!buf_page_hash_get_low(buf_pool, b->id));
-
-		b->state = b->oldest_modification
-			? BUF_BLOCK_ZIP_DIRTY
-			: BUF_BLOCK_ZIP_PAGE;
-
+		ut_ad(!buf_pool.page_hash_get_low(id, fold));
 		ut_ad(b->zip_size());
 
-		/* The fields in_page_hash and in_LRU_list of
+		/* The field in_LRU_list of
 		the to-be-freed block descriptor should have
 		been cleared in
 		buf_LRU_block_remove_hashed(), which
 		invokes buf_LRU_remove_block(). */
-		ut_ad(!bpage->in_page_hash);
 		ut_ad(!bpage->in_LRU_list);
 
 		/* bpage->state was BUF_BLOCK_FILE_PAGE because
-		b != NULL. The type cast below is thus valid. */
+		b != nullptr. The type cast below is thus valid. */
 		ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
 
 		/* The fields of bpage were copied to b before
 		buf_LRU_block_remove_hashed() was invoked. */
 		ut_ad(!b->in_zip_hash);
-		ut_ad(b->in_page_hash);
 		ut_ad(b->in_LRU_list);
+		ut_ad(b->in_page_hash);
 
-		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
-			    b->id.fold(), b);
+		HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, b);
 
 		/* Insert b where bpage was in the LRU list. */
-		if (prev_b != NULL) {
+		if (prev_b) {
 			ulint	lru_len;
 
 			ut_ad(prev_b->in_LRU_list);
-			ut_ad(buf_page_in_file(prev_b));
+			ut_ad(prev_b->in_file());
 
-			UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, b);
+			UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b);
 
-			incr_LRU_size_in_bytes(b, buf_pool);
+			incr_LRU_size_in_bytes(b);
 
-			if (buf_page_is_old(b)) {
-				buf_pool->LRU_old_len++;
-				if (buf_pool->LRU_old
+			if (b->is_old()) {
+				buf_pool.LRU_old_len++;
+				if (buf_pool.LRU_old
 				    == UT_LIST_GET_NEXT(LRU, b)) {
 
-					buf_pool->LRU_old = b;
+					buf_pool.LRU_old = b;
 				}
 			}
 
-			lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+			lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
 
 			if (lru_len > BUF_LRU_OLD_MIN_LEN) {
-				ut_ad(buf_pool->LRU_old);
+				ut_ad(buf_pool.LRU_old);
 				/* Adjust the length of the
 				old block list if necessary */
-				buf_LRU_old_adjust_len(buf_pool);
+				buf_LRU_old_adjust_len();
 			} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
 				/* The LRU list is now long
 				enough for LRU_old to become
 				defined: init it */
-				buf_LRU_old_init(buf_pool);
+				buf_LRU_old_init();
 			}
 #ifdef UNIV_LRU_DEBUG
 			/* Check that the "old" flag is consistent
 			in the block and its neighbours. */
-			buf_page_set_old(b, buf_page_is_old(b));
+			b->set_old(b->is_old());
 #endif /* UNIV_LRU_DEBUG */
 		} else {
 			ut_d(b->in_LRU_list = FALSE);
-			buf_LRU_add_block_low(b, buf_page_is_old(b));
+			buf_LRU_add_block(b, b->old);
 		}
 
-		if (b->state == BUF_BLOCK_ZIP_PAGE) {
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-			buf_LRU_insert_zip_clean(b);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-		} else {
-			/* Relocate on buf_pool->flush_list. */
-			buf_flush_relocate_on_flush_list(bpage, b);
-		}
+		buf_flush_relocate_on_flush_list(bpage, b);
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 
-		bpage->zip.data = NULL;
+		bpage->zip.data = nullptr;
 
 		page_zip_set_size(&bpage->zip, 0);
 
-		mutex_exit(block_mutex);
-
 		/* Prevent buf_page_get_gen() from
 		decompressing the block while we release
-		buf_pool->mutex and block_mutex. */
-		block_mutex = buf_page_get_mutex(b);
-
-		mutex_enter(block_mutex);
-
-		buf_page_set_sticky(b);
-
-		mutex_exit(block_mutex);
-
-		rw_lock_x_unlock(hash_lock);
+		hash_lock. */
+		b->set_io_fix(BUF_IO_PIN);
+		hash_lock->write_unlock();
+	} else if (!zip) {
+		hash_lock->write_unlock();
 	}
 
-	buf_pool_mutex_exit(buf_pool);
-
-	/* Remove possible adaptive hash index on the page.
-	The page was declared uninitialized by
-	buf_LRU_block_remove_hashed().  We need to flag
-	the contents of the page valid (which it still is) in
-	order to avoid bogus Valgrind or MSAN warnings.*/
 	buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
 
-	MEM_MAKE_DEFINED(block->frame, srv_page_size);
-	btr_search_drop_page_hash_index(block);
-	MEM_UNDEFINED(block->frame, srv_page_size);
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (b) {
-		mutex_enter(block_mutex);
-
-		buf_page_unset_sticky(b);
+#ifdef BTR_CUR_HASH_ADAPT
+	if (block->index) {
+		mysql_mutex_unlock(&buf_pool.mutex);
+
+		/* Remove the adaptive hash index on the page.
+		The page was declared uninitialized by
+		buf_LRU_block_remove_hashed().  We need to flag
+		the contents of the page valid (which it still is) in
+		order to avoid bogus Valgrind or MSAN warnings.*/
+
+		MEM_MAKE_DEFINED(block->frame, srv_page_size);
+		btr_search_drop_page_hash_index(block);
+		MEM_UNDEFINED(block->frame, srv_page_size);
+
+		if (UNIV_LIKELY_NULL(b)) {
+			ut_ad(b->zip_size());
+			b->io_unfix();
+		}
 
-		mutex_exit(block_mutex);
+		mysql_mutex_lock(&buf_pool.mutex);
+	} else
+#endif
+	if (UNIV_LIKELY_NULL(b)) {
+		ut_ad(b->zip_size());
+		b->io_unfix();
 	}
 
 	buf_LRU_block_free_hashed_page(block);
@@ -1631,106 +998,93 @@ buf_LRU_block_free_non_file_page(
 	buf_block_t*	block)	/*!< in: block, must not contain a file page */
 {
 	void*		data;
-	buf_pool_t*	buf_pool = buf_pool_from_block(block);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_mutex_own(block));
-
-	switch (buf_block_get_state(block)) {
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_READY_FOR_USE:
-		break;
-	default:
-		ut_error;
-	}
 
+	ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
 	assert_block_ahi_empty(block);
 	ut_ad(!block->page.in_free_list);
-	ut_ad(!block->page.in_flush_list);
+	ut_ad(!block->page.oldest_modification());
 	ut_ad(!block->page.in_LRU_list);
 
-	buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+	block->page.set_state(BUF_BLOCK_NOT_USED);
 
 	MEM_UNDEFINED(block->frame, srv_page_size);
 	/* Wipe page_no and space_id */
-	memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
-	memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4);
+	static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+	memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+	static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+		      "not perfect alignment");
+	memset_aligned<2>(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			  0xfe, 4);
 	data = block->page.zip.data;
 
 	if (data != NULL) {
 		block->page.zip.data = NULL;
-		buf_page_mutex_exit(block);
-		buf_pool_mutex_exit_forbid(buf_pool);
+		buf_pool_mutex_exit_forbid();
 
 		ut_ad(block->zip_size());
 
-		buf_buddy_free(buf_pool, data, block->zip_size());
-
-		buf_pool_mutex_exit_allow(buf_pool);
-		buf_page_mutex_enter(block);
+		buf_buddy_free(data, block->zip_size());
 
+		buf_pool_mutex_exit_allow();
 		page_zip_set_size(&block->page.zip, 0);
 	}
 
-	if (buf_pool->curr_size < buf_pool->old_size
-	    && UT_LIST_GET_LEN(buf_pool->withdraw) < buf_pool->withdraw_target
-	    && buf_block_will_withdrawn(buf_pool, block)) {
+	if (buf_pool.curr_size < buf_pool.old_size
+	    && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target
+	    && buf_pool.will_be_withdrawn(block->page)) {
 		/* This should be withdrawn */
 		UT_LIST_ADD_LAST(
-			buf_pool->withdraw,
+			buf_pool.withdraw,
 			&block->page);
-		ut_d(block->in_withdraw_list = TRUE);
+		ut_d(block->in_withdraw_list = true);
 	} else {
-		UT_LIST_ADD_FIRST(buf_pool->free, &block->page);
-		ut_d(block->page.in_free_list = TRUE);
+		UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
+		ut_d(block->page.in_free_list = true);
+		pthread_cond_signal(&buf_pool.done_free);
 	}
 
 	MEM_NOACCESS(block->frame, srv_page_size);
 }
 
-/******************************************************************//**
-Takes a block out of the LRU list and page hash table.
-If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+/** Release a BUF_BLOCK_MEMORY block to buf_pool, acquiring buf_pool.mutex. */
+ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block)
+{
+  ut_ad(this == &buf_pool); /* only the global buf_pool is expected */
+  mysql_mutex_lock(&mutex);
+  buf_LRU_block_free_non_file_page(block); /* asserts BUF_BLOCK_MEMORY */
+  mysql_mutex_unlock(&mutex);
+}
+
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(),
 the object will be freed.
 
-The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
-and the appropriate hash_lock. This function will release the
-buf_page_get_mutex() and the hash_lock.
+@param bpage      buffer block
+@param id         page identifier
+@param hash_lock  buf_pool.page_hash latch (will be released here)
+@param zip        whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
 
 If a compressed page is freed other compressed pages may be relocated.
 @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
 caller needs to free the page to the free list
 @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
 this case the block is already returned to the buddy allocator. */
-static
-bool
-buf_LRU_block_remove_hashed(
-/*========================*/
-	buf_page_t*	bpage,	/*!< in: block, must contain a file page and
-				be in a state where it can be freed; there
-				may or may not be a hash index to the page */
-	bool		zip)	/*!< in: true if should remove also the
-				compressed page of an uncompressed page */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+                                        page_hash_latch *hash_lock, bool zip)
 {
-	const buf_page_t*	hashed_bpage;
-	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
-	rw_lock_t*		hash_lock;
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-
-	hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id);
-
-        ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+        ut_ad(hash_lock->is_write_locked());
 
-	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
-	ut_a(bpage->buf_fix_count == 0);
+	ut_a(bpage->io_fix() == BUF_IO_NONE);
+	ut_a(!bpage->buf_fix_count());
 
 	buf_LRU_remove_block(bpage);
 
-	buf_pool->freed_page_clock += 1;
+	buf_pool.freed_page_clock += 1;
 
-	switch (buf_page_get_state(bpage)) {
+	switch (bpage->state()) {
 	case BUF_BLOCK_FILE_PAGE:
 		MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t));
 		MEM_CHECK_ADDRESSABLE(((buf_block_t*) bpage)->frame,
@@ -1739,7 +1093,7 @@ buf_LRU_block_remove_hashed(
 		if (bpage->zip.data) {
 			const page_t*	page = ((buf_block_t*) bpage)->frame;
 
-			ut_a(!zip || bpage->oldest_modification == 0);
+			ut_a(!zip || !bpage->oldest_modification());
 			ut_ad(bpage->zip_size());
 
 			switch (fil_page_get_type(page)) {
@@ -1764,7 +1118,10 @@ buf_LRU_block_remove_hashed(
 			case FIL_PAGE_INDEX:
 			case FIL_PAGE_RTREE:
 #if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT
-				ut_a(page_zip_validate(
+				/* During recovery, we only update the
+				compressed page, not the uncompressed one. */
+				ut_a(recv_recovery_is_on()
+				     || page_zip_validate(
 					     &bpage->zip, page,
 					     ((buf_block_t*) bpage)->index));
 #endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */
@@ -1787,63 +1144,53 @@ buf_LRU_block_remove_hashed(
 		}
 		/* fall through */
 	case BUF_BLOCK_ZIP_PAGE:
-		ut_a(bpage->oldest_modification == 0);
+		ut_a(!bpage->oldest_modification());
 		MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
 		break;
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
 	case BUF_BLOCK_REMOVE_HASH:
 		ut_error;
 		break;
 	}
 
-	hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->id);
-	if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
-		ib::fatal() << "Page not found in the hash table: "
-			    << bpage->id;
-	}
-
 	ut_ad(!bpage->in_zip_hash);
-	ut_ad(bpage->in_page_hash);
-	ut_d(bpage->in_page_hash = FALSE);
-
-	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, bpage->id.fold(),
-		    bpage);
+	HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, id.fold(), bpage);
 
-	switch (buf_page_get_state(bpage)) {
+	switch (bpage->state()) {
 	case BUF_BLOCK_ZIP_PAGE:
 		ut_ad(!bpage->in_free_list);
-		ut_ad(!bpage->in_flush_list);
 		ut_ad(!bpage->in_LRU_list);
 		ut_a(bpage->zip.data);
 		ut_a(bpage->zip.ssize);
+		ut_ad(!bpage->oldest_modification());
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		UT_LIST_REMOVE(buf_pool->zip_clean, bpage);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+		hash_lock->write_unlock();
+		buf_pool_mutex_exit_forbid();
 
-		mutex_exit(&buf_pool->zip_mutex);
-		rw_lock_x_unlock(hash_lock);
-		buf_pool_mutex_exit_forbid(buf_pool);
+		buf_buddy_free(bpage->zip.data, bpage->zip_size());
 
-		buf_buddy_free(buf_pool, bpage->zip.data, bpage->zip_size());
-
-		buf_pool_mutex_exit_allow(buf_pool);
+		buf_pool_mutex_exit_allow();
 		buf_page_free_descriptor(bpage);
 		return(false);
 
 	case BUF_BLOCK_FILE_PAGE:
-		memset(((buf_block_t*) bpage)->frame
-		       + FIL_PAGE_OFFSET, 0xff, 4);
-		memset(((buf_block_t*) bpage)->frame
-		       + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+		static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
+		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+		memset_aligned<4>(reinterpret_cast<buf_block_t*>(bpage)->frame
+				  + FIL_PAGE_OFFSET, 0xff, 4);
+		static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+			      "not perfect alignment");
+		memset_aligned<2>(reinterpret_cast<buf_block_t*>(bpage)->frame
+				  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
 		MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size);
-		buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
+		bpage->set_state(BUF_BLOCK_REMOVE_HASH);
+
+		if (!zip) {
+			return true;
+		}
 
-		/* Question: If we release bpage and hash mutex here
+		/* Question: If we release hash_lock here
 		then what protects us against:
 		1) Some other thread buffer fixing this page
 		2) Some other thread trying to read this page and
@@ -1862,32 +1209,28 @@ buf_LRU_block_remove_hashed(
 		and by the time we'll release it in the caller we'd
 		have inserted the compressed only descriptor in the
 		page_hash. */
-		rw_lock_x_unlock(hash_lock);
-		mutex_exit(&((buf_block_t*) bpage)->mutex);
+		hash_lock->write_unlock();
 
-		if (zip && bpage->zip.data) {
+		if (bpage->zip.data) {
 			/* Free the compressed page. */
 			void*	data = bpage->zip.data;
 			bpage->zip.data = NULL;
 
 			ut_ad(!bpage->in_free_list);
-			ut_ad(!bpage->in_flush_list);
+			ut_ad(!bpage->oldest_modification());
 			ut_ad(!bpage->in_LRU_list);
-			buf_pool_mutex_exit_forbid(buf_pool);
+			buf_pool_mutex_exit_forbid();
 
-			buf_buddy_free(buf_pool, data, bpage->zip_size());
+			buf_buddy_free(data, bpage->zip_size());
 
-			buf_pool_mutex_exit_allow(buf_pool);
+			buf_pool_mutex_exit_allow();
 
 			page_zip_set_size(&bpage->zip, 0);
 		}
 
 		return(true);
 
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
 	case BUF_BLOCK_REMOVE_HASH:
 		break;
@@ -1897,83 +1240,32 @@ buf_LRU_block_remove_hashed(
 	return(false);
 }
 
-/******************************************************************//**
-Puts a file page whose has no hash index to the free list. */
-static
-void
-buf_LRU_block_free_hashed_page(
-/*===========================*/
-	buf_block_t*	block)	/*!< in: block, must contain a file page and
-				be in a state where it can be freed */
-{
-	buf_pool_t*	buf_pool = buf_pool_from_block(block);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	buf_page_mutex_enter(block);
-
-	if (buf_pool->flush_rbt == NULL) {
-		block->page.id
-		    = page_id_t(ULINT32_UNDEFINED, ULINT32_UNDEFINED);
-	}
-
-	buf_block_set_state(block, BUF_BLOCK_MEMORY);
-
-	buf_LRU_block_free_non_file_page(block);
-	buf_page_mutex_exit(block);
-}
-
 /** Remove one page from LRU list and put it to free list.
-@param[in,out]	bpage		block, must contain a file page and be in
-				a freeable state; there may or may not be a
-				hash index to the page
-@param[in]	old_page_id	page number before bpage->id was invalidated */
-void buf_LRU_free_one_page(buf_page_t* bpage, page_id_t old_page_id)
+@param bpage     file page to be freed
+@param id        page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here) */
+void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
+                           page_hash_latch *hash_lock)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool,
-							   old_page_id);
-	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
-	rw_lock_x_lock(hash_lock);
-
-	while (bpage->buf_fix_count > 0) {
-		/* Wait for other threads to release the fix count
-		before releasing the bpage from LRU list. */
-	}
-
-	mutex_enter(block_mutex);
+  while (bpage->buf_fix_count())
+    /* Wait for other threads to release the fix count
+    before releasing the bpage from LRU list. */
+    (void) LF_BACKOFF();
 
-	bpage->id = old_page_id;
-
-	if (buf_LRU_block_remove_hashed(bpage, true)) {
-		buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
-	}
-
-	/* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-	ut_ad(!mutex_own(block_mutex));
+  if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
+    buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
 }
 
-/**********************************************************************//**
-Updates buf_pool->LRU_old_ratio for one buffer pool instance.
+/** Update buf_pool.LRU_old_ratio.
+@param[in]	old_pct		Reserve this percentage of
+				the buffer pool for "old" blocks
+@param[in]	adjust		true=adjust the LRU list;
+				false=just assign buf_pool.LRU_old_ratio
+				during the initialization of InnoDB
 @return updated old_pct */
-static
-uint
-buf_LRU_old_ratio_update_instance(
-/*==============================*/
-	buf_pool_t*	buf_pool,/*!< in: buffer pool instance */
-	uint		old_pct,/*!< in: Reserve this percentage of
-				the buffer pool for "old" blocks. */
-	bool		adjust)	/*!< in: true=adjust the LRU list;
-				false=just assign buf_pool->LRU_old_ratio
-				during the initialization of InnoDB */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust)
 {
-	uint	ratio;
-
-	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+	uint	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
 	if (ratio < BUF_LRU_OLD_RATIO_MIN) {
 		ratio = BUF_LRU_OLD_RATIO_MIN;
 	} else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
@@ -1981,77 +1273,36 @@ buf_LRU_old_ratio_update_instance(
 	}
 
 	if (adjust) {
-		buf_pool_mutex_enter(buf_pool);
+		mysql_mutex_lock(&buf_pool.mutex);
 
-		if (ratio != buf_pool->LRU_old_ratio) {
-			buf_pool->LRU_old_ratio = ratio;
+		if (ratio != buf_pool.LRU_old_ratio) {
+			buf_pool.LRU_old_ratio = ratio;
 
-			if (UT_LIST_GET_LEN(buf_pool->LRU)
+			if (UT_LIST_GET_LEN(buf_pool.LRU)
 			    >= BUF_LRU_OLD_MIN_LEN) {
-
-				buf_LRU_old_adjust_len(buf_pool);
+				buf_LRU_old_adjust_len();
 			}
 		}
 
-		buf_pool_mutex_exit(buf_pool);
+		mysql_mutex_unlock(&buf_pool.mutex);
 	} else {
-		buf_pool->LRU_old_ratio = ratio;
+		buf_pool.LRU_old_ratio = ratio;
 	}
 	/* the reverse of
 	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
 	return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
 }
 
-/**********************************************************************//**
-Updates buf_pool->LRU_old_ratio.
-@return updated old_pct */
-uint
-buf_LRU_old_ratio_update(
-/*=====================*/
-	uint	old_pct,/*!< in: Reserve this percentage of
-			the buffer pool for "old" blocks. */
-	bool	adjust)	/*!< in: true=adjust the LRU list;
-			false=just assign buf_pool->LRU_old_ratio
-			during the initialization of InnoDB */
-{
-	uint	new_ratio = 0;
-
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		new_ratio = buf_LRU_old_ratio_update_instance(
-			buf_pool, old_pct, adjust);
-	}
-
-	return(new_ratio);
-}
-
 /********************************************************************//**
 Update the historical stats that we are collecting for LRU eviction
 policy at the end of each interval. */
 void
-buf_LRU_stat_update(void)
-/*=====================*/
+buf_LRU_stat_update()
 {
 	buf_LRU_stat_t*	item;
-	buf_pool_t*	buf_pool;
-	bool		evict_started = FALSE;
 	buf_LRU_stat_t	cur_stat;
 
-	/* If we haven't started eviction yet then don't update stats. */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-
-		buf_pool = buf_pool_from_array(i);
-
-		if (buf_pool->freed_page_clock != 0) {
-			evict_started = true;
-			break;
-		}
-	}
-
-	if (!evict_started) {
+	if (!buf_pool.freed_page_clock) {
 		goto func_exit;
 	}
 
@@ -2079,29 +1330,24 @@ func_exit:
 	memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
 }
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/**********************************************************************//**
-Validates the LRU list for one buffer pool instance. */
-static
-void
-buf_LRU_validate_instance(
-/*======================*/
-	buf_pool_t*	buf_pool)
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate()
 {
-	ulint		old_len;
-	ulint		new_len;
+	ulint	old_len;
+	ulint	new_len;
 
-	buf_pool_mutex_enter(buf_pool);
+	mysql_mutex_lock(&buf_pool.mutex);
 
-	if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
+	if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
 
-		ut_a(buf_pool->LRU_old);
-		old_len = buf_pool->LRU_old_len;
+		ut_a(buf_pool.LRU_old);
+		old_len = buf_pool.LRU_old_len;
 
-		new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
-				 * buf_pool->LRU_old_ratio
+		new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+				 * buf_pool.LRU_old_ratio
 				 / BUF_LRU_OLD_RATIO_DIV,
-				 UT_LIST_GET_LEN(buf_pool->LRU)
+				 UT_LIST_GET_LEN(buf_pool.LRU)
 				 - (BUF_LRU_OLD_TOLERANCE
 				    + BUF_LRU_NON_OLD_MIN_LEN));
 
@@ -2109,128 +1355,101 @@ buf_LRU_validate_instance(
 		ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
 	}
 
-	CheckInLRUList::validate(buf_pool);
+	CheckInLRUList::validate();
 
 	old_len = 0;
 
-	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
 	     bpage != NULL;
              bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
 
-		switch (buf_page_get_state(bpage)) {
-		case BUF_BLOCK_POOL_WATCH:
+		switch (bpage->state()) {
 		case BUF_BLOCK_NOT_USED:
-		case BUF_BLOCK_READY_FOR_USE:
 		case BUF_BLOCK_MEMORY:
 		case BUF_BLOCK_REMOVE_HASH:
 			ut_error;
 			break;
 		case BUF_BLOCK_FILE_PAGE:
-			ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list
-			      == buf_page_belongs_to_unzip_LRU(bpage));
+			ut_ad(reinterpret_cast<buf_block_t*>(bpage)
+			      ->in_unzip_LRU_list
+			      == bpage->belongs_to_unzip_LRU());
 		case BUF_BLOCK_ZIP_PAGE:
-		case BUF_BLOCK_ZIP_DIRTY:
 			break;
 		}
 
-		if (buf_page_is_old(bpage)) {
+		if (bpage->is_old()) {
 			const buf_page_t*	prev
 				= UT_LIST_GET_PREV(LRU, bpage);
 			const buf_page_t*	next
 				= UT_LIST_GET_NEXT(LRU, bpage);
 
 			if (!old_len++) {
-				ut_a(buf_pool->LRU_old == bpage);
+				ut_a(buf_pool.LRU_old == bpage);
 			} else {
-				ut_a(!prev || buf_page_is_old(prev));
+				ut_a(!prev || prev->is_old());
 			}
 
-			ut_a(!next || buf_page_is_old(next));
+			ut_a(!next || next->is_old());
 		}
 	}
 
-	ut_a(buf_pool->LRU_old_len == old_len);
+	ut_a(buf_pool.LRU_old_len == old_len);
 
-	CheckInFreeList::validate(buf_pool);
+	CheckInFreeList::validate();
 
-	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->free);
+	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free);
 	     bpage != NULL;
 	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
 
-		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
+		ut_a(bpage->state() == BUF_BLOCK_NOT_USED);
 	}
 
-	CheckUnzipLRUAndLRUList::validate(buf_pool);
+	CheckUnzipLRUAndLRUList::validate();
 
-	for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
+	for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU);
 	     block != NULL;
 	     block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
 
 		ut_ad(block->in_unzip_LRU_list);
 		ut_ad(block->page.in_LRU_list);
-		ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-}
-
-/**********************************************************************//**
-Validates the LRU list.
-@return TRUE */
-ibool
-buf_LRU_validate(void)
-/*==================*/
-{
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		buf_LRU_validate_instance(buf_pool);
+		ut_a(block->page.belongs_to_unzip_LRU());
 	}
 
-	return(TRUE);
+	mysql_mutex_unlock(&buf_pool.mutex);
 }
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* UNIV_DEBUG */
 
-#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/**********************************************************************//**
-Prints the LRU list for one buffer pool instance. */
-static
-void
-buf_LRU_print_instance(
-/*===================*/
-	buf_pool_t*	buf_pool)
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print()
 {
-	buf_pool_mutex_enter(buf_pool);
+	mysql_mutex_lock(&buf_pool.mutex);
 
-	for (const buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
 	     bpage != NULL;
 	     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
-
-		mutex_enter(buf_page_get_mutex(bpage));
+		const page_id_t id(bpage->id());
 
 		fprintf(stderr, "BLOCK space %u page %u ",
-			bpage->id.space(), bpage->id.page_no());
+			id.space(), id.page_no());
 
-		if (buf_page_is_old(bpage)) {
+		if (bpage->is_old()) {
 			fputs("old ", stderr);
 		}
 
-		if (bpage->buf_fix_count) {
-			fprintf(stderr, "buffix count %u ",
-				uint32_t(bpage->buf_fix_count));
+		if (const uint32_t buf_fix_count = bpage->buf_fix_count()) {
+			fprintf(stderr, "buffix count %u ", buf_fix_count);
 		}
 
-		if (buf_page_get_io_fix(bpage)) {
-			fprintf(stderr, "io_fix %d ",
-				buf_page_get_io_fix(bpage));
+		if (const auto io_fix = bpage->io_fix()) {
+			fprintf(stderr, "io_fix %d ", io_fix);
 		}
 
-		if (bpage->oldest_modification) {
+		if (bpage->oldest_modification()) {
 			fputs("modif. ", stderr);
 		}
 
-		switch (buf_page_get_state(bpage)) {
+		switch (const auto state = bpage->state()) {
 			const byte*	frame;
 		case BUF_BLOCK_FILE_PAGE:
 			frame = buf_block_get_frame((buf_block_t*) bpage);
@@ -2248,28 +1467,11 @@ buf_LRU_print_instance(
 			break;
 
 		default:
-			fprintf(stderr, "\n!state %d!\n",
-				buf_page_get_state(bpage));
+			fprintf(stderr, "\n!state %d!\n", state);
 			break;
 		}
-
-		mutex_exit(buf_page_get_mutex(bpage));
 	}
 
-	buf_pool_mutex_exit(buf_pool);
-}
-
-/**********************************************************************//**
-Prints the LRU list. */
-void
-buf_LRU_print(void)
-/*===============*/
-{
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		buf_LRU_print_instance(buf_pool);
-	}
+	mysql_mutex_unlock(&buf_pool.mutex);
 }
-#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 1ea3070cbda..ff163f74b08 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -33,6 +33,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "buf0buf.h"
 #include "buf0flu.h"
 #include "buf0lru.h"
+#include "buf0buddy.h"
 #include "buf0dblwr.h"
 #include "ibuf0ibuf.h"
 #include "log0recv.h"
@@ -41,54 +42,211 @@ Created 11/5/1995 Heikki Tuuri
 #include "srv0start.h"
 #include "srv0srv.h"
 
-/** There must be at least this many pages in buf_pool in the area to start
-a random read-ahead */
-#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
-				(5 + BUF_READ_AHEAD_AREA(b) / 8)
-
-/** If there are buf_pool->curr_size per the number below pending reads, then
+/** If there are buf_pool.curr_size per the number below pending reads, then
 read-ahead is not done: this is to prevent flooding the buffer pool with
 i/o-fixed buffer blocks */
 #define BUF_READ_AHEAD_PEND_LIMIT	2
 
-/********************************************************************//**
-Unfixes the pages, unlatches the page,
-removes it from page_hash and removes it from LRU. */
-static
-void
-buf_read_page_handle_error(
-/*=======================*/
-	buf_page_t*	bpage)	/*!< in: pointer to the block */
+/** Remove the sentinel block for the watch before replacing it with a
+real block. watch_unset() or watch_occurred() will notice that the block
+has been replaced. Caller holds buf_pool.mutex and the page_hash write latch.
+@param watch   sentinel; it is reset to BUF_BLOCK_NOT_USED and id ~0ULL */
+inline void buf_pool_t::watch_remove(buf_page_t *watch)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(hash_lock_get(watch->id())->is_write_locked());
+  ut_a(watch_is_sentinel(*watch));
+  if (watch->buf_fix_count()) /* a referenced sentinel is in page_hash */
+  {
+    ut_ad(watch->in_page_hash);
+    ut_d(watch->in_page_hash= false);
+    HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch);
+    watch->set_buf_fix_count(0); /* buf_page_init_for_read() copied it */
+  }
+  ut_ad(!watch->in_page_hash);
+  watch->set_state(BUF_BLOCK_NOT_USED);
+  watch->id_= page_id_t(~0ULL);
+}
 
-	buf_pool_mutex_exit(buf_pool);
+/** Initialize a page for reading into the buffer pool. If the page is
+(1) already in buf_pool.page_hash (and is not a watch sentinel), or
+(2) found not to be a change buffer page while mode is BUF_READ_IBUF_PAGES_ONLY,
+then nothing is initialized and a null pointer is returned.
+This function does not check whether the tablespace is being deleted.
+Sets the io_fix flag to BUF_IO_READ and, for an uncompressed block, sets
+a non-recursive exclusive lock on the buffer frame. The io-handler must
+take care that the flag is cleared and the lock released later.
+@param[in]	mode			BUF_READ_IBUF_PAGES_ONLY, ...
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	unzip			whether the uncompressed page is
+					requested (for ROW_FORMAT=COMPRESSED)
+@return pointer to the block
+@retval	NULL	if no page was initialized for reading */
+static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
+                                          ulint zip_size, bool unzip)
+{
+  mtr_t mtr;
+
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+  {
+    /* It is a read-ahead within an ibuf routine */
+    ut_ad(!ibuf_bitmap_page(page_id, zip_size));
+    ibuf_mtr_start(&mtr);
+
+    if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
+    {
+      ibuf_mtr_commit(&mtr);
+      return nullptr;
+    }
+  }
+  else
+    ut_ad(mode == BUF_READ_ANY_PAGE);
+
+  buf_page_t *bpage= nullptr;
+  buf_block_t *block= nullptr;
+  if (!zip_size || unzip || recv_recovery_is_on())
+  {
+    block= buf_LRU_get_free_block(false);
+    block->initialise(page_id, zip_size);
+    /* We set a pass-type x-lock on the frame because then
+    the same thread which called for the read operation
+    (and is running now at this point of code) can wait
+    for the read to complete by waiting for the x-lock on
+    the frame; if the x-lock were recursive, the same
+    thread would illegally get the x-lock before the page
+    read is completed.  The x-lock will be released
+    in buf_page_read_complete() by the io-handler thread. */
+    rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+  }
+
+  const ulint fold= page_id.fold();
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
+  if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+  {
+    /* The page is already in the buffer pool. */
+    if (block)
+    {
+      rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
+      buf_LRU_block_free_non_file_page(block);
+    }
+    goto func_exit;
+  }
+
+  if (UNIV_LIKELY(block != nullptr))
+  {
+    bpage= &block->page;
+
+    /* Insert into the hash table of file pages */
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+
+    if (hash_page)
+    {
+      /* Preserve the reference count. */
+      auto buf_fix_count= hash_page->buf_fix_count();
+      ut_a(buf_fix_count > 0);
+      block->page.add_buf_fix_count(buf_fix_count);
+      buf_pool.watch_remove(hash_page);
+    }
+
+    block->page.set_io_fix(BUF_IO_READ);
+    block->page.set_state(BUF_BLOCK_FILE_PAGE);
+    ut_ad(!block->page.in_page_hash);
+    ut_d(block->page.in_page_hash= true);
+    HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+    hash_lock->write_unlock();
+
+    /* The block must be put to the LRU list, to the old blocks */
+    buf_LRU_add_block(bpage, true/* to old blocks */);
+
+    if (UNIV_UNLIKELY(zip_size))
+    {
+      /* buf_pool.mutex may be released and reacquired by
+      buf_buddy_alloc(). We must defer this operation until after the
+      block descriptor has been added to buf_pool.LRU and
+      buf_pool.page_hash. */
+      block->page.zip.data= static_cast<page_zip_t*>
+        (buf_buddy_alloc(zip_size));
+
+      /* To maintain the invariant
+      block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
+      we have to add this block to unzip_LRU
+      after block->page.zip.data is set. */
+      ut_ad(block->page.belongs_to_unzip_LRU());
+      buf_unzip_LRU_add_block(block, TRUE);
+    }
+  }
+  else
+  {
+    /* The compressed page must be allocated before the
+    control block (bpage), in order to avoid the
+    invocation of buf_buddy_relocate_block() on
+    uninitialized data. */
+    bool lru= false;
+    void *data= buf_buddy_alloc(zip_size, &lru);
+
+    /* If buf_buddy_alloc() allocated storage from the LRU list,
+    it released and reacquired buf_pool.mutex.  Thus, we must
+    check the page_hash again, as it may have been modified. */
+    if (UNIV_UNLIKELY(lru))
+    {
+      hash_page= buf_pool.page_hash_get_low(page_id, fold);
+
+      if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+      {
+        /* The block was added by some other thread. */
+        buf_buddy_free(data, zip_size);
+        goto func_exit;
+      }
+    }
+
+    bpage= buf_page_alloc_descriptor();
+
+    page_zip_des_init(&bpage->zip);
+    page_zip_set_size(&bpage->zip, zip_size);
+    bpage->zip.data = (page_zip_t*) data;
+
+    bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
+
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
+
+    if (hash_page)
+    {
+      /* Preserve the reference count. It can be 0 if
+      buf_pool_t::watch_unset() is executing concurrently,
+      waiting for buf_pool.mutex, which we are holding. */
+      bpage->add_buf_fix_count(hash_page->buf_fix_count());
+      buf_pool.watch_remove(hash_page);
+    }
+
+    ut_ad(!bpage->in_page_hash);
+    ut_d(bpage->in_page_hash= true);
+    HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+    bpage->set_io_fix(BUF_IO_READ);
+    hash_lock->write_unlock();
+
+    /* The block must be put to the LRU list, to the old blocks.
+    The zip size is already set into the page zip */
+    buf_LRU_add_block(bpage, true/* to old blocks */);
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+  buf_pool.n_pend_reads++;
+  goto func_exit_no_mutex;
+func_exit:
+  mysql_mutex_unlock(&buf_pool.mutex);
+func_exit_no_mutex:
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+    ibuf_mtr_commit(&mtr);
+
+  ut_ad(!bpage || bpage->in_file());
+
+  return bpage;
 }
 
 /** Low-level function which reads a page asynchronously from a file to the
@@ -99,49 +257,48 @@ flag is cleared and the x-lock released by an i/o-handler thread.
 @param[out] err		DB_SUCCESS or DB_TABLESPACE_DELETED
 			if we are trying
 			to read from a non-existent tablespace
+@param[in,out] space	tablespace
 @param[in] sync		true if synchronous aio is desired
-@param[in] type		IO type, SIMULATED, IGNORE_MISSING
 @param[in] mode		BUF_READ_IBUF_PAGES_ONLY, ...,
 @param[in] page_id	page id
 @param[in] zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in] unzip	true=request uncompressed page
-@param[in] ignore_missing_space  true=ignore missing space when reading
-@return 1 if a read request was queued, 0 if the page already resided
-in buf_pool, or if the page is in the doublewrite buffer blocks in
-which case it is never read into the pool, or if the tablespace does
-not exist or is being dropped */
+@return whether a read request was queued */
 static
-ulint
+bool
 buf_read_page_low(
 	dberr_t*		err,
+	fil_space_t*		space,
 	bool			sync,
-	ulint			type,
 	ulint			mode,
 	const page_id_t		page_id,
 	ulint			zip_size,
-	bool			unzip,
-	bool			ignore_missing_space = false)
+	bool			unzip)
 {
 	buf_page_t*	bpage;
 
 	*err = DB_SUCCESS;
 
-	if (page_id.space() == TRX_SYS_SPACE
-	    && buf_dblwr_page_inside(page_id.page_no())) {
-
+	if (buf_dblwr.is_inside(page_id)) {
 		ib::error() << "Trying to read doublewrite buffer page "
 			<< page_id;
-		return(0);
+		ut_ad(0);
+nothing_read:
+		space->release();
+		return false;
 	}
 
-	if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) {
+	if (sync) {
+	} else if (trx_sys_hdr_page(page_id)
+		   || ibuf_bitmap_page(page_id, zip_size)
+		   || (!recv_no_ibuf_operations
+		       && ibuf_page(page_id, zip_size, nullptr))) {
 
 		/* Trx sys header is so low in the latching order that we play
 		safe and do not leave the i/o-completion to an asynchronous
-		i/o-thread. Ibuf bitmap pages must always be read with
-		syncronous i/o, to make sure they do not get involved in
+		i/o-thread. Change buffer pages must always be read with
+		synchronous i/o, to make sure they do not get involved in
 		thread deadlocks. */
-
 		sync = true;
 	}
 
@@ -149,66 +306,63 @@ buf_read_page_low(
 	or is being dropped; if we succeed in initing the page in the buffer
 	pool for read, then DISCARD cannot proceed until the read has
 	completed */
-	bpage = buf_page_init_for_read(err, mode, page_id, zip_size, unzip);
+	bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
 
 	if (bpage == NULL) {
+		goto nothing_read;
+	}
+
+	ut_ad(bpage->in_file());
 
-		return(0);
+	if (sync) {
+		thd_wait_begin(nullptr, THD_WAIT_DISKIO);
 	}
 
 	DBUG_LOG("ib_buf",
 		 "read page " << page_id << " zip_size=" << zip_size
 		 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
 
-	ut_ad(buf_page_in_file(bpage));
-
-	if (sync) {
-		thd_wait_begin(NULL, THD_WAIT_DISKIO);
-	}
-
 	void*	dst;
 
 	if (zip_size) {
 		dst = bpage->zip.data;
 	} else {
-		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+		ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE);
 
 		dst = ((buf_block_t*) bpage)->frame;
 	}
 
-	IORequest	request(type | IORequest::READ);
-
-	*err = fil_io(
-		request, sync, page_id, zip_size, 0,
-		zip_size ? zip_size : srv_page_size,
-		dst, bpage, ignore_missing_space);
+	const ulint len = zip_size ? zip_size : srv_page_size;
 
-	if (sync) {
-		thd_wait_end(NULL);
-	}
+	auto fio = space->io(IORequest(sync
+				       ? IORequest::READ_SYNC
+				       : IORequest::READ_ASYNC),
+			     page_id.page_no() * len, len, dst, bpage);
+	*err= fio.err;
 
-	if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
-		if (IORequest::ignore_missing(type)
-		    || *err == DB_TABLESPACE_DELETED
-		    || *err == DB_IO_ERROR) {
-			buf_read_page_handle_error(bpage);
-			return(0);
+	if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
+		if (!sync || fio.err == DB_TABLESPACE_DELETED
+		    || fio.err == DB_IO_ERROR) {
+			buf_pool.corrupted_evict(bpage);
+			return false;
 		}
 
 		ut_error;
 	}
 
 	if (sync) {
-		/* The i/o is already completed when we arrive from
-		fil_read */
-		*err = buf_page_io_complete(bpage);
+		thd_wait_end(NULL);
+
+		/* The i/o was already completed in space->io() */
+		*err = buf_page_read_complete(bpage, *fio.node);
+		space->release();
 
 		if (*err != DB_SUCCESS) {
-			return(0);
+			return false;
 		}
 	}
 
-	return(1);
+	return true;
 }
 
 /** Applies a random read-ahead in buf_pool if there are at least a threshold
@@ -230,143 +384,86 @@ get read even if we return a positive value! */
 ulint
 buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
 {
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	ulint		recent_blocks	= 0;
-	ulint		ibuf_mode;
-	ulint		count;
-	ulint		low, high;
-	dberr_t		err = DB_SUCCESS;
-	ulint		i;
-	const ulint	buf_read_ahead_random_area
-				= BUF_READ_AHEAD_AREA(buf_pool);
-
-	if (!srv_random_read_ahead) {
-		/* Disabled by user */
-		return(0);
-	}
-
-	if (srv_startup_is_before_trx_rollback_phase) {
-		/* No read-ahead to avoid thread deadlocks */
-		return(0);
-	}
-
-	if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) {
-
-		/* If it is an ibuf bitmap page or trx sys hdr, we do
-		no read-ahead, as that could break the ibuf page access
-		order */
-
-		return(0);
-	}
-
-	low  = (page_id.page_no() / buf_read_ahead_random_area)
-		* buf_read_ahead_random_area;
-
-	high = (page_id.page_no() / buf_read_ahead_random_area + 1)
-		* buf_read_ahead_random_area;
-
-	if (fil_space_t* space = fil_space_acquire(page_id.space())) {
-		high = space->max_page_number_for_io(high);
-		space->release();
-	} else {
-		return(0);
-	}
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool->n_pend_reads
-	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-		buf_pool_mutex_exit(buf_pool);
-
-		return(0);
-	}
-
-	/* Count how many blocks in the area have been recently accessed,
-	that is, reside near the start of the LRU list. */
-
-	for (i = low; i < high; i++) {
-		const buf_page_t*	bpage = buf_page_hash_get(
-			buf_pool, page_id_t(page_id.space(), i));
-
-		if (bpage != NULL
-		    && buf_page_is_accessed(bpage)
-		    && buf_page_peek_if_young(bpage)) {
-
-			recent_blocks++;
-
-			if (recent_blocks
-			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
-
-				buf_pool_mutex_exit(buf_pool);
-				goto read_ahead;
-			}
-		}
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-	/* Do nothing */
-	return(0);
+  if (!srv_random_read_ahead)
+    return 0; /* random read-ahead is disabled */
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0; /* too many reads are already pending */
+
+  fil_space_t* space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0; /* fil_space_t::get() returned no tablespace */
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  ulint count= 5 + buf_read_ahead_area / 8; /* hits still needed to trigger */
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  page_id_t high= low + buf_read_ahead_area;
+  high.set_page_no(std::min(high.page_no(), space->last_page_number()));
+
+  /* Count how many blocks in the area [low,high) have been recently
+  accessed, that is, reside near the start of the LRU list. */
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    const ulint fold= i.fold();
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
+    bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
+    hash_lock->read_unlock();
+    if (found && !--count)
+      goto read_ahead; /* enough recently accessed pages were found */
+  }
+
+no_read_ahead:
+  space->release(); /* also reached by goto from below read_ahead: */
+  return 0;
 
 read_ahead:
-	/* Read all the suitable blocks within the area */
-
-	ibuf_mode = ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
-	count = 0;
-
-	for (i = low; i < high; i++) {
-		/* It is only sensible to do read-ahead in the non-sync aio
-		mode: hence FALSE as the first parameter */
-
-		const page_id_t	cur_page_id(page_id.space(), i);
-
-		if (!ibuf_bitmap_page(cur_page_id, zip_size)) {
-			count += buf_read_page_low(
-				&err, false,
-				IORequest::DO_NOT_WAKE,
-				ibuf_mode,
-				cur_page_id, zip_size, false);
-
-			switch (err) {
-			case DB_SUCCESS:
-			case DB_ERROR:
-				break;
-			case DB_TABLESPACE_DELETED:
-				ib::info() << "Random readahead trying to"
-					" access page " << cur_page_id
-					<< " in nonexisting or"
-					" being-dropped tablespace";
-				break;
-			default:
-				ut_error;
-			}
-		}
-	}
-
-	/* In simulated aio we wake the aio handler threads only after
-	queuing all aio requests, in native aio the following call does
-	nothing: */
-
-	os_aio_simulated_wake_handler_threads();
-
-	if (count) {
-		DBUG_PRINT("ib_buf", ("random read-ahead %u pages, %u:%u",
-				      (unsigned) count,
-				      (unsigned) page_id.space(),
-				      (unsigned) page_id.page_no()));
-	}
-
-	/* Read ahead is considered one I/O operation for the purpose of
-	LRU policy decision. */
-	buf_LRU_stat_inc_io();
-
-	buf_pool->stat.n_ra_pages_read_rnd += count;
-	srv_stats.buf_pool_reads.add(count);
-	return(count);
+  if (space->is_stopping())
+    goto no_read_ahead; /* releases space and returns 0 */
+
+  /* Read all the suitable blocks within the area; count is 0 here */
+  const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    if (ibuf_bitmap_page(i, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    dberr_t err; /* assigned by buf_read_page_low(); not inspected here */
+    space->reacquire(); /* presumably consumed by buf_read_page_low() */
+    if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
+      count++; /* a read request was queued */
+  }
+
+  if (count)
+    DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+			  count, space->chain.start->name,
+			  low.page_no()));
+  space->release();
+
+  /* Read ahead is considered one I/O operation for the purpose of
+  LRU policy decision. */
+  buf_LRU_stat_inc_io();
+
+  buf_pool.stat.n_ra_pages_read_rnd+= count;
+  srv_stats.buf_pool_reads.add(count);
+  return count; /* number of read requests queued by the loop above */
 }
 
-/** High-level function which reads a page asynchronously from a file to the
-buffer buf_pool if it is not already there. Sets the io_fix flag and sets
-an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+/** High-level function which reads a page from a file to buf_pool
+if it is not already there. Sets the io_fix and an exclusive lock
+on the buffer frame. The flag is cleared and the x-lock
 released by the i/o-handler thread.
 @param[in]	page_id		page id
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@@ -377,50 +474,39 @@ after decryption normal page checksum does not match.
 @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
 dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
 {
-	ulint		count;
-	dberr_t		err = DB_SUCCESS;
-
-	/* We do synchronous IO because our AIO completion code
-	is sub-optimal. See buf_page_io_complete(), we have to
-	acquire the buffer pool mutex before acquiring the block
-	mutex, required for updating the page state. The acquire
-	of the buffer pool mutex becomes an expensive bottleneck. */
-
-	count = buf_read_page_low(
-		&err, true,
-		0, BUF_READ_ANY_PAGE, page_id, zip_size, false);
-
-	srv_stats.buf_pool_reads.add(count);
-
-	if (err == DB_TABLESPACE_DELETED) {
-		ib::info() << "trying to read page " << page_id
-			<< " in nonexisting or being-dropped tablespace";
-	}
-
-	/* Increment number of I/O operations used for LRU policy. */
-	buf_LRU_stat_inc_io();
-
-	return(err);
+  if (fil_space_t *const space= fil_space_t::get(page_id.space()))
+  {
+    /* Synchronous read; buf_read_page_low() always assigns err. */
+    dberr_t err;
+    const bool queued= buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
+                                         page_id, zip_size, false);
+    if (queued)
+      srv_stats.buf_pool_reads.add(1);
+    /* Account one I/O operation for the LRU policy. */
+    buf_LRU_stat_inc_io();
+    return err;
+  }
+  ib::info() << "trying to read page " << page_id
+             << " in nonexisting or being-dropped tablespace";
+  return DB_TABLESPACE_DELETED;
 }
 
 /** High-level function which reads a page asynchronously from a file to the
 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
 released by the i/o-handler thread.
+@param[in,out]	space		tablespace
 @param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	sync		true if synchronous aio is desired */
-void
-buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0 */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+                              ulint zip_size)
 {
-	ulint		count;
 	dberr_t		err;
 
-	count = buf_read_page_low(
-		&err, sync,
-		IORequest::DO_NOT_WAKE | IORequest::IGNORE_MISSING,
-		BUF_READ_ANY_PAGE,
-		page_id, zip_size, false);
+	if (buf_read_page_low(&err, space, false, BUF_READ_ANY_PAGE,
+			      page_id, zip_size, false)) {
+		srv_stats.buf_pool_reads.add(1);
+	}
 
 	switch (err) {
 	case DB_SUCCESS:
@@ -442,8 +528,6 @@ buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
 			<< page_id;
 	}
 
-	srv_stats.buf_pool_reads.add(count);
-
 	/* We do not increment number of I/O operations used for LRU policy
 	here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
 	about evicting uncompressed version of compressed pages from the
@@ -481,432 +565,208 @@ which could result in a deadlock if the OS does not support asynchronous io.
 ulint
 buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 {
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-	buf_page_t*	bpage;
-	buf_frame_t*	frame;
-	buf_page_t*	pred_bpage	= NULL;
-	ulint		pred_offset;
-	ulint		succ_offset;
-	int		asc_or_desc;
-	ulint		new_offset;
-	ulint		fail_count;
-	ulint		low, high;
-	dberr_t		err = DB_SUCCESS;
-	ulint		i;
-	const ulint	buf_read_ahead_linear_area
-		= BUF_READ_AHEAD_AREA(buf_pool);
-	ulint		threshold;
-
-	/* check if readahead is disabled */
-	if (!srv_read_ahead_threshold) {
-		return(0);
-	}
-
-	if (srv_startup_is_before_trx_rollback_phase) {
-		/* No read-ahead to avoid thread deadlocks */
-		return(0);
-	}
-
-	low  = (page_id.page_no() / buf_read_ahead_linear_area)
-		* buf_read_ahead_linear_area;
-	high = (page_id.page_no() / buf_read_ahead_linear_area + 1)
-		* buf_read_ahead_linear_area;
-
-	if ((page_id.page_no() != low) && (page_id.page_no() != high - 1)) {
-		/* This is not a border page of the area: return */
-
-		return(0);
-	}
-
-	if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) {
-
-		/* If it is an ibuf bitmap page or trx sys hdr, we do
-		no read-ahead, as that could break the ibuf page access
-		order */
-
-		return(0);
-	}
-
-	ulint	space_size;
-
-	if (fil_space_t* space = fil_space_acquire(page_id.space())) {
-		space_size = space->committed_size;
-		space->release();
-
-		if (high > space_size) {
-			/* The area is not whole */
-			return(0);
-		}
-	} else {
-		return(0);
-	}
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool->n_pend_reads
-	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-		buf_pool_mutex_exit(buf_pool);
-
-		return(0);
-	}
-
-	/* Check that almost all pages in the area have been accessed; if
-	offset == low, the accesses must be in a descending order, otherwise,
-	in an ascending order. */
-
-	asc_or_desc = 1;
-
-	if (page_id.page_no() == low) {
-		asc_or_desc = -1;
-	}
-
-	/* How many out of order accessed pages can we ignore
-	when working out the access pattern for linear readahead */
-	threshold = ut_min(static_cast<ulint>(64 - srv_read_ahead_threshold),
-			   BUF_READ_AHEAD_AREA(buf_pool));
-
-	fail_count = 0;
-
-	for (i = low; i < high; i++) {
-		bpage = buf_page_hash_get(buf_pool,
-					  page_id_t(page_id.space(), i));
-
-		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
-			/* Not accessed */
-			fail_count++;
-
-		} else if (pred_bpage) {
-			/* Note that buf_page_is_accessed() returns
-			the time of the first access.  If some blocks
-			of the extent existed in the buffer pool at
-			the time of a linear access pattern, the first
-			access times may be nonmonotonic, even though
-			the latest access times were linear.  The
-			threshold (srv_read_ahead_factor) should help
-			a little against this. */
-			int res = ut_ulint_cmp(
-				buf_page_is_accessed(bpage),
-				buf_page_is_accessed(pred_bpage));
-			/* Accesses not in the right order */
-			if (res != 0 && res != asc_or_desc) {
-				fail_count++;
-			}
-		}
-
-		if (fail_count > threshold) {
-			/* Too many failures: return */
-			buf_pool_mutex_exit(buf_pool);
-			return(0);
-		}
-
-		if (bpage && buf_page_is_accessed(bpage)) {
-			pred_bpage = bpage;
-		}
-	}
-
-	/* If we got this far, we know that enough pages in the area have
-	been accessed in the right order: linear read-ahead can be sensible */
-
-	bpage = buf_page_hash_get(buf_pool, page_id);
-
-	if (bpage == NULL) {
-		buf_pool_mutex_exit(buf_pool);
-
-		return(0);
-	}
-
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_PAGE:
-		frame = bpage->zip.data;
-		break;
-	case BUF_BLOCK_FILE_PAGE:
-		frame = ((buf_block_t*) bpage)->frame;
-		break;
-	default:
-		ut_error;
-		break;
-	}
-
-	/* Read the natural predecessor and successor page addresses from
-	the page; NOTE that because the calling thread may have an x-latch
-	on the page, we do not acquire an s-latch on the page, this is to
-	prevent deadlocks. Even if we read values which are nonsense, the
-	algorithm will work. */
-
-	pred_offset = fil_page_get_prev(frame);
-	succ_offset = fil_page_get_next(frame);
-
-	buf_pool_mutex_exit(buf_pool);
-
-	if ((page_id.page_no() == low)
-	    && (succ_offset == page_id.page_no() + 1)) {
-
-		/* This is ok, we can continue */
-		new_offset = pred_offset;
-
-	} else if ((page_id.page_no() == high - 1)
-		   && (pred_offset == page_id.page_no() - 1)) {
-
-		/* This is ok, we can continue */
-		new_offset = succ_offset;
-	} else {
-		/* Successor or predecessor not in the right order */
-
-		return(0);
-	}
-
-	low  = (new_offset / buf_read_ahead_linear_area)
-		* buf_read_ahead_linear_area;
-	high = (new_offset / buf_read_ahead_linear_area + 1)
-		* buf_read_ahead_linear_area;
-
-	if ((new_offset != low) && (new_offset != high - 1)) {
-		/* This is not a border page of the area: return */
-
-		return(0);
-	}
-
-	if (high > space_size) {
-		/* The area is not whole, return */
-
-		return(0);
-	}
-
-	ulint	count = 0;
-
-	/* If we got this far, read-ahead can be sensible: do it */
-
-	ulint ibuf_mode = ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
-
-	/* Since Windows XP seems to schedule the i/o handler thread
-	very eagerly, and consequently it does not wait for the
-	full read batch to be posted, we use special heuristics here */
-
-	os_aio_simulated_put_read_threads_to_sleep();
-
-	for (i = low; i < high; i++) {
-		/* It is only sensible to do read-ahead in the non-sync
-		aio mode: hence FALSE as the first parameter */
-
-		const page_id_t	cur_page_id(page_id.space(), i);
-
-		if (!ibuf_bitmap_page(cur_page_id, zip_size)) {
-			count += buf_read_page_low(
-				&err, false,
-				IORequest::DO_NOT_WAKE,
-				ibuf_mode, cur_page_id, zip_size, false);
-
-			switch (err) {
-			case DB_SUCCESS:
-			case DB_TABLESPACE_DELETED:
-			case DB_ERROR:
-				break;
-			case DB_PAGE_CORRUPTED:
-			case DB_DECRYPTION_FAILED:
-				ib::error() << "linear readahead failed to"
-					" read or decrypt "
-					<< page_id_t(page_id.space(), i);
-				break;
-			default:
-				ut_error;
-			}
-		}
-	}
-
-	/* In simulated aio we wake the aio handler threads only after
-	queuing all aio requests, in native aio the following call does
-	nothing: */
-
-	os_aio_simulated_wake_handler_threads();
-
-	if (count) {
-		DBUG_PRINT("ib_buf", ("linear read-ahead " ULINTPF " pages, "
-				      "%u:%u",
-				      count,
-				      page_id.space(),
-				      page_id.page_no()));
-	}
-
-	/* Read ahead is considered one I/O operation for the purpose of
-	LRU policy decision. */
-	buf_LRU_stat_inc_io();
-
-	buf_pool->stat.n_ra_pages_read += count;
-	return(count);
-}
-
-/********************************************************************//**
-Issues read requests for pages which the ibuf module wants to read in, in
-order to contract the insert buffer tree. Technically, this function is like
-a read-ahead function. */
-void
-buf_read_ibuf_merge_pages(
-/*======================*/
-	bool		sync,		/*!< in: true if the caller
-					wants this function to wait
-					for the highest address page
-					to get read in, before this
-					function returns */
-	const ulint*	space_ids,	/*!< in: array of space ids */
-	const ulint*	page_nos,	/*!< in: array of page numbers
-					to read, with the highest page
-					number the last in the
-					array */
-	ulint		n_stored)	/*!< in: number of elements
-					in the arrays */
-{
-#ifdef UNIV_IBUF_DEBUG
-	ut_a(n_stored < srv_page_size);
-#endif
-
-	for (ulint i = 0; i < n_stored; i++) {
-		fil_space_t* space = fil_space_acquire_silent(space_ids[i]);
-		if (!space) {
-tablespace_deleted:
-			/* The tablespace was not found: remove all
-			entries for it */
-			ibuf_delete_for_discarded_space(space_ids[i]);
-			while (i + 1 < n_stored
-			       && space_ids[i + 1] == space_ids[i]) {
-				i++;
-			}
-			continue;
-		}
-
-		ulint size = space->size;
-		if (!size) {
-			size = fil_space_get_size(space->id);
-		}
-
-		if (UNIV_UNLIKELY(page_nos[i] >= size)) {
-			do {
-				ibuf_delete_recs(page_id_t(space_ids[i],
-							   page_nos[i]));
-			} while (++i < n_stored
-				 && space_ids[i - 1] == space_ids[i]
-				 && page_nos[i] >= size);
-			i--;
-next:
-			space->release();
-			continue;
-		}
-
-		const page_id_t	page_id(space_ids[i], page_nos[i]);
-
-		buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-		while (buf_pool->n_pend_reads
-		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-			os_thread_sleep(500000);
-		}
-
-		dberr_t	err;
-
-		buf_read_page_low(&err,
-				  sync && (i + 1 == n_stored),
-				  0,
-				  BUF_READ_ANY_PAGE, page_id,
-				  space->zip_size(),
-				  true, true /* ignore_missing_space */);
-
-		switch(err) {
-		case DB_SUCCESS:
-		case DB_ERROR:
-			break;
-		case DB_TABLESPACE_DELETED:
-			space->release();
-			goto tablespace_deleted;
-		case DB_PAGE_CORRUPTED:
-		case DB_DECRYPTION_FAILED:
-			ib::error() << "Failed to read or decrypt page "
-				    << page_nos[i]
-				    << " of '" << space->chain.start->name
-				    << "' for change buffer merge";
-			break;
-		default:
-			ut_error;
-		}
-
-		goto next;
-	}
-
-	os_aio_simulated_wake_handler_threads();
-
-	if (n_stored) {
-		DBUG_PRINT("ib_buf",
-			   ("ibuf merge read-ahead %u pages, space %u",
-			    unsigned(n_stored), unsigned(space_ids[0])));
-	}
+  /* Linear read-ahead: nothing to do if it is disabled (threshold 0) */
+  if (!srv_read_ahead_threshold)
+    return 0;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0; /* too many reads are already pending */
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  const page_id_t high_1= low + (buf_read_ahead_area - 1);
+
+  /* We will check that almost all pages in the area have been accessed
+  in the desired order: descending if page_id is the first page (low). */
+  const bool descending= page_id == low;
+
+  if (!descending && page_id != high_1)
+    /* This is not a border page of the area */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  fil_space_t *space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0;
+
+  if (high_1.page_no() > space->last_page_number())
+  {
+    /* The area is not whole. */
+fail:
+    space->release();
+    return 0;
+  }
+
+  /* How many out of order accessed pages can we ignore
+  when working out the access pattern for linear readahead */
+  ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
+                               srv_read_ahead_threshold,
+                               uint32_t{buf_pool.read_ahead_area});
+  page_id_t new_low= low, new_high_1= high_1;
+  unsigned prev_accessed= 0;
+  for (page_id_t i= low; !(high_1 < i); ++i) /* low..high_1 inclusive */
+  {
+    const ulint fold= i.fold();
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+    if (i == page_id)
+    {
+      /* Read the natural predecessor and successor page addresses from
+      the page; NOTE that because the calling thread may have an x-latch
+      on the page, we do not acquire an s-latch on the page, this is to
+      prevent deadlocks. The hash_lock is only protecting the
+      buf_pool.page_hash for page i, not the bpage contents itself. */
+      if (!bpage)
+      {
+hard_fail:
+        hash_lock->read_unlock();
+        goto fail;
+      }
+      const byte *f;
+      switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
+      case BUF_BLOCK_FILE_PAGE:
+        f= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+        break;
+      case BUF_BLOCK_ZIP_PAGE:
+        f= bpage->zip.data;
+        break;
+      default:
+        goto hard_fail;
+      }
+
+      uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+      uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+      if (prev == FIL_NULL || next == FIL_NULL)
+        goto hard_fail;
+      page_id_t id= page_id;
+      if (descending && next - 1 == page_id.page_no())
+        id.set_page_no(prev);
+      else if (!descending && prev + 1 == page_id.page_no())
+        id.set_page_no(next);
+      else
+        goto hard_fail; /* Successor or predecessor not in the right order */
+
+      new_low= id - (id.page_no() % buf_read_ahead_area);
+      new_high_1= new_low + (buf_read_ahead_area - 1);
+
+      if (id != new_low && id != new_high_1)
+        /* This is not a border page of the area: return */
+        goto hard_fail;
+      if (new_high_1.page_no() > space->last_page_number())
+        /* The area is not whole */
+        goto hard_fail;
+    }
+    else if (!bpage)
+    {
+failed:
+      hash_lock->read_unlock();
+      if (--count)
+        continue;
+      goto fail;
+    }
+
+    const unsigned accessed= bpage->is_accessed();
+    if (!accessed)
+      goto failed;
+    /* Note that buf_page_t::is_accessed() returns the time of the
+    first access, so the times may be nonmonotonic even for a linear
+    access pattern; the threshold (srv_read_ahead_factor) should help a
+    little against this. NOTE(review): the removed implementation
+    counted a failure when the order was opposite to the scan
+    direction; confirm that this comparison is not inverted. */
+    bool fail= prev_accessed &&
+      (descending ? prev_accessed > accessed : prev_accessed < accessed);
+    prev_accessed= accessed;
+    if (fail)
+      goto failed;
+    hash_lock->read_unlock();
+  }
+
+  /* If we got this far, read-ahead can be sensible: do it */
+  count= 0;
+  for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+       !(new_high_1 < new_low); ++new_low) /* through new_high_1 inclusive */
+  {
+    if (ibuf_bitmap_page(new_low, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    dberr_t err;
+    space->reacquire();
+    count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
+                              false);
+  }
+
+  if (count)
+    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name,
+                          new_low.page_no()));
+  space->release();
+
+  /* Read ahead is considered one I/O operation for the purpose of
+  LRU policy decision. */
+  buf_LRU_stat_inc_io();
+
+  buf_pool.stat.n_ra_pages_read+= count;
+  return count;
 }
 
 /** Issues read requests for pages which recovery wants to read in.
-@param[in]	sync		true if the caller wants this function to wait
-for the highest address page to get read in, before this function returns
 @param[in]	space_id	tablespace id
 @param[in]	page_nos	array of page numbers to read, with the
 highest page number the last in the array
-@param[in]	n_stored	number of page numbers in the array */
-void
-buf_read_recv_pages(
-	bool		sync,
-	ulint		space_id,
-	const ulint*	page_nos,
-	ulint		n_stored)
+@param[in]	n		number of page numbers in the array */
+void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
 {
-	fil_space_t*		space	= fil_space_get(space_id);
+	fil_space_t* space = fil_space_t::get(space_id);
 
-	if (space == NULL) {
-		/* The tablespace is missing: do nothing */
+	if (!space) {
+		/* The tablespace is missing or unreadable: do nothing */
 		return;
 	}
 
-	fil_space_open_if_needed(space);
-
 	const ulint zip_size = space->zip_size();
 
-	for (ulint i = 0; i < n_stored; i++) {
-		buf_pool_t*		buf_pool;
-		const page_id_t	cur_page_id(space_id, page_nos[i]);
+	for (ulint i = 0; i < n; i++) {
 
-		ulint			count = 0;
+		/* Ignore if the page already present in freed ranges. */
+		if (space->freed_ranges.contains(page_nos[i])) {
+			continue;
+		}
+
+		const page_id_t	cur_page_id(space_id, page_nos[i]);
 
-		buf_pool = buf_pool_get(cur_page_id);
 		ulint limit = 0;
-		for (ulint j = 0; j < buf_pool->n_chunks; j++) {
-			limit += buf_pool->chunks[j].size / 2;
+		for (ulint j = 0; j < buf_pool.n_chunks; j++) {
+			limit += buf_pool.chunks[j].size / 2;
 		}
 
-		while (buf_pool->n_pend_reads >= limit) {
-			os_aio_simulated_wake_handler_threads();
+		for (ulint count = 0; buf_pool.n_pend_reads >= limit; ) {
 			os_thread_sleep(10000);
 
-			count++;
-
-			if (!(count % 1000)) {
+			if (!(++count % 1000)) {
 
 				ib::error()
 					<< "Waited for " << count / 100
 					<< " seconds for "
-					<< buf_pool->n_pend_reads
+					<< buf_pool.n_pend_reads
 					<< " pending reads";
 			}
 		}
 
 		dberr_t err;
-
-		if (sync && i + 1 == n_stored) {
-			buf_read_page_low(
-				&err, true,
-				0,
-				BUF_READ_ANY_PAGE,
-				cur_page_id, zip_size, true);
-		} else {
-			buf_read_page_low(
-				&err, false,
-				IORequest::DO_NOT_WAKE,
-				BUF_READ_ANY_PAGE,
-				cur_page_id, zip_size, true);
-		}
+		space->reacquire();
+		buf_read_page_low(&err, space, false,
+				  BUF_READ_ANY_PAGE, cur_page_id, zip_size,
+				  true);
 
 		if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
 			ib::error() << "Recovery failed to read or decrypt "
@@ -914,8 +774,8 @@ buf_read_recv_pages(
 		}
 	}
 
-	os_aio_simulated_wake_handler_threads();
 
-	DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)",
-			      unsigned(n_stored)));
+        DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
+			      space->chain.start->name));
+	space->release();
 }
diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc
index fe849d8ae29..14a0b3e19fd 100644
--- a/storage/innobase/data/data0data.cc
+++ b/storage/innobase/data/data0data.cc
@@ -616,10 +616,11 @@ dtuple_convert_big_rec(
 	stored externally */
 
 	n_fields = 0;
-	ulint longest_i;
+	uint16_t longest_i;
+	ulint longest;
 
 	const bool mblob = entry->is_alter_metadata();
-	ut_ad(entry->n_fields >= index->first_user_field() + mblob);
+	ut_ad(entry->n_fields - mblob >= index->first_user_field());
 	ut_ad(entry->n_fields - mblob <= index->n_fields);
 
 	if (mblob) {
@@ -645,8 +646,9 @@ dtuple_convert_big_rec(
 				      dict_index_get_n_fields(index),
 				      zip_size)) {
 		longest_i = 0;
-		for (ulint i = index->first_user_field(), longest = 0;
-		     i + mblob < entry->n_fields; i++) {
+		longest = 0;
+		for (uint16_t i = index->first_user_field();
+		     i < entry->n_fields - mblob; i++) {
 			ulint	savings;
 			dfield = dtuple_get_nth_field(entry, i + mblob);
 
@@ -686,7 +688,7 @@ dtuple_convert_big_rec(
 				goto skip_field;
 			}
 
-			longest_i = i + mblob;
+			longest_i = uint16_t(i + mblob);
 			longest = savings;
 
 skip_field:
@@ -738,7 +740,7 @@ ext_write:
 			DEBUG_SYNC_C("ib_mv_nonupdated_column_offpage");
 
 			upd_field_t	upd_field;
-			upd_field.field_no = unsigned(longest_i);
+			upd_field.field_no = longest_i;
 			upd_field.orig_len = 0;
 			upd_field.exp = NULL;
 			upd_field.old_v_val = NULL;
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc
index 29db56599e2..bd2cf4ffdd8 100644
--- a/storage/innobase/dict/dict0boot.cc
+++ b/storage/innobase/dict/dict0boot.cc
@@ -35,24 +35,13 @@ Created 4/18/1996 Heikki Tuuri
 #include "log0recv.h"
 #include "os0file.h"
 
-/**********************************************************************//**
-Gets a pointer to the dictionary header and x-latches its page.
-@return pointer to the dictionary header, page x-latched */
-dict_hdr_t*
-dict_hdr_get(
-/*=========*/
-	mtr_t*	mtr)	/*!< in: mtr */
+/** @return the DICT_HDR block, x-latched */
+buf_block_t *dict_hdr_get(mtr_t* mtr)
 {
-	buf_block_t*	block;
-	dict_hdr_t*	header;
-
-	block = buf_page_get(page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO),
-			     0, RW_X_LATCH, mtr);
-	header = DICT_HDR + buf_block_get_frame(block);
-
-	buf_block_dbg_add_level(block, SYNC_DICT_HEADER);
-
-	return(header);
+  buf_block_t *block= buf_page_get(page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO),
+				   0, RW_X_LATCH, mtr);
+  buf_block_dbg_add_level(block, SYNC_DICT_HEADER);
+  return block;
 }
 
 /**********************************************************************//**
@@ -67,36 +56,41 @@ dict_hdr_get_new_id(
 	ulint*			space_id)	/*!< out: space id
 						(not assigned if NULL) */
 {
-	dict_hdr_t*	dict_hdr;
 	ib_id_t		id;
 	mtr_t		mtr;
 
-	mtr_start(&mtr);
-	dict_hdr = dict_hdr_get(&mtr);
+	mtr.start();
+	buf_block_t* dict_hdr = dict_hdr_get(&mtr);
 
 	if (table_id) {
-		id = mach_read_from_8(dict_hdr + DICT_HDR_TABLE_ID);
+		id = mach_read_from_8(DICT_HDR + DICT_HDR_TABLE_ID
+				      + dict_hdr->frame);
 		id++;
-		mlog_write_ull(dict_hdr + DICT_HDR_TABLE_ID, id, &mtr);
+		mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_TABLE_ID
+			     + dict_hdr->frame, id);
 		*table_id = id;
 	}
 
 	if (index_id) {
-		id = mach_read_from_8(dict_hdr + DICT_HDR_INDEX_ID);
+		id = mach_read_from_8(DICT_HDR + DICT_HDR_INDEX_ID
+				      + dict_hdr->frame);
 		id++;
-		mlog_write_ull(dict_hdr + DICT_HDR_INDEX_ID, id, &mtr);
+		mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_INDEX_ID
+			     + dict_hdr->frame, id);
 		*index_id = id;
 	}
 
 	if (space_id) {
-		*space_id = mach_read_from_4(dict_hdr + DICT_HDR_MAX_SPACE_ID);
+		*space_id = mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID
+					     + dict_hdr->frame);
 		if (fil_assign_new_space_id(space_id)) {
-			mlog_write_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
-					 *space_id, MLOG_4BYTES, &mtr);
+			mtr.write<4>(*dict_hdr,
+				     DICT_HDR + DICT_HDR_MAX_SPACE_ID
+				     + dict_hdr->frame, *space_id);
 		}
 	}
 
-	mtr_commit(&mtr);
+	mtr.commit();
 }
 
 /**********************************************************************//**
@@ -106,7 +100,6 @@ void
 dict_hdr_flush_row_id(void)
 /*=======================*/
 {
-	dict_hdr_t*	dict_hdr;
 	row_id_t	id;
 	mtr_t		mtr;
 
@@ -114,13 +107,13 @@ dict_hdr_flush_row_id(void)
 
 	id = dict_sys.row_id;
 
-	mtr_start(&mtr);
+	mtr.start();
 
-	dict_hdr = dict_hdr_get(&mtr);
+	buf_block_t* d = dict_hdr_get(&mtr);
 
-	mlog_write_ull(dict_hdr + DICT_HDR_ROW_ID, id, &mtr);
+	mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->frame, id);
 
-	mtr_commit(&mtr);
+	mtr.commit();
 }
 
 /*****************************************************************//**
@@ -134,7 +127,6 @@ dict_hdr_create(
 	mtr_t*	mtr)	/*!< in: mtr */
 {
 	buf_block_t*	block;
-	dict_hdr_t*	dict_header;
 	ulint		root_page_no;
 
 	ut_ad(mtr);
@@ -145,26 +137,24 @@ dict_hdr_create(
 	block = fseg_create(fil_system.sys_space,
 			    DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
 
-	ut_a(DICT_HDR_PAGE_NO == block->page.id.page_no());
+	ut_a(block->page.id() == page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO));
 
-	dict_header = dict_hdr_get(mtr);
+	buf_block_t* d = dict_hdr_get(mtr);
 
 	/* Start counting row, table, index, and tree ids from
 	DICT_HDR_FIRST_ID */
-	mlog_write_ull(dict_header + DICT_HDR_ROW_ID,
-		       DICT_HDR_FIRST_ID, mtr);
-
-	mlog_write_ull(dict_header + DICT_HDR_TABLE_ID,
-		       DICT_HDR_FIRST_ID, mtr);
+	mtr->write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->frame,
+		      DICT_HDR_FIRST_ID);
+	mtr->write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->frame,
+		      DICT_HDR_FIRST_ID);
+	mtr->write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->frame,
+		      DICT_HDR_FIRST_ID);
 
-	mlog_write_ull(dict_header + DICT_HDR_INDEX_ID,
-		       DICT_HDR_FIRST_ID, mtr);
-
-	ut_ad(mach_read_from_4(dict_header + DICT_HDR_MAX_SPACE_ID) == 0);
+	ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + d->frame));
 
 	/* Obsolete, but we must initialize it anyway. */
-	mlog_write_ulint(dict_header + DICT_HDR_MIX_ID_LOW,
-			 DICT_HDR_FIRST_ID, MLOG_4BYTES, mtr);
+	mtr->write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->frame,
+		      DICT_HDR_FIRST_ID);
 
 	/* Create the B-tree roots for the clustered indexes of the basic
 	system tables */
@@ -172,58 +162,56 @@ dict_hdr_create(
 	/*--------------------------*/
 	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
 				  fil_system.sys_space, DICT_TABLES_ID,
-				  dict_ind_redundant, mtr);
+				  nullptr, mtr);
 	if (root_page_no == FIL_NULL) {
 
 		return(FALSE);
 	}
 
-	mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no,
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->frame, root_page_no);
 	/*--------------------------*/
 	root_page_no = btr_create(DICT_UNIQUE,
 				  fil_system.sys_space, DICT_TABLE_IDS_ID,
-				  dict_ind_redundant, mtr);
+				  nullptr, mtr);
 	if (root_page_no == FIL_NULL) {
 
 		return(FALSE);
 	}
 
-	mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no,
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->frame,
+		      root_page_no);
 	/*--------------------------*/
 	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
 				  fil_system.sys_space, DICT_COLUMNS_ID,
-				  dict_ind_redundant, mtr);
+				  nullptr, mtr);
 	if (root_page_no == FIL_NULL) {
 
 		return(FALSE);
 	}
 
-	mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no,
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->frame,
+		      root_page_no);
 	/*--------------------------*/
 	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
 				  fil_system.sys_space, DICT_INDEXES_ID,
-				  dict_ind_redundant, mtr);
+				  nullptr, mtr);
 	if (root_page_no == FIL_NULL) {
 
 		return(FALSE);
 	}
 
-	mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no,
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->frame,
+		      root_page_no);
 	/*--------------------------*/
 	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
 				  fil_system.sys_space, DICT_FIELDS_ID,
-				  dict_ind_redundant, mtr);
+				  nullptr, mtr);
 	if (root_page_no == FIL_NULL) {
 
 		return(FALSE);
 	}
 
-	mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no,
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->frame, root_page_no);
 	/*--------------------------*/
 
 	return(TRUE);
@@ -239,7 +227,6 @@ dict_boot(void)
 {
 	dict_table_t*	table;
 	dict_index_t*	index;
-	dict_hdr_t*	dict_hdr;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
@@ -271,7 +258,7 @@ dict_boot(void)
 	mutex_enter(&dict_sys.mutex);
 
 	/* Get the dictionary header */
-	dict_hdr = dict_hdr_get(&mtr);
+	const byte* dict_hdr = &dict_hdr_get(&mtr)->frame[DICT_HDR];
 
 	/* Because we only write new row ids to disk-based data structure
 	(dictionary header) when it is divisible by
@@ -286,6 +273,11 @@ dict_boot(void)
 	dict_sys.row_id = DICT_HDR_ROW_ID_WRITE_MARGIN
 		+ ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID),
 				     DICT_HDR_ROW_ID_WRITE_MARGIN);
+	if (ulint max_space_id = mach_read_from_4(dict_hdr
+						  + DICT_HDR_MAX_SPACE_ID)) {
+		max_space_id--;
+		fil_assign_new_space_id(&max_space_id);
+	}
 
 	/* Insert into the dictionary cache the descriptions of the basic
 	system tables */
@@ -325,8 +317,8 @@ dict_boot(void)
 		index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES));
 	ut_a(error == DB_SUCCESS);
 	ut_ad(!table->is_instant());
-	table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES(
-		unsigned(table->indexes.start->n_nullable));
+	table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
 
 	/*-------------------------*/
 	index = dict_mem_index_create(table, "ID_IND", DICT_UNIQUE, 1);
@@ -367,8 +359,8 @@ dict_boot(void)
 		index, mach_read_from_4(dict_hdr + DICT_HDR_COLUMNS));
 	ut_a(error == DB_SUCCESS);
 	ut_ad(!table->is_instant());
-	table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES(
-		unsigned(table->indexes.start->n_nullable));
+	table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
 
 	/*-------------------------*/
 	table = dict_mem_table_create("SYS_INDEXES", fil_system.sys_space,
@@ -410,8 +402,8 @@ dict_boot(void)
 		index, mach_read_from_4(dict_hdr + DICT_HDR_INDEXES));
 	ut_a(error == DB_SUCCESS);
 	ut_ad(!table->is_instant());
-	table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES(
-		unsigned(table->indexes.start->n_nullable));
+	table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
 
 	/*-------------------------*/
 	table = dict_mem_table_create("SYS_FIELDS", fil_system.sys_space,
@@ -439,8 +431,8 @@ dict_boot(void)
 		index, mach_read_from_4(dict_hdr + DICT_HDR_FIELDS));
 	ut_a(error == DB_SUCCESS);
 	ut_ad(!table->is_instant());
-	table->indexes.start->n_core_null_bytes = UT_BITS_IN_BYTES(
-		unsigned(table->indexes.start->n_nullable));
+	table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
 
 	mtr_commit(&mtr);
 
@@ -448,36 +440,15 @@ dict_boot(void)
 
 	/* Initialize the insert buffer table and index for each tablespace */
 
-	dberr_t	err = DB_SUCCESS;
-
-	err = ibuf_init_at_db_start();
+	dberr_t	err = ibuf_init_at_db_start();
 
 	if (err == DB_SUCCESS) {
-		if (srv_read_only_mode
-		    && srv_force_recovery != SRV_FORCE_NO_LOG_REDO
-		    && !ibuf_is_empty()) {
-
-			if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
-				ib::error() << "Change buffer must be empty when"
-					" --innodb-read-only is set!"
-					"You can try to recover the database with innodb_force_recovery=5";
-
-				err = DB_ERROR;
-			} else {
-				ib::warn() << "Change buffer not empty when --innodb-read-only "
-					"is set! but srv_force_recovery = " << srv_force_recovery
-					   << " , ignoring.";
-			}
-		}
+		/* Load definitions of other indexes on system tables */
 
-		if (err == DB_SUCCESS) {
-			/* Load definitions of other indexes on system tables */
-
-			dict_load_sys_table(dict_sys.sys_tables);
-			dict_load_sys_table(dict_sys.sys_columns);
-			dict_load_sys_table(dict_sys.sys_indexes);
-			dict_load_sys_table(dict_sys.sys_fields);
-		}
+		dict_load_sys_table(dict_sys.sys_tables);
+		dict_load_sys_table(dict_sys.sys_columns);
+		dict_load_sys_table(dict_sys.sys_indexes);
+		dict_load_sys_table(dict_sys.sys_fields);
 	}
 
 	mutex_exit(&dict_sys.mutex);
diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc
index be0e0cd7adb..55e3191c228 100644
--- a/storage/innobase/dict/dict0crea.cc
+++ b/storage/innobase/dict/dict0crea.cc
@@ -230,7 +230,7 @@ dict_create_sys_columns_tuple(
 		col_name = dict_table_get_col_name(table, i);
 	}
 
-	dfield_set_data(dfield, col_name, ut_strlen(col_name));
+	dfield_set_data(dfield, col_name, strlen(col_name));
 
 	/* 5: MTYPE --------------------------*/
 	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE);
@@ -375,16 +375,18 @@ dict_build_table_def_step(
 			mtr.start();
 			undo->table_id = trx->table_id;
 			undo->dict_operation = TRUE;
-			page_t* page = trx_undo_page_get(
+			buf_block_t* block = trx_undo_page_get(
 				page_id_t(trx->rsegs.m_redo.rseg->space->id,
 					  undo->hdr_page_no),
 				&mtr);
-			mlog_write_ulint(page + undo->hdr_offset
-					 + TRX_UNDO_DICT_TRANS,
-					 TRUE, MLOG_1BYTE, &mtr);
-			mlog_write_ull(page + undo->hdr_offset
-				       + TRX_UNDO_TABLE_ID,
-				       trx->table_id, &mtr);
+			mtr.write<1,mtr_t::MAYBE_NOP>(
+				*block,
+				block->frame + undo->hdr_offset
+				+ TRX_UNDO_DICT_TRANS, 1U);
+			mtr.write<8,mtr_t::MAYBE_NOP>(
+				*block,
+				block->frame + undo->hdr_offset
+				+ TRX_UNDO_TABLE_ID, trx->table_id);
 			mtr.commit();
 			log_write_up_to(mtr.commit_lsn(), true);
 		}
@@ -653,8 +655,7 @@ dict_create_sys_fields_tuple(
 	/* 4: COL_NAME -------------------------*/
 	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME);
 
-	dfield_set_data(dfield, field->name,
-			ut_strlen(field->name));
+	dfield_set_data(dfield, field->name, strlen(field->name));
 	/*---------------------------------*/
 
 	return(entry);
@@ -854,14 +855,13 @@ dict_create_index_tree_step(
 				err = DB_OUT_OF_FILE_SPACE; );
 	}
 
-	ulint   len;
-	byte*   data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur),
+	ulint	len;
+	byte*	data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur),
 					     DICT_FLD__SYS_INDEXES__PAGE_NO,
 					     &len);
 	ut_ad(len == 4);
-	if (mach_read_from_4(data) != node->page_no) {
-		mlog_write_ulint(data, node->page_no, MLOG_4BYTES, &mtr);
-	}
+	mtr.write<4,mtr_t::MAYBE_NOP>(*btr_pcur_get_block(&pcur), data,
+				      node->page_no);
 
 	mtr.commit();
 
@@ -901,12 +901,12 @@ dict_create_index_tree_in_mem(
 }
 
 /** Drop the index tree associated with a row in SYS_INDEXES table.
-@param[in,out]	rec	SYS_INDEXES record
 @param[in,out]	pcur	persistent cursor on rec
 @param[in,out]	trx	dictionary transaction
 @param[in,out]	mtr	mini-transaction */
-void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
+void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
 {
+	rec_t*	rec = btr_pcur_get_rec(pcur);
 	byte*	ptr;
 	ulint	len;
 
@@ -927,7 +927,7 @@ void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
 	}
 
 	compile_time_assert(FIL_NULL == 0xffffffff);
-	mlog_memset(ptr, 4, 0xff, mtr);
+	mtr->memset(btr_pcur_get_block(pcur), page_offset(ptr), 4, 0xff);
 
 	ptr = rec_get_nth_field_old(
 		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
@@ -948,10 +948,10 @@ void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
 
 	ut_ad(len == 8);
 
-	if (fil_space_t* s = fil_space_acquire_silent(space_id)) {
+	if (fil_space_t* s = fil_space_t::get(space_id)) {
 		/* Ensure that the tablespace file exists
 		in order to avoid a crash in buf_page_get_gen(). */
-		if (s->size || fil_space_get_size(space_id)) {
+		if (root_page_no < s->get_size()) {
 			btr_free_if_exists(page_id_t(space_id, root_page_no),
 					   s->zip_size(),
 					   mach_read_from_8(ptr), mtr);
@@ -1262,8 +1262,8 @@ dict_create_index_step(
 			  ? dict_index_t::NO_CORE_NULL_BYTES
 			  : UT_BITS_IN_BYTES(
 				  unsigned(node->index->n_nullable))));
-		node->index->n_core_null_bytes = UT_BITS_IN_BYTES(
-			unsigned(node->index->n_nullable));
+		node->index->n_core_null_bytes = static_cast<uint8_t>(
+			UT_BITS_IN_BYTES(unsigned(node->index->n_nullable)));
 		node->state = INDEX_CREATE_INDEX_TREE;
 	}
 
@@ -1358,7 +1358,7 @@ dict_check_if_system_table_exists(
 	dict_table_t*	sys_table;
 	dberr_t		error = DB_SUCCESS;
 
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_ad(!srv_any_background_activity());
 
 	mutex_enter(&dict_sys.mutex);
 
@@ -1398,7 +1398,7 @@ dict_create_or_check_foreign_constraint_tables(void)
 	dberr_t		sys_foreign_err;
 	dberr_t		sys_foreign_cols_err;
 
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_ad(!srv_any_background_activity());
 
 	/* Note: The master thread has not been started at this point. */
 
@@ -1539,7 +1539,7 @@ dict_create_or_check_sys_virtual()
 	my_bool		srv_file_per_table_backup;
 	dberr_t		err;
 
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_ad(!srv_any_background_activity());
 
 	/* Note: The master thread has not been started at this point. */
 	err = dict_check_if_system_table_exists(
@@ -2065,7 +2065,7 @@ dict_create_or_check_sys_tablespace(void)
 	dberr_t		sys_tablespaces_err;
 	dberr_t		sys_datafiles_err;
 
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_ad(!srv_any_background_activity());
 
 	/* Note: The master thread has not been started at this point. */
 
diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc
index 7e61e298ac6..0d9cb185b81 100644
--- a/storage/innobase/dict/dict0defrag_bg.cc
+++ b/storage/innobase/dict/dict0defrag_bg.cc
@@ -44,7 +44,6 @@ typedef defrag_pool_t::iterator		defrag_pool_iterator_t;
 by background defragmentation. */
 defrag_pool_t			defrag_pool;
 
-extern bool dict_stats_start_shutdown;
 
 /*****************************************************************//**
 Initialize the defrag pool, called once during thread initialization. */
@@ -134,10 +133,11 @@ dict_stats_defrag_pool_add(
 	item.table_id = index->table->id;
 	item.index_id = index->id;
 	defrag_pool.push_back(item);
-
+	if (defrag_pool.size() == 1) {
+		/* Kick off dict stats optimizer work */
+		dict_stats_schedule_now();
+	}
 	mutex_exit(&defrag_pool_mutex);
-
-	os_event_set(dict_stats_event);
 }
 
 /*****************************************************************//**
@@ -224,7 +224,7 @@ void
 dict_defrag_process_entries_from_defrag_pool()
 /*==========================================*/
 {
-	while (defrag_pool.size() && !dict_stats_start_shutdown) {
+	while (defrag_pool.size()) {
 		dict_stats_process_entry_from_defrag_pool();
 	}
 }
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 8357732e73b..9c1750c588f 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -36,14 +36,9 @@ Created 1/8/1996 Heikki Tuuri
 #include "fts0fts.h"
 #include "fil0fil.h"
 #include <algorithm>
-
-/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
-dict_index_t*	dict_ind_redundant;
-
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/** Flag to control insert buffer debugging. */
-extern uint	ibuf_debug;
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+#include "sql_class.h"
+#include "sql_table.h"
+#include <mysql/service_thd_mdl.h>
 
 #include "btr0btr.h"
 #include "btr0cur.h"
@@ -205,21 +200,6 @@ dict_remove_db_name(
 	return(s + 1);
 }
 
-/********************************************************************//**
-Get the database name length in a table name.
-@return database name length */
-ulint
-dict_get_db_name_len(
-/*=================*/
-	const char*	name)	/*!< in: table name in the form
-				dbname '/' tablename */
-{
-	const char*	s;
-	s = strchr(name, '/');
-	ut_a(s);
-	return ulint(s - name);
-}
-
 /** Open a persistent table.
 @param[in]	table_id	persistent table identifier
 @param[in]	ignore_err	errors to ignore
@@ -311,16 +291,21 @@ dict_table_try_drop_aborted_and_mutex_exit(
 	}
 }
 
-/********************************************************************//**
-Decrements the count of open handles to a table. */
+/** Decrements the count of open handles of a table.
+@param[in,out]	table		table
+@param[in]	dict_locked	data dictionary locked
+@param[in]	try_drop	try to drop any orphan indexes after
+				an aborted online index creation
+@param[in]	thd		thread to release MDL
+@param[in]	mdl		metadata lock or NULL if the thread
+				is a foreground one. */
 void
 dict_table_close(
-/*=============*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
-	ibool		try_drop)	/*!< in: TRUE=try to drop any orphan
-					indexes after an aborted online
-					index creation */
+	dict_table_t*	table,
+	bool		dict_locked,
+	bool		try_drop,
+	THD*		thd,
+	MDL_ticket*	mdl)
 {
 	if (!dict_locked) {
 		mutex_enter(&dict_sys.mutex);
@@ -362,6 +347,12 @@ dict_table_close(
 			dict_table_try_drop_aborted(NULL, table_id, 0);
 		}
 	}
+
+	if (!thd || !mdl) {
+	} else if (MDL_context *mdl_context= static_cast<MDL_context*>(
+			   thd_mdl_context(thd))) {
+		mdl_context->release_lock(mdl);
+	}
 }
 
 /********************************************************************//**
@@ -382,7 +373,7 @@ dict_table_close_and_drop(
 	ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
 
-	dict_table_close(table, TRUE, FALSE);
+	dict_table_close(table, true, false);
 
 #if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
 	/* Nobody should have initialized the stats of the newly created
@@ -726,17 +717,222 @@ dict_index_get_nth_field_pos(
 	return(ULINT_UNDEFINED);
 }
 
-/**********************************************************************//**
-Returns a table object based on table id.
+/** Parse the table file name into table name and database name.
+@tparam		dict_locked	whether dict_sys.mutex is being held
+@param[in,out]	db_name		database name buffer
+@param[in,out]	tbl_name	table name buffer
+@param[out]	db_name_len	database name length
+@param[out]	tbl_name_len	table name length (not assigned on false)
+@return whether the table name is visible to SQL (false for #sql names) */
+template<bool dict_locked>
+bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
+                              char (&tbl_name)[NAME_LEN + 1],
+                              size_t *db_name_len, size_t *tbl_name_len) const
+{
+  char db_buf[MAX_DATABASE_NAME_LEN + 1];
+  char tbl_buf[MAX_TABLE_NAME_LEN + 1];
+
+  if (!dict_locked)
+    mutex_enter(&dict_sys.mutex); /* protect against renaming */
+  else
+    ut_ad(mutex_own(&dict_sys.mutex));
+  const size_t db_len= name.dblen();
+  ut_ad(db_len <= MAX_DATABASE_NAME_LEN);
+
+  memcpy(db_buf, name.m_name, db_len);
+  db_buf[db_len]= 0;
+
+  size_t tbl_len= strlen(name.m_name + db_len + 1);
+  /* The #sql prefix is in the table part, which follows "dbname/". */
+  const bool is_temp= tbl_len > TEMP_FILE_PREFIX_LENGTH && !strncmp(
+    name.m_name + db_len + 1, TEMP_FILE_PREFIX, TEMP_FILE_PREFIX_LENGTH);
+
+  /* Cut a regular name at its first '#' (presumably a partition suffix). */
+  if (is_temp);
+  else if (const char *is_part= static_cast<const char*>
+           (memchr(name.m_name + db_len + 1, '#', tbl_len)))
+    tbl_len= static_cast<size_t>(is_part - &name.m_name[db_len + 1]);
+
+  memcpy(tbl_buf, name.m_name + db_len + 1, tbl_len);
+  tbl_buf[tbl_len]= 0;
+
+  if (!dict_locked)
+    mutex_exit(&dict_sys.mutex);
+
+  *db_name_len= filename_to_tablename(db_buf, db_name,
+                                      MAX_DATABASE_NAME_LEN + 1, true);
+
+  if (is_temp)
+    return false;
+
+  *tbl_name_len= filename_to_tablename(tbl_buf, tbl_name,
+                                       MAX_TABLE_NAME_LEN + 1, true);
+  return true;
+}
+
+template bool
+dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
+                           size_t*, size_t*) const;
+
+/** Acquire a shared metadata lock (MDL) on the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out]  table           table object
+@param[in,out]  thd             background thread
+@param[out]     mdl             mdl ticket
+@param[in]      table_op        operation to perform when opening
+@return table object after acquiring the shared MDL
+@retval nullptr if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+                        THD *thd,
+                        MDL_ticket **mdl,
+                        dict_table_op_t table_op)
+{
+  if (!table || !mdl)
+    return table;
+
+  MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+  size_t db_len;
+
+  if (trylock)
+  {
+    mutex_enter(&dict_sys.mutex);
+    db_len= dict_get_db_name_len(table->name.m_name);
+    mutex_exit(&dict_sys.mutex);
+  }
+  else
+  {
+    ut_ad(mutex_own(&dict_sys.mutex));
+    db_len= dict_get_db_name_len(table->name.m_name);
+  }
+
+  if (db_len == 0)
+    return table; /* InnoDB system tables are not covered by MDL */
+
+  if (!mdl_context)
+    return nullptr;
+
+  table_id_t table_id= table->id;
+  char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
+  char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
+  size_t tbl_len;
+  bool unaccessible= false;
+
+  if (!table->parse_name<!trylock>(db_buf, tbl_buf, &db_len, &tbl_len))
+    /* The name of an intermediate table starts with #sql */
+    return table;
+
+retry:
+  if (!unaccessible && (!table->is_readable() || table->corrupted))
+  {
+is_unaccessible:
+    if (*mdl)
+    {
+      mdl_context->release_lock(*mdl);
+      *mdl= nullptr;
+    }
+    unaccessible= true;
+  }
+
+  if (!trylock)
+    table->release();
+
+  if (unaccessible)
+    return nullptr;
+
+  if (!trylock)
+    mutex_exit(&dict_sys.mutex);
+  {
+    MDL_request request;
+    MDL_REQUEST_INIT(&request,MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED, MDL_EXPLICIT);
+    if (trylock
+        ? mdl_context->try_acquire_lock(&request)
+        : mdl_context->acquire_lock(&request,
+                                    /* FIXME: use compatible type, and maybe
+                                    remove this parameter altogether! */
+                                    static_cast<double>(global_system_variables
+                                                        .lock_wait_timeout)))
+    {
+      *mdl= nullptr;
+      if (trylock)
+        return nullptr;
+    }
+    else
+      *mdl= request.ticket;
+  }
+
+  if (!trylock)
+    mutex_enter(&dict_sys.mutex);
+  else if (!*mdl)
+    return nullptr;
+
+  table= dict_table_open_on_id(table_id, !trylock, table_op);
+
+  if (!table)
+  {
+    /* The table was dropped. */
+    if (*mdl)
+    {
+      mdl_context->release_lock(*mdl);
+      *mdl= nullptr;
+    }
+    return nullptr;
+  }
+
+  if (!table->is_accessible())
+    goto is_unaccessible;
+
+  size_t db1_len, tbl1_len;
+
+  if (!table->parse_name<!trylock>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
+  {
+    /* The table was renamed to #sql prefix.
+    Release MDL (if any) for the old name and return. */
+    if (*mdl)
+    {
+      mdl_context->release_lock(*mdl);
+      *mdl= nullptr;
+    }
+    return table;
+  }
+
+  if (*mdl)
+  {
+    if (db_len == db1_len && tbl_len == tbl1_len &&
+        !memcmp(db_buf, db_buf1, db_len) &&
+        !memcmp(tbl_buf, tbl_buf1, tbl_len))
+      return table;
+
+    /* The table was renamed. Release MDL for the old name and
+    try to acquire MDL for the new name. */
+    mdl_context->release_lock(*mdl);
+    *mdl= nullptr;
+  }
+
+  db_len= db1_len;
+  tbl_len= tbl1_len;
+
+  memcpy(tbl_buf, tbl_buf1, tbl_len + 1);
+  memcpy(db_buf, db_buf1, db_len + 1);
+  goto retry;
+}
+
+template dict_table_t*
+dict_acquire_mdl_shared<true>(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+
+/** Look up a table by numeric identifier.
+@param[in]      table_id        table identifier
+@param[in]      dict_locked     data dictionary locked
+@param[in]      table_op        operation to perform when opening
+@param[in,out]  thd             background thread, or NULL to not acquire MDL
+@param[out]     mdl             mdl ticket, or NULL
 @return table, NULL if does not exist */
 dict_table_t*
-dict_table_open_on_id(
-/*==================*/
-	table_id_t	table_id,	/*!< in: table id */
-	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
-	dict_table_op_t	table_op)	/*!< in: operation to perform */
+dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+                      dict_table_op_t table_op, THD *thd,
+                      MDL_ticket **mdl)
 {
-	dict_table_t*	table;
+	ut_ad(!dict_locked || !thd);
 
 	if (!dict_locked) {
 		mutex_enter(&dict_sys.mutex);
@@ -744,7 +940,7 @@ dict_table_open_on_id(
 
 	ut_ad(mutex_own(&dict_sys.mutex));
 
-	table = dict_table_open_on_id_low(
+	dict_table_t* table = dict_table_open_on_id_low(
 		table_id,
 		table_op == DICT_TABLE_OP_LOAD_TABLESPACE
 		? DICT_ERR_IGNORE_RECOVER_LOCK
@@ -757,25 +953,32 @@ dict_table_open_on_id(
 	}
 
 	if (!dict_locked) {
+		if (thd) {
+			table = dict_acquire_mdl_shared<false>(
+				table, thd, mdl, table_op);
+		}
+
 		dict_table_try_drop_aborted_and_mutex_exit(
 			table, table_op == DICT_TABLE_OP_DROP_ORPHAN);
 	}
 
-	return(table);
+	return table;
 }
 
 /********************************************************************//**
 Looks for column n position in the clustered index.
 @return position in internal representation of the clustered index */
-ulint
+unsigned
 dict_table_get_nth_col_pos(
 /*=======================*/
 	const dict_table_t*	table,	/*!< in: table */
 	ulint			n,	/*!< in: column number */
 	ulint*			prefix_col_pos)
 {
-	return(dict_index_get_nth_col_pos(dict_table_get_first_index(table),
-					  n, prefix_col_pos));
+  ulint pos= dict_index_get_nth_col_pos(dict_table_get_first_index(table),
+					n, prefix_col_pos);
+  DBUG_ASSERT(pos <= dict_index_t::MAX_N_FIELDS);
+  return static_cast<unsigned>(pos);
 }
 
 /********************************************************************//**
@@ -826,9 +1029,9 @@ void dict_sys_t::create()
   const ulint hash_size = buf_pool_get_curr_size()
     / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
 
-  table_hash= hash_create(hash_size);
-  table_id_hash= hash_create(hash_size);
-  temp_id_hash= hash_create(hash_size);
+  table_hash.create(hash_size);
+  table_id_hash.create(hash_size);
+  temp_id_hash.create(hash_size);
 
   rw_lock_create(dict_operation_lock_key, &latch, SYNC_DICT_OPERATION);
 
@@ -990,24 +1193,24 @@ inline void dict_sys_t::add(dict_table_t* table)
 	/* Look for a table with the same name: error if such exists */
 	{
 		dict_table_t*	table2;
-		HASH_SEARCH(name_hash, table_hash, fold,
+		HASH_SEARCH(name_hash, &table_hash, fold,
 			    dict_table_t*, table2, ut_ad(table2->cached),
 			    !strcmp(table2->name.m_name, table->name.m_name));
 		ut_a(table2 == NULL);
 
 #ifdef UNIV_DEBUG
 		/* Look for the same table pointer with a different name */
-		HASH_SEARCH_ALL(name_hash, table_hash,
+		HASH_SEARCH_ALL(name_hash, &table_hash,
 				dict_table_t*, table2, ut_ad(table2->cached),
 				table2 == table);
 		ut_ad(table2 == NULL);
 #endif /* UNIV_DEBUG */
 	}
-	HASH_INSERT(dict_table_t, name_hash, table_hash, fold, table);
+	HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
 
 	/* Look for a table with the same id: error if such exists */
 	hash_table_t* id_hash = table->is_temporary()
-		? temp_id_hash : table_id_hash;
+		? &temp_id_hash : &table_id_hash;
 	const ulint id_fold = ut_fold_ull(table->id);
 	{
 		dict_table_t*	table2;
@@ -1084,7 +1287,6 @@ dict_index_t *dict_index_t::clone() const
   ut_ad(is_committed());
   ut_ad(!is_dummy);
   ut_ad(!parser);
-  ut_ad(!index_fts_syncing);
   ut_ad(!online_log);
   ut_ad(!rtr_track);
 
@@ -1314,9 +1516,9 @@ dict_table_rename_in_cache(
 
 	/* Look for a table with the same name: error if such exists */
 	dict_table_t*	table2;
-	HASH_SEARCH(name_hash, dict_sys.table_hash, fold,
+	HASH_SEARCH(name_hash, &dict_sys.table_hash, fold,
 			dict_table_t*, table2, ut_ad(table2->cached),
-			(ut_strcmp(table2->name.m_name, new_name) == 0));
+			(strcmp(table2->name.m_name, new_name) == 0));
 	DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure",
 		if (table2 == NULL) {
 			table2 = (dict_table_t*) -1;
@@ -1408,7 +1610,7 @@ dict_table_rename_in_cache(
 	}
 
 	/* Remove table from the hash tables of tables */
-	HASH_DELETE(dict_table_t, name_hash, dict_sys.table_hash,
+	HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
 		    ut_fold_string(old_name), table);
 
 	if (strlen(new_name) > strlen(table->name.m_name)) {
@@ -1423,7 +1625,7 @@ dict_table_rename_in_cache(
 	strcpy(table->name.m_name, new_name);
 
 	/* Add table to hash table of tables */
-	HASH_INSERT(dict_table_t, name_hash, dict_sys.table_hash, fold,
+	HASH_INSERT(dict_table_t, name_hash, &dict_sys.table_hash, fold,
 		    table);
 
 	if (!rename_also_foreigns) {
@@ -1480,8 +1682,8 @@ dict_table_rename_in_cache(
 			foreign->referenced_table->referenced_set.erase(foreign);
 		}
 
-		if (ut_strlen(foreign->foreign_table_name)
-		    < ut_strlen(table->name.m_name)) {
+		if (strlen(foreign->foreign_table_name)
+		    < strlen(table->name.m_name)) {
 			/* Allocate a longer name buffer;
 			TODO: store buf len to save memory */
 
@@ -1555,11 +1757,11 @@ dict_table_rename_in_cache(
 
 			old_id = mem_strdup(foreign->id);
 
-			if (ut_strlen(fkid) > ut_strlen(old_name_cs_filename)
+			if (strlen(fkid) > strlen(old_name_cs_filename)
 			    + ((sizeof dict_ibfk) - 1)
 			    && !memcmp(fkid, old_name_cs_filename,
-				       ut_strlen(old_name_cs_filename))
-			    && !memcmp(fkid + ut_strlen(old_name_cs_filename),
+				       strlen(old_name_cs_filename))
+			    && !memcmp(fkid + strlen(old_name_cs_filename),
 				       dict_ibfk, (sizeof dict_ibfk) - 1)) {
 
 				/* This is a generated >= 4.0.18 format id */
@@ -1600,7 +1802,7 @@ dict_table_rename_in_cache(
 				strcpy(foreign->id, table_name);
 				if (on_tmp) {
 					strcat(foreign->id,
-					       old_id + ut_strlen(old_name));
+					       old_id + strlen(old_name));
 				} else {
 					sprintf(strchr(foreign->id, '/') + 1,
 						"%s%s",
@@ -1626,8 +1828,8 @@ dict_table_rename_in_cache(
 				/* Replace the database prefix in id with the
 				one from table->name */
 
-				ut_memcpy(foreign->id,
-					  table->name.m_name, db_len);
+				memcpy(foreign->id,
+				       table->name.m_name, db_len);
 
 				strcpy(foreign->id + db_len,
 				       dict_remove_db_name(old_id));
@@ -1653,8 +1855,8 @@ dict_table_rename_in_cache(
 
 		foreign = *it;
 
-		if (ut_strlen(foreign->referenced_table_name)
-		    < ut_strlen(table->name.m_name)) {
+		if (strlen(foreign->referenced_table_name)
+		    < strlen(table->name.m_name)) {
 			/* Allocate a longer name buffer;
 			TODO: store buf len to save memory */
 
@@ -1691,12 +1893,12 @@ dict_table_change_id_in_cache(
 
 	/* Remove the table from the hash table of id's */
 
-	HASH_DELETE(dict_table_t, id_hash, dict_sys.table_id_hash,
+	HASH_DELETE(dict_table_t, id_hash, &dict_sys.table_id_hash,
 		    ut_fold_ull(table->id), table);
 	table->id = new_id;
 
 	/* Add the table back to the hash table */
-	HASH_INSERT(dict_table_t, id_hash, dict_sys.table_id_hash,
+	HASH_INSERT(dict_table_t, id_hash, &dict_sys.table_id_hash,
 		    ut_fold_ull(table->id), table);
 }
 
@@ -1741,11 +1943,11 @@ void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep)
 
 	/* Remove table from the hash tables of tables */
 
-	HASH_DELETE(dict_table_t, name_hash, table_hash,
+	HASH_DELETE(dict_table_t, name_hash, &table_hash,
 		    ut_fold_string(table->name.m_name), table);
 
 	hash_table_t* id_hash = table->is_temporary()
-		? temp_id_hash : table_id_hash;
+		? &temp_id_hash : &table_id_hash;
 	const ulint id_fold = ut_fold_ull(table->id);
 	HASH_DELETE(dict_table_t, id_hash, id_hash, id_fold, table);
 
@@ -1877,8 +2079,8 @@ dict_index_add_to_cache(
 		new_index = (index->type & DICT_FTS)
 			? dict_index_build_internal_fts(index)
 			: dict_index_build_internal_non_clust(index);
-		new_index->n_core_null_bytes = UT_BITS_IN_BYTES(
-			unsigned(new_index->n_nullable));
+		new_index->n_core_null_bytes = static_cast<uint8_t>(
+			UT_BITS_IN_BYTES(unsigned(new_index->n_nullable)));
 	}
 
 	/* Set the n_fields value in new_index to the actual defined
@@ -2176,12 +2378,14 @@ dict_index_add_col(
 	field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1);
 
 	field->col = col;
-	field->fixed_len = static_cast<unsigned int>(
+	field->fixed_len = static_cast<uint16_t>(
 		dict_col_get_fixed_size(
-			col, dict_table_is_comp(table)));
+			col, dict_table_is_comp(table)))
+		& ((1U << 10) - 1);
 
 	if (prefix_len && field->fixed_len > prefix_len) {
-		field->fixed_len = (unsigned int) prefix_len;
+		field->fixed_len = static_cast<uint16_t>(prefix_len)
+			& ((1U << 10) - 1);
 	}
 
 	/* Long fixed-length fields that need external storage are treated as
@@ -2357,7 +2561,8 @@ dict_index_build_internal_clust(
 		new_index->n_uniq = new_index->n_def;
 	} else {
 		/* Also the row id is needed to identify the entry */
-		new_index->n_uniq = 1 + unsigned(new_index->n_def);
+		new_index->n_uniq = unsigned(new_index->n_def + 1)
+			& dict_index_t::MAX_N_FIELDS;
 	}
 
 	new_index->trx_id_offset = 0;
@@ -2407,7 +2612,8 @@ dict_index_build_internal_clust(
 		can theoretically occur. Check for it. */
 		fixed_size += new_index->trx_id_offset;
 
-		new_index->trx_id_offset = unsigned(fixed_size);
+		new_index->trx_id_offset = static_cast<unsigned>(fixed_size)
+			& ((1U << 12) - 1);
 
 		if (new_index->trx_id_offset != fixed_size) {
 			/* Overflow. Pretend that this is a
@@ -2457,7 +2663,8 @@ dict_index_build_internal_clust(
 
 	new_index->n_core_null_bytes = table->supports_instant()
 		? dict_index_t::NO_CORE_NULL_BYTES
-		: UT_BITS_IN_BYTES(unsigned(new_index->n_nullable));
+		: static_cast<uint8_t>(
+			UT_BITS_IN_BYTES(unsigned(new_index->n_nullable)));
 	new_index->cached = TRUE;
 
 	return(new_index);
@@ -2612,23 +2819,6 @@ dict_index_build_internal_fts(
 }
 /*====================== FOREIGN KEY PROCESSING ========================*/
 
-/** Check whether the dict_table_t is a partition.
-A partitioned table on the SQL level is composed of InnoDB tables,
-where each InnoDB table is a [sub]partition including its secondary indexes
-which belongs to the partition.
-@param[in]	table	Table to check.
-@return true if the dict_table_t is a partition else false. */
-UNIV_INLINE
-bool
-dict_table_is_partition(
-	const dict_table_t*	table)
-{
-	/* Check both P and p on all platforms in case it was moved to/from
-	WIN. */
-	return(strstr(table->name.m_name, "#p#")
-	       || strstr(table->name.m_name, "#P#"));
-}
-
 /*********************************************************************//**
 Checks if a table is referenced by foreign keys.
 @return TRUE if table is referenced by a foreign key */
@@ -3038,7 +3228,7 @@ dict_accept(
 
 	*success = TRUE;
 
-	return(ptr + ut_strlen(string));
+	return ptr + strlen(string);
 }
 
 /*********************************************************************//**
@@ -3165,106 +3355,70 @@ convert_id:
 }
 
 /*********************************************************************//**
-Tries to scan a column name.
-@return scanned to */
-static
-const char*
-dict_scan_col(
-/*==========*/
-	CHARSET_INFO*		cs,	/*!< in: the character set of ptr */
-	const char*		ptr,	/*!< in: scanned to */
-	ibool*			success,/*!< out: TRUE if success */
-	dict_table_t*		table,	/*!< in: table in which the column is */
-	const dict_col_t**	column,	/*!< out: pointer to column if success */
-	mem_heap_t*		heap,	/*!< in: heap where to allocate */
-	const char**		name)	/*!< out,own: the column name;
-					NULL if no name was scannable */
-{
-	ulint		i;
-
-	*success = FALSE;
-
-	ptr = dict_scan_id(cs, ptr, heap, name, FALSE, TRUE);
-
-	if (*name == NULL) {
-
-		return(ptr);	/* Syntax error */
-	}
-
-	if (table == NULL) {
-		*success = TRUE;
-		*column = NULL;
-	} else {
-		for (i = 0; i < dict_table_get_n_cols(table); i++) {
-
-			const char*	col_name = dict_table_get_col_name(
-				table, i);
-
-			if (0 == innobase_strcasecmp(col_name, *name)) {
-				/* Found */
-
-				*success = TRUE;
-				*column = dict_table_get_nth_col(table, i);
-				strcpy((char*) *name, col_name);
-
-				break;
-			}
-		}
-
-		for (i = 0; i < dict_table_get_n_v_cols(table); i++) {
-
-			const char*	col_name = dict_table_get_v_col_name(
-				table, i);
-
-			if (0 == innobase_strcasecmp(col_name, *name)) {
-				/* Found */
-				dict_v_col_t * vcol;
-				*success = TRUE;
-				vcol = dict_table_get_nth_v_col(table, i);
-				*column = &vcol->m_col;
-				strcpy((char*) *name, col_name);
-
-				break;
-			}
-		}
-	}
-
-	return(ptr);
-}
-
-/*********************************************************************//**
 Open a table from its database and table name, this is currently used by
 foreign constraint parser to get the referenced table.
 @return complete table name with database and table name, allocated from
 heap memory passed in */
 char*
 dict_get_referenced_table(
-/*======================*/
-	const char*	name,		/*!< in: foreign key table name */
-	const char*	database_name,	/*!< in: table db name */
-	ulint		database_name_len, /*!< in: db name length */
-	const char*	table_name,	/*!< in: table name */
-	ulint		table_name_len, /*!< in: table name length */
-	dict_table_t**	table,		/*!< out: table object or NULL */
-	mem_heap_t*	heap)		/*!< in/out: heap memory */
+	const char*    name,		  /*!< in: foreign key table name; gives db name if database_name is NULL */
+	const char*    database_name,	  /*!< in: table db name, or NULL */
+	ulint	       database_name_len, /*!< in: db name length */
+	const char*    table_name,	  /*!< in: table name (not NULL) */
+	ulint	       table_name_len,	  /*!< in: table name length */
+	dict_table_t** table,		  /*!< out: table object or NULL */
+	mem_heap_t*    heap,		  /*!< in/out: heap for the returned name */
+	CHARSET_INFO*  from_cs)		  /*!< in: charset of database_name and table_name */
 {
 	char*		ref;
-	const char*	db_name;
+	char		db_name[MAX_DATABASE_NAME_LEN];
+	char		tbl_name[MAX_TABLE_NAME_LEN];
+	CHARSET_INFO*	to_cs = &my_charset_filename;
+	uint		errors;
+	ut_ad(database_name || name);
+	ut_ad(table_name);
 
-	if (!database_name) {
+	if (!strncmp(table_name, srv_mysql50_table_name_prefix,
+		     sizeof(srv_mysql50_table_name_prefix) - 1)) {
+		/* This is a pre-5.1 table name
+		containing chars other than [A-Za-z0-9].
+		Discard the prefix and use raw UTF-8 encoding. */
+		table_name += sizeof(srv_mysql50_table_name_prefix) - 1;
+		table_name_len -= sizeof(srv_mysql50_table_name_prefix) - 1;
+
+		to_cs = system_charset_info;
+	}
+
+	table_name_len = strconvert(from_cs, table_name, table_name_len, to_cs,
+				    tbl_name, MAX_TABLE_NAME_LEN, &errors);
+	table_name     = tbl_name;
+
+	if (database_name) {
+		to_cs = &my_charset_filename;
+		if (!strncmp(database_name, srv_mysql50_table_name_prefix,
+			     sizeof(srv_mysql50_table_name_prefix) - 1)) {
+			database_name
+				+= sizeof(srv_mysql50_table_name_prefix) - 1;
+			database_name_len
+				-= sizeof(srv_mysql50_table_name_prefix) - 1;
+			to_cs = system_charset_info;
+		}
+
+		database_name_len = strconvert(
+			from_cs, database_name, database_name_len, to_cs,
+			db_name, MAX_DATABASE_NAME_LEN, &errors);
+		database_name = db_name;
+	} else {
 		/* Use the database name of the foreign key table */
 
-		db_name = name;
+		database_name = name;
 		database_name_len = dict_get_db_name_len(name);
-	} else {
-		db_name = database_name;
 	}
 
 	/* Copy database_name, '/', table_name, '\0' */
-	ref = static_cast<char*>(
-		mem_heap_alloc(heap, database_name_len + table_name_len + 2));
-
-	memcpy(ref, db_name, database_name_len);
+	ref = static_cast<char*>(mem_heap_alloc(
+		heap, database_name_len + table_name_len + 2));
+	memcpy(ref, database_name, database_name_len);
 	ref[database_name_len] = '/';
 	memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
 
@@ -3274,7 +3428,7 @@ dict_get_referenced_table(
 	if (innobase_get_lower_case_table_names() == 2) {
 		innobase_casedn_str(ref);
 		*table = dict_table_get_low(ref);
-		memcpy(ref, db_name, database_name_len);
+		memcpy(ref, database_name, database_name_len);
 		ref[database_name_len] = '/';
 		memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
 
@@ -3291,105 +3445,6 @@ dict_get_referenced_table(
 
 	return(ref);
 }
-/*********************************************************************//**
-Scans a table name from an SQL string.
-@return scanned to */
-static
-const char*
-dict_scan_table_name(
-/*=================*/
-	CHARSET_INFO*	cs,	/*!< in: the character set of ptr */
-	const char*	ptr,	/*!< in: scanned to */
-	dict_table_t**	table,	/*!< out: table object or NULL */
-	const char*	name,	/*!< in: foreign key table name */
-	ibool*		success,/*!< out: TRUE if ok name found */
-	mem_heap_t*	heap,	/*!< in: heap where to allocate the id */
-	const char**	ref_name)/*!< out,own: the table name;
-				NULL if no name was scannable */
-{
-	const char*	database_name	= NULL;
-	ulint		database_name_len = 0;
-	const char*	table_name	= NULL;
-	const char*	scan_name;
-
-	*success = FALSE;
-	*table = NULL;
-
-	ptr = dict_scan_id(cs, ptr, heap, &scan_name, TRUE, FALSE);
-
-	if (scan_name == NULL) {
-
-		return(ptr);	/* Syntax error */
-	}
-
-	if (*ptr == '.') {
-		/* We scanned the database name; scan also the table name */
-
-		ptr++;
-
-		database_name = scan_name;
-		database_name_len = strlen(database_name);
-
-		ptr = dict_scan_id(cs, ptr, heap, &table_name, TRUE, FALSE);
-
-		if (table_name == NULL) {
-
-			return(ptr);	/* Syntax error */
-		}
-	} else {
-		/* To be able to read table dumps made with InnoDB-4.0.17 or
-		earlier, we must allow the dot separator between the database
-		name and the table name also to appear within a quoted
-		identifier! InnoDB used to print a constraint as:
-		... REFERENCES `databasename.tablename` ...
-		starting from 4.0.18 it is
-		... REFERENCES `databasename`.`tablename` ... */
-		const char* s;
-
-		for (s = scan_name; *s; s++) {
-			if (*s == '.') {
-				database_name = scan_name;
-				database_name_len = ulint(s - scan_name);
-				scan_name = ++s;
-				break;/* to do: multiple dots? */
-			}
-		}
-
-		table_name = scan_name;
-	}
-
-	*ref_name = dict_get_referenced_table(
-		name, database_name, database_name_len,
-		table_name, strlen(table_name), table, heap);
-
-	*success = TRUE;
-	return(ptr);
-}
-
-/*********************************************************************//**
-Skips one id. The id is allowed to contain also '.'.
-@return scanned to */
-static
-const char*
-dict_skip_word(
-/*===========*/
-	CHARSET_INFO*	cs,	/*!< in: the character set of ptr */
-	const char*	ptr,	/*!< in: scanned to */
-	ibool*		success)/*!< out: TRUE if success, FALSE if just spaces
-				left in string or a syntax error */
-{
-	const char*	start;
-
-	*success = FALSE;
-
-	ptr = dict_scan_id(cs, ptr, NULL, &start, FALSE, TRUE);
-
-	if (start) {
-		*success = TRUE;
-	}
-
-	return(ptr);
-}
 
 /*********************************************************************//**
 Removes MySQL comments from an SQL string. A comment is either
@@ -3523,7 +3578,7 @@ dict_table_get_highest_foreign_id(
 
 	ut_a(table);
 
-	len = ut_strlen(table->name.m_name);
+	len = strlen(table->name.m_name);
 
 	for (dict_foreign_set::iterator it = table->foreign_set.begin();
 	     it != table->foreign_set.end();
@@ -3539,10 +3594,10 @@ dict_table_get_highest_foreign_id(
 				strchr(foreign->id, '/') + 1,
 				MAX_TABLE_NAME_LEN);
 
-		if (ut_strlen(fkid) > ((sizeof dict_ibfk) - 1) + len
-		    && 0 == ut_memcmp(fkid, table->name.m_name, len)
-		    && 0 == ut_memcmp(fkid + len,
-				      dict_ibfk, (sizeof dict_ibfk) - 1)
+		if (strlen(fkid) > ((sizeof dict_ibfk) - 1) + len
+		    && 0 == memcmp(fkid, table->name.m_name, len)
+		    && 0 == memcmp(fkid + len,
+				   dict_ibfk, (sizeof dict_ibfk) - 1)
 		    && fkid[len + ((sizeof dict_ibfk) - 1)] != '0') {
 			/* It is of the >= 4.0.18 format */
 
@@ -3565,1047 +3620,6 @@ dict_table_get_highest_foreign_id(
 	DBUG_RETURN(biggest_id);
 }
 
-/*********************************************************************//**
-Reports a simple foreign key create clause syntax error. */
-static
-void
-dict_foreign_report_syntax_err(
-/*===========================*/
-	const char*     fmt,		/*!< in: syntax err msg */
-	const char*	oper,		/*!< in: operation */
-	const char*	name,		/*!< in: table name */
-	const char*	start_of_latest_foreign,
-					/*!< in: start of the foreign key clause
-					in the SQL string */
-	const char*	ptr)		/*!< in: place of the syntax error */
-{
-	ut_ad(!srv_read_only_mode);
-
-	FILE*	ef = dict_foreign_err_file;
-
-	mutex_enter(&dict_foreign_err_mutex);
-	dict_foreign_error_report_low(ef, name);
-	fprintf(ef, fmt, oper, name, start_of_latest_foreign, ptr);
-	mutex_exit(&dict_foreign_err_mutex);
-}
-
-/*********************************************************************//**
-Push warning message to SQL-layer based on foreign key constraint
-index match error. */
-static
-void
-dict_foreign_push_index_error(
-/*==========================*/
-	trx_t*		trx,		/*!< in: trx */
-	const char*	operation,	/*!< in: operation create or alter
-					*/
-	const char*	create_name,	/*!< in: table name in create or
-					alter table */
-	const char*	latest_foreign,	/*!< in: start of latest foreign key
-					constraint name */
-	const char**	columns,	/*!< in: foreign key columns */
-	fkerr_t		index_error,	/*!< in: error code */
-	ulint		err_col,	/*!< in: column where error happened
-					*/
-	dict_index_t*	err_index,	/*!< in: index where error happened
-					*/
-	dict_table_t*	table,		/*!< in: table */
-	FILE*		ef)		/*!< in: output stream */
-{
-	switch (index_error) {
-	case FK_SUCCESS:
-		break;
-	case FK_INDEX_NOT_FOUND:
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. There is no index in the referenced"
-			" table where the referenced columns appear"
-			" as the first columns near '%s'.\n",
-			operation, create_name, latest_foreign);
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. There is no index in the referenced"
-			" table where the referenced columns appear"
-			" as the first columns near '%s'.",
-			operation, create_name, latest_foreign);
-		return;
-	case FK_IS_PREFIX_INDEX:
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. There is only prefix index in the referenced"
-			" table where the referenced columns appear"
-			" as the first columns near '%s'.\n",
-			operation, create_name, latest_foreign);
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. There is only prefix index in the referenced"
-			" table where the referenced columns appear"
-			" as the first columns near '%s'.",
-			operation, create_name, latest_foreign);
-		return;
-	case FK_COL_NOT_NULL:
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. You have defined a SET NULL condition but "
-			"column '%s' on index is defined as NOT NULL near '%s'.\n",
-			operation, create_name, columns[err_col], latest_foreign);
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. You have defined a SET NULL condition but "
-			"column '%s' on index is defined as NOT NULL near '%s'.",
-			operation, create_name, columns[err_col], latest_foreign);
-		return;
-	case FK_COLS_NOT_EQUAL:
-		dict_field_t*	field;
-		const char*	col_name;
-		field = dict_index_get_nth_field(err_index, err_col);
-
-		col_name = field->col->is_virtual()
-			? "(null)"
-			: dict_table_get_col_name(
-				table, dict_col_get_no(field->col));
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. Field type or character set for column '%s' "
-			"does not mach referenced column '%s' near '%s'.\n",
-			operation, create_name, columns[err_col], col_name, latest_foreign);
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Field type or character set for column '%s' "
-			"does not mach referenced column '%s' near '%s'.",
-			operation, create_name, columns[err_col], col_name, latest_foreign);
-		return;
-	}
-	DBUG_ASSERT(!"unknown error");
-}
-
-/*********************************************************************//**
-Scans a table create SQL string and adds to the data dictionary the foreign key
-constraints declared in the string. This function should be called after the
-indexes for a table have been created. Each foreign key constraint must be
-accompanied with indexes in bot participating tables. The indexes are allowed
-to contain more fields than mentioned in the constraint.
-@return error code or DB_SUCCESS */
-static
-dberr_t
-dict_create_foreign_constraints_low(
-	trx_t*			trx,
-	mem_heap_t*		heap,
-	CHARSET_INFO*		cs,
-	const char*		sql_string,
-	const char*		name,
-	ibool			reject_fks)
-{
-	dict_table_t*	table			= NULL;
-	dict_table_t*	referenced_table	= NULL;
-	dict_table_t*	table_to_alter		= NULL;
-	dict_table_t*	table_to_create		= NULL;
-	ulint		highest_id_so_far	= 0;
-	ulint		number			= 1;
-	dict_index_t*	index			= NULL;
-	dict_foreign_t*	foreign			= NULL;
-	const char*	ptr			= sql_string;
-	const char*	start_of_latest_foreign	= sql_string;
-	const char*	start_of_latest_set     = NULL;
-	FILE*		ef			= dict_foreign_err_file;
-	fkerr_t		index_error		= FK_SUCCESS;
-	dict_index_t*	err_index		= NULL;
-	ulint		err_col;
-	const char*	constraint_name;
-	ibool		success;
-	dberr_t		error;
-	const char*	ptr1;
-	const char*	ptr2;
-	ulint		i;
-	ulint		j;
-	ibool		is_on_delete;
-	ulint		n_on_deletes;
-	ulint		n_on_updates;
-	const dict_col_t*columns[500];
-	const char*	column_names[500];
-	const char*	ref_column_names[500];
-	const char*	referenced_table_name;
-	dict_foreign_set	local_fk_set;
-	dict_foreign_set_free	local_fk_set_free(local_fk_set);
-	const char*	create_table_name;
-	const char*	orig;
-	char	create_name[MAX_TABLE_NAME_LEN + 1];
-
-	ut_ad(!srv_read_only_mode);
-	ut_ad(mutex_own(&dict_sys.mutex));
-
-	table = dict_table_get_low(name);
-	/* First check if we are actually doing an ALTER TABLE, and in that
-	case look for the table being altered */
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "ALTER", &success);
-
-	const char* const operation = success ? "Alter " : "Create ";
-
-	if (!success) {
-		orig = ptr;
-		ptr = dict_scan_to(ptr, "CREATE");
-		ptr = dict_scan_to(ptr, "TABLE");
-		ptr = dict_accept(cs, ptr, "TABLE", &success);
-		create_table_name = NULL;
-
-		if (success) {
-			ptr = dict_scan_table_name(cs, ptr, &table_to_create, name,
-						   &success, heap, &create_table_name);
-		}
-
-		ptr = orig;
-		const char* n = create_table_name ? create_table_name : name;
-		char *bufend = innobase_convert_name(create_name, MAX_TABLE_NAME_LEN,
-						     n, strlen(n), trx->mysql_thd);
-		create_name[bufend-create_name] = '\0';
-	} else {
-		strncpy(create_name, name, sizeof create_name);
-		create_name[(sizeof create_name) - 1] = '\0';
-	}
-
-	if (table == NULL) {
-		mutex_enter(&dict_foreign_err_mutex);
-		dict_foreign_error_report_low(ef, create_name);
-		fprintf(ef, "%s table %s with foreign key constraint"
-			" failed. Table %s not found from data dictionary."
-			" Error close to %s.\n",
-			operation, create_name, create_name, start_of_latest_foreign);
-		mutex_exit(&dict_foreign_err_mutex);
-		ib_push_warning(trx, DB_ERROR,
-			"%s table %s with foreign key constraint"
-			" failed. Table %s not found from data dictionary."
-			" Error close to %s.",
-			operation, create_name, create_name, start_of_latest_foreign);
-
-		return(DB_ERROR);
-	}
-
-	/* If not alter table jump to loop */
-	if (!success) {
-
-		goto loop;
-	}
-
-	orig = ptr;
-	for (;;) {
-		ptr = dict_accept(cs, ptr, "TABLE", &success);
-		if (success) {
-			break;
-		}
-		ptr = dict_accept(cs, ptr, "ONLINE", &success);
-		if (success) {
-			continue;
-		}
-		ptr = dict_accept(cs, ptr, "IGNORE", &success);
-		if (!success) {
-			goto loop;
-		}
-	}
-
-	/* We are doing an ALTER TABLE: scan the table name we are altering */
-
-	orig = ptr;
-	ptr = dict_scan_table_name(cs, ptr, &table_to_alter, name,
-				   &success, heap, &referenced_table_name);
-
-	{
-		const char* n = table_to_alter
-			? table_to_alter->name.m_name : referenced_table_name;
-		char* bufend = innobase_convert_name(
-			create_name, MAX_TABLE_NAME_LEN, n, strlen(n),
-			trx->mysql_thd);
-		create_name[bufend-create_name]='\0';
-	}
-
-	if (!success) {
-		ib::error() << "Could not find the table " << create_name << " being" << operation << " near to "
-			<< orig;
-
-		ib_push_warning(trx, DB_ERROR,
-			"%s table %s with foreign key constraint"
-			" failed. Table %s not found from data dictionary."
-			" Error close to %s.",
-			operation, create_name, create_name, orig);
-
-		return(DB_ERROR);
-	}
-
-	/* Starting from 4.0.18 and 4.1.2, we generate foreign key id's in the
-	format databasename/tablename_ibfk_[number], where [number] is local
-	to the table; look for the highest [number] for table_to_alter, so
-	that we can assign to new constraints higher numbers. */
-
-	/* If we are altering a temporary table, the table name after ALTER
-	TABLE does not correspond to the internal table name, and
-	table_to_alter is NULL. TODO: should we fix this somehow? */
-
-	if (table_to_alter == NULL) {
-		highest_id_so_far = 0;
-	} else {
-		highest_id_so_far = dict_table_get_highest_foreign_id(
-			table_to_alter);
-	}
-
-	number = highest_id_so_far + 1;
-	/* Scan for foreign key declarations in a loop */
-loop:
-	/* Scan either to "CONSTRAINT" or "FOREIGN", whichever is closer */
-
-	ptr1 = dict_scan_to(ptr, "CONSTRAINT");
-	ptr2 = dict_scan_to(ptr, "FOREIGN");
-
-	constraint_name = NULL;
-
-	if (ptr1 < ptr2) {
-		/* The user may have specified a constraint name. Pick it so
-		that we can store 'databasename/constraintname' as the id of
-		of the constraint to system tables. */
-		ptr = ptr1;
-
-		orig = ptr;
-		ptr = dict_accept(cs, ptr, "CONSTRAINT", &success);
-
-		ut_a(success);
-
-		if (!my_isspace(cs, *ptr) && *ptr != '"' && *ptr != '`') {
-			goto loop;
-		}
-
-		while (my_isspace(cs, *ptr)) {
-			ptr++;
-		}
-
-		/* read constraint name unless got "CONSTRAINT FOREIGN" */
-		if (ptr != ptr2) {
-			ptr = dict_scan_id(cs, ptr, heap,
-					   &constraint_name, FALSE, FALSE);
-		}
-	} else {
-		ptr = ptr2;
-	}
-
-	if (*ptr == '\0') {
-		/* The proper way to reject foreign keys for temporary
-		tables would be to split the lexing and syntactical
-		analysis of foreign key clauses from the actual adding
-		of them, so that ha_innodb.cc could first parse the SQL
-		command, determine if there are any foreign keys, and
-		if so, immediately reject the command if the table is a
-		temporary one. For now, this kludge will work. */
-		if (reject_fks && !local_fk_set.empty()) {
-			mutex_enter(&dict_foreign_err_mutex);
-			dict_foreign_error_report_low(ef, create_name);
-			fprintf(ef, "%s table %s with foreign key constraint"
-				" failed. Temporary tables can't have foreign key constraints."
-				" Error close to %s.\n",
-				operation, create_name, start_of_latest_foreign);
-			mutex_exit(&dict_foreign_err_mutex);
-
-			ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-				"%s table %s with foreign key constraint"
-				" failed. Temporary tables can't have foreign key constraints."
-				" Error close to %s.",
-				operation, create_name, start_of_latest_foreign);
-
-			return(DB_CANNOT_ADD_CONSTRAINT);
-		}
-
-		if (dict_foreigns_has_s_base_col(local_fk_set, table)) {
-			return(DB_NO_FK_ON_S_BASE_COL);
-		}
-
-		/**********************************************************/
-		/* The following call adds the foreign key constraints
-		to the data dictionary system tables on disk */
-		trx->op_info = "adding foreign keys";
-
-		trx_start_if_not_started_xa(trx, true);
-
-		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-
-		error = dict_create_add_foreigns_to_dictionary(
-			local_fk_set, table, trx);
-
-		if (error == DB_SUCCESS) {
-
-			table->foreign_set.insert(local_fk_set.begin(),
-						  local_fk_set.end());
-			std::for_each(local_fk_set.begin(),
-				      local_fk_set.end(),
-				      dict_foreign_add_to_referenced_table());
-			local_fk_set.clear();
-
-			dict_mem_table_fill_foreign_vcol_set(table);
-		}
-		return(error);
-	}
-
-	start_of_latest_foreign = ptr;
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "FOREIGN", &success);
-
-	if (!success) {
-		goto loop;
-	}
-
-	if (!my_isspace(cs, *ptr)) {
-		goto loop;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "KEY", &success);
-
-	if (!success) {
-		goto loop;
-	}
-
-	if (my_isspace(cs, *ptr)) {
-		ptr1 = dict_accept(cs, ptr, "IF", &success);
-
-		if (success) {
-			if (!my_isspace(cs, *ptr1)) {
-				goto loop;
-			}
-			ptr1 = dict_accept(cs, ptr1, "NOT", &success);
-			if (!success) {
-				goto loop;
-			}
-			ptr1 = dict_accept(cs, ptr1, "EXISTS", &success);
-			if (!success) {
-				goto loop;
-			}
-			ptr = ptr1;
-		}
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "(", &success);
-
-	if (!success) {
-		if (constraint_name) {
-			/* MySQL allows also an index id before the '('; we
-			skip it */
-			ptr = dict_skip_word(cs, ptr, &success);
-			if (!success) {
-				dict_foreign_report_syntax_err(
-					"%s table %s with foreign key constraint"
-					" failed. Parse error in '%s'"
-					" near '%s'.\n",
-					operation, create_name, start_of_latest_foreign, orig);
-
-				ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-					"%s table %s with foreign key constraint"
-					" failed. Parse error in '%s'"
-					" near '%s'.",
-					operation, create_name, start_of_latest_foreign, orig);
-				return(DB_CANNOT_ADD_CONSTRAINT);
-			}
-		} else {
-			while (my_isspace(cs, *ptr)) {
-				ptr++;
-			}
-
-			ptr = dict_scan_id(cs, ptr, heap,
-				     &constraint_name, FALSE, FALSE);
-		}
-
-		ptr = dict_accept(cs, ptr, "(", &success);
-
-		if (!success) {
-			/* We do not flag a syntax error here because in an
-			ALTER TABLE we may also have DROP FOREIGN KEY abc */
-
-			goto loop;
-		}
-	}
-
-	i = 0;
-
-	/* Scan the columns in the first list */
-col_loop1:
-	ut_a(i < (sizeof column_names) / sizeof *column_names);
-	orig = ptr;
-	ptr = dict_scan_col(cs, ptr, &success, table, columns + i,
-			    heap, column_names + i);
-	if (!success) {
-		mutex_enter(&dict_foreign_err_mutex);
-		dict_foreign_error_report_low(ef, create_name);
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		mutex_exit(&dict_foreign_err_mutex);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	i++;
-
-	ptr = dict_accept(cs, ptr, ",", &success);
-
-	if (success) {
-		goto col_loop1;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, ")", &success);
-
-	if (!success) {
-		dict_foreign_report_syntax_err(
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	/* Try to find an index which contains the columns
-	as the first fields and in the right order. There is
-	no need to check column type match (on types_idx), since
-	the referenced table can be NULL if foreign_key_checks is
-	set to 0 */
-
-	index = dict_foreign_find_index(
-		table, NULL, column_names, i,
-		NULL, TRUE, FALSE, &index_error, &err_col, &err_index);
-
-	if (!index) {
-		mutex_enter(&dict_foreign_err_mutex);
-		dict_foreign_error_report_low(ef, create_name);
-		fputs("There is no index in table ", ef);
-		ut_print_name(ef, NULL, create_name);
-		fprintf(ef, " where the columns appear\n"
-			"as the first columns. Constraint:\n%s\n%s",
-			start_of_latest_foreign,
-			FOREIGN_KEY_CONSTRAINTS_MSG);
-		dict_foreign_push_index_error(trx, operation, create_name, start_of_latest_foreign,
-			column_names, index_error, err_col, err_index, table, ef);
-
-		mutex_exit(&dict_foreign_err_mutex);
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "REFERENCES", &success);
-
-	if (!success || !my_isspace(cs, *ptr)) {
-		dict_foreign_report_syntax_err(
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, orig);
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	/* Don't allow foreign keys on partitioned tables yet. */
-	ptr1 = dict_scan_to(ptr, "PARTITION");
-	if (ptr1) {
-		ptr1 = dict_accept(cs, ptr1, "PARTITION", &success);
-		if (success && my_isspace(cs, *ptr1)) {
-			ptr2 = dict_accept(cs, ptr1, "BY", &success);
-			if (success) {
-				my_error(ER_FOREIGN_KEY_ON_PARTITIONED,MYF(0));
-				return(DB_CANNOT_ADD_CONSTRAINT);
-			}
-		}
-	}
-	if (dict_table_is_partition(table)) {
-		my_error(ER_FOREIGN_KEY_ON_PARTITIONED,MYF(0));
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	/* Let us create a constraint struct */
-
-	foreign = dict_mem_foreign_create();
-
-	if (constraint_name) {
-		ulint	db_len;
-
-		/* Catenate 'databasename/' to the constraint name specified
-		by the user: we conceive the constraint as belonging to the
-		same MySQL 'database' as the table itself. We store the name
-		to foreign->id. */
-
-		db_len = dict_get_db_name_len(table->name.m_name);
-
-		foreign->id = static_cast<char*>(mem_heap_alloc(
-			foreign->heap, db_len + strlen(constraint_name) + 2));
-
-		ut_memcpy(foreign->id, table->name.m_name, db_len);
-		foreign->id[db_len] = '/';
-		strcpy(foreign->id + db_len + 1, constraint_name);
-	}
-
-	if (foreign->id == NULL) {
-		error = dict_create_add_foreign_id(
-			&number, table->name.m_name, foreign);
-		if (error != DB_SUCCESS) {
-			dict_foreign_free(foreign);
-			return(error);
-		}
-	}
-
-	std::pair<dict_foreign_set::iterator, bool>	ret
-		= local_fk_set.insert(foreign);
-
-	if (!ret.second) {
-		/* A duplicate foreign key name has been found */
-		dict_foreign_free(foreign);
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	foreign->foreign_table = table;
-	foreign->foreign_table_name = mem_heap_strdup(
-		foreign->heap, table->name.m_name);
-	dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
-
-	foreign->foreign_index = index;
-	foreign->n_fields = (unsigned int) i;
-
-	foreign->foreign_col_names = static_cast<const char**>(
-		mem_heap_alloc(foreign->heap, i * sizeof(void*)));
-
-	for (i = 0; i < foreign->n_fields; i++) {
-		foreign->foreign_col_names[i] = mem_heap_strdup(
-                        foreign->heap, column_names[i]);
-	}
-
-	ptr = dict_scan_table_name(cs, ptr, &referenced_table, name,
-				   &success, heap, &referenced_table_name);
-
-	/* Note that referenced_table can be NULL if the user has suppressed
-	checking of foreign key constraints! */
-
-	if (!success || (!referenced_table && trx->check_foreigns)) {
-		char	buf[MAX_TABLE_NAME_LEN + 1] = "";
-		char*	bufend;
-
-		bufend = innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
-				referenced_table_name, strlen(referenced_table_name),
-				trx->mysql_thd);
-		buf[bufend - buf] = '\0';
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint failed. Referenced table %s not found in the data dictionary "
-			"near '%s'.",
-			operation, create_name, buf, start_of_latest_foreign);
-		mutex_enter(&dict_foreign_err_mutex);
-		dict_foreign_error_report_low(ef, create_name);
-		fprintf(ef,
-			"%s table %s with foreign key constraint failed. Referenced table %s not found in the data dictionary "
-			"near '%s'.\n",
-			operation, create_name, buf, start_of_latest_foreign);
-
-		mutex_exit(&dict_foreign_err_mutex);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	/* Don't allow foreign keys on partitioned tables yet. */
-	if (referenced_table && dict_table_is_partition(referenced_table)) {
-		/* How could one make a referenced table to be a partition? */
-		ut_ad(0);
-		my_error(ER_FOREIGN_KEY_ON_PARTITIONED,MYF(0));
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	ptr = dict_accept(cs, ptr, "(", &success);
-
-	if (!success) {
-		dict_foreign_report_syntax_err(
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	/* Scan the columns in the second list */
-	i = 0;
-
-col_loop2:
-	orig = ptr;
-	ptr = dict_scan_col(cs, ptr, &success, referenced_table, columns + i,
-			    heap, ref_column_names + i);
-	i++;
-
-	if (!success) {
-
-		mutex_enter(&dict_foreign_err_mutex);
-		dict_foreign_error_report_low(ef, create_name);
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, orig);
-		mutex_exit(&dict_foreign_err_mutex);
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, ",", &success);
-
-	if (success) {
-		goto col_loop2;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, ")", &success);
-
-	if (!success || foreign->n_fields != i) {
-
-		dict_foreign_report_syntax_err(
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s' near '%s'.  Referencing column count does not match referenced column count.\n",
-			operation, create_name, start_of_latest_foreign, orig);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s' near '%s'.  Referencing column count %d does not match referenced column count %d.\n",
-			operation, create_name, start_of_latest_foreign, orig, i, foreign->n_fields);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	n_on_deletes = 0;
-	n_on_updates = 0;
-
-scan_on_conditions:
-	/* Loop here as long as we can find ON ... conditions */
-
-	start_of_latest_set = ptr;
-	ptr = dict_accept(cs, ptr, "ON", &success);
-
-	if (!success) {
-
-		goto try_find_index;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "DELETE", &success);
-
-	if (!success) {
-		orig = ptr;
-		ptr = dict_accept(cs, ptr, "UPDATE", &success);
-
-		if (!success) {
-
-			dict_foreign_report_syntax_err(
-				"%s table %s with foreign key constraint"
-				" failed. Parse error in '%s'"
-				" near '%s'.\n",
-				operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-			ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-				"%s table %s with foreign key constraint"
-				" failed. Parse error in '%s'"
-				" near '%s'.",
-				operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-			return(DB_CANNOT_ADD_CONSTRAINT);
-		}
-
-		is_on_delete = FALSE;
-		n_on_updates++;
-	} else {
-		is_on_delete = TRUE;
-		n_on_deletes++;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "RESTRICT", &success);
-
-	if (success) {
-		goto scan_on_conditions;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "CASCADE", &success);
-
-	if (success) {
-		if (is_on_delete) {
-			foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE;
-		} else {
-			foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
-		}
-
-		goto scan_on_conditions;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "NO", &success);
-
-	if (success) {
-		orig = ptr;
-		ptr = dict_accept(cs, ptr, "ACTION", &success);
-
-		if (!success) {
-			dict_foreign_report_syntax_err(
-				"%s table %s with foreign key constraint"
-				" failed. Parse error in '%s'"
-				" near '%s'.\n",
-				operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-			ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-				"%s table %s with foreign key constraint"
-				" failed. Parse error in '%s'"
-				" near '%s'.",
-				operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-			return(DB_CANNOT_ADD_CONSTRAINT);
-		}
-
-		if (is_on_delete) {
-			foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION;
-		} else {
-			foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
-		}
-
-		goto scan_on_conditions;
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "SET", &success);
-
-	if (!success) {
-		dict_foreign_report_syntax_err(
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, start_of_latest_set);
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	orig = ptr;
-	ptr = dict_accept(cs, ptr, "NULL", &success);
-
-	if (!success) {
-		dict_foreign_report_syntax_err(
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. Parse error in '%s'"
-			" near '%s'.",
-			operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	for (j = 0; j < foreign->n_fields; j++) {
-		if ((dict_index_get_nth_col(foreign->foreign_index, j)->prtype)
-		    & DATA_NOT_NULL) {
-			const dict_col_t*	col
-				= dict_index_get_nth_col(foreign->foreign_index, j);
-			const char* col_name = dict_table_get_col_name(foreign->foreign_index->table,
-				dict_col_get_no(col));
-
-			/* It is not sensible to define SET NULL
-			if the column is not allowed to be NULL! */
-
-			mutex_enter(&dict_foreign_err_mutex);
-			dict_foreign_error_report_low(ef, create_name);
-			fprintf(ef,
-				"%s table %s with foreign key constraint"
-				" failed. You have defined a SET NULL condition but column '%s' is defined as NOT NULL"
-				" in '%s' near '%s'.\n",
-				operation, create_name, col_name, start_of_latest_foreign, start_of_latest_set);
-			mutex_exit(&dict_foreign_err_mutex);
-
-			ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-				"%s table %s with foreign key constraint"
-				" failed. You have defined a SET NULL condition but column '%s' is defined as NOT NULL"
-				" in '%s' near '%s'.",
-				operation, create_name, col_name, start_of_latest_foreign, start_of_latest_set);
-
-			return(DB_CANNOT_ADD_CONSTRAINT);
-		}
-	}
-
-	if (is_on_delete) {
-		foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL;
-	} else {
-		foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
-	}
-
-	goto scan_on_conditions;
-
-try_find_index:
-	if (n_on_deletes > 1 || n_on_updates > 1) {
-		/* It is an error to define more than 1 action */
-
-		mutex_enter(&dict_foreign_err_mutex);
-		dict_foreign_error_report_low(ef, create_name);
-		fprintf(ef,
-			"%s table %s with foreign key constraint"
-			" failed. You have more than one on delete or on update clause"
-			" in '%s' near '%s'.\n",
-			operation, create_name, start_of_latest_foreign, start_of_latest_set);
-		mutex_exit(&dict_foreign_err_mutex);
-
-		ib_push_warning(trx, DB_CANNOT_ADD_CONSTRAINT,
-			"%s table %s with foreign key constraint"
-			" failed. You have more than one on delete or on update clause"
-			" in '%s' near '%s'.",
-			operation, create_name, start_of_latest_foreign, start_of_latest_set);
-
-		dict_foreign_free(foreign);
-
-		return(DB_CANNOT_ADD_CONSTRAINT);
-	}
-
-	/* Try to find an index which contains the columns as the first fields
-	and in the right order, and the types are the same as in
-	foreign->foreign_index */
-
-	if (referenced_table) {
-		index = dict_foreign_find_index(referenced_table, NULL,
-						ref_column_names, i,
-						foreign->foreign_index,
-			TRUE, FALSE, &index_error, &err_col, &err_index);
-
-		if (!index) {
-			mutex_enter(&dict_foreign_err_mutex);
-			dict_foreign_error_report_low(ef, create_name);
-			fprintf(ef, "%s:\n"
-				"Cannot find an index in the"
-				" referenced table where the\n"
-				"referenced columns appear as the"
-				" first columns, or column types\n"
-				"in the table and the referenced table"
-				" do not match for constraint.\n"
-				"Note that the internal storage type of"
-				" ENUM and SET changed in\n"
-				"tables created with >= InnoDB-4.1.12,"
-				" and such columns in old tables\n"
-				"cannot be referenced by such columns"
-				" in new tables.\n%s\n",
-				start_of_latest_foreign,
-				FOREIGN_KEY_CONSTRAINTS_MSG);
-
-			dict_foreign_push_index_error(trx, operation, create_name, start_of_latest_foreign,
-				column_names, index_error, err_col, err_index, referenced_table, ef);
-
-			mutex_exit(&dict_foreign_err_mutex);
-
-			return(DB_CANNOT_ADD_CONSTRAINT);
-		}
-	} else {
-		ut_a(trx->check_foreigns == FALSE);
-		index = NULL;
-	}
-
-	foreign->referenced_index = index;
-	foreign->referenced_table = referenced_table;
-
-	foreign->referenced_table_name = mem_heap_strdup(
-		foreign->heap, referenced_table_name);
-	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
-
-	foreign->referenced_col_names = static_cast<const char**>(
-		mem_heap_alloc(foreign->heap, i * sizeof(void*)));
-
-	for (i = 0; i < foreign->n_fields; i++) {
-		foreign->referenced_col_names[i]
-			= mem_heap_strdup(foreign->heap, ref_column_names[i]);
-	}
-
-	goto loop;
-}
-
-/** Scans a table create SQL string and adds to the data dictionary
-the foreign key constraints declared in the string. This function
-should be called after the indexes for a table have been created.
-Each foreign key constraint must be accompanied with indexes in
-bot participating tables. The indexes are allowed to contain more
-fields than mentioned in the constraint.
-
-@param[in]	trx		transaction
-@param[in]	sql_string	table create statement where
-				foreign keys are declared like:
-				FOREIGN KEY (a, b) REFERENCES table2(c, d),
-				table2 can be written also with the database
-				name before it: test.table2; the default
-				database id the database of parameter name
-@param[in]	sql_length	length of sql_string
-@param[in]	name		table full name in normalized form
-@param[in]	reject_fks	if TRUE, fail with error code
-				DB_CANNOT_ADD_CONSTRAINT if any
-				foreign keys are found.
-@return error code or DB_SUCCESS */
-dberr_t
-dict_create_foreign_constraints(
-	trx_t*			trx,
-	const char*		sql_string,
-	size_t			sql_length,
-	const char*		name,
-	ibool			reject_fks)
-{
-	char*		str;
-	dberr_t		err;
-	mem_heap_t*	heap;
-
-	ut_a(trx);
-	ut_a(trx->mysql_thd);
-
-	str = dict_strip_comments(sql_string, sql_length);
-	heap = mem_heap_create(10000);
-
-	err = dict_create_foreign_constraints_low(
-		trx, heap, thd_charset(trx->mysql_thd),
-		str, name, reject_fks);
-
-	mem_heap_free(heap);
-	ut_free(str);
-
-	return(err);
-}
-
 /**********************************************************************//**
 Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
 @return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
@@ -4759,7 +3773,7 @@ dict_index_get_if_in_cache_low(
 	return(dict_index_find_on_id_low(index_id));
 }
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+#ifdef UNIV_DEBUG
 /**********************************************************************//**
 Returns an index object if it is found in the dictionary cache.
 @return index, NULL if not found */
@@ -4782,9 +3796,7 @@ dict_index_get_if_in_cache(
 
 	return(index);
 }
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
-#ifdef UNIV_DEBUG
 /**********************************************************************//**
 Checks that a tuple has n_fields_cmp value in a sensible range, so that
 no comparison can occur with the page number field in a node pointer.
@@ -5292,7 +4304,7 @@ dict_set_corrupted(
 		if (len != 4) {
 			goto fail;
 		}
-		mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
+		mtr.write<4>(*btr_cur_get_block(&cursor), field, index->type);
 		status = "Flagged";
 	} else {
 fail:
@@ -5393,11 +4405,8 @@ dict_index_set_merge_threshold(
 			DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len);
 
 		ut_ad(len == 4);
-
-		if (len == 4) {
-			mlog_write_ulint(field, merge_threshold,
-					 MLOG_4BYTES, &mtr);
-		}
+		mtr.write<4,mtr_t::MAYBE_NOP>(*btr_cur_get_block(&cursor),
+					      field, merge_threshold);
 	}
 
 	mtr_commit(&mtr);
@@ -5422,7 +4431,8 @@ dict_set_merge_threshold_list_debug(
 		     index != NULL;
 		     index = UT_LIST_GET_NEXT(indexes, index)) {
 			rw_lock_x_lock(dict_index_get_lock(index));
-			index->merge_threshold = merge_threshold_all;
+			index->merge_threshold = merge_threshold_all
+				& ((1U << 6) - 1);
 			rw_lock_x_unlock(dict_index_get_lock(index));
 		}
 	}
@@ -5446,34 +4456,6 @@ dict_set_merge_threshold_all_debug(
 
 #endif /* UNIV_DEBUG */
 
-/** Initialize dict_ind_redundant. */
-void
-dict_ind_init()
-{
-	dict_table_t*		table;
-
-	/* create dummy table and index for REDUNDANT infimum and supremum */
-	table = dict_mem_table_create("SYS_DUMMY1", NULL, 1, 0, 0, 0);
-	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
-			       DATA_ENGLISH | DATA_NOT_NULL, 8);
-
-	dict_ind_redundant = dict_mem_index_create(table, "SYS_DUMMY1", 0, 1);
-	dict_index_add_col(dict_ind_redundant, table,
-			   dict_table_get_nth_col(table, 0), 0);
-	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
-	dict_ind_redundant->cached = TRUE;
-}
-
-/** Free dict_ind_redundant. */
-void
-dict_ind_free()
-{
-	dict_table_t*	table = dict_ind_redundant->table;
-	dict_mem_index_free(dict_ind_redundant);
-	dict_ind_redundant = NULL;
-	dict_mem_table_free(table);
-}
-
 /** Get an index by name.
 @param[in]	table		the table where to look for the index
 @param[in]	name		the index name to look for
@@ -5916,38 +4898,39 @@ void dict_sys_t::resize()
   mutex_enter(&mutex);
 
   /* all table entries are in table_LRU and table_non_LRU lists */
-  hash_table_free(table_hash);
-  hash_table_free(table_id_hash);
-  hash_table_free(temp_id_hash);
+  table_hash.free();
+  table_id_hash.free();
+  temp_id_hash.free();
 
   const ulint hash_size = buf_pool_get_curr_size()
     / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
-  table_hash = hash_create(hash_size);
-  table_id_hash = hash_create(hash_size);
-  temp_id_hash = hash_create(hash_size);
+  table_hash.create(hash_size);
+  table_id_hash.create(hash_size);
+  temp_id_hash.create(hash_size);
 
-  for (dict_table_t* table= UT_LIST_GET_FIRST(table_LRU); table;
+  for (dict_table_t *table= UT_LIST_GET_FIRST(table_LRU); table;
        table= UT_LIST_GET_NEXT(table_LRU, table))
   {
     ut_ad(!table->is_temporary());
     ulint fold= ut_fold_string(table->name.m_name);
     ulint id_fold= ut_fold_ull(table->id);
 
-    HASH_INSERT(dict_table_t, name_hash, table_hash, fold, table);
-    HASH_INSERT(dict_table_t, id_hash, table_id_hash, id_fold, table);
+    HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
+    HASH_INSERT(dict_table_t, id_hash, &table_id_hash, id_fold, table);
   }
 
-  for (dict_table_t* table = UT_LIST_GET_FIRST(table_non_LRU); table;
-       table = UT_LIST_GET_NEXT(table_LRU, table)) {
-	  ulint	fold = ut_fold_string(table->name.m_name);
-	  ulint	id_fold = ut_fold_ull(table->id);
+  for (dict_table_t *table = UT_LIST_GET_FIRST(table_non_LRU); table;
+       table= UT_LIST_GET_NEXT(table_LRU, table))
+  {
+    ulint fold= ut_fold_string(table->name.m_name);
+    ulint id_fold= ut_fold_ull(table->id);
 
-	  HASH_INSERT(dict_table_t, name_hash, table_hash, fold, table);
+    HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
 
-	  hash_table_t* id_hash = table->is_temporary()
-	    ? temp_id_hash : table_id_hash;
+    hash_table_t *id_hash= table->is_temporary()
+      ? &temp_id_hash : &table_id_hash;
 
-	  HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
+    HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
   }
 
   mutex_exit(&mutex);
@@ -5963,27 +4946,19 @@ void dict_sys_t::close()
 
   /* Free the hash elements. We don't remove them from the table
   because we are going to destroy the table anyway. */
-  for (ulint i = 0; i < hash_get_n_cells(table_hash); i++)
-  {
-    dict_table_t* table = static_cast<dict_table_t*>(HASH_GET_FIRST(table_hash,
-								    i));
+  for (ulint i= table_hash.n_cells; i--; )
+    while (dict_table_t *table= static_cast<dict_table_t*>
+           (HASH_GET_FIRST(&table_hash, i)))
+      dict_sys.remove(table);
 
-    while (table)
-    {
-      dict_table_t* prev_table = table;
-      table = static_cast<dict_table_t*>(HASH_GET_NEXT(name_hash, prev_table));
-      dict_sys.remove(prev_table);
-    }
-  }
-
-  hash_table_free(table_hash);
+  table_hash.free();
 
   /* table_id_hash contains the same elements as in table_hash,
   therefore we don't delete the individual elements. */
-  hash_table_free(table_id_hash);
+  table_id_hash.free();
 
   /* No temporary tables should exist at this point. */
-  hash_table_free(temp_id_hash);
+  temp_id_hash.free();
 
   mutex_exit(&mutex);
   mutex_free(&mutex);
@@ -5993,7 +4968,7 @@ void dict_sys_t::close()
 
   if (dict_foreign_err_file)
   {
-    fclose(dict_foreign_err_file);
+    my_fclose(dict_foreign_err_file, MYF(MY_WME));
     dict_foreign_err_file = NULL;
   }
 
@@ -6306,25 +5281,6 @@ dict_tf_to_row_format_string(
 	return(0);
 }
 
-/** Calculate the used memory occupied by the data dictionary
-table and index objects.
-@return number of bytes occupied. */
-UNIV_INTERN
-ulint
-dict_sys_get_size()
-{
-	/* No mutex; this is a very crude approximation anyway */
-	ulint size = UT_LIST_GET_LEN(dict_sys.table_LRU)
-		+ UT_LIST_GET_LEN(dict_sys.table_non_LRU);
-	size *= sizeof(dict_table_t)
-		+ sizeof(dict_index_t) * 2
-		+ (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
-		+ sizeof(dict_field_t) * 5 /* total number of key fields */
-		+ 200; /* arbitrary, covering names and overhead */
-
-	return size;
-}
-
 bool dict_table_t::is_stats_table() const
 {
   return !strcmp(name.m_name, TABLE_STATS_NAME) ||
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
index ab3aaa61970..a9b810b9aca 100644
--- a/storage/innobase/dict/dict0load.cc
+++ b/storage/innobase/dict/dict0load.cc
@@ -233,7 +233,7 @@ dict_get_first_table_name_in_db(
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	dfield_set_data(dfield, name, ut_strlen(name));
+	dfield_set_data(dfield, name, strlen(name));
 	dict_index_copy_types(tuple, sys_index, 1);
 
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
@@ -255,7 +255,7 @@ loop:
 		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	if (len < strlen(name)
-	    || ut_memcmp(name, field, strlen(name)) != 0) {
+	    || memcmp(name, field, strlen(name))) {
 		/* Not found */
 
 		btr_pcur_close(&pcur);
@@ -387,7 +387,7 @@ dict_process_sys_tables_rec_and_mtr_commit(
 
 	ut_a(!rec_get_deleted_flag(rec, 0));
 
-	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_S_FIX));
 
 	/* Get the table name */
 	table_name_t table_name(mem_heap_strdupl(heap, field, len));
@@ -528,7 +528,6 @@ dict_process_sys_foreign_rec(
 {
 	ulint		len;
 	const byte*	field;
-	ulint		n_fields_and_type;
 
 	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_FOREIGN");
@@ -586,10 +585,10 @@ err_len:
 	if (len != 4) {
 		goto err_len;
 	}
-	n_fields_and_type = mach_read_from_4(field);
+	uint32_t n_fields_and_type = mach_read_from_4(field);
 
-	foreign->type = (unsigned int) (n_fields_and_type >> 24);
-	foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL);
+	foreign->type = n_fields_and_type >> 24 & ((1U << 6) - 1);
+	foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS;
 
 	return(NULL);
 }
@@ -672,18 +671,13 @@ dict_process_sys_tablespaces(
 /*=========================*/
 	mem_heap_t*	heap,		/*!< in/out: heap memory */
 	const rec_t*	rec,		/*!< in: current SYS_TABLESPACES rec */
-	ulint*		space,		/*!< out: space id */
+	uint32_t*	space,		/*!< out: tablespace identifier */
 	const char**	name,		/*!< out: tablespace name */
 	ulint*		flags)		/*!< out: tablespace flags */
 {
 	ulint		len;
 	const byte*	field;
 
-	/* Initialize the output values */
-	*space = ULINT_UNDEFINED;
-	*name = NULL;
-	*flags = ULINT_UNDEFINED;
-
 	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_TABLESPACES");
 	}
@@ -738,7 +732,7 @@ dict_process_sys_datafiles(
 /*=======================*/
 	mem_heap_t*	heap,		/*!< in/out: heap memory */
 	const rec_t*	rec,		/*!< in: current SYS_DATAFILES rec */
-	ulint*		space,		/*!< out: space id */
+	uint32_t*	space,		/*!< out: space id */
 	const char**	path)		/*!< out: datafile paths */
 {
 	ulint		len;
@@ -1487,7 +1481,8 @@ void dict_check_tablespaces_and_store_max_id()
 	/* Initialize the max space_id from sys header */
 	mtr.start();
 	ulint max_space_id = mach_read_from_4(DICT_HDR_MAX_SPACE_ID
-					      + dict_hdr_get(&mtr));
+					      + DICT_HDR
+					      + dict_hdr_get(&mtr)->frame);
 	mtr.commit();
 
 	fil_set_max_space_id_if_bigger(max_space_id);
@@ -1847,7 +1842,6 @@ dict_load_columns(
 			the flag is set before the table is created. */
 			if (table->fts == NULL) {
 				table->fts = fts_create(table);
-				fts_optimize_add_table(table);
 			}
 
 			ut_a(table->fts->doc_col == ULINT_UNDEFINED);
@@ -2103,7 +2097,7 @@ err_len:
 
 		sys_field->name = mem_heap_strdupl(
 			heap, (const char*) field, len);
-		sys_field->prefix_len = prefix_len;
+		sys_field->prefix_len = prefix_len & ((1U << 12) - 1);
 		*pos = position;
 	}
 
@@ -2332,7 +2326,7 @@ err_len:
 	(*index)->id = id;
 	(*index)->page = mach_read_from_4(field);
 	ut_ad((*index)->page);
-	(*index)->merge_threshold = merge_threshold;
+	(*index)->merge_threshold = merge_threshold & ((1U << 6) - 1);
 
 	return(NULL);
 }
@@ -2710,7 +2704,8 @@ dict_get_and_save_data_dir_path(
 			dict_mutex_enter_for_mysql();
 		}
 
-		table->flags |= (1 << DICT_TF_POS_DATA_DIR);
+		table->flags |= 1 << DICT_TF_POS_DATA_DIR
+			& ((1U << DICT_TF_BITS) - 1);
 		dict_save_data_dir_path(table,
 					table->space->chain.start->name);
 
@@ -2720,7 +2715,8 @@ dict_get_and_save_data_dir_path(
 			or SYS_TABLES or FSP_SPACE_FLAGS on the header page
 			of the tablespace, but it makes dict_table_t
 			consistent. */
-			table->flags &= ~DICT_TF_MASK_DATA_DIR;
+			table->flags &= ~DICT_TF_MASK_DATA_DIR
+				& ((1U << DICT_TF_BITS) - 1);
 		}
 
 		if (!dict_mutex_own) {
@@ -2778,7 +2774,7 @@ dict_load_tablespace(
 {
 	ut_ad(!table->is_temporary());
 	ut_ad(!table->space);
-	ut_ad(table->space_id < SRV_LOG_SPACE_FIRST_ID);
+	ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND);
 	ut_ad(fil_system.sys_space);
 
 	if (table->space_id == TRX_SYS_SPACE) {
@@ -2905,7 +2901,7 @@ dict_load_table_one(
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	dfield_set_data(dfield, name.m_name, ut_strlen(name.m_name));
+	dfield_set_data(dfield, name.m_name, strlen(name.m_name));
 	dict_index_copy_types(tuple, sys_index, 1);
 
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
@@ -2927,8 +2923,8 @@ err_exit:
 		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	/* Check if the table name in record is the searched one */
-	if (len != ut_strlen(name.m_name)
-	    || 0 != ut_memcmp(name.m_name, field, len)) {
+	if (len != strlen(name.m_name)
+	    || memcmp(name.m_name, field, len)) {
 
 		goto err_exit;
 	}
@@ -2992,15 +2988,15 @@ err_exit:
 	}
 
 	if (err == DB_SUCCESS && table->is_readable()) {
-		if (table->space && !fil_space_get_size(table->space_id)) {
+		const auto root = dict_table_get_first_index(table)->page;
+
+		if (root >= table->space->get_size()) {
 corrupted:
 			table->corrupted = true;
 			table->file_unreadable = true;
 			err = DB_CORRUPTION;
 		} else {
-			const page_id_t page_id(
-				table->space->id,
-				dict_table_get_first_index(table)->page);
+			const page_id_t page_id(table->space->id, root);
 			mtr.start();
 			buf_block_t* block = buf_page_get(
 				page_id, table->space->zip_size(),
@@ -3090,7 +3086,6 @@ func_exit:
 			/* the table->fts could be created in dict_load_column
 			when a user defined FTS_DOC_ID is present, but no
 			FTS */
-			fts_optimize_remove_table(table);
 			fts_free(table);
 		} else if (fts_optimize_wq) {
 			fts_optimize_add_table(table);
@@ -3288,7 +3283,7 @@ dict_load_foreign_cols(
 		field = rec_get_nth_field_old(
 			rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
 
-		if (len != id_len || ut_memcmp(foreign->id, field, len) != 0) {
+		if (len != id_len || memcmp(foreign->id, field, len)) {
 			const rec_t*	pos;
 			ulint		pos_len;
 			const rec_t*	for_col_name;
@@ -3388,7 +3383,6 @@ dict_load_foreign(
 	const rec_t*	rec;
 	const byte*	field;
 	ulint		len;
-	ulint		n_fields_and_type;
 	mtr_t		mtr;
 	dict_table_t*	for_table;
 	dict_table_t*	ref_table;
@@ -3439,8 +3433,7 @@ dict_load_foreign(
 	field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len);
 
 	/* Check if the id in record is the searched one */
-	if (len != id_len || ut_memcmp(id, field, len) != 0) {
-
+	if (len != id_len || memcmp(id, field, len)) {
 		{
 			ib::error	err;
 			err << "Cannot load foreign constraint " << id
@@ -3463,7 +3456,7 @@ dict_load_foreign(
 
 	foreign = dict_mem_foreign_create();
 
-	n_fields_and_type = mach_read_from_4(
+	uint32_t n_fields_and_type = mach_read_from_4(
 		rec_get_nth_field_old(
 			rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len));
 
@@ -3471,8 +3464,8 @@ dict_load_foreign(
 
 	/* We store the type in the bits 24..29 of n_fields_and_type. */
 
-	foreign->type = (unsigned int) (n_fields_and_type >> 24);
-	foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL);
+	foreign->type = (n_fields_and_type >> 24) & ((1U << 6) - 1);
+	foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS;
 
 	foreign->id = mem_heap_strdupl(foreign->heap, id, id_len);
 
@@ -3599,7 +3592,7 @@ start_load:
 	tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1, 0);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	dfield_set_data(dfield, table_name, ut_strlen(table_name));
+	dfield_set_data(dfield, table_name, strlen(table_name));
 	dict_index_copy_types(tuple, sec_index, 1);
 
 	btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE,
@@ -3644,8 +3637,8 @@ loop:
 		goto next_rec;
 	}
 
-	if ((innobase_get_lower_case_table_names() != 2)
-	    && (0 != ut_memcmp(field, table_name, len))) {
+	if (innobase_get_lower_case_table_names() != 2
+	    && memcmp(field, table_name, len)) {
 		goto next_rec;
 	}
 
@@ -3705,81 +3698,3 @@ load_next_index:
 
 	DBUG_RETURN(DB_SUCCESS);
 }
-
-/***********************************************************************//**
-Loads a table id based on the index id.
-@return	true if found */
-static
-bool
-dict_load_table_id_on_index_id(
-/*===========================*/
-	index_id_t		index_id,  /*!< in: index id */
-	table_id_t*		table_id) /*!< out: table id */
-{
-	/* check hard coded indexes */
-	switch(index_id) {
-	case DICT_TABLES_ID:
-	case DICT_COLUMNS_ID:
-	case DICT_INDEXES_ID:
-	case DICT_FIELDS_ID:
-		*table_id = index_id;
-		return true;
-	case DICT_TABLE_IDS_ID:
-		/* The following is a secondary index on SYS_TABLES */
-		*table_id = DICT_TABLES_ID;
-		return true;
-	}
-
-	bool		found = false;
-	mtr_t		mtr;
-
-	ut_ad(mutex_own(&dict_sys.mutex));
-
-	/* NOTE that the operation of this function is protected by
-	the dictionary mutex, and therefore no deadlocks can occur
-	with other dictionary operations. */
-
-	mtr_start(&mtr);
-
-	btr_pcur_t pcur;
-	const rec_t* rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
-
-	while (rec) {
-		ulint len;
-		const byte* field = rec_get_nth_field_old(
-			rec, DICT_FLD__SYS_INDEXES__ID, &len);
-		ut_ad(len == 8);
-
-		/* Check if the index id is the one searched for */
-		if (index_id == mach_read_from_8(field)) {
-			found = true;
-			/* Now we get the table id */
-			const byte* field = rec_get_nth_field_old(
-				rec,
-				DICT_FLD__SYS_INDEXES__TABLE_ID,
-				&len);
-			*table_id = mach_read_from_8(field);
-			break;
-		}
-		mtr_commit(&mtr);
-		mtr_start(&mtr);
-		rec = dict_getnext_system(&pcur, &mtr);
-	}
-
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-
-	return(found);
-}
-
-dict_table_t* dict_table_open_on_index_id(index_id_t index_id)
-{
-	table_id_t table_id;
-	dict_table_t * table = NULL;
-	if (dict_load_table_id_on_index_id(index_id, &table_id)) {
-		table = dict_table_open_on_id(table_id, true,
-					      DICT_TABLE_OP_LOAD_TABLESPACE);
-	}
-
-	return table;
-}
diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc
index 0bdb086a5b6..c5b4ce80492 100644
--- a/storage/innobase/dict/dict0mem.cc
+++ b/storage/innobase/dict/dict0mem.cc
@@ -132,14 +132,9 @@ bool dict_col_t::same_encoding(uint16_t a, uint16_t b)
 @param flags    table flags
 @param flags2   table flags2
 @return own: table object */
-dict_table_t*
-dict_mem_table_create(
-	const char*	name,
-	fil_space_t*	space,
-	ulint		n_cols,
-	ulint		n_v_cols,
-	ulint		flags,
-	ulint		flags2)
+dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
+                                    ulint n_cols, ulint n_v_cols, ulint flags,
+                                    ulint flags2)
 {
 	dict_table_t*	table;
 	mem_heap_t*	heap;
@@ -168,15 +163,21 @@ dict_mem_table_create(
 
 	ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
 
-	table->flags = (unsigned int) flags;
-	table->flags2 = (unsigned int) flags2;
+	table->flags = static_cast<unsigned>(flags)
+		& ((1U << DICT_TF_BITS) - 1);
+	table->flags2 = static_cast<unsigned>(flags2)
+		& ((1U << DICT_TF2_BITS) - 1);
 	table->name.m_name = mem_strdup(name);
 	table->is_system_db = dict_mem_table_is_system(table->name.m_name);
 	table->space = space;
 	table->space_id = space ? space->id : ULINT_UNDEFINED;
-	table->n_t_cols = unsigned(n_cols + DATA_N_SYS_COLS);
-	table->n_v_cols = (unsigned int) (n_v_cols);
-	table->n_cols = unsigned(table->n_t_cols - table->n_v_cols);
+	table->n_t_cols = static_cast<unsigned>(n_cols + DATA_N_SYS_COLS)
+		& dict_index_t::MAX_N_FIELDS;
+	table->n_v_cols = static_cast<unsigned>(n_v_cols)
+		& dict_index_t::MAX_N_FIELDS;
+	table->n_cols = static_cast<unsigned>(
+		table->n_t_cols - table->n_v_cols)
+		& dict_index_t::MAX_N_FIELDS;
 
 	table->cols = static_cast<dict_col_t*>(
 		mem_heap_alloc(heap, table->n_cols * sizeof(dict_col_t)));
@@ -223,8 +224,6 @@ dict_mem_table_free(
 	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
 	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
 		if (table->fts) {
-			fts_optimize_remove_table(table);
-
 			fts_free(table);
 		}
 	}
@@ -309,7 +308,7 @@ dict_mem_table_add_col(
 	ulint		len)	/*!< in: precision */
 {
 	dict_col_t*	col;
-	ulint		i;
+	unsigned	i;
 
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 	ut_ad(!heap == !name);
@@ -343,11 +342,11 @@ dict_mem_table_add_col(
 	switch (prtype & DATA_VERSIONED) {
 	case DATA_VERS_START:
 		ut_ad(!table->vers_start);
-		table->vers_start = i;
+		table->vers_start = i & dict_index_t::MAX_N_FIELDS;
 		break;
 	case DATA_VERS_END:
 		ut_ad(!table->vers_end);
-		table->vers_end = i;
+		table->vers_end = i & dict_index_t::MAX_N_FIELDS;
 	}
 }
 
@@ -377,7 +376,6 @@ dict_mem_table_add_v_col(
 	ulint		num_base)
 {
 	dict_v_col_t*	v_col;
-	ulint		i;
 
 	ut_ad(table);
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
@@ -385,7 +383,7 @@ dict_mem_table_add_v_col(
 
 	ut_ad(prtype & DATA_VIRTUAL);
 
-	i = table->n_v_def++;
+	unsigned i = table->n_v_def++;
 
 	table->n_t_def++;
 
@@ -409,7 +407,7 @@ dict_mem_table_add_v_col(
 	v_col = &table->v_cols[i];
 
 	dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len);
-	v_col->v_pos = i;
+	v_col->v_pos = i & dict_index_t::MAX_N_FIELDS;
 
 	if (num_base != 0) {
 		v_col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
@@ -419,7 +417,8 @@ dict_mem_table_add_v_col(
 		v_col->base_col = NULL;
 	}
 
-	v_col->num_base = num_base;
+	v_col->num_base = static_cast<unsigned>(num_base)
+		& dict_index_t::MAX_N_FIELDS;
 
 	/* Initialize the index list for virtual columns */
 	ut_ad(v_col->v_indexes.empty());
@@ -743,18 +742,18 @@ dict_mem_fill_column_struct(
 	ulint		prtype,		/*!< in: precise type */
 	ulint		col_len)	/*!< in: column length */
 {
-	ulint	mbminlen;
-	ulint	mbmaxlen;
+	unsigned mbminlen, mbmaxlen;
 
-	column->ind = (unsigned int) col_pos;
+	column->ind = static_cast<unsigned>(col_pos)
+		& dict_index_t::MAX_N_FIELDS;
 	column->ord_part = 0;
 	column->max_prefix = 0;
-	column->mtype = (unsigned int) mtype;
-	column->prtype = (unsigned int) prtype;
-	column->len = (unsigned int) col_len;
+	column->mtype = static_cast<uint8_t>(mtype);
+	column->prtype = static_cast<unsigned>(prtype);
+	column->len = static_cast<uint16_t>(col_len);
 	dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);
-	column->mbminlen = mbminlen;
-	column->mbmaxlen = mbmaxlen;
+	column->mbminlen = mbminlen & 7;
+	column->mbmaxlen = mbmaxlen & 7;
 	column->def_val.data = NULL;
 	column->def_val.len = UNIV_SQL_DEFAULT;
 	ut_ad(!column->is_dropped());
@@ -1082,7 +1081,7 @@ dict_mem_index_add_field(
 	field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1);
 
 	field->name = name;
-	field->prefix_len = (unsigned int) prefix_len;
+	field->prefix_len = prefix_len & ((1U << 12) - 1);
 }
 
 /**********************************************************************//**
@@ -1219,8 +1218,10 @@ inline bool dict_index_t::reconstruct_fields()
 
 	const auto old_n_fields = n_fields;
 
-	n_fields += table->instant->n_dropped;
-	n_def += table->instant->n_dropped;
+	n_fields = (n_fields + table->instant->n_dropped)
+		& dict_index_t::MAX_N_FIELDS;
+	n_def = (n_def + table->instant->n_dropped)
+		& dict_index_t::MAX_N_FIELDS;
 
 	const unsigned n_first = first_user_field();
 
@@ -1239,7 +1240,8 @@ inline bool dict_index_t::reconstruct_fields()
 		if (c.is_dropped()) {
 			f.col = &table->instant->dropped[j++];
 			DBUG_ASSERT(f.col->is_dropped());
-			f.fixed_len = dict_col_get_fixed_size(f.col, comp);
+			f.fixed_len = dict_col_get_fixed_size(f.col, comp)
+				& ((1U << 10) - 1);
 		} else {
 			DBUG_ASSERT(!c.is_not_null());
 			const auto old = std::find_if(
@@ -1265,7 +1267,7 @@ inline bool dict_index_t::reconstruct_fields()
 	}
 
 	fields = tfields;
-	n_core_null_bytes = UT_BITS_IN_BYTES(n_core_null);
+	n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_core_null));
 
 	return false;
 }
@@ -1391,7 +1393,7 @@ dict_index_t::vers_history_row(
         } else {
 		ib::error() << "foreign constraints: secondary index is out of "
 			       "sync";
-		ut_ad(!"secondary index is out of sync");
+		ut_ad("secondary index is out of sync" == 0);
 		error = true;
 	}
 	mtr.commit();
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index c5ff0c56951..42f75252cee 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -319,7 +319,7 @@ dict_stats_exec_sql(
 	} else {
 		trx->op_info = "rollback of internal trx on stats tables";
 		trx->dict_operation_lock_mode = RW_X_LATCH;
-		trx_rollback_to_savepoint(trx, NULL);
+		trx->rollback();
 		trx->dict_operation_lock_mode = 0;
 		trx->op_info = "";
 		ut_a(trx->error_state == DB_SUCCESS);
@@ -1033,8 +1033,7 @@ dict_stats_analyze_index_level(
 	DEBUG_PRINTF("    %s(table=%s, index=%s, level=" ULINTPF ")\n",
 		     __func__, index->table->name, index->name, level);
 
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
-				MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
 
 	n_uniq = dict_index_get_n_unique(index);
 
@@ -1497,6 +1496,7 @@ dict_stats_analyze_index_below_cur(
 	rec_offs_set_n_alloc(offsets2, size);
 
 	rec = btr_cur_get_rec(cur);
+	page = page_align(rec);
 	ut_ad(!page_rec_is_leaf(rec));
 
 	offsets_rec = rec_get_offsets(rec, index, offsets1, 0,
@@ -1518,9 +1518,11 @@ dict_stats_analyze_index_below_cur(
 
 		dberr_t err = DB_SUCCESS;
 
-		block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
-					 NULL /* no guessed block */,
-					 BUF_GET, __FILE__, __LINE__, &mtr, &err);
+		block = buf_page_get_gen(page_id, zip_size,
+					 RW_S_LATCH, NULL, BUF_GET,
+					 __FILE__, __LINE__, &mtr, &err,
+					 !index->is_clust()
+					 && 1 == btr_page_get_level(page));
 
 		page = buf_block_get_frame(block);
 
@@ -1665,8 +1667,7 @@ dict_stats_analyze_index_for_n_prefix(
 		     n_prefix, n_diff_data->n_diff_on_level);
 #endif
 
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
-				MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
 
 	/* Position pcur on the leftmost record on the leftmost page
 	on the desired level. */
@@ -3192,7 +3193,7 @@ dict_stats_update(
 
 	if (!table->is_readable()) {
 		return (dict_stats_report_error(table));
-	} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+	} else if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
 		/* If we have set a high innodb_force_recovery level, do
 		not calculate statistics, as a badly corrupted index can
 		cause a crash in it. */
@@ -3391,16 +3392,18 @@ transient:
 	return(DB_SUCCESS);
 }
 
-/*********************************************************************//**
-Removes the information for a particular index's stats from the persistent
+/** Remove the information for a particular index's stats from the persistent
 storage if it exists and if there is data stored for this index.
 This function creates its own trx and commits it.
-A note from Marko why we cannot edit user and sys_* tables in one trx:
-marko: The problem is that ibuf merges should be disabled while we are
-rolling back dict transactions.
-marko: If ibuf merges are not disabled, we need to scan the *.ibd files.
-But we shouldn't open *.ibd files before we have rolled back dict
-transactions and opened the SYS_* records for the *.ibd files.
+
+We must modify system tables in a separate transaction in order to
+adhere to the InnoDB design constraint that dict_sys.latch prevents
+lock waits on system tables. If we modified system and user tables in
+the same transaction, we should exclusively hold dict_sys.latch until
+the transaction is committed, and effectively block other transactions
+that will attempt to open any InnoDB tables. Because we have no
+guarantee that user transactions will be committed fast, we cannot
+afford to keep the system tables locked in a user transaction.
 @return DB_SUCCESS or error code */
 dberr_t
 dict_stats_drop_index(
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
index 4968612e964..69f50cccbbf 100644
--- a/storage/innobase/dict/dict0stats_bg.cc
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -37,33 +37,17 @@ Created Apr 25, 2012 Vasil Dimov
 # include "wsrep.h"
 # include "log.h"
 # include "wsrep_mysqld.h"
-extern Atomic_relaxed<bool> wsrep_sst_disable_writes;
-#else
-constexpr bool wsrep_sst_disable_writes= false;
 #endif
 
 #include <vector>
 
 /** Minimum time interval between stats recalc for a given table */
 #define MIN_RECALC_INTERVAL	10 /* seconds */
-
-/** Event to wake up dict_stats_thread on dict_stats_recalc_pool_add()
-or shutdown. Not protected by any mutex. */
-os_event_t			dict_stats_event;
-
-/** Variable to initiate shutdown the dict stats thread. Note we don't
-use 'srv_shutdown_state' because we want to shutdown dict stats thread
-before purge thread. */
-bool				dict_stats_start_shutdown;
-
-/** Event to wait for shutdown of the dict stats thread */
-os_event_t			dict_stats_shutdown_event;
+static void dict_stats_schedule(int ms);
 
 #ifdef UNIV_DEBUG
 /** Used by SET GLOBAL innodb_dict_stats_disabled_debug = 1; */
 my_bool				innodb_dict_stats_disabled_debug;
-
-static os_event_t		dict_stats_disabled_event;
 #endif /* UNIV_DEBUG */
 
 /** This mutex protects the "recalc_pool" variable. */
@@ -122,7 +106,9 @@ static
 void
 dict_stats_recalc_pool_add(
 /*=======================*/
-	const dict_table_t*	table)	/*!< in: table to add */
+	const dict_table_t*	table,	/*!< in: table to add */
+	bool schedule_dict_stats_task = true /*!< in: schedule dict stats task */
+)
 {
 	ut_ad(!srv_read_only_mode);
 
@@ -140,10 +126,11 @@ dict_stats_recalc_pool_add(
 	}
 
 	recalc_pool.push_back(table->id);
-
+	if (recalc_pool.size() == 1 && schedule_dict_stats_task) {
+		dict_stats_schedule_now();
+	}
 	mutex_exit(&recalc_pool_mutex);
 
-	os_event_set(dict_stats_event);
 }
 
 #ifdef WITH_WSREP
@@ -312,14 +299,9 @@ dict_stats_wait_bg_to_stop_using_table(
 /*****************************************************************//**
 Initialize global variables needed for the operation of dict_stats_thread()
 Must be called before dict_stats_thread() is started. */
-void
-dict_stats_thread_init()
+void dict_stats_init()
 {
-	ut_a(!srv_read_only_mode);
-
-	dict_stats_event = os_event_create(0);
-	dict_stats_shutdown_event = os_event_create(0);
-	ut_d(dict_stats_disabled_event = os_event_create(0));
+	ut_ad(!srv_read_only_mode);
 
 	/* The recalc_pool_mutex is acquired from:
 	1) the background stats gathering thread before any other latch
@@ -342,48 +324,38 @@ dict_stats_thread_init()
 }
 
 /*****************************************************************//**
-Free resources allocated by dict_stats_thread_init(), must be called
-after dict_stats_thread() has exited. */
-void
-dict_stats_thread_deinit()
-/*======================*/
+Free resources allocated by dict_stats_init(), must be called
+after dict_stats task has exited. */
+void dict_stats_deinit()
 {
-	ut_a(!srv_read_only_mode);
-	ut_ad(!srv_dict_stats_thread_active);
-
 	if (!stats_initialised) {
 		return;
 	}
 
+	ut_ad(!srv_read_only_mode);
 	stats_initialised = false;
 
 	dict_stats_recalc_pool_deinit();
 	dict_defrag_pool_deinit();
 
 	mutex_free(&recalc_pool_mutex);
-
-	ut_d(os_event_destroy(dict_stats_disabled_event));
-	os_event_destroy(dict_stats_event);
-	os_event_destroy(dict_stats_shutdown_event);
-	dict_stats_start_shutdown = false;
 }
 
-/*****************************************************************//**
+/**
 Get the first table that has been added for auto recalc and eventually
-update its stats. */
-static
-void
-dict_stats_process_entry_from_recalc_pool()
-/*=======================================*/
+update its stats.
+@return whether the first entry can be processed immediately */
+static bool dict_stats_process_entry_from_recalc_pool()
 {
 	table_id_t	table_id;
 
 	ut_ad(!srv_read_only_mode);
 
+next_table_id:
 	/* pop the first table from the auto recalc pool */
 	if (!dict_stats_recalc_pool_get(&table_id)) {
 		/* no tables for auto recalc */
-		return;
+		return false;
 	}
 
 	dict_table_t*	table;
@@ -396,15 +368,15 @@ dict_stats_process_entry_from_recalc_pool()
 		/* table does not exist, must have been DROPped
 		after its id was enqueued */
 		mutex_exit(&dict_sys.mutex);
-		return;
+		goto next_table_id;
 	}
 
 	ut_ad(!table->is_temporary());
 
-	if (!fil_table_accessible(table)) {
+	if (!table->is_accessible()) {
 		dict_table_close(table, TRUE, FALSE);
 		mutex_exit(&dict_sys.mutex);
-		return;
+		goto next_table_id;
 	}
 
 	table->stats_bg_flag |= BG_STAT_IN_PROGRESS;
@@ -417,7 +389,7 @@ dict_stats_process_entry_from_recalc_pool()
 	find out that this is a problem, then the check below could eventually
 	be replaced with something else, though a time interval is the natural
 	approach. */
-
+	int ret;
 	if (difftime(time(NULL), table->stats_last_recalc)
 	    < MIN_RECALC_INTERVAL) {
 
@@ -425,11 +397,13 @@ dict_stats_process_entry_from_recalc_pool()
 		too frequent stats updates we put back the table on
 		the auto recalc list and do nothing. */
 
-		dict_stats_recalc_pool_add(table);
-
+		dict_stats_recalc_pool_add(table, false);
+		dict_stats_schedule(MIN_RECALC_INTERVAL*1000);
+		ret = false;
 	} else {
 
 		dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+		ret = true;
 	}
 
 	mutex_enter(&dict_sys.mutex);
@@ -439,6 +413,7 @@ dict_stats_process_entry_from_recalc_pool()
 	dict_table_close(table, TRUE, FALSE);
 
 	mutex_exit(&dict_sys.mutex);
+	return ret;
 }
 
 #ifdef UNIV_DEBUG
@@ -448,94 +423,57 @@ dict_stats_process_entry_from_recalc_pool()
 void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
 				      const void* save)
 {
-	/* This method is protected by mutex, as every SET GLOBAL .. */
-	ut_ad(dict_stats_disabled_event != NULL);
-
 	const bool disable = *static_cast<const my_bool*>(save);
-
-	const int64_t sig_count = os_event_reset(dict_stats_disabled_event);
-
-	innodb_dict_stats_disabled_debug = disable;
-
-	if (disable) {
-		os_event_set(dict_stats_event);
-		os_event_wait_low(dict_stats_disabled_event, sig_count);
-	}
+	if (disable)
+		dict_stats_shutdown();
+	else
+		dict_stats_start();
 }
 #endif /* UNIV_DEBUG */
 
+static tpool::timer* dict_stats_timer;
+static std::mutex dict_stats_mutex;
 
-/*****************************************************************//**
-This is the thread for background stats gathering. It pops tables, from
-the auto recalc list and proceeds them, eventually recalculating their
-statistics.
-@return this function does not return, it calls os_thread_exit() */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(dict_stats_thread)(void*)
+static void dict_stats_func(void*)
 {
-	my_thread_init();
-	ut_a(!srv_read_only_mode);
-
-#ifdef UNIV_PFS_THREAD
-	/* JAN: TODO: MySQL 5.7 PSI
-	pfs_register_thread(dict_stats_thread_key);
-	*/
-#endif /* UNIV_PFS_THREAD */
-
-	while (!dict_stats_start_shutdown) {
-
-		/* Wake up periodically even if not signaled. This is
-		because we may lose an event - if the below call to
-		dict_stats_process_entry_from_recalc_pool() puts the entry back
-		in the list, the os_event_set() will be lost by the subsequent
-		os_event_reset(). */
-		os_event_wait_time(
-			dict_stats_event, MIN_RECALC_INTERVAL * 1000000);
-
-		if (wsrep_sst_disable_writes) {
-			os_thread_sleep(1000000);
-			continue;
-		}
-
-#ifdef UNIV_DEBUG
-		while (innodb_dict_stats_disabled_debug) {
-			os_event_set(dict_stats_disabled_event);
-			if (dict_stats_start_shutdown) {
-				break;
-			}
-			os_event_wait_time(
-				dict_stats_event, 100000);
-		}
-#endif /* UNIV_DEBUG */
-
-		if (dict_stats_start_shutdown) {
-			break;
-		}
-
-		dict_stats_process_entry_from_recalc_pool();
-		dict_defrag_process_entries_from_defrag_pool();
+	while (dict_stats_process_entry_from_recalc_pool()) {}
+	dict_defrag_process_entries_from_defrag_pool();
+}
 
-		os_event_reset(dict_stats_event);
-	}
 
-	srv_dict_stats_thread_active = false;
+void dict_stats_start()
+{
+  std::lock_guard<std::mutex> lk(dict_stats_mutex);
+  if (!dict_stats_timer)
+    dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func);
+}
 
-	os_event_set(dict_stats_shutdown_event);
-	my_thread_end();
 
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit instead of return(). */
-	os_thread_exit();
+static void dict_stats_schedule(int ms)
+{
+  std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock);
+  /*
+    Use try_lock() to avoid deadlock in dict_stats_shutdown(), which
+    uses dict_stats_mutex too. If there is simultaneous timer reschedule,
+    the first one will win, which is fine.
+  */
+  if (!lk.try_lock())
+  {
+    return;
+  }
+  if (dict_stats_timer)
+    dict_stats_timer->set_time(ms,0);
+}
 
-	OS_THREAD_DUMMY_RETURN;
+void dict_stats_schedule_now()
+{
+  dict_stats_schedule(0);
 }
 
 /** Shut down the dict_stats_thread. */
-void
-dict_stats_shutdown()
+void dict_stats_shutdown()
 {
-	dict_stats_start_shutdown = true;
-	os_event_set(dict_stats_event);
-	os_event_wait(dict_stats_shutdown_event);
+  std::lock_guard<std::mutex> lk(dict_stats_mutex);
+  delete dict_stats_timer;
+  dict_stats_timer= 0;
 }
diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc
index 97540d00198..193a5814a78 100644
--- a/storage/innobase/eval/eval0eval.cc
+++ b/storage/innobase/eval/eval0eval.cc
@@ -337,10 +337,8 @@ eval_notfound(
 	ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
 
 	if (cursor->token_type == SYM_LIT) {
-
-		ut_ad(ut_memcmp(dfield_get_data(que_node_get_val(cursor)),
-				"SQL", 3) == 0);
-
+		ut_ad(!memcmp(dfield_get_data(que_node_get_val(cursor)),
+			      "SQL", 3));
 		sel_node = cursor->sym_table->query_graph->last_sel_node;
 	} else {
 		sel_node = cursor->alias->cursor_def;
@@ -494,7 +492,7 @@ eval_concat(
 		dfield = que_node_get_val(arg);
 		len1 = dfield_get_len(dfield);
 
-		ut_memcpy(data + len, dfield_get_data(dfield), len1);
+		memcpy(data + len, dfield_get_data(dfield), len1);
 
 		len += len1;
 
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
index 49cb8ee6024..5830634692b 100644
--- a/storage/innobase/fil/fil0crypt.cc
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -31,12 +31,12 @@ Modified           Jan Lindström jan.lindstrom@mariadb.com
 #ifdef UNIV_INNOCHECKSUM
 # include "buf0buf.h"
 #else
+#include "buf0dblwr.h"
 #include "srv0srv.h"
 #include "srv0start.h"
 #include "mtr0mtr.h"
 #include "mtr0log.h"
 #include "ut0ut.h"
-#include "btr0scrub.h"
 #include "fsp0fsp.h"
 #include "fil0pagecompress.h"
 #include <my_crypt.h>
@@ -78,20 +78,12 @@ UNIV_INTERN uint srv_n_fil_crypt_iops = 100;	 // 10ms per iop
 static uint srv_alloc_time = 3;		    // allocate iops for 3s at a time
 static uint n_fil_crypt_iops_allocated = 0;
 
-/** Variables for scrubbing */
-extern uint srv_background_scrub_data_interval;
-extern uint srv_background_scrub_data_check_interval;
-
 #define DEBUG_KEYROTATION_THROTTLING 0
 
 /** Statistics variables */
 static fil_crypt_stat_t crypt_stat;
 static ib_mutex_t crypt_stat_mutex;
 
-/** Is background scrubbing enabled, defined on btr0scrub.cc */
-extern my_bool srv_background_scrub_data_uncompressed;
-extern my_bool srv_background_scrub_data_compressed;
-
 /***********************************************************************
 Check if a key needs rotation given a key_state
 @param[in]	crypt_data		Encryption information
@@ -321,7 +313,6 @@ fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
 	members */
 	crypt_data->type = type;
 	crypt_data->min_key_version = min_key_version;
-	crypt_data->page0_offset = offset;
 	memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
 
 	return crypt_data;
@@ -354,6 +345,34 @@ fil_space_destroy_crypt_data(
 	}
 }
 
+/** Amend encryption information from redo log.
+@param[in]	space	tablespace
+@param[in]	data	encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data)
+{
+	ut_ad(data[1] == MY_AES_BLOCK_SIZE);
+	if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+		fil_space_crypt_t* crypt_data = new(buf)
+			fil_space_crypt_t(
+				data[0],
+				mach_read_from_4(&data[2 + MY_AES_BLOCK_SIZE]),
+				mach_read_from_4(&data[6 + MY_AES_BLOCK_SIZE]),
+				static_cast<fil_encryption_t>
+				(data[10 + MY_AES_BLOCK_SIZE]));
+		memcpy(crypt_data->iv, data + 2, MY_AES_BLOCK_SIZE);
+		mutex_enter(&fil_system.mutex);
+		if (space->crypt_data) {
+			fil_space_merge_crypt_data(space->crypt_data,
+						   crypt_data);
+			fil_space_destroy_crypt_data(&crypt_data);
+			crypt_data = space->crypt_data;
+		} else {
+			space->crypt_data = crypt_data;
+		}
+		mutex_exit(&fil_system.mutex);
+	}
+}
+
 /** Fill crypt data information to the give page.
 It should be called during ibd file creation.
 @param[in]	flags	tablespace flags
@@ -367,7 +386,6 @@ fil_space_crypt_t::fill_page0(
 	const ulint offset = FSP_HEADER_OFFSET
 		+ fsp_header_get_encryption_offset(
 			fil_space_t::zip_size(flags));
-	page0_offset = offset;
 
 	memcpy(page + offset, CRYPT_MAGIC, MAGIC_SZ);
 	mach_write_to_1(page + offset + MAGIC_SZ, type);
@@ -382,160 +400,34 @@ fil_space_crypt_t::fill_page0(
 			encryption);
 }
 
-/******************************************************************
-Write crypt data to a page (0)
-@param[in]	space	tablespace
-@param[in,out]	page0	first page of the tablespace
+/** Write encryption metadata to the first page.
+@param[in,out]	block	first page of the tablespace
 @param[in,out]	mtr	mini-transaction */
-UNIV_INTERN
-void
-fil_space_crypt_t::write_page0(
-	const fil_space_t*	space,
-	byte* 			page,
-	mtr_t*			mtr)
+void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
 {
-	ut_ad(this == space->crypt_data);
-	const uint len = sizeof(iv);
 	const ulint offset = FSP_HEADER_OFFSET
-		+ fsp_header_get_encryption_offset(space->zip_size());
-	page0_offset = offset;
-
-	/*
-	redo log this as bytewise updates to page 0
-	followed by an MLOG_FILE_WRITE_CRYPT_DATA
-	(that will during recovery update fil_space_t)
-	*/
-	mlog_write_string(page + offset, CRYPT_MAGIC, MAGIC_SZ, mtr);
-	mlog_write_ulint(page + offset + MAGIC_SZ + 0, type, MLOG_1BYTE, mtr);
-	mlog_write_ulint(page + offset + MAGIC_SZ + 1, len, MLOG_1BYTE, mtr);
-	mlog_write_string(page + offset + MAGIC_SZ + 2, iv, len,
-			  mtr);
-	mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len, min_key_version,
-			 MLOG_4BYTES, mtr);
-	mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len + 4, key_id,
-			 MLOG_4BYTES, mtr);
-	mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len + 8, encryption,
-		MLOG_1BYTE, mtr);
-
-	DBUG_EXECUTE_IF("ib_do_not_log_crypt_data", return;);
-
-	byte* log_ptr = mlog_open(mtr, 11 + 17 + len);
-
-	if (log_ptr != NULL) {
-		log_ptr = mlog_write_initial_log_record_fast(
-			page,
-			MLOG_FILE_WRITE_CRYPT_DATA,
-			log_ptr, mtr);
-		mach_write_to_4(log_ptr, space->id);
-		log_ptr += 4;
-		mach_write_to_2(log_ptr, offset);
-		log_ptr += 2;
-		mach_write_to_1(log_ptr, type);
-		log_ptr += 1;
-		mach_write_to_1(log_ptr, len);
-		log_ptr += 1;
-		mach_write_to_4(log_ptr, min_key_version);
-		log_ptr += 4;
-		mach_write_to_4(log_ptr, key_id);
-		log_ptr += 4;
-		mach_write_to_1(log_ptr, encryption);
-		log_ptr += 1;
-		mlog_close(mtr, log_ptr);
-
-		mlog_catenate_string(mtr, iv, len);
-	}
-}
-
-/******************************************************************
-Parse a MLOG_FILE_WRITE_CRYPT_DATA log entry
-@param[in]	ptr		Log entry start
-@param[in]	end_ptr		Log entry end
-@param[in]	block		buffer block
-@return position on log buffer */
-UNIV_INTERN
-byte*
-fil_parse_write_crypt_data(
-	byte*			ptr,
-	const byte*		end_ptr,
-	dberr_t*		err)
-{
-	/* check that redo log entry is complete */
-	uint entry_size =
-		4 + // size of space_id
-		2 + // size of offset
-		1 + // size of type
-		1 + // size of iv-len
-		4 +  // size of min_key_version
-		4 +  // size of key_id
-		1; // fil_encryption_t
-
-	*err = DB_SUCCESS;
-
-	if (ptr + entry_size > end_ptr) {
-		return NULL;
-	}
-
-	ulint space_id = mach_read_from_4(ptr);
-	ptr += 4;
-	uint offset = mach_read_from_2(ptr);
-	ptr += 2;
-	uint type = mach_read_from_1(ptr);
-	ptr += 1;
-	uint len = mach_read_from_1(ptr);
-	ptr += 1;
-
-	if ((type != CRYPT_SCHEME_1 && type != CRYPT_SCHEME_UNENCRYPTED)
-	    || len != CRYPT_SCHEME_1_IV_LEN) {
-		*err = DB_CORRUPTION;
-		return NULL;
-	}
-
-	uint min_key_version = mach_read_from_4(ptr);
-	ptr += 4;
-
-	uint key_id = mach_read_from_4(ptr);
-	ptr += 4;
-
-	fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(ptr);
-	ptr +=1;
-
-	if (ptr + len > end_ptr) {
-		return NULL;
-	}
-
-	mutex_enter(&fil_system.mutex);
-
-	fil_space_t* space = fil_space_get_by_id(space_id);
-
-	if (!space) {
-		mutex_exit(&fil_system.mutex);
-		return ptr + len;
-	}
-
-	fil_space_crypt_t* crypt_data = fil_space_create_crypt_data(
-		encryption, key_id);
-
-	crypt_data->page0_offset = offset;
-	crypt_data->min_key_version = min_key_version;
-	crypt_data->type = type;
-	memcpy(crypt_data->iv, ptr, len);
-	ptr += len;
-
-	if (space->crypt_data) {
-		fil_space_merge_crypt_data(space->crypt_data, crypt_data);
-		fil_space_destroy_crypt_data(&crypt_data);
-		crypt_data = space->crypt_data;
-	} else {
-		space->crypt_data = crypt_data;
-	}
-
-	mutex_exit(&fil_system.mutex);
-
-	if (crypt_data->should_encrypt() && !crypt_data->is_key_found()) {
-		*err = DB_DECRYPTION_FAILED;
-	}
-
-	return ptr;
+		+ fsp_header_get_encryption_offset(block->zip_size());
+	byte* b = block->frame + offset;
+
+	mtr->memcpy<mtr_t::MAYBE_NOP>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
+
+	b += MAGIC_SZ;
+	byte* const start = b;
+	*b++ = static_cast<byte>(type);
+	compile_time_assert(sizeof iv == MY_AES_BLOCK_SIZE);
+	compile_time_assert(sizeof iv == CRYPT_SCHEME_1_IV_LEN);
+	*b++ = sizeof iv;
+	memcpy(b, iv, sizeof iv);
+	b += sizeof iv;
+	mach_write_to_4(b, min_key_version);
+	b += 4;
+	mach_write_to_4(b, key_id);
+	b += 4;
+	*b++ = byte(encryption);
+	ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE);
+	/* We must log also any unchanged bytes, because recovery will
+	invoke fil_crypt_parse() based on this log record. */
+	mtr->memcpy(*block, offset + MAGIC_SZ, b - start);
 }
 
 /** Encrypt a buffer for non full checksum.
@@ -560,9 +452,11 @@ static byte* fil_encrypt_buf_for_non_full_checksum(
 	uint size = uint(zip_size ? zip_size : srv_page_size);
 	uint key_version = fil_crypt_get_latest_key_version(crypt_data);
 	ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+	ut_ad(!ut_align_offset(src_frame, 8));
+	ut_ad(!ut_align_offset(dst_frame, 8));
 
-	ulint orig_page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE);
-	ibool page_compressed = (orig_page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+	const bool page_compressed = fil_page_get_type(src_frame)
+		== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED;
 	uint header_len = FIL_PAGE_DATA;
 
 	if (page_compressed) {
@@ -598,9 +492,9 @@ static byte* fil_encrypt_buf_for_non_full_checksum(
 	to sector boundary is written. */
 	if (!page_compressed) {
 		/* FIL page trailer is also not encrypted */
-		memcpy(dst_frame + size - FIL_PAGE_DATA_END,
-			src_frame + size - FIL_PAGE_DATA_END,
-			FIL_PAGE_DATA_END);
+		static_assert(FIL_PAGE_DATA_END == 8, "alignment");
+		memcpy_aligned<8>(dst_frame + size - FIL_PAGE_DATA_END,
+				  src_frame + size - FIL_PAGE_DATA_END, 8);
 	} else {
 		/* Clean up rest of buffer */
 		memset(dst_frame+header_len+srclen, 0,
@@ -680,26 +574,25 @@ static byte* fil_encrypt_buf_for_full_crc32(
 @param[in,out]		crypt_data		Crypt data
 @param[in]		space			space_id
 @param[in]		offset			Page offset
-@param[in]		lsn			Log sequence number
 @param[in]		src_frame		Page to encrypt
 @param[in]		zip_size		ROW_FORMAT=COMPRESSED
 						page size, or 0
 @param[in,out]		dst_frame		Output buffer
 @param[in]		use_full_checksum	full crc32 algo is used
 @return encrypted buffer or NULL */
-UNIV_INTERN
-byte*
-fil_encrypt_buf(
+byte* fil_encrypt_buf(
 	fil_space_crypt_t*	crypt_data,
 	ulint			space,
 	ulint			offset,
-	lsn_t			lsn,
 	const byte*		src_frame,
 	ulint			zip_size,
 	byte*			dst_frame,
 	bool			use_full_checksum)
 {
+	const lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
 	if (use_full_checksum) {
+		ut_ad(!zip_size);
 		return fil_encrypt_buf_for_full_crc32(
 			crypt_data, space, offset,
 			lsn, src_frame, dst_frame);
@@ -716,9 +609,9 @@ fil_encrypt_buf(
 @return true if it is valid page type */
 static bool fil_space_encrypt_valid_page_type(
 	const fil_space_t*	space,
-	byte*			src_frame)
+	const byte*		src_frame)
 {
-	switch (mach_read_from_2(src_frame+FIL_PAGE_TYPE)) {
+	switch (fil_page_get_type(src_frame)) {
 	case FIL_PAGE_RTREE:
 		return space->full_crc32();
 	case FIL_PAGE_TYPE_FSP_HDR:
@@ -734,16 +627,12 @@ Encrypt a page
 
 @param[in]		space		Tablespace
 @param[in]		offset		Page offset
-@param[in]		lsn		Log sequence number
 @param[in]		src_frame	Page to encrypt
 @param[in,out]		dst_frame	Output buffer
 @return encrypted buffer or NULL */
-UNIV_INTERN
-byte*
-fil_space_encrypt(
+byte* fil_space_encrypt(
 	const fil_space_t*	space,
 	ulint			offset,
-	lsn_t			lsn,
 	byte*			src_frame,
 	byte*			dst_frame)
 {
@@ -755,9 +644,9 @@ fil_space_encrypt(
 		return (src_frame);
 	}
 
-	ut_ad(space->pending_io());
+	ut_ad(space->referenced());
 
-	return fil_encrypt_buf(space->crypt_data, space->id, offset, lsn,
+	return fil_encrypt_buf(space->crypt_data, space->id, offset,
 			       src_frame, space->zip_size(),
 			       dst_frame, space->full_crc32());
 }
@@ -837,10 +726,9 @@ static dberr_t fil_space_decrypt_for_non_full_checksum(
 	ulint			physical_size,
 	byte*			src_frame)
 {
-	ulint page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE);
 	uint key_version = mach_read_from_4(
 			src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
-	bool page_compressed = (page_type
+	bool page_compressed = (fil_page_get_type(src_frame)
 				== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
 	uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
 	uint space = mach_read_from_4(
@@ -951,7 +839,7 @@ fil_space_decrypt(
 	const ulint physical_size = space->physical_size();
 
 	ut_ad(space->crypt_data != NULL && space->crypt_data->is_encrypted());
-	ut_ad(space->pending_io());
+	ut_ad(space->referenced());
 
 	if (DB_SUCCESS != fil_space_decrypt(space->id, space->crypt_data,
 					    tmp_frame, physical_size,
@@ -1075,8 +963,7 @@ static inline
 void
 fil_crypt_read_crypt_data(fil_space_t* space)
 {
-	if (space->crypt_data || space->size
-	    || !fil_space_get_size(space->id)) {
+	if (space->crypt_data || space->size || !space->get_size()) {
 		/* The encryption metadata has already been read, or
 		the tablespace is not encrypted and the file has been
 		opened already, or the file cannot be accessed,
@@ -1092,15 +979,22 @@ fil_crypt_read_crypt_data(fil_space_t* space)
 	const ulint zip_size = space->zip_size();
 	mtr_t	mtr;
 	mtr.start();
-	if (buf_block_t* block = buf_page_get(page_id_t(space->id, 0),
-					      zip_size, RW_S_LATCH, &mtr)) {
+	if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, 0),
+						  zip_size, RW_S_LATCH,
+						  nullptr,
+						  BUF_GET_POSSIBLY_FREED,
+						  __FILE__, __LINE__, &mtr)) {
+		if (block->page.status == buf_page_t::FREED) {
+			goto func_exit;
+		}
 		mutex_enter(&fil_system.mutex);
-		if (!space->crypt_data) {
+		if (!space->crypt_data && !space->is_stopping()) {
 			space->crypt_data = fil_space_read_crypt_data(
 				zip_size, block->frame);
 		}
 		mutex_exit(&fil_system.mutex);
 	}
+func_exit:
 	mtr.commit();
 }
 
@@ -1109,8 +1003,6 @@ fil_crypt_read_crypt_data(fil_space_t* space)
 @return true if a recheck of tablespace is needed by encryption thread. */
 static bool fil_crypt_start_encrypting_space(fil_space_t* space)
 {
-	bool recheck = false;
-
 	mutex_enter(&fil_crypt_threads_mutex);
 
 	fil_space_crypt_t *crypt_data = space->crypt_data;
@@ -1122,12 +1014,9 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space)
 		return false;
 	}
 
-	if (crypt_data != NULL || fil_crypt_start_converting) {
-		/* someone beat us to it */
-		if (fil_crypt_start_converting) {
-			recheck = true;
-		}
+	const bool recheck = fil_crypt_start_converting;
 
+	if (recheck || crypt_data || space->is_stopping()) {
 		mutex_exit(&fil_crypt_threads_mutex);
 		return recheck;
 	}
@@ -1146,53 +1035,47 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space)
 		return false;
 	}
 
-	crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
-	crypt_data->min_key_version = 0; // all pages are unencrypted
-	crypt_data->rotate_state.start_time = time(0);
-	crypt_data->rotate_state.starting = true;
-	crypt_data->rotate_state.active_threads = 1;
-
-	mutex_enter(&fil_system.mutex);
-	space->crypt_data = crypt_data;
-	mutex_exit(&fil_system.mutex);
-
 	fil_crypt_start_converting = true;
 	mutex_exit(&fil_crypt_threads_mutex);
 
-	do
-	{
-		mtr_t mtr;
-		mtr.start();
-		mtr.set_named_space(space);
-
-		/* 2 - get page 0 */
-		dberr_t err = DB_SUCCESS;
-		buf_block_t* block = buf_page_get_gen(
-			page_id_t(space->id, 0), space->zip_size(),
-			RW_X_LATCH, NULL, BUF_GET,
-			__FILE__, __LINE__,
-			&mtr, &err);
+	mtr_t mtr;
+	mtr.start();
 
+	/* 2 - get page 0 */
+	dberr_t err = DB_SUCCESS;
+	if (buf_block_t* block = buf_page_get_gen(
+		    page_id_t(space->id, 0), space->zip_size(),
+		    RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
+		    __FILE__, __LINE__, &mtr, &err)) {
+		if (block->page.status == buf_page_t::FREED) {
+			goto abort;
+		}
 
-		/* 3 - write crypt data to page 0 */
-		byte* frame = buf_block_get_frame(block);
 		crypt_data->type = CRYPT_SCHEME_1;
-		crypt_data->write_page0(space, frame, &mtr);
+		crypt_data->min_key_version = 0; // all pages are unencrypted
+		crypt_data->rotate_state.start_time = time(0);
+		crypt_data->rotate_state.starting = true;
+		crypt_data->rotate_state.active_threads = 1;
 
-		mtr.commit();
+		mutex_enter(&fil_system.mutex);
+		const bool stopping = space->is_stopping();
+		if (!stopping) {
+			space->crypt_data = crypt_data;
+		}
+		mutex_exit(&fil_system.mutex);
 
-		/* record lsn of update */
-		lsn_t end_lsn = mtr.commit_lsn();
+		if (stopping) {
+			goto abort;
+		}
 
-		/* 4 - sync tablespace before publishing crypt data */
+		/* 3 - write crypt data to page 0 */
+		mtr.set_named_space(space);
+		crypt_data->write_page0(block, &mtr);
 
-		bool success = false;
+		mtr.commit();
 
-		do {
-			ulint n_pages = 0;
-			success = buf_flush_lists(ULINT_MAX, end_lsn, &n_pages);
-			buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-		} while (!success);
+		/* 4 - sync tablespace before publishing crypt data */
+		while (buf_flush_list_space(space));
 
 		/* 5 - publish crypt data */
 		mutex_enter(&fil_crypt_threads_mutex);
@@ -1206,19 +1089,18 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space)
 		mutex_exit(&crypt_data->mutex);
 		mutex_exit(&fil_crypt_threads_mutex);
 
-		return recheck;
-	} while (0);
-
-	mutex_enter(&crypt_data->mutex);
-	ut_a(crypt_data->rotate_state.active_threads == 1);
-	crypt_data->rotate_state.active_threads = 0;
-	mutex_exit(&crypt_data->mutex);
+		return false;
+	}
 
+abort:
+	mtr.commit();
 	mutex_enter(&fil_crypt_threads_mutex);
 	fil_crypt_start_converting = false;
 	mutex_exit(&fil_crypt_threads_mutex);
 
-	return recheck;
+	crypt_data->~fil_space_crypt_t();
+	ut_free(crypt_data);
+	return false;
 }
 
 /** State of a rotation thread */
@@ -1233,7 +1115,7 @@ struct rotate_thread_t {
 	uint thread_no;
 	bool first;		    /*!< is position before first space */
 	fil_space_t* space;	    /*!< current space or NULL */
-	ulint offset;		    /*!< current offset */
+	uint32_t offset;	    /*!< current page number */
 	ulint batch;		    /*!< #pages to rotate */
 	uint  min_key_version_found;/*!< min key version found but not rotated */
 	lsn_t end_lsn;		    /*!< max lsn when rotating this space */
@@ -1245,9 +1127,6 @@ struct rotate_thread_t {
 
 	fil_crypt_stat_t crypt_stat; // statistics
 
-	btr_scrub_t scrub_data;      /* thread local data used by btr_scrub-functions
-				     * when iterating pages of tablespace */
-
 	/** @return whether this thread should terminate */
 	bool should_shutdown() const {
 		switch (srv_shutdown_state) {
@@ -1258,7 +1137,6 @@ struct rotate_thread_t {
 		case SRV_SHUTDOWN_CLEANUP:
 		case SRV_SHUTDOWN_INITIATED:
 			return true;
-		case SRV_SHUTDOWN_FLUSH_PHASE:
 		case SRV_SHUTDOWN_LAST_PHASE:
 			break;
 		}
@@ -1339,6 +1217,7 @@ fil_crypt_space_needs_rotation(
 		return false;
 	}
 
+	bool need_key_rotation = false;
 	mutex_enter(&crypt_data->mutex);
 
 	do {
@@ -1368,38 +1247,15 @@ fil_crypt_space_needs_rotation(
 			fil_crypt_get_key_state(key_state, crypt_data);
 		}
 
-		bool need_key_rotation = fil_crypt_needs_rotation(
+		need_key_rotation = fil_crypt_needs_rotation(
 			crypt_data,
 			crypt_data->min_key_version,
 			key_state->key_version,
 			key_state->rotate_key_age);
-
-		crypt_data->rotate_state.scrubbing.is_active =
-			btr_scrub_start_space(*space, &state->scrub_data);
-
-		time_t diff = time(0) - crypt_data->rotate_state.scrubbing.
-			last_scrub_completed;
-
-		bool need_scrubbing =
-			(srv_background_scrub_data_uncompressed ||
-			 srv_background_scrub_data_compressed) &&
-			crypt_data->rotate_state.scrubbing.is_active
-			&& diff >= 0
-			&& ulint(diff) >= srv_background_scrub_data_interval;
-
-		if (need_key_rotation == false && need_scrubbing == false) {
-			break;
-		}
-
-		mutex_exit(&crypt_data->mutex);
-
-		return true;
 	} while (0);
 
 	mutex_exit(&crypt_data->mutex);
-
-
-	return false;
+	return need_key_rotation;
 }
 
 /***********************************************************************
@@ -1562,13 +1418,10 @@ fil_crypt_realloc_iops(
 	fil_crypt_update_total_stat(state);
 }
 
-/***********************************************************************
-Return allocated iops to global
-@param[in,out]		state		Rotation state */
-static
-void
-fil_crypt_return_iops(
-	rotate_thread_t *state)
+/** Release excess allocated iops
+@param state   rotation state
+@param wake    whether to wake up other threads */
+static void fil_crypt_return_iops(rotate_thread_t *state, bool wake= true)
 {
 	if (state->allocated_iops > 0) {
 		uint iops = state->allocated_iops;
@@ -1584,27 +1437,44 @@ fil_crypt_return_iops(
 
 		n_fil_crypt_iops_allocated -= iops;
 		state->allocated_iops = 0;
-		os_event_set(fil_crypt_threads_event);
+		if (wake) {
+			os_event_set(fil_crypt_threads_event);
+		}
 		mutex_exit(&fil_crypt_threads_mutex);
 	}
 
 	fil_crypt_update_total_stat(state);
 }
 
+/** Acquire a tablespace reference.
+@return whether a tablespace reference was successfully acquired */
+inline bool fil_space_t::acquire_if_not_stopped()
+{
+  ut_ad(mutex_own(&fil_system.mutex));
+  const uint32_t n= acquire_low();
+  if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+    return true;
+  if (UNIV_UNLIKELY(n & STOPPING))
+    return false;
+  return UNIV_LIKELY(!(n & CLOSING)) || prepare(true);
+}
+
 bool fil_crypt_must_default_encrypt()
 {
   return !srv_fil_crypt_rotate_key_age || !srv_encrypt_rotate;
 }
 
-/** Return the next tablespace from default_encrypt_tables.
-@param space   previous tablespace (NULL to start from the start)
+/** Return the next tablespace from default_encrypt_tables list.
+@param space   previous tablespace (nullptr to start from the start)
 @param recheck whether the removal condition needs to be rechecked after
 the encryption parameters were changed
 @param encrypt expected state of innodb_encrypt_tables
 @return the next tablespace to process (n_pending_ops incremented)
-@retval NULL if this was the last */
-inline fil_space_t *fil_system_t::default_encrypt_next(
-  fil_space_t *space, bool recheck, bool encrypt)
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_system_t::default_encrypt_next(fil_space_t *space,
+                                                       bool recheck,
+                                                       bool encrypt)
 {
   ut_ad(mutex_own(&mutex));
 
@@ -1648,55 +1518,58 @@ inline fil_space_t *fil_system_t::default_encrypt_next(
     it++;
   }
 
-  while (it != end)
+  if (it == end)
+    return temp_space;
+
+  do
   {
     space= &*it;
-    if (space->acquire())
+    if (space->acquire_if_not_stopped())
       return space;
-    while (++it != end && (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()));
+    if (++it == end)
+      return nullptr;
   }
+  while (!UT_LIST_GET_LEN(it->chain) || it->is_stopping());
 
-  return NULL;
+  return nullptr;
 }
 
-/** Return the next tablespace.
-@param space    previous tablespace (NULL to start from the beginning)
+/** Determine the next tablespace for encryption key rotation.
+@param space    current tablespace (nullptr to start from the beginning)
 @param recheck  whether the removal condition needs to be rechecked after
-the encryption parameters were changed
+encryption parameters were changed
 @param encrypt  expected state of innodb_encrypt_tables
-@return pointer to the next tablespace (with n_pending_ops incremented)
-@retval NULL if this was the last */
-static fil_space_t *fil_space_next(fil_space_t *space, bool recheck,
-                                   bool encrypt)
+@return the next tablespace
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_space_t::next(fil_space_t *space, bool recheck,
+                                      bool encrypt)
 {
   mutex_enter(&fil_system.mutex);
 
   if (fil_crypt_must_default_encrypt())
     space= fil_system.default_encrypt_next(space, recheck, encrypt);
-  else if (!space)
-  {
-    space= UT_LIST_GET_FIRST(fil_system.space_list);
-    /* We can trust that space is not NULL because at least the
-    system tablespace is always present and loaded first. */
-    if (!space->acquire())
-      goto next;
-  }
   else
   {
-    /* Move on to the next fil_space_t */
-    space->release();
-next:
-    space= UT_LIST_GET_NEXT(space_list, space);
-
-    /* Skip abnormal tablespaces or those that are being created by
-    fil_ibd_create(), or being dropped. */
-    while (space &&
-           (UT_LIST_GET_LEN(space->chain) == 0 ||
-            space->is_stopping() || space->purpose != FIL_TYPE_TABLESPACE))
+    if (!space)
+      space= UT_LIST_GET_FIRST(fil_system.space_list);
+    else
+    {
+      /* Move on to the next fil_space_t */
+      space->release();
       space= UT_LIST_GET_NEXT(space_list, space);
+    }
 
-    if (space && !space->acquire())
-      goto next;
+    for (; space; space= UT_LIST_GET_NEXT(space_list, space))
+    {
+      if (space->purpose != FIL_TYPE_TABLESPACE)
+        continue;
+      const uint32_t n= space->acquire_low();
+      if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+        break;
+      if (!(n & STOPPING) && space->prepare(true))
+        break;
+    }
   }
 
   mutex_exit(&fil_system.mutex);
@@ -1740,10 +1613,24 @@ static bool fil_crypt_find_space_to_rotate(
 		state->space = NULL;
 	}
 
-	state->space = fil_space_next(state->space, *recheck,
-				      key_state->key_version != 0);
+	bool wake;
+	for (;;) {
+		state->space = fil_space_t::next(state->space, *recheck,
+						 key_state->key_version != 0);
+		wake = state->should_shutdown();
+
+		if (state->space == fil_system.temp_space) {
+			goto done;
+		} else if (wake) {
+			break;
+		} else {
+			wake = true;
+		}
+
+		if (!state->space) {
+			break;
+		}
 
-	while (!state->should_shutdown() && state->space) {
 		/* If there is no crypt data and we have not yet read
 		page 0 for this tablespace, we need to read it before
 		we can continue. */
@@ -1758,18 +1645,16 @@ static bool fil_crypt_find_space_to_rotate(
 			state->min_key_version_found = key_state->key_version;
 			return true;
 		}
-
-		state->space = fil_space_next(state->space, *recheck,
-					      key_state->key_version != 0);
 	}
 
 	if (state->space) {
 		state->space->release();
+done:
 		state->space = NULL;
 	}
 
 	/* no work to do; release our allocation of I/O capacity */
-	fil_crypt_return_iops(state);
+	fil_crypt_return_iops(state, wake);
 
 	return false;
 
@@ -1865,7 +1750,7 @@ fil_crypt_find_page_to_rotate(
 		}
 	}
 
-	crypt_data->rotate_state.next_offset += batch;
+	crypt_data->rotate_state.next_offset += uint32_t(batch);
 	mutex_exit(&crypt_data->mutex);
 	return found;
 }
@@ -1887,7 +1772,7 @@ static
 buf_block_t*
 fil_crypt_get_page_throttle_func(
 	rotate_thread_t*	state,
-	ulint 			offset,
+	uint32_t		offset,
 	mtr_t*			mtr,
 	ulint*			sleeptime_ms,
 	const char*		file,
@@ -1919,6 +1804,11 @@ fil_crypt_get_page_throttle_func(
 		return NULL;
 	}
 
+	if (fseg_page_is_free(space, state->offset)) {
+		/* page is already freed */
+		return NULL;
+	}
+
 	state->crypt_stat.pages_read_from_disk++;
 
 	const ulonglong start = my_interval_timer();
@@ -1951,69 +1841,6 @@ fil_crypt_get_page_throttle_func(
 	return block;
 }
 
-
-/***********************************************************************
-Get block and allocation status
-
-note: innodb locks fil_space_latch and then block when allocating page
-but locks block and then fil_space_latch when freeing page.
-
-@param[in,out]		state		Rotation state
-@param[in]		offset		Page offset
-@param[in,out]		mtr		Minitransaction
-@param[out]		allocation_status Allocation status
-@param[out]		sleeptime_ms	Sleep time
-@return block or NULL
-*/
-static
-buf_block_t*
-btr_scrub_get_block_and_allocation_status(
-	rotate_thread_t*	state,
-	ulint 			offset,
-	mtr_t*			mtr,
-	btr_scrub_page_allocation_status_t *allocation_status,
-	ulint*			sleeptime_ms)
-{
-	mtr_t local_mtr;
-	buf_block_t *block = NULL;
-	fil_space_t* space = state->space;
-
-	ut_ad(space->referenced());
-
-	mtr_start(&local_mtr);
-
-	*allocation_status = fseg_page_is_free(space, (uint32_t)offset) ?
-		BTR_SCRUB_PAGE_FREE :
-		BTR_SCRUB_PAGE_ALLOCATED;
-
-	if (*allocation_status == BTR_SCRUB_PAGE_FREE) {
-		/* this is easy case, we lock fil_space_latch first and
-		then block */
-		block = fil_crypt_get_page_throttle(state,
-						    offset, mtr,
-						    sleeptime_ms);
-		mtr_commit(&local_mtr);
-	} else {
-		/* page is allocated according to xdes */
-
-		/* release fil_space_latch *before* fetching block */
-		mtr_commit(&local_mtr);
-
-		/* NOTE: when we have locked dict_index_get_lock(),
-		* it's safe to release fil_space_latch and then fetch block
-		* as dict_index_get_lock() is needed to make tree modifications
-		* such as free-ing a page
-		*/
-
-		block = fil_crypt_get_page_throttle(state,
-						    offset, mtr,
-						    sleeptime_ms);
-	}
-
-	return block;
-}
-
-
 /***********************************************************************
 Rotate one page
 @param[in,out]		key_state		Key state
@@ -2026,7 +1853,7 @@ fil_crypt_rotate_page(
 {
 	fil_space_t*space = state->space;
 	ulint space_id = space->id;
-	ulint offset = state->offset;
+	uint32_t offset = state->offset;
 	ulint sleeptime_ms = 0;
 	fil_space_crypt_t *crypt_data = space->crypt_data;
 
@@ -2052,12 +1879,17 @@ fil_crypt_rotate_page(
 							     offset, &mtr,
 							     &sleeptime_ms)) {
 		bool modified = false;
-		int needs_scrubbing = BTR_SCRUB_SKIP_PAGE;
-		lsn_t block_lsn = block->page.newest_modification;
 		byte* frame = buf_block_get_frame(block);
+		const lsn_t block_lsn = mach_read_from_8(FIL_PAGE_LSN + frame);
 		uint kv = buf_page_get_key_version(frame, space->flags);
 
-		if (space->is_stopping()) {
+		if (block->page.status == buf_page_t::FREED) {
+			/* Do not modify freed pages to avoid an assertion
+			failure on recovery. */
+		} else if (block->page.oldest_modification() > 1) {
+			/* Do not unnecessarily touch pages that are
+			already dirty. */
+		} else if (space->is_stopping()) {
 			/* The tablespace is closing (in DROP TABLE or
 			TRUNCATE TABLE or similar): avoid further access */
 		} else if (!kv && !*reinterpret_cast<uint16_t*>
@@ -2096,8 +1928,9 @@ fil_crypt_rotate_page(
 			modified = true;
 
 			/* force rotation by dummy updating page */
-			mlog_write_ulint(frame + FIL_PAGE_SPACE_ID,
-					 space_id, MLOG_4BYTES, &mtr);
+			mtr.write<1,mtr_t::FORCED>(*block,
+						   &frame[FIL_PAGE_SPACE_ID],
+						   frame[FIL_PAGE_SPACE_ID]);
 
 			/* statistics */
 			state->crypt_stat.pages_modified++;
@@ -2107,74 +1940,11 @@ fil_crypt_rotate_page(
 					state->min_key_version_found = kv;
 				}
 			}
-
-			needs_scrubbing = btr_page_needs_scrubbing(
-				&state->scrub_data, block,
-				BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN);
 		}
 
 		mtr.commit();
 		lsn_t end_lsn = mtr.commit_lsn();
 
-		if (needs_scrubbing == BTR_SCRUB_PAGE) {
-			mtr.start();
-			/*
-			* refetch page and allocation status
-			*/
-			btr_scrub_page_allocation_status_t allocated;
-
-			block = btr_scrub_get_block_and_allocation_status(
-				state, offset, &mtr,
-				&allocated,
-				&sleeptime_ms);
-
-			if (block) {
-				mtr.set_named_space(space);
-
-				/* get required table/index and index-locks */
-				needs_scrubbing = btr_scrub_recheck_page(
-					&state->scrub_data, block, allocated, &mtr);
-
-				if (needs_scrubbing == BTR_SCRUB_PAGE) {
-					/* we need to refetch it once more now that we have
-					* index locked */
-					block = btr_scrub_get_block_and_allocation_status(
-						state, offset, &mtr,
-						&allocated,
-						&sleeptime_ms);
-
-					needs_scrubbing = btr_scrub_page(&state->scrub_data,
-						block, allocated,
-						&mtr);
-				}
-
-				/* NOTE: mtr is committed inside btr_scrub_recheck_page()
-				* and/or btr_scrub_page. This is to make sure that
-				* locks & pages are latched in corrected order,
-				* the mtr is in some circumstances restarted.
-				* (mtr_commit() + mtr_start())
-				*/
-			}
-		}
-
-		if (needs_scrubbing != BTR_SCRUB_PAGE) {
-			/* if page didn't need scrubbing it might be that cleanups
-			are needed. do those outside of any mtr to prevent deadlocks.
-
-			the information what kinds of cleanups that are needed are
-			encoded inside the needs_scrubbing, but this is opaque to
-			this function (except the value BTR_SCRUB_PAGE) */
-			btr_scrub_skip_page(&state->scrub_data, needs_scrubbing);
-		}
-
-		if (needs_scrubbing == BTR_SCRUB_TURNED_OFF) {
-			/* if we just detected that scrubbing was turned off
-			* update global state to reflect this */
-			ut_ad(crypt_data);
-			mutex_enter(&crypt_data->mutex);
-			crypt_data->rotate_state.scrubbing.is_active = false;
-			mutex_exit(&crypt_data->mutex);
-		}
 
 		if (modified) {
 			/* if we modified page, we take lsn from mtr */
@@ -2213,9 +1983,9 @@ fil_crypt_rotate_pages(
 	const key_state_t*	key_state,
 	rotate_thread_t*	state)
 {
-	ulint space = state->space->id;
-	ulint end = std::min(state->offset + state->batch,
-			     state->space->free_limit);
+	ulint space_id = state->space->id;
+	uint32_t end = std::min(state->offset + uint32_t(state->batch),
+				state->space->free_limit);
 
 	ut_ad(state->space->referenced());
 
@@ -2229,8 +1999,7 @@ fil_crypt_rotate_pages(
 		* real pages, they will be updated anyway when the
 		* real page is updated
 		*/
-		if (space == TRX_SYS_SPACE &&
-		    buf_dblwr_page_inside(state->offset)) {
+		if (buf_dblwr.is_inside(page_id_t(space_id, state->offset))) {
 			continue;
 		}
 
@@ -2262,20 +2031,12 @@ fil_crypt_flush_space(
 	lsn_t end_lsn = crypt_data->rotate_state.end_lsn;
 
 	if (end_lsn > 0 && !space->is_stopping()) {
-		bool success = false;
-		ulint n_pages = 0;
 		ulint sum_pages = 0;
 		const ulonglong start = my_interval_timer();
+		while (buf_flush_list_space(space, &sum_pages));
+		if (sum_pages) {
+			const ulonglong end = my_interval_timer();
 
-		do {
-			success = buf_flush_lists(ULINT_MAX, end_lsn, &n_pages);
-			buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-			sum_pages += n_pages;
-		} while (!success && !space->is_stopping());
-
-		const ulonglong end = my_interval_timer();
-
-		if (sum_pages && end > start) {
 			state->cnt_waited += sum_pages;
 			state->sum_waited_us += (end - start) / 1000;
 
@@ -2296,14 +2057,14 @@ fil_crypt_flush_space(
 	mtr_t mtr;
 	mtr.start();
 
-	dberr_t err;
-
 	if (buf_block_t* block = buf_page_get_gen(
 		    page_id_t(space->id, 0), space->zip_size(),
-		    RW_X_LATCH, NULL, BUF_GET,
-		    __FILE__, __LINE__, &mtr, &err)) {
-		mtr.set_named_space(space);
-		crypt_data->write_page0(space, block->frame, &mtr);
+		    RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
+		    __FILE__, __LINE__, &mtr)) {
+		if (block->page.status != buf_page_t::FREED) {
+			mtr.set_named_space(space);
+			crypt_data->write_page0(block, &mtr);
+		}
 	}
 
 	mtr.commit();
@@ -2359,31 +2120,14 @@ static void fil_crypt_complete_rotate_space(rotate_thread_t* state)
 			crypt_data->rotate_state.flushing = true;
 			crypt_data->min_key_version =
 				crypt_data->rotate_state.min_key_version_found;
-		}
-
-		/* inform scrubbing */
-		crypt_data->rotate_state.scrubbing.is_active = false;
-		mutex_exit(&crypt_data->mutex);
-
-		/* all threads must call btr_scrub_complete_space wo/ mutex held */
-		if (state->scrub_data.scrubbing) {
-			btr_scrub_complete_space(&state->scrub_data);
-			if (should_flush) {
-				/* only last thread updates last_scrub_completed */
-				ut_ad(crypt_data);
-				mutex_enter(&crypt_data->mutex);
-				crypt_data->rotate_state.scrubbing.
-					last_scrub_completed = time(0);
-				mutex_exit(&crypt_data->mutex);
-			}
-		}
-
-		if (should_flush) {
+			mutex_exit(&crypt_data->mutex);
 			fil_crypt_flush_space(state);
 
 			mutex_enter(&crypt_data->mutex);
 			crypt_data->rotate_state.flushing = false;
 			mutex_exit(&crypt_data->mutex);
+		} else {
+			mutex_exit(&crypt_data->mutex);
 		}
 	} else {
 		mutex_enter(&crypt_data->mutex);
@@ -2416,8 +2160,6 @@ DECLARE_THREAD(fil_crypt_thread)(void*)
 
 		key_state_t new_state;
 
-		time_t wait_start = time(0);
-
 		while (!thr.should_shutdown()) {
 
 			/* wait for key state changes
@@ -2435,17 +2177,6 @@ DECLARE_THREAD(fil_crypt_thread)(void*)
 				* a space*/
 				break;
 			}
-
-			time_t waited = time(0) - wait_start;
-
-			/* Break if we have waited the background scrub
-			internal and background scrubbing is enabled */
-			if (waited >= 0
-			    && ulint(waited) >= srv_background_scrub_data_check_interval
-			    && (srv_background_scrub_data_uncompressed
-			        || srv_background_scrub_data_compressed)) {
-				break;
-			}
 		}
 
 		recheck = false;
@@ -2524,6 +2255,8 @@ fil_crypt_set_thread_cnt(
 	const uint	new_cnt)
 {
 	if (!fil_crypt_threads_inited) {
+		if (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+			return;
 		fil_crypt_threads_init();
 	}
 
@@ -2533,11 +2266,9 @@ fil_crypt_set_thread_cnt(
 		uint add = new_cnt - srv_n_fil_crypt_threads;
 		srv_n_fil_crypt_threads = new_cnt;
 		for (uint i = 0; i < add; i++) {
-			os_thread_id_t rotation_thread_id;
-			os_thread_create(fil_crypt_thread, NULL, &rotation_thread_id);
 			ib::info() << "Creating #"
 				   << i+1 << " encryption thread id "
-				   << os_thread_pf(rotation_thread_id)
+				   << os_thread_create(fil_crypt_thread)
 				   << " total threads " << new_cnt << ".";
 		}
 	} else if (new_cnt < srv_n_fil_crypt_threads) {
@@ -2571,21 +2302,12 @@ static void fil_crypt_default_encrypt_tables_fill()
 		if (space->purpose != FIL_TYPE_TABLESPACE
 		    || space->is_in_default_encrypt
 		    || UT_LIST_GET_LEN(space->chain) == 0
-		    || !space->acquire()) {
+		    || !space->acquire_if_not_stopped()) {
 			continue;
 		}
 
 		/* Ensure that crypt_data has been initialized. */
-		if (!space->size) {
-			ut_d(const fil_space_t* s=)
-			        fil_system.read_page0(space->id);
-			ut_ad(!s || s == space);
-			if (!space->size) {
-				/* Page 0 was not loaded.
-				Skip this tablespace. */
-				goto next;
-			}
-		}
+		ut_ad(space->size);
 
 		/* Skip ENCRYPTION!=DEFAULT tablespaces. */
 		if (space->crypt_data
@@ -2648,10 +2370,7 @@ fil_crypt_set_rotation_iops(
 /*********************************************************************
 Adjust encrypt tables
 @param[in]	val		New setting for innodb-encrypt-tables */
-UNIV_INTERN
-void
-fil_crypt_set_encrypt_tables(
-	uint val)
+void fil_crypt_set_encrypt_tables(ulong val)
 {
 	if (!fil_crypt_threads_inited) {
 		return;
@@ -2828,45 +2547,6 @@ fil_crypt_total_stat(
 	mutex_exit(&crypt_stat_mutex);
 }
 
-/*********************************************************************
-Get scrub status for a space (used by information_schema)
-
-@param[in]	space		Tablespace
-@param[out]	status		Scrub status */
-UNIV_INTERN
-void
-fil_space_get_scrub_status(
-	const fil_space_t*			space,
-	struct fil_space_scrub_status_t*	status)
-{
-	memset(status, 0, sizeof(*status));
-
-	ut_ad(space->referenced());
-	fil_space_crypt_t* crypt_data = space->crypt_data;
-
-	status->space = space->id;
-
-	if (crypt_data != NULL) {
-		status->compressed = FSP_FLAGS_GET_ZIP_SSIZE(space->flags) > 0;
-		mutex_enter(&crypt_data->mutex);
-		status->last_scrub_completed =
-			crypt_data->rotate_state.scrubbing.last_scrub_completed;
-		if (crypt_data->rotate_state.active_threads > 0 &&
-		    crypt_data->rotate_state.scrubbing.is_active) {
-			status->scrubbing = true;
-			status->current_scrub_started =
-				crypt_data->rotate_state.start_time;
-			status->current_scrub_active_threads =
-				crypt_data->rotate_state.active_threads;
-			status->current_scrub_page_number =
-				crypt_data->rotate_state.next_offset;
-			status->current_scrub_max_page_number =
-				crypt_data->rotate_state.max_offset;
-		}
-
-		mutex_exit(&crypt_data->mutex);
-	}
-}
 #endif /* UNIV_INNOCHECKSUM */
 
 /**
@@ -2889,9 +2569,8 @@ bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
 
 	/* Compressed and encrypted pages do not have checksum. Assume not
 	corrupted. Page verification happens after decompression in
-	buf_page_io_complete() using buf_page_is_corrupted(). */
-	if (mach_read_from_2(page + FIL_PAGE_TYPE)
-	    == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+	buf_page_read_complete() using buf_page_is_corrupted(). */
+	if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
 		return true;
 	}
 
@@ -2942,6 +2621,6 @@ bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
 			|| checksum == buf_calc_page_new_checksum(page);
 	}
 
-	ut_ad(!"unhandled innodb_checksum_algorithm");
+	ut_ad("unhandled innodb_checksum_algorithm" == 0);
 	return false;
 }
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index bdf1c1aab87..94ea5182d35 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -49,39 +49,82 @@ Created 10/25/1995 Heikki Tuuri
 #include "os0event.h"
 #include "sync0sync.h"
 #include "buf0flu.h"
-#include "os0api.h"
+#include "log.h"
 #ifdef UNIV_LINUX
 # include <sys/types.h>
 # include <sys/sysmacros.h>
 # include <dirent.h>
 #endif
 
-/** Tries to close a file in the LRU list. The caller must hold the fil_sys
-mutex.
-@return true if success, false if should retry later; since i/o's
-generally complete in < 100 ms, and as InnoDB writes at most 128 pages
-from the buffer pool in a batch, and then immediately flushes the
-files, there is a good chance that the next time we find a suitable
-node from the LRU list.
-@param[in] print_info	if true, prints information why it
-                        cannot close a file */
-static
-bool
-fil_try_to_close_file_in_LRU(bool print_info);
+/** Tell whether a tablespace identifier denotes a user tablespace.
+@param space_id tablespace identifier
+@return false for TRX_SYS_SPACE, SRV_TMP_SPACE_ID and undo tablespace ids */
+inline bool fil_is_user_tablespace_id(ulint space_id)
+{
+  return !(space_id == TRX_SYS_SPACE || space_id == SRV_TMP_SPACE_ID ||
+           srv_is_undo_tablespace(space_id));
+}
+
+/** Try to close one open file to adhere to the innodb_open_files limit.
+@param print_info   whether to diagnose why a file cannot be closed
+@return whether a file was closed (at most one file is closed per call) */
+bool fil_space_t::try_to_close(bool print_info)
+{
+  ut_ad(mutex_own(&fil_system.mutex)); /* the caller must hold the mutex */
+  for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space;
+       space= UT_LIST_GET_NEXT(space_list, space))
+  {
+    switch (space->purpose) {
+    case FIL_TYPE_TEMPORARY: /* temporary tablespaces are never candidates */
+      continue;
+    case FIL_TYPE_IMPORT:
+      break;
+    case FIL_TYPE_TABLESPACE:
+      if (!fil_is_user_tablespace_id(space->id))
+        continue; /* only user tablespaces are candidates */
+    }
+
+    /* We are using an approximation of LRU replacement policy. In
+    fil_node_open_file_low(), newly opened files are moved to the end
+    of fil_system.space_list, so that they would be less likely to be
+    closed here. */
+    fil_node_t *node= UT_LIST_GET_FIRST(space->chain);
+    if (!node)
+      /* fil_ibd_create() did not invoke fil_space_t::add() yet */
+      continue;
+    ut_ad(!UT_LIST_GET_NEXT(chain, node)); /* candidates have a single file */
+
+    if (!node->is_open())
+      continue;
+
+    if (const auto n= space->set_closing()) /* nonzero n: cannot close yet */
+    {
+      if (!print_info)
+        continue;
+      print_info= false; /* consider diagnostics for one file per call */
+      const time_t now= time(nullptr);
+      if (now - fil_system.n_open_exceeded_time < 5)
+        continue; /* We display messages at most once in 5 seconds. */
+      fil_system.n_open_exceeded_time= now;
+
+      if (n & PENDING)
+        sql_print_information("InnoDB: Cannot close file %s because of "
+                              UINT32PF " pending operations%s", node->name,
+                              n & PENDING,
+                              (n & NEEDS_FSYNC) ? " and pending fsync" : "");
+      else if (n & NEEDS_FSYNC)
+        sql_print_information("InnoDB: Cannot close file %s because of "
+                              "pending fsync", node->name);
+      continue;
+    }
+
+    node->close(); /* set_closing() returned 0: the file can be closed */
+    return true;
+  }
+
+  return false; /* no candidate could be closed */
+}
 
-/** Test if a tablespace file can be renamed to a new filepath by checking
-if that the old filepath exists and the new filepath does not exist.
-@param[in]	old_path	old filepath
-@param[in]	new_path	new filepath
-@param[in]	is_discarded	whether the tablespace is discarded
-@param[in]	replace_new	whether to ignore the existence of new_path
-@return innodb error code */
-static dberr_t
-fil_rename_tablespace_check(
-	const char*	old_path,
-	const char*	new_path,
-	bool		is_discarded,
-	bool		replace_new = false);
 /** Rename a single-table tablespace.
 The tablespace must exist in the memory cache.
 @param[in]	id		tablespace identifier
@@ -143,16 +186,7 @@ from a file, versus reading from a raw disk.
 
 To have fast access to a tablespace or a log file, we put the data structures
 to a hash table. Each tablespace and log file is given an unique 32-bit
-identifier.
-
-Some operating systems do not support many open files at the same time,
-though NT seems to tolerate at least 900 open files. Therefore, we put the
-open files in an LRU-list. If we need to open another file, we may close the
-file at the end of the LRU-list. When an i/o-operation is pending on a file,
-the file cannot be closed. We take the file nodes with pending i/o-operations
-out of the LRU-list and keep a count of pending operations. When an operation
-completes, we decrement the count and return the file node to the LRU-list if
-the count drops to zero. */
+identifier. */
 
 /** Reference to the server data directory. Usually it is the
 current working directory ".", but in the MySQL Embedded Server Library
@@ -162,13 +196,8 @@ const char*	fil_path_to_mysql_datadir;
 /** Common InnoDB file extensions */
 const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" };
 
-/** The number of fsyncs done to the log */
-ulint	fil_n_log_flushes			= 0;
-
-/** Number of pending redo log flushes */
-ulint	fil_n_pending_log_flushes		= 0;
 /** Number of pending tablespace flushes */
-ulint	fil_n_pending_tablespace_flushes	= 0;
+Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
 
 /** The tablespace memory cache. This variable is NULL before the module is
 initialized. */
@@ -176,25 +205,6 @@ fil_system_t	fil_system;
 
 /** At this age or older a space/page will be rotated */
 UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age;
-UNIV_INTERN extern ib_mutex_t fil_crypt_threads_mutex;
-
-/** Determine if user has explicitly disabled fsync(). */
-# define fil_buffering_disabled(s)	\
-	((s)->purpose == FIL_TYPE_TABLESPACE	\
-	 && srv_file_flush_method	\
-	 == SRV_O_DIRECT_NO_FSYNC)
-
-/** Determine if the space id is a user tablespace id or not.
-@param[in]	space_id	Space ID to check
-@return true if it is a user tablespace ID */
-inline
-bool
-fil_is_user_tablespace_id(ulint space_id)
-{
-	return(space_id != TRX_SYS_SPACE
-	       && space_id != SRV_TMP_SPACE_ID
-	       && !srv_is_undo_tablespace(space_id));
-}
 
 #ifdef UNIV_DEBUG
 /** Try fil_validate() every this many times */
@@ -217,105 +227,6 @@ fil_validate_skip(void)
 }
 #endif /* UNIV_DEBUG */
 
-/********************************************************************//**
-Determines if a file node belongs to the least-recently-used list.
-@return true if the file belongs to fil_system.LRU mutex. */
-UNIV_INLINE
-bool
-fil_space_belongs_in_lru(
-/*=====================*/
-	const fil_space_t*	space)	/*!< in: file space */
-{
-	switch (space->purpose) {
-	case FIL_TYPE_TEMPORARY:
-	case FIL_TYPE_LOG:
-		return(false);
-	case FIL_TYPE_TABLESPACE:
-		return(fil_is_user_tablespace_id(space->id));
-	case FIL_TYPE_IMPORT:
-		return(true);
-	}
-
-	ut_ad(0);
-	return(false);
-}
-
-/********************************************************************//**
-NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
-
-Prepares a file node for i/o. Opens the file if it is closed. Updates the
-pending i/o's field in the node and the system appropriately. Takes the node
-off the LRU list if it is in the LRU list. The caller must hold the fil_sys
-mutex.
-@return false if the file can't be opened, otherwise true */
-static
-bool
-fil_node_prepare_for_io(
-/*====================*/
-	fil_node_t*	node,	/*!< in: file node */
-	fil_space_t*	space);	/*!< in: space */
-
-/** Update the data structures when an i/o operation finishes.
-@param[in,out] node		file node
-@param[in] type			IO context */
-static
-void
-fil_node_complete_io(fil_node_t* node, const IORequest& type);
-
-/** Reads data from a space to a buffer. Remember that the possible incomplete
-blocks at the end of file are ignored: they are not taken into account when
-calculating the byte offset within a space.
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	byte_offset	remainder of offset in bytes; in aio this
-must be divisible by the OS block size
-@param[in]	len		how many bytes to read; this must not cross a
-file boundary; in aio this must be a block size multiple
-@param[in,out]	buf		buffer where to store data read; in aio this
-must be appropriately aligned
-@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
-i/o on a tablespace which does not exist */
-UNIV_INLINE
-dberr_t
-fil_read(
-	const page_id_t		page_id,
-	ulint			zip_size,
-	ulint			byte_offset,
-	ulint			len,
-	void*			buf)
-{
-	return(fil_io(IORequestRead, true, page_id, zip_size,
-			byte_offset, len, buf, NULL));
-}
-
-/** Writes data to a space from a buffer. Remember that the possible incomplete
-blocks at the end of file are ignored: they are not taken into account when
-calculating the byte offset within a space.
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	byte_offset	remainder of offset in bytes; in aio this
-must be divisible by the OS block size
-@param[in]	len		how many bytes to write; this must not cross
-a file boundary; in aio this must be a block size multiple
-@param[in]	buf		buffer from which to write; in aio this must
-be appropriately aligned
-@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
-i/o on a tablespace which does not exist */
-UNIV_INLINE
-dberr_t
-fil_write(
-	const page_id_t		page_id,
-	ulint			zip_size,
-	ulint			byte_offset,
-	ulint			len,
-	void*			buf)
-{
-	ut_ad(!srv_read_only_mode);
-
-	return(fil_io(IORequestWrite, true, page_id, zip_size,
-		      byte_offset, len, buf, NULL));
-}
-
 /*******************************************************************//**
 Returns the table space by a given id, NULL if not found.
 It is unsafe to dereference the returned pointer. It is fine to check
@@ -330,7 +241,7 @@ fil_space_get_by_id(
 	ut_ad(fil_system.is_initialised());
 	ut_ad(mutex_own(&fil_system.mutex));
 
-	HASH_SEARCH(hash, fil_system.spaces, id,
+	HASH_SEARCH(hash, &fil_system.spaces, id,
 		    fil_space_t*, space,
 		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
 		    space->id == id);
@@ -343,8 +254,7 @@ The caller should hold an InnoDB table lock or a MDL that prevents
 the tablespace from being dropped during the operation,
 or the caller should be in single-threaded crash recovery mode
 (no user connections that could drop tablespaces).
-If this is not the case, fil_space_acquire() and fil_space_t::release()
-should be used instead.
+Normally, fil_space_t::get() should be used instead.
 @param[in]	id	tablespace ID
 @return tablespace, or NULL if not found */
 fil_space_t*
@@ -354,35 +264,9 @@ fil_space_get(
 	mutex_enter(&fil_system.mutex);
 	fil_space_t*	space = fil_space_get_by_id(id);
 	mutex_exit(&fil_system.mutex);
-	ut_ad(space == NULL || space->purpose != FIL_TYPE_LOG);
 	return(space);
 }
 
-/**********************************************************************//**
-Checks if all the file nodes in a space are flushed.
-@return true if all are flushed */
-static
-bool
-fil_space_is_flushed(
-/*=================*/
-	fil_space_t*	space)	/*!< in: space */
-{
-	ut_ad(mutex_own(&fil_system.mutex));
-
-	for (const fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
-	     node != NULL;
-	     node = UT_LIST_GET_NEXT(chain, node)) {
-
-		if (node->needs_flush) {
-
-			ut_ad(!fil_buffering_disabled(space));
-			return(false);
-		}
-	}
-
-	return(true);
-}
-
 /** Validate the compression algorithm for full crc32 format.
 @param[in]	space	tablespace object
 @return whether the compression algorithm support */
@@ -427,11 +311,11 @@ static bool fil_comp_algo_validate(const fil_space_t* space)
 @param[in]	is_raw		whether this is a raw device
 @param[in]	atomic_write	true if atomic write could be enabled
 @param[in]	max_pages	maximum number of pages in file,
-or ULINT_MAX for unlimited
+or UINT32_MAX for unlimited
 @return file object */
 fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
-			     ulint size, bool is_raw, bool atomic_write,
-			     ulint max_pages)
+			     uint32_t size, bool is_raw, bool atomic_write,
+			     uint32_t max_pages)
 {
 	fil_node_t*	node;
 
@@ -463,313 +347,200 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
 	this->size += size;
 	UT_LIST_ADD_LAST(chain, node);
 	if (node->is_open()) {
-		fil_system.n_open++;
+		n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+		if (++fil_system.n_open >= srv_max_n_open_files) {
+			reacquire();
+			try_to_close(true);
+			release();
+		}
 	}
 	mutex_exit(&fil_system.mutex);
 
 	return node;
 }
 
-/** Open a file node of a tablespace.
-@param[in,out]	node	File node
-@return false if the file can't be opened, otherwise true */
-static bool fil_node_open_file(fil_node_t* node)
-{
-	bool		success;
-	bool		read_only_mode;
-	fil_space_t*	space = node->space;
-
-	ut_ad(mutex_own(&fil_system.mutex));
-	ut_a(node->n_pending == 0);
-	ut_a(!node->is_open());
-
-	read_only_mode = space->purpose != FIL_TYPE_TEMPORARY
-		&& srv_read_only_mode;
-
-	const bool first_time_open = node->size == 0;
-
-	ulint type;
-	static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
-		      "compatibility");
-	switch (FSP_FLAGS_GET_ZIP_SSIZE(space->flags)) {
-	case 1:
-	case 2:
-		type = OS_DATA_FILE_NO_O_DIRECT;
-		break;
-	default:
-		type = OS_DATA_FILE;
-	}
-
-	if (first_time_open
-	    || (space->purpose == FIL_TYPE_TABLESPACE
-		&& node == UT_LIST_GET_FIRST(space->chain)
-		&& srv_startup_is_before_trx_rollback_phase)) {
-		/* We do not know the size of the file yet. First we
-		open the file in the normal mode, no async I/O here,
-		for simplicity. Then do some checks, and close the
-		file again.  NOTE that we could not use the simple
-		file read function os_file_read() in Windows to read
-		from a file opened for async I/O! */
-
-retry:
-		node->handle = os_file_create(
-			innodb_data_file_key, node->name,
-			node->is_raw_disk
-			? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
-			: OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
-			OS_FILE_AIO,
-			type,
-			read_only_mode,
-			&success);
-
-		if (!success) {
-			/* The following call prints an error message */
-			ulint err = os_file_get_last_error(true);
-			if (err == EMFILE + 100) {
-				if (fil_try_to_close_file_in_LRU(true))
-					goto retry;
-			}
+/** Open a tablespace file; on EMFILE + 100, close another file and retry.
+@param node  data file
+@return whether the file was opened (and, if node->size was 0, validated) */
+static bool fil_node_open_file_low(fil_node_t *node)
+{
+  ut_ad(!node->is_open());
+  ut_ad(node->space->is_closing());
+  ut_ad(mutex_own(&fil_system.mutex));
+  ulint type;
+  static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
+  switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
+  case 1: /* presumably compressed pages smaller than 4096 bytes - confirm */
+  case 2:
+    type= OS_DATA_FILE_NO_O_DIRECT;
+    break;
+  default:
+    type= OS_DATA_FILE;
+  }
 
-			ib::warn() << "Cannot open '" << node->name << "'."
-				" Have you deleted .ibd files under a"
-				" running mysqld server?";
-			return(false);
-		}
+  for (;;) /* retry the open for as long as another file can be closed */
+  {
+    bool success;
+    node->handle= os_file_create(innodb_data_file_key, node->name,
+                                 node->is_raw_disk
+                                 ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
+                                 : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+                                 OS_FILE_AIO, type,
+                                 srv_read_only_mode, &success);
+    if (success)
+      break;
+
+    /* The following call prints an error message */
+    if (os_file_get_last_error(true) == EMFILE + 100 &&
+        fil_space_t::try_to_close(true))
+      continue;
+
+    ib::warn() << "Cannot open '" << node->name << "'.";
+    return false;
+  }
 
-		if (!node->read_page0(first_time_open)) {
-fail:
-			os_file_close(node->handle);
-			node->handle = OS_FILE_CLOSED;
-			return false;
-		}
+  if (node->size); /* size already known: page 0 is not read again */
+  else if (!node->read_page0() || !fil_comp_algo_validate(node->space))
+  {
+    os_file_close(node->handle); /* validation failed: undo the open */
+    node->handle= OS_FILE_CLOSED;
+    return false;
+  }
 
-		if (first_time_open && !fil_comp_algo_validate(space)) {
-			goto fail;
-		}
+  ut_ad(node->is_open());
 
-	} else if (space->purpose == FIL_TYPE_LOG) {
-		node->handle = os_file_create(
-			innodb_log_file_key, node->name, OS_FILE_OPEN,
-			OS_FILE_AIO, OS_LOG_FILE, read_only_mode, &success);
-	} else {
-		node->handle = os_file_create(
-			innodb_data_file_key, node->name,
-			node->is_raw_disk
-			? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
-			: OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
-			OS_FILE_AIO,
-			type,
-			read_only_mode,
-			&success);
-	}
+  if (UNIV_LIKELY(!fil_system.freeze_space_list))
+  {
+    /* Move the file last in fil_system.space_list, so that
+    fil_space_t::try_to_close() should close it as a last resort. */
+    UT_LIST_REMOVE(fil_system.space_list, node->space);
+    UT_LIST_ADD_LAST(fil_system.space_list, node->space);
+  }
 
-	ut_a(success);
-	ut_a(node->is_open());
+  fil_system.n_open++; /* account for the newly opened file */
+  return true;
+}
 
-	fil_system.n_open++;
+/** Open a tablespace file, first trying to respect srv_max_n_open_files.
+@param node  data file
+@return whether the file is open on return (the caller holds the mutex) */
+static bool fil_node_open_file(fil_node_t *node)
+{
+  ut_ad(mutex_own(&fil_system.mutex));
+  ut_ad(!node->is_open());
+  ut_ad(fil_is_user_tablespace_id(node->space->id) ||
+        srv_operation == SRV_OPERATION_BACKUP ||
+        srv_operation == SRV_OPERATION_RESTORE ||
+        srv_operation == SRV_OPERATION_RESTORE_DELTA);
+  ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY);
+  ut_ad(node->space->referenced());
 
-	if (fil_space_belongs_in_lru(space)) {
+  const auto old_time= fil_system.n_open_exceeded_time; /* value on entry */
 
-		/* Put the node to the LRU list */
-		UT_LIST_ADD_FIRST(fil_system.LRU, node);
-	}
+  for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++)
+  {
+    if (fil_space_t::try_to_close(count > 1)) /* diagnose on the 3rd try */
+      count= 0; /* a file was closed: re-evaluate the limit */
+    else if (count >= 2)
+    {
+      if (old_time != fil_system.n_open_exceeded_time) /* changed since entry */
+        sql_print_warning("InnoDB: innodb_open_files=" ULINTPF
+                          " is exceeded (" ULINTPF " files stay open)",
+                          srv_max_n_open_files, fil_system.n_open);
+      break; /* give up closing; open the file above the limit */
+    }
+    else
+    {
+      mutex_exit(&fil_system.mutex);
+      os_thread_sleep(20000); /* presumably microseconds (20 ms) - confirm */
+      /* Flush tablespaces so that we can close modified files. */
+      fil_flush_file_spaces();
+      mutex_enter(&fil_system.mutex);
+      if (node->is_open()) /* opened while the mutex was released */
+        return true;
+    }
+  }
 
-	return(true);
+  return fil_node_open_file_low(node);
 }
 
 /** Close the file handle. */
 void fil_node_t::close()
 {
-	bool	ret;
+  prepare_to_close_or_detach(); /* asserts, and decrements fil_system.n_open */
 
-	ut_ad(mutex_own(&fil_system.mutex));
-	ut_a(is_open());
-	ut_a(n_pending == 0);
-	ut_a(n_pending_flushes == 0);
-	ut_a(!being_extended);
-	ut_a(!needs_flush
-	     || space->purpose == FIL_TYPE_TEMPORARY
-	     || srv_fast_shutdown == 2
-	     || !srv_was_started);
-
-	ret = os_file_close(handle);
-	ut_a(ret);
-
-	/* printf("Closing file %s\n", name); */
-
-	handle = OS_FILE_CLOSED;
-	ut_ad(!is_open());
-	ut_a(fil_system.n_open > 0);
-	fil_system.n_open--;
-
-	if (fil_space_belongs_in_lru(space)) {
-		ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0);
-		UT_LIST_REMOVE(fil_system.LRU, this);
-	}
+  /* printf("Closing file %s\n", name); */
+  int ret= os_file_close(handle);
+  ut_a(ret); /* assert that closing the handle succeeded */
+  handle= OS_FILE_CLOSED; /* mark this node as no longer open */
 }
 
-/** Tries to close a file in the LRU list. The caller must hold the fil_sys
-mutex.
-@return true if success, false if should retry later; since i/o's
-generally complete in < 100 ms, and as InnoDB writes at most 128 pages
-from the buffer pool in a batch, and then immediately flushes the
-files, there is a good chance that the next time we find a suitable
-node from the LRU list.
-@param[in] print_info	if true, prints information why it
-			cannot close a file*/
-static
-bool
-fil_try_to_close_file_in_LRU(
-
-	bool	print_info)
+pfs_os_file_t fil_node_t::detach() /* @return the handle, which is not closed */
 {
-	fil_node_t*	node;
+  prepare_to_close_or_detach(); /* asserts, and decrements fil_system.n_open */
 
-	ut_ad(mutex_own(&fil_system.mutex));
-
-	if (print_info) {
-		ib::info() << "fil_sys open file LRU len "
-			<< UT_LIST_GET_LEN(fil_system.LRU);
-	}
-
-	for (node = UT_LIST_GET_LAST(fil_system.LRU);
-	     node != NULL;
-	     node = UT_LIST_GET_PREV(LRU, node)) {
-
-		if (!node->needs_flush
-		    && node->n_pending_flushes == 0
-		    && !node->being_extended) {
-
-			node->close();
-
-			return(true);
-		}
-
-		if (!print_info) {
-			continue;
-		}
-
-		if (node->n_pending_flushes > 0) {
-
-			ib::info() << "Cannot close file " << node->name
-				<< ", because n_pending_flushes "
-				<< node->n_pending_flushes;
-		}
-
-		if (node->needs_flush) {
-			ib::warn() << "Cannot close file " << node->name
-				<< ", because is should be flushed first";
-		}
-
-		if (node->being_extended) {
-			ib::info() << "Cannot close file " << node->name
-				<< ", because it is being extended";
-		}
-	}
-
-	return(false);
+  pfs_os_file_t result= handle; /* keep the handle for the caller */
+  handle= OS_FILE_CLOSED; /* this node no longer refers to the file */
+  return result;
 }
 
-/** Flush any writes cached by the file system.
-@param[in,out]	space		tablespace
-@param[in]	metadata	whether to update file system metadata */
-static void fil_flush_low(fil_space_t* space, bool metadata = false)
+void fil_node_t::prepare_to_close_or_detach() /* shared by close(), detach() */
 {
-	ut_ad(mutex_own(&fil_system.mutex));
-	ut_ad(!space->is_stopping());
-
-	if (fil_buffering_disabled(space)) {
-
-		/* No need to flush. User has explicitly disabled
-		buffering. */
-		ut_ad(!space->is_in_unflushed_spaces);
-		ut_ad(fil_space_is_flushed(space));
-		ut_ad(space->n_pending_flushes == 0);
-
-#ifdef UNIV_DEBUG
-		for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
-		     node != NULL;
-		     node = UT_LIST_GET_NEXT(chain, node)) {
-			ut_ad(!node->needs_flush);
-			ut_ad(node->n_pending_flushes == 0);
-		}
-#endif /* UNIV_DEBUG */
-
-		if (!metadata) return;
-	}
-
-	/* Prevent dropping of the space while we are flushing */
-	space->n_pending_flushes++;
-
-	for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
-	     node != NULL;
-	     node = UT_LIST_GET_NEXT(chain, node)) {
-
-		if (!node->needs_flush) {
-			continue;
-		}
-
-		ut_a(node->is_open());
+  ut_ad(mutex_own(&fil_system.mutex)); /* the caller must hold the mutex */
+  ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
+        srv_operation == SRV_OPERATION_RESTORE_DELTA);
+  ut_a(is_open());
+  ut_a(!being_extended);
+  ut_a(space->is_ready_to_close() || space->purpose == FIL_TYPE_TEMPORARY ||
+       srv_fast_shutdown == 2 || !srv_was_started);
 
-		switch (space->purpose) {
-		case FIL_TYPE_TEMPORARY:
-			ut_ad(0); // we already checked for this
-			/* fall through */
-		case FIL_TYPE_TABLESPACE:
-		case FIL_TYPE_IMPORT:
-			fil_n_pending_tablespace_flushes++;
-			break;
-		case FIL_TYPE_LOG:
-			fil_n_pending_log_flushes++;
-			fil_n_log_flushes++;
-			break;
-		}
-#ifdef _WIN32
-		if (node->is_raw_disk) {
-
-			goto skip_flush;
-		}
-#endif /* _WIN32 */
-
-		ut_a(node->is_open());
-		node->n_pending_flushes++;
-		node->needs_flush = false;
-
-		mutex_exit(&fil_system.mutex);
-
-		os_file_flush(node->handle);
+  ut_a(fil_system.n_open > 0);
+  fil_system.n_open--; /* the caller resets handle to OS_FILE_CLOSED */
+}
 
-		mutex_enter(&fil_system.mutex);
+/** Flush any writes cached by the file system (for each open file). */
+void fil_space_t::flush_low()
+{
+  ut_ad(!mutex_own(&fil_system.mutex)); /* must be called without the mutex */
 
-		node->n_pending_flushes--;
-#ifdef _WIN32
-skip_flush:
-#endif /* _WIN32 */
-		if (!node->needs_flush) {
-			if (space->is_in_unflushed_spaces
-			    && fil_space_is_flushed(space)) {
-
-				fil_system.unflushed_spaces.remove(*space);
-				space->is_in_unflushed_spaces = false;
-			}
-		}
+  uint32_t n= 1; /* first guess; a failed exchange stores the actual value */
+  while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+                                            std::memory_order_acquire,
+                                            std::memory_order_relaxed))
+  {
+    ut_ad(n & PENDING);
+    if (n & STOPPING)
+      return; /* STOPPING is set: do not flush */
+    if (n & NEEDS_FSYNC)
+      break; /* the flag is already set: proceed to flush */
+  }
 
-		switch (space->purpose) {
-		case FIL_TYPE_TEMPORARY:
-			break;
-		case FIL_TYPE_TABLESPACE:
-		case FIL_TYPE_IMPORT:
-			fil_n_pending_tablespace_flushes--;
-			continue;
-		case FIL_TYPE_LOG:
-			fil_n_pending_log_flushes--;
-			continue;
-		}
+  fil_n_pending_tablespace_flushes++;
+  for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+       node= UT_LIST_GET_NEXT(chain, node))
+  {
+    if (!node->is_open())
+    {
+      ut_ad(!is_in_unflushed_spaces);
+      continue; /* nothing to flush for a closed file */
+    }
+    IF_WIN(if (node->is_raw_disk) continue,); /* raw disks skipped (Windows) */
+    os_file_flush(node->handle);
+  }
 
-		ut_ad(0);
-	}
+  if (is_in_unflushed_spaces)
+  {
+    mutex_enter(&fil_system.mutex);
+    if (is_in_unflushed_spaces) /* recheck while holding the mutex */
+    {
+      is_in_unflushed_spaces= false;
+      fil_system.unflushed_spaces.remove(*this);
+    }
+    mutex_exit(&fil_system.mutex);
+  }
 
-	space->n_pending_flushes--;
+  clear_flush();
+  fil_n_pending_tablespace_flushes--;
 }
 
 /** Try to extend a tablespace.
@@ -783,12 +554,14 @@ bool
 fil_space_extend_must_retry(
 	fil_space_t*	space,
 	fil_node_t*	node,
-	ulint		size,
+	uint32_t	size,
 	bool*		success)
 {
 	ut_ad(mutex_own(&fil_system.mutex));
 	ut_ad(UT_LIST_GET_LAST(space->chain) == node);
 	ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
+	ut_ad(node->space == space);
+	ut_ad(space->referenced() || space->is_being_truncated);
 
 	*success = space->size >= size;
 
@@ -809,12 +582,6 @@ fil_space_extend_must_retry(
 
 	node->being_extended = true;
 
-	if (!fil_node_prepare_for_io(node, space)) {
-		/* The tablespace data file, such as .ibd file, is missing */
-		node->being_extended = false;
-		return(false);
-	}
-
 	/* At this point it is safe to release fil_system.mutex. No
 	other thread can rename, delete, close or extend the file because
 	we have set the node->being_extended flag. */
@@ -822,13 +589,13 @@ fil_space_extend_must_retry(
 
 	ut_ad(size >= space->size);
 
-	ulint		last_page_no		= space->size;
-	const ulint	file_start_page_no	= last_page_no - node->size;
+	uint32_t	last_page_no		= space->size;
+	const uint32_t	file_start_page_no	= last_page_no - node->size;
 
-	const ulint	page_size = space->physical_size();
+	const unsigned	page_size = space->physical_size();
 
-	/* fil_read_first_page() expects innodb_page_size bytes.
-	fil_node_open_file() expects at least 4 * innodb_page_size bytes.
+	/* Datafile::read_first_page() expects innodb_page_size bytes.
+	fil_node_t::read_page0() expects at least 4 * innodb_page_size bytes.
 	os_file_set_size() expects multiples of 4096 bytes.
 	For ROW_FORMAT=COMPRESSED tables using 1024-byte or 2048-byte
 	pages, we will preallocate up to an integer multiple of 4096 bytes,
@@ -852,7 +619,7 @@ fil_space_extend_must_retry(
 		os_offset_t	fsize = os_file_get_size(node->handle);
 		ut_a(fsize != os_offset_t(-1));
 
-		last_page_no = ulint(fsize / page_size)
+		last_page_no = uint32_t(fsize / page_size)
 			+ file_start_page_no;
 	}
 	mutex_enter(&fil_system.mutex);
@@ -861,13 +628,11 @@ fil_space_extend_must_retry(
 	node->being_extended = false;
 	ut_a(last_page_no - file_start_page_no >= node->size);
 
-	ulint file_size = last_page_no - file_start_page_no;
+	uint32_t file_size = last_page_no - file_start_page_no;
 	space->size += file_size - node->size;
 	node->size = file_size;
-	const ulint pages_in_MiB = node->size
-		& ~ulint((1U << (20U - srv_page_size_shift)) - 1);
-
-	fil_node_complete_io(node,IORequestRead);
+	const uint32_t pages_in_MiB = node->size
+		& ~uint32_t((1U << (20U - srv_page_size_shift)) - 1);
 
 	/* Keep the last data file size info up to date, rounded to
 	full megabytes */
@@ -875,244 +640,201 @@ fil_space_extend_must_retry(
 	switch (space->id) {
 	case TRX_SYS_SPACE:
 		srv_sys_space.set_last_file_size(pages_in_MiB);
-		fil_flush_low(space, true);
-		return(false);
+	do_flush:
+		space->reacquire();
+		mutex_exit(&fil_system.mutex);
+		space->flush_low();
+		space->release();
+		mutex_enter(&fil_system.mutex);
+		break;
 	default:
 		ut_ad(space->purpose == FIL_TYPE_TABLESPACE
 		      || space->purpose == FIL_TYPE_IMPORT);
 		if (space->purpose == FIL_TYPE_TABLESPACE
 		    && !space->is_being_truncated) {
-			fil_flush_low(space, true);
+			goto do_flush;
 		}
-		return(false);
+		break;
 	case SRV_TMP_SPACE_ID:
 		ut_ad(space->purpose == FIL_TYPE_TEMPORARY);
 		srv_tmp_space.set_last_file_size(pages_in_MiB);
-		return(false);
+		break;
 	}
+
+	return false;
 }
 
-/** Reserves the fil_system.mutex and tries to make sure we can open at least one
-file while holding it. This should be called before calling
-fil_node_prepare_for_io(), because that function may need to open a file.
-@param[in]	space_id	tablespace id
-@return whether the tablespace is usable for io */
-static
-bool
-fil_mutex_enter_and_prepare_for_io(ulint space_id)
+/** @return whether the file is usable for io() */
+ATTRIBUTE_COLD bool fil_space_t::prepare(bool have_mutex)
 {
-	for (ulint count = 0;;) {
-		mutex_enter(&fil_system.mutex);
-
-		if (space_id >= SRV_LOG_SPACE_FIRST_ID) {
-			/* We keep log files always open. */
-			break;
-		}
+  ut_ad(referenced());
+  if (!have_mutex)
+    mutex_enter(&fil_system.mutex);
+  ut_ad(mutex_own(&fil_system.mutex));
+  fil_node_t *node= UT_LIST_GET_LAST(chain);
+  ut_ad(!id || purpose == FIL_TYPE_TEMPORARY ||
+        node == UT_LIST_GET_FIRST(chain));
 
-		fil_space_t*	space = fil_space_get_by_id(space_id);
+  const bool is_open= node && (node->is_open() || fil_node_open_file(node));
 
-		if (space == NULL) {
-			return false;
-		}
-
-		fil_node_t*	node = UT_LIST_GET_LAST(space->chain);
-		ut_ad(space->id == 0
-		      || node == UT_LIST_GET_FIRST(space->chain));
-
-		if (space->id == 0) {
-			/* We keep the system tablespace files always
-			open; this is important in preventing
-			deadlocks in this module, as a page read
-			completion often performs another read from
-			the insert buffer. The insert buffer is in
-			tablespace 0, and we cannot end up waiting in
-			this function. */
-		} else if (space->is_stopping() && !space->is_being_truncated) {
-			/* If the tablespace is being deleted then InnoDB
-			shouldn't prepare the tablespace for i/o */
-			return false;
-		} else if (!node || node->is_open()) {
-			/* If the file is already open, no need to do
-			anything; if the space does not exist, we handle the
-			situation in the function which called this
-			function */
-		} else {
-			while (fil_system.n_open >= srv_max_n_open_files) {
-				/* Too many files are open */
-				if (fil_try_to_close_file_in_LRU(count > 1)) {
-					/* No problem */
-				} else if (count >= 2) {
-					ib::warn() << "innodb_open_files="
-						<< srv_max_n_open_files
-						<< " is exceeded ("
-						<< fil_system.n_open
-						<< ") files stay open)";
-					break;
-				} else {
-					mutex_exit(&fil_system.mutex);
-					os_aio_simulated_wake_handler_threads();
-					os_thread_sleep(20000);
-					/* Flush tablespaces so that we can
-					close modified files in the LRU list */
-					fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
-
-					count++;
-					mutex_enter(&fil_system.mutex);
-					continue;
-				}
-			}
-		}
-
-		ulint size = space->recv_size;
-		if (UNIV_UNLIKELY(size != 0)) {
-			ut_ad(node);
-			bool	success;
-			if (fil_space_extend_must_retry(space, node, size,
-							&success)) {
-				continue;
-			}
-
-			ut_ad(mutex_own(&fil_system.mutex));
-			/* Crash recovery requires the file extension
-			to succeed. */
-			ut_a(success);
-			/* InnoDB data files cannot shrink. */
-			ut_a(space->size >= size);
-			if (size > space->committed_size) {
-				space->committed_size = size;
-			}
-
-			/* There could be multiple concurrent I/O requests for
-			this tablespace (multiple threads trying to extend
-			this tablespace).
-
-			Also, fil_space_set_recv_size() may have been invoked
-			again during the file extension while fil_system.mutex
-			was not being held by us.
-
-			Only if space->recv_size matches what we read
-			originally, reset the field. In this way, a
-			subsequent I/O request will handle any pending
-			fil_space_set_recv_size(). */
-
-			if (size == space->recv_size) {
-				space->recv_size = 0;
-			}
-		}
-
-		break;
-	}
+  if (!is_open)
+    release();
+  else if (auto desired_size= recv_size)
+  {
+    bool success;
+    while (fil_space_extend_must_retry(this, node, desired_size, &success))
+      mutex_enter(&fil_system.mutex);
+
+    ut_ad(mutex_own(&fil_system.mutex));
+    /* Crash recovery requires the file extension to succeed. */
+    ut_a(success);
+    /* InnoDB data files cannot shrink. */
+    ut_a(size >= desired_size);
+    if (desired_size > committed_size)
+      committed_size= desired_size;
+
+    /* There could be multiple concurrent I/O requests for this
+    tablespace (multiple threads trying to extend this tablespace).
+
+    Also, fil_space_set_recv_size_and_flags() may have been invoked
+    again during the file extension while fil_system.mutex was not
+    being held by us.
+
+    Only if recv_size matches what we read originally, reset the
+    field. In this way, a subsequent I/O request will handle any
+    pending fil_space_set_recv_size_and_flags(). */
+
+    if (desired_size == recv_size)
+    {
+      recv_size= 0;
+      goto clear;
+    }
+  }
+  else
+clear:
+   n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
 
-	return true;
+  if (!have_mutex)
+    mutex_exit(&fil_system.mutex);
+  return is_open;
 }
 
 /** Try to extend a tablespace if it is smaller than the specified size.
 @param[in,out]	space	tablespace
 @param[in]	size	desired size in pages
 @return whether the tablespace is at least as big as requested */
-bool
-fil_space_extend(
-	fil_space_t*	space,
-	ulint		size)
+bool fil_space_extend(fil_space_t *space, uint32_t size)
 {
-	ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY);
-
-	bool	success;
-
-	do {
-		if (!fil_mutex_enter_and_prepare_for_io(space->id)) {
-			success = false;
-			break;
-		}
-	} while (fil_space_extend_must_retry(
-			 space, UT_LIST_GET_LAST(space->chain), size,
-			 &success));
-
-	mutex_exit(&fil_system.mutex);
-	return(success);
+  ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY);
+  bool success= false;
+  const bool acquired= space->acquire();
+  mutex_enter(&fil_system.mutex);
+  if (acquired || space->is_being_truncated)
+  {
+    while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
+                                       size, &success))
+      mutex_enter(&fil_system.mutex);
+  }
+  mutex_exit(&fil_system.mutex);
+  if (acquired)
+    space->release();
+  return success;
 }
 
-/** Prepare to free a file node object from a tablespace memory cache.
-@param[in,out]	node	file node
-@param[in]	space	tablespace */
-static
-void
-fil_node_close_to_free(
-	fil_node_t*	node,
-	fil_space_t*	space)
+/** Prepare to free a file from fil_system. */
+inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
 {
-	ut_ad(mutex_own(&fil_system.mutex));
-	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
-	ut_a(node->n_pending == 0);
-	ut_a(!node->being_extended);
-
-	if (node->is_open()) {
-		/* We fool the assertion in fil_node_t::close() to think
-		there are no unflushed modifications in the file */
-
-		node->needs_flush = false;
+  ut_ad(mutex_own(&fil_system.mutex));
+  ut_a(magic_n == FIL_NODE_MAGIC_N);
+  ut_a(!being_extended);
 
-		if (fil_buffering_disabled(space)) {
-
-			ut_ad(!space->is_in_unflushed_spaces);
-			ut_ad(fil_space_is_flushed(space));
-
-		} else if (space->is_in_unflushed_spaces
-			   && fil_space_is_flushed(space)) {
+  if (is_open() &&
+      (space->n_pending.fetch_or(fil_space_t::CLOSING,
+                                 std::memory_order_acquire) &
+       fil_space_t::PENDING))
+  {
+    mutex_exit(&fil_system.mutex);
+    while (space->referenced())
+      os_thread_sleep(100);
+    mutex_enter(&fil_system.mutex);
+  }
 
-			fil_system.unflushed_spaces.remove(*space);
-			space->is_in_unflushed_spaces = false;
-		}
+  while (is_open())
+  {
+    if (space->is_in_unflushed_spaces)
+    {
+      ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
+      space->is_in_unflushed_spaces= false;
+      fil_system.unflushed_spaces.remove(*space);
+    }
+
+    ut_a(!being_extended);
+    if (detach_handle)
+    {
+      auto result= handle;
+      handle= OS_FILE_CLOSED;
+      return result;
+    }
+    bool ret= os_file_close(handle);
+    ut_a(ret);
+    handle= OS_FILE_CLOSED;
+    break;
+  }
 
-		node->close();
-	}
+  return OS_FILE_CLOSED;
 }
 
-/** Detach a space object from the tablespace memory cache.
-Closes the files in the chain but does not delete them.
-There must not be any pending i/o's or flushes on the files.
-@param[in,out]	space		tablespace */
-static
-void
-fil_space_detach(
-	fil_space_t*	space)
+/** Detach a tablespace from the cache and close the files. */
+std::vector<pfs_os_file_t> fil_system_t::detach(fil_space_t *space,
+                                                bool detach_handle)
 {
-	ut_ad(mutex_own(&fil_system.mutex));
-
-	HASH_DELETE(fil_space_t, hash, fil_system.spaces, space->id, space);
-
-	if (space->is_in_unflushed_spaces) {
-
-		ut_ad(!fil_buffering_disabled(space));
-		fil_system.unflushed_spaces.remove(*space);
-		space->is_in_unflushed_spaces = false;
-	}
+  ut_ad(mutex_own(&fil_system.mutex));
+  HASH_DELETE(fil_space_t, hash, &spaces, space->id, space);
 
-	if (space->is_in_default_encrypt) {
-		fil_system.default_encrypt_tables.remove(*space);
-		space->is_in_default_encrypt = false;
-	}
-
-	UT_LIST_REMOVE(fil_system.space_list, space);
-
-	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
-	ut_a(space->n_pending_flushes == 0);
-
-	for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain);
-	     fil_node != NULL;
-	     fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
+  if (space->is_in_unflushed_spaces)
+  {
+    ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
+    space->is_in_unflushed_spaces= false;
+    unflushed_spaces.remove(*space);
+  }
 
-		fil_node_close_to_free(fil_node, space);
-	}
+  if (space->is_in_default_encrypt)
+  {
+    space->is_in_default_encrypt= false;
+    default_encrypt_tables.remove(*space);
+  }
+  UT_LIST_REMOVE(space_list, space);
+  if (space == sys_space)
+    sys_space= nullptr;
+  else if (space == temp_space)
+    temp_space= nullptr;
+
+  ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+
+  for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
+       node= UT_LIST_GET_NEXT(chain, node))
+    if (node->is_open())
+    {
+      ut_ad(n_open > 0);
+      n_open--;
+    }
+
+  std::vector<pfs_os_file_t> handles;
+  handles.reserve(UT_LIST_GET_LEN(space->chain));
+
+  for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
+       node= UT_LIST_GET_NEXT(chain, node))
+  {
+    auto handle= node->close_to_free(detach_handle);
+    if (handle != OS_FILE_CLOSED)
+      handles.push_back(handle);
+  }
 
-	if (space == fil_system.sys_space) {
-		fil_system.sys_space = NULL;
-	} else if (space == fil_system.temp_space) {
-		fil_system.temp_space = NULL;
-	}
+  ut_ad(!space->referenced());
+  return handles;
 }
 
-/** Free a tablespace object on which fil_space_detach() was invoked.
+/** Free a tablespace object on which fil_system_t::detach() was invoked.
 There must not be any pending i/o's or flushes on the files.
 @param[in,out]	space		tablespace */
 static
@@ -1124,10 +846,10 @@ fil_space_free_low(
 	ut_ad(srv_fast_shutdown == 2 || !srv_was_started
 	      || space->max_lsn == 0);
 
-	/* Wait for fil_space_t::release_for_io(); after
-	fil_space_detach(), the tablespace cannot be found, so
-	fil_space_acquire_for_io() would return NULL */
-	while (space->pending_io()) {
+	/* Wait for fil_space_t::release(); after
+	fil_system_t::detach(), the tablespace cannot be found, so
+	fil_space_t::get() would return NULL */
+	while (space->referenced()) {
 		os_thread_sleep(100);
 	}
 
@@ -1145,6 +867,7 @@ fil_space_free_low(
 	rw_lock_free(&space->latch);
 	fil_space_destroy_crypt_data(&space->crypt_data);
 
+	space->~fil_space_t();
 	ut_free(space->name);
 	ut_free(space);
 }
@@ -1166,7 +889,7 @@ fil_space_free(
 	fil_space_t*	space = fil_space_get_by_id(id);
 
 	if (space != NULL) {
-		fil_space_detach(space);
+		fil_system.detach(space);
 	}
 
 	mutex_exit(&fil_system.mutex);
@@ -1177,10 +900,10 @@ fil_space_free(
 		}
 
 		if (!recv_recovery_is_on()) {
-			log_mutex_enter();
+			mysql_mutex_lock(&log_sys.mutex);
 		}
 
-		ut_ad(log_mutex_own());
+		mysql_mutex_assert_owner(&log_sys.mutex);
 
 		if (space->max_lsn != 0) {
 			ut_d(space->max_lsn = 0);
@@ -1188,7 +911,7 @@ fil_space_free(
 		}
 
 		if (!recv_recovery_is_on()) {
-			log_mutex_exit();
+			mysql_mutex_unlock(&log_sys.mutex);
 		}
 
 		fil_space_free_low(space);
@@ -1197,74 +920,43 @@ fil_space_free(
 	return(space != NULL);
 }
 
-/** Create a space memory object and put it to the fil_system hash table.
-Error messages are issued to the server log.
-@param[in]	name		tablespace name
-@param[in]	id		tablespace identifier
-@param[in]	flags		tablespace flags
-@param[in]	purpose		tablespace purpose
-@param[in,out]	crypt_data	encryption information
-@param[in]	mode		encryption mode
-@return pointer to created tablespace, to be filled in with fil_space_t::add()
-@retval NULL on failure (such as when the same tablespace exists) */
-fil_space_t*
-fil_space_create(
-	const char*		name,
-	ulint			id,
-	ulint			flags,
-	fil_type_t		purpose,
-	fil_space_crypt_t*	crypt_data,
-	fil_encryption_t	mode)
+/** Create a tablespace in fil_system.
+@param name       tablespace name
+@param id         tablespace identifier
+@param flags      tablespace flags
+@param purpose    tablespace purpose
+@param crypt_data encryption information
+@param mode       encryption mode
+@return pointer to created tablespace, to be filled in with add()
+@retval nullptr on failure (such as when the same tablespace exists) */
+fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags,
+                                 fil_type_t purpose,
+				 fil_space_crypt_t *crypt_data,
+				 fil_encryption_t mode)
 {
 	fil_space_t*	space;
 
 	ut_ad(fil_system.is_initialised());
 	ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
-	ut_ad(purpose == FIL_TYPE_LOG
-	      || srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
+	ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
 
 	DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););
 
-	mutex_enter(&fil_system.mutex);
-
-	space = fil_space_get_by_id(id);
-
-	if (space != NULL) {
-		ib::error() << "Trying to add tablespace '" << name
-			<< "' with id " << id
-			<< " to the tablespace memory cache, but tablespace '"
-			<< space->name << "' already exists in the cache!";
-		mutex_exit(&fil_system.mutex);
-		return(NULL);
-	}
-
-	space = static_cast<fil_space_t*>(ut_zalloc_nokey(sizeof(*space)));
+	/* FIXME: if calloc() is defined as an inline function that calls
+	memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
+	space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
 
 	space->id = id;
 	space->name = mem_strdup(name);
 
 	UT_LIST_INIT(space->chain, &fil_node_t::chain);
 
-	if ((purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT)
-	    && !recv_recovery_is_on()
-	    && srv_operation != SRV_OPERATION_BACKUP
-	    && id > fil_system.max_assigned_id) {
-		if (!fil_system.space_id_reuse_warned) {
-			fil_system.space_id_reuse_warned = true;
-
-			ib::warn() << "Allocated tablespace ID " << id
-				<< " for " << name << ", old maximum was "
-				<< fil_system.max_assigned_id;
-		}
-
-		fil_system.max_assigned_id = id;
-	}
-
 	space->purpose = purpose;
 	space->flags = flags;
 
 	space->magic_n = FIL_SPACE_MAGIC_N;
 	space->crypt_data = crypt_data;
+	space->n_pending.store(CLOSING, std::memory_order_relaxed);
 
 	DBUG_LOG("tablespace",
 		 "Created metadata for " << id << " name " << name);
@@ -1289,11 +981,47 @@ fil_space_create(
 		space->atomic_write_supported = true;
 	}
 
-	HASH_INSERT(fil_space_t, hash, fil_system.spaces, id, space);
+	mutex_enter(&fil_system.mutex);
+
+	if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
+		ib::error() << "Trying to add tablespace '" << name
+			<< "' with id " << id
+			<< " to the tablespace memory cache, but tablespace '"
+			<< old_space->name << "' already exists in the cache!";
+		mutex_exit(&fil_system.mutex);
+		rw_lock_free(&space->latch);
+		space->~fil_space_t();
+		ut_free(space->name);
+		ut_free(space);
+		return(NULL);
+	}
+
+	HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space);
 
 	UT_LIST_ADD_LAST(fil_system.space_list, space);
 
-	if (id < SRV_LOG_SPACE_FIRST_ID && id > fil_system.max_assigned_id) {
+	switch (id) {
+	case 0:
+		ut_ad(!fil_system.sys_space);
+		fil_system.sys_space = space;
+		break;
+	case SRV_TMP_SPACE_ID:
+		ut_ad(!fil_system.temp_space);
+		fil_system.temp_space = space;
+		break;
+	default:
+		ut_ad(purpose != FIL_TYPE_TEMPORARY);
+		if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) {
+			break;
+		}
+		if (UNIV_UNLIKELY(srv_operation == SRV_OPERATION_BACKUP)) {
+			break;
+		}
+		if (!fil_system.space_id_reuse_warned) {
+			ib::warn() << "Allocated tablespace ID " << id
+				<< " for " << name << ", old maximum was "
+				<< fil_system.max_assigned_id;
+		}
 
 		fil_system.max_assigned_id = id;
 	}
@@ -1345,16 +1073,16 @@ fil_assign_new_space_id(
 
 	id++;
 
-	if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) {
+	if (id > (SRV_SPACE_ID_UPPER_BOUND / 2) && (id % 1000000UL == 0)) {
 		ib::warn() << "You are running out of new single-table"
 			" tablespace id's. Current counter is " << id
-			<< " and it must not exceed" << SRV_LOG_SPACE_FIRST_ID
+			<< " and it must not exceed" <<SRV_SPACE_ID_UPPER_BOUND
 			<< "! To reset the counter to zero you have to dump"
 			" all your tables and recreate the whole InnoDB"
 			" installation.";
 	}
 
-	success = (id < SRV_LOG_SPACE_FIRST_ID);
+	success = (id < SRV_SPACE_ID_UPPER_BOUND);
 
 	if (success) {
 		*space_id = fil_system.max_assigned_id = id;
@@ -1372,168 +1100,99 @@ fil_assign_new_space_id(
 	return(success);
 }
 
-/** Trigger a call to fil_node_t::read_page0()
-@param[in]	id	tablespace identifier
-@return	tablespace
-@retval	NULL	if the tablespace does not exist or cannot be read */
-fil_space_t* fil_system_t::read_page0(ulint id)
+/** Read the first page of a data file.
+@return whether the page was found valid */
+bool fil_space_t::read_page0()
 {
-	mutex_exit(&mutex);
+  ut_ad(fil_system.is_initialised());
+  ut_ad(mutex_own(&fil_system.mutex));
+  if (size)
+    return true;
 
-	ut_ad(id != 0);
+  fil_node_t *node= UT_LIST_GET_FIRST(chain);
+  if (!node)
+    return false;
+  ut_ad(!UT_LIST_GET_NEXT(chain, node));
 
-	/* It is possible that the tablespace is dropped while we are
-	not holding the mutex. */
-	if (!fil_mutex_enter_and_prepare_for_io(id)) {
-		return NULL;
-	}
-
-	fil_space_t* space = fil_space_get_by_id(id);
-
-	if (space == NULL || UT_LIST_GET_LEN(space->chain) == 0) {
-		return(NULL);
-	}
-
-	/* The following code must change when InnoDB supports
-	multiple datafiles per tablespace. */
-	ut_a(1 == UT_LIST_GET_LEN(space->chain));
-
-	fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
-
-	/* It must be a single-table tablespace and we have not opened
-	the file yet; the following calls will open it and update the
-	size fields */
-
-	if (!fil_node_prepare_for_io(node, space)) {
-		/* The single-table tablespace can't be opened,
-		because the ibd file is missing. */
-		return(NULL);
-	}
-
-	fil_node_complete_io(node, IORequestRead);
-
-	return space;
+  if (UNIV_UNLIKELY(acquire_low() & STOPPING))
+  {
+    ut_ad("this should not happen" == 0);
+    return false;
+  }
+  const bool ok= node->is_open() || fil_node_open_file(node);
+  release();
+  return ok;
 }
 
-/*******************************************************************//**
-Returns a pointer to the fil_space_t that is in the memory cache
-associated with a space id. The caller must lock fil_system.mutex.
-@return file_space_t pointer, NULL if space not found */
-UNIV_INLINE
-fil_space_t*
-fil_space_get_space(
-/*================*/
-	ulint	id)	/*!< in: space id */
+/** Look up a tablespace and ensure that its first page has been validated. */
+static fil_space_t *fil_space_get_space(ulint id)
 {
-	fil_space_t* space = fil_space_get_by_id(id);
-	if (space == NULL || space->size != 0) {
-		return(space);
-	}
-
-	switch (space->purpose) {
-	case FIL_TYPE_LOG:
-		break;
-	case FIL_TYPE_TEMPORARY:
-	case FIL_TYPE_TABLESPACE:
-	case FIL_TYPE_IMPORT:
-		space = fil_system.read_page0(id);
-	}
-
-	return(space);
+  if (fil_space_t *space= fil_space_get_by_id(id))
+    if (space->read_page0())
+      return space;
+  return nullptr;
 }
 
-UNIV_INTERN
-void fil_space_set_recv_size_and_flags(ulint id, ulint size, uint32_t flags)
+void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags)
 {
+  ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
   mutex_enter(&fil_system.mutex);
-  ut_ad(id < SRV_LOG_SPACE_FIRST_ID);
-
-  if (fil_space_t* space= fil_space_get_space(id))
+  if (fil_space_t *space= fil_space_get_space(id))
   {
-    if (size) space->recv_size= size;
-    if (flags != FSP_FLAGS_FCRC32_MASK_MARKER) space->flags= flags;
+    if (size)
+      space->recv_size= size;
+    if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
+      space->flags= flags;
   }
   mutex_exit(&fil_system.mutex);
 }
 
-/*******************************************************************//**
-Returns the size of the space in pages. The tablespace must be cached in the
-memory cache.
-@return space size, 0 if space not found */
-ulint
-fil_space_get_size(
-/*===============*/
-	ulint	id)	/*!< in: space id */
-{
-	fil_space_t*	space;
-	ulint		size;
-
-	ut_ad(fil_system.is_initialised());
-	mutex_enter(&fil_system.mutex);
-
-	space = fil_space_get_space(id);
-
-	size = space ? space->size : 0;
-
-	mutex_exit(&fil_system.mutex);
-
-	return(size);
-}
-
-/*******************************************************************//**
-Returns the flags of the space. The tablespace must be cached
-in the memory cache.
-@return flags, ULINT_UNDEFINED if space not found */
-ulint
-fil_space_get_flags(
-/*================*/
-	ulint	id)	/*!< in: space id */
-{
-	fil_space_t*	space;
-	ulint		flags;
-
-	ut_ad(fil_system.is_initialised());
-
-	mutex_enter(&fil_system.mutex);
-
-	space = fil_space_get_space(id);
-
-	if (space == NULL) {
-		mutex_exit(&fil_system.mutex);
-
-		return(ULINT_UNDEFINED);
-	}
-
-	flags = space->flags;
-
-	mutex_exit(&fil_system.mutex);
-
-	return(flags);
-}
-
-/** Open each file. Only invoked on fil_system.temp_space.
+/** Open each file. Never invoked on .ibd files.
+@param create_new_db    whether to skip the call to fil_node_t::read_page0()
 @return whether all files were opened */
-bool fil_space_t::open()
+bool fil_space_t::open(bool create_new_db)
 {
-	ut_ad(fil_system.is_initialised());
+  ut_ad(fil_system.is_initialised());
+  ut_ad(!id || create_new_db);
 
-	mutex_enter(&fil_system.mutex);
-	ut_ad(this == fil_system.temp_space
-	      || srv_operation == SRV_OPERATION_BACKUP
-	      || srv_operation == SRV_OPERATION_RESTORE
-	      || srv_operation == SRV_OPERATION_RESTORE_DELTA);
+  bool success= true;
+  bool skip_read= create_new_db;
 
-	for (fil_node_t* node = UT_LIST_GET_FIRST(chain);
-	     node != NULL;
-	     node = UT_LIST_GET_NEXT(chain, node)) {
-		if (!node->is_open() && !fil_node_open_file(node)) {
-			mutex_exit(&fil_system.mutex);
-			return false;
-		}
-	}
+  mutex_enter(&fil_system.mutex);
 
-	mutex_exit(&fil_system.mutex);
-	return true;
+  for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+       node= UT_LIST_GET_NEXT(chain, node))
+  {
+    if (!node->is_open() && !fil_node_open_file_low(node))
+    {
+err_exit:
+      success= false;
+      break;
+    }
+
+    if (create_new_db)
+      continue;
+    if (skip_read)
+    {
+      size+= node->size;
+      continue;
+    }
+
+    if (!node->read_page0())
+    {
+      fil_system.n_open--;
+      os_file_close(node->handle);
+      node->handle= OS_FILE_CLOSED;
+      goto err_exit;
+    }
+
+    skip_read= true;
+  }
+
+  if (!create_new_db)
+    committed_size= size;
+  mutex_exit(&fil_system.mutex);
+  return success;
 }
 
 /** Close each file. Only invoked on fil_system.temp_space. */
@@ -1566,7 +1225,7 @@ void fil_system_t::create(ulint hash_size)
 	ut_ad(!is_initialised());
 	ut_ad(!(srv_page_size % FSP_EXTENT_SIZE));
 	ut_ad(srv_page_size);
-	ut_ad(!spaces);
+	ut_ad(!spaces.array);
 
 	m_initialised = true;
 
@@ -1577,7 +1236,7 @@ void fil_system_t::create(ulint hash_size)
 
 	mutex_create(LATCH_ID_FIL_SYSTEM, &mutex);
 
-	spaces = hash_create(hash_size);
+	spaces.create(hash_size);
 
 	fil_space_crypt_init();
 #ifdef UNIV_LINUX
@@ -1645,7 +1304,6 @@ void fil_system_t::create(ulint hash_size)
 void fil_system_t::close()
 {
   ut_ad(this == &fil_system);
-  ut_a(!UT_LIST_GET_LEN(LRU));
   ut_a(unflushed_spaces.empty());
   ut_a(!UT_LIST_GET_LEN(space_list));
   ut_ad(!sys_space);
@@ -1654,13 +1312,12 @@ void fil_system_t::close()
   if (is_initialised())
   {
     m_initialised= false;
-    hash_table_free(spaces);
-    spaces = nullptr;
+    spaces.free();
     mutex_free(&mutex);
     fil_space_crypt_cleanup();
   }
 
-  ut_ad(!spaces);
+  ut_ad(!spaces.array);
 
 #ifdef UNIV_LINUX
   ssd.clear();
@@ -1668,88 +1325,51 @@ void fil_system_t::close()
 #endif /* UNIV_LINUX */
 }
 
-/*******************************************************************//**
-Opens all log files and system tablespace data files. They stay open until the
-database server shutdown. This should be called at a server startup after the
-space objects for the log and the system tablespace have been created. The
-purpose of this operation is to make sure we never run out of file descriptors
-if we need to read from the insert buffer or to write to the log. */
-void
-fil_open_log_and_system_tablespace_files(void)
-/*==========================================*/
+/** Extend all open data files to the recovered size */
+ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
 {
-	fil_space_t*	space;
-
-	mutex_enter(&fil_system.mutex);
-
-	for (space = UT_LIST_GET_FIRST(fil_system.space_list);
-	     space != NULL;
-	     space = UT_LIST_GET_NEXT(space_list, space)) {
-
-		fil_node_t*	node;
-
-		if (fil_space_belongs_in_lru(space)) {
-
-			continue;
-		}
-
-		for (node = UT_LIST_GET_FIRST(space->chain);
-		     node != NULL;
-		     node = UT_LIST_GET_NEXT(chain, node)) {
-
-			if (!node->is_open()) {
-				if (!fil_node_open_file(node)) {
-					/* This func is called during server's
-					startup. If some file of log or system
-					tablespace is missing, the server
-					can't start successfully. So we should
-					assert for it. */
-					ut_a(0);
-				}
-			}
-
-			if (srv_max_n_open_files < 10 + fil_system.n_open) {
-
-				ib::warn() << "You must raise the value of"
-					" innodb_open_files in my.cnf!"
-					" Remember that InnoDB keeps all"
-					" log files and all system"
-					" tablespace files open"
-					" for the whole time mysqld is"
-					" running, and needs to open also"
-					" some .ibd files if the"
-					" file-per-table storage model is used."
-					" Current open files "
-					<< fil_system.n_open
-					<< ", max allowed open files "
-					<< srv_max_n_open_files
-					<< ".";
-			}
-		}
-	}
-
-	mutex_exit(&fil_system.mutex);
+  ut_ad(is_initialised());
+  mutex_enter(&mutex);
+  for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space;
+       space= UT_LIST_GET_NEXT(space_list, space))
+  {
+    const uint32_t size= space->recv_size;
+
+    if (size > space->size)
+    {
+      if (space->is_closing())
+        continue;
+      space->reacquire();
+      bool success;
+      while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
+                                         size, &success))
+        mutex_enter(&mutex);
+      /* Crash recovery requires the file extension to succeed. */
+      ut_a(success);
+      space->release();
+    }
+  }
+  mutex_exit(&mutex);
 }
 
-/*******************************************************************//**
-Closes all open files. There must not be any pending i/o's or not flushed
-modifications in the files. */
-void
-fil_close_all_files(void)
-/*=====================*/
+/** Close all tablespace files at shutdown */
+void fil_space_t::close_all()
 {
+	if (!fil_system.is_initialised()) {
+		return;
+	}
+
 	fil_space_t*	space;
 
 	/* At shutdown, we should not have any files in this list. */
-	ut_ad(fil_system.is_initialised());
 	ut_ad(srv_fast_shutdown == 2
 	      || !srv_was_started
 	      || UT_LIST_GET_LEN(fil_system.named_spaces) == 0);
+	fil_flush_file_spaces();
 
 	mutex_enter(&fil_system.mutex);
 
-	for (space = UT_LIST_GET_FIRST(fil_system.space_list);
-	     space != NULL; ) {
+	for (space = UT_LIST_GET_FIRST(fil_system.space_list); space; ) {
 		fil_node_t*	node;
 		fil_space_t*	prev_space = space;
 
@@ -1757,13 +1377,31 @@ fil_close_all_files(void)
 		     node != NULL;
 		     node = UT_LIST_GET_NEXT(chain, node)) {
 
-			if (node->is_open()) {
-				node->close();
+			if (!node->is_open()) {
+next:
+				continue;
+			}
+
+			for (ulint count = 10000; count--; ) {
+				if (!space->set_closing()) {
+					node->close();
+					goto next;
+				}
+				mutex_exit(&fil_system.mutex);
+				os_thread_sleep(100);
+				mutex_enter(&fil_system.mutex);
+				if (!node->is_open()) {
+					goto next;
+				}
 			}
+
+			ib::error() << "File '" << node->name
+				    << "' has " << space->referenced()
+				    << " operations";
 		}
 
 		space = UT_LIST_GET_NEXT(space_list, space);
-		fil_space_detach(prev_space);
+		fil_system.detach(prev_space);
 		fil_space_free_low(prev_space);
 	}
 
@@ -1775,56 +1413,6 @@ fil_close_all_files(void)
 }
 
 /*******************************************************************//**
-Closes the redo log files. There must not be any pending i/o's or not
-flushed modifications in the files. */
-void
-fil_close_log_files(
-/*================*/
-	bool	free)	/*!< in: whether to free the memory object */
-{
-	fil_space_t*	space;
-
-	mutex_enter(&fil_system.mutex);
-
-	space = UT_LIST_GET_FIRST(fil_system.space_list);
-
-	while (space != NULL) {
-		fil_node_t*	node;
-		fil_space_t*	prev_space = space;
-
-		if (space->purpose != FIL_TYPE_LOG) {
-			space = UT_LIST_GET_NEXT(space_list, space);
-			continue;
-		}
-
-		/* Log files are not in the fil_system.named_spaces list. */
-		ut_ad(space->max_lsn == 0);
-
-		for (node = UT_LIST_GET_FIRST(space->chain);
-		     node != NULL;
-		     node = UT_LIST_GET_NEXT(chain, node)) {
-
-			if (node->is_open()) {
-				node->close();
-			}
-		}
-
-		space = UT_LIST_GET_NEXT(space_list, space);
-
-		if (free) {
-			fil_space_detach(prev_space);
-			fil_space_free_low(prev_space);
-		}
-	}
-
-	mutex_exit(&fil_system.mutex);
-
-	if (free) {
-		log_sys.log.close();
-	}
-}
-
-/*******************************************************************//**
 Sets the max tablespace id counter if the given number is bigger than the
 previous value. */
 void
@@ -1832,7 +1420,7 @@ fil_set_max_space_id_if_bigger(
 /*===========================*/
 	ulint	max_id)	/*!< in: maximum known id */
 {
-	if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
+	if (max_id >= SRV_SPACE_ID_UPPER_BOUND) {
 		ib::fatal() << "Max tablespace id is too high, " << max_id;
 	}
 
@@ -1854,19 +1442,21 @@ dberr_t
 fil_write_flushed_lsn(
 	lsn_t	lsn)
 {
-	byte*	buf1;
 	byte*	buf;
-	dberr_t	err = DB_TABLESPACE_NOT_FOUND;
+	ut_ad(!srv_read_only_mode);
 
-	buf1 = static_cast<byte*>(ut_malloc_nokey(2U << srv_page_size_shift));
-	buf = static_cast<byte*>(ut_align(buf1, srv_page_size));
+	if (!fil_system.sys_space->acquire()) {
+		return DB_ERROR;
+	}
 
-	const page_id_t	page_id(TRX_SYS_SPACE, 0);
+	buf = static_cast<byte*>(aligned_malloc(srv_page_size, srv_page_size));
 
-	err = fil_read(page_id, 0, 0, srv_page_size, buf);
+	auto fio = fil_system.sys_space->io(IORequestRead, 0, srv_page_size,
+					    buf);
 
-	if (err == DB_SUCCESS) {
-		mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, lsn);
+	if (fio.err == DB_SUCCESS) {
+		mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+				lsn);
 
 		ulint fsp_flags = mach_read_from_4(
 			buf + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS);
@@ -1875,149 +1465,97 @@ fil_write_flushed_lsn(
 			buf_flush_assign_full_crc32_checksum(buf);
 		}
 
-		err = fil_write(page_id, 0, 0, srv_page_size, buf);
-		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
-	}
-
-	ut_free(buf1);
-	return(err);
-}
-
-/** Acquire a tablespace when it could be dropped concurrently.
-Used by background threads that do not necessarily hold proper locks
-for concurrency control.
-@param[in]	id	tablespace ID
-@param[in]	silent	whether to silently ignore missing tablespaces
-@return	the tablespace
-@retval	NULL if missing or being deleted */
-fil_space_t* fil_space_acquire_low(ulint id, bool silent)
-{
-	fil_space_t*	space;
-
-	mutex_enter(&fil_system.mutex);
-
-	space = fil_space_get_by_id(id);
-
-	if (space == NULL) {
-		if (!silent) {
-			ib::warn() << "Trying to access missing"
-				" tablespace " << id;
-		}
-	} else if (!space->acquire()) {
-		space = NULL;
-	}
-
-	mutex_exit(&fil_system.mutex);
-
-	return(space);
-}
-
-/** Acquire a tablespace for reading or writing a block,
-when it could be dropped concurrently.
-@param[in]	id	tablespace ID
-@return	the tablespace
-@retval	NULL if missing */
-fil_space_t*
-fil_space_acquire_for_io(ulint id)
-{
-	mutex_enter(&fil_system.mutex);
-
-	fil_space_t* space = fil_space_get_by_id(id);
-
-	if (space) {
-		space->acquire_for_io();
+		fio = fil_system.sys_space->io(IORequestWrite,
+					       0, srv_page_size, buf);
+		fil_flush_file_spaces();
+	} else {
+		fil_system.sys_space->release();
 	}
 
-	mutex_exit(&fil_system.mutex);
-
-	return(space);
+	aligned_free(buf);
+	return fio.err;
 }
 
-/** Write a log record about an operation on a tablespace file.
-@param[in]	type		MLOG_FILE_NAME or MLOG_FILE_DELETE
-or MLOG_FILE_CREATE2 or MLOG_FILE_RENAME2
-@param[in]	space_id	tablespace identifier
-@param[in]	first_page_no	first page number in the file
-@param[in]	path		file path
-@param[in]	new_path	if type is MLOG_FILE_RENAME2, the new name
-@param[in]	flags		if type is MLOG_FILE_CREATE2, the space flags
-@param[in,out]	mtr		mini-transaction */
-static
-void
-fil_op_write_log(
-	mlog_id_t	type,
-	ulint		space_id,
-	ulint		first_page_no,
-	const char*	path,
-	const char*	new_path,
-	ulint		flags,
-	mtr_t*		mtr)
+/** Acquire a tablespace reference.
+@param id      tablespace identifier
+@return tablespace
+@retval nullptr if the tablespace is missing or inaccessible */
+fil_space_t *fil_space_t::get(ulint id)
 {
-	byte*		log_ptr;
-	ulint		len;
-
-	ut_ad(first_page_no == 0 || type == MLOG_FILE_CREATE2);
-	ut_ad(fil_space_t::is_valid_flags(flags, space_id));
-
-	/* fil_name_parse() requires that there be at least one path
-	separator and that the file path end with ".ibd". */
-	ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
-	ut_ad(first_page_no /* trimming an undo tablespace */
-	      || !strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
-
-	log_ptr = mlog_open(mtr, 11 + 4 + 2 + 1);
-
-	if (log_ptr == NULL) {
-		/* Logging in mtr is switched off during crash recovery:
-		in that case mlog_open returns NULL */
-		return;
-	}
-
-	log_ptr = mlog_write_initial_log_record_low(
-		type, space_id, first_page_no, log_ptr, mtr);
-
-	if (type == MLOG_FILE_CREATE2) {
-		mach_write_to_4(log_ptr, flags);
-		log_ptr += 4;
-	}
-
-	/* Let us store the strings as null-terminated for easier readability
-	and handling */
-
-	len = strlen(path) + 1;
-
-	mach_write_to_2(log_ptr, len);
-	log_ptr += 2;
-	mlog_close(mtr, log_ptr);
+  mutex_enter(&fil_system.mutex);
+  fil_space_t *space= fil_space_get_by_id(id);
+  const uint32_t n= space ? space->acquire_low() : 0;
+  mutex_exit(&fil_system.mutex);
 
-	mlog_catenate_string(
-		mtr, reinterpret_cast<const byte*>(path), len);
+  if (n & STOPPING)
+    space= nullptr;
+  else if ((n & CLOSING) && !space->prepare())
+    space= nullptr;
+
+  return space;
+}
+
+/** Write a log record about a file operation.
+@param type           file operation
+@param space_id       tablespace identifier
+@param path           file path
+@param new_path       new file path for type=FILE_RENAME */
+inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id,
+			       const char *path, const char *new_path)
+{
+  ut_ad((new_path != nullptr) == (type == FILE_RENAME));
+  ut_ad(!(byte(type) & 15));
+
+  /* fil_name_parse() requires that there be at least one path
+  separator and that the file path end with ".ibd". */
+  ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
+  ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
+
+  flag_modified();
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  m_last= nullptr;
+
+  const size_t len= strlen(path);
+  const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0;
+  ut_ad(len > 0);
+  byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/ +
+                                  1/*page_no=0*/);
+  byte *end= log_ptr + 1;
+  end= mlog_encode_varint(end, space_id);
+  *end++= 0;
+  if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16]))
+  {
+    *log_ptr= type;
+    size_t total_len= len + new_len + end - log_ptr - 15;
+    if (total_len >= MIN_3BYTE)
+      total_len+= 2;
+    else if (total_len >= MIN_2BYTE)
+      total_len++;
+    end= mlog_encode_varint(log_ptr + 1, total_len);
+    end= mlog_encode_varint(end, space_id);
+    *end++= 0;
+  }
+  else
+  {
+    *log_ptr= static_cast<byte>(type | (end + len + new_len - &log_ptr[1]));
+    ut_ad(*log_ptr & 15);
+  }
 
-	switch (type) {
-	case MLOG_FILE_RENAME2:
-		ut_ad(strchr(new_path, OS_PATH_SEPARATOR) != NULL);
-		len = strlen(new_path) + 1;
-		log_ptr = mlog_open(mtr, 2 + len);
-		ut_a(log_ptr);
-		mach_write_to_2(log_ptr, len);
-		log_ptr += 2;
-		mlog_close(mtr, log_ptr);
+  m_log.close(end);
 
-		mlog_catenate_string(
-			mtr, reinterpret_cast<const byte*>(new_path), len);
-		break;
-	case MLOG_FILE_NAME:
-	case MLOG_FILE_DELETE:
-	case MLOG_FILE_CREATE2:
-		break;
-	default:
-		ut_ad(0);
-	}
+  if (type == FILE_RENAME)
+  {
+    ut_ad(strchr(new_path, OS_PATH_SEPARATOR));
+    m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
+    m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len));
+  }
+  else
+    m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
 }
 
 /** Write redo log for renaming a file.
 @param[in]	space_id	tablespace id
-@param[in]	first_page_no	first page number in the file
 @param[in]	old_name	tablespace file name
 @param[in]	new_name	tablespace file name after renaming
 @param[in,out]	mtr		mini-transaction */
@@ -2025,16 +1563,12 @@ static
 void
 fil_name_write_rename_low(
 	ulint		space_id,
-	ulint		first_page_no,
 	const char*	old_name,
 	const char*	new_name,
 	mtr_t*		mtr)
 {
-	ut_ad(!is_predefined_tablespace(space_id));
-
-	fil_op_write_log(
-		MLOG_FILE_RENAME2,
-		space_id, first_page_no, old_name, new_name, 0, mtr);
+  ut_ad(!is_predefined_tablespace(space_id));
+  mtr->log_file_op(FILE_RENAME, space_id, old_name, new_name);
 }
 
 /** Write redo log for renaming a file.
@@ -2049,137 +1583,26 @@ fil_name_write_rename(
 {
 	mtr_t	mtr;
 	mtr.start();
-	fil_name_write_rename_low(space_id, 0, old_name, new_name, &mtr);
+	fil_name_write_rename_low(space_id, old_name, new_name, &mtr);
 	mtr.commit();
 	log_write_up_to(mtr.commit_lsn(), true);
 }
 
-/** Write MLOG_FILE_NAME for a file.
+/** Write FILE_MODIFY for a file.
 @param[in]	space_id	tablespace id
-@param[in]	first_page_no	first page number in the file
 @param[in]	name		tablespace file name
 @param[in,out]	mtr		mini-transaction */
 static
 void
 fil_name_write(
 	ulint		space_id,
-	ulint		first_page_no,
 	const char*	name,
 	mtr_t*		mtr)
 {
-	fil_op_write_log(
-		MLOG_FILE_NAME, space_id, first_page_no, name, NULL, 0, mtr);
-}
-/** Write MLOG_FILE_NAME for a file.
-@param[in]	space		tablespace
-@param[in]	first_page_no	first page number in the file
-@param[in]	file		tablespace file
-@param[in,out]	mtr		mini-transaction */
-static
-void
-fil_name_write(
-	const fil_space_t*	space,
-	ulint			first_page_no,
-	const fil_node_t*	file,
-	mtr_t*			mtr)
-{
-	fil_name_write(space->id, first_page_no, file->name, mtr);
-}
-
-/** Replay a file rename operation if possible.
-@param[in]	space_id	tablespace identifier
-@param[in]	first_page_no	first page number in the file
-@param[in]	name		old file name
-@param[in]	new_name	new file name
-@return	whether the operation was successfully applied
-(the name did not exist, or new_name did not exist and
-name was successfully renamed to new_name)  */
-bool
-fil_op_replay_rename(
-	ulint		space_id,
-	ulint		first_page_no,
-	const char*	name,
-	const char*	new_name)
-{
-	ut_ad(first_page_no == 0);
-
-	/* In order to replay the rename, the following must hold:
-	* The new name is not already used.
-	* A tablespace exists with the old name.
-	* The space ID for that tablepace matches this log entry.
-	This will prevent unintended renames during recovery. */
-	fil_space_t*	space = fil_space_get(space_id);
-
-	if (space == NULL) {
-		return(true);
-	}
-
-	const bool name_match
-		= strcmp(name, UT_LIST_GET_FIRST(space->chain)->name) == 0;
-
-	if (!name_match) {
-		return(true);
-	}
-
-	/* Create the database directory for the new name, if
-	it does not exist yet */
-
-	const char*	namend = strrchr(new_name, OS_PATH_SEPARATOR);
-	ut_a(namend != NULL);
-
-	char*		dir = static_cast<char*>(
-		ut_malloc_nokey(ulint(namend - new_name) + 1));
-
-	memcpy(dir, new_name, ulint(namend - new_name));
-	dir[namend - new_name] = '\0';
-
-	bool		success = os_file_create_directory(dir, false);
-	ut_a(success);
-
-	ulint		dirlen = 0;
-
-	if (const char* dirend = strrchr(dir, OS_PATH_SEPARATOR)) {
-		dirlen = ulint(dirend - dir) + 1;
-	}
-
-	ut_free(dir);
-
-	/* New path must not exist. */
-	dberr_t		err = fil_rename_tablespace_check(
-		name, new_name, false);
-	if (err != DB_SUCCESS) {
-		ib::error() << " Cannot replay file rename."
-			" Remove either file and try again.";
-		return(false);
-	}
-
-	char*		new_table = mem_strdupl(
-		new_name + dirlen,
-		strlen(new_name + dirlen)
-		- 4 /* remove ".ibd" */);
-
-	ut_ad(new_table[ulint(namend - new_name) - dirlen]
-	      == OS_PATH_SEPARATOR);
-#if OS_PATH_SEPARATOR != '/'
-	new_table[namend - new_name - dirlen] = '/';
-#endif
-
-	if (!fil_rename_tablespace(
-		    space_id, name, new_table, new_name)) {
-		ut_error;
-	}
-
-	ut_free(new_table);
-	return(true);
+  ut_ad(!is_predefined_tablespace(space_id));
+  mtr->log_file_op(FILE_MODIFY, space_id, name);
 }
 
-/** File operations for tablespace */
-enum fil_operation_t {
-	FIL_OPERATION_DELETE,	/*!< delete a single-table tablespace */
-	FIL_OPERATION_CLOSE,	/*!< close a single-table tablespace */
-	FIL_OPERATION_TRUNCATE	/*!< truncate an undo tablespace */
-};
-
 /** Check for pending operations.
 @param[in]	space	tablespace
 @param[in]	count	number of attempts so far
@@ -2215,39 +1638,26 @@ static
 ulint
 fil_check_pending_io(
 /*=================*/
-	fil_operation_t	operation,	/*!< in: File operation */
 	fil_space_t*	space,		/*!< in/out: Tablespace to check */
 	fil_node_t**	node,		/*!< out: Node in space list */
 	ulint		count)		/*!< in: number of attempts so far */
 {
 	ut_ad(mutex_own(&fil_system.mutex));
-	ut_ad(!space->referenced());
-
-	switch (operation) {
-	case FIL_OPERATION_DELETE:
-	case FIL_OPERATION_CLOSE:
-		break;
-	case FIL_OPERATION_TRUNCATE:
-		space->is_being_truncated = true;
-		break;
-	}
 
 	/* The following code must change when InnoDB supports
 	multiple datafiles per tablespace. */
-	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+	ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
 
 	*node = UT_LIST_GET_FIRST(space->chain);
 
-	if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) {
-
+	if (const uint32_t p = space->referenced()) {
 		ut_a(!(*node)->being_extended);
 
-		if (count > 1000) {
-			ib::warn() << "Trying to delete"
+                /* Give a warning every 10 seconds, starting after 1 second */
+		if ((count % 500) == 50) {
+			ib::info() << "Trying to delete"
 				" tablespace '" << space->name
-				<< "' but there are "
-				<< space->n_pending_flushes
-				<< " flushes and " << (*node)->n_pending
+				<< "' but there are " << p
 				<< " pending i/o's on it.";
 		}
 
@@ -2259,66 +1669,60 @@ fil_check_pending_io(
 
 /*******************************************************************//**
 Check pending operations on a tablespace.
-@return DB_SUCCESS or error failure. */
+@return tablespace, or nullptr if the tablespace was not found */
 static
-dberr_t
+fil_space_t*
 fil_check_pending_operations(
 /*=========================*/
 	ulint		id,		/*!< in: space id */
-	fil_operation_t	operation,	/*!< in: File operation */
-	fil_space_t**	space,		/*!< out: tablespace instance
-					in memory */
+	bool		truncate,	/*!< in: whether to truncate a file */
 	char**		path)		/*!< out/own: tablespace path */
 {
 	ulint		count = 0;
 
 	ut_a(!is_system_tablespace(id));
-	ut_ad(space);
-
-	*space = 0;
-
 	mutex_enter(&fil_system.mutex);
 	fil_space_t* sp = fil_space_get_by_id(id);
 
 	if (sp) {
-		if (sp->crypt_data && sp->acquire()) {
+		sp->set_stopping(true);
+		if (sp->crypt_data) {
+			sp->reacquire();
 			mutex_exit(&fil_system.mutex);
 			fil_space_crypt_close_tablespace(sp);
 			mutex_enter(&fil_system.mutex);
 			sp->release();
 		}
-		sp->set_stopping(true);
 	}
 
 	/* Check for pending operations. */
 
 	do {
-		sp = fil_space_get_by_id(id);
-
 		count = fil_check_pending_ops(sp, count);
 
 		mutex_exit(&fil_system.mutex);
 
-		if (count > 0) {
-			os_thread_sleep(20000);
+		if (count) {
+			os_thread_sleep(20000); // Wait 0.02 seconds
+		} else if (!sp) {
+			return nullptr;
 		}
 
 		mutex_enter(&fil_system.mutex);
-	} while (count > 0);
+
+		sp = fil_space_get_by_id(id);
+	} while (count);
 
 	/* Check for pending IO. */
 
 	for (;;) {
-		sp = fil_space_get_by_id(id);
-
-		if (sp == NULL) {
-			mutex_exit(&fil_system.mutex);
-			return(DB_TABLESPACE_NOT_FOUND);
+		if (truncate) {
+			sp->is_being_truncated = true;
 		}
 
 		fil_node_t*	node;
 
-		count = fil_check_pending_io(operation, sp, &node, count);
+		count = fil_check_pending_io(sp, &node, count);
 
 		if (count == 0 && path) {
 			*path = mem_strdup(node->name);
@@ -2330,130 +1734,87 @@ fil_check_pending_operations(
 			break;
 		}
 
-		os_thread_sleep(20000);
+		os_thread_sleep(20000);         // Wait 0.02 seconds
 		mutex_enter(&fil_system.mutex);
-	}
+		sp = fil_space_get_by_id(id);
 
-	ut_ad(sp);
+		if (!sp) {
+			mutex_exit(&fil_system.mutex);
+			break;
+		}
+	}
 
-	*space = sp;
-	return(DB_SUCCESS);
+	return sp;
 }
 
-/*******************************************************************//**
-Closes a single-table tablespace. The tablespace must be cached in the
-memory cache. Free all pages used by the tablespace.
-@return DB_SUCCESS or error */
-dberr_t
-fil_close_tablespace(
-/*=================*/
-	trx_t*		trx,	/*!< in/out: Transaction covering the close */
-	ulint		id)	/*!< in: space id */
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(ulint id)
 {
-	char*		path = 0;
-	fil_space_t*	space = 0;
-	dberr_t		err;
-
-	ut_a(!is_system_tablespace(id));
-
-	err = fil_check_pending_operations(id, FIL_OPERATION_CLOSE,
-					   &space, &path);
-
-	if (err != DB_SUCCESS) {
-		return(err);
+	ut_ad(!is_system_tablespace(id));
+	char* path = nullptr;
+	fil_space_t* space = fil_check_pending_operations(id, false, &path);
+	if (!space) {
+		return;
 	}
 
-	ut_a(space);
-	ut_a(path != 0);
-
 	rw_lock_x_lock(&space->latch);
 
 	/* Invalidate in the buffer pool all pages belonging to the
-	tablespace. Since we have set space->stop_new_ops = true, readahead
-	or ibuf merge can no longer read more pages of this tablespace to the
-	buffer pool. Thus we can clean the tablespace out of the buffer pool
-	completely and permanently. The flag stop_new_ops also prevents
-	fil_flush() from being applied to this tablespace. */
-
-	{
-		FlushObserver observer(space, trx, NULL);
-		buf_LRU_flush_or_remove_pages(id, &observer);
-	}
+	tablespace. Since we have invoked space->set_stopping(), readahead
+	can no longer read more pages of this tablespace to buf_pool.
+	Thus we can clean the tablespace out of buf_pool
+	completely and permanently. */
+	while (buf_flush_list_space(space));
+	ut_ad(space->is_stopping());
 
 	/* If the free is successful, the X lock will be released before
 	the space memory data structure is freed. */
 
 	if (!fil_space_free(id, true)) {
 		rw_lock_x_unlock(&space->latch);
-		err = DB_TABLESPACE_NOT_FOUND;
-	} else {
-		err = DB_SUCCESS;
 	}
 
 	/* If it is a delete then also delete any generated files, otherwise
 	when we drop the database the remove directory will fail. */
 
-	char*	cfg_name = fil_make_filepath(path, NULL, CFG, false);
-	if (cfg_name != NULL) {
+	if (char* cfg_name = fil_make_filepath(path, NULL, CFG, false)) {
 		os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
 		ut_free(cfg_name);
 	}
 
 	ut_free(path);
-
-	return(err);
-}
-
-/** Determine whether a table can be accessed in operations that are
-not (necessarily) protected by meta-data locks.
-(Rollback would generally be protected, but rollback of
-FOREIGN KEY CASCADE/SET NULL is not protected by meta-data locks
-but only by InnoDB table locks, which may be broken by
-lock_remove_all_on_table().)
-@param[in]	table	persistent table
-checked @return whether the table is accessible */
-bool fil_table_accessible(const dict_table_t* table)
-{
-	if (UNIV_UNLIKELY(!table->is_readable() || table->corrupted)) {
-		return(false);
-	}
-
-	mutex_enter(&fil_system.mutex);
-	bool accessible = table->space && !table->space->is_stopping();
-	mutex_exit(&fil_system.mutex);
-	ut_ad(accessible || dict_table_is_file_per_table(table));
-	return accessible;
 }
 
 /** Delete a tablespace and associated .ibd file.
 @param[in]	id		tablespace identifier
 @param[in]	if_exists	whether to ignore missing tablespace
+@param[in,out]	detached_handles	return detached handles if not nullptr
 @return	DB_SUCCESS or error */
-dberr_t fil_delete_tablespace(ulint id, bool if_exists)
+dberr_t fil_delete_tablespace(ulint id, bool if_exists,
+			      std::vector<pfs_os_file_t>* detached_handles)
 {
-	char*		path = 0;
-	fil_space_t*	space = 0;
-
-	ut_a(!is_system_tablespace(id));
+	char* path = NULL;
+	ut_ad(!is_system_tablespace(id));
+	ut_ad(!detached_handles || detached_handles->empty());
 
-	dberr_t err = fil_check_pending_operations(
-		id, FIL_OPERATION_DELETE, &space, &path);
+	dberr_t err;
+	fil_space_t *space = fil_check_pending_operations(id, false, &path);
 
-	if (err != DB_SUCCESS) {
+	if (!space) {
+		err = DB_TABLESPACE_NOT_FOUND;
 		if (!if_exists) {
 			ib::error() << "Cannot delete tablespace " << id
 				    << " because it is not found"
 				       " in the tablespace memory cache.";
 		}
 
-		return(err);
+		goto func_exit;
 	}
 
-	ut_a(space);
-	ut_a(path != 0);
-
 	/* IMPORTANT: Because we have set space::stop_new_ops there
-	can't be any new ibuf merges, reads or flushes. We are here
+	can't be any new reads or flushes. We are here
 	because node::n_pending was zero above. However, it is still
 	possible to have pending read and write requests:
 
@@ -2463,7 +1824,7 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists)
 	when we checked it above.
 
 	A write request can be issued any time because we don't check
-	the ::stop_new_ops flag when queueing a block for write.
+	fil_space_t::is_stopping() when queueing a block for write.
 
 	We deal with pending write requests in the following function
 	where we'd minimally evict all dirty pages belonging to this
@@ -2471,9 +1832,10 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists)
 	we'll wait for IO to complete.
 
 	To deal with potential read requests, we will check the
-	::stop_new_ops flag in fil_io(). */
+	is_stopping() in fil_space_t::io(). */
 
-	buf_LRU_flush_or_remove_pages(id, NULL);
+	err = DB_SUCCESS;
+	buf_flush_remove_pages(id);
 
 	/* If it is a delete then also delete any generated files, otherwise
 	when we drop the database the remove directory will fail. */
@@ -2483,9 +1845,9 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists)
 		to be gone. */
 		mtr_t		mtr;
 
-		mtr_start(&mtr);
-		fil_op_write_log(MLOG_FILE_DELETE, id, 0, path, NULL, 0, &mtr);
-		mtr_commit(&mtr);
+		mtr.start();
+		mtr.log_file_op(FILE_DELETE, id, path);
+		mtr.commit();
 		/* Even if we got killed shortly after deleting the
 		tablespace file, the record must have already been
 		written to the redo log. */
@@ -2511,20 +1873,21 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists)
 		ut_a(s == space);
 		ut_a(!space->referenced());
 		ut_a(UT_LIST_GET_LEN(space->chain) == 1);
-		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
-		ut_a(node->n_pending == 0);
-
-		fil_space_detach(space);
+		auto handles = fil_system.detach(space,
+						 detached_handles != nullptr);
+		if (detached_handles) {
+			*detached_handles = std::move(handles);
+		}
 		mutex_exit(&fil_system.mutex);
 
-		log_mutex_enter();
+		mysql_mutex_lock(&log_sys.mutex);
 
 		if (space->max_lsn != 0) {
 			ut_d(space->max_lsn = 0);
 			UT_LIST_REMOVE(fil_system.named_spaces, space);
 		}
 
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 		fil_space_free_low(space);
 
 		if (!os_file_delete(innodb_data_file_key, path)
@@ -2541,8 +1904,9 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists)
 		err = DB_TABLESPACE_NOT_FOUND;
 	}
 
+func_exit:
 	ut_free(path);
-
+	ibuf_delete_for_discarded_space(id);
 	return(err);
 }
 
@@ -2550,35 +1914,15 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists)
 @param[in]	space_id	undo tablespace id
 @return	the tablespace
 @retval	NULL if tablespace not found */
-fil_space_t* fil_truncate_prepare(ulint space_id)
+fil_space_t *fil_truncate_prepare(ulint space_id)
 {
-	/* Stop all I/O on the tablespace and ensure that related
-	pages are flushed to disk. */
-	fil_space_t* space;
-	if (fil_check_pending_operations(space_id, FIL_OPERATION_TRUNCATE,
-					 &space, NULL) != DB_SUCCESS) {
-		return NULL;
-	}
-	ut_ad(space != NULL);
-	return space;
-}
-
-/** Write log about an undo tablespace truncate operation. */
-void fil_truncate_log(fil_space_t* space, ulint size, mtr_t* mtr)
-{
-	/* Write a MLOG_FILE_CREATE2 record with the new size, so that
-	recovery and backup will ignore any preceding redo log records
-	for writing pages that are after the new end of the tablespace. */
-	ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
-	const fil_node_t* file = UT_LIST_GET_FIRST(space->chain);
-	fil_op_write_log(MLOG_FILE_CREATE2, space->id, size, file->name,
-			 NULL, space->flags & ~FSP_FLAGS_MEM_MASK, mtr);
+  return fil_check_pending_operations(space_id, true, nullptr);
 }
 
 /*******************************************************************//**
 Allocates and builds a file name from a path, a table or tablespace name
 and a suffix. The string must be freed by caller with ut_free().
-@param[in] path NULL or the direcory path or the full path and filename.
+@param[in] path NULL or the directory path or the full path and filename.
 @param[in] name NULL if path is full, or Table/Tablespace name
 @param[in] suffix NULL or the file extention to use.
 @param[in] trim_name true if the last name on the path should be trimmed.
@@ -2684,22 +2028,18 @@ fil_make_filepath(
 if that the old filepath exists and the new filepath does not exist.
 @param[in]	old_path	old filepath
 @param[in]	new_path	new filepath
-@param[in]	is_discarded	whether the tablespace is discarded
 @param[in]	replace_new	whether to ignore the existence of new_path
 @return innodb error code */
 static dberr_t
 fil_rename_tablespace_check(
 	const char*	old_path,
 	const char*	new_path,
-	bool		is_discarded,
 	bool		replace_new)
 {
 	bool	exists = false;
 	os_file_type_t	ftype;
 
-	if (!is_discarded
-	    && os_file_status(old_path, &exists, &ftype)
-	    && !exists) {
+	if (os_file_status(old_path, &exists, &ftype) && !exists) {
 		ib::error() << "Cannot rename '" << old_path
 			<< "' to '" << new_path
 			<< "' because the source file"
@@ -2731,7 +2071,7 @@ retry:
 	for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
 	     space; space = UT_LIST_GET_NEXT(space_list, space)) {
 		ulint id = space->id;
-		if (id && id < SRV_LOG_SPACE_FIRST_ID
+		if (id
 		    && space->purpose == FIL_TYPE_TABLESPACE
 		    && !strcmp(new_path,
 			       UT_LIST_GET_FIRST(space->chain)->name)) {
@@ -2759,7 +2099,7 @@ dberr_t fil_space_t::rename(const char* name, const char* path, bool log,
 
 	if (log) {
 		dberr_t err = fil_rename_tablespace_check(
-			chain.start->name, path, false, replace);
+			chain.start->name, path, replace);
 		if (err != DB_SUCCESS) {
 			return(err);
 		}
@@ -2809,7 +2149,7 @@ fil_rename_tablespace(
 	multiple datafiles per tablespace. */
 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
 	node = UT_LIST_GET_FIRST(space->chain);
-	ut_a(space->acquire());
+	space->reacquire();
 
 	mutex_exit(&fil_system.mutex);
 
@@ -2824,11 +2164,11 @@ fil_rename_tablespace(
 	ut_ad(strchr(new_file_name, OS_PATH_SEPARATOR) != NULL);
 
 	if (!recv_recovery_is_on()) {
-		log_mutex_enter();
+		mysql_mutex_lock(&log_sys.mutex);
 	}
 
 	/* log_sys.mutex is above fil_system.mutex in the latching order */
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	mutex_enter(&fil_system.mutex);
 	space->release();
 	ut_ad(space->name == old_space_name);
@@ -2850,7 +2190,7 @@ skip_second_rename:
 	}
 
 	if (!recv_recovery_is_on()) {
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 	}
 
 	ut_ad(space->name == old_space_name);
@@ -2892,20 +2232,19 @@ fil_ibd_create(
 	const char*	name,
 	const char*	path,
 	ulint		flags,
-	ulint		size,
+	uint32_t	size,
 	fil_encryption_t mode,
 	uint32_t	key_id,
 	dberr_t*	err)
 {
 	pfs_os_file_t	file;
-	byte*		buf2;
 	byte*		page;
 	bool		success;
 	bool		has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0;
 
 	ut_ad(!is_system_tablespace(space_id));
 	ut_ad(!srv_read_only_mode);
-	ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
+	ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND);
 	ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
 	ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id));
 
@@ -2931,7 +2270,7 @@ fil_ibd_create(
 	file = os_file_create(
 		innodb_data_file_key, path,
 		OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
-		OS_FILE_NORMAL, type, srv_read_only_mode, &success);
+		OS_FILE_AIO, type, srv_read_only_mode, &success);
 
 	if (!success) {
 		/* The following call will print an error message */
@@ -2957,7 +2296,7 @@ fil_ibd_create(
 
 	const bool is_compressed = fil_space_t::is_compressed(flags);
 	bool punch_hole = is_compressed;
-
+	fil_space_crypt_t* crypt_data = nullptr;
 #ifdef _WIN32
 	if (is_compressed) {
 		os_file_set_sparse_win32(file);
@@ -2971,6 +2310,7 @@ fil_ibd_create(
 err_exit:
 		os_file_close(file);
 		os_file_delete(innodb_data_file_key, path);
+		free(crypt_data);
 		return NULL;
 	}
 
@@ -2986,9 +2326,9 @@ err_exit:
 	with zeros from the call of os_file_set_size(), until a buffer pool
 	flush would write to it. */
 
-	buf2 = static_cast<byte*>(ut_malloc_nokey(3U << srv_page_size_shift));
 	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = static_cast<byte*>(ut_align(buf2, srv_page_size));
+	page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
+						 srv_page_size));
 
 	memset(page, '\0', srv_page_size);
 
@@ -3003,8 +2343,7 @@ err_exit:
 
 	/* Create crypt data if the tablespace is either encrypted or user has
 	requested it to remain unencrypted. */
-	fil_space_crypt_t *crypt_data = (mode != FIL_ENCRYPTION_DEFAULT
-					 || srv_encrypt_tables)
+	crypt_data = (mode != FIL_ENCRYPTION_DEFAULT || srv_encrypt_tables)
 		? fil_space_create_crypt_data(mode, key_id)
 		: NULL;
 
@@ -3019,24 +2358,25 @@ err_exit:
 		page_zip_set_size(&page_zip, zip_size);
 		page_zip.data = page + srv_page_size;
 #ifdef UNIV_DEBUG
-		page_zip.m_start =
+		page_zip.m_start = 0;
 #endif /* UNIV_DEBUG */
-			page_zip.m_end = page_zip.m_nonempty =
-			page_zip.n_blobs = 0;
+		page_zip.m_end = 0;
+		page_zip.m_nonempty = 0;
+		page_zip.n_blobs = 0;
 
-		buf_flush_init_for_writing(NULL, page, &page_zip, 0, false);
+		buf_flush_init_for_writing(NULL, page, &page_zip, false);
 
-		*err = os_file_write(
-			IORequestWrite, path, file, page_zip.data, 0, zip_size);
+		*err = os_file_write(IORequestWrite, path, file,
+				     page_zip.data, 0, zip_size);
 	} else {
-		buf_flush_init_for_writing(NULL, page, NULL, 0,
+		buf_flush_init_for_writing(NULL, page, NULL,
 					   fil_space_t::full_crc32(flags));
 
-		*err = os_file_write(
-			IORequestWrite, path, file, page, 0, srv_page_size);
+		*err = os_file_write(IORequestWrite, path, file,
+				     page, 0, srv_page_size);
 	}
 
-	ut_free(buf2);
+	aligned_free(page);
 
 	if (*err != DB_SUCCESS) {
 		ib::error()
@@ -3061,40 +2401,27 @@ err_exit:
 		}
 	}
 
-	fil_space_t* space = fil_space_create(name, space_id, flags,
-					      FIL_TYPE_TABLESPACE,
-					      crypt_data, mode);
-	if (!space) {
-		free(crypt_data);
-		*err = DB_ERROR;
-	} else {
+	if (fil_space_t* space = fil_space_t::create(name, space_id, flags,
+						     FIL_TYPE_TABLESPACE,
+						     crypt_data, mode)) {
 		space->punch_hole = punch_hole;
-		/* FIXME: Keep the file open! */
-		fil_node_t* node = space->add(path, OS_FILE_CLOSED, size,
-					      false, true);
+		fil_node_t* node = space->add(path, file, size, false, true);
 		mtr_t mtr;
 		mtr.start();
-		fil_op_write_log(
-			MLOG_FILE_CREATE2, space_id, 0, node->name,
-			NULL, space->flags & ~FSP_FLAGS_MEM_MASK, &mtr);
-		fil_name_write(space, 0, node, &mtr);
+		mtr.log_file_op(FILE_CREATE, space_id, node->name);
 		mtr.commit();
 
 		node->find_metadata(file);
 		*err = DB_SUCCESS;
+		return space;
 	}
 
-	os_file_close(file);
-
-	if (*err != DB_SUCCESS) {
-		if (has_data_dir) {
-			RemoteDatafile::delete_link_file(name);
-		}
-
-		os_file_delete(innodb_data_file_key, path);
+	if (has_data_dir) {
+		RemoteDatafile::delete_link_file(name);
 	}
 
-	return space;
+	*err = DB_ERROR;
+	goto err_exit;
 }
 
 /** Try to open a single-table tablespace and optionally check that the
@@ -3178,8 +2505,6 @@ fil_ibd_open(
 		ut_ad(srv_log_file_size != 0);
 	}
 
-	ut_ad(fil_type_is_data(purpose));
-
 	/* Table flags can be ULINT_UNDEFINED if
 	dict_tf_to_fsp_flags_failure is set. */
 	if (flags == ULINT_UNDEFINED) {
@@ -3446,7 +2771,7 @@ skip_validate:
 					    first_page)
 		: NULL;
 
-	fil_space_t* space = fil_space_create(
+	fil_space_t* space = fil_space_t::create(
 		tablename.m_name, id, flags, purpose, crypt_data);
 	if (!space) {
 		goto error;
@@ -3460,11 +2785,17 @@ skip_validate:
 		df_dict.is_open() ? df_dict.filepath() :
 		df_default.filepath(), OS_FILE_CLOSED, 0, false, true);
 
-	if (validate && purpose != FIL_TYPE_IMPORT && !srv_read_only_mode) {
+	if (validate && !srv_read_only_mode) {
 		df_remote.close();
 		df_dict.close();
 		df_default.close();
-		fsp_flags_try_adjust(space, flags & ~FSP_FLAGS_MEM_MASK);
+		if (space->acquire()) {
+			if (purpose != FIL_TYPE_IMPORT) {
+				fsp_flags_try_adjust(space, flags
+						     & ~FSP_FLAGS_MEM_MASK);
+			}
+			space->release();
+		}
 	}
 
 	if (err) *err = DB_SUCCESS;
@@ -3607,7 +2938,6 @@ fil_ibd_discover(
 		case SRV_OPERATION_RESTORE_DELTA:
 			ut_ad(0);
 			break;
-		case SRV_OPERATION_RESTORE_ROLLBACK_XA:
 		case SRV_OPERATION_RESTORE_EXPORT:
 		case SRV_OPERATION_RESTORE:
 			break;
@@ -3705,7 +3035,7 @@ fil_ibd_load(
 		return(FIL_LOAD_OK);
 	}
 
-	if (is_mariabackup_restore()) {
+	if (srv_operation == SRV_OPERATION_RESTORE) {
 		/* Replace absolute DATA DIRECTORY file paths with
 		short names relative to the backup directory. */
 		if (const char* name = strrchr(filename, OS_PATH_SEPARATOR)) {
@@ -3788,7 +3118,7 @@ fil_ibd_load(
 		? fil_space_read_crypt_data(fil_space_t::zip_size(flags),
 					    first_page)
 		: NULL;
-	space = fil_space_create(
+	space = fil_space_t::create(
 		file.name(), space_id, flags, FIL_TYPE_TABLESPACE, crypt_data);
 
 	if (space == NULL) {
@@ -3819,7 +3149,7 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags)
 		return;
 	}
 	if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE
-			     || !fil_space_get_size(space->id))) {
+			     || !space->get_size())) {
 		return;
 	}
 	/* This code is executed during server startup while no
@@ -3830,7 +3160,7 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags)
 	if (buf_block_t* b = buf_page_get(
 		    page_id_t(space->id, 0), space->zip_size(),
 		    RW_X_LATCH, &mtr)) {
-		ulint f = fsp_header_get_flags(b->frame);
+		uint32_t f = fsp_header_get_flags(b->frame);
 		if (fil_space_t::full_crc32(f)) {
 			goto func_exit;
 		}
@@ -3846,8 +3176,9 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags)
 				<< " to " << ib::hex(flags);
 		}
 		mtr.set_named_space(space);
-		mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
-				 + b->frame, flags, MLOG_4BYTES, &mtr);
+		mtr.write<4,mtr_t::FORCED>(*b,
+					   FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
+					   + b->frame, flags);
 	}
 func_exit:
 	mtr.commit();
@@ -3857,7 +3188,7 @@ func_exit:
 memory cache. Note that if we have not done a crash recovery at the database
 startup, there may be many tablespaces which are not yet in the memory cache.
 @param[in]	id		Tablespace ID
-@param[in]	name		Tablespace name used in fil_space_create().
+@param[in]	name		Tablespace name used in fil_space_t::create().
 @param[in]	table_flags	table flags
 @return the tablespace
 @retval	NULL	if no matching tablespace exists in the memory cache */
@@ -3909,587 +3240,226 @@ func_exit:
 
 /*============================ FILE I/O ================================*/
 
-/********************************************************************//**
-NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
-
-Prepares a file node for i/o. Opens the file if it is closed. Updates the
-pending i/o's field in the node and the system appropriately. Takes the node
-off the LRU list if it is in the LRU list. The caller must hold the fil_sys
-mutex.
-@return false if the file can't be opened, otherwise true */
-static
-bool
-fil_node_prepare_for_io(
-/*====================*/
-	fil_node_t*	node,	/*!< in: file node */
-	fil_space_t*	space)	/*!< in: space */
-{
-	ut_ad(node && space);
-	ut_ad(mutex_own(&fil_system.mutex));
-
-	if (fil_system.n_open > srv_max_n_open_files + 5) {
-		ib::warn() << "Open files " << fil_system.n_open
-			<< " exceeds the limit " << srv_max_n_open_files;
-	}
-
-	if (!node->is_open()) {
-		/* File is closed: open it */
-		ut_a(node->n_pending == 0);
-
-		if (!fil_node_open_file(node)) {
-			return(false);
-		}
-	}
-
-	if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) {
-		/* The node is in the LRU list, remove it */
-		ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0);
-		UT_LIST_REMOVE(fil_system.LRU, node);
-	}
-
-	node->n_pending++;
-
-	return(true);
-}
-
-/** Update the data structures when an i/o operation finishes.
-@param[in,out] node		file node
-@param[in] type			IO context */
-static
-void
-fil_node_complete_io(fil_node_t* node, const IORequest& type)
-{
-	ut_ad(mutex_own(&fil_system.mutex));
-	ut_a(node->n_pending > 0);
-
-	--node->n_pending;
-
-	ut_ad(type.validate());
-
-	if (type.is_write()) {
-
-		ut_ad(!srv_read_only_mode
-		      || node->space->purpose == FIL_TYPE_TEMPORARY);
-
-		if (fil_buffering_disabled(node->space)) {
-
-			/* We don't need to keep track of unflushed
-			changes as user has explicitly disabled
-			buffering. */
-			ut_ad(!node->space->is_in_unflushed_spaces);
-			ut_ad(node->needs_flush == false);
-
-		} else {
-			node->needs_flush = true;
-
-			if (!node->space->is_in_unflushed_spaces) {
-				node->space->is_in_unflushed_spaces = true;
-				fil_system.unflushed_spaces.push_front(
-					*node->space);
-			}
-		}
-	}
-
-	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
-
-		/* The node must be put back to the LRU list */
-		UT_LIST_ADD_FIRST(fil_system.LRU, node);
-	}
-}
-
-/** Compose error message about an invalid page access.
-@param[in]	block_offset	block offset
-@param[in]	space_id	space id
-@param[in]	space_name	space name
-@param[in]	byte_offset	byte offset
-@param[in]	len		I/O length
-@param[in]	is_read		I/O type
-@return	std::string with error message */
-static std::string fil_invalid_page_access_msg(size_t block_offset,
-                                               size_t space_id,
-                                               const char *space_name,
-                                               size_t byte_offset, size_t len,
-                                               bool is_read)
+/** Report information about an invalid page access. */
+ATTRIBUTE_COLD
+static void fil_invalid_page_access_msg(bool fatal, const char *name,
+                                        os_offset_t offset, ulint len,
+                                        bool is_read)
 {
-  std::stringstream ss;
-  ss << "Trying to " << (is_read ? "read" : "write") << " page number "
-     << block_offset << " in space " << space_id << ", space name "
-     << space_name << ", which is outside the tablespace bounds. Byte offset "
-     << byte_offset << ", len " << len
-     << (space_id == 0 && !srv_was_started
-             ? "Please check that the configuration matches"
-               " the InnoDB system tablespace location (ibdata files)"
-             : "");
-  return ss.str();
+  sql_print_error("%s%s %zu bytes at " UINT64PF
+                  " outside the bounds of the file: %s",
+                  fatal ? "[FATAL] InnoDB: " : "InnoDB: ",
+                  is_read ? "Trying to read" : "Trying to write",
+                  len, offset, name);
+  if (fatal)
+    abort();
 }
 
-inline void IORequest::set_fil_node(fil_node_t* node)
+/** Update the data structures on write completion */
+inline void fil_node_t::complete_write()
 {
-	if (!node->space->punch_hole) {
-		clear_punch_hole();
-	}
+  ut_ad(!mutex_own(&fil_system.mutex));
 
-	m_fil_node = node;
+  if (space->purpose != FIL_TYPE_TEMPORARY &&
+      srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC &&
+      space->set_needs_flush())
+  {
+    mutex_enter(&fil_system.mutex);
+    if (!space->is_in_unflushed_spaces)
+    {
+      space->is_in_unflushed_spaces= true;
+      fil_system.unflushed_spaces.push_front(*space);
+    }
+    mutex_exit(&fil_system.mutex);
+  }
 }
 
-/** Reads or writes data. This operation could be asynchronous (aio).
-
-@param[in,out] type	IO context
-@param[in] sync		true if synchronous aio is desired
-@param[in] page_id	page id
-@param[in] zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] byte_offset	remainder of offset in bytes; in aio this
-			must be divisible by the OS block size
-@param[in] len		how many bytes to read or write; this must
-			not cross a file boundary; in aio this must
-			be a block size multiple
-@param[in,out] buf	buffer where to store read data or from where
-			to write; in aio this must be appropriately
-			aligned
-@param[in] message	message for aio handler if non-sync aio
-			used, else ignored
-@param[in] ignore_missing_space true=ignore missing space duging read
-@return DB_SUCCESS, or DB_TABLESPACE_DELETED
-	if we are trying to do i/o on a tablespace which does not exist */
-dberr_t
-fil_io(
-	const IORequest&	type,
-	bool			sync,
-	const page_id_t		page_id,
-	ulint			zip_size,
-	ulint			byte_offset,
-	ulint			len,
-	void*			buf,
-	void*			message,
-	bool			ignore_missing_space)
+/** Read or write data.
+@param type     I/O context
+@param offset   offset in bytes
+@param len      number of bytes
+@param buf      the data to be read or written
+@param bpage    buffer block (for type.is_async() completion callback)
+@return status and file descriptor */
+fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len,
+                         void *buf, buf_page_t *bpage)
 {
-	os_offset_t		offset;
-	IORequest		req_type(type);
-
-	ut_ad(req_type.validate());
-
-	ut_ad(len > 0);
-	ut_ad(byte_offset < srv_page_size);
-	ut_ad(!zip_size || byte_offset == 0);
-	ut_ad(srv_page_size == 1UL << srv_page_size_shift);
-	compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MAX)
-			    == UNIV_PAGE_SIZE_MAX);
-	compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MIN)
-			    == UNIV_PAGE_SIZE_MIN);
+	ut_ad(referenced());
+	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_ad((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
 	ut_ad(fil_validate_skip());
+	ut_ad(type.is_read() || type.is_write());
+	ut_ad(type.type != IORequest::DBLWR_BATCH);
 
-	/* ibuf bitmap pages must be read in the sync AIO mode: */
-	ut_ad(recv_no_ibuf_operations
-	      || req_type.is_write()
-	      || !ibuf_bitmap_page(page_id, zip_size)
-	      || sync
-	      || req_type.is_log());
-
-	ulint	mode;
-
-	if (sync) {
-
-		mode = OS_AIO_SYNC;
-
-	} else if (req_type.is_log()) {
-
-		mode = OS_AIO_LOG;
-
-	} else if (req_type.is_read()
-		   && !recv_no_ibuf_operations
-		   && ibuf_page(page_id, zip_size, NULL)) {
-
-		mode = OS_AIO_IBUF;
-
-		/* Reduce probability of deadlock bugs in connection with ibuf:
-		do not let the ibuf i/o handler sleep */
-
-		req_type.clear_do_not_wake();
-	} else {
-		mode = OS_AIO_NORMAL;
-	}
-
-	if (req_type.is_read()) {
-
+	if (type.is_read()) {
 		srv_stats.data_read.add(len);
-
-	} else if (req_type.is_write()) {
-
-		ut_ad(!srv_read_only_mode
-		      || fsp_is_system_temporary(page_id.space()));
-
+	} else {
+		ut_ad(!srv_read_only_mode || this == fil_system.temp_space);
 		srv_stats.data_written.add(len);
 	}
 
-	/* Reserve the fil_system mutex and make sure that we can open at
-	least one file while holding it, if the file is not already open */
-
-	fil_mutex_enter_and_prepare_for_io(page_id.space());
-
-	fil_space_t*	space = fil_space_get_by_id(page_id.space());
-
-	/* If we are deleting a tablespace we don't allow async read operations
-	on that. However, we do allow write operations and sync read operations. */
-	if (space == NULL
-	    || (req_type.is_read()
-		&& !sync
-		&& space->is_stopping()
-		&& !space->is_being_truncated)) {
+	fil_node_t* node= UT_LIST_GET_FIRST(chain);
+	ut_ad(node);
 
-		mutex_exit(&fil_system.mutex);
-
-		if (!req_type.ignore_missing() && !ignore_missing_space) {
-			ib::error()
-				<< "Trying to do I/O to a tablespace which"
-				" does not exist. I/O type: "
-				<< (req_type.is_read() ? "read" : "write")
-				<< ", page: " << page_id
-				<< ", I/O length: " << len << " bytes";
-		}
-
-		return(DB_TABLESPACE_DELETED);
+	if (type.type == IORequest::READ_ASYNC && is_stopping()
+	    && !is_being_truncated) {
+		release();
+		return {DB_TABLESPACE_DELETED, nullptr};
 	}
 
-	ut_ad(mode != OS_AIO_IBUF || fil_type_is_data(space->purpose));
-
-	ulint		cur_page_no = page_id.page_no();
-	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
-
-	for (;;) {
-
-		if (node == NULL) {
-
-			if (req_type.ignore_missing()) {
-				mutex_exit(&fil_system.mutex);
-				return(DB_ERROR);
-			}
+	ulint p = static_cast<ulint>(offset >> srv_page_size_shift);
+	bool fatal;
 
-			if (space->purpose == FIL_TYPE_IMPORT) {
-				mutex_exit(&fil_system.mutex);
-				ib::error() << fil_invalid_page_access_msg(
-					page_id.page_no(), page_id.space(),
-					space->name, byte_offset, len,
-					req_type.is_read());
-
-				return DB_IO_ERROR;
-			}
-
-			ib::fatal() << fil_invalid_page_access_msg(
-				page_id.page_no(), page_id.space(),
-				space->name, byte_offset, len,
-				req_type.is_read());
-
-		} else if (fil_is_user_tablespace_id(space->id)
-			   && node->size == 0) {
-
-			/* We do not know the size of a single-table tablespace
-			before we open the file */
-			break;
-
-		} else if (node->size > cur_page_no) {
-			/* Found! */
-			break;
-
-		} else {
-			cur_page_no -= node->size;
+	if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) {
+		ut_ad(this == fil_system.sys_space
+		      || this == fil_system.temp_space);
+		ut_ad(!(offset & ((1 << srv_page_size_shift) - 1)));
 
+		while (node->size <= p) {
+			p -= node->size;
 			node = UT_LIST_GET_NEXT(chain, node);
-		}
-	}
-
-	/* Open file if closed */
-	if (!fil_node_prepare_for_io(node, space)) {
-		if (fil_type_is_data(space->purpose)
-		    && fil_is_user_tablespace_id(space->id)) {
-			mutex_exit(&fil_system.mutex);
+			if (!node) {
+				if (type.type == IORequest::READ_ASYNC) {
+					release();
+					return {DB_ERROR, nullptr};
+				}
 
-			if (!req_type.ignore_missing()) {
-				ib::error()
-					<< "Trying to do I/O to a tablespace"
-					" which exists without .ibd data file."
-					" I/O type: "
-					<< (req_type.is_read()
-					    ? "read" : "write")
-					<< ", page: "
-					<< page_id_t(page_id.space(),
-						     cur_page_no)
-					<< ", I/O length: " << len << " bytes";
+				fatal = true;
+fail:				/* node is NULL if the chain was exhausted */
+				fil_invalid_page_access_msg(fatal, (node ? node
+					: UT_LIST_GET_FIRST(chain))->name,
+					offset, len, type.is_read());
+				return {DB_IO_ERROR, nullptr};
 			}
-
-			return(DB_TABLESPACE_DELETED);
 		}
 
-		/* The tablespace is for log. Currently, we just assert here
-		to prevent handling errors along the way fil_io returns.
-		Also, if the log files are missing, it would be hard to
-		promise the server can continue running. */
-		ut_a(0);
+		offset = os_offset_t{p} << srv_page_size_shift;
 	}
 
-	/* Check that at least the start offset is within the bounds of a
-	single-table tablespace, including rollback tablespaces. */
-	if (node->size <= cur_page_no
-	    && space->id != TRX_SYS_SPACE
-	    && fil_type_is_data(space->purpose)) {
+	if (UNIV_UNLIKELY(node->size <= p)) {
+		release();
 
-		if (req_type.ignore_missing()) {
+		if (type.type == IORequest::READ_ASYNC) {
 			/* If we can tolerate the non-existent pages, we
 			should return with DB_ERROR and let caller decide
 			what to do. */
-			fil_node_complete_io(node, req_type);
-			mutex_exit(&fil_system.mutex);
-			return(DB_ERROR);
+			return {DB_ERROR, nullptr};
 		}
 
-		ib::fatal() << fil_invalid_page_access_msg(
-			page_id.page_no(), page_id.space(),
-			space->name, byte_offset, len, req_type.is_read());
+		fatal = node->space->purpose != FIL_TYPE_IMPORT;
+		goto fail;
 	}
 
-	/* Now we have made the changes in the data structures of fil_system */
-	mutex_exit(&fil_system.mutex);
-
-	if (!zip_size) zip_size = srv_page_size;
-
-	offset = os_offset_t(cur_page_no) * zip_size + byte_offset;
-	ut_ad(node->size - cur_page_no >= (len + (zip_size - 1)) / zip_size);
-
-	/* Do AIO */
+	dberr_t err;
 
-	ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
-	ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
-
-	const char* name = node->name == NULL ? space->name : node->name;
-
-	req_type.set_fil_node(node);
-
-	ut_ad(!req_type.is_write()
-	      || page_id.space() == SRV_LOG_SPACE_FIRST_ID
-	      || !fil_is_user_tablespace_id(page_id.space())
-	      || offset == page_id.page_no() * zip_size);
-
-	/* Queue the aio request */
-	dberr_t err = os_aio(
-		req_type,
-		mode, name, node->handle, buf, offset, len,
-		space->purpose != FIL_TYPE_TEMPORARY
-		&& srv_read_only_mode,
-		node, message);
+	if (type.type == IORequest::PUNCH_RANGE) {
+		err = os_file_punch_hole(node->handle, offset, len);
+		/* Hole punching is not supported for this file:
+		stop requesting it for this tablespace */
+		if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) {
+			punch_hole = false;
+			err = DB_SUCCESS;
+		}
+		goto release_sync_write;
+	} else {
+		/* Queue the aio request */
+		err = os_aio(IORequest(bpage, node, type.type),
+			     buf, offset, len);
+	}
 
 	/* We an try to recover the page from the double write buffer if
 	the decompression fails or the page is corrupt. */
 
-	ut_a(req_type.is_dblwr_recover() || err == DB_SUCCESS);
-
-	if (sync) {
-		/* The i/o operation is already completed when we return from
-		os_aio: */
-
-		mutex_enter(&fil_system.mutex);
-
-		fil_node_complete_io(node, req_type);
-
-		mutex_exit(&fil_system.mutex);
-
+	ut_a(type.type == IORequest::DBLWR_RECOVER || err == DB_SUCCESS);
+	if (!type.is_async() && type.is_write()) {
+release_sync_write:
+		node->complete_write();
+release:	/* also the exit path for failed requests */
+		release();
+		return {err, node}; /* falling through would loop forever */
+	} else if (!type.is_async()) {
 		ut_ad(fil_validate_skip());
 	}
-
-	return(err);
-}
-
-/**********************************************************************//**
-Waits for an aio operation to complete. This function is used to write the
-handler for completed requests. The aio array of pending requests is divided
-into segments (see os0file.cc for more info). The thread specifies which
-segment it wants to wait for. */
-void
-fil_aio_wait(
-/*=========*/
-	ulint	segment)	/*!< in: the number of the segment in the aio
-				array to wait for */
-{
-	fil_node_t*	node;
-	IORequest	type;
-	void*		message;
-
-	ut_ad(fil_validate_skip());
-
-	dberr_t	err = os_aio_handler(segment, &node, &message, &type);
-
-	ut_a(err == DB_SUCCESS);
-
-	if (node == NULL) {
-		ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
-		return;
-	}
-
-	srv_set_io_thread_op_info(segment, "complete io for fil node");
-
-	mutex_enter(&fil_system.mutex);
-
-	fil_node_complete_io(node, type);
-	const fil_type_t	purpose	= node->space->purpose;
-	const ulint		space_id= node->space->id;
-	const bool		dblwr	= node->space->use_doublewrite();
-
-	mutex_exit(&fil_system.mutex);
-
-	ut_ad(fil_validate_skip());
-
-	/* Do the i/o handling */
-	/* IMPORTANT: since i/o handling for reads will read also the insert
-	buffer in tablespace 0, you have to be very careful not to introduce
-	deadlocks in the i/o system. We keep tablespace 0 data files always
-	open, and use a special i/o thread to serve insert buffer requests. */
-
-	switch (purpose) {
-	case FIL_TYPE_LOG:
-		srv_set_io_thread_op_info(segment, "complete io for log");
-		/* We use synchronous writing of the logs
-		and can only end up here when writing a log checkpoint! */
-		ut_a(ptrdiff_t(message) == 1);
-		/* It was a checkpoint write */
-		switch (srv_flush_t(srv_file_flush_method)) {
-		case SRV_O_DSYNC:
-		case SRV_NOSYNC:
-			break;
-		case SRV_FSYNC:
-		case SRV_LITTLESYNC:
-		case SRV_O_DIRECT:
-		case SRV_O_DIRECT_NO_FSYNC:
-#ifdef _WIN32
-		case SRV_ALL_O_DIRECT_FSYNC:
-#endif
-			fil_flush(SRV_LOG_SPACE_FIRST_ID);
-		}
-
-		DBUG_PRINT("ib_log", ("checkpoint info written"));
-		log_sys.complete_checkpoint();
-		return;
-	case FIL_TYPE_TABLESPACE:
-	case FIL_TYPE_TEMPORARY:
-	case FIL_TYPE_IMPORT:
-		srv_set_io_thread_op_info(segment, "complete io for buf page");
-
-		/* async single page writes from the dblwr buffer don't have
-		access to the page */
-		buf_page_t* bpage = static_cast<buf_page_t*>(message);
-		if (!bpage) {
-			return;
-		}
-
-		ulint offset = bpage->id.page_no();
-		dberr_t err = buf_page_io_complete(bpage, dblwr);
-		if (err == DB_SUCCESS) {
-			return;
-		}
-
-		ut_ad(type.is_read());
-		if (recv_recovery_is_on() && !srv_force_recovery) {
-			recv_sys.found_corrupt_fs = true;
-		}
-
-		if (fil_space_t* space = fil_space_acquire_for_io(space_id)) {
-			if (space == node->space) {
-				ib::error() << "Failed to read file '"
-					    << node->name
-					    << "' at offset " << offset
-					    << ": " << err;
-			}
-
-			space->release_for_io();
-		}
-		return;
+	if (err != DB_SUCCESS) {
+		goto release;
 	}
-
-	ut_ad(0);
+	return {err, node};
 }
 
-/**********************************************************************//**
-Flushes to disk possible writes cached by the OS. If the space does not exist
-or is being dropped, does not do anything. */
-void
-fil_flush(
-/*======*/
-	ulint	space_id)	/*!< in: file space id (this can be a group of
-				log files or a tablespace of the database) */
-{
-	mutex_enter(&fil_system.mutex);
+#include <tpool.h>
 
-	if (fil_space_t* space = fil_space_get_by_id(space_id)) {
-		if (space->purpose != FIL_TYPE_TEMPORARY
-		    && !space->is_stopping()) {
-			fil_flush_low(space);
-		}
-	}
-
-	mutex_exit(&fil_system.mutex);
-}
-
-/** Flush a tablespace.
-@param[in,out]	space	tablespace to flush */
-void
-fil_flush(fil_space_t* space)
+/** Callback for AIO completion */
+void fil_aio_callback(const IORequest &request)
 {
-	ut_ad(space->pending_io());
-	ut_ad(space->purpose == FIL_TYPE_TABLESPACE
-	      || space->purpose == FIL_TYPE_IMPORT);
+  ut_ad(fil_validate_skip());
+  ut_ad(request.node);
 
-	if (!space->is_stopping()) {
-		mutex_enter(&fil_system.mutex);
-		if (!space->is_stopping()) {
-			fil_flush_low(space);
-		}
-		mutex_exit(&fil_system.mutex);
-	}
+  if (!request.bpage)
+  {
+    ut_ad(!srv_read_only_mode);
+    if (request.type == IORequest::DBLWR_BATCH)
+      buf_dblwr.flush_buffered_writes_completed(request);
+    else
+      ut_ad(request.type == IORequest::WRITE_ASYNC);
+write_completed:
+    request.node->complete_write();
+  }
+  else if (request.is_write())
+  {
+    buf_page_write_complete(request);
+    goto write_completed;
+  }
+  else
+  {
+    ut_ad(request.is_read());
+
+    /* IMPORTANT: since i/o handling for reads will read also the insert
+    buffer in fil_system.sys_space, we have to be very careful not to
+    introduce deadlocks. We never close fil_system.sys_space data
+    files and never issue asynchronous reads of change buffer pages. */
+    const page_id_t id(request.bpage->id());
+
+    if (dberr_t err= buf_page_read_complete(request.bpage, *request.node))
+    {
+      if (recv_recovery_is_on() && !srv_force_recovery)
+        recv_sys.found_corrupt_fs= true;
+
+      ib::error() << "Failed to read page " << id.page_no()
+                  << " from file '" << request.node->name << "': " << err;
+    }
+  }
+
+  request.node->space->release();
 }
 
 /** Flush to disk the writes in file spaces of the given type
-possibly cached by the OS.
-@param[in]	purpose	FIL_TYPE_TABLESPACE or FIL_TYPE_LOG */
-void
-fil_flush_file_spaces(
-	fil_type_t	purpose)
+possibly cached by the OS. */
+void fil_flush_file_spaces()
 {
-	ulint*		space_ids;
-	ulint		n_space_ids;
-
-	ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_LOG);
-
-	mutex_enter(&fil_system.mutex);
-
-	n_space_ids = fil_system.unflushed_spaces.size();
-	if (n_space_ids == 0) {
-
-		mutex_exit(&fil_system.mutex);
-		return;
-	}
-
-	space_ids = static_cast<ulint*>(
-		ut_malloc_nokey(n_space_ids * sizeof(*space_ids)));
-
-	n_space_ids = 0;
-
-	for (sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator it
-	     = fil_system.unflushed_spaces.begin(),
-	     end = fil_system.unflushed_spaces.end();
-	     it != end; ++it) {
-
-		if (it->purpose == purpose && !it->is_stopping()) {
-			space_ids[n_space_ids++] = it->id;
-		}
-	}
-
-	mutex_exit(&fil_system.mutex);
+  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+  {
+    ut_d(mutex_enter(&fil_system.mutex));
+    ut_ad(fil_system.unflushed_spaces.empty());
+    ut_d(mutex_exit(&fil_system.mutex));
+    return;
+  }
 
-	/* Flush the spaces.  It will not hurt to call fil_flush() on
-	a non-existing space id. */
-	for (ulint i = 0; i < n_space_ids; i++) {
+rescan:
+  mutex_enter(&fil_system.mutex);
 
-		fil_flush(space_ids[i]);
-	}
+  for (fil_space_t &space : fil_system.unflushed_spaces)
+  {
+    if (space.needs_flush_not_stopping())
+    {
+      space.reacquire();
+      mutex_exit(&fil_system.mutex);
+      space.flush_low();
+      space.release();
+      goto rescan;
+    }
+  }
 
-	ut_free(space_ids);
+  mutex_exit(&fil_system.mutex);
 }
 
 /** Functor to validate the file node list of a tablespace. */
@@ -4506,7 +3476,6 @@ struct	Check {
 	@param[in]	elem	file node to visit */
 	void	operator()(const fil_node_t* elem)
 	{
-		ut_a(elem->is_open() || !elem->n_pending);
 		n_open += elem->is_open();
 		size += elem->size;
 	}
@@ -4541,11 +3510,8 @@ struct	Check {
 /******************************************************************//**
 Checks the consistency of the tablespace cache.
 @return true if ok */
-bool
-fil_validate(void)
-/*==============*/
+bool fil_validate()
 {
-	fil_node_t*	fil_node;
 	ulint		n_open		= 0;
 
 	mutex_enter(&fil_system.mutex);
@@ -4558,56 +3524,11 @@ fil_validate(void)
 
 	ut_a(fil_system.n_open == n_open);
 
-	ut_list_validate(fil_system.LRU);
-
-	for (fil_node = UT_LIST_GET_FIRST(fil_system.LRU);
-	     fil_node != 0;
-	     fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
-
-		ut_a(fil_node->n_pending == 0);
-		ut_a(!fil_node->being_extended);
-		ut_a(fil_node->is_open());
-		ut_a(fil_space_belongs_in_lru(fil_node->space));
-	}
-
 	mutex_exit(&fil_system.mutex);
 
 	return(true);
 }
 
-/********************************************************************//**
-Returns true if file address is undefined.
-@return true if undefined */
-bool
-fil_addr_is_null(
-/*=============*/
-	fil_addr_t	addr)	/*!< in: address */
-{
-	return(addr.page == FIL_NULL);
-}
-
-/********************************************************************//**
-Get the predecessor of a file page.
-@return FIL_PAGE_PREV */
-ulint
-fil_page_get_prev(
-/*==============*/
-	const byte*	page)	/*!< in: file page */
-{
-	return(mach_read_from_4(page + FIL_PAGE_PREV));
-}
-
-/********************************************************************//**
-Get the successor of a file page.
-@return FIL_PAGE_NEXT */
-ulint
-fil_page_get_next(
-/*==============*/
-	const byte*	page)	/*!< in: file page */
-{
-	return(mach_read_from_4(page + FIL_PAGE_NEXT));
-}
-
 /*********************************************************************//**
 Sets the file page type. */
 void
@@ -4644,80 +3565,6 @@ fil_delete_file(
 	}
 }
 
-/** Generate redo log for swapping two .ibd files
-@param[in]	old_table	old table
-@param[in]	new_table	new table
-@param[in]	tmp_name	temporary table name
-@param[in,out]	mtr		mini-transaction
-@return innodb error code */
-dberr_t
-fil_mtr_rename_log(
-	const dict_table_t*	old_table,
-	const dict_table_t*	new_table,
-	const char*		tmp_name,
-	mtr_t*			mtr)
-{
-	ut_ad(old_table->space != fil_system.temp_space);
-	ut_ad(new_table->space != fil_system.temp_space);
-	ut_ad(old_table->space->id == old_table->space_id);
-	ut_ad(new_table->space->id == new_table->space_id);
-
-	/* If neither table is file-per-table,
-	there will be no renaming of files. */
-	if (!old_table->space_id && !new_table->space_id) {
-		return(DB_SUCCESS);
-	}
-
-	const bool has_data_dir = DICT_TF_HAS_DATA_DIR(old_table->flags);
-
-	if (old_table->space_id) {
-		char*	tmp_path = fil_make_filepath(
-			has_data_dir ? old_table->data_dir_path : NULL,
-			tmp_name, IBD, has_data_dir);
-		if (tmp_path == NULL) {
-			return(DB_OUT_OF_MEMORY);
-		}
-
-		const char* old_path = old_table->space->chain.start->name;
-		/* Temp filepath must not exist. */
-		dberr_t err = fil_rename_tablespace_check(
-			old_path, tmp_path, !old_table->space);
-		if (err != DB_SUCCESS) {
-			ut_free(tmp_path);
-			return(err);
-		}
-
-		fil_name_write_rename_low(
-			old_table->space_id, 0, old_path, tmp_path, mtr);
-
-		ut_free(tmp_path);
-	}
-
-	if (new_table->space_id) {
-		const char* new_path = new_table->space->chain.start->name;
-		char* old_path = fil_make_filepath(
-			has_data_dir ? old_table->data_dir_path : NULL,
-			old_table->name.m_name, IBD, has_data_dir);
-
-		/* Destination filepath must not exist unless this ALTER
-		TABLE starts and ends with a file_per-table tablespace. */
-		if (!old_table->space_id) {
-			dberr_t err = fil_rename_tablespace_check(
-				new_path, old_path, !new_table->space);
-			if (err != DB_SUCCESS) {
-				ut_free(old_path);
-				return(err);
-			}
-		}
-
-		fil_name_write_rename_low(
-			new_table->space_id, 0, new_path, old_path, mtr);
-		ut_free(old_path);
-	}
-
-	return DB_SUCCESS;
-}
-
 #ifdef UNIV_DEBUG
 /** Check that a tablespace is valid for mtr_commit().
 @param[in]	space	persistent tablespace that has been changed */
@@ -4734,22 +3581,14 @@ fil_space_validate_for_mtr_commit(
 	/* We are serving mtr_commit(). While there is an active
 	mini-transaction, we should have !space->stop_new_ops. This is
 	guaranteed by meta-data locks or transactional locks, or
-	dict_sys.latch (X-lock in DROP, S-lock in purge).
-
-	However, a file I/O thread can invoke change buffer merge
-	while fil_check_pending_operations() is waiting for operations
-	to quiesce. This is not a problem, because
-	ibuf_merge_or_delete_for_page() would call
-	fil_space_acquire() before mtr_start() and
-	fil_space_t::release() after mtr_commit(). This is why
-	n_pending_ops should not be zero if stop_new_ops is set. */
+	dict_sys.latch (X-lock in DROP, S-lock in purge). */
 	ut_ad(!space->is_stopping()
 	      || space->is_being_truncated /* fil_truncate_prepare() */
 	      || space->referenced());
 }
 #endif /* UNIV_DEBUG */
 
-/** Write a MLOG_FILE_NAME record for a persistent tablespace.
+/** Write a FILE_MODIFY record for a persistent tablespace.
 @param[in]	space	tablespace
 @param[in,out]	mtr	mini-transaction */
 static
@@ -4759,7 +3598,7 @@ fil_names_write(
 	mtr_t*			mtr)
 {
 	ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
-	fil_name_write(space, 0, UT_LIST_GET_FIRST(space->chain), mtr);
+	fil_name_write(space->id, UT_LIST_GET_FIRST(space->chain)->name, mtr);
 }
 
 /** Note that a non-predefined persistent tablespace has been modified
@@ -4769,47 +3608,47 @@ void
 fil_names_dirty(
 	fil_space_t*	space)
 {
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	ut_ad(recv_recovery_is_on());
-	ut_ad(log_sys.lsn != 0);
+	ut_ad(log_sys.get_lsn() != 0);
 	ut_ad(space->max_lsn == 0);
 	ut_d(fil_space_validate_for_mtr_commit(space));
 
 	UT_LIST_ADD_LAST(fil_system.named_spaces, space);
-	space->max_lsn = log_sys.lsn;
+	space->max_lsn = log_sys.get_lsn();
 }
 
-/** Write MLOG_FILE_NAME records when a non-predefined persistent
+/** Write FILE_MODIFY records when a non-predefined persistent
 tablespace was modified for the first time since the latest
 fil_names_clear().
-@param[in,out]	space	tablespace
-@param[in,out]	mtr	mini-transaction */
-void
-fil_names_dirty_and_write(
-	fil_space_t*	space,
-	mtr_t*		mtr)
+@param[in,out]	space	tablespace */
+void fil_names_dirty_and_write(fil_space_t* space)
 {
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	ut_d(fil_space_validate_for_mtr_commit(space));
-	ut_ad(space->max_lsn == log_sys.lsn);
+	ut_ad(space->max_lsn == log_sys.get_lsn());
 
 	UT_LIST_ADD_LAST(fil_system.named_spaces, space);
-	fil_names_write(space, mtr);
+	mtr_t mtr;
+	mtr.start();
+	fil_names_write(space, &mtr);
 
 	DBUG_EXECUTE_IF("fil_names_write_bogus",
 			{
 				char bogus_name[] = "./test/bogus file.ibd";
 				os_normalize_path(bogus_name);
 				fil_name_write(
-					SRV_LOG_SPACE_FIRST_ID, 0,
-					bogus_name, mtr);
+					SRV_SPACE_ID_UPPER_BOUND,
+					bogus_name, &mtr);
 			});
+
+	mtr.commit_files();
 }
 
 /** On a log checkpoint, reset fil_names_dirty_and_write() flags
-and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed.
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
 @param[in]	lsn		checkpoint LSN
-@param[in]	do_write	whether to always write MLOG_CHECKPOINT
+@param[in]	do_write	whether to always write FILE_CHECKPOINT
 @return whether anything was written to the redo log
 @retval false	if no flags were set and nothing written
 @retval true	if anything was written to the redo log */
@@ -4819,24 +3658,28 @@ fil_names_clear(
 	bool	do_write)
 {
 	mtr_t	mtr;
-	ulint	mtr_checkpoint_size = LOG_CHECKPOINT_FREE_PER_THREAD;
+	ulint	mtr_checkpoint_size = RECV_SCAN_SIZE - 1;
 
 	DBUG_EXECUTE_IF(
 		"increase_mtr_checkpoint_size",
 		mtr_checkpoint_size = 75 * 1024;
 		);
 
-	ut_ad(log_mutex_own());
-
-	if (log_sys.append_on_checkpoint) {
-		mtr_write_log(log_sys.append_on_checkpoint);
-		do_write = true;
-	}
+	mysql_mutex_assert_owner(&log_sys.mutex);
+	ut_ad(lsn);
 
 	mtr.start();
 
 	for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.named_spaces);
 	     space != NULL; ) {
+		if (mtr.get_log()->size()
+		    + (3 + 5 + 1) + strlen(space->chain.start->name)
+		    >= mtr_checkpoint_size) {
+			/* Prevent log parse buffer overflow */
+			mtr.commit_files();
+			mtr.start();
+		}
+
 		fil_space_t*	next = UT_LIST_GET_NEXT(named_spaces, space);
 
 		ut_ad(space->max_lsn > 0);
@@ -4858,24 +3701,11 @@ fil_names_clear(
 		fil_names_write(space, &mtr);
 		do_write = true;
 
-		const mtr_buf_t* mtr_log = mtr_get_log(&mtr);
-
-		/** If the mtr buffer size exceeds the size of
-		LOG_CHECKPOINT_FREE_PER_THREAD then commit the multi record
-		mini-transaction, start the new mini-transaction to
-		avoid the parsing buffer overflow error during recovery. */
-
-		if (mtr_log->size() > mtr_checkpoint_size) {
-			ut_ad(mtr_log->size() < (RECV_PARSING_BUF_SIZE / 2));
-			mtr.commit_checkpoint(lsn, false);
-			mtr.start();
-		}
-
 		space = next;
 	}
 
 	if (do_write) {
-		mtr.commit_checkpoint(lsn, true);
+		mtr.commit_files(lsn);
 	} else {
 		ut_ad(!mtr.has_modifications());
 	}
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
index 21b148223f6..25b039aa9f1 100644
--- a/storage/innobase/fil/fil0pagecompress.cc
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -85,13 +85,13 @@ static ulint fil_page_compress_low(
 	byte*		out_buf,
 	ulint		header_len,
 	ulint		comp_algo,
-	ulint		comp_level)
+	unsigned	comp_level)
 {
 	ulint write_size = srv_page_size - header_len;
 
 	switch (comp_algo) {
 	default:
-		ut_ad(!"unknown compression method");
+		ut_ad("unknown compression method" == 0);
 		/* fall through */
 	case PAGE_UNCOMPRESSED:
 		return 0;
@@ -200,7 +200,7 @@ static ulint fil_page_compress_for_full_crc32(
 	ulint		block_size,
 	bool		encrypted)
 {
-	ulint comp_level = fsp_flags_get_page_compression_level(flags);
+	ulint comp_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags);
 
 	if (comp_level == 0) {
 		comp_level = page_zip_level;
@@ -210,7 +210,8 @@ static ulint fil_page_compress_for_full_crc32(
 
 	ulint write_size = fil_page_compress_low(
 		buf, out_buf, header_len,
-		fil_space_t::get_compression_algo(flags), comp_level);
+		fil_space_t::get_compression_algo(flags),
+		static_cast<unsigned>(comp_level));
 
 	if (write_size == 0) {
 fail:
@@ -273,7 +274,8 @@ static ulint fil_page_compress_for_non_full_crc32(
 	ulint		block_size,
 	bool		encrypted)
 {
-	int comp_level = int(fsp_flags_get_page_compression_level(flags));
+	uint comp_level = static_cast<uint>(
+		FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags));
 	ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
 	/* Cache to avoid change during function execution */
 	ulint comp_algo = innodb_compression_algorithm;
@@ -285,7 +287,7 @@ static ulint fil_page_compress_for_non_full_crc32(
 	/* If no compression level was provided to this table, use system
 	default level */
 	if (comp_level == 0) {
-		comp_level = int(page_zip_level);
+		comp_level = page_zip_level;
 	}
 
 	ulint write_size = fil_page_compress_low(
@@ -321,9 +323,6 @@ static ulint fil_page_compress_for_non_full_crc32(
 	mach_write_to_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE,
 			write_size);
 
-	ut_ad(fil_page_is_compressed(out_buf)
-	      || fil_page_is_compressed_encrypted(out_buf));
-
 	ut_ad(mach_read_from_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM)
 	      == BUF_NO_CHECKSUM_MAGIC);
 
@@ -439,7 +438,9 @@ static bool fil_page_decompress_low(
 		return LZ4_decompress_safe(
 			reinterpret_cast<const char*>(buf) + header_len,
 			reinterpret_cast<char*>(tmp_buf),
-			actual_size, srv_page_size) == int(srv_page_size);
+			static_cast<int>(actual_size),
+			static_cast<int>(srv_page_size)) ==
+			static_cast<int>(srv_page_size);
 #endif /* HAVE_LZ4 */
 #ifdef HAVE_LZO
 	case PAGE_LZO_ALGORITHM:
@@ -468,12 +469,12 @@ static bool fil_page_decompress_low(
 #ifdef HAVE_BZIP2
 	case PAGE_BZIP2_ALGORITHM:
 		{
-			unsigned int dst_pos = srv_page_size;
+			uint dst_pos = static_cast<uint>(srv_page_size);
 			return BZ_OK == BZ2_bzBuffToBuffDecompress(
 				reinterpret_cast<char*>(tmp_buf),
 				&dst_pos,
 				reinterpret_cast<char*>(buf) + header_len,
-				actual_size, 1, 0)
+				static_cast<uint>(actual_size), 1, 0)
 				&& dst_pos == srv_page_size;
 		}
 #endif /* HAVE_BZIP2 */
@@ -551,10 +552,9 @@ ulint fil_page_decompress_for_non_full_crc32(
 	byte*	tmp_buf,
 	byte*	buf)
 {
-	const unsigned	ptype = mach_read_from_2(buf+FIL_PAGE_TYPE);
 	ulint header_len;
 	uint comp_algo;
-	switch (ptype) {
+	switch (fil_page_get_type(buf)) {
 	case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
 		header_len= FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
 		comp_algo = mach_read_from_2(
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc
index 1390915655c..57164113647 100644
--- a/storage/innobase/fsp/fsp0file.cc
+++ b/storage/innobase/fsp/fsp0file.cc
@@ -157,7 +157,7 @@ void
 Datafile::init_file_info()
 {
 #ifdef _WIN32
-	GetFileInformationByHandle(m_handle, &m_file_info);
+	GetFileInformationByHandle((os_file_t)m_handle, &m_file_info);
 #else
 	fstat(m_handle, &m_file_info);
 #endif	/* WIN32 */
@@ -291,28 +291,23 @@ Datafile::read_first_page(bool read_only_mode)
 		}
 	}
 
-	m_first_page_buf = static_cast<byte*>(
-		ut_malloc_nokey(2 * UNIV_PAGE_SIZE_MAX));
-
 	/* Align the memory for a possible read from a raw device */
 
 	m_first_page = static_cast<byte*>(
-		ut_align(m_first_page_buf, srv_page_size));
+		aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
 
-	IORequest	request;
 	dberr_t		err = DB_ERROR;
 	size_t		page_size = UNIV_PAGE_SIZE_MAX;
 
 	/* Don't want unnecessary complaints about partial reads. */
 
-	request.disable_partial_io_warnings();
-
 	while (page_size >= UNIV_PAGE_SIZE_MIN) {
 
 		ulint	n_read = 0;
 
 		err = os_file_read_no_error_handling(
-			request, m_handle, m_first_page, 0, page_size, &n_read);
+			IORequestReadPartial, m_handle, m_first_page, 0,
+			page_size, &n_read);
 
 		if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
 
@@ -341,7 +336,17 @@ Datafile::read_first_page(bool read_only_mode)
 	}
 
 	if (m_order == 0) {
-		m_space_id = fsp_header_get_space_id(m_first_page);
+		if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page,
+				      FSP_HEADER_OFFSET + FSP_SPACE_ID
+				      + m_first_page, 4)) {
+			ib::error()
+				<< "Inconsistent tablespace ID in "
+				<< m_filepath;
+			return DB_CORRUPTION;
+		}
+
+		m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID
+					      + m_first_page);
 		m_flags = fsp_header_get_flags(m_first_page);
 		if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
 			ulint cflags = fsp_flags_convert_from_101(m_flags);
@@ -369,14 +374,10 @@ Datafile::read_first_page(bool read_only_mode)
 }
 
 /** Free the first page from memory when it is no longer needed. */
-void
-Datafile::free_first_page()
+void Datafile::free_first_page()
 {
-	if (m_first_page_buf) {
-		ut_free(m_first_page_buf);
-		m_first_page_buf = NULL;
-		m_first_page = NULL;
-	}
+  aligned_free(m_first_page);
+  m_first_page= nullptr;
 }
 
 /** Validates the datafile and checks that it conforms with the expected
@@ -504,7 +505,6 @@ Datafile::validate_first_page(lsn_t* flush_lsn)
 
 		error_txt = "Cannot read first page";
 	} else {
-		ut_ad(m_first_page_buf);
 		ut_ad(m_first_page);
 
 		if (flush_lsn != NULL) {
@@ -565,7 +565,7 @@ err_exit:
 		goto err_exit;
 	}
 
-	if (m_space_id >= SRV_LOG_SPACE_FIRST_ID) {
+	if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) {
 		error_txt = "A bad Space ID was found";
 		goto err_exit;
 	}
@@ -653,11 +653,8 @@ Datafile::find_space_id()
 			<< "Page size:" << page_size
 			<< ". Pages to analyze:" << page_count;
 
-		byte*	buf = static_cast<byte*>(
-			ut_malloc_nokey(2 * UNIV_PAGE_SIZE_MAX));
-
 		byte*	page = static_cast<byte*>(
-			ut_align(buf, UNIV_SECTOR_SIZE));
+			aligned_malloc(page_size, page_size));
 
 		ulint fsp_flags;
 		/* provide dummy value if the first os_file_read() fails */
@@ -674,19 +671,10 @@ Datafile::find_space_id()
 		}
 
 		for (ulint j = 0; j < page_count; ++j) {
-
-			dberr_t		err;
-			ulint		n_bytes = j * page_size;
-			IORequest	request(IORequest::READ);
-
-			err = os_file_read(
-				request, m_handle, page, n_bytes, page_size);
-
-			if (err != DB_SUCCESS) {
-
+			if (os_file_read(IORequestRead, m_handle, page,
+					 j * page_size, page_size)) {
 				ib::info()
 					<< "READ FAIL: page_no:" << j;
-
 				continue;
 			}
 
@@ -732,7 +720,7 @@ Datafile::find_space_id()
 			}
 		}
 
-		ut_free(buf);
+		aligned_free(page);
 
 		ib::info()
 			<< "Page size: " << page_size
@@ -815,10 +803,8 @@ Datafile::restore_from_doublewrite()
 		<< physical_size << " bytes into file '"
 		<< m_filepath << "'";
 
-	IORequest	request(IORequest::WRITE);
-
 	return(os_file_write(
-			request,
+			IORequestWrite,
 			m_filepath, m_handle, page, 0, physical_size)
 	       != DB_SUCCESS);
 }
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 8ec813b1896..962d18d8081 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -45,7 +45,7 @@ Created 11/29/1995 Heikki Tuuri
 // JAN: MySQL 5.7 Encryption
 // #include <my_aes.h>
 
-typedef ulint page_no_t;
+typedef uint32_t page_no_t;
 
 /** Return an extent to the free list of a space.
 @param[in,out]	space		tablespace
@@ -59,18 +59,6 @@ fsp_free_extent(
 	page_no_t		offset,
 	mtr_t*			mtr);
 
-/********************************************************************//**
-Marks a page used. The page must reside within the extents of the given
-segment. */
-static MY_ATTRIBUTE((nonnull))
-void
-fseg_mark_page_used(
-/*================*/
-	fseg_inode_t*	seg_inode,/*!< in: segment inode */
-	page_no_t	page,	/*!< in: page offset */
-	xdes_t*		descr,  /*!< in: extent descriptor */
-	mtr_t*		mtr);	/*!< in/out: mini-transaction */
-
 /** Returns the first extent descriptor for a segment.
 We think of the extent lists of the segment catenated in the order
 FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
@@ -100,234 +88,141 @@ void
 fsp_fill_free_list(
 	bool		init_space,
 	fil_space_t*	space,
-	fsp_header_t*	header,
+	buf_block_t*	header,
 	mtr_t*		mtr);
 
 /** Allocates a single free page from a segment.
-This function implements the intelligent allocation strategy which tries
-to minimize file space fragmentation.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
 @param[in,out]	space			tablespace
 @param[in,out]	seg_inode		segment inode
+@param[in,out]	iblock			segment inode page
 @param[in]	hint			hint of which page would be desirable
 @param[in]	direction		if the new page is needed because of
 an index page split, and records are inserted there in order, into which
 direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
 @param[in,out]	mtr			mini-transaction
 @param[in,out]	init_mtr		mtr or another mini-transaction in
-which the page should be initialized. If init_mtr != mtr, but the page is
-already latched in mtr, do not initialize the page
-@param[in]	has_done_reservation	TRUE if the space has already been
-reserved, in this case we will never return NULL
-@retval NULL	if no page could be allocated
-@retval block	rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block	(not allocated or initialized) otherwise */
+which the page should be initialized.
+@retval NULL	if no page could be allocated */
 static
 buf_block_t*
 fseg_alloc_free_page_low(
 	fil_space_t*		space,
 	fseg_inode_t*		seg_inode,
-	ulint			hint,
+	buf_block_t*		iblock,
+	uint32_t		hint,
 	byte			direction,
-	mtr_t*			mtr,
-	mtr_t*			init_mtr
 #ifdef UNIV_DEBUG
-	, ibool			has_done_reservation
+	bool			has_done_reservation,
+	/*!< whether the space has already been reserved */
 #endif /* UNIV_DEBUG */
-)
+	mtr_t*			mtr,
+	mtr_t*			init_mtr)
 	MY_ATTRIBUTE((warn_unused_result));
 
-/** Gets a pointer to the space header and x-locks its page.
-@param[in]	space		tablespace
-@param[in,out]	mtr		mini-transaction
+/** Get the tablespace header block, SX-latched
+@param[in]      space           tablespace
+@param[in,out]  mtr             mini-transaction
 @return pointer to the space header, page x-locked */
-inline fsp_header_t* fsp_get_space_header(const fil_space_t* space, mtr_t* mtr)
-{
-	buf_block_t*	block;
-	fsp_header_t*	header;
-
-	ut_ad(space->purpose != FIL_TYPE_LOG);
-
-	block = buf_page_get(page_id_t(space->id, 0), space->zip_size(),
-			     RW_SX_LATCH, mtr);
-	header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
-	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
-	ut_ad(space->id == mach_read_from_4(FSP_SPACE_ID + header));
-	return(header);
-}
-
-/**********************************************************************//**
-Gets a descriptor bit of a page.
-@return TRUE if free */
-UNIV_INLINE
-ibool
-xdes_mtr_get_bit(
-/*=============*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
-	ulint		offset,	/*!< in: page offset within extent:
-				0 ... FSP_EXTENT_SIZE - 1 */
-	mtr_t*		mtr)	/*!< in: mini-transaction */
-{
-	ut_ad(mtr->is_active());
-	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_SX_FIX));
-
-	return(xdes_get_bit(descr, bit, offset));
-}
-
-/**********************************************************************//**
-Sets a descriptor bit of a page. */
-UNIV_INLINE
-void
-xdes_set_bit(
-/*=========*/
-	xdes_t*	descr,	/*!< in: descriptor */
-	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
-	ulint	offset,	/*!< in: page offset within extent:
-			0 ... FSP_EXTENT_SIZE - 1 */
-	ibool	val,	/*!< in: bit value */
-	mtr_t*	mtr)	/*!< in/out: mini-transaction */
-{
-	ulint	index;
-	ulint	byte_index;
-	ulint	bit_index;
-	ulint	descr_byte;
-
-	ut_ad(mtr_memo_contains_page(
-			mtr, descr,
-			MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX));
-	ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
-	ut_ad(offset < FSP_EXTENT_SIZE);
-
-	index = bit + XDES_BITS_PER_PAGE * offset;
-
-	byte_index = index / 8;
-	bit_index = index % 8;
-
-	descr_byte = mach_read_from_1(descr + XDES_BITMAP + byte_index);
-	descr_byte = ut_bit_set_nth(descr_byte, bit_index, val);
-
-	mlog_write_ulint(descr + XDES_BITMAP + byte_index, descr_byte,
-			 MLOG_1BYTE, mtr);
-}
-
-/**********************************************************************//**
-Looks for a descriptor bit having the desired value. Starts from hint
-and scans upward; at the end of the extent the search is wrapped to
-the start of the extent.
-@return bit index of the bit, ULINT_UNDEFINED if not found */
-UNIV_INLINE
-ulint
-xdes_find_bit(
-/*==========*/
-	xdes_t*	descr,	/*!< in: descriptor */
-	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
-	ibool	val,	/*!< in: desired bit value */
-	ulint	hint,	/*!< in: hint of which bit position would
-			be desirable */
-	mtr_t*	mtr)	/*!< in/out: mini-transaction */
-{
-	ulint	i;
-
-	ut_ad(descr && mtr);
-	ut_ad(val <= TRUE);
-	ut_ad(hint < FSP_EXTENT_SIZE);
-	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_SX_FIX));
-	for (i = hint; i < FSP_EXTENT_SIZE; i++) {
-		if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
-
-			return(i);
-		}
-	}
-
-	for (i = 0; i < hint; i++) {
-		if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
-
-			return(i);
-		}
-	}
-
-	return(ULINT_UNDEFINED);
-}
-
-/**********************************************************************//**
-Returns the number of used pages in a descriptor.
+inline buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr)
+{
+ buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(),
+                                  RW_SX_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ ut_ad(space->id == mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
+                                     block->frame));
+ return block;
+}
+
+/** Set the XDES_FREE_BIT of a page.
+@tparam         free    desired value of XDES_FREE_BIT
+@param[in]      block   extent descriptor block
+@param[in,out]  descr   extent descriptor
+@param[in]      offset  page offset within the extent
+@param[in,out]  mtr     mini-transaction */
+template<bool free>
+inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
+                          ulint offset, mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(offset < FSP_EXTENT_SIZE);
+  ut_ad(page_align(descr) == block.frame);
+  compile_time_assert(XDES_BITS_PER_PAGE == 2);
+  compile_time_assert(XDES_FREE_BIT == 0);
+  compile_time_assert(XDES_CLEAN_BIT == 1);
+
+  ulint index= XDES_BITS_PER_PAGE * offset;
+  byte *b= &descr[XDES_BITMAP + (index >> 3)];
+  /* xdes_init() should have set all XDES_CLEAN_BIT. */
+  ut_ad(!(~*b & 0xaa));
+  /* Clear or set XDES_FREE_BIT. */
+  byte val= free
+    ? static_cast<byte>(*b | 1 << (index & 7))
+    : static_cast<byte>(*b & ~(1 << (index & 7)));
+  mtr->write<1>(block, b, val);
+}
+
+/**
+Find a free page.
+@param descr   extent descriptor
+@param hint    page offset to start searching from (towards larger pages)
+@return free page offset
+@retval FIL_NULL if no page is free */
+inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0)
+{
+  const uint32_t extent_size= FSP_EXTENT_SIZE;
+  ut_ad(hint < extent_size);
+  for (uint32_t i= hint; i < extent_size; i++)
+    if (xdes_is_free(descr, i))
+      return i;
+  for (uint32_t i= 0; i < hint; i++)
+    if (xdes_is_free(descr, i))
+      return i;
+  return FIL_NULL;
+}
+
+/**
+Determine the number of used pages in a descriptor.
+@param descr  extent descriptor
 @return number of pages used */
-UNIV_INLINE
-ulint
-xdes_get_n_used(
-/*============*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+inline uint32_t xdes_get_n_used(const xdes_t *descr)
 {
-	ulint	count	= 0;
+  uint32_t count= 0;
 
-	ut_ad(descr && mtr);
-	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_SX_FIX));
-	for (ulint i = 0; i < FSP_EXTENT_SIZE; ++i) {
-		if (FALSE == xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
-			count++;
-		}
-	}
+  for (uint32_t i= FSP_EXTENT_SIZE; i--; )
+    if (!xdes_is_free(descr, i))
+      count++;
 
-	return(count);
+  return count;
 }
 
-/**********************************************************************//**
-Returns true if extent contains no used pages.
-@return TRUE if totally free */
-UNIV_INLINE
-ibool
-xdes_is_free(
-/*=========*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+/**
+Determine whether a file extent is full.
+@param descr  extent descriptor
+@return whether all pages have been allocated */
+inline bool xdes_is_full(const xdes_t *descr)
 {
-	if (0 == xdes_get_n_used(descr, mtr)) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+  return FSP_EXTENT_SIZE == xdes_get_n_used(descr);
 }
 
-/**********************************************************************//**
-Returns true if extent contains no free pages.
-@return TRUE if full */
-UNIV_INLINE
-ibool
-xdes_is_full(
-/*=========*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-{
-	if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/**********************************************************************//**
-Sets the state of an xdes. */
-UNIV_INLINE
-void
-xdes_set_state(
-/*===========*/
-	xdes_t*	descr,	/*!< in/out: descriptor */
-	ulint	state,	/*!< in: state to set */
-	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+/** Set the state of an extent descriptor.
+@param[in]      block   extent descriptor block
+@param[in,out]  descr   extent descriptor
+@param[in]      state   the state
+@param[in,out]  mtr     mini-transaction */
+inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
+			   byte state, mtr_t *mtr)
 {
-	ut_ad(descr && mtr);
-	ut_ad(state >= XDES_FREE);
-	ut_ad(state <= XDES_FSEG);
-	ut_ad(mtr_memo_contains_page(
-			mtr, descr,
-			MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX));
-
-	mlog_write_ulint(descr + XDES_STATE, state, MLOG_4BYTES, mtr);
+  ut_ad(descr && mtr);
+  ut_ad(state >= XDES_FREE);
+  ut_ad(state <= XDES_FSEG);
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(page_align(descr) == block.frame);
+  ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
+  mtr->write<1>(block, XDES_STATE + 3 + descr, state);
 }
 
 /**********************************************************************//**
@@ -337,14 +232,11 @@ UNIV_INLINE
 ulint
 xdes_get_state(
 /*===========*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	const xdes_t*	descr)	/*!< in: descriptor */
 {
 	ulint	state;
 
-	ut_ad(descr && mtr);
-	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_SX_FIX));
-
+	ut_ad(descr);
 	state = mach_read_from_4(descr + XDES_STATE);
 	ut_ad(state - 1 < XDES_FSEG);
 	return(state);
@@ -352,50 +244,88 @@ xdes_get_state(
 
 /**********************************************************************//**
 Inits an extent descriptor to the free and clean state. */
-UNIV_INLINE
+inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  mtr->memset(&block, uint16_t(descr - block.frame) + XDES_BITMAP,
+              XDES_SIZE - XDES_BITMAP, 0xff);
+  xdes_set_state(block, descr, XDES_FREE, mtr);
+}
+
+/** Mark a page used in an extent descriptor.
+@param[in,out]  seg_inode       segment inode
+@param[in,out]  iblock          segment inode page
+@param[in]      page            page number
+@param[in,out]  descr           extent descriptor
+@param[in,out]  xdes            extent descriptor page
+@param[in,out]  mtr             mini-transaction */
+static MY_ATTRIBUTE((nonnull))
 void
-xdes_init(
-/*======*/
-	xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*	mtr)	/*!< in/out: mini-transaction */
-{
-  ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_SX_FIX |
-                               MTR_MEMO_PAGE_X_FIX));
-  mlog_memset(descr + XDES_BITMAP, XDES_SIZE - XDES_BITMAP, 0xff, mtr);
-  xdes_set_state(descr, XDES_FREE, mtr);
+fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
+                    ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
+{
+  ut_ad(fil_page_get_type(iblock->frame) == FIL_PAGE_INODE);
+  ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+  ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
+
+  const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+
+  if (!xdes_get_n_used(descr))
+  {
+    /* We move the extent from the free list to the NOT_FULL list */
+    flst_remove(iblock, uint16_t(FSEG_FREE + ioffset), xdes, xoffset, mtr);
+    flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+                  xdes, xoffset, mtr);
+  }
+
+  ut_ad(xdes_is_free(descr, page % FSP_EXTENT_SIZE));
+
+  /* We mark the page as used */
+  xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr);
+
+  byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED;
+  const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1;
+  mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+  if (xdes_is_full(descr))
+  {
+    /* We move the extent from the NOT_FULL list to the FULL list */
+    flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset), xdes, xoffset, mtr);
+    flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset), xdes, xoffset, mtr);
+    mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+                  not_full_n_used - FSP_EXTENT_SIZE);
+  }
 }
 
 /** Get pointer to a the extent descriptor of a page.
 @param[in,out]	sp_header	tablespace header page, x-latched
 @param[in]	space		tablespace
 @param[in]	offset		page offset
+@param[out]	desc_block	descriptor block
 @param[in,out]	mtr		mini-transaction
 @param[in]	init_space	whether the tablespace is being initialized
-@param[out]	desc_block	descriptor block, or NULL if it is
-the same as the tablespace header
 @return pointer to the extent descriptor, NULL if the page does not
 exist in the space or if the offset exceeds free limit */
 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
 xdes_t*
 xdes_get_descriptor_with_space_hdr(
-	fsp_header_t*		sp_header,
+	buf_block_t*		header,
 	const fil_space_t*	space,
 	page_no_t		offset,
+	buf_block_t**		desc_block,
 	mtr_t*			mtr,
-	bool			init_space = false,
-	buf_block_t**		desc_block = NULL)
+	bool			init_space = false)
 {
-	ulint	limit;
-	ulint	size;
-	ulint	descr_page_no;
-	page_t*	descr_page;
-	ut_ad(mtr_memo_contains(mtr, space, MTR_MEMO_SPACE_X_LOCK));
-	ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_SX_FIX)
-	      || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
-	ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
+	ut_ad(mtr->memo_contains(*space));
+	ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX
+					 | MTR_MEMO_PAGE_X_FIX));
 	/* Read free limit and space size */
-	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
-	size  = mach_read_from_4(sp_header + FSP_SIZE);
+	uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+					  + header->frame);
+	uint32_t size  = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+					  + header->frame);
 	ut_ad(limit == space->free_limit
 	      || (space->free_limit == 0
 		  && (init_space
@@ -409,33 +339,27 @@ xdes_get_descriptor_with_space_hdr(
 		return(NULL);
 	}
 
-	const ulint zip_size = space->zip_size();
-
-	descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+	const unsigned zip_size = space->zip_size();
 
-	buf_block_t*		block;
+	uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
 
-	if (descr_page_no == 0) {
-		/* It is on the space header page */
+	buf_block_t* block = header;
 
-		descr_page = page_align(sp_header);
-		block = NULL;
-	} else {
+	if (descr_page_no) {
 		block = buf_page_get(
 			page_id_t(space->id, descr_page_no), zip_size,
 			RW_SX_LATCH, mtr);
 
 		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
-		descr_page = buf_block_get_frame(block);
 	}
 
 	if (desc_block != NULL) {
 		*desc_block = block;
 	}
 
-	return(descr_page + XDES_ARR_OFFSET
-	       + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset));
+	return XDES_ARR_OFFSET + XDES_SIZE
+		* xdes_calc_descriptor_index(zip_size, offset)
+		+ block->frame;
 }
 
 /** Get the extent descriptor of a page.
@@ -447,24 +371,16 @@ defined, as they are uninitialized above the free limit.
 @param[in]	space		tablespace
 @param[in]	offset		page offset; if equal to the free limit, we
 try to add new extents to the space free list
+@param[out]	xdes		extent descriptor page
 @param[in,out]	mtr		mini-transaction
 @return the extent descriptor */
-MY_ATTRIBUTE((warn_unused_result))
-static
-xdes_t*
-xdes_get_descriptor(const fil_space_t* space, page_no_t offset, mtr_t* mtr)
+static xdes_t* xdes_get_descriptor(const fil_space_t *space, page_no_t offset,
+                                   buf_block_t **xdes, mtr_t *mtr)
 {
-	buf_block_t*	block;
-	fsp_header_t*	sp_header;
-
-	block = buf_page_get(page_id_t(space->id, 0), space->zip_size(),
-			     RW_SX_LATCH, mtr);
-
-	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
-	sp_header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
-	return(xdes_get_descriptor_with_space_hdr(
-		       sp_header, space, offset, mtr));
+  buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(),
+                                   RW_SX_LATCH, mtr);
+  buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+  return xdes_get_descriptor_with_space_hdr(block, space, offset, xdes, mtr);
 }
 
 /** Get the extent descriptor of a page.
@@ -488,16 +404,23 @@ xdes_get_descriptor_const(
 	page_no_t		offset,
 	mtr_t*			mtr)
 {
-	ut_ad(mtr_memo_contains(mtr, &space->latch, MTR_MEMO_S_LOCK));
+	ut_ad(mtr->memo_contains(space->latch, MTR_MEMO_SX_LOCK));
 	ut_ad(offset < space->free_limit);
 	ut_ad(offset < space->size_in_header);
 
 	const ulint zip_size = space->zip_size();
 
-	if (buf_block_t* block = buf_page_get(page_id_t(space->id, page),
-					      zip_size, RW_S_LATCH, mtr)) {
+	if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, page),
+						  zip_size, RW_S_LATCH,
+						  nullptr,
+						  BUF_GET_POSSIBLY_FREED,
+						  __FILE__, __LINE__, mtr)) {
 		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
 
+		if (block->page.status == buf_page_t::FREED) {
+			return nullptr;
+		}
+
 		ut_ad(page != 0 || space->free_limit == mach_read_from_4(
 			      FSP_FREE_LIMIT + FSP_HEADER_OFFSET
 			      + block->frame));
@@ -517,6 +440,7 @@ extent descriptor resides is x-locked.
 @param[in]	space		tablespace
 @param[in]	lst_node	file address of the list node
 				contained in the descriptor
+@param[out]	block		extent descriptor block
 @param[in,out]	mtr		mini-transaction
 @return pointer to the extent descriptor */
 MY_ATTRIBUTE((nonnull, warn_unused_result))
@@ -525,49 +449,51 @@ xdes_t*
 xdes_lst_get_descriptor(
 	const fil_space_t*	space,
 	fil_addr_t		lst_node,
+	buf_block_t**		block,
 	mtr_t*			mtr)
 {
-	ut_ad(mtr_memo_contains(mtr, space, MTR_MEMO_SPACE_X_LOCK));
+	ut_ad(mtr->memo_contains(*space));
 	return fut_get_ptr(space->id, space->zip_size(),
-			   lst_node, RW_SX_LATCH, mtr)
+			   lst_node, RW_SX_LATCH, mtr, block)
 		- XDES_FLST_NODE;
 }
 
 /********************************************************************//**
 Returns page offset of the first page in extent described by a descriptor.
 @return offset of the first page in extent */
-UNIV_INLINE
-ulint
-xdes_get_offset(
-/*============*/
-	const xdes_t*	descr)	/*!< in: extent descriptor */
+static uint32_t xdes_get_offset(const xdes_t *descr)
 {
-	ut_ad(descr);
-
-	return(page_get_page_no(page_align(descr))
-	       + ((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE)
-	       * FSP_EXTENT_SIZE);
+  ut_ad(descr);
+  return page_get_page_no(page_align(descr)) +
+    uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) *
+             FSP_EXTENT_SIZE);
 }
 
 /** Initialize a file page whose prior contents should be ignored.
 @param[in,out]	block	buffer pool block */
-void fsp_apply_init_file_page(buf_block_t* block)
-{
-	page_t*		page	= buf_block_get_frame(block);
-
-	memset(page, 0, srv_page_size);
-
-	mach_write_to_4(page + FIL_PAGE_OFFSET, block->page.id.page_no());
-	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
-			block->page.id.space());
-
-	if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) {
-		memset(page_zip->data, 0, page_zip_get_size(page_zip));
-		memcpy(page_zip->data + FIL_PAGE_OFFSET,
-		       page + FIL_PAGE_OFFSET, 4);
-		memcpy(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
-		       page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
-	}
+void fsp_apply_init_file_page(buf_block_t *block)
+{
+  memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size);
+  const page_id_t id(block->page.id());
+
+  mach_write_to_4(block->frame + FIL_PAGE_OFFSET, id.page_no());
+  if (log_sys.is_physical())
+    memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
+  mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id.space());
+  if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
+  {
+    memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
+                                      page_zip_get_size(page_zip));
+    static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+    memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
+                      block->frame + FIL_PAGE_OFFSET, 4);
+    if (log_sys.is_physical())
+      memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+    static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+                  "not perfect alignment");
+    memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+                      block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+  }
 }
 
 #ifdef UNIV_DEBUG
@@ -577,7 +503,6 @@ updating an allocation bitmap page.
 void fil_space_t::modify_check(const mtr_t& mtr) const
 {
 	switch (mtr.get_log_mode()) {
-	case MTR_LOG_SHORT_INSERTS:
 	case MTR_LOG_NONE:
 		/* These modes are only allowed within a non-bitmap page
 		when there is a higher-level redo log record written. */
@@ -586,8 +511,7 @@ void fil_space_t::modify_check(const mtr_t& mtr) const
 		break;
 	case MTR_LOG_NO_REDO:
 		ut_ad(purpose == FIL_TYPE_TEMPORARY
-		      || purpose == FIL_TYPE_IMPORT
-		      || redo_skipped_count);
+		      || purpose == FIL_TYPE_IMPORT);
 		return;
 	case MTR_LOG_ALL:
 		/* We may only write redo log for a persistent
@@ -597,7 +521,7 @@ void fil_space_t::modify_check(const mtr_t& mtr) const
 		return;
 	}
 
-	ut_ad(!"invalid log mode");
+	ut_ad("invalid log mode" == 0);
 }
 #endif
 
@@ -625,15 +549,23 @@ fsp_header_init_fields(
 @param[in,out]	space	tablespace
 @param[in]	size	current size in blocks
 @param[in,out]	mtr	mini-transaction */
-void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr)
+void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
 {
 	const page_id_t page_id(space->id, 0);
 	const ulint zip_size = space->zip_size();
 
+	buf_block_t *free_block = buf_LRU_get_free_block(false);
+
 	mtr_x_lock_space(space, mtr);
-	buf_block_t* block = buf_page_create(page_id, zip_size, mtr);
+
+	buf_block_t* block = buf_page_create(space, 0, zip_size, mtr,
+					     free_block);
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
 
+	if (UNIV_UNLIKELY(block != free_block)) {
+		buf_pool.free_block(free_block);
+	}
+
 	space->size_in_header = size;
 	space->free_len = 0;
 	space->free_limit = 0;
@@ -642,20 +574,26 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr)
 
 	fsp_init_file_page(space, block, mtr);
 
-	mlog_write_ulint(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR,
-			 MLOG_2BYTES, mtr);
+	mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE,
+		      FIL_PAGE_TYPE_FSP_HDR);
 
-	mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_ID + block->frame,
-			 space->id, MLOG_4BYTES, mtr);
+	mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID
+				       + block->frame, space->id);
 	ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED
 				    + block->frame));
-	mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SIZE + block->frame, size,
-			 MLOG_4BYTES, mtr);
+	/* recv_sys_t::parse() expects to find a WRITE record that
+	covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+	in order to avoid optimizing away any unchanged most
+	significant bytes of FSP_SIZE. */
+	mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE
+				    + block->frame, size);
 	ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
 				    + block->frame));
-	mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + block->frame,
-			 space->flags & ~FSP_FLAGS_MEM_MASK,
-			 MLOG_4BYTES, mtr);
+	if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) {
+		mtr->write<4,mtr_t::FORCED>(*block,
+					    FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
+					    + block->frame, f);
+	}
 	ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
 				    + block->frame));
 
@@ -665,47 +603,21 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr)
 	flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr);
 	flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr);
 
-	mlog_write_ull(FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame, 1, mtr);
+	mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame,
+		      1U);
 
 	fsp_fill_free_list(!is_system_tablespace(space->id),
-			   space, FSP_HEADER_OFFSET + block->frame, mtr);
+			   space, block, mtr);
 
 	/* Write encryption metadata to page 0 if tablespace is
 	encrypted or encryption is disabled by table option. */
 	if (space->crypt_data &&
 	    (space->crypt_data->should_encrypt() ||
 	     space->crypt_data->not_encrypted())) {
-		space->crypt_data->write_page0(space, block->frame, mtr);
+		space->crypt_data->write_page0(block, mtr);
 	}
 }
 
-/**********************************************************************//**
-Reads the space id from the first page of a tablespace.
-@return space id, ULINT UNDEFINED if error */
-ulint
-fsp_header_get_space_id(
-/*====================*/
-	const page_t*	page)	/*!< in: first page of a tablespace */
-{
-	ulint	fsp_id;
-	ulint	id;
-
-	fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
-
-	id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-
-	DBUG_EXECUTE_IF("fsp_header_get_space_id_failure",
-			id = ULINT_UNDEFINED;);
-
-	if (id != fsp_id) {
-		ib::error() << "Space ID in fsp header is " << fsp_id
-			<< ", but in the page header it is " << id << ".";
-		return(ULINT_UNDEFINED);
-	}
-
-	return(id);
-}
-
 /** Try to extend a single-table tablespace so that a page would fit in the
 data file.
 @param[in,out]	space	tablespace
@@ -717,8 +629,8 @@ static ATTRIBUTE_COLD __attribute__((warn_unused_result))
 bool
 fsp_try_extend_data_file_with_pages(
 	fil_space_t*	space,
-	ulint		page_no,
-	fsp_header_t*	header,
+	uint32_t	page_no,
+	buf_block_t*	header,
 	mtr_t*		mtr)
 {
 	bool	success;
@@ -727,14 +639,19 @@ fsp_try_extend_data_file_with_pages(
 	ut_a(!is_system_tablespace(space->id));
 	ut_d(space->modify_check(*mtr));
 
-	size = mach_read_from_4(header + FSP_SIZE);
+	size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->frame);
 	ut_ad(size == space->size_in_header);
 
 	ut_a(page_no >= size);
 
 	success = fil_space_extend(space, page_no + 1);
 	/* The size may be less than we wanted if we ran out of disk space. */
-	mlog_write_ulint(header + FSP_SIZE, space->size, MLOG_4BYTES, mtr);
+	/* recv_sys_t::parse() expects to find a WRITE record that
+	covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+	in order to avoid optimizing away any unchanged most
+	significant bytes of FSP_SIZE. */
+	mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+				    + header->frame, space->size);
 	space->size_in_header = space->size;
 
 	return(success);
@@ -743,9 +660,9 @@ fsp_try_extend_data_file_with_pages(
 /** Calculate the number of physical pages in an extent for this file.
 @param[in]	physical_size	page_size of the datafile
 @return number of pages in an extent for this file */
-inline ulint fsp_get_extent_size_in_pages(ulint physical_size)
+inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size)
 {
-	return (FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size;
+  return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size);
 }
 
 
@@ -761,12 +678,13 @@ on one extent descriptor page. See xdes_calc_descriptor_page().
 @param[in]	physical_size	page size in data file
 @param[in]	size		current number of pages in the datafile
 @return number of pages to extend the file. */
-static ulint fsp_get_pages_to_extend_ibd(ulint physical_size, ulint size)
+static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size,
+					    uint32_t size)
 {
-	ulint extent_size = fsp_get_extent_size_in_pages(physical_size);
+	uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size);
 	/* The threshold is set at 32MiB except when the physical page
 	size is small enough that it must be done sooner. */
-	ulint threshold = std::min(32 * extent_size, physical_size);
+	uint32_t threshold = std::min(32 * extent_size, physical_size);
 
 	if (size >= threshold) {
 		/* Below in fsp_fill_free_list() we assume
@@ -787,10 +705,8 @@ static ulint fsp_get_pages_to_extend_ibd(ulint physical_size, ulint size)
 ATTRIBUTE_COLD __attribute__((nonnull))
 static
 ulint
-fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr)
+fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
 {
-	ulint	size;		/* current number of pages in the datafile */
-	ulint	size_increase;	/* number of pages to extend this file */
 	const char* OUT_OF_SPACE_MSG =
 		"ran out of space. Please add another file or use"
 		" 'autoextend' for the last file in setting";
@@ -827,10 +743,12 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr)
 		return(0);
 	}
 
-	size = mach_read_from_4(header + FSP_SIZE);
+	uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+					 + header->frame);
 	ut_ad(size == space->size_in_header);
+	uint32_t size_increase;
 
-	const ulint ps = space->physical_size();
+	const unsigned ps = space->physical_size();
 
 	switch (space->id) {
 	case TRX_SYS_SPACE:
@@ -840,7 +758,7 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr)
 		size_increase = srv_tmp_space.get_increment();
 		break;
 	default:
-		ulint extent_pages = fsp_get_extent_size_in_pages(ps);
+		uint32_t extent_pages = fsp_get_extent_size_in_pages(ps);
 		if (size < extent_pages) {
 			/* Let us first extend the file to extent_size */
 			if (!fsp_try_extend_data_file_with_pages(
@@ -869,8 +787,12 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr)
 		? space->size
 		: ut_2pow_round(space->size, (1024 * 1024) / ps);
 
-	mlog_write_ulint(
-		header + FSP_SIZE, space->size_in_header, MLOG_4BYTES, mtr);
+	/* recv_sys_t::parse() expects to find a WRITE record that
+	covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+	in order to avoid optimizing away any unchanged most
+	significant bytes of FSP_SIZE. */
+	mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+				    + header->frame, space->size_in_header);
 
 	return(size_increase);
 }
@@ -886,9 +808,9 @@ ATTRIBUTE_COLD
 void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr)
 {
 	ib::info()
-		<< "Resetting invalid page " << block.page.id << " type "
+		<< "Resetting invalid page " << block.page.id() << " type "
 		<< fil_page_get_type(block.frame) << " to " << type << ".";
-	mlog_write_ulint(block.frame + FIL_PAGE_TYPE, type, MLOG_2BYTES, mtr);
+	mtr->write<2>(block, block.frame + FIL_PAGE_TYPE, type);
 }
 
 /** Put new extents to the free list if there are free extents above the free
@@ -905,22 +827,16 @@ void
 fsp_fill_free_list(
 	bool		init_space,
 	fil_space_t*	space,
-	fsp_header_t*	header,
+	buf_block_t*	header,
 	mtr_t*		mtr)
 {
-	ulint	limit;
-	ulint	size;
-	xdes_t*	descr;
-	ulint	count		= 0;
-	ulint	frag_n_used;
-	ulint	i;
-
-	ut_ad(page_offset(header) == FSP_HEADER_OFFSET);
 	ut_d(space->modify_check(*mtr));
 
 	/* Check if we can fill free list from above the free list limit */
-	size = mach_read_from_4(header + FSP_SIZE);
-	limit = mach_read_from_4(header + FSP_FREE_LIMIT);
+	uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+					 + header->frame);
+	uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+					  + header->frame);
 
 	ut_ad(size == space->size_in_header);
 	ut_ad(limit == space->free_limit);
@@ -944,17 +860,18 @@ fsp_fill_free_list(
 		}
 	}
 
-	i = limit;
+	uint32_t count = 0;
 
-	while ((init_space && i < 1)
-	       || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) {
+	for (uint32_t i = limit, extent_size = FSP_EXTENT_SIZE,
+		     physical_size = space->physical_size();
+	     (init_space && i < 1)
+		     || (i + extent_size <= size && count < FSP_FREE_ADD);
+	     i += extent_size) {
+		const bool init_xdes = !ut_2pow_remainder(i, physical_size);
 
-		const bool init_xdes = 0
-			== ut_2pow_remainder(i, ulint(space->physical_size()));
-
-		space->free_limit = i + FSP_EXTENT_SIZE;
-		mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
-				 MLOG_4BYTES, mtr);
+		space->free_limit = i + extent_size;
+		mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+			      + header->frame, i + extent_size);
 
 		if (init_xdes) {
 
@@ -965,45 +882,47 @@ fsp_fill_free_list(
 			pages should be ignored. */
 
 			if (i > 0) {
-				const page_id_t	page_id(space->id, i);
-
-				block = buf_page_create(
-					page_id, zip_size, mtr);
-
+				buf_block_t *f= buf_LRU_get_free_block(false);
+				block= buf_page_create(
+					space, static_cast<uint32_t>(i),
+					zip_size, mtr, f);
 				buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
+				if (UNIV_UNLIKELY(block != f)) {
+					buf_pool.free_block(f);
+				}
 				fsp_init_file_page(space, block, mtr);
-				mlog_write_ulint(buf_block_get_frame(block)
-						 + FIL_PAGE_TYPE,
-						 FIL_PAGE_TYPE_XDES,
-						 MLOG_2BYTES, mtr);
+				mtr->write<2>(*block,
+					      FIL_PAGE_TYPE + block->frame,
+					      FIL_PAGE_TYPE_XDES);
 			}
 
 			if (space->purpose != FIL_TYPE_TEMPORARY) {
-				const page_id_t	page_id(
-					space->id,
-					i + FSP_IBUF_BITMAP_OFFSET);
-
+				buf_block_t *f= buf_LRU_get_free_block(false);
 				block = buf_page_create(
-					page_id, zip_size, mtr);
-
+					space,
+					static_cast<uint32_t>(
+						i + FSP_IBUF_BITMAP_OFFSET),
+					zip_size, mtr, f);
 				buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
+				if (UNIV_UNLIKELY(block != f)) {
+					buf_pool.free_block(f);
+				}
 				fsp_init_file_page(space, block, mtr);
-				mlog_write_ulint(block->frame + FIL_PAGE_TYPE,
-						 FIL_PAGE_IBUF_BITMAP,
-						 MLOG_2BYTES, mtr);
+				mtr->write<2>(*block,
+					      block->frame + FIL_PAGE_TYPE,
+					      FIL_PAGE_IBUF_BITMAP);
 			}
 		}
 
-		buf_block_t*	desc_block = NULL;
-		descr = xdes_get_descriptor_with_space_hdr(
-			header, space, i, mtr, init_space, &desc_block);
-		if (desc_block && !space->full_crc32()) {
-			fil_block_check_type(
-				*desc_block, FIL_PAGE_TYPE_XDES, mtr);
+		buf_block_t* xdes;
+		xdes_t*	descr = xdes_get_descriptor_with_space_hdr(
+			header, space, i, &xdes, mtr, init_space);
+		if (xdes != header && !space->full_crc32()) {
+			fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
 		}
-		xdes_init(descr, mtr);
+		xdes_init(*xdes, descr, mtr);
+		const uint16_t xoffset= static_cast<uint16_t>(
+			descr - xdes->frame + XDES_FLST_NODE);
 
 		if (UNIV_UNLIKELY(init_xdes)) {
 
@@ -1011,24 +930,23 @@ fsp_fill_free_list(
 			and the second is an ibuf bitmap page: mark them
 			used */
 
-			xdes_set_bit(descr, XDES_FREE_BIT, 0, FALSE, mtr);
-			xdes_set_bit(descr, XDES_FREE_BIT,
-				     FSP_IBUF_BITMAP_OFFSET, FALSE, mtr);
-			xdes_set_state(descr, XDES_FREE_FRAG, mtr);
-
-			flst_add_last(header + FSP_FREE_FRAG,
-				      descr + XDES_FLST_NODE, mtr);
-			frag_n_used = mach_read_from_4(
-				header + FSP_FRAG_N_USED);
-			mlog_write_ulint(header + FSP_FRAG_N_USED,
-					 frag_n_used + 2, MLOG_4BYTES, mtr);
+			xdes_set_free<false>(*xdes, descr, 0, mtr);
+			xdes_set_free<false>(*xdes, descr,
+					     FSP_IBUF_BITMAP_OFFSET, mtr);
+			xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+
+			flst_add_last(header,
+				      FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+				      xdes, xoffset, mtr);
+			byte* n_used = FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+				+ header->frame;
+			mtr->write<4>(*header, n_used,
+				      2U + mach_read_from_4(n_used));
 		} else {
-			flst_add_last(header + FSP_FREE,
-				      descr + XDES_FLST_NODE, mtr);
+			flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
+				      xdes, xoffset, mtr);
 			count++;
 		}
-
-		i += FSP_EXTENT_SIZE;
 	}
 
 	space->free_len += count;
@@ -1038,121 +956,136 @@ fsp_fill_free_list(
 @param[in,out]	space		tablespace
 @param[in]	hint		hint of which extent would be desirable: any
 page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT
+@param[out]	xdes		extent descriptor page
 @param[in,out]	mtr		mini-transaction
 @return extent descriptor, NULL if cannot be allocated */
 static
 xdes_t*
 fsp_alloc_free_extent(
 	fil_space_t*		space,
-	ulint			hint,
+	uint32_t		hint,
+	buf_block_t**		xdes,
 	mtr_t*			mtr)
 {
-	fsp_header_t*	header;
 	fil_addr_t	first;
 	xdes_t*		descr;
 	buf_block_t*	desc_block = NULL;
 
-	header = fsp_get_space_header(space, mtr);
+	buf_block_t* header = fsp_get_header(space, mtr);
 
 	descr = xdes_get_descriptor_with_space_hdr(
-		header, space, hint, mtr, false, &desc_block);
+		header, space, hint, &desc_block, mtr);
 
-	if (desc_block && !space->full_crc32()) {
+	if (desc_block != header && !space->full_crc32()) {
 		fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);
 	}
 
-	if (descr && (xdes_get_state(descr, mtr) == XDES_FREE)) {
+	if (descr && (xdes_get_state(descr) == XDES_FREE)) {
 		/* Ok, we can take this extent */
 	} else {
 		/* Take the first extent in the free list */
-		first = flst_get_first(header + FSP_FREE, mtr);
+		first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+				       + header->frame);
 
-		if (fil_addr_is_null(first)) {
+		if (first.page == FIL_NULL) {
 			fsp_fill_free_list(false, space, header, mtr);
 
-			first = flst_get_first(header + FSP_FREE, mtr);
-		}
-
-		if (fil_addr_is_null(first)) {
-
-			return(NULL);	/* No free extents left */
+			first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+					       + header->frame);
+			if (first.page == FIL_NULL) {
+				return nullptr;	/* No free extents left */
+			}
 		}
 
-		descr = xdes_lst_get_descriptor(space, first, mtr);
+		descr = xdes_lst_get_descriptor(space, first, &desc_block,
+						mtr);
 	}
 
-	flst_remove(header + FSP_FREE, descr + XDES_FLST_NODE, mtr);
+	flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
+		    static_cast<uint16_t>(
+			    descr - desc_block->frame + XDES_FLST_NODE), mtr);
 	space->free_len--;
+	*xdes = desc_block;
 
 	return(descr);
 }
 
-/**********************************************************************//**
-Allocates a single free page from a space. */
-static MY_ATTRIBUTE((nonnull))
-void
-fsp_alloc_from_free_frag(
-/*=====================*/
-	fsp_header_t*	header,	/*!< in/out: tablespace header */
-	xdes_t*		descr,	/*!< in/out: extent descriptor */
-	ulint		bit,	/*!< in: slot to allocate in the extent */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+/** Allocate a single free page.
+@param[in,out]	header	tablespace header
+@param[in,out]	xdes	extent descriptor page
+@param[in,out]	descr	extent descriptor
+@param[in]	bit	slot to allocate in the extent
+@param[in,out]	mtr	mini-transaction */
+static void
+fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
+			 ulint bit, mtr_t *mtr)
 {
-	ulint		frag_n_used;
-
-	ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
-	ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, bit, mtr));
-	xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr);
+	ut_ad(xdes_get_state(descr) == XDES_FREE_FRAG);
+	ut_a(xdes_is_free(descr, bit));
+	xdes_set_free<false>(*xdes, descr, bit, mtr);
 
 	/* Update the FRAG_N_USED field */
-	frag_n_used = mach_read_from_4(header + FSP_FRAG_N_USED);
-	frag_n_used++;
-	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
-			 mtr);
-	if (xdes_is_full(descr, mtr)) {
+	byte* n_used_p = FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->frame;
+
+	uint32_t n_used = mach_read_from_4(n_used_p) + 1;
+
+	if (xdes_is_full(descr)) {
 		/* The fragment is full: move it to another list */
-		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
-			    mtr);
-		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
-
-		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
-			      mtr);
-		mlog_write_ulint(header + FSP_FRAG_N_USED,
-				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
-				 mtr);
+		const uint16_t xoffset= static_cast<uint16_t>(
+			descr - xdes->frame + XDES_FLST_NODE);
+		flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+			    xdes, xoffset, mtr);
+		xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
+
+		flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+			      xdes, xoffset, mtr);
+		n_used -= FSP_EXTENT_SIZE;
 	}
+
+	mtr->write<4>(*header, n_used_p, n_used);
 }
 
 /** Gets a buffer block for an allocated page.
-NOTE: If init_mtr != mtr, the block will only be initialized if it was
-not previously x-latched. It is assumed that the block has been
-x-latched only by mtr, and freed in mtr in that case.
 @param[in,out]	space		tablespace
 @param[in]	offset		page number of the allocated page
-@param[in,out]	mtr		mini-transaction of the allocation
-@param[in,out]	init_mtr	mini-transaction for initializing the page
-@return block, initialized if init_mtr==mtr
-or rw_lock_x_lock_count(&block->lock) == 1 */
+@param[in,out]	mtr		mini-transaction
+@return block, initialized */
 static
 buf_block_t*
-fsp_page_create(
-	fil_space_t*		space,
-	page_no_t		offset,
-	mtr_t*			mtr,
-	mtr_t*			init_mtr)
-{
-	buf_block_t*	block = buf_page_create(page_id_t(space->id, offset),
-						space->zip_size(), init_mtr);
-
-	if (init_mtr == mtr
-	    || rw_lock_get_x_lock_count(&block->lock) == 1) {
-		/* Initialize the page, unless it was already
-		latched in mtr. (In this case, we would want to
-		allocate another page that has not been freed in mtr.) */
-		fsp_init_file_page(space, block, init_mtr);
-	}
-
-	return(block);
+fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+  buf_block_t *block, *free_block;
+  /* If the tablespace is being truncated, look the page up in page_hash. */
+  if (UNIV_UNLIKELY(space->is_being_truncated))
+  {
+    const page_id_t page_id{space->id, offset};
+    const ulint fold= page_id.fold();
+    mysql_mutex_lock(&buf_pool.mutex);
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash_get_low(page_id, fold));
+    if (block && block->page.oldest_modification() <= 1)
+      block= nullptr;
+    mysql_mutex_unlock(&buf_pool.mutex);
+    /* Only a block with oldest_modification() > 1 is kept for reuse. */
+    if (block)
+    {
+      ut_ad(block->page.buf_fix_count() >= 1);
+      ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+      ut_ad(mtr->have_x_latch(*block));
+      free_block= block;
+      goto got_free_block;
+    }
+  }
+  /* Otherwise obtain a block from buf_LRU_get_free_block(). */
+  free_block= buf_LRU_get_free_block(false);
+got_free_block:
+  block= buf_page_create(space, static_cast<uint32_t>(offset),
+                         space->zip_size(), mtr, free_block);
+  if (UNIV_UNLIKELY(block != free_block))
+    buf_pool.free_block(free_block);
+  /* NOTE(review): confirm a reused page_hash block is never freed above. */
+  fsp_init_file_page(space, block, mtr);
+  return block;
 }
 
 /** Allocates a single free page from a space.
@@ -1162,37 +1095,35 @@ The page is marked as used.
 @param[in,out]	mtr		mini-transaction
 @param[in,out]	init_mtr	mini-transaction in which the page should be
 initialized (may be the same as mtr)
-@retval NULL	if no page could be allocated
-@retval block	rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block	(not allocated or initialized) otherwise */
+@retval NULL	if no page could be allocated */
 static MY_ATTRIBUTE((warn_unused_result, nonnull))
 buf_block_t*
 fsp_alloc_free_page(
 	fil_space_t*		space,
-	ulint			hint,
+	uint32_t		hint,
 	mtr_t*			mtr,
 	mtr_t*			init_mtr)
 {
-	fsp_header_t*	header;
 	fil_addr_t	first;
 	xdes_t*		descr;
-	ulint		free;
 	const ulint	space_id = space->id;
 
 	ut_d(space->modify_check(*mtr));
-	header = fsp_get_space_header(space, mtr);
+	buf_block_t* block = fsp_get_header(space, mtr);
+	buf_block_t *xdes;
 
 	/* Get the hinted descriptor */
-	descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+	descr = xdes_get_descriptor_with_space_hdr(block, space, hint, &xdes,
+						   mtr);
 
-	if (descr && (xdes_get_state(descr, mtr) == XDES_FREE_FRAG)) {
+	if (descr && (xdes_get_state(descr) == XDES_FREE_FRAG)) {
 		/* Ok, we can take this extent */
 	} else {
 		/* Else take the first extent in free_frag list */
-		first = flst_get_first(header + FSP_FREE_FRAG, mtr);
+		first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG
+				       + block->frame);
 
-		if (fil_addr_is_null(first)) {
+		if (first.page == FIL_NULL) {
 			/* There are no partially full fragments: allocate
 			a free extent and add it to the FREE_FRAG list. NOTE
 			that the allocation may have as a side-effect that an
@@ -1200,7 +1131,7 @@ fsp_alloc_free_page(
 			FREE_FRAG list. But we will allocate our page from the
 			the free extent anyway. */
 
-			descr = fsp_alloc_free_extent(space, hint, mtr);
+			descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
 
 			if (descr == NULL) {
 				/* No free space left */
@@ -1208,11 +1139,14 @@ fsp_alloc_free_page(
 				return(NULL);
 			}
 
-			xdes_set_state(descr, XDES_FREE_FRAG, mtr);
-			flst_add_last(header + FSP_FREE_FRAG,
-				      descr + XDES_FLST_NODE, mtr);
+			xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+			flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+				      xdes, static_cast<uint16_t>(
+					      descr - xdes->frame
+					      + XDES_FLST_NODE), mtr);
 		} else {
-			descr = xdes_lst_get_descriptor(space, first, mtr);
+			descr = xdes_lst_get_descriptor(space, first, &xdes,
+							mtr);
 		}
 
 		/* Reset the hint */
@@ -1222,9 +1156,8 @@ fsp_alloc_free_page(
 	/* Now we have in descr an extent with at least one free page. Look
 	for a free page in the extent. */
 
-	free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
-			     hint % FSP_EXTENT_SIZE, mtr);
-	if (free == ULINT_UNDEFINED) {
+	uint32_t free = xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
+	if (free == FIL_NULL) {
 
 		ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
 		putc('\n', stderr);
@@ -1232,9 +1165,10 @@ fsp_alloc_free_page(
 		ut_error;
 	}
 
-	page_no_t page_no = xdes_get_offset(descr) + free;
+	uint32_t page_no = xdes_get_offset(descr) + free;
 
-	page_no_t space_size = mach_read_from_4(header + FSP_SIZE);
+	uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+					       + block->frame);
 	ut_ad(space_size == space->size_in_header
 	      || (space_id == TRX_SYS_SPACE
 		  && srv_startup_is_before_trx_rollback_phase));
@@ -1253,26 +1187,23 @@ fsp_alloc_free_page(
 		}
 
 		if (!fsp_try_extend_data_file_with_pages(space, page_no,
-							 header, mtr)) {
+							 block, mtr)) {
 			/* No disk space left */
 			return(NULL);
 		}
 	}
 
-	fsp_alloc_from_free_frag(header, descr, free, mtr);
-	return fsp_page_create(space, page_no, mtr, init_mtr);
+	fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+	return fsp_page_create(space, page_no, init_mtr);
 }
 
 /** Frees a single page of a space.
 The page is marked as free and clean.
 @param[in,out]	space		tablespace
 @param[in]	offset		page number
-@param[in]	log		whether to write MLOG_INIT_FREE_PAGE record
 @param[in,out]	mtr		mini-transaction */
-static void fsp_free_page(fil_space_t* space, page_no_t offset,
-			  bool log, mtr_t* mtr)
+static void fsp_free_page(fil_space_t* space, page_no_t offset, mtr_t* mtr)
 {
-	fsp_header_t*	header;
 	xdes_t*		descr;
 	ulint		state;
 	ulint		frag_n_used;
@@ -1282,12 +1213,13 @@ static void fsp_free_page(fil_space_t* space, page_no_t offset,
 
 	/* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
 
-	header = fsp_get_space_header(space, mtr);
+	buf_block_t* header = fsp_get_header(space, mtr);
+	buf_block_t* xdes= 0;
 
-	descr = xdes_get_descriptor_with_space_hdr(
-		header, space, offset, mtr);
+	descr = xdes_get_descriptor_with_space_hdr(header, space, offset,
+						   &xdes, mtr);
 
-	state = xdes_get_state(descr, mtr);
+	state = xdes_get_state(descr);
 
 	if (UNIV_UNLIKELY(state != XDES_FREE_FRAG
 			  && state != XDES_FULL_FRAG)) {
@@ -1308,9 +1240,7 @@ static void fsp_free_page(fil_space_t* space, page_no_t offset,
 		ut_error;
 	}
 
-	if (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
-			     offset % FSP_EXTENT_SIZE, mtr)) {
-
+	if (xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
 		ib::error() << "File space extent descriptor of page "
 			<< page_id_t(space->id, offset)
 			<< " says it is free.";
@@ -1324,70 +1254,63 @@ static void fsp_free_page(fil_space_t* space, page_no_t offset,
 		return;
 	}
 
-	if (UNIV_UNLIKELY(!log)) {
-		/* The last page freed in BtrBulk::finish() must be
-		written with redo logging disabled for the page
-		itself. The modifications of the allocation data
-		structures are covered by redo log. */
-	} else if (byte* log_ptr = mlog_open(mtr, 11)) {
-		log_ptr = mlog_write_initial_log_record_low(
-			MLOG_INIT_FREE_PAGE, space->id, offset, log_ptr, mtr);
-		mlog_close(mtr, log_ptr);
-	}
+	mtr->free(*space, static_cast<uint32_t>(offset));
 
 	const ulint	bit = offset % FSP_EXTENT_SIZE;
 
-	xdes_set_bit(descr, XDES_FREE_BIT, bit, TRUE, mtr);
-	/* xdes_init() should have set all XDES_CLEAN_BIT */
-	ut_ad(xdes_get_bit(descr, XDES_CLEAN_BIT, bit));
+	xdes_set_free<true>(*xdes, descr, bit, mtr);
+
+	frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+				       + header->frame);
+
+	const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->frame
+						      + XDES_FLST_NODE);
 
-	frag_n_used = mach_read_from_4(header + FSP_FRAG_N_USED);
 	if (state == XDES_FULL_FRAG) {
 		/* The fragment was full: move it to another list */
-		flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
-			    mtr);
-		xdes_set_state(descr, XDES_FREE_FRAG, mtr);
-		flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
-			      mtr);
-		mlog_write_ulint(header + FSP_FRAG_N_USED,
-				 frag_n_used + FSP_EXTENT_SIZE - 1,
-				 MLOG_4BYTES, mtr);
+		flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+			    xdes, xoffset, mtr);
+		xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+		flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+			      xdes, xoffset, mtr);
+		mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+			      + header->frame,
+			      frag_n_used + FSP_EXTENT_SIZE - 1);
 	} else {
 		ut_a(frag_n_used > 0);
-		mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used - 1,
-				 MLOG_4BYTES, mtr);
+		mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+			      + header->frame, frag_n_used - 1);
 	}
 
-	if (xdes_is_free(descr, mtr)) {
+	if (!xdes_get_n_used(descr)) {
 		/* The extent has become free: move it to another list */
-		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
-			    mtr);
+		flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+			    xdes, xoffset, mtr);
 		fsp_free_extent(space, offset, mtr);
 	}
 }
 
 /** Return an extent to the free list of a space.
-@param[in,out]	space		tablespace
-@param[in]	offset		page number in the extent
-@param[in,out]	mtr		mini-transaction */
+@param[in,out]  space   tablespace
+@param[in]      offset  page number in the extent
+@param[in,out]  mtr     mini-transaction */
 static void fsp_free_extent(fil_space_t* space, page_no_t offset, mtr_t* mtr)
 {
-	fsp_header_t*	header;
-	xdes_t*		descr;
-
-	ut_ad(mtr_memo_contains(mtr, space, MTR_MEMO_SPACE_X_LOCK));
+  ut_ad(mtr->memo_contains(*space));
 
-	header = fsp_get_space_header(space, mtr);
+  buf_block_t *block= fsp_get_header(space, mtr);
+  buf_block_t *xdes= nullptr; /* out parameter of the lookup below */
 
-	descr = xdes_get_descriptor_with_space_hdr(
-		header, space, offset, mtr);
-
-	ut_a(xdes_get_state(descr, mtr) != XDES_FREE);
+  xdes_t* descr= xdes_get_descriptor_with_space_hdr(block, space, offset,
+                                                    &xdes, mtr);
+  ut_a(xdes_get_state(descr) != XDES_FREE);
 
-	xdes_init(descr, mtr);
+  xdes_init(*xdes, descr, mtr);
 
-	flst_add_last(header + FSP_FREE, descr + XDES_FLST_NODE, mtr);
-	space->free_len++;
+  flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
+                xdes, static_cast<uint16_t>(descr - xdes->frame +
+                                            XDES_FLST_NODE), mtr);
+  space->free_len++;
 }
 
 /** @return Number of segment inodes which fit on a single page */
@@ -1399,50 +1322,29 @@ inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
 /** Returns the nth inode slot on an inode page.
 @param[in]	page		segment inode page
 @param[in]	i		inode index on page
-@param[in]	physical_size	page size
-@param[in,out]	mtr		mini-transaction
 @return segment inode */
-UNIV_INLINE
-fseg_inode_t*
-fsp_seg_inode_page_get_nth_inode(
-	page_t*			page,
-	ulint			i,
-	ulint			physical_size,
-	mtr_t*			mtr)
-{
-	ut_ad(i < FSP_SEG_INODES_PER_PAGE(physical_size));
-	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_SX_FIX));
-
-	return(page + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i);
-}
+#define fsp_seg_inode_page_get_nth_inode(page, i)	\
+	(FSEG_ARR_OFFSET + FSEG_INODE_SIZE * (i) + (page))
 
 /** Looks for a used segment inode on a segment inode page.
 @param[in]	page		segment inode page
 @param[in]	physical_size	page size
-@param[in,out]	mtr		mini-transaction
 @return segment inode index, or ULINT_UNDEFINED if not found */
 static
 ulint
-fsp_seg_inode_page_find_used(
-	page_t*			page,
-	ulint			physical_size,
-	mtr_t*			mtr)
+fsp_seg_inode_page_find_used(const page_t* page, ulint physical_size)
 {
-	ulint		i;
-	fseg_inode_t*	inode;
-
-	for (i = 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
-
-		inode = fsp_seg_inode_page_get_nth_inode(
-			page, i, physical_size, mtr);
-
-		if (mach_read_from_8(inode + FSEG_ID)) {
-			/* This is used */
-
-			ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
-			      == FSEG_MAGIC_N_VALUE);
-			return(i);
+	for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
+		if (!mach_read_from_8(
+			    FSEG_ID
+			    + fsp_seg_inode_page_get_nth_inode(page, i))) {
+			continue;
 		}
+		/* This is used */
+		ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4(
+			      FSEG_MAGIC_N
+			      + fsp_seg_inode_page_get_nth_inode(page, i)));
+		return i;
 	}
 
 	return(ULINT_UNDEFINED);
@@ -1452,109 +1354,85 @@ fsp_seg_inode_page_find_used(
 @param[in]	page		segment inode page
 @param[in]	i		search forward starting from this index
 @param[in]	physical_size	page size
-@param[in,out]	mtr		mini-transaction
 @return segment inode index, or ULINT_UNDEFINED if not found */
 static
 ulint
-fsp_seg_inode_page_find_free(
-	page_t*			page,
-	ulint			i,
-	ulint			physical_size,
-	mtr_t*			mtr)
+fsp_seg_inode_page_find_free(const page_t* page, ulint i, ulint physical_size)
 {
 	for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
-
-		fseg_inode_t*	inode;
-
-		inode = fsp_seg_inode_page_get_nth_inode(
-			page, i, physical_size, mtr);
-
-		if (!mach_read_from_8(inode + FSEG_ID)) {
+		if (!mach_read_from_8(
+			    FSEG_ID
+			    + fsp_seg_inode_page_get_nth_inode(page, i))) {
 			/* This is unused */
-			return(i);
+			return i;
 		}
 
-		ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
-		      == FSEG_MAGIC_N_VALUE);
+		ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4(
+			      FSEG_MAGIC_N
+			      + fsp_seg_inode_page_get_nth_inode(page, i)));
 	}
 
-	return(ULINT_UNDEFINED);
+	return ULINT_UNDEFINED;
 }
 
 /** Allocate a file segment inode page.
-@param[in,out]	space		tablespace
-@param[in,out]	space_header	tablespace header
-@param[in,out]	mtr		mini-transaction
+@param[in,out]  space   tablespace
+@param[in,out]  header  tablespace header
+@param[in,out]  mtr     mini-transaction
 @return whether the allocation succeeded */
 MY_ATTRIBUTE((nonnull, warn_unused_result))
 static
 bool
-fsp_alloc_seg_inode_page(
-	fil_space_t*	space,
-	fsp_header_t*	space_header,
-	mtr_t*		mtr)
+fsp_alloc_seg_inode_page(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
 {
-	buf_block_t*	block;
+  ut_ad(header->page.id().space() == space->id);
+  buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr);
 
-	ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
-	ut_ad(page_get_space_id(page_align(space_header)) == space->id);
+  if (!block)
+    return false;
 
-	block = fsp_alloc_free_page(space, 0, mtr, mtr);
+  buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+  ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
 
-	if (!block) {
-		return(false);
-	}
-
-	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-	ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
-
-	mlog_write_ulint(block->frame + FIL_PAGE_TYPE, FIL_PAGE_INODE,
-			 MLOG_2BYTES, mtr);
+  mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE, FIL_PAGE_INODE);
 
 #ifdef UNIV_DEBUG
-	const byte* inode = FSEG_ID + FSEG_ARR_OFFSET + block->frame;
-	for (ulint i = FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--;
-	     inode += FSEG_INODE_SIZE) {
-		ut_ad(!mach_read_from_8(inode));
-	}
+  const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->frame;
+  for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--;
+       inode += FSEG_INODE_SIZE)
+    ut_ad(!mach_read_from_8(inode));
 #endif
 
-	flst_add_last(
-		space_header + FSP_SEG_INODES_FREE,
-		block->frame + FSEG_INODE_PAGE_NODE, mtr);
-
-	return(true);
+  flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+                block, FSEG_INODE_PAGE_NODE, mtr);
+  return true;
 }
 
 /** Allocate a file segment inode.
-@param[in,out]	space		tablespace
-@param[in,out]	space_header	tablespace header
-@param[in,out]	mtr		mini-transaction
+@param[in,out]  space   tablespace
+@param[in,out]  header  tablespace header
+@param[out]     iblock  segment inode page
+@param[in,out]  mtr     mini-transaction
 @return segment inode
 @retval NULL if not enough space */
 MY_ATTRIBUTE((nonnull, warn_unused_result))
-static
-fseg_inode_t*
-fsp_alloc_seg_inode(
-	fil_space_t*	space,
-	fsp_header_t*	space_header,
-	mtr_t*		mtr)
+static fseg_inode_t*
+fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header,
+                    buf_block_t **iblock, mtr_t *mtr)
 {
 	buf_block_t*	block;
-	page_t*		page;
 	fseg_inode_t*	inode;
-	ulint		n;
-
-	ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
 
 	/* Allocate a new segment inode page if needed. */
-	if (flst_get_len(space_header + FSP_SEG_INODES_FREE) == 0
-	    && !fsp_alloc_seg_inode_page(space, space_header, mtr)) {
+	if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE
+			  + header->frame)
+	    && !fsp_alloc_seg_inode_page(space, header, mtr)) {
 		return(NULL);
 	}
 	const page_id_t		page_id(
 		space->id,
-		flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page);
+		flst_get_first(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE
+			       + header->frame).page);
 
 	block = buf_page_get(page_id, space->zip_size(), RW_SX_LATCH, mtr);
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
@@ -1562,80 +1440,67 @@ fsp_alloc_seg_inode(
 		fil_block_check_type(*block, FIL_PAGE_INODE, mtr);
 	}
 
-	page = buf_block_get_frame(block);
-
 	const ulint physical_size = space->physical_size();
 
-	n = fsp_seg_inode_page_find_free(page, 0, physical_size, mtr);
+	ulint n = fsp_seg_inode_page_find_free(block->frame, 0, physical_size);
 
-	ut_a(n != ULINT_UNDEFINED);
+	ut_a(n < FSP_SEG_INODES_PER_PAGE(physical_size));
 
-	inode = fsp_seg_inode_page_get_nth_inode(page, n, physical_size, mtr);
+	inode = fsp_seg_inode_page_get_nth_inode(block->frame, n);
 
-	if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1,
-							    physical_size,
-							    mtr)) {
+	if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->frame,
+							    n + 1,
+							    physical_size)) {
 		/* There are no other unused headers left on the page: move it
 		to another list */
-
-		flst_remove(space_header + FSP_SEG_INODES_FREE,
-			    page + FSEG_INODE_PAGE_NODE, mtr);
-
-		flst_add_last(space_header + FSP_SEG_INODES_FULL,
-			      page + FSEG_INODE_PAGE_NODE, mtr);
+		flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+			    block, FSEG_INODE_PAGE_NODE, mtr);
+		flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+			      block, FSEG_INODE_PAGE_NODE, mtr);
 	}
 
 	ut_ad(!mach_read_from_8(inode + FSEG_ID)
 	      || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+	*iblock = block;
 	return(inode);
 }
 
 /** Frees a file segment inode.
 @param[in,out]	space		tablespace
 @param[in,out]	inode		segment inode
+@param[in,out]	iblock		segment inode page
 @param[in,out]	mtr		mini-transaction */
 static void fsp_free_seg_inode(
 	fil_space_t*		space,
 	fseg_inode_t*		inode,
+	buf_block_t*		iblock,
 	mtr_t*			mtr)
 {
-	page_t*		page;
-	fsp_header_t*	space_header;
-
 	ut_d(space->modify_check(*mtr));
 
-	page = page_align(inode);
-
-	space_header = fsp_get_space_header(space, mtr);
+	buf_block_t* header = fsp_get_header(space, mtr);
 
 	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
 
 	const ulint physical_size = space->physical_size();
 
 	if (ULINT_UNDEFINED
-	    == fsp_seg_inode_page_find_free(page, 0, physical_size, mtr)) {
-
+	    == fsp_seg_inode_page_find_free(iblock->frame, 0, physical_size)) {
 		/* Move the page to another list */
-
-		flst_remove(space_header + FSP_SEG_INODES_FULL,
-			    page + FSEG_INODE_PAGE_NODE, mtr);
-
-		flst_add_last(space_header + FSP_SEG_INODES_FREE,
-			      page + FSEG_INODE_PAGE_NODE, mtr);
+		flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+			    iblock, FSEG_INODE_PAGE_NODE, mtr);
+		flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+			      iblock, FSEG_INODE_PAGE_NODE, mtr);
 	}
 
-	mlog_write_ull(inode + FSEG_ID, 0, mtr);
-	mlog_write_ulint(inode + FSEG_MAGIC_N, 0xfa051ce3, MLOG_4BYTES, mtr);
+	mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);
 
 	if (ULINT_UNDEFINED
-	    == fsp_seg_inode_page_find_used(page, physical_size, mtr)) {
-
+	    == fsp_seg_inode_page_find_used(iblock->frame, physical_size)) {
 		/* There are no other used headers left on the page: free it */
-
-		flst_remove(space_header + FSP_SEG_INODES_FREE,
-			    page + FSEG_INODE_PAGE_NODE, mtr);
-
-		fsp_free_page(space, page_get_page_no(page), true, mtr);
+		flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+			    iblock, FSEG_INODE_PAGE_NODE, mtr);
+		fsp_free_page(space, iblock->page.id().page_no(), mtr);
 	}
 }
 
@@ -1649,7 +1514,7 @@ static void fsp_free_seg_inode(
 static
 fseg_inode_t*
 fseg_inode_try_get(
-	fseg_header_t*		header,
+	const fseg_header_t*	header,
 	ulint			space,
 	ulint			zip_size,
 	mtr_t*			mtr,
@@ -1686,7 +1551,7 @@ fseg_inode_try_get(
 static
 fseg_inode_t*
 fseg_inode_get(
-	fseg_header_t*		header,
+	const fseg_header_t*	header,
 	ulint			space,
 	ulint			zip_size,
 	mtr_t*			mtr,
@@ -1698,44 +1563,35 @@ fseg_inode_get(
 	return(inode);
 }
 
-/**********************************************************************//**
-Gets the page number from the nth fragment page slot.
-@return page number, FIL_NULL if not in use */
-UNIV_INLINE
-ulint
-fseg_get_nth_frag_page_no(
-/*======================*/
-	fseg_inode_t*	inode,	/*!< in: segment inode */
-	ulint		n,	/*!< in: slot index */
-	mtr_t*		mtr MY_ATTRIBUTE((unused)))
-				/*!< in/out: mini-transaction */
+/** Get the page number from the nth fragment page slot.
+@param inode  file segment inode
+@param n      slot index
+@return page number
+@retval FIL_NULL if not in use */
+static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
 {
-	ut_ad(inode && mtr);
+	ut_ad(inode);
 	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
-	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_SX_FIX));
 	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
 	return(mach_read_from_4(inode + FSEG_FRAG_ARR
 				+ n * FSEG_FRAG_SLOT_SIZE));
 }
 
-/**********************************************************************//**
-Sets the page number in the nth fragment page slot. */
-UNIV_INLINE
-void
-fseg_set_nth_frag_page_no(
-/*======================*/
-	fseg_inode_t*	inode,	/*!< in: segment inode */
-	ulint		n,	/*!< in: slot index */
-	ulint		page_no,/*!< in: page number to set */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+/** Set the page number in the nth fragment page slot.
+@param[in,out]  inode   segment inode
+@param[in,out]  iblock  segment inode page
+@param[in]      n       slot index
+@param[in]      page_no page number to set
+@param[in,out]  mtr     mini-transaction */
+inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
+                                      ulint n, ulint page_no, mtr_t *mtr)
 {
-	ut_ad(inode && mtr);
-	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
-	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+  ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+  ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
 
-	mlog_write_ulint(inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
-			 page_no, MLOG_4BYTES, mtr);
+  mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+                page_no);
 }
 
 /**********************************************************************//**
@@ -1745,16 +1601,13 @@ static
 ulint
 fseg_find_free_frag_page_slot(
 /*==========================*/
-	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	fseg_inode_t*	inode)	/*!< in: segment inode */
 {
 	ulint	i;
 	ulint	page_no;
 
-	ut_ad(inode && mtr);
-
 	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
-		page_no = fseg_get_nth_frag_page_no(inode, i, mtr);
+		page_no = fseg_get_nth_frag_page_no(inode, i);
 
 		if (page_no == FIL_NULL) {
 
@@ -1772,17 +1625,14 @@ static
 ulint
 fseg_find_last_used_frag_page_slot(
 /*===============================*/
-	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	fseg_inode_t*	inode)	/*!< in: segment inode */
 {
 	ulint	i;
 	ulint	page_no;
 
-	ut_ad(inode && mtr);
-
 	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
 		page_no = fseg_get_nth_frag_page_no(
-			inode, FSEG_FRAG_ARR_N_SLOTS - i - 1, mtr);
+			inode, FSEG_FRAG_ARR_N_SLOTS - i - 1);
 
 		if (page_no != FIL_NULL) {
 
@@ -1793,23 +1643,16 @@ fseg_find_last_used_frag_page_slot(
 	return(ULINT_UNDEFINED);
 }
 
-/**********************************************************************//**
-Calculates reserved fragment page slots.
+/** Calculate reserved fragment page slots.
+@param inode  file segment inode
 @return number of fragment pages */
-static
-ulint
-fseg_get_n_frag_pages(
-/*==================*/
-	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode)
 {
 	ulint	i;
 	ulint	count	= 0;
 
-	ut_ad(inode && mtr);
-
 	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
-		if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i, mtr)) {
+		if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) {
 			count++;
 		}
 	}
@@ -1830,11 +1673,9 @@ buf_block_t*
 fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
             bool has_done_reservation, buf_block_t *block)
 {
-	fsp_header_t*	space_header;
 	fseg_inode_t*	inode;
 	ib_id_t		seg_id;
-	fseg_header_t*	header	= 0; /* remove warning */
-	ulint		n_reserved;
+	uint32_t	n_reserved;
 
 	DBUG_ENTER("fseg_create");
 
@@ -1847,12 +1688,10 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
 	ut_d(space->modify_check(*mtr));
 
 	if (block) {
-		header = byte_offset + buf_block_get_frame(block);
-
-		ut_ad(block->page.id.space() == space->id);
+		ut_ad(block->page.id().space() == space->id);
 
 		if (!space->full_crc32()) {
-			fil_block_check_type(*block, block->page.id
+			fil_block_check_type(*block, block->page.id()
 					     == page_id_t(TRX_SYS_SPACE,
 							  TRX_SYS_PAGE_NO)
 					     ? FIL_PAGE_TYPE_TRX_SYS
@@ -1867,9 +1706,10 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
 		DBUG_RETURN(NULL);
 	}
 
-	space_header = fsp_get_space_header(space, mtr);
+	buf_block_t* header = fsp_get_header(space, mtr);
+	buf_block_t* iblock;
 
-	inode = fsp_alloc_seg_inode(space, space_header, mtr);
+	inode = fsp_alloc_seg_inode(space, header, &iblock, mtr);
 
 	if (inode == NULL) {
 		goto funct_exit;
@@ -1878,55 +1718,58 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
 	/* Read the next segment id from space header and increment the
 	value in space header */
 
-	seg_id = mach_read_from_8(space_header + FSP_SEG_ID);
+	seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID
+				  + header->frame);
 
-	mlog_write_ull(space_header + FSP_SEG_ID, seg_id + 1, mtr);
-	mlog_write_ull(inode + FSEG_ID, seg_id, mtr);
+	mtr->write<8>(*header, FSP_HEADER_OFFSET + FSP_SEG_ID + header->frame,
+		      seg_id + 1);
+	mtr->write<8>(*iblock, inode + FSEG_ID, seg_id);
 	ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED));
 
-	flst_init(inode + FSEG_FREE, mtr);
-	flst_init(inode + FSEG_NOT_FULL, mtr);
-	flst_init(inode + FSEG_FULL, mtr);
+	flst_init(*iblock, inode + FSEG_FREE, mtr);
+	flst_init(*iblock, inode + FSEG_NOT_FULL, mtr);
+	flst_init(*iblock, inode + FSEG_FULL, mtr);
 
-	mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE,
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE);
 	compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4);
 	compile_time_assert(FIL_NULL == 0xffffffff);
-	mlog_memset(inode + FSEG_FRAG_ARR,
-		    FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff, mtr);
+	mtr->memset(iblock, uint16_t(inode - iblock->frame) + FSEG_FRAG_ARR,
+		    FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff);
 
 	if (!block) {
-		block = fseg_alloc_free_page_low(space, inode, 0, FSP_UP,
-						 mtr, mtr
+		block = fseg_alloc_free_page_low(space,
+						 inode, iblock, 0, FSP_UP,
 #ifdef UNIV_DEBUG
-						 , has_done_reservation
+						 has_done_reservation,
 #endif /* UNIV_DEBUG */
-						 );
+						 mtr, mtr);
 
 		/* The allocation cannot fail if we have already reserved a
 		space for the page. */
 		ut_ad(!has_done_reservation || block != NULL);
 
 		if (block == NULL) {
-			fsp_free_seg_inode(space, inode, mtr);
+			fsp_free_seg_inode(space, inode, iblock, mtr);
 			goto funct_exit;
 		}
 
-		ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
-
-		header = byte_offset + buf_block_get_frame(block);
-		mlog_write_ulint(buf_block_get_frame(block) + FIL_PAGE_TYPE,
-				 FIL_PAGE_TYPE_SYS, MLOG_2BYTES, mtr);
+		ut_d(const auto x = rw_lock_get_x_lock_count(&block->lock));
+		ut_ad(x > 0);
+		ut_ad(x == 1 || space->is_being_truncated);
+		ut_ad(x <= 2);
+		ut_ad(!fil_page_get_type(block->frame));
+		mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+			      FIL_PAGE_TYPE_SYS);
 	}
 
-	mlog_write_ulint(header + FSEG_HDR_OFFSET,
-			 page_offset(inode), MLOG_2BYTES, mtr);
+	mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET
+		      + block->frame, page_offset(inode));
 
-	mlog_write_ulint(header + FSEG_HDR_PAGE_NO,
-			 page_get_page_no(page_align(inode)),
-			 MLOG_4BYTES, mtr);
+	mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO
+		      + block->frame, iblock->page.id().page_no());
 
-	mlog_write_ulint(header + FSEG_HDR_SPACE, space->id, MLOG_4BYTES, mtr);
+	mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE
+				       + block->frame, space->id);
 
 funct_exit:
 	if (!has_done_reservation) {
@@ -1944,70 +1787,55 @@ static
 ulint
 fseg_n_reserved_pages_low(
 /*======================*/
-	fseg_inode_t*	inode,	/*!< in: segment inode */
-	ulint*		used,	/*!< out: number of pages used (not
+	const fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint*		used)	/*!< out: number of pages used (not
 				more than reserved) */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	ulint	ret;
-
-	ut_ad(inode && used && mtr);
-	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_SX_FIX));
-
 	*used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)
 		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL)
-		+ fseg_get_n_frag_pages(inode, mtr);
+		+ fseg_get_n_frag_pages(inode);
 
-	ret = fseg_get_n_frag_pages(inode, mtr)
+	return fseg_get_n_frag_pages(inode)
 		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE)
 		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL)
 		+ FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL);
-
-	return(ret);
 }
 
-/**********************************************************************//**
-Calculates the number of pages reserved by a segment, and how many pages are
-currently used.
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in]      block   buffer block containing the file segment header
+@param[in]      header  file segment header
+@param[out]     used    number of pages that are used (not more than reserved)
+@param[in,out]  mtr     mini-transaction
 @return number of reserved pages */
-ulint
-fseg_n_reserved_pages(
-/*==================*/
-	fseg_header_t*	header,	/*!< in: segment header */
-	ulint*		used,	/*!< out: number of pages used (<= reserved) */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+                            const fseg_header_t *header, ulint *used,
+                            mtr_t *mtr)
 {
-	ulint		ret;
-	fseg_inode_t*	inode;
-	ulint		space_id;
-	fil_space_t*	space;
-
-	space_id = page_get_space_id(page_align(header));
-	space = mtr_x_lock_space(space_id, mtr);
-
-	inode = fseg_inode_get(header, space_id, space->zip_size(), mtr);
-
-	ret = fseg_n_reserved_pages_low(inode, used, mtr);
-
-	return(ret);
+  ut_ad(page_align(header) == block.frame);
+  return fseg_n_reserved_pages_low(fseg_inode_get(header,
+                                                  block.page.id().space(),
+                                                  block.zip_size(), mtr),
+                                   used);
 }
 
 /** Tries to fill the free list of a segment with consecutive free extents.
 This happens if the segment is big enough to allow extents in the free list,
 the free list is empty, and the extents can be allocated consecutively from
 the hint onward.
-@param[in]	inode		segment inode
-@param[in]	space		tablespace
-@param[in]	hint		hint which extent would be good as the first
-extent
-@param[in,out]	mtr		mini-transaction */
+@param[in,out]	inode	segment inode
+@param[in,out]	iblock	segment inode page
+@param[in]	space	tablespace
+@param[in]	hint	hint which extent would be good as the first extent
+@param[in,out]	mtr	mini-transaction */
 static
 void
 fseg_fill_free_list(
-	fseg_inode_t*		inode,
-	fil_space_t*		space,
-	ulint			hint,
-	mtr_t*			mtr)
+	fseg_inode_t*	inode,
+	buf_block_t*	iblock,
+	fil_space_t*	space,
+	uint32_t	hint,
+	mtr_t*		mtr)
 {
 	xdes_t*	descr;
 	ulint	i;
@@ -2019,7 +1847,7 @@ fseg_fill_free_list(
 	ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
 	ut_d(space->modify_check(*mtr));
 
-	reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+	reserved = fseg_n_reserved_pages_low(inode, &used);
 
 	if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) {
 
@@ -2035,26 +1863,28 @@ fseg_fill_free_list(
 	}
 
 	for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) {
-		descr = xdes_get_descriptor(space, hint, mtr);
-
-		if ((descr == NULL)
-		    || (XDES_FREE != xdes_get_state(descr, mtr))) {
+		buf_block_t* xdes;
+		descr = xdes_get_descriptor(space, hint, &xdes, mtr);
 
+		if (!descr || (XDES_FREE != xdes_get_state(descr))) {
 			/* We cannot allocate the desired extent: stop */
-
 			return;
 		}
 
-		descr = fsp_alloc_free_extent(space, hint, mtr);
+		descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
 
-		xdes_set_state(descr, XDES_FSEG, mtr);
+		xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
 
 		seg_id = mach_read_from_8(inode + FSEG_ID);
 		ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
 		      == FSEG_MAGIC_N_VALUE);
-		mlog_write_ull(descr + XDES_ID, seg_id, mtr);
+		mtr->write<8>(*xdes, descr + XDES_ID, seg_id);
 
-		flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+		flst_add_last(iblock,
+			      static_cast<uint16_t>(inode - iblock->frame
+						    + FSEG_FREE), xdes,
+			      static_cast<uint16_t>(descr - xdes->frame
+						    + XDES_FLST_NODE), mtr);
 		hint += FSP_EXTENT_SIZE;
 	}
 }
@@ -2063,17 +1893,18 @@ fseg_fill_free_list(
 the segment, then tries to allocate from the space free list.
 NOTE that the extent returned still resides in the segment free list, it is
 not yet taken off it!
-@param[in]	inode		segment inode
+@param[in,out]	inode		segment inode
+@param[in,out]	iblock		segment inode page
+@param[out]	xdes		extent descriptor page
 @param[in,out]	space		tablespace
 @param[in,out]	mtr		mini-transaction
-@retval NULL	if no page could be allocated
-@retval block	rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block	(not allocated or initialized) otherwise */
+@retval NULL	if no extent could be allocated */
 static
 xdes_t*
 fseg_alloc_free_extent(
 	fseg_inode_t*		inode,
+	buf_block_t*		iblock,
+	buf_block_t**		xdes,
 	fil_space_t*		space,
 	mtr_t*			mtr)
 {
@@ -2088,12 +1919,12 @@ fseg_alloc_free_extent(
 	if (flst_get_len(inode + FSEG_FREE) > 0) {
 		/* Segment free list is not empty, allocate from it */
 
-		first = flst_get_first(inode + FSEG_FREE, mtr);
+		first = flst_get_first(inode + FSEG_FREE);
 
-		descr = xdes_lst_get_descriptor(space, first, mtr);
+		descr = xdes_lst_get_descriptor(space, first, xdes, mtr);
 	} else {
 		/* Segment free list was empty, allocate from space */
-		descr = fsp_alloc_free_extent(space, 0, mtr);
+		descr = fsp_alloc_free_extent(space, 0, xdes, mtr);
 
 		if (descr == NULL) {
 
@@ -2102,12 +1933,17 @@ fseg_alloc_free_extent(
 
 		seg_id = mach_read_from_8(inode + FSEG_ID);
 
-		xdes_set_state(descr, XDES_FSEG, mtr);
-		mlog_write_ull(descr + XDES_ID, seg_id, mtr);
-		flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+		xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
+		mtr->write<8,mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID,
+					       seg_id);
+		flst_add_last(iblock,
+			      static_cast<uint16_t>(inode - iblock->frame
+						    + FSEG_FREE), *xdes,
+			      static_cast<uint16_t>(descr - (*xdes)->frame
+						    + XDES_FLST_NODE), mtr);
 
 		/* Try to fill the segment free list */
-		fseg_fill_free_list(inode, space,
+		fseg_fill_free_list(inode, iblock, space,
 				    xdes_get_offset(descr) + FSP_EXTENT_SIZE,
 				    mtr);
 	}
@@ -2120,42 +1956,38 @@ This function implements the intelligent allocation strategy which tries to
 minimize file space fragmentation.
 @param[in,out]	space			tablespace
 @param[in,out]	seg_inode		segment inode
+@param[in,out]	iblock			segment inode page
 @param[in]	hint			hint of which page would be desirable
 @param[in]	direction		if the new page is needed because of
 an index page split, and records are inserted there in order, into which
 direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
 @param[in,out]	mtr			mini-transaction
 @param[in,out]	init_mtr		mtr or another mini-transaction in
-which the page should be initialized. If init_mtr != mtr, but the page is
-already latched in mtr, do not initialize the page
-@param[in]	has_done_reservation	TRUE if the space has already been
-reserved, in this case we will never return NULL
-@retval NULL	if no page could be allocated
-@retval block	rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block	(not allocated or initialized) otherwise */
+which the page should be initialized.
+@retval NULL	if no page could be allocated */
 static
 buf_block_t*
 fseg_alloc_free_page_low(
 	fil_space_t*		space,
 	fseg_inode_t*		seg_inode,
-	ulint			hint,
+	buf_block_t*		iblock,
+	uint32_t		hint,
 	byte			direction,
-	mtr_t*			mtr,
-	mtr_t*			init_mtr
 #ifdef UNIV_DEBUG
-	, ibool			has_done_reservation
+	bool			has_done_reservation,
+	/*!< whether the space has already been reserved */
 #endif /* UNIV_DEBUG */
-)
+	mtr_t*			mtr,
+	mtr_t*			init_mtr)
 {
-	fsp_header_t*	space_header;
 	ib_id_t		seg_id;
 	ulint		used;
 	ulint		reserved;
 	xdes_t*		descr;		/*!< extent of the hinted page */
-	ulint		ret_page;	/*!< the allocated page offset, FIL_NULL
+	uint32_t	ret_page;	/*!< the allocated page offset, FIL_NULL
 					if could not be allocated */
 	xdes_t*		ret_descr;	/*!< the extent of the allocated page */
+	buf_block_t*	xdes;
 	ulint		n;
 	const ulint	space_id	= space->id;
 
@@ -2169,26 +2001,25 @@ fseg_alloc_free_page_low(
 	ut_d(space->modify_check(*mtr));
 	ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE);
 
-	reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr);
+	reserved = fseg_n_reserved_pages_low(seg_inode, &used);
 
-	space_header = fsp_get_space_header(space, mtr);
+	buf_block_t* header = fsp_get_header(space, mtr);
 
-	descr = xdes_get_descriptor_with_space_hdr(space_header, space,
-						   hint, mtr);
+	descr = xdes_get_descriptor_with_space_hdr(header, space, hint,
+						   &xdes, mtr);
 	if (descr == NULL) {
 		/* Hint outside space or too high above free limit: reset
 		hint */
 		/* The file space header page is always allocated. */
 		hint = 0;
-		descr = xdes_get_descriptor(space, hint, mtr);
+		descr = xdes_get_descriptor(space, hint, &xdes, mtr);
 	}
 
 	/* In the big if-else below we look for ret_page and ret_descr */
 	/*-------------------------------------------------------------*/
-	if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+	if ((xdes_get_state(descr) == XDES_FSEG)
 	    && mach_read_from_8(descr + XDES_ID) == seg_id
-	    && (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
-				 hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
+	    && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) {
 take_hinted_page:
 		/* 1. We can take the hinted page
 		=================================*/
@@ -2199,7 +2030,7 @@ take_hinted_page:
 		we would have got (descr == NULL) above and reset the hint. */
 		goto got_hinted_page;
 		/*-----------------------------------------------------------*/
-	} else if (xdes_get_state(descr, mtr) == XDES_FREE
+	} else if (xdes_get_state(descr) == XDES_FREE
 		   && reserved - used < reserved / FSEG_FILLFACTOR
 		   && used >= FSEG_FRAG_LIMIT) {
 
@@ -2207,26 +2038,30 @@ take_hinted_page:
 		=========================================================
 		the hinted page
 		===============*/
-		ret_descr = fsp_alloc_free_extent(space, hint, mtr);
+		ret_descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
 
 		ut_a(ret_descr == descr);
 
-		xdes_set_state(ret_descr, XDES_FSEG, mtr);
-		mlog_write_ull(ret_descr + XDES_ID, seg_id, mtr);
-		flst_add_last(seg_inode + FSEG_FREE,
-			      ret_descr + XDES_FLST_NODE, mtr);
+		xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
+		mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
+					       seg_id);
+		flst_add_last(iblock,
+			      static_cast<uint16_t>(seg_inode - iblock->frame
+						    + FSEG_FREE), xdes,
+			      static_cast<uint16_t>(ret_descr - xdes->frame
+						    + XDES_FLST_NODE), mtr);
 
 		/* Try to fill the segment free list */
-		fseg_fill_free_list(seg_inode, space,
+		fseg_fill_free_list(seg_inode, iblock, space,
 				    hint + FSP_EXTENT_SIZE, mtr);
 		goto take_hinted_page;
 		/*-----------------------------------------------------------*/
 	} else if ((direction != FSP_NO_DIR)
 		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
 		   && (used >= FSEG_FRAG_LIMIT)
-		   && (!!(ret_descr
-			  = fseg_alloc_free_extent(seg_inode, space, mtr)))) {
-
+		   && !!(ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+							    &xdes, space,
+							    mtr))) {
 		/* 3. We take any free extent (which was already assigned above
 		===============================================================
 		in the if-condition to ret_descr) and take the lowest or
@@ -2240,9 +2075,9 @@ take_hinted_page:
 		}
 		ut_ad(!has_done_reservation || ret_page != FIL_NULL);
 		/*-----------------------------------------------------------*/
-	} else if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+	} else if ((xdes_get_state(descr) == XDES_FSEG)
 		   && mach_read_from_8(descr + XDES_ID) == seg_id
-		   && (!xdes_is_full(descr, mtr))) {
+		   && (!xdes_is_full(descr))) {
 
 		/* 4. We can take the page from the same extent as the
 		======================================================
@@ -2251,10 +2086,12 @@ take_hinted_page:
 		segment)
 		========*/
 		ret_descr = descr;
-		ret_page = xdes_get_offset(ret_descr)
-			+ xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
-					hint % FSP_EXTENT_SIZE, mtr);
-		ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+		ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
+		if (ret_page == FIL_NULL) {
+			ut_ad(!has_done_reservation);
+		} else {
+			ret_page += xdes_get_offset(ret_descr);
+		}
 		/*-----------------------------------------------------------*/
 	} else if (reserved - used > 0) {
 		/* 5. We take any unused page from the segment
@@ -2262,20 +2099,21 @@ take_hinted_page:
 		fil_addr_t	first;
 
 		if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
-			first = flst_get_first(seg_inode + FSEG_NOT_FULL,
-					       mtr);
+			first = flst_get_first(seg_inode + FSEG_NOT_FULL);
 		} else if (flst_get_len(seg_inode + FSEG_FREE) > 0) {
-			first = flst_get_first(seg_inode + FSEG_FREE, mtr);
+			first = flst_get_first(seg_inode + FSEG_FREE);
 		} else {
 			ut_ad(!has_done_reservation);
 			return(NULL);
 		}
 
-		ret_descr = xdes_lst_get_descriptor(space, first, mtr);
-		ret_page = xdes_get_offset(ret_descr)
-			+ xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
-					0, mtr);
-		ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+		ret_descr = xdes_lst_get_descriptor(space, first, &xdes, mtr);
+		ret_page = xdes_find_free(ret_descr);
+		if (ret_page == FIL_NULL) {
+			ut_ad(!has_done_reservation);
+		} else {
+			ret_page += xdes_get_offset(ret_descr);
+		}
 		/*-----------------------------------------------------------*/
 	} else if (used < FSEG_FRAG_LIMIT) {
 		/* 6. We allocate an individual page from the space
@@ -2288,12 +2126,12 @@ take_hinted_page:
 		if (block) {
 			/* Put the page in the fragment page array of the
 			segment */
-			n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+			n = fseg_find_free_frag_page_slot(seg_inode);
 			ut_a(n != ULINT_UNDEFINED);
 
 			fseg_set_nth_frag_page_no(
-				seg_inode, n, block->page.id.page_no(),
-				mtr);
+				seg_inode, iblock, n,
+				block->page.id().page_no(), mtr);
 		}
 
 		/* fsp_alloc_free_page() invoked fsp_init_file_page()
@@ -2303,7 +2141,8 @@ take_hinted_page:
 	} else {
 		/* 7. We allocate a new extent and take its first page
 		======================================================*/
-		ret_descr = fseg_alloc_free_extent(seg_inode, space, mtr);
+		ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes,
+						   space, mtr);
 
 		if (ret_descr == NULL) {
 			ret_page = FIL_NULL;
@@ -2336,7 +2175,7 @@ take_hinted_page:
 		}
 
 		if (!fsp_try_extend_data_file_with_pages(
-			    space, ret_page, space_header, mtr)) {
+			    space, ret_page, header, mtr)) {
 			/* No disk space left */
 			ut_ad(!has_done_reservation);
 			return(NULL);
@@ -2351,54 +2190,50 @@ got_hinted_page:
 		The extent is still in the appropriate list (FSEG_NOT_FULL
 		or FSEG_FREE), and the page is not yet marked as used. */
 
-		ut_ad(xdes_get_descriptor(space, ret_page, mtr) == ret_descr);
-
-		ut_ad(xdes_mtr_get_bit(
-				ret_descr, XDES_FREE_BIT,
-				ret_page % FSP_EXTENT_SIZE, mtr));
+		ut_d(buf_block_t* xxdes);
+		ut_ad(xdes_get_descriptor(space, ret_page, &xxdes, mtr)
+		      == ret_descr);
+		ut_ad(xdes == xxdes);
+		ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE));
 
-		fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr);
+		fseg_mark_page_used(seg_inode, iblock, ret_page, ret_descr,
+				    xdes, mtr);
 	}
 
-	return fsp_page_create(space, ret_page, mtr, init_mtr);
+	return fsp_page_create(space, ret_page, init_mtr);
 }
 
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
 fragmentation.
-@retval NULL if no page could be allocated
-@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block (not allocated or initialized) otherwise */
+@retval NULL if no page could be allocated */
 buf_block_t*
 fseg_alloc_free_page_general(
 /*=========================*/
 	fseg_header_t*	seg_header,/*!< in/out: segment header */
-	ulint		hint,	/*!< in: hint of which page would be
+	uint32_t	hint,	/*!< in: hint of which page would be
 				desirable */
 	byte		direction,/*!< in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
 				FSP_UP, FSP_NO_DIR */
-	ibool		has_done_reservation, /*!< in: TRUE if the caller has
+	bool		has_done_reservation, /*!< in: true if the caller has
 				already done the reservation for the page
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
-				in which the page should be initialized.
-				If init_mtr!=mtr, but the page is already
-				latched in mtr, do not initialize the page. */
+				in which the page should be initialized. */
 {
 	fseg_inode_t*	inode;
 	ulint		space_id;
 	fil_space_t*	space;
 	buf_block_t*	iblock;
 	buf_block_t*	block;
-	ulint		n_reserved;
+	uint32_t	n_reserved;
 
 	space_id = page_get_space_id(page_align(seg_header));
 	space = mtr_x_lock_space(space_id, mtr);
@@ -2415,12 +2250,11 @@ fseg_alloc_free_page_general(
 	}
 
 	block = fseg_alloc_free_page_low(space,
-					 inode, hint, direction,
-					 mtr, init_mtr
+					 inode, iblock, hint, direction,
 #ifdef UNIV_DEBUG
-					 , has_done_reservation
+					 has_done_reservation,
 #endif /* UNIV_DEBUG */
-					 );
+					 mtr, init_mtr);
 
 	/* The allocation cannot fail if we have already reserved a
 	space for the page. */
@@ -2438,38 +2272,37 @@ of a single-table tablespace, and they are also physically initialized to
 the data file. That is we have already extended the data file so that those
 pages are inside the data file. If not, this function extends the tablespace
 with pages.
-@param[in,out]	space		tablespace
-@param[in,out]	space_header	tablespace header, x-latched
-@param[in]	size		size of the tablespace in pages,
-must be less than FSP_EXTENT_SIZE
-@param[in,out]	mtr		mini-transaction
-@param[in]	n_pages		number of pages to reserve
+@param[in,out]	space	tablespace
+@param[in,out]	header	tablespace header, x-latched
+@param[in]	size	tablespace size in pages, less than FSP_EXTENT_SIZE
+@param[in,out]	mtr	mini-transaction
+@param[in]	n_pages	number of pages to reserve
 @return true if there were at least n_pages free pages, or we were able
 to extend */
 static
 bool
 fsp_reserve_free_pages(
 	fil_space_t*	space,
-	fsp_header_t*	space_header,
+	buf_block_t*	header,
 	ulint		size,
 	mtr_t*		mtr,
-	ulint		n_pages)
+	uint32_t	n_pages)
 {
 	xdes_t*	descr;
-	ulint	n_used;
 
 	ut_a(!is_system_tablespace(space->id));
 	ut_a(size < FSP_EXTENT_SIZE);
 
-	descr = xdes_get_descriptor_with_space_hdr(
-		space_header, space, 0, mtr);
-	n_used = xdes_get_n_used(descr, mtr);
+	buf_block_t* xdes;
+	descr = xdes_get_descriptor_with_space_hdr(header, space, 0, &xdes,
+						   mtr);
+	uint32_t n_used = xdes_get_n_used(descr);
 
 	ut_a(n_used <= size);
 
 	return(size >= n_used + n_pages
 	       || fsp_try_extend_data_file_with_pages(
-		       space, n_used + n_pages - 1, space_header, mtr));
+		       space, n_used + n_pages - 1, header, mtr));
 }
 
 /** Reserves free pages from a tablespace. All mini-transactions which may
@@ -2512,62 +2345,63 @@ free pages available.
 @return true if we were able to make the reservation */
 bool
 fsp_reserve_free_extents(
-	ulint*		n_reserved,
+	uint32_t*	n_reserved,
 	fil_space_t*	space,
-	ulint		n_ext,
+	uint32_t	n_ext,
 	fsp_reserve_t	alloc_type,
 	mtr_t*		mtr,
-	ulint		n_pages)
+	uint32_t	n_pages)
 {
-	fsp_header_t*	space_header;
-	ulint		n_free_list_ext;
-	ulint		free_limit;
-	ulint		size;
-	ulint		n_free;
-	ulint		n_free_up;
 	ulint		reserve;
 
 	ut_ad(mtr);
 	*n_reserved = n_ext;
 
+	const uint32_t extent_size = FSP_EXTENT_SIZE;
+
 	mtr_x_lock_space(space, mtr);
-	const ulint physical_size = space->physical_size();
+	const unsigned physical_size = space->physical_size();
 
-	space_header = fsp_get_space_header(space, mtr);
+	buf_block_t* header = fsp_get_header(space, mtr);
 try_again:
-	size = mach_read_from_4(space_header + FSP_SIZE);
+	uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+					 + header->frame);
 	ut_ad(size == space->size_in_header);
 
-	if (size < FSP_EXTENT_SIZE && n_pages < FSP_EXTENT_SIZE / 2) {
+	if (size < extent_size && n_pages < extent_size / 2) {
 		/* Use different rules for small single-table tablespaces */
 		*n_reserved = 0;
-		return(fsp_reserve_free_pages(space, space_header, size,
+		return(fsp_reserve_free_pages(space, header, size,
 					      mtr, n_pages));
 	}
 
-	n_free_list_ext = flst_get_len(space_header + FSP_FREE);
+	uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+						+ header->frame);
 	ut_ad(space->free_len == n_free_list_ext);
 
-	free_limit = mach_read_from_4(space_header + FSP_FREE_LIMIT);
+	uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET
+					       + FSP_FREE_LIMIT
+					       + header->frame);
 	ut_ad(space->free_limit == free_limit);
 
 	/* Below we play safe when counting free extents above the free limit:
 	some of them will contain extent descriptor pages, and therefore
 	will not be free extents */
 
+	uint32_t n_free_up;
+
 	if (size >= free_limit) {
-		n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+		n_free_up = (size - free_limit) / extent_size;
+		if (n_free_up) {
+			n_free_up--;
+			n_free_up -= n_free_up / (physical_size / extent_size);
+		}
 	} else {
 		ut_ad(alloc_type == FSP_BLOB);
 		n_free_up = 0;
 	}
 
-	if (n_free_up > 0) {
-		n_free_up--;
-		n_free_up -= n_free_up / (physical_size / FSP_EXTENT_SIZE);
-	}
-
-	n_free = n_free_list_ext + n_free_up;
+	uint32_t n_free = n_free_list_ext + n_free_up;
 
 	switch (alloc_type) {
 	case FSP_NORMAL:
@@ -2575,7 +2409,7 @@ try_again:
 		and 1 extent + 0.5 % to cleaning operations; NOTE: this source
 		code is duplicated in the function below! */
 
-		reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+		reserve = 2 + ((size / extent_size) * 2) / 200;
 
 		if (n_free <= reserve + n_ext) {
 
@@ -2585,7 +2419,7 @@ try_again:
 	case FSP_UNDO:
 		/* We reserve 0.5 % of the space size to cleaning operations */
 
-		reserve = 1 + ((size / FSP_EXTENT_SIZE) * 1) / 200;
+		reserve = 1 + ((size / extent_size) * 1) / 200;
 
 		if (n_free <= reserve + n_ext) {
 
@@ -2604,84 +2438,27 @@ try_again:
 		return(true);
 	}
 try_to_extend:
-	if (fsp_try_extend_data_file(space, space_header, mtr)) {
+	if (fsp_try_extend_data_file(space, header, mtr)) {
 		goto try_again;
 	}
 
 	return(false);
 }
 
-/********************************************************************//**
-Marks a page used. The page must reside within the extents of the given
-segment. */
-static MY_ATTRIBUTE((nonnull))
-void
-fseg_mark_page_used(
-/*================*/
-	fseg_inode_t*	seg_inode,/*!< in: segment inode */
-	ulint		page,	/*!< in: page offset */
-	xdes_t*		descr,  /*!< in: extent descriptor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-{
-	ulint	not_full_n_used;
-
-	ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE);
-	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
-	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
-	      == FSEG_MAGIC_N_VALUE);
-	ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
-
-	if (xdes_is_free(descr, mtr)) {
-		/* We move the extent from the free list to the
-		NOT_FULL list */
-		flst_remove(seg_inode + FSEG_FREE, descr + XDES_FLST_NODE,
-			    mtr);
-		flst_add_last(seg_inode + FSEG_NOT_FULL,
-			      descr + XDES_FLST_NODE, mtr);
-	}
-
-	ut_ad(xdes_mtr_get_bit(
-			descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr));
-
-	/* We mark the page as used */
-	xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr);
-
-	not_full_n_used = mach_read_from_4(seg_inode + FSEG_NOT_FULL_N_USED);
-	not_full_n_used++;
-	mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, not_full_n_used,
-			 MLOG_4BYTES, mtr);
-	if (xdes_is_full(descr, mtr)) {
-		/* We move the extent from the NOT_FULL list to the
-		FULL list */
-		flst_remove(seg_inode + FSEG_NOT_FULL,
-			    descr + XDES_FLST_NODE, mtr);
-		flst_add_last(seg_inode + FSEG_FULL,
-			      descr + XDES_FLST_NODE, mtr);
-
-		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
-				 not_full_n_used - FSP_EXTENT_SIZE,
-				 MLOG_4BYTES, mtr);
-	}
-}
-
 /** Frees a single page of a segment.
 @param[in]	seg_inode	segment inode
 @param[in,out]	space		tablespace
 @param[in]	offset		page number
-@param[in]	log		whether to write MLOG_INIT_FREE_PAGE record
 @param[in,out]	mtr		mini-transaction */
 static
 void
 fseg_free_page_low(
 	fseg_inode_t*		seg_inode,
+	buf_block_t*		iblock,
 	fil_space_t*		space,
 	page_no_t		offset,
-	bool			log,
 	mtr_t*			mtr)
 {
-	xdes_t*	descr;
-	ulint	not_full_n_used;
-	ulint	state;
 	ib_id_t	descr_id;
 	ib_id_t	seg_id;
 
@@ -2690,12 +2467,15 @@ fseg_free_page_low(
 	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
 	      == FSEG_MAGIC_N_VALUE);
 	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+	ut_ad(iblock->frame == page_align(seg_inode));
 	ut_d(space->modify_check(*mtr));
 
-	descr = xdes_get_descriptor(space, offset, mtr);
+	const uint32_t extent_size = FSP_EXTENT_SIZE;
+	ut_ad(ut_is_2pow(extent_size));
+	buf_block_t* xdes;
+	xdes_t* descr = xdes_get_descriptor(space, offset, &xdes, mtr);
 
-	if (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
-			     offset % FSP_EXTENT_SIZE, mtr)) {
+	if (xdes_is_free(descr, offset & (extent_size - 1))) {
 		ib::fatal() << "InnoDB is trying to free page "
 			<< page_id_t(space->id, offset)
 			<< " though it is already marked as free in the"
@@ -2705,23 +2485,22 @@ fseg_free_page_low(
 			<< FORCE_RECOVERY_MSG;
 	}
 
-	state = xdes_get_state(descr, mtr);
-
-	if (state != XDES_FSEG) {
+	if (xdes_get_state(descr) != XDES_FSEG) {
 		/* The page is in the fragment pages of the segment */
 		for (ulint i = 0;; i++) {
-			if (fseg_get_nth_frag_page_no(seg_inode, i, mtr)
+			if (fseg_get_nth_frag_page_no(seg_inode, i)
 			    != offset) {
 				continue;
 			}
 
 			compile_time_assert(FIL_NULL == 0xffffffff);
-			mlog_memset(seg_inode + FSEG_FRAG_ARR
-				    + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff, mtr);
+			mtr->memset(iblock, uint16_t(seg_inode - iblock->frame)
+				    + FSEG_FRAG_ARR
+				    + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
 			break;
 		}
 
-		fsp_free_page(space, offset, log, mtr);
+		fsp_free_page(space, offset, mtr);
 		return;
 	}
 
@@ -2745,48 +2524,51 @@ fseg_free_page_low(
 			<< FORCE_RECOVERY_MSG;
 	}
 
-	not_full_n_used = mach_read_from_4(seg_inode + FSEG_NOT_FULL_N_USED);
-	if (xdes_is_full(descr, mtr)) {
+	byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
+	uint32_t not_full_n_used = mach_read_from_4(p_not_full);
+	const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
+	const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+
+	if (xdes_is_full(descr)) {
 		/* The fragment is full: move it to another list */
-		flst_remove(seg_inode + FSEG_FULL,
-			    descr + XDES_FLST_NODE, mtr);
-		flst_add_last(seg_inode + FSEG_NOT_FULL,
-			      descr + XDES_FLST_NODE, mtr);
-		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
-				 not_full_n_used + FSP_EXTENT_SIZE - 1,
-				 MLOG_4BYTES, mtr);
+		flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset),
+			    xdes, xoffset, mtr);
+		flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+							    + ioffset),
+			      xdes, xoffset, mtr);
+		not_full_n_used += extent_size - 1;
 	} else {
 		ut_a(not_full_n_used > 0);
-		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
-				 not_full_n_used - 1, MLOG_4BYTES, mtr);
+		not_full_n_used--;
 	}
 
-	const ulint	bit = offset % FSP_EXTENT_SIZE;
+	mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+
+	const ulint	bit = offset & (extent_size - 1);
 
-	xdes_set_bit(descr, XDES_FREE_BIT, bit, TRUE, mtr);
-	/* xdes_init() should have set all XDES_CLEAN_BIT */
-	ut_ad(xdes_get_bit(descr, XDES_CLEAN_BIT, bit));
+	xdes_set_free<true>(*xdes, descr, bit, mtr);
 
-	if (xdes_is_free(descr, mtr)) {
+	if (!xdes_get_n_used(descr)) {
 		/* The extent has become free: free it to space */
-		flst_remove(seg_inode + FSEG_NOT_FULL,
-			    descr + XDES_FLST_NODE, mtr);
+		flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+							  + ioffset),
+			    xdes, xoffset, mtr);
 		fsp_free_extent(space, offset, mtr);
 	}
+
+	mtr->free(*space, static_cast<uint32_t>(offset));
 }
 
 /** Free a page in a file segment.
 @param[in,out]	seg_header	file segment header
 @param[in,out]	space		tablespace
 @param[in]	offset		page number
-@param[in]	log		whether to write MLOG_INIT_FREE_PAGE record
 @param[in,out]	mtr		mini-transaction */
 void
 fseg_free_page(
 	fseg_header_t*	seg_header,
 	fil_space_t*	space,
-	ulint		offset,
-	bool		log,
+	uint32_t	offset,
 	mtr_t*		mtr)
 {
 	DBUG_ENTER("fseg_free_page");
@@ -2804,9 +2586,7 @@ fseg_free_page(
 		fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
 	}
 
-	fseg_free_page_low(seg_inode, space, offset, log, mtr);
-
-	ut_d(buf_page_set_file_page_was_freed(page_id_t(space->id, offset)));
+	fseg_free_page_low(seg_inode, iblock, space, offset, mtr);
 
 	DBUG_VOID_RETURN;
 }
@@ -2824,14 +2604,13 @@ fseg_page_is_free(fil_space_t* space, unsigned page)
 							  page);
 
 	mtr.start();
-	mtr_s_lock_space(space, &mtr);
+	mtr_sx_lock_space(space, &mtr);
 
 	if (page >= space->free_limit || page >= space->size_in_header) {
 		is_free = true;
 	} else if (const xdes_t* descr = xdes_get_descriptor_const(
 			   space, dpage, page, &mtr)) {
-		is_free = xdes_get_bit(descr, XDES_FREE_BIT,
-				       page % FSP_EXTENT_SIZE);
+		is_free = xdes_is_free(descr, page % FSP_EXTENT_SIZE);
 	} else {
 		is_free = true;
 	}
@@ -2850,53 +2629,53 @@ static
 void
 fseg_free_extent(
 	fseg_inode_t*		seg_inode,
+	buf_block_t*		iblock,
 	fil_space_t*		space,
-	ulint			page,
+	uint32_t		page,
 	mtr_t*			mtr)
 {
-	xdes_t*	descr;
-	ulint	not_full_n_used;
-	ulint	descr_n_used;
 
 	ut_ad(mtr != NULL);
 
-	descr = xdes_get_descriptor(space, page, mtr);
+	buf_block_t* xdes;
+	xdes_t*	descr = xdes_get_descriptor(space, page, &xdes, mtr);
 
-	ut_a(xdes_get_state(descr, mtr) == XDES_FSEG);
+	ut_a(xdes_get_state(descr) == XDES_FSEG);
 	ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8));
 	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
 	      == FSEG_MAGIC_N_VALUE);
 	ut_d(space->modify_check(*mtr));
-	ut_d(ulint first_page_in_extent = page - (page % FSP_EXTENT_SIZE));
-
-	if (xdes_is_full(descr, mtr)) {
-		flst_remove(seg_inode + FSEG_FULL,
-			    descr + XDES_FLST_NODE, mtr);
-	} else if (xdes_is_free(descr, mtr)) {
-		flst_remove(seg_inode + FSEG_FREE,
-			    descr + XDES_FLST_NODE, mtr);
-	} else {
-		flst_remove(seg_inode + FSEG_NOT_FULL,
-			    descr + XDES_FLST_NODE, mtr);
+	const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
 
-		not_full_n_used = mach_read_from_4(FSEG_NOT_FULL_N_USED
-						   + seg_inode);
-		descr_n_used = xdes_get_n_used(descr, mtr);
+	const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
+	const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+
+	if (xdes_is_full(descr)) {
+		flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset),
+			    xdes, xoffset, mtr);
+	} else if (!xdes_get_n_used(descr)) {
+		flst_remove(iblock, static_cast<uint16_t>(FSEG_FREE + ioffset),
+			    xdes, xoffset, mtr);
+	} else {
+		flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+							  + ioffset),
+			    xdes, xoffset, mtr);
+		uint32_t not_full_n_used = mach_read_from_4(
+			FSEG_NOT_FULL_N_USED + seg_inode);
+		uint32_t descr_n_used = xdes_get_n_used(descr);
 		ut_a(not_full_n_used >= descr_n_used);
-		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
-				 not_full_n_used - descr_n_used,
-				 MLOG_4BYTES, mtr);
+		mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+			      not_full_n_used - descr_n_used);
 	}
 
 	fsp_free_extent(space, page, mtr);
 
-#ifdef UNIV_DEBUG
-	for (ulint i = 0; i < FSP_EXTENT_SIZE; i++) {
-
-		buf_page_set_file_page_was_freed(
-			page_id_t(space->id, first_page_in_extent + i));
+	for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+		if (!xdes_is_free(descr, i)) {
+			buf_page_free(space, first_page_in_extent + i, mtr,
+				      __FILE__, __LINE__);
+		}
 	}
-#endif /* UNIV_DEBUG */
 }
 
 /**********************************************************************//**
@@ -2914,27 +2693,22 @@ fseg_free_step(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		n;
-	ulint		page;
-	xdes_t*		descr;
 	fseg_inode_t*	inode;
-	ulint		space_id;
-	ulint		header_page;
 
 	DBUG_ENTER("fseg_free_step");
 
-	space_id = page_get_space_id(page_align(header));
-	header_page = page_get_page_no(page_align(header));
-
-	fil_space_t*		space = mtr_x_lock_space(space_id, mtr);
+	const uint32_t space_id = page_get_space_id(page_align(header));
+	const uint32_t header_page = page_get_page_no(page_align(header));
 
-	descr = xdes_get_descriptor(space, header_page, mtr);
+	fil_space_t* space = mtr_x_lock_space(space_id, mtr);
+	buf_block_t* xdes;
+	xdes_t* descr = xdes_get_descriptor(space, header_page, &xdes, mtr);
 
 	/* Check that the header resides on a page which has not been
 	freed yet */
 
-	ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT,
-			      header_page % FSP_EXTENT_SIZE, mtr) == FALSE);
-	buf_block_t*		iblock;
+	ut_a(!xdes_is_free(descr, header_page % FSP_EXTENT_SIZE));
+	buf_block_t* iblock;
 	const ulint zip_size = space->zip_size();
 	inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock);
 
@@ -2951,31 +2725,32 @@ fseg_free_step(
 
 	if (descr != NULL) {
 		/* Free the extent held by the segment */
-		page = xdes_get_offset(descr);
-		fseg_free_extent(inode, space, page, mtr);
+		fseg_free_extent(inode, iblock, space, xdes_get_offset(descr),
+				 mtr);
 		DBUG_RETURN(false);
 	}
 
 	/* Free a frag page */
-	n = fseg_find_last_used_frag_page_slot(inode, mtr);
+	n = fseg_find_last_used_frag_page_slot(inode);
 
 	if (n == ULINT_UNDEFINED) {
 		/* Freeing completed: free the segment inode */
-		fsp_free_seg_inode(space, inode, mtr);
+		fsp_free_seg_inode(space, inode, iblock, mtr);
 
 		DBUG_RETURN(true);
 	}
 
-	fseg_free_page_low(
-		inode, space,
-		fseg_get_nth_frag_page_no(inode, n, mtr),
-		true, mtr);
+	page_no_t page_no = fseg_get_nth_frag_page_no(inode, n);
+
+	fseg_free_page_low(inode, iblock, space, page_no, mtr);
 
-	n = fseg_find_last_used_frag_page_slot(inode, mtr);
+	buf_page_free(space, page_no, mtr, __FILE__, __LINE__);
+
+	n = fseg_find_last_used_frag_page_slot(inode);
 
 	if (n == ULINT_UNDEFINED) {
 		/* Freeing completed: free the segment inode */
-		fsp_free_seg_inode(space, inode, mtr);
+		fsp_free_seg_inode(space, inode, iblock, mtr);
 
 		DBUG_RETURN(true);
 	}
@@ -2994,13 +2769,10 @@ fseg_free_step_not_header(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		n;
-	ulint		page;
 	xdes_t*		descr;
 	fseg_inode_t*	inode;
-	ulint		space_id;
-	ulint		page_no;
 
-	space_id = page_get_space_id(page_align(header));
+	const uint32_t space_id = page_get_space_id(page_align(header));
 	ut_ad(mtr->is_named_space(space_id));
 
 	fil_space_t*		space = mtr_x_lock_space(space_id, mtr);
@@ -3016,24 +2788,25 @@ fseg_free_step_not_header(
 
 	if (descr != NULL) {
 		/* Free the extent held by the segment */
-		page = xdes_get_offset(descr);
-		fseg_free_extent(inode, space, page, mtr);
+		fseg_free_extent(inode, iblock, space, xdes_get_offset(descr),
+				 mtr);
 		return false;
 	}
 
 	/* Free a frag page */
 
-	n = fseg_find_last_used_frag_page_slot(inode, mtr);
+	n = fseg_find_last_used_frag_page_slot(inode);
 
 	ut_a(n != ULINT_UNDEFINED);
 
-	page_no = fseg_get_nth_frag_page_no(inode, n, mtr);
+	uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
 
 	if (page_no == page_get_page_no(page_align(header))) {
 		return true;
 	}
 
-	fseg_free_page_low(inode, space, page_no, true, mtr);
+	fseg_free_page_low(inode, iblock, space, page_no, mtr);
+	buf_page_free(space, page_no, mtr, __FILE__, __LINE__);
 	return false;
 }
 
@@ -3058,35 +2831,27 @@ fseg_get_first_extent(
 	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
 
 	if (flst_get_len(inode + FSEG_FULL) > 0) {
-
-		first = flst_get_first(inode + FSEG_FULL, mtr);
-
+		first = flst_get_first(inode + FSEG_FULL);
 	} else if (flst_get_len(inode + FSEG_NOT_FULL) > 0) {
-
-		first = flst_get_first(inode + FSEG_NOT_FULL, mtr);
-
+		first = flst_get_first(inode + FSEG_NOT_FULL);
 	} else if (flst_get_len(inode + FSEG_FREE) > 0) {
-
-		first = flst_get_first(inode + FSEG_FREE, mtr);
+		first = flst_get_first(inode + FSEG_FREE);
 	} else {
 		return(NULL);
 	}
 
-	ut_ad(first.page != FIL_NULL);
+	DBUG_ASSERT(first.page != FIL_NULL);
+
+	buf_block_t *xdes;
 
 	return(first.page == FIL_NULL ? NULL
-	       : xdes_lst_get_descriptor(space, first, mtr));
+	       : xdes_lst_get_descriptor(space, first, &xdes, mtr));
 }
 
 #ifdef UNIV_BTR_PRINT
 /*******************************************************************//**
 Writes info of a segment. */
-static
-void
-fseg_print_low(
-/*===========*/
-	fseg_inode_t*	inode, /*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+static void fseg_print_low(const fseg_inode_t *inode)
 {
 	ulint	space;
 	ulint	n_used;
@@ -3099,15 +2864,14 @@ fseg_print_low(
 	ulint	page_no;
 	ib_id_t	seg_id;
 
-	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
 	space = page_get_space_id(page_align(inode));
 	page_no = page_get_page_no(page_align(inode));
 
-	reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+	reserved = fseg_n_reserved_pages_low(inode, &used);
 
 	seg_id = mach_read_from_8(inode + FSEG_ID);
 	n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED);
-	n_frag = fseg_get_n_frag_pages(inode, mtr);
+	n_frag = fseg_get_n_frag_pages(inode);
 	n_free = flst_get_len(inode + FSEG_FREE);
 	n_not_full = flst_get_len(inode + FSEG_NOT_FULL);
 	n_full = flst_get_len(inode + FSEG_FULL);
@@ -3140,7 +2904,7 @@ fseg_print(
 
 	inode = fseg_inode_get(header, space_id, space->zip_size(), mtr);
 
-	fseg_print_low(inode, mtr);
+	fseg_print_low(inode);
 }
 #endif /* UNIV_BTR_PRINT */
 
diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc
index 1ed4af86367..b0a80efe7c4 100644
--- a/storage/innobase/fsp/fsp0space.cc
+++ b/storage/innobase/fsp/fsp0space.cc
@@ -130,7 +130,7 @@ Tablespace::open_or_create(bool is_temp)
 				fsp_flags = FSP_FLAGS_PAGE_SSIZE();
 			}
 
-			space = fil_space_create(
+			space = fil_space_t::create(
 				m_name, m_space_id, fsp_flags,
 				is_temp
 				? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
index 98b3ae38097..c6f8410f784 100644
--- a/storage/innobase/fsp/fsp0sysspace.cc
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -32,6 +32,7 @@ Refactored 2013-7-26 by Kevin Lewis
 #include "mem0mem.h"
 #include "os0file.h"
 #include "row0mysql.h"
+#include "buf0dblwr.h"
 
 /** The server header file is included to access opt_initialize global variable.
 If server passes the option for create/open DB to SE, we should remove such
@@ -46,7 +47,7 @@ SysTablespace srv_tmp_space;
 
 /** If the last data file is auto-extended, we add this many pages to it
 at a time. We have to make this public because it is a config variable. */
-ulong sys_tablespace_auto_extend_increment;
+uint sys_tablespace_auto_extend_increment;
 
 /** Convert a numeric string that optionally ends in G or M or K,
     to a number containing megabytes.
@@ -274,7 +275,8 @@ SysTablespace::parse_params(
 			}
 		}
 
-		m_files.push_back(Datafile(filepath, flags(), size, order));
+		m_files.push_back(Datafile(filepath, flags(), uint32_t(size),
+					   order));
 		Datafile* datafile = &m_files.back();
 		datafile->make_filepath(path(), filepath, NO_EXT);
 
@@ -350,7 +352,7 @@ SysTablespace::check_size(
 	also the data file could contain an incomplete extent.
 	So we need to round the size downward to a  megabyte.*/
 
-	const ulint	rounded_size_pages = static_cast<ulint>(
+	const uint32_t	rounded_size_pages = static_cast<uint32_t>(
 		size >> srv_page_size_shift);
 
 	/* If last file */
@@ -391,7 +393,7 @@ dberr_t
 SysTablespace::set_size(
 	Datafile&	file)
 {
-	ut_a(!srv_read_only_mode || m_ignore_read_only);
+	ut_ad(!srv_read_only_mode || m_ignore_read_only);
 
 	/* We created the data file and now write it full of zeros */
 	ib::info() << "Setting file '" << file.filepath() << "' size to "
@@ -426,7 +428,7 @@ SysTablespace::create_file(
 	dberr_t	err = DB_SUCCESS;
 
 	ut_a(!file.m_exists);
-	ut_a(!srv_read_only_mode || m_ignore_read_only);
+	ut_ad(!srv_read_only_mode || m_ignore_read_only);
 
 	switch (file.m_type) {
 	case SRV_NEW_RAW:
@@ -558,7 +560,7 @@ SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn)
 	ut_a(it->order() == 0);
 
 	if (srv_operation == SRV_OPERATION_NORMAL) {
-		buf_dblwr_init_or_load_pages(it->handle(), it->filepath());
+		buf_dblwr.init_or_load_pages(it->handle(), it->filepath());
 	}
 
 	/* Check the contents of the first page of the
@@ -908,28 +910,22 @@ SysTablespace::open_or_create(
 
 		if (it != begin) {
 		} else if (is_temp) {
-			ut_ad(!fil_system.temp_space);
 			ut_ad(space_id() == SRV_TMP_SPACE_ID);
-			space = fil_space_create(
+			space = fil_space_t::create(
 				name(), SRV_TMP_SPACE_ID, flags(),
 				FIL_TYPE_TEMPORARY, NULL);
-
-			mutex_enter(&fil_system.mutex);
-			fil_system.temp_space = space;
-			mutex_exit(&fil_system.mutex);
+			ut_ad(space == fil_system.temp_space);
 			if (!space) {
 				return DB_ERROR;
 			}
+			ut_ad(!space->is_compressed());
+			ut_ad(space->full_crc32());
 		} else {
-			ut_ad(!fil_system.sys_space);
 			ut_ad(space_id() == TRX_SYS_SPACE);
-			space = fil_space_create(
+			space = fil_space_t::create(
 				name(), TRX_SYS_SPACE, it->flags(),
 				FIL_TYPE_TABLESPACE, NULL);
-
-			mutex_enter(&fil_system.mutex);
-			fil_system.sys_space = space;
-			mutex_exit(&fil_system.mutex);
+			ut_ad(space == fil_system.sys_space);
 			if (!space) {
 				return DB_ERROR;
 			}
@@ -937,10 +933,10 @@ SysTablespace::open_or_create(
 
 		ut_a(fil_validate());
 
-		ulint	max_size = (++node_counter == m_files.size()
+		uint32_t max_size = (++node_counter == m_files.size()
 				    ? (m_last_file_size_max == 0
-				       ? ULINT_MAX
-				       : m_last_file_size_max)
+				       ? UINT32_MAX
+				       : uint32_t(m_last_file_size_max))
 				    : it->m_size);
 
 		space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size,
@@ -967,30 +963,21 @@ SysTablespace::normalize_size()
 
 /**
 @return next increment size */
-ulint
-SysTablespace::get_increment() const
+uint32_t SysTablespace::get_increment() const
 {
-	ulint	increment;
-
-	if (m_last_file_size_max == 0) {
-		increment = get_autoextend_increment();
-	} else {
-
-		if (!is_valid_size()) {
-			ib::error() << "The last data file in " << name()
-				<< " has a size of " << last_file_size()
-				<< " but the max size allowed is "
-				<< m_last_file_size_max;
-		}
-
-		increment = m_last_file_size_max - last_file_size();
-	}
-
-	if (increment > get_autoextend_increment()) {
-		increment = get_autoextend_increment();
-	}
-
-	return(increment);
+  if (m_last_file_size_max == 0)
+    return get_autoextend_increment();
+
+  if (!is_valid_size())
+  {
+     ib::error() << "The last data file in " << name()
+                 << " has a size of " << last_file_size()
+                 << " but the max size allowed is "
+                 << m_last_file_size_max;
+  }
+
+  return std::min(uint32_t(m_last_file_size_max) - last_file_size(),
+                  get_autoextend_increment());
 }
 
 
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
index 6be4fb0d52b..bb42f7c9f54 100644
--- a/storage/innobase/fts/fts0ast.cc
+++ b/storage/innobase/fts/fts0ast.cc
@@ -24,10 +24,10 @@ Full Text Search parser helper file.
 Created 2007/3/16 Sunny Bains.
 ***********************************************************************/
 
+#include "row0sel.h"
 #include "fts0ast.h"
 #include "fts0pars.h"
 #include "fts0fts.h"
-#include "row0sel.h"
 
 /* The FTS ast visit pass. */
 enum fts_ast_visit_pass_t {
diff --git a/storage/innobase/fts/fts0blex.cc b/storage/innobase/fts/fts0blex.cc
index 2f66e9740aa..6a2b42025a0 100644
--- a/storage/innobase/fts/fts0blex.cc
+++ b/storage/innobase/fts/fts0blex.cc
@@ -478,7 +478,7 @@ struct yy_buffer_state
  */
 #define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
                           ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
-                          : NULL)
+                          : 0)
 /* Same as previous macro, but useful when we know that the buffer stack is not
  * NULL or when we need an lvalue. For internal use only.
  */
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
index 71059f5fd75..d17392eba8f 100644
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@@ -356,7 +356,7 @@ fts_load_default_stopword(
 		new_word.nodes = ib_vector_create(
 			allocator, sizeof(fts_node_t), 4);
 
-		str.f_len = ut_strlen(word);
+		str.f_len = strlen(word);
 		str.f_str = reinterpret_cast<byte*>(word);
 
 		fts_string_dup(&new_word.text, &str, heap);
@@ -834,17 +834,8 @@ fts_drop_index(
 		doc_id_t	current_doc_id;
 		doc_id_t	first_doc_id;
 
-		/* If we are dropping the only FTS index of the table,
-		remove it from optimize thread */
-		fts_optimize_remove_table(table);
-
 		DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
 
-		while (index->index_fts_syncing
-		       && !trx_is_interrupted(trx)) {
-			DICT_BG_YIELD(trx);
-		}
-
 		current_doc_id = table->fts->cache->next_doc_id;
 		first_doc_id = table->fts->cache->first_doc_id;
 		fts_cache_clear(table->fts->cache);
@@ -861,10 +852,6 @@ fts_drop_index(
 		index_cache = fts_find_index_cache(cache, index);
 
 		if (index_cache != NULL) {
-			while (index->index_fts_syncing
-			       && !trx_is_interrupted(trx)) {
-				DICT_BG_YIELD(trx);
-			}
 			if (index_cache->words) {
 				fts_words_free(index_cache->words);
 				rbt_free(index_cache->words);
@@ -1278,7 +1265,7 @@ fts_cache_node_add_positions(
 		} else if (new_size < 48) {
 			new_size = 48;
 		} else {
-			new_size = (ulint)(1.2 * new_size);
+			new_size = new_size * 6 / 5;
 		}
 
 		ilist = static_cast<byte*>(ut_malloc_nokey(new_size));
@@ -1436,7 +1423,7 @@ fts_drop_table(
 
 		dict_table_close(table, TRUE, FALSE);
 
-		/* Pass nonatomic=false (dont allow data dict unlock),
+		/* Pass nonatomic=false (don't allow data dict unlock),
 		because the transaction may hold locks on SYS_* tables from
 		previous calls to fts_drop_table(). */
 		error = row_drop_table_for_mysql(table_name, trx,
@@ -3533,7 +3520,7 @@ fts_add_doc_by_id(
 				mtr_start(&mtr);
 
 				if (i < num_idx - 1) {
-					ut_d(btr_pcur_t::restore_status status=)
+					ut_d(auto status=)
 					  btr_pcur_restore_position(
 					      BTR_SEARCH_LEAF, doc_pcur, &mtr);
 					ut_ad(status == btr_pcur_t::SAME_ALL);
@@ -4140,7 +4127,8 @@ fts_sync_commit(
 			<< ": SYNC time: "
 			<< (time(NULL) - sync->start_time)
 			<< " secs: elapsed "
-			<< (double) n_nodes / elapsed_time
+			<< static_cast<double>(n_nodes)
+			/ static_cast<double>(elapsed_time)
 			<< " ins/sec";
 	}
 
@@ -4274,8 +4262,6 @@ begin_sync:
 
 		DBUG_EXECUTE_IF("fts_instrument_sync_before_syncing",
 				os_thread_sleep(300000););
-		index_cache->index->index_fts_syncing = true;
-
 		error = fts_sync_index(sync, index_cache);
 
 		if (error != DB_SUCCESS) {
@@ -4320,13 +4306,6 @@ end_sync:
 	}
 
 	rw_lock_x_lock(&cache->lock);
-	/* Clear fts syncing flags of any indexes in case sync is
-	interrupted */
-	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
-		static_cast<fts_index_cache_t*>(
-			ib_vector_get(cache->indexes, i))
-			->index->index_fts_syncing = false;
-	}
 
 	sync->interrupted = false;
 	sync->in_progress = false;
@@ -4519,8 +4498,8 @@ fts_get_token_size(
 		int	ctype;
 		int	mbl;
 
-		mbl = cs->cset->ctype(
-			cs, &ctype,
+		mbl = cs->ctype(
+			&ctype,
 			reinterpret_cast<uchar*>(start),
 			reinterpret_cast<uchar*>(end));
 
@@ -5255,7 +5234,9 @@ fts_update_doc_id(
 
 		clust_index = dict_table_get_first_index(table);
 
-		ufield->field_no = dict_col_get_clust_pos(col, clust_index);
+		ufield->field_no = static_cast<unsigned>(
+			dict_col_get_clust_pos(col, clust_index))
+			& dict_index_t::MAX_N_FIELDS;
 		dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
 
 		/* It is possible we update record that has
@@ -5282,7 +5263,7 @@ fts_t::fts_t(
 	added_synced(0), dict_locked(0),
 	add_wq(NULL),
 	cache(NULL),
-	doc_col(ULINT_UNDEFINED), in_queue(false),
+	doc_col(ULINT_UNDEFINED), in_queue(false), sync_message(false),
 	fts_heap(heap)
 {
 	ut_a(table->fts == NULL);
@@ -5924,11 +5905,7 @@ fts_valid_stopword_table(
 
 		return(NULL);
 	} else {
-		const char*     col_name;
-
-		col_name = dict_table_get_col_name(table, 0);
-
-		if (ut_strcmp(col_name, "value")) {
+		if (strcmp(dict_table_get_col_name(table, 0), "value")) {
 			ib::error() << "Invalid column name for stopword"
 				" table " << stopword_table_name << ". Its"
 				" first column must be named as 'value'.";
@@ -6051,7 +6028,7 @@ fts_load_stopword(
 		if (!reload) {
 			str.f_n_char = 0;
 			str.f_str = (byte*) stopword_to_use;
-			str.f_len = ut_strlen(stopword_to_use);
+			str.f_len = strlen(stopword_to_use);
 
 			error = fts_config_set_value(
 				trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
index bb8d9c13d68..348566ae952 100644
--- a/storage/innobase/fts/fts0opt.cc
+++ b/storage/innobase/fts/fts0opt.cc
@@ -47,13 +47,19 @@ constexpr bool wsrep_sst_disable_writes= false;
 
 /** The FTS optimize thread's work queue. */
 ib_wqueue_t* fts_optimize_wq;
+static void fts_optimize_callback(void *);
+static void timer_callback(void*);
+static tpool::timer* timer;
+
+static tpool::task_group task_group(1);
+static tpool::task task(fts_optimize_callback,0, &task_group);
+
+/** FTS optimize thread, for MDL acquisition */
+static THD *fts_opt_thd;
 
 /** The FTS vector to store fts_slot_t */
 static ib_vector_t*  fts_slots;
 
-/** Time to wait for a message. */
-static const ulint FTS_QUEUE_WAIT_IN_USECS = 5000000;
-
 /** Default optimize interval in secs. */
 static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
 
@@ -584,7 +590,7 @@ fts_zip_read_word(
 		/* Finished decompressing block. */
 		if (zip->zp->avail_in == 0) {
 
-			/* Free the block thats been decompressed. */
+			/* Free the block that's been decompressed. */
 			if (zip->pos > 0) {
 				ulint	prev = zip->pos - 1;
 
@@ -2390,7 +2396,7 @@ fts_optimize_table_bk(
 	dict_table_t*	table = slot->table;
 	dberr_t		error;
 
-	if (fil_table_accessible(table)
+	if (table->is_accessible()
 	    && table->fts && table->fts->cache
 	    && table->fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) {
 		error = fts_optimize_table(table);
@@ -2530,6 +2536,22 @@ fts_optimize_create_msg(
 	return(msg);
 }
 
+/** Add a message to the work queue and signal the thread pool. */
+static void add_msg(fts_msg_t *msg, bool wq_locked= false)
+{
+  ib_wqueue_add(fts_optimize_wq, msg, msg->heap, wq_locked);
+  srv_thread_pool->submit_task(&task);
+}
+
+/**
+Called by the "idle" timer. Submits the optimize task, which
+will only call fts_is_sync_needed() in case the queue is empty.
+*/
+static void timer_callback(void*)
+{
+  srv_thread_pool->submit_task(&task);
+}
+
 /** Add the table to add to the OPTIMIZER's list.
 @param[in]	table	table to add */
 void fts_optimize_add_table(dict_table_t* table)
@@ -2540,12 +2562,6 @@ void fts_optimize_add_table(dict_table_t* table)
 		return;
 	}
 
-	/* If there is no fts index present then don't add to
-	optimize queue. */
-	if (!ib_vector_size(table->fts->indexes)) {
-		return;
-	}
-
 	/* Make sure table with FTS index cannot be evicted */
 	dict_table_prevent_eviction(table);
 
@@ -2553,7 +2569,7 @@ void fts_optimize_add_table(dict_table_t* table)
 
 	mutex_enter(&fts_optimize_wq->mutex);
 
-	ib_wqueue_add(fts_optimize_wq, msg, msg->heap, true);
+	add_msg(msg, true);
 
 	table->fts->in_queue = true;
 
@@ -2608,7 +2624,9 @@ fts_optimize_remove_table(
 	remove->event = event;
 	msg->ptr = remove;
 
-	ib_wqueue_add(fts_optimize_wq, msg, msg->heap, true);
+	ut_ad(!mutex_own(&dict_sys.mutex));
+
+	add_msg(msg, true);
 
 	mutex_exit(&fts_optimize_wq->mutex);
 
@@ -2643,11 +2661,20 @@ fts_optimize_request_sync_table(
 		return;
 	}
 
+	mutex_enter(&fts_optimize_wq->mutex);
+
+	if (table->fts->sync_message) {
+		/* If the table already has a SYNC message in the
+		fts_optimize_wq queue, then ignore this request. */
+		mutex_exit(&fts_optimize_wq->mutex);
+		return;
+	}
+
 	fts_msg_t* msg = fts_optimize_create_msg(FTS_MSG_SYNC_TABLE, table);
 
-	mutex_enter(&fts_optimize_wq->mutex);
+	add_msg(msg, true);
 
-	ib_wqueue_add(fts_optimize_wq, msg, msg->heap, true);
+	table->fts->sync_message = true;
 
 	DBUG_EXECUTE_IF("fts_optimize_wq_count_check",
 			DBUG_ASSERT(fts_optimize_wq->length <= 1000););
@@ -2777,55 +2804,72 @@ static bool fts_is_sync_needed()
 }
 
 /** Sync fts cache of a table
-@param[in,out]	table	table to be synced */
-static void fts_optimize_sync_table(dict_table_t* table)
+@param[in,out]  table           table to be synced
+@param[in]      process_message processing messages from fts_optimize_wq */
+static void fts_optimize_sync_table(dict_table_t *table,
+                                    bool process_message= false)
 {
-	if (table->fts && table->fts->cache && fil_table_accessible(table)) {
-		fts_sync_table(table, false);
-	}
-
-	DBUG_EXECUTE_IF("ib_optimize_wq_hang", os_thread_sleep(6000000););
+  MDL_ticket* mdl_ticket= nullptr;
+  dict_table_t *sync_table= dict_acquire_mdl_shared<true>(table, fts_opt_thd,
+                                                          &mdl_ticket);
+
+  if (!sync_table)
+    return;
+
+  if (sync_table->fts && sync_table->fts->cache && sync_table->is_accessible())
+  {
+    fts_sync_table(sync_table, false);
+    if (process_message)
+    {
+      mutex_enter(&fts_optimize_wq->mutex);
+      sync_table->fts->sync_message = false;
+      mutex_exit(&fts_optimize_wq->mutex);
+    }
+  }
+
+  DBUG_EXECUTE_IF("ib_optimize_wq_hang", os_thread_sleep(6000000););
+
+  if (mdl_ticket)
+    dict_table_close(sync_table, false, false, fts_opt_thd, mdl_ticket);
 }
 
 /**********************************************************************//**
 Optimize all FTS tables.
 @return Dummy return */
-static
-os_thread_ret_t
-DECLARE_THREAD(fts_optimize_thread)(
-/*================*/
-	void*		arg)			/*!< in: work queue*/
+static void fts_optimize_callback(void *)
 {
-	ulint		current = 0;
-	ibool		done = FALSE;
-	ulint		n_tables = 0;
-	ulint		n_optimize = 0;
-	ib_wqueue_t*	wq = (ib_wqueue_t*) arg;
-
 	ut_ad(!srv_read_only_mode);
-	my_thread_init();
 
-	ut_ad(fts_slots);
+	if (!fts_optimize_wq) {
+		/* Possibly timer initiated callback, can come after FTS_MSG_STOP.*/
+		return;
+	}
 
-	/* Assign number of tables added in fts_slots_t to n_tables */
-	n_tables = ib_vector_size(fts_slots);
+	static ulint		current = 0;
+	static ibool		done = FALSE;
+	static ulint		n_tables = ib_vector_size(fts_slots);
+	static ulint		n_optimize = 0;
 
 	while (!done && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
 		/* If there is no message in the queue and we have tables
 		to optimize then optimize the tables. */
 
 		if (!done
-		    && ib_wqueue_is_empty(wq)
+		    && ib_wqueue_is_empty(fts_optimize_wq)
 		    && n_tables > 0
 		    && n_optimize > 0) {
 
 			/* The queue is empty but we have tables
 			to optimize. */
-			while (UNIV_UNLIKELY(wsrep_sst_disable_writes)
-			       && srv_shutdown_state
-			       <= SRV_SHUTDOWN_INITIATED) {
-				os_thread_sleep(1000000);
-				continue;
+			if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
+retry_later:
+				if (fts_is_sync_needed()) {
+					fts_need_sync = true;
+				}
+				if (n_tables) {
+					timer->set_time(5000, 0);
+				}
+				return;
 			}
 
 			fts_slot_t* slot = static_cast<fts_slot_t*>(
@@ -2842,20 +2886,13 @@ DECLARE_THREAD(fts_optimize_thread)(
 				n_optimize = fts_optimize_how_many();
 				current = 0;
 			}
-
-		} else if (n_optimize == 0 || !ib_wqueue_is_empty(wq)) {
-			fts_msg_t*	msg;
-
-			msg = static_cast<fts_msg_t*>(
-				ib_wqueue_timedwait(wq, FTS_QUEUE_WAIT_IN_USECS));
-
+		} else if (n_optimize == 0
+			   || !ib_wqueue_is_empty(fts_optimize_wq)) {
+			fts_msg_t* msg = static_cast<fts_msg_t*>
+				(ib_wqueue_nowait(fts_optimize_wq));
 			/* Timeout ? */
-			if (msg == NULL) {
-				if (fts_is_sync_needed()) {
-					fts_need_sync = true;
-				}
-
-				continue;
+			if (!msg) {
+				goto retry_later;
 			}
 
 			switch (msg->type) {
@@ -2887,10 +2924,8 @@ DECLARE_THREAD(fts_optimize_thread)(
 
 			case FTS_MSG_SYNC_TABLE:
 				if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
-					ib_wqueue_add(wq, msg, msg->heap,
-						      false);
-					os_thread_sleep(1000000);
-					goto next;
+					add_msg(msg);
+					goto retry_later;
 				}
 
 				DBUG_EXECUTE_IF(
@@ -2898,7 +2933,8 @@ DECLARE_THREAD(fts_optimize_thread)(
 					os_thread_sleep(300000););
 
 				fts_optimize_sync_table(
-					static_cast<dict_table_t*>(msg->ptr));
+					static_cast<dict_table_t*>(msg->ptr),
+					true);
 				break;
 
 			default:
@@ -2906,7 +2942,6 @@ DECLARE_THREAD(fts_optimize_thread)(
 			}
 
 			mem_heap_free(msg->heap);
-next:
 			n_optimize = done ? 0 : fts_optimize_how_many();
 		}
 	}
@@ -2927,16 +2962,13 @@ next:
 	ib_vector_free(fts_slots);
 	fts_slots = NULL;
 
+	ib_wqueue_free(fts_optimize_wq);
+	fts_optimize_wq = NULL;
+
+	innobase_destroy_background_thd(fts_opt_thd);
 	ib::info() << "FTS optimize thread exiting.";
 
 	os_event_set(fts_opt_shutdown_event);
-	my_thread_end();
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
 }
 
 /**********************************************************************//**
@@ -2955,12 +2987,14 @@ fts_optimize_init(void)
 
 	/* Create FTS optimize work queue */
 	fts_optimize_wq = ib_wqueue_create();
+	timer = srv_thread_pool->create_timer(timer_callback);
 
 	/* Create FTS vector to store fts_slot_t */
 	heap = mem_heap_create(sizeof(dict_table_t*) * 64);
 	heap_alloc = ib_heap_allocator_create(heap);
 	fts_slots = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
 
+	fts_opt_thd = innobase_create_background_thd("InnoDB FTS optimizer");
 	/* Add fts tables to fts_slots which could be skipped
 	during dict_load_table_one() because fts_optimize_thread
 	wasn't even started. */
@@ -2983,8 +3017,6 @@ fts_optimize_init(void)
 
 	fts_opt_shutdown_event = os_event_create(0);
 	last_check_sync_time = time(NULL);
-
-	os_thread_create(fts_optimize_thread, fts_optimize_wq, NULL);
 }
 
 /** Shutdown fts optimize thread. */
@@ -3008,15 +3040,38 @@ fts_optimize_shutdown()
 	/* We tell the OPTIMIZE thread to switch to state done, we
 	can't delete the work queue here because the add thread needs
 	deregister the FTS tables. */
+	timer->disarm();
+	task_group.cancel_pending(&task);
 
 	msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
 
-	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+	add_msg(msg);
 
 	os_event_wait(fts_opt_shutdown_event);
 
 	os_event_destroy(fts_opt_shutdown_event);
+	fts_opt_thd = NULL;
+	delete timer;
+	timer = NULL;
+}
 
-	ib_wqueue_free(fts_optimize_wq);
-	fts_optimize_wq = NULL;
+/** Sync the table during commit phase
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table)
+{
+  if (!fts_optimize_wq)
+    return;
+  mutex_enter(&fts_optimize_wq->mutex);
+  if (!table->fts->sync_message)
+  {
+    mutex_exit(&fts_optimize_wq->mutex);
+    return;
+  }
+
+  mutex_exit(&fts_optimize_wq->mutex);
+  fts_sync_table(table, false);
+
+  mutex_enter(&fts_optimize_wq->mutex);
+  table->fts->sync_message = false;
+  mutex_exit(&fts_optimize_wq->mutex);
 }
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 7ab7301ce8c..2964748d330 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -602,14 +602,15 @@ fts_ranking_words_add(
 
 		ranking->words = static_cast<byte*>(
 			mem_heap_zalloc(query->heap, words_len));
-		ut_memcpy(ranking->words, words, ranking->words_len);
+		memcpy(ranking->words, words, ranking->words_len);
 		ranking->words_len = words_len;
 	}
 
 	/* Set ranking words */
 	ut_ad(byte_offset < ranking->words_len);
 	bit_offset = pos % CHAR_BIT;
-	ranking->words[byte_offset] |= 1 << bit_offset;
+	ranking->words[byte_offset] = static_cast<byte>(
+		ranking->words[byte_offset] | 1 << bit_offset);
 }
 
 /*******************************************************************//**
@@ -3510,8 +3511,9 @@ fts_query_calculate_idf(
 				word_freq->idf = log10(1.0001);
 			} else {
 				word_freq->idf = log10(
-					total_docs
-					/ (double) word_freq->doc_count);
+					static_cast<double>(total_docs)
+					/ static_cast<double>(
+						word_freq->doc_count));
 			}
 		}
 	}
diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc
index 09ec2d08324..a4234f7b376 100644
--- a/storage/innobase/fts/fts0sql.cc
+++ b/storage/innobase/fts/fts0sql.cc
@@ -256,32 +256,3 @@ fts_get_select_columns_str(
 
 	return(str);
 }
-
-/******************************************************************//**
-Commit a transaction.
-@return DB_SUCCESS or error code */
-dberr_t
-fts_sql_commit(
-/*===========*/
-	trx_t*		trx)		/*!< in: transaction */
-{
-	dberr_t	error;
-
-	error = trx_commit_for_mysql(trx);
-
-	/* Commit should always succeed */
-	ut_a(error == DB_SUCCESS);
-
-	return(DB_SUCCESS);
-}
-
-/******************************************************************//**
-Rollback a transaction.
-@return DB_SUCCESS or error code */
-dberr_t
-fts_sql_rollback(
-/*=============*/
-	trx_t*		trx)		/*!< in: transaction */
-{
-	return(trx_rollback_to_savepoint(trx, NULL));
-}
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
index e9a4c3b8636..e084f0b7935 100644
--- a/storage/innobase/fut/fut0lst.cc
+++ b/storage/innobase/fut/fut0lst.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,434 +28,365 @@ Created 11/28/1995 Heikki Tuuri
 #include "buf0buf.h"
 #include "page0page.h"
 
-/********************************************************************//**
-Adds a node to an empty list. */
-static
-void
-flst_add_to_empty(
-/*==============*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of
-					empty list */
-	flst_node_t*		node,	/*!< in: node to add */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+
+/** Write a file address.
+@param[in]      block   file page
+@param[in,out]  faddr   file address location
+@param[in]      page    page number
+@param[in]      boffset byte offset
+@param[in,out]  mtr     mini-transaction */
+static void flst_write_addr(const buf_block_t& block, byte *faddr,
+                            uint32_t page, uint16_t boffset, mtr_t* mtr)
 {
-	ulint		space;
-	fil_addr_t	node_addr;
-
-	ut_ad(mtr && base && node);
-	ut_ad(base != node);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_a(!flst_get_len(base));
-
-	buf_ptr_get_fsp_addr(node, &space, &node_addr);
-
-	/* Update first and last fields of base node */
-	flst_write_addr(base + FLST_FIRST, node_addr, mtr);
-	flst_write_addr(base + FLST_LAST, node_addr, mtr);
-
-	/* Set prev and next fields of node to add */
-	flst_zero_addr(node + FLST_PREV, mtr);
-	flst_zero_addr(node + FLST_NEXT, mtr);
-
-	/* Update len of base node */
-	mlog_write_ulint(base + FLST_LEN, 1, MLOG_4BYTES, mtr);
+  ut_ad(mtr->memo_contains_page_flagged(faddr,
+					MTR_MEMO_PAGE_X_FIX
+					| MTR_MEMO_PAGE_SX_FIX));
+  ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA);
+  ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+
+  static_assert(FIL_ADDR_PAGE == 0, "compatibility");
+  static_assert(FIL_ADDR_BYTE == 4, "compatibility");
+  static_assert(FIL_ADDR_SIZE == 6, "compatibility");
+
+  const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page;
+  const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset;
+  if (same_page)
+  {
+    if (!same_offset)
+      mtr->write<2>(block, faddr + FIL_ADDR_BYTE, boffset);
+    return;
+  }
+  if (same_offset)
+    mtr->write<4>(block, faddr + FIL_ADDR_PAGE, page);
+  else
+  {
+    alignas(4) byte fil_addr[6];
+    mach_write_to_4(fil_addr + FIL_ADDR_PAGE, page);
+    mach_write_to_2(fil_addr + FIL_ADDR_BYTE, boffset);
+    mtr->memcpy(block, faddr + FIL_ADDR_PAGE, fil_addr, 6);
+  }
 }
 
-/********************************************************************//**
-Inserts a node after another in a list. */
-static
-void
-flst_insert_after(
-/*==============*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node1,	/*!< in: node to insert after */
-	flst_node_t*		node2,	/*!< in: node to add */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Inserts a node before another in a list. */
-static
-void
-flst_insert_before(
-/*===============*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node2,	/*!< in: node to insert */
-	flst_node_t*		node3,	/*!< in: node to insert before */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-
-/********************************************************************//**
-Adds a node as the last node in a list. */
-void
-flst_add_last(
-/*==========*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node,	/*!< in: node to add */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+/** Write 2 null file addresses.
+@param[in]      b       file page
+@param[in,out]  addr	file address to be zeroed out
+@param[in,out]  mtr     mini-transaction */
+static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr)
 {
-	ulint		space;
-	fil_addr_t	node_addr;
-	ulint		len;
-	fil_addr_t	last_addr;
-
-	ut_ad(mtr && base && node);
-	ut_ad(base != node);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	len = flst_get_len(base);
-	last_addr = flst_get_last(base, mtr);
-
-	buf_ptr_get_fsp_addr(node, &space, &node_addr);
-
-	/* If the list is not empty, call flst_insert_after */
-	if (len != 0) {
-		flst_node_t*	last_node;
-
-		if (last_addr.page == node_addr.page) {
-			last_node = page_align(node) + last_addr.boffset;
-		} else {
-			fil_space_t* s = fil_space_acquire_silent(space);
-			ulint zip_size = s ? s->zip_size() : 0;
-			if (s) s->release();
-
-			last_node = fut_get_ptr(space, zip_size, last_addr,
-						RW_SX_LATCH, mtr);
-		}
-
-		flst_insert_after(base, last_node, node, mtr);
-	} else {
-		/* else call flst_add_to_empty */
-		flst_add_to_empty(base, node, mtr);
-	}
+  if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL)
+    mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff);
+  mtr->write<2,mtr_t::MAYBE_NOP>(b, addr + FIL_ADDR_BYTE, 0U);
+  /* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
+  which is 4 bytes, or less than FIL_ADDR_SIZE. */
+  memcpy(addr + FIL_ADDR_SIZE, addr, FIL_ADDR_SIZE);
+  const uint16_t boffset= page_offset(addr);
+  mtr->memmove(b, boffset + FIL_ADDR_SIZE, boffset, FIL_ADDR_SIZE);
 }
 
-/********************************************************************//**
-Adds a node as the first node in a list. */
-void
-flst_add_first(
-/*===========*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node,	/*!< in: node to add */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+/** Add a node to an empty list. */
+static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
+                              buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
 {
-	ulint		space;
-	fil_addr_t	node_addr;
-	ulint		len;
-	fil_addr_t	first_addr;
-	flst_node_t*	first_node;
-
-	ut_ad(mtr && base && node);
-	ut_ad(base != node);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	len = flst_get_len(base);
-	first_addr = flst_get_first(base, mtr);
-
-	buf_ptr_get_fsp_addr(node, &space, &node_addr);
-
-	/* If the list is not empty, call flst_insert_before */
-	if (len != 0) {
-		if (first_addr.page == node_addr.page) {
-			first_node = page_align(node) + first_addr.boffset;
-		} else {
-			fil_space_t* s = fil_space_acquire_silent(space);
-			ulint zip_size = s ? s->zip_size() : 0;
-			if (s) s->release();
-
-			first_node = fut_get_ptr(space, zip_size, first_addr,
-						 RW_SX_LATCH, mtr);
-		}
-
-		flst_insert_before(base, node, first_node, mtr);
-	} else {
-		/* else call flst_add_to_empty */
-		flst_add_to_empty(base, node, mtr);
-	}
+  ut_ad(base != add || boffset != aoffset);
+  ut_ad(boffset < base->physical_size());
+  ut_ad(aoffset < add->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN));
+  mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U);
+  /* Update first and last fields of base node */
+  flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+                  add->page.id().page_no(), aoffset, mtr);
+  memcpy(base->frame + boffset + FLST_LAST, base->frame + boffset + FLST_FIRST,
+         FIL_ADDR_SIZE);
+  /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
+  which is 4 bytes, or less than FIL_ADDR_SIZE. */
+  mtr->memmove(*base, boffset + FLST_LAST, boffset + FLST_FIRST,
+               FIL_ADDR_SIZE);
+
+  /* Set prev and next fields of node to add */
+  static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility");
+  flst_zero_both(*add, add->frame + aoffset + FLST_PREV, mtr);
 }
 
-/********************************************************************//**
-Inserts a node after another in a list. */
-static
-void
-flst_insert_after(
-/*==============*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node1,	/*!< in: node to insert after */
-	flst_node_t*		node2,	/*!< in: node to add */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+/** Insert a node after another one.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  cur     insert position block
+@param[in]      coffset byte offset of the insert position
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the block to be added
+@param[in,out]  mtr     mini-transaction */
+static void flst_insert_after(buf_block_t *base, uint16_t boffset,
+                              buf_block_t *cur, uint16_t coffset,
+                              buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
 {
-	ulint		space;
-	fil_addr_t	node1_addr;
-	fil_addr_t	node2_addr;
-	flst_node_t*	node3;
-	fil_addr_t	node3_addr;
-	ulint		len;
-
-	ut_ad(mtr && node1 && node2 && base);
-	ut_ad(base != node1);
-	ut_ad(base != node2);
-	ut_ad(node2 != node1);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node1,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node2,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-
-	buf_ptr_get_fsp_addr(node1, &space, &node1_addr);
-	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
-
-	node3_addr = flst_get_next_addr(node1, mtr);
-
-	/* Set prev and next fields of node2 */
-	flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
-	flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
-
-	if (!fil_addr_is_null(node3_addr)) {
-		/* Update prev field of node3 */
-		fil_space_t* s = fil_space_acquire_silent(space);
-		ulint zip_size = s ? s->zip_size() : 0;
-		if (s) s->release();
-
-		node3 = fut_get_ptr(space, zip_size,
-				    node3_addr, RW_SX_LATCH, mtr);
-		flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
-	} else {
-		/* node1 was last in list: update last field in base */
-		flst_write_addr(base + FLST_LAST, node2_addr, mtr);
-	}
-
-	/* Set next field of node1 */
-	flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
-
-	/* Update len of base node */
-	len = flst_get_len(base);
-	mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+  ut_ad(base != cur || boffset != coffset);
+  ut_ad(base != add || boffset != aoffset);
+  ut_ad(cur != add || coffset != aoffset);
+  ut_ad(boffset < base->physical_size());
+  ut_ad(coffset < cur->physical_size());
+  ut_ad(aoffset < add->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
+
+  flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+                  cur->page.id().page_no(), coffset, mtr);
+  flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+                  next_addr.page, next_addr.boffset, mtr);
+
+  if (next_addr.page == FIL_NULL)
+    flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+                    add->page.id().page_no(), aoffset, mtr);
+  else
+  {
+    buf_block_t *block;
+    flst_node_t *next= fut_get_ptr(add->page.id().space(), add->zip_size(),
+                                   next_addr, RW_SX_LATCH, mtr, &block);
+    flst_write_addr(*block, next + FLST_PREV,
+                    add->page.id().page_no(), aoffset, mtr);
+  }
+
+  flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT,
+                  add->page.id().page_no(), aoffset, mtr);
+
+  byte *len= &base->frame[boffset + FLST_LEN];
+  mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
 }
 
-/********************************************************************//**
-Inserts a node before another in a list. */
-static
-void
-flst_insert_before(
-/*===============*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node2,	/*!< in: node to insert */
-	flst_node_t*		node3,	/*!< in: node to insert before */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+/** Insert a node before another one.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  cur     insert position block
+@param[in]      coffset byte offset of the insert position
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the block to be added
+@param[in,out]  mtr     mini-transaction */
+static void flst_insert_before(buf_block_t *base, uint16_t boffset,
+                               buf_block_t *cur, uint16_t coffset,
+                               buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
 {
-	ulint		space;
-	flst_node_t*	node1;
-	fil_addr_t	node1_addr;
-	fil_addr_t	node2_addr;
-	fil_addr_t	node3_addr;
-	ulint		len;
-
-	ut_ad(mtr && node2 && node3 && base);
-	ut_ad(base != node2);
-	ut_ad(base != node3);
-	ut_ad(node2 != node3);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node2,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node3,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-
-	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
-	buf_ptr_get_fsp_addr(node3, &space, &node3_addr);
-
-	node1_addr = flst_get_prev_addr(node3, mtr);
-
-	/* Set prev and next fields of node2 */
-	flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
-	flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
-
-	if (!fil_addr_is_null(node1_addr)) {
-		fil_space_t* s = fil_space_acquire_silent(space);
-		ulint zip_size = s ? s->zip_size() : 0;
-		if (s) s->release();
-
-		/* Update next field of node1 */
-		node1 = fut_get_ptr(space, zip_size, node1_addr,
-				    RW_SX_LATCH, mtr);
-		flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
-	} else {
-		/* node3 was first in list: update first field in base */
-		flst_write_addr(base + FLST_FIRST, node2_addr, mtr);
-	}
-
-	/* Set prev field of node3 */
-	flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
-
-	/* Update len of base node */
-	len = flst_get_len(base);
-	mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+  ut_ad(base != cur || boffset != coffset);
+  ut_ad(base != add || boffset != aoffset);
+  ut_ad(cur != add || coffset != aoffset);
+  ut_ad(boffset < base->physical_size());
+  ut_ad(coffset < cur->physical_size());
+  ut_ad(aoffset < add->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
+
+  flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+                  prev_addr.page, prev_addr.boffset, mtr);
+  flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+		  cur->page.id().page_no(), coffset, mtr);
+
+  if (prev_addr.page == FIL_NULL)
+    flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+                    add->page.id().page_no(), aoffset, mtr);
+  else
+  {
+    buf_block_t *block;
+    flst_node_t *prev= fut_get_ptr(add->page.id().space(), add->zip_size(),
+                                   prev_addr, RW_SX_LATCH, mtr, &block);
+    flst_write_addr(*block, prev + FLST_NEXT,
+                    add->page.id().page_no(), aoffset, mtr);
+  }
+
+  flst_write_addr(*cur, cur->frame + coffset + FLST_PREV,
+                    add->page.id().page_no(), aoffset, mtr);
+
+  byte *len= &base->frame[boffset + FLST_LEN];
+  mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
 }
 
-/********************************************************************//**
-Removes a node. */
-void
-flst_remove(
-/*========*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node2,	/*!< in: node to remove */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+/** Initialize a list base node.
+@param[in]      block   file page
+@param[in,out]  base    base node
+@param[in,out]  mtr     mini-transaction */
+void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
 {
-	ulint		space;
-	flst_node_t*	node1;
-	fil_addr_t	node1_addr;
-	fil_addr_t	node2_addr;
-	flst_node_t*	node3;
-	fil_addr_t	node3_addr;
-	ulint		len;
-
-	ut_ad(mtr && node2 && base);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, node2,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-
-	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
-
-	fil_space_t* s = fil_space_acquire_silent(space);
-	ulint zip_size = s ? s->zip_size() : 0;
-	if (s) s->release();
-
-	node1_addr = flst_get_prev_addr(node2, mtr);
-	node3_addr = flst_get_next_addr(node2, mtr);
-
-	if (!fil_addr_is_null(node1_addr)) {
-
-		/* Update next field of node1 */
-
-		if (node1_addr.page == node2_addr.page) {
-
-			node1 = page_align(node2) + node1_addr.boffset;
-		} else {
-			node1 = fut_get_ptr(space, zip_size,
-					    node1_addr, RW_SX_LATCH, mtr);
-		}
-
-		ut_ad(node1 != node2);
-
-		flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr);
-	} else {
-		/* node2 was first in list: update first field in base */
-		flst_write_addr(base + FLST_FIRST, node3_addr, mtr);
-	}
-
-	if (!fil_addr_is_null(node3_addr)) {
-		/* Update prev field of node3 */
-
-		if (node3_addr.page == node2_addr.page) {
-
-			node3 = page_align(node2) + node3_addr.boffset;
-		} else {
-			node3 = fut_get_ptr(space, zip_size,
-					    node3_addr, RW_SX_LATCH, mtr);
-		}
-
-		ut_ad(node2 != node3);
-
-		flst_write_addr(node3 + FLST_PREV, node1_addr, mtr);
-	} else {
-		/* node2 was last in list: update last field in base */
-		flst_write_addr(base + FLST_LAST, node1_addr, mtr);
-	}
-
-	/* Update len of base node */
-	len = flst_get_len(base);
-	ut_ad(len > 0);
-
-	mlog_write_ulint(base + FLST_LEN, len - 1, MLOG_4BYTES, mtr);
+  ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                        MTR_MEMO_PAGE_SX_FIX));
+  mtr->write<4,mtr_t::MAYBE_NOP>(block, base + FLST_LEN, 0U);
+  static_assert(FLST_LAST == FLST_FIRST + FIL_ADDR_SIZE, "compatibility");
+  flst_zero_both(block, base + FLST_FIRST, mtr);
 }
 
-/********************************************************************//**
-Validates a file-based list.
-@return TRUE if ok */
-ibool
-flst_validate(
-/*==========*/
-	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	mtr_t*			mtr1)	/*!< in: mtr */
+/** Append a file list node to a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the node to be added
+@param[in,out]  mtr     mini-transaction */
+void flst_add_last(buf_block_t *base, uint16_t boffset,
+                   buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
 {
-	ulint			space;
-	const flst_node_t*	node;
-	fil_addr_t		node_addr;
-	fil_addr_t		base_addr;
-	ulint			len;
-	ulint			i;
-	mtr_t			mtr2;
-
-	ut_ad(base);
-	ut_ad(mtr_memo_contains_page_flagged(mtr1, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-
-	/* We use two mini-transaction handles: the first is used to
-	lock the base node, and prevent other threads from modifying the
-	list. The second is used to traverse the list. We cannot run the
-	second mtr without committing it at times, because if the list
-	is long, then the x-locked pages could fill the buffer resulting
-	in a deadlock. */
-
-	/* Find out the space id */
-	buf_ptr_get_fsp_addr(base, &space, &base_addr);
-
-	fil_space_t* s = fil_space_acquire_silent(space);
-	ulint zip_size = s ? s->zip_size() : 0;
-	if (s) s->release();
-
-	len = flst_get_len(base);
-	node_addr = flst_get_first(base, mtr1);
-
-	for (i = 0; i < len; i++) {
-		mtr_start(&mtr2);
-
-		node = fut_get_ptr(space, zip_size,
-				   node_addr, RW_SX_LATCH, &mtr2);
-		node_addr = flst_get_next_addr(node, &mtr2);
-
-		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer
-				   becoming full */
-	}
-
-	ut_a(fil_addr_is_null(node_addr));
-
-	node_addr = flst_get_last(base, mtr1);
-
-	for (i = 0; i < len; i++) {
-		mtr_start(&mtr2);
-
-		node = fut_get_ptr(space, zip_size,
-				   node_addr, RW_SX_LATCH, &mtr2);
-		node_addr = flst_get_prev_addr(node, &mtr2);
+  ut_ad(base != add || boffset != aoffset);
+  ut_ad(boffset < base->physical_size());
+  ut_ad(aoffset < add->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  if (!flst_get_len(base->frame + boffset))
+    flst_add_to_empty(base, boffset, add, aoffset, mtr);
+  else
+  {
+    fil_addr_t addr= flst_get_last(base->frame + boffset);
+    buf_block_t *cur= add;
+    const flst_node_t *c= addr.page == add->page.id().page_no()
+      ? add->frame + addr.boffset
+      : fut_get_ptr(add->page.id().space(), add->zip_size(), addr,
+                    RW_SX_LATCH, mtr, &cur);
+    flst_insert_after(base, boffset, cur,
+                      static_cast<uint16_t>(c - cur->frame),
+                      add, aoffset, mtr);
+  }
+}
 
-		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer
-				   becoming full */
-	}
+/** Prepend a file list node to a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the node to be added
+@param[in,out]  mtr     mini-transaction */
+void flst_add_first(buf_block_t *base, uint16_t boffset,
+                    buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+{
+  ut_ad(base != add || boffset != aoffset);
+  ut_ad(boffset < base->physical_size());
+  ut_ad(aoffset < add->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  if (!flst_get_len(base->frame + boffset))
+    flst_add_to_empty(base, boffset, add, aoffset, mtr);
+  else
+  {
+    fil_addr_t addr= flst_get_first(base->frame + boffset);
+    buf_block_t *cur= add;
+    const flst_node_t *c= addr.page == add->page.id().page_no()
+      ? add->frame + addr.boffset
+      : fut_get_ptr(add->page.id().space(), add->zip_size(), addr,
+                    RW_SX_LATCH, mtr, &cur);
+    flst_insert_before(base, boffset, cur,
+                       static_cast<uint16_t>(c - cur->frame),
+                       add, aoffset, mtr);
+  }
+}
 
-	ut_a(fil_addr_is_null(node_addr));
+/** Remove a file list node.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  cur     block containing the node to be removed
+@param[in]      coffset byte offset of the node to be removed
+@param[in,out]  mtr     mini-transaction */
+void flst_remove(buf_block_t *base, uint16_t boffset,
+                 buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+{
+  ut_ad(boffset < base->physical_size());
+  ut_ad(coffset < cur->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  const fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
+  const fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
+
+  if (prev_addr.page == FIL_NULL)
+    flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+                    next_addr.page, next_addr.boffset, mtr);
+  else
+  {
+    buf_block_t *block= cur;
+    flst_node_t *prev= prev_addr.page == cur->page.id().page_no()
+      ? cur->frame + prev_addr.boffset
+      : fut_get_ptr(cur->page.id().space(), cur->zip_size(), prev_addr,
+                    RW_SX_LATCH, mtr, &block);
+    flst_write_addr(*block, prev + FLST_NEXT,
+                    next_addr.page, next_addr.boffset, mtr);
+  }
+
+  if (next_addr.page == FIL_NULL)
+    flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+                    prev_addr.page, prev_addr.boffset, mtr);
+  else
+  {
+    buf_block_t *block= cur;
+    flst_node_t *next= next_addr.page == cur->page.id().page_no()
+      ? cur->frame + next_addr.boffset
+      : fut_get_ptr(cur->page.id().space(), cur->zip_size(), next_addr,
+                    RW_SX_LATCH, mtr, &block);
+    flst_write_addr(*block, next + FLST_PREV,
+                    prev_addr.page, prev_addr.boffset, mtr);
+  }
+
+  byte *len= &base->frame[boffset + FLST_LEN];
+  ut_ad(mach_read_from_4(len) > 0);
+  mtr->write<4>(*base, len, mach_read_from_4(len) - 1);
+}
 
-	return(TRUE);
+#ifdef UNIV_DEBUG
+/** Validate a file-based list. */
+void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr)
+{
+  ut_ad(boffset < base->physical_size());
+  ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  /* We use two mini-transaction handles: the first is used to lock
+  the base node, and prevent other threads from modifying the list.
+  The second is used to traverse the list. We cannot run the second
+  mtr without committing it at times, because if the list is long,
+  the x-locked pages could fill the buffer, resulting in a deadlock. */
+  mtr_t mtr2;
+
+  const uint32_t len= flst_get_len(base->frame + boffset);
+  fil_addr_t addr= flst_get_first(base->frame + boffset);
+
+  for (uint32_t i= len; i--; )
+  {
+    mtr2.start();
+    const flst_node_t *node= fut_get_ptr(base->page.id().space(),
+                                         base->zip_size(), addr,
+                                         RW_SX_LATCH, &mtr2);
+    addr= flst_get_next_addr(node);
+    mtr2.commit();
+  }
+
+  ut_ad(addr.page == FIL_NULL);
+
+  addr= flst_get_last(base->frame + boffset);
+
+  for (uint32_t i= len; i--; )
+  {
+    mtr2.start();
+    const flst_node_t *node= fut_get_ptr(base->page.id().space(),
+                                         base->zip_size(), addr,
+                                         RW_SX_LATCH, &mtr2);
+    addr= flst_get_prev_addr(node);
+    mtr2.commit();
+  }
+
+  ut_ad(addr.page == FIL_NULL);
 }
+#endif
diff --git a/storage/innobase/gis/gis0geo.cc b/storage/innobase/gis/gis0geo.cc
index dad40d19da7..4c3ff1881d0 100644
--- a/storage/innobase/gis/gis0geo.cc
+++ b/storage/innobase/gis/gis0geo.cc
@@ -580,193 +580,71 @@ split_rtree_node(
 	return(first_rec_group);
 }
 
-/*************************************************************//**
-Compares two keys a and b depending on nextflag
-nextflag can contain these flags:
+/** Compare two minimum bounding rectangles.
+@param mode   comparison operator
    MBR_INTERSECT(a,b)  a overlaps b
    MBR_CONTAIN(a,b)    a contains b
    MBR_DISJOINT(a,b)   a disjoint b
    MBR_WITHIN(a,b)     a within   b
    MBR_EQUAL(a,b)      All coordinates of MBRs are equal
-Return 0 on success, otherwise 1. */
-int
-rtree_key_cmp(
-/*==========*/
-	page_cur_mode_t	mode,	/*!< in: compare method. */
-	const uchar*	b,	/*!< in: first key. */
-	int,
-	const uchar*	a,	/*!< in: second key. */
-	int		a_len)	/*!< in: second key len. */
+   MBR_DATA(a,b)       Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a)
 {
-	double		amin, amax, bmin, bmax;
-	int		key_len;
-	int		keyseg_len;
-
-	keyseg_len = 2 * sizeof(double);
-	for (key_len = a_len; key_len > 0; key_len -= keyseg_len) {
-		amin = mach_double_read(a);
-		bmin = mach_double_read(b);
-		amax = mach_double_read(a + sizeof(double));
-		bmax = mach_double_read(b + sizeof(double));
-
-		switch (mode) {
-		case PAGE_CUR_INTERSECT:
-			if (INTERSECT_CMP(amin, amax, bmin, bmax)) {
-				return(1);
-			}
-			break;
-		case PAGE_CUR_CONTAIN:
-			if (CONTAIN_CMP(amin, amax, bmin, bmax)) {
-				return(1);
-			}
-			break;
-		case PAGE_CUR_WITHIN:
-			if (WITHIN_CMP(amin, amax, bmin, bmax)) {
-				return(1);
-			}
-			break;
-		case PAGE_CUR_MBR_EQUAL:
-			if (EQUAL_CMP(amin, amax, bmin, bmax)) {
-				return(1);
-			}
-			break;
-		case PAGE_CUR_DISJOINT:
-			int result;
-
-			result = DISJOINT_CMP(amin, amax, bmin, bmax);
-			if (result == 0) {
-				return(0);
-			}
-
-			if (key_len - keyseg_len <= 0) {
-				return(1);
-			}
-
-			break;
-		default:
-			/* if unknown comparison operator */
-			ut_ad(0);
-		}
-
-		a += keyseg_len;
-		b += keyseg_len;
-	}
-
-	return(0);
-}
-
-/*************************************************************//**
-Calculates MBR_AREA(a+b) - MBR_AREA(a)
-Note: when 'a' and 'b' objects are far from each other,
-the area increase can be really big, so this function
-can return 'inf' as a result.
-Return the area increaed. */
-double
-rtree_area_increase(
-	const uchar*	a,		/*!< in: original mbr. */
-	const uchar*	b,		/*!< in: new mbr. */
-	int		mbr_len,	/*!< in: mbr length of a and b. */
-	double*		ab_area)	/*!< out: increased area. */
-{
-	double		a_area = 1.0;
-	double		loc_ab_area = 1.0;
-	double		amin, amax, bmin, bmax;
-	int		key_len;
-	int		keyseg_len;
-	double		data_round = 1.0;
-
-	keyseg_len = 2 * sizeof(double);
-
-	for (key_len = mbr_len; key_len > 0; key_len -= keyseg_len) {
-		double	area;
-
-		amin = mach_double_read(a);
-		bmin = mach_double_read(b);
-		amax = mach_double_read(a + sizeof(double));
-		bmax = mach_double_read(b + sizeof(double));
-
-		area = amax - amin;
-		if (area == 0) {
-			a_area *= LINE_MBR_WEIGHTS;
-		} else {
-			a_area *= area;
-		}
-
-		area = (double)std::max(amax, bmax) -
-		       (double)std::min(amin, bmin);
-		if (area == 0) {
-			loc_ab_area *= LINE_MBR_WEIGHTS;
-		} else {
-			loc_ab_area *= area;
-		}
-
-		/* Value of amax or bmin can be so large that small difference
-		are ignored. For example: 3.2884281489988079e+284 - 100 =
-		3.2884281489988079e+284. This results some area difference
-		are not detected */
-		if (loc_ab_area == a_area) {
-			if (bmin < amin || bmax > amax) {
-				data_round *= ((double)std::max(amax, bmax)
-					       - amax
-					       + (amin - (double)std::min(
-								amin, bmin)));
-			} else {
-				data_round *= area;
-			}
-		}
-
-		a += keyseg_len;
-		b += keyseg_len;
-	}
-
-	*ab_area = loc_ab_area;
-
-	if (loc_ab_area == a_area && data_round != 1.0) {
-		return(data_round);
-	}
-
-	return(loc_ab_area - a_area);
-}
-
-/** Calculates overlapping area
-@param[in]	a	mbr a
-@param[in]	b	mbr b
-@param[in]	mbr_len	mbr length
-@return overlapping area */
-double
-rtree_area_overlapping(
-	const uchar*	a,
-	const uchar*	b,
-	int		mbr_len)
-{
-	double	area = 1.0;
-	double	amin;
-	double	amax;
-	double	bmin;
-	double	bmax;
-	int	key_len;
-	int	keyseg_len;
-
-	keyseg_len = 2 * sizeof(double);
-
-	for (key_len = mbr_len; key_len > 0; key_len -= keyseg_len) {
-		amin = mach_double_read(a);
-		bmin = mach_double_read(b);
-		amax = mach_double_read(a + sizeof(double));
-		bmax = mach_double_read(b + sizeof(double));
-
-		amin = std::max(amin, bmin);
-		amax = std::min(amax, bmax);
-
-		if (amin > amax) {
-			return(0);
-		} else {
-			area *= (amax - amin);
-		}
-
-		a += keyseg_len;
-		b += keyseg_len;
-	}
-
-	return(area);
+  const byte *b_= static_cast<const byte*>(b);
+  const byte *a_= static_cast<const byte*>(a);
+
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  for (auto i = SPDIMS; i--; )
+  {
+    double amin= mach_double_read(a_);
+    double bmin= mach_double_read(b_);
+    a_+= sizeof(double);
+    b_+= sizeof(double);
+    double amax= mach_double_read(a_);
+    double bmax= mach_double_read(b_);
+    a_+= sizeof(double);
+    b_+= sizeof(double);
+
+    switch (mode) {
+    case PAGE_CUR_INTERSECT:
+      if (INTERSECT_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_CONTAIN:
+      if (CONTAIN_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_WITHIN:
+      if (WITHIN_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_MBR_EQUAL:
+      if (EQUAL_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_DISJOINT:
+      if (!DISJOINT_CMP(amin, amax, bmin, bmax))
+        return 0;
+      if (!i)
+        return 1;
+      continue;
+    case PAGE_CUR_UNSUPP:
+    case PAGE_CUR_G:
+    case PAGE_CUR_GE:
+    case PAGE_CUR_L:
+    case PAGE_CUR_LE:
+    case PAGE_CUR_RTREE_LOCATE:
+    case PAGE_CUR_RTREE_GET_FATHER:
+    case PAGE_CUR_RTREE_INSERT:
+      break;
+    }
+    ut_ad("unknown comparison operator" == 0);
+  }
+
+  return 0;
 }
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
index 77dd6f7ae27..66fa3670ebd 100644
--- a/storage/innobase/gis/gis0rtree.cc
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -185,82 +185,6 @@ rtr_index_build_node_ptr(
 }
 
 /**************************************************************//**
-In-place update the mbr field of a spatial index row.
-@return true if update is successful */
-static
-bool
-rtr_update_mbr_field_in_place(
-/*==========================*/
-	dict_index_t*	index,		/*!< in: spatial index. */
-	rec_t*		rec,		/*!< in/out: rec to be modified.*/
-	rec_offs*	offsets,	/*!< in/out: offsets on rec. */
-	rtr_mbr_t*	mbr,		/*!< in: the new mbr. */
-	mtr_t*		mtr)		/*!< in: mtr */
-{
-	void*		new_mbr_ptr;
-	double		new_mbr[SPDIMS * 2];
-	byte*		log_ptr;
-	page_t*		page = page_align(rec);
-	ulint		len = DATA_MBR_LEN;
-	ulint		flags = BTR_NO_UNDO_LOG_FLAG
-			| BTR_NO_LOCKING_FLAG
-			| BTR_KEEP_SYS_FLAG;
-	ulint		rec_info;
-
-	rtr_write_mbr(reinterpret_cast<byte*>(&new_mbr), mbr);
-	new_mbr_ptr = static_cast<void*>(new_mbr);
-	/* Otherwise, set the mbr to the new_mbr. */
-	rec_set_nth_field(rec, offsets, 0, new_mbr_ptr, len);
-
-	rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets));
-
-	/* Write redo log. */
-	/* For now, we use LOG_REC_UPDATE_IN_PLACE to log this enlarge.
-	In the future, we may need to add a new log type for this. */
-	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
-					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
-					    : MLOG_REC_UPDATE_IN_PLACE,
-					    1 + DATA_ROLL_PTR_LEN + 14 + 2
-					    + MLOG_BUF_MARGIN);
-
-	if (!log_ptr) {
-		/* Logging in mtr is switched off during
-		crash recovery */
-		return(false);
-	}
-
-	/* Flags */
-	mach_write_to_1(log_ptr, flags);
-	log_ptr++;
-	/* TRX_ID Position */
-	log_ptr += mach_write_compressed(log_ptr, 0);
-	/* ROLL_PTR */
-	trx_write_roll_ptr(log_ptr, 0);
-	log_ptr += DATA_ROLL_PTR_LEN;
-	/* TRX_ID */
-	log_ptr += mach_u64_write_compressed(log_ptr, 0);
-
-	/* Offset */
-	mach_write_to_2(log_ptr, page_offset(rec));
-	log_ptr += 2;
-	/* Info bits */
-	mach_write_to_1(log_ptr, rec_info);
-	log_ptr++;
-	/* N fields */
-	log_ptr += mach_write_compressed(log_ptr, 1);
-	/* Field no, len */
-	log_ptr += mach_write_compressed(log_ptr, 0);
-	log_ptr += mach_write_compressed(log_ptr, len);
-	/* Data */
-	memcpy(log_ptr, new_mbr_ptr, len);
-	log_ptr += len;
-
-	mlog_close(mtr, log_ptr);
-
-	return(true);
-}
-
-/**************************************************************//**
 Update the mbr field of a spatial index row.
 @return true if update is successful */
 bool
@@ -281,7 +205,7 @@ rtr_update_mbr_field(
 	mem_heap_t*	heap;
 	page_t*		page;
 	rec_t*		rec;
-	ulint		flags = BTR_NO_UNDO_LOG_FLAG
+	constexpr ulint flags = BTR_NO_UNDO_LOG_FLAG
 			| BTR_NO_LOCKING_FLAG
 			| BTR_KEEP_SYS_FLAG;
 	dberr_t		err;
@@ -292,7 +216,6 @@ rtr_update_mbr_field(
 	ulint		low_match = 0;
 	ulint		child;
 	ulint		rec_info;
-	page_zip_des_t*	page_zip;
 	bool		ins_suc = true;
 	ulint		cur2_pos = 0;
 	ulint		del_page_no = 0;
@@ -306,7 +229,6 @@ rtr_update_mbr_field(
 	heap = mem_heap_create(100);
 	block = btr_cur_get_block(cursor);
 	ut_ad(page == buf_block_get_frame(block));
-	page_zip = buf_block_get_page_zip(block);
 
 	child = btr_node_ptr_get_child_page_no(rec, offsets);
 	const ulint n_core = page_is_leaf(block->frame)
@@ -332,11 +254,16 @@ rtr_update_mbr_field(
 		cur2_pos = page_rec_get_n_recs_before(btr_cur_get_rec(cursor2));
 	}
 
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(rec_offs_base(offsets)[0 + 1] == DATA_MBR_LEN);
+	ut_ad(node_ptr->fields[0].len == DATA_MBR_LEN);
+
 	if (rec_info & REC_INFO_MIN_REC_FLAG) {
 		/* When the rec is minimal rec in this level, we do
-		 in-place update for avoiding it move to other place. */
+		in-place update for avoiding it move to other place. */
+		page_zip_des_t* page_zip = buf_block_get_page_zip(block);
 
-		if (page_zip) {
+		if (UNIV_LIKELY_NULL(page_zip)) {
 			/* Check if there's enough space for in-place
 			update the zip page. */
 			if (!btr_cur_update_alloc_zip(
@@ -372,21 +299,18 @@ rtr_update_mbr_field(
 					rec, rec_offs_comp(offsets));
 			ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
 #endif /* UNIV_DEBUG */
-		}
-
-		if (!rtr_update_mbr_field_in_place(index, rec,
-						   offsets, mbr, mtr)) {
-			return(false);
-		}
-
-		if (page_zip) {
-			page_zip_write_rec(page_zip, rec, index, offsets, 0);
+			memcpy(rec, node_ptr->fields[0].data, DATA_MBR_LEN);
+			page_zip_write_rec(block, rec, index, offsets, 0, mtr);
+		} else {
+			mtr->memcpy<mtr_t::MAYBE_NOP>(*block, rec,
+						      node_ptr->fields[0].data,
+						      DATA_MBR_LEN);
 		}
 
 		if (cursor2) {
 			rec_offs* offsets2;
 
-			if (page_zip) {
+			if (UNIV_LIKELY_NULL(page_zip)) {
 				cursor2->page_cur.rec
 					= page_rec_get_nth(page, cur2_pos);
 			}
@@ -448,7 +372,7 @@ update_mbr:
 		if (!ins_suc) {
 			ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
 
-			btr_set_min_rec_mark(next_rec, mtr);
+			btr_set_min_rec_mark(next_rec, *block, mtr);
 		}
 
 		/* If there's more than 1 rec left in the page, delete
@@ -470,7 +394,7 @@ update_mbr:
 				mark the new leftmost node pointer as
 				the predefined minimum record */
 				rec_t*	next_rec = page_rec_get_next(cur2_rec);
-				btr_set_min_rec_mark(next_rec, mtr);
+				btr_set_min_rec_mark(next_rec, *block, mtr);
 			}
 
 			ut_ad(del_page_no
@@ -574,7 +498,7 @@ update_mbr:
 				mark the new leftmost node pointer as
 				the predefined minimum record */
 				rec_t*	next_rec = page_rec_get_next(cur2_rec);
-				btr_set_min_rec_mark(next_rec, mtr);
+				btr_set_min_rec_mark(next_rec, *block, mtr);
 			}
 
 			ut_ad(cur2_pno == del_page_no && cur2_rec != insert_rec);
@@ -630,12 +554,8 @@ rtr_adjust_upper_level(
 	rtr_mbr_t*	new_mbr,	/*!< in: MBR on the new page */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	page_t*		page;
-	page_t*		new_page;
 	ulint		page_no;
 	ulint		new_page_no;
-	page_zip_des_t*	page_zip;
-	page_zip_des_t*	new_page_zip;
 	dict_index_t*	index = sea_cur->index;
 	btr_cur_t	cursor;
 	rec_offs*	offsets;
@@ -659,13 +579,9 @@ rtr_adjust_upper_level(
 	level = btr_page_get_level(buf_block_get_frame(block));
 	ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
 
-	page = buf_block_get_frame(block);
-	page_no = block->page.id.page_no();
-	page_zip = buf_block_get_page_zip(block);
+	page_no = block->page.id().page_no();
 
-	new_page = buf_block_get_frame(new_block);
-	new_page_no = new_block->page.id.page_no();
-	new_page_zip = buf_block_get_page_zip(new_block);
+	new_page_no = new_block->page.id().page_no();
 
 	/* Set new mbr for the old page on the upper level. */
 	/* Look up the index for the node pointer to page */
@@ -674,7 +590,8 @@ rtr_adjust_upper_level(
 
 	page_cursor = btr_cur_get_page_cur(&cursor);
 
-	rtr_update_mbr_field(&cursor, offsets, NULL, page, mbr, NULL, mtr);
+	rtr_update_mbr_field(&cursor, offsets, NULL, block->frame, mbr, NULL,
+			     mtr);
 
 	/* Already updated parent MBR, reset in our path */
 	if (sea_cur->rtr_info) {
@@ -688,7 +605,7 @@ rtr_adjust_upper_level(
 	/* Insert the node for the new page. */
 	node_ptr_upper = rtr_index_build_node_ptr(
 		index, new_mbr,
-		page_rec_get_next(page_get_infimum_rec(new_page)),
+		page_rec_get_next(page_get_infimum_rec(new_block->frame)),
 		new_page_no, heap);
 
 	ulint	up_match = 0;
@@ -737,37 +654,31 @@ rtr_adjust_upper_level(
 	new_prdt.op = 0;
 
 	lock_prdt_update_parent(block, new_block, &prdt, &new_prdt,
-				index->table->space_id,
-				page_cursor->block->page.id.page_no());
+				page_cursor->block->page.id());
 
 	mem_heap_free(heap);
 
 	ut_ad(block->zip_size() == index->table->space->zip_size());
 
-	const uint32_t next_page_no = btr_page_get_next(page);
+	const uint32_t next_page_no = btr_page_get_next(block->frame);
 
 	if (next_page_no != FIL_NULL) {
-		page_id_t	next_page_id(block->page.id.space(),
-					     next_page_no);
-
 		buf_block_t*	next_block = btr_block_get(
-			next_page_id, block->zip_size(), RW_X_LATCH,
-			index, mtr);
+			*index, next_page_no, RW_X_LATCH, false, mtr);
 #ifdef UNIV_BTR_DEBUG
-		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+		ut_a(page_is_comp(next_block->frame)
+		     == page_is_comp(block->frame));
 		ut_a(btr_page_get_prev(next_block->frame)
-		     == block->page.id.page_no());
+		     == block->page.id().page_no());
 #endif /* UNIV_BTR_DEBUG */
 
-		btr_page_set_prev(buf_block_get_frame(next_block),
-				  buf_block_get_page_zip(next_block),
-				  new_page_no, mtr);
+		btr_page_set_prev(next_block, new_page_no, mtr);
 	}
 
-	btr_page_set_next(page, page_zip, new_page_no, mtr);
+	btr_page_set_next(block, new_page_no, mtr);
 
-	btr_page_set_prev(new_page, new_page_zip, page_no, mtr);
-	btr_page_set_next(new_page, new_page_zip, next_page_no, mtr);
+	btr_page_set_prev(new_block, page_no, mtr);
+	btr_page_set_next(new_block, next_page_no, mtr);
 }
 
 /*************************************************************//**
@@ -855,11 +766,8 @@ rtr_split_page_move_rec_list(
 			ut_ad(!n_core || cur_split_node->key != first_rec);
 
 			rec = page_cur_insert_rec_low(
-					page_cur_get_rec(&new_page_cursor),
-					index,
-					cur_split_node->key,
-					offsets,
-					mtr);
+				&new_page_cursor,
+				index, cur_split_node->key, offsets, mtr);
 
 			ut_a(rec);
 
@@ -896,7 +804,7 @@ rtr_split_page_move_rec_list(
 	if (new_page_zip) {
 		mtr_set_log_mode(mtr, log_mode);
 
-		if (!page_zip_compress(new_page_zip, new_page, index,
+		if (!page_zip_compress(new_block, index,
 				       page_zip_level, mtr)) {
 			ulint	ret_pos;
 
@@ -910,7 +818,8 @@ rtr_split_page_move_rec_list(
 			ret_pos == 0. */
 
 			if (UNIV_UNLIKELY
-			    (!page_zip_reorganize(new_block, index, mtr))) {
+			    (!page_zip_reorganize(new_block, index,
+						  page_zip_level, mtr))) {
 
 				if (UNIV_UNLIKELY
 				    (!page_zip_decompress(new_page_zip,
@@ -974,8 +883,6 @@ rtr_page_split_and_insert(
 	buf_block_t*		block;
 	page_t*			page;
 	page_t*			new_page;
-	ulint			page_no;
-	ulint			hint_page_no;
 	buf_block_t*		new_block;
 	page_zip_des_t*		page_zip;
 	page_zip_des_t*		new_page_zip;
@@ -989,7 +896,6 @@ rtr_page_split_and_insert(
 	rtr_split_node_t*	cur_split_node;
 	rtr_split_node_t*	end_split_node;
 	double*			buf_pos;
-	ulint			page_level;
 	node_seq_t		current_ssn;
 	node_seq_t		next_ssn;
 	buf_block_t*		root_block;
@@ -1009,8 +915,8 @@ func_start:
 	mem_heap_empty(*heap);
 	*offsets = NULL;
 
-	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(cursor->index),
-					MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
 	ut_ad(!dict_index_is_online_ddl(cursor->index)
 	      || (flags & BTR_CREATE_FLAG)
 	      || dict_index_is_clust(cursor->index));
@@ -1020,13 +926,12 @@ func_start:
 	block = btr_cur_get_block(cursor);
 	page = buf_block_get_frame(block);
 	page_zip = buf_block_get_page_zip(block);
-	page_level = btr_page_get_level(page);
 	current_ssn = page_get_ssn_id(page);
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(page_get_n_recs(page) >= 1);
 
-	page_no = block->page.id.page_no();
+	const page_id_t page_id(block->page.id());
 
 	if (!page_has_prev(page) && !page_is_leaf(page)) {
 		first_rec = page_rec_get_next(
@@ -1064,10 +969,19 @@ func_start:
 					   static_cast<uchar*>(first_rec));
 
 	/* Allocate a new page to the index */
-	hint_page_no = page_no + 1;
-	new_block = btr_page_alloc(cursor->index, hint_page_no, FSP_UP,
-				   page_level, mtr, mtr);
+	const uint16_t page_level = btr_page_get_level(page);
+	new_block = btr_page_alloc(cursor->index, page_id.page_no() + 1,
+				   FSP_UP, page_level, mtr, mtr);
+	if (!new_block) {
+		return NULL;
+	}
+
 	new_page_zip = buf_block_get_page_zip(new_block);
+	if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+		/* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+		to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+		memset_aligned<4>(new_block->frame + FIL_PAGE_PREV, 0, 4);
+	}
 	btr_page_create(new_block, new_page_zip, cursor->index,
 			page_level, mtr);
 
@@ -1104,7 +1018,7 @@ func_start:
 		as appropriate.  Deleting will always succeed. */
 		ut_a(new_page_zip);
 
-		page_zip_copy_recs(new_page_zip, new_page,
+		page_zip_copy_recs(new_block,
 				   page_zip, page, cursor->index, mtr);
 
 		page_cursor = btr_cur_get_page_cur(cursor);
@@ -1220,7 +1134,7 @@ func_start:
 	For compressed pages, page_cur_tuple_insert() will have
 	attempted this already. */
 	if (rec == NULL) {
-		if (!page_cur_get_page_zip(page_cursor)
+		if (!is_page_cur_get_page_zip(page_cursor)
 		    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
 			rec = page_cur_tuple_insert(page_cursor, tuple,
 						    cursor->index, offsets,
@@ -1243,8 +1157,7 @@ after_insert:
 
 	/* Check any predicate locks need to be moved/copied to the
 	new page */
-	lock_prdt_update_split(new_block, &prdt, &new_prdt,
-			       cursor->index->table->space_id, page_no);
+	lock_prdt_update_split(new_block, &prdt, &new_prdt, page_id);
 
 	/* Adjust the upper level. */
 	rtr_adjust_upper_level(cursor, flags, block, new_block,
@@ -1454,8 +1367,8 @@ rtr_page_copy_rec_list_end_no_locks(
 					/* We have two identical leaf records,
 					skip copying the undeleted one, and
 					unmark deleted on the current page */
-					btr_rec_set_deleted_flag(
-						cur_rec, NULL, FALSE);
+					btr_rec_set_deleted<false>(
+						new_block, cur_rec, mtr);
 					goto next;
 				}
 			}
@@ -1472,12 +1385,12 @@ rtr_page_copy_rec_list_end_no_locks(
 		offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
 					   ULINT_UNDEFINED, &heap);
 
-		ins_rec = page_cur_insert_rec_low(cur_rec, index,
+		ins_rec = page_cur_insert_rec_low(&page_cur, index,
 						  cur1_rec, offsets1, mtr);
 		if (UNIV_UNLIKELY(!ins_rec)) {
-			fprintf(stderr, "page number %ld and %ld\n",
-				(long)new_block->page.id.page_no(),
-				(long)block->page.id.page_no());
+			fprintf(stderr, "page number %u and %u\n",
+				new_block->page.id().page_no(),
+				block->page.id().page_no());
 
 			ib::fatal() << "rec offset " << page_offset(rec)
 				<< ", cur1 offset "
@@ -1575,8 +1488,8 @@ rtr_page_copy_rec_list_start_no_locks(
 					/* We have two identical leaf records,
 					skip copying the undeleted one, and
 					unmark deleted on the current page */
-					btr_rec_set_deleted_flag(
-						cur_rec, NULL, FALSE);
+					btr_rec_set_deleted<false>(
+						new_block, cur_rec, mtr);
 					goto next;
 				}
 			}
@@ -1593,14 +1506,11 @@ rtr_page_copy_rec_list_start_no_locks(
 		offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
 					   ULINT_UNDEFINED, &heap);
 
-		ins_rec = page_cur_insert_rec_low(cur_rec, index,
+		ins_rec = page_cur_insert_rec_low(&page_cur, index,
 						  cur1_rec, offsets1, mtr);
 		if (UNIV_UNLIKELY(!ins_rec)) {
-			fprintf(stderr, "page number %ld and %ld\n",
-				(long)new_block->page.id.page_no(),
-				(long)block->page.id.page_no());
-
-			ib::fatal() << "rec offset " << page_offset(rec)
+			ib::fatal() << new_block->page.id()
+				<< "rec offset " << page_offset(rec)
 				<< ", cur1 offset "
 				<<  page_offset(page_cur_get_rec(&cur1))
 				<< ", cur_rec offset "
@@ -1737,7 +1647,7 @@ rtr_check_same_block(
 	mem_heap_t*	heap)	/*!< in: memory heap */
 
 {
-	ulint		page_no = childb->page.id.page_no();
+	ulint		page_no = childb->page.id().page_no();
 	rec_offs*	offsets;
 	rec_t*		rec = page_rec_get_next(page_get_infimum_rec(
 				buf_block_get_frame(parentb)));
@@ -1757,6 +1667,113 @@ rtr_check_same_block(
 	return(false);
 }
 
+/*************************************************************//**
+Calculates MBR_AREA(a+b) - MBR_AREA(a), where a+b covers both MBRs.
+Note: when 'a' and 'b' objects are far from each other,
+the area increase can be really big, so this function
+can return 'inf' as a result.
+@return the area increase, or data_round if rounding hid the increase */
+static double
+rtree_area_increase(
+	const uchar*	a,		/*!< in: original mbr. */
+	const uchar*	b,		/*!< in: new mbr. */
+	double*		ab_area)	/*!< out: area of a+b. */
+{
+	double		a_area = 1.0;
+	double		loc_ab_area = 1.0;
+	double		amin, amax, bmin, bmax;
+	double		data_round = 1.0;
+
+	static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+		      "compatibility");
+
+	for (auto i = SPDIMS; i--; ) { /* one (min, max) pair per dimension */
+		double	area;
+
+		amin = mach_double_read(a);
+		bmin = mach_double_read(b);
+		amax = mach_double_read(a + sizeof(double));
+		bmax = mach_double_read(b + sizeof(double));
+
+		a += 2 * sizeof(double);
+		b += 2 * sizeof(double);
+
+		area = amax - amin;
+		if (area == 0) { /* zero extent: use a fixed weight instead */
+			a_area *= LINE_MBR_WEIGHTS;
+		} else {
+			a_area *= area;
+		}
+
+		area = (double)std::max(amax, bmax) -
+		       (double)std::min(amin, bmin);
+		if (area == 0) { /* zero extent: use a fixed weight instead */
+			loc_ab_area *= LINE_MBR_WEIGHTS;
+		} else {
+			loc_ab_area *= area;
+		}
+
+		/* A coordinate can be so large that a small difference is
+		lost to rounding. For example: 3.2884281489988079e+284 - 100 =
+		3.2884281489988079e+284. As a result, some area differences
+		would go undetected, so accumulate a substitute in data_round. */
+		if (loc_ab_area == a_area) {
+			if (bmin < amin || bmax > amax) {
+				data_round *= ((double)std::max(amax, bmax)
+					       - amax
+					       + (amin - (double)std::min(
+								amin, bmin)));
+			} else {
+				data_round *= area;
+			}
+		}
+	}
+
+	*ab_area = loc_ab_area;
+
+	if (loc_ab_area == a_area && data_round != 1.0) {
+		return(data_round);
+	}
+
+	return(loc_ab_area - a_area);
+}
+
+/** Calculates the area of the intersection of two MBRs.
+@param[in]	a	mbr a, read as SPDIMS pairs of (min, max) doubles
+@param[in]	b	mbr b, read in the same layout as a
+@return overlapping area; 0 if the MBRs do not overlap in some dimension */
+static double rtree_area_overlapping(const byte *a, const byte *b)
+{
+	double	area = 1.0;
+	double	amin;
+	double	amax;
+	double	bmin;
+	double	bmax;
+
+	static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+		      "compatibility");
+
+	for (auto i = SPDIMS; i--; ) {
+		amin = mach_double_read(a);
+		bmin = mach_double_read(b);
+		amax = mach_double_read(a + sizeof(double));
+		bmax = mach_double_read(b + sizeof(double));
+		a += 2 * sizeof(double);
+		b += 2 * sizeof(double);
+
+		amin = std::max(amin, bmin); /* lower bound of the overlap */
+		amax = std::min(amax, bmax); /* upper bound of the overlap */
+
+		if (amin > amax) { /* no overlap in this dimension */
+			return(0);
+		} else {
+			area *= (amax - amin);
+		}
+	}
+
+	return(area);
+}
+
 /****************************************************************//**
 Calculate the area increased for a new record
 @return area increased */
@@ -1769,28 +1786,20 @@ rtr_rec_cal_increase(
 				dtuple in some of the common fields, or which
 				has an equal number or more fields than
 				dtuple */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
 	double*		area)	/*!< out: increased area */
 {
 	const dfield_t*	dtuple_field;
-	ulint		dtuple_f_len;
-	ulint		rec_f_len;
-	const byte*	rec_b_ptr;
-	double		ret = 0;
 
 	ut_ad(!page_rec_is_supremum(rec));
 	ut_ad(!page_rec_is_infimum(rec));
 
 	dtuple_field = dtuple_get_nth_field(dtuple, 0);
-	dtuple_f_len = dfield_get_len(dtuple_field);
+	ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
 
-	rec_b_ptr = rec_get_nth_field(rec, offsets, 0, &rec_f_len);
-	ret = rtree_area_increase(
-		rec_b_ptr,
-		static_cast<const byte*>(dfield_get_data(dtuple_field)),
-		static_cast<int>(dtuple_f_len), area);
-
-	return(ret);
+	return rtree_area_increase(rec,
+				   static_cast<const byte*>(
+					   dfield_get_data(dtuple_field)),
+				   area);
 }
 
 /** Estimates the number of rows in a given area.
@@ -1846,16 +1855,17 @@ rtr_estimate_n_rows_in_range(
 	index->set_modified(mtr);
 	mtr_s_lock_index(index, &mtr);
 
-	buf_block_t* block = btr_block_get(
-		page_id_t(index->table->space_id, index->page),
-		index->table->space->zip_size(),
-		RW_S_LATCH, index, &mtr);
+	buf_block_t* block = btr_root_block_get(index, RW_S_LATCH, &mtr);
+	if (!block) {
+err_exit:
+		mtr.commit();
+		return HA_POS_ERROR;
+	}
 	const page_t* page = buf_block_get_frame(block);
 	const unsigned n_recs = page_header_get_field(page, PAGE_N_RECS);
 
 	if (n_recs == 0) {
-		mtr.commit();
-		return(HA_POS_ERROR);
+		goto err_exit;
 	}
 
 	/* Scan records in root page and calculate area. */
@@ -1883,10 +1893,9 @@ rtr_estimate_n_rows_in_range(
 
 			case PAGE_CUR_WITHIN:
 			case PAGE_CUR_MBR_EQUAL:
-				if (rtree_key_cmp(
+				if (!rtree_key_cmp(
 					    PAGE_CUR_WITHIN, range_mbr_ptr,
-					    DATA_MBR_LEN, rec, DATA_MBR_LEN)
-				    == 0) {
+					    rec)) {
 					area += 1;
 				}
 
@@ -1900,14 +1909,14 @@ rtr_estimate_n_rows_in_range(
 			case PAGE_CUR_CONTAIN:
 			case PAGE_CUR_INTERSECT:
 				area += rtree_area_overlapping(
-					range_mbr_ptr, rec, DATA_MBR_LEN)
+					range_mbr_ptr, rec)
 					/ rec_area;
 				break;
 
 			case PAGE_CUR_DISJOINT:
 				area += 1;
 				area -= rtree_area_overlapping(
-					range_mbr_ptr, rec, DATA_MBR_LEN)
+					range_mbr_ptr, rec)
 					/ rec_area;
 				break;
 
@@ -1915,7 +1924,7 @@ rtr_estimate_n_rows_in_range(
 			case PAGE_CUR_MBR_EQUAL:
 				if (!rtree_key_cmp(
 					    PAGE_CUR_WITHIN, range_mbr_ptr,
-					    DATA_MBR_LEN, rec, DATA_MBR_LEN)) {
+					    rec)) {
 					area += range_area / rec_area;
 				}
 
@@ -1933,5 +1942,6 @@ rtr_estimate_n_rows_in_range(
 	}
 
 	area /= n_recs;
-	return ha_rows(dict_table_get_n_rows(index->table) * area);
+	return ha_rows(static_cast<double>(dict_table_get_n_rows(index->table))
+		       * area);
 }
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
index 18f75e3d139..1c22aab4d00 100644
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@@ -139,10 +139,10 @@ rtr_pcur_getnext_from_path(
 		      || latch_mode & BTR_MODIFY_LEAF);
 		mtr_s_lock_index(index, mtr);
 	} else {
-		ut_ad(mtr_memo_contains_flagged(mtr, &index->lock,
-						MTR_MEMO_SX_LOCK
-						| MTR_MEMO_S_LOCK
-						| MTR_MEMO_X_LOCK));
+		ut_ad(mtr->memo_contains_flagged(&index->lock,
+						 MTR_MEMO_SX_LOCK
+						 | MTR_MEMO_S_LOCK
+						 | MTR_MEMO_X_LOCK));
 	}
 
 	const ulint zip_size = index->table->space->zip_size();
@@ -298,8 +298,9 @@ rtr_pcur_getnext_from_path(
 			    && mode != PAGE_CUR_RTREE_LOCATE) {
 				ut_ad(rtr_info->thr);
 				lock_place_prdt_page_lock(
-					index->table->space_id,
-					next_page_no, index,
+					page_id_t(block->page.id().space(),
+						  next_page_no),
+					index,
 					rtr_info->thr);
 			}
 			new_split = true;
@@ -422,9 +423,7 @@ rtr_pcur_getnext_from_path(
 
 					btr_cur_latch_leaves(
 						block,
-						page_id_t(index->table->space_id,
-							  block->page.id.page_no()),
-						zip_size, BTR_MODIFY_TREE,
+						BTR_MODIFY_TREE,
 						btr_cur, mtr);
 				}
 
@@ -600,15 +599,14 @@ rtr_pcur_open_low(
 	n_fields = dtuple_get_n_fields(tuple);
 
 	if (latch_mode & BTR_ALREADY_S_LATCHED) {
-		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
-				  MTR_MEMO_S_LOCK));
+		ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
 		tree_latched = true;
 	}
 
 	if (latch_mode & BTR_MODIFY_TREE) {
-		ut_ad(mtr_memo_contains_flagged(mtr, &index->lock,
-						MTR_MEMO_X_LOCK
-						| MTR_MEMO_SX_LOCK));
+		ut_ad(mtr->memo_contains_flagged(&index->lock,
+						 MTR_MEMO_X_LOCK
+						 | MTR_MEMO_SX_LOCK));
 		tree_latched = true;
 	}
 
@@ -675,7 +673,7 @@ rtr_page_get_father(
 	ulint	page_no = btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
 							 offsets);
 
-	ut_ad(page_no == block->page.id.page_no());
+	ut_ad(page_no == block->page.id().page_no());
 #else
 	rtr_page_get_father_block(
 		NULL, heap, index, block, mtr, sea_cur, cursor);
@@ -708,11 +706,8 @@ static void rtr_get_father_node(
 	/* Try to optimally locate the parent node. Level should always
 	less than sea_cur->tree_height unless the root is splitting */
 	if (sea_cur && sea_cur->tree_height > level) {
-
-		ut_ad(mtr_memo_contains_flagged(mtr,
-						dict_index_get_lock(index),
-						MTR_MEMO_X_LOCK
-						| MTR_MEMO_SX_LOCK));
+		ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+						 | MTR_MEMO_SX_LOCK));
 		ret = rtr_cur_restore_position(
 			BTR_CONT_MODIFY_TREE, sea_cur, level, mtr);
 
@@ -821,12 +816,12 @@ rtr_page_get_father_node_ptr(
 	dict_index_t*	index;
 	rtr_mbr_t	mbr;
 
-	page_no = btr_cur_get_block(cursor)->page.id.page_no();
+	page_no = btr_cur_get_block(cursor)->page.id().page_no();
 	index = btr_cur_get_index(cursor);
 
 	ut_ad(srv_read_only_mode
-	      || mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
-					   MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					    | MTR_MEMO_SX_LOCK));
 
 	ut_ad(dict_index_get_page(index) != page_no);
 
@@ -1201,7 +1196,7 @@ rtr_check_discard_page(
 				the root page */
 	buf_block_t*	block)	/*!< in: block of page to be discarded */
 {
-	const ulint pageno = block->page.id.page_no();
+	const ulint pageno = block->page.id().page_no();
 
 	mutex_enter(&index->rtr_track->rtr_active_mutex);
 
@@ -1222,7 +1217,7 @@ rtr_check_discard_page(
 		if (rtr_info->matches) {
 			mutex_enter(&rtr_info->matches->rtr_match_mutex);
 
-			if ((&rtr_info->matches->block)->page.id.page_no()
+			if ((&rtr_info->matches->block)->page.id().page_no()
 			     == pageno) {
 				if (!rtr_info->matches->matched_recs->empty()) {
 					rtr_info->matches->matched_recs->clear();
@@ -1238,8 +1233,8 @@ rtr_check_discard_page(
 	mutex_exit(&index->rtr_track->rtr_active_mutex);
 
 	lock_mutex_enter();
-	lock_prdt_page_free_from_discard(block, lock_sys.prdt_hash);
-	lock_prdt_page_free_from_discard(block, lock_sys.prdt_page_hash);
+	lock_prdt_page_free_from_discard(block, &lock_sys.prdt_hash);
+	lock_prdt_page_free_from_discard(block, &lock_sys.prdt_page_hash);
 	lock_mutex_exit();
 }
 
@@ -1346,8 +1341,8 @@ rtr_cur_restore_position(
 	page_cur_t*	page_cursor;
 	node_visit_t*	node = rtr_get_parent_node(btr_cur, level, false);
 	node_seq_t	path_ssn = node->seq_no;
-	const ulint	zip_size = index->table->space->zip_size();
-	ulint		page_no = node->page_no;
+	const unsigned	zip_size = index->table->space->zip_size();
+	uint32_t	page_no = node->page_no;
 
 	heap = mem_heap_create(256);
 
@@ -1507,14 +1502,13 @@ rtr_non_leaf_insert_stack_push(
 	dict_index_t*		index,	/*!< in: index descriptor */
 	rtr_node_path_t*	path,	/*!< in/out: search path */
 	ulint			level,	/*!< in: index page level */
-	ulint			child_no,/*!< in: child page no */
+	uint32_t		child_no,/*!< in: child page no */
 	const buf_block_t*	block,	/*!< in: block of the page */
 	const rec_t*		rec,	/*!< in: positioned record */
 	double			mbr_inc)/*!< in: MBR needs to be enlarged */
 {
 	node_seq_t	new_seq;
 	btr_pcur_t*	my_cursor;
-	ulint		page_no = block->page.id.page_no();
 
 	my_cursor = static_cast<btr_pcur_t*>(
 		ut_malloc_nokey(sizeof(*my_cursor)));
@@ -1526,11 +1520,11 @@ rtr_non_leaf_insert_stack_push(
 	(btr_pcur_get_btr_cur(my_cursor))->index = index;
 
 	new_seq = rtr_get_current_ssn_id(index);
-	rtr_non_leaf_stack_push(path, page_no, new_seq, level, child_no,
-				my_cursor, mbr_inc);
+	rtr_non_leaf_stack_push(path, block->page.id().page_no(),
+				new_seq, level, child_no, my_cursor, mbr_inc);
 }
 
-/** Copy a buf_block_t strcuture, except "block->lock" and "block->mutex".
+/** Copy a buf_block_t, except "block->lock".
 @param[in,out]	matches	copy to match->block
 @param[in]	block	block to copy */
 static
@@ -1539,13 +1533,11 @@ rtr_copy_buf(
 	matched_rec_t*		matches,
 	const buf_block_t*	block)
 {
-	/* Copy all members of "block" to "matches->block" except "mutex"
-	and "lock". We skip "mutex" and "lock" because they are not used
+	/* Copy all members of "block" to "matches->block" except "lock".
+	We skip "lock" because it is not used
 	from the dummy buf_block_t we create here and because memcpy()ing
-	them generates (valid) compiler warnings that the vtable pointer
-	will be copied. It is also undefined what will happen with the
-	newly memcpy()ed mutex if the source mutex was acquired by
-	(another) thread while it was copied. */
+	it generates (valid) compiler warnings that the vtable pointer
+	will be copied. */
 	new (&matches->block.page) buf_page_t(block->page);
 	matches->block.frame = block->frame;
 	matches->block.unzip_LRU = block->unzip_LRU;
@@ -1553,9 +1545,7 @@ rtr_copy_buf(
 	ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list);
 	ut_d(matches->block.in_withdraw_list = block->in_withdraw_list);
 
-	/* Skip buf_block_t::mutex */
 	/* Skip buf_block_t::lock */
-	matches->block.lock_hash_val = block->lock_hash_val;
 	matches->block.modify_clock = block->modify_clock;
 #ifdef BTR_CUR_HASH_ADAPT
 	matches->block.n_hash_helps = block->n_hash_helps;
@@ -1638,6 +1628,60 @@ rtr_get_mbr_from_tuple(
 		     mbr);
 }
 
+/** Compare minimum bounding rectangles.
+@return	1, 0, or -1 if mode == PAGE_CUR_MBR_EQUAL; for the other
+compare modes 1 or 0, depending on whether a and b satisfy the
+relationship (CONTAINS, WITHIN etc.) */
+static int cmp_gis_field(page_cur_mode_t mode, const void *a, const void *b)
+{
+  return mode == PAGE_CUR_MBR_EQUAL
+    ? cmp_geometry_field(a, b)
+    : rtree_key_cmp(mode, a, b);
+}
+
+/** Compare a GIS data tuple to a physical record in rtree non-leaf node.
+The child page number is compared too, as non-leaf nodes store no PK.
+@param[in]	dtuple		data tuple (MBR, child page number)
+@param[in]	rec		R-tree record (MBR, then child page number)
+@return false if the PAGE_CUR_WITHIN comparison of the MBRs returned 0
+and the child page numbers are equal; true otherwise */
+static bool
+cmp_dtuple_rec_with_gis_internal(const dtuple_t* dtuple, const rec_t* rec)
+{
+  const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0);
+  ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
+
+  if (cmp_gis_field(PAGE_CUR_WITHIN, dfield_get_data(dtuple_field), rec))
+    return true;
+
+  dtuple_field= dtuple_get_nth_field(dtuple, 1);
+  ut_ad(dfield_get_len(dtuple_field) == 4); /* child page number */
+  ut_ad(dtuple_field->type.mtype == DATA_SYS_CHILD);
+  ut_ad(!(dtuple_field->type.prtype & ~DATA_NOT_NULL));
+
+  return memcmp(dtuple_field->data, rec + DATA_MBR_LEN, 4) != 0;
+}
+
+#ifndef UNIV_DEBUG
+static
+#endif
+/** Compare the MBR (first field) of a GIS data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec R-tree record; it is passed as the MBR to compare against
+@param[in] mode compare mode
+@return the result of cmp_gis_field() for the given mode */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+                            page_cur_mode_t mode)
+{
+  const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0);
+  /* FIXME: TABLE_SHARE::init_from_binary_frm_image() is adding
+  field->key_part_length_bytes() to the key length */
+  ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN ||
+        dfield_get_len(dtuple_field) == DATA_MBR_LEN + 2);
+
+  return cmp_gis_field(mode, dfield_get_data(dtuple_field), rec);
+}
+
 /****************************************************************//**
 Searches the right position in rtree for a page cursor. */
 bool
@@ -1663,7 +1707,6 @@ rtr_cur_search_with_match(
 	const rec_t*	best_rec;
 	const rec_t*	last_match_rec = NULL;
 	bool		match_init = false;
-	ulint		space = block->page.id.space();
 	page_cur_mode_t	orig_mode = mode;
 	const rec_t*	first_rec = NULL;
 
@@ -1710,7 +1753,7 @@ rtr_cur_search_with_match(
 		and the table is a compressed table, try to avoid
 		first page as much as possible, as there will be problem
 		when update MIN_REC rec in compress table */
-		if (buf_block_get_page_zip(block)
+		if (is_buf_block_get_page_zip(block)
 		    && !page_has_prev(page)
 		    && page_get_n_recs(page) >= 2) {
 
@@ -1719,9 +1762,6 @@ rtr_cur_search_with_match(
 	}
 
 	while (!page_rec_is_supremum(rec)) {
-		offsets = rec_get_offsets(rec, index, offsets, n_core,
-					  dtuple_get_n_fields_cmp(tuple),
-					  &heap);
 		if (!n_core) {
 			switch (mode) {
 			case PAGE_CUR_CONTAIN:
@@ -1731,21 +1771,21 @@ rtr_cur_search_with_match(
 				both CONTAIN and INTERSECT for either of
 				the search mode */
 				cmp = cmp_dtuple_rec_with_gis(
-					tuple, rec, offsets, PAGE_CUR_CONTAIN);
+					tuple, rec, PAGE_CUR_CONTAIN);
 
 				if (cmp != 0) {
 					cmp = cmp_dtuple_rec_with_gis(
-						tuple, rec, offsets,
+						tuple, rec,
 						PAGE_CUR_INTERSECT);
 				}
 				break;
 			case PAGE_CUR_DISJOINT:
 				cmp = cmp_dtuple_rec_with_gis(
-					tuple, rec, offsets, mode);
+					tuple, rec, mode);
 
 				if (cmp != 0) {
 					cmp = cmp_dtuple_rec_with_gis(
-						tuple, rec, offsets,
+						tuple, rec,
 						PAGE_CUR_INTERSECT);
 				}
 				break;
@@ -1754,11 +1794,11 @@ rtr_cur_search_with_match(
 				double	area;
 
 				cmp = cmp_dtuple_rec_with_gis(
-					tuple, rec, offsets, PAGE_CUR_WITHIN);
+					tuple, rec, PAGE_CUR_WITHIN);
 
 				if (cmp != 0) {
 					increase = rtr_rec_cal_increase(
-						tuple, rec, offsets, &area);
+						tuple, rec, &area);
 					/* Once it goes beyond DBL_MAX,
 					it would not make sense to record
 					such value, just make it
@@ -1781,19 +1821,19 @@ rtr_cur_search_with_match(
 				break;
 			case PAGE_CUR_RTREE_GET_FATHER:
 				cmp = cmp_dtuple_rec_with_gis_internal(
-					tuple, rec, offsets);
+					tuple, rec);
 				break;
 			default:
 				/* WITHIN etc. */
 				cmp = cmp_dtuple_rec_with_gis(
-					tuple, rec, offsets, mode);
+					tuple, rec, mode);
 			}
 		} else {
 			/* At leaf level, INSERT should translate to LE */
 			ut_ad(mode != PAGE_CUR_RTREE_INSERT);
 
 			cmp = cmp_dtuple_rec_with_gis(
-				tuple, rec, offsets, mode);
+				tuple, rec, mode);
 		}
 
 		if (cmp == 0) {
@@ -1804,7 +1844,7 @@ rtr_cur_search_with_match(
 			rtr_info->matches for leaf nodes */
 			if (rtr_info && mode != PAGE_CUR_RTREE_INSERT) {
 				if (!n_core) {
-					ulint		page_no;
+					uint32_t	page_no;
 					node_seq_t	new_seq;
 					bool		is_loc;
 
@@ -1847,7 +1887,11 @@ rtr_cur_search_with_match(
 						/* Lock the page, preventing it
 						from being shrunk */
 						lock_place_prdt_page_lock(
-							space, page_no, index,
+							page_id_t(block->page
+								  .id()
+								  .space(),
+								  page_no),
+							index,
 							rtr_info->thr);
 					}
 				} else {
@@ -1893,12 +1937,11 @@ rtr_cur_search_with_match(
 				then we select the record that result in
 				least increased area */
 				if (mode == PAGE_CUR_RTREE_INSERT) {
-					ulint	child_no;
 					ut_ad(least_inc < DBL_MAX);
 					offsets = rec_get_offsets(
 						best_rec, index, offsets,
 						0, ULINT_UNDEFINED, &heap);
-					child_no =
+					uint32_t child_no =
 					btr_node_ptr_get_child_page_no(
 						best_rec, offsets);
 
@@ -1967,16 +2010,10 @@ rtr_cur_search_with_match(
 	} else {
 
 		if (mode == PAGE_CUR_RTREE_INSERT) {
-			ulint	child_no;
-			ut_ad(!last_match_rec && rec);
-
-			offsets = rec_get_offsets(rec, index, offsets, 0,
-						  ULINT_UNDEFINED, &heap);
-
-			child_no = btr_node_ptr_get_child_page_no(rec, offsets);
-
+			ut_ad(!last_match_rec);
 			rtr_non_leaf_insert_stack_push(
-				index, rtr_info->parent_path, level, child_no,
+				index, rtr_info->parent_path, level,
+				mach_read_from_4(rec + DATA_MBR_LEN),
 				block, rec, 0);
 
 		} else if (rtr_info && found && !n_core) {
diff --git a/storage/innobase/ha/ha0ha.cc b/storage/innobase/ha/ha0ha.cc
deleted file mode 100644
index 8e8a3369b7c..00000000000
--- a/storage/innobase/ha/ha0ha.cc
+++ /dev/null
@@ -1,488 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/********************************************************************//**
-@file ha/ha0ha.cc
-The hash table with external chains
-
-Created 8/22/1994 Heikki Tuuri
-*************************************************************************/
-
-#include "ha0ha.h"
-
-#ifdef UNIV_DEBUG
-# include "buf0buf.h"
-#endif /* UNIV_DEBUG */
-#include "btr0sea.h"
-#include "page0page.h"
-
-/*************************************************************//**
-Creates a hash table with at least n array cells.  The actual number
-of cells is chosen to be a prime number slightly bigger than n.
-@return own: created table */
-hash_table_t*
-ib_create(
-/*======*/
-	ulint		n,	/*!< in: number of array cells */
-	latch_id_t	id,	/*!< in: latch ID */
-	ulint		n_sync_obj,
-				/*!< in: number of mutexes to protect the
-				hash table: must be a power of 2, or 0 */
-	ulint		type)	/*!< in: type of datastructure for which
-				MEM_HEAP_FOR_PAGE_HASH */
-{
-	hash_table_t*	table;
-
-	ut_a(type == MEM_HEAP_FOR_BTR_SEARCH
-	     || type == MEM_HEAP_FOR_PAGE_HASH);
-
-	ut_ad(ut_is_2pow(n_sync_obj));
-	table = hash_create(n);
-
-	/* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
-	but in practise it never should in this case, hence the asserts. */
-
-	if (n_sync_obj == 0) {
-		table->heap = mem_heap_create_typed(
-			std::min<ulong>(
-				4096,
-				MEM_MAX_ALLOC_IN_BUF / 2
-				- MEM_BLOCK_HEADER_SIZE - MEM_SPACE_NEEDED(0)),
-			type);
-		ut_a(table->heap);
-
-		return(table);
-	}
-
-	if (type == MEM_HEAP_FOR_PAGE_HASH) {
-		/* We create a hash table protected by rw_locks for
-		buf_pool->page_hash. */
-		hash_create_sync_obj(
-			table, HASH_TABLE_SYNC_RW_LOCK, id, n_sync_obj);
-	} else {
-		hash_create_sync_obj(
-			table, HASH_TABLE_SYNC_MUTEX, id, n_sync_obj);
-	}
-
-	table->heaps = static_cast<mem_heap_t**>(
-		ut_malloc_nokey(n_sync_obj * sizeof(void*)));
-
-	for (ulint i = 0; i < n_sync_obj; i++) {
-		table->heaps[i] = mem_heap_create_typed(
-			std::min<ulong>(
-				4096,
-				MEM_MAX_ALLOC_IN_BUF / 2
-				- MEM_BLOCK_HEADER_SIZE - MEM_SPACE_NEEDED(0)),
-			type);
-		ut_a(table->heaps[i]);
-	}
-
-	return(table);
-}
-
-/** Recreate a hash table with at least n array cells. The actual number
-of cells is chosen to be a prime number slightly bigger than n.
-The new cells are all cleared. The heaps are recreated.
-The sync objects are reused.
-@param[in,out]	table	hash table to be resuzed (to be freed later)
-@param[in]	n	number of array cells
-@return	resized new table */
-hash_table_t*
-ib_recreate(
-	hash_table_t*	table,
-	ulint		n)
-{
-	/* This function is for only page_hash for now */
-	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
-	ut_ad(table->n_sync_obj > 0);
-
-	hash_table_t*	new_table = hash_create(n);
-
-	new_table->type = table->type;
-	new_table->n_sync_obj = table->n_sync_obj;
-	new_table->sync_obj = table->sync_obj;
-
-	for (ulint i = 0; i < table->n_sync_obj; i++) {
-		mem_heap_free(table->heaps[i]);
-	}
-	ut_free(table->heaps);
-
-	new_table->heaps = static_cast<mem_heap_t**>(
-		ut_malloc_nokey(new_table->n_sync_obj * sizeof(void*)));
-
-	for (ulint i = 0; i < new_table->n_sync_obj; i++) {
-		new_table->heaps[i] = mem_heap_create_typed(
-			std::min<ulong>(
-				4096,
-				MEM_MAX_ALLOC_IN_BUF / 2
-				- MEM_BLOCK_HEADER_SIZE - MEM_SPACE_NEEDED(0)),
-			MEM_HEAP_FOR_PAGE_HASH);
-		ut_a(new_table->heaps[i]);
-	}
-
-	return(new_table);
-}
-
-/*************************************************************//**
-Empties a hash table and frees the memory heaps. */
-void
-ha_clear(
-/*=====*/
-	hash_table_t*	table)	/*!< in, own: hash table */
-{
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-#ifdef BTR_CUR_HASH_ADAPT
-	ut_ad(!table->adaptive || btr_search_own_all(RW_LOCK_X));
-#endif /* BTR_CUR_HASH_ADAPT */
-
-	for (ulint i = 0; i < table->n_sync_obj; i++) {
-		mem_heap_free(table->heaps[i]);
-	}
-
-	ut_free(table->heaps);
-
-	switch (table->type) {
-	case HASH_TABLE_SYNC_MUTEX:
-		for (ulint i = 0; i < table->n_sync_obj; ++i) {
-			mutex_destroy(&table->sync_obj.mutexes[i]);
-		}
-		ut_free(table->sync_obj.mutexes);
-		table->sync_obj.mutexes = NULL;
-		break;
-
-	case HASH_TABLE_SYNC_RW_LOCK:
-		for (ulint i = 0; i < table->n_sync_obj; ++i) {
-			rw_lock_free(&table->sync_obj.rw_locks[i]);
-		}
-
-		ut_free(table->sync_obj.rw_locks);
-		table->sync_obj.rw_locks = NULL;
-		break;
-
-	case HASH_TABLE_SYNC_NONE:
-		/* do nothing */
-		break;
-	}
-
-	table->n_sync_obj = 0;
-	table->type = HASH_TABLE_SYNC_NONE;
-
-
-	/* Clear the hash table. */
-	ulint	n = hash_get_n_cells(table);
-
-	for (ulint i = 0; i < n; i++) {
-		hash_get_nth_cell(table, i)->node = NULL;
-	}
-}
-
-#ifdef BTR_CUR_HASH_ADAPT
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-/** Maximum number of records in a page */
-static const ulint MAX_N_POINTERS
-	= UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES;
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-
-/*************************************************************//**
-Inserts an entry into a hash table. If an entry with the same fold number
-is found, its node is updated to point to the new data, and no new node
-is inserted. If btr_search_enabled is set to FALSE, we will only allow
-updating existing nodes, but no new node is allowed to be added.
-@return TRUE if succeed, FALSE if no more memory could be allocated */
-ibool
-ha_insert_for_fold_func(
-/*====================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold,	/*!< in: folded value of data; if a node with
-				the same fold value already exists, it is
-				updated to point to the same data, and no new
-				node is created! */
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	buf_block_t*	block,	/*!< in: buffer block containing the data */
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	const rec_t*		data)	/*!< in: data, must not be NULL */
-{
-	hash_cell_t*	cell;
-	ha_node_t*	node;
-	ha_node_t*	prev_node;
-	ulint		hash;
-
-	ut_ad(data);
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	ut_a(block->frame == page_align(data));
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	hash_assert_can_modify(table, fold);
-	ut_ad(btr_search_enabled);
-
-	hash = hash_calc_hash(fold, table);
-
-	cell = hash_get_nth_cell(table, hash);
-
-	prev_node = static_cast<ha_node_t*>(cell->node);
-
-	while (prev_node != NULL) {
-		if (prev_node->fold == fold) {
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-			if (table->adaptive) {
-				buf_block_t* prev_block = prev_node->block;
-				ut_a(prev_block->frame
-				     == page_align(prev_node->data));
-				ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
-				ut_a(block->n_pointers++ < MAX_N_POINTERS);
-			}
-
-			prev_node->block = block;
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-			prev_node->data = data;
-
-			return(TRUE);
-		}
-
-		prev_node = prev_node->next;
-	}
-
-	/* We have to allocate a new chain node */
-
-	node = static_cast<ha_node_t*>(
-		mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)));
-
-	if (node == NULL) {
-		/* It was a btr search type memory heap and at the moment
-		no more memory could be allocated: return */
-
-		ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH);
-
-		return(FALSE);
-	}
-
-	ha_node_set_data(node, block, data);
-
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	if (table->adaptive) {
-		ut_a(block->n_pointers++ < MAX_N_POINTERS);
-	}
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-
-	node->fold = fold;
-
-	node->next = NULL;
-
-	prev_node = static_cast<ha_node_t*>(cell->node);
-
-	if (prev_node == NULL) {
-
-		cell->node = node;
-
-		return(TRUE);
-	}
-
-	while (prev_node->next != NULL) {
-
-		prev_node = prev_node->next;
-	}
-
-	prev_node->next = node;
-
-	return(TRUE);
-}
-
-#ifdef UNIV_DEBUG
-/** Verify if latch corresponding to the hash table is x-latched
-@param[in]	table		hash table */
-static
-void
-ha_btr_search_latch_x_locked(const hash_table_t* table)
-{
-	ulint	i;
-	for (i = 0; i < btr_ahi_parts; ++i) {
-		if (btr_search_sys->hash_tables[i] == table) {
-			break;
-		}
-	}
-
-	ut_ad(i < btr_ahi_parts);
-	ut_ad(rw_lock_own(btr_search_latches[i], RW_LOCK_X));
-}
-#endif /* UNIV_DEBUG */
-
-/***********************************************************//**
-Deletes a hash node. */
-void
-ha_delete_hash_node(
-/*================*/
-	hash_table_t*	table,		/*!< in: hash table */
-	ha_node_t*	del_node)	/*!< in: node to be deleted */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_d(ha_btr_search_latch_x_locked(table));
-	ut_ad(btr_search_enabled);
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	if (table->adaptive) {
-		ut_a(del_node->block->frame = page_align(del_node->data));
-		ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
-	}
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-
-	HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node);
-}
-
-/*********************************************************//**
-Looks for an element when we know the pointer to the data, and updates
-the pointer to data, if found.
-@return TRUE if found */
-ibool
-ha_search_and_update_if_found_func(
-/*===============================*/
-	hash_table_t*	table,	/*!< in/out: hash table */
-	ulint		fold,	/*!< in: folded value of the searched data */
-	const rec_t*	data,	/*!< in: pointer to the data */
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	buf_block_t*	new_block,/*!< in: block containing new_data */
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	const rec_t*	new_data)/*!< in: new pointer to the data */
-{
-	ha_node_t*	node;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	hash_assert_can_modify(table, fold);
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	ut_a(new_block->frame == page_align(new_data));
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-
-	ut_d(ha_btr_search_latch_x_locked(table));
-
-	if (!btr_search_enabled) {
-		return(FALSE);
-	}
-
-	node = ha_search_with_data(table, fold, data);
-
-	if (node) {
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-		if (table->adaptive) {
-			ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
-			ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
-		}
-
-		node->block = new_block;
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-		node->data = new_data;
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/*****************************************************************//**
-Removes from the chain determined by fold all nodes whose data pointer
-points to the page given. */
-void
-ha_remove_all_nodes_to_page(
-/*========================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold,	/*!< in: fold value */
-	const page_t*	page)	/*!< in: buffer page */
-{
-	ha_node_t*	node;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	hash_assert_can_modify(table, fold);
-	ut_ad(btr_search_enabled);
-
-	node = ha_chain_get_first(table, fold);
-
-	while (node) {
-		if (page_align(ha_node_get_data(node)) == page) {
-
-			/* Remove the hash node */
-
-			ha_delete_hash_node(table, node);
-
-			/* Start again from the first node in the chain
-			because the deletion may compact the heap of
-			nodes and move other nodes! */
-
-			node = ha_chain_get_first(table, fold);
-		} else {
-			node = ha_chain_get_next(node);
-		}
-	}
-#ifdef UNIV_DEBUG
-	/* Check that all nodes really got deleted */
-
-	node = ha_chain_get_first(table, fold);
-
-	while (node) {
-		ut_a(page_align(ha_node_get_data(node)) != page);
-
-		node = ha_chain_get_next(node);
-	}
-#endif /* UNIV_DEBUG */
-}
-
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-/*************************************************************//**
-Validates a given range of the cells in hash table.
-@return TRUE if ok */
-ibool
-ha_validate(
-/*========*/
-	hash_table_t*	table,		/*!< in: hash table */
-	ulint		start_index,	/*!< in: start index */
-	ulint		end_index)	/*!< in: end index */
-{
-	ibool		ok	= TRUE;
-	ulint		i;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_a(start_index <= end_index);
-	ut_a(start_index < hash_get_n_cells(table));
-	ut_a(end_index < hash_get_n_cells(table));
-
-	for (i = start_index; i <= end_index; i++) {
-		ha_node_t*	node;
-		hash_cell_t*	cell;
-
-		cell = hash_get_nth_cell(table, i);
-
-		for (node = static_cast<ha_node_t*>(cell->node);
-		     node != 0;
-		     node = node->next) {
-
-			if (hash_calc_hash(node->fold, table) != i) {
-				ib::error() << "Hash table node fold value "
-					<< node->fold << " does not match the"
-					" cell number " << i << ".";
-
-				ok = FALSE;
-			}
-		}
-	}
-
-	return(ok);
-}
-#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
-#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/ha/ha0storage.cc b/storage/innobase/ha/ha0storage.cc
index 8857b81f2d2..acde71b0557 100644
--- a/storage/innobase/ha/ha0storage.cc
+++ b/storage/innobase/ha/ha0storage.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -53,7 +54,7 @@ ha_storage_get(
 
 	HASH_SEARCH(
 		next,			/* node->"next" */
-		storage->hash,		/* the hash table */
+		&storage->hash,		/* the hash table */
 		fold,			/* key */
 		ha_storage_node_t*,	/* type of node->next */
 		node,			/* auxiliary variable */
@@ -127,7 +128,7 @@ ha_storage_put_memlim(
 	HASH_INSERT(
 		ha_storage_node_t,	/* type used in the hash chain */
 		next,			/* node->"next" */
-		storage->hash,		/* the hash table */
+		&storage->hash,		/* the hash table */
 		fold,			/* key */
 		node);			/* add this data to the hash */
 
diff --git a/storage/innobase/ha/hash0hash.cc b/storage/innobase/ha/hash0hash.cc
deleted file mode 100644
index 51f3db09922..00000000000
--- a/storage/innobase/ha/hash0hash.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file ha/hash0hash.cc
-The simple hash table utility
-
-Created 5/20/1997 Heikki Tuuri
-*******************************************************/
-
-#include "hash0hash.h"
-#include "mem0mem.h"
-#include "sync0sync.h"
-
-/************************************************************//**
-Reserves all the locks of a hash table, in an ascending order. */
-void
-hash_lock_x_all(
-/*============*/
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
-
-	for (ulint i = 0; i < table->n_sync_obj; i++) {
-
-		rw_lock_t* lock = table->sync_obj.rw_locks + i;
-
-		ut_ad(!rw_lock_own(lock, RW_LOCK_S));
-		ut_ad(!rw_lock_own(lock, RW_LOCK_X));
-
-		rw_lock_x_lock(lock);
-	}
-}
-
-/************************************************************//**
-Releases all the locks of a hash table, in an ascending order. */
-void
-hash_unlock_x_all(
-/*==============*/
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
-
-	for (ulint i = 0; i < table->n_sync_obj; i++) {
-
-		rw_lock_t* lock = table->sync_obj.rw_locks + i;
-
-		ut_ad(rw_lock_own(lock, RW_LOCK_X));
-
-		rw_lock_x_unlock(lock);
-	}
-}
-
-/************************************************************//**
-Releases all but passed in lock of a hash table, */
-void
-hash_unlock_x_all_but(
-/*==================*/
-	hash_table_t*	table,		/*!< in: hash table */
-	rw_lock_t*	keep_lock)	/*!< in: lock to keep */
-{
-	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
-
-	for (ulint i = 0; i < table->n_sync_obj; i++) {
-
-		rw_lock_t* lock = table->sync_obj.rw_locks + i;
-
-		ut_ad(rw_lock_own(lock, RW_LOCK_X));
-
-		if (keep_lock != lock) {
-			rw_lock_x_unlock(lock);
-		}
-	}
-}
-
-/*************************************************************//**
-Creates a hash table with >= n array cells. The actual number of cells is
-chosen to be a prime number slightly bigger than n.
-@return own: created table */
-hash_table_t*
-hash_create(
-/*========*/
-	ulint	n)	/*!< in: number of array cells */
-{
-	hash_cell_t*	array;
-	ulint		prime;
-	hash_table_t*	table;
-
-	prime = ut_find_prime(n);
-
-	table = static_cast<hash_table_t*>(
-		ut_malloc_nokey(sizeof(hash_table_t)));
-
-	array = static_cast<hash_cell_t*>(
-		ut_malloc_nokey(sizeof(hash_cell_t) * prime));
-
-	/* The default type of hash_table is HASH_TABLE_SYNC_NONE i.e.:
-	the caller is responsible for access control to the table. */
-	table->type = HASH_TABLE_SYNC_NONE;
-	table->array = array;
-	table->n_cells = prime;
-#ifdef BTR_CUR_HASH_ADAPT
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	table->adaptive = FALSE;
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-#endif /* BTR_CUR_HASH_ADAPT */
-	table->n_sync_obj = 0;
-	table->sync_obj.mutexes = NULL;
-	table->heaps = NULL;
-	table->heap = NULL;
-	ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
-
-	/* Initialize the cell array */
-	hash_table_clear(table);
-
-	return(table);
-}
-
-/*************************************************************//**
-Frees a hash table. */
-void
-hash_table_free(
-/*============*/
-	hash_table_t*	table)	/*!< in, own: hash table */
-{
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-
-	ut_free(table->array);
-	ut_free(table);
-}
-
-/*************************************************************//**
-Creates a sync object array to protect a hash table.
-::sync_obj can be mutexes or rw_locks depening on the type of
-hash table. */
-void
-hash_create_sync_obj(
-/*=================*/
-	hash_table_t*		table,	/*!< in: hash table */
-	enum hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
-					or HASH_TABLE_SYNC_RW_LOCK */
-	latch_id_t		id,	/*!< in: latch ID */
-	ulint			n_sync_obj)/*!< in: number of sync objects,
-					must be a power of 2 */
-{
-	ut_a(n_sync_obj > 0);
-	ut_a(ut_is_2pow(n_sync_obj));
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-
-	table->type = type;
-
-	switch (table->type) {
-	case HASH_TABLE_SYNC_MUTEX:
-		table->sync_obj.mutexes = static_cast<ib_mutex_t*>(
-			ut_malloc_nokey(n_sync_obj * sizeof(ib_mutex_t)));
-
-		for (ulint i = 0; i < n_sync_obj; i++) {
-			mutex_create(id, table->sync_obj.mutexes + i);
-		}
-
-		break;
-
-	case HASH_TABLE_SYNC_RW_LOCK: {
-
-		latch_level_t	level = sync_latch_get_level(id);
-
-		ut_a(level != SYNC_UNKNOWN);
-
-		table->sync_obj.rw_locks = static_cast<rw_lock_t*>(
-			ut_malloc_nokey(n_sync_obj * sizeof(rw_lock_t)));
-
-		for (ulint i = 0; i < n_sync_obj; i++) {
-			rw_lock_create(hash_table_locks_key,
-			     table->sync_obj.rw_locks + i, level);
-		}
-
-		break;
-	}
-
-	case HASH_TABLE_SYNC_NONE:
-		ut_error;
-	}
-
-	table->n_sync_obj = n_sync_obj;
-}
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 86069f61c31..042999d33cb 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -56,6 +56,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include <mysql/service_thd_wait.h>
 #include "field.h"
 #include "scope.h"
+#include "srv0srv.h"
 
 // MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system;
 // MYSQL_PLUGIN_IMPORT extern char mysql_unpacked_real_data_home[];
@@ -70,6 +71,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "btr0sea.h"
 #include "buf0dblwr.h"
 #include "buf0dump.h"
+#include "buf0buf.h"
 #include "buf0flu.h"
 #include "buf0lru.h"
 #include "dict0boot.h"
@@ -77,6 +79,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "btr0defragment.h"
 #include "dict0crea.h"
 #include "dict0dict.h"
+#include "dict0priv.h"
 #include "dict0stats.h"
 #include "dict0stats_bg.h"
 #include "fil0fil.h"
@@ -102,7 +105,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "row0upd.h"
 #include "fil0crypt.h"
 #include "srv0mon.h"
-#include "srv0srv.h"
 #include "srv0start.h"
 #include "rem0rec.h"
 #include "trx0purge.h"
@@ -111,8 +113,11 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "trx0trx.h"
 #include "fil0pagecompress.h"
 #include "ut0mem.h"
+#include "ut0mutex.h"
 #include "row0ext.h"
 
+#include <limits>
+
 #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
 
 extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
@@ -121,9 +126,10 @@ void thd_clear_error(MYSQL_THD thd);
 
 TABLE *find_fk_open_table(THD *thd, const char *db, size_t db_len,
 			  const char *table, size_t table_len);
-MYSQL_THD create_thd();
-void destroy_thd(MYSQL_THD thd);
+MYSQL_THD create_background_thd();
+void destroy_background_thd(MYSQL_THD thd);
 void reset_thd(MYSQL_THD thd);
+TABLE *get_purge_table(THD *thd);
 TABLE *open_purge_table(THD *thd, const char *db, size_t dblen,
 			const char *tb, size_t tblen);
 void close_thread_tables(THD* thd);
@@ -149,12 +155,6 @@ void close_thread_tables(THD* thd);
 #include "wsrep_sst.h"
 #endif /* WITH_WSREP */
 
-/** to force correct commit order in binlog */
-static ulong commit_threads = 0;
-static mysql_cond_t commit_cond;
-static mysql_mutex_t commit_cond_m;
-static mysql_mutex_t pending_checkpoint_mutex;
-
 #define INSIDE_HA_INNOBASE_CC
 
 #define EQ_CURRENT_THD(thd) ((thd) == current_thd)
@@ -167,7 +167,6 @@ static const long AUTOINC_NO_LOCKING = 2;
 
 static ulong innobase_open_files;
 static long innobase_autoinc_lock_mode;
-static ulong innobase_commit_concurrency;
 
 static ulonglong innobase_buffer_pool_size;
 
@@ -189,17 +188,10 @@ static char*	innobase_reset_all_monitor_counter;
 
 static ulong	innodb_flush_method;
 
-/** Deprecated; no effect other than issuing a deprecation warning. */
-static char* innodb_file_format;
-/** Deprecated; no effect other than issuing a deprecation warning. */
-static char* innodb_large_prefix;
-
 /* This variable can be set in the server configure file, specifying
 stopword table to be used */
 static char*	innobase_server_stopword_table;
 
-static my_bool	innobase_use_checksums;
-static my_bool	innobase_locks_unsafe_for_binlog;
 static my_bool	innobase_rollback_on_timeout;
 static my_bool	innobase_create_status_file;
 my_bool	innobase_stats_on_metadata;
@@ -210,14 +202,8 @@ static char*	innodb_version_str = (char*) INNODB_VERSION_STR;
 extern uint srv_fil_crypt_rotate_key_age;
 extern uint srv_n_fil_crypt_iops;
 
-extern my_bool srv_immediate_scrub_data_uncompressed;
-extern my_bool srv_background_scrub_data_uncompressed;
-extern my_bool srv_background_scrub_data_compressed;
-extern uint srv_background_scrub_data_interval;
-extern uint srv_background_scrub_data_check_interval;
 #ifdef UNIV_DEBUG
 my_bool innodb_evict_tables_on_commit_debug;
-extern my_bool srv_scrub_force_testing;
 #endif
 
 /** File format constraint for ALTER TABLE */
@@ -272,63 +258,8 @@ is_partition(
 	return strstr(file_name, table_name_t::part_suffix);
 }
 
-/** Signal to shut down InnoDB (NULL if shutdown was signaled, or if
-running in innodb_read_only mode, srv_read_only_mode) */
-std::atomic <st_my_thread_var *> srv_running;
-/** Service thread that waits for the server shutdown and stops purge threads.
-Purge workers have THDs that are needed to calculate virtual columns.
-This THDs must be destroyed rather early in the server shutdown sequence.
-This service thread creates a THD and idly waits for it to get a signal to
-die. Then it notifies all purge workers to shutdown.
-*/
-static pthread_t thd_destructor_thread;
-
-pthread_handler_t
-thd_destructor_proxy(void *)
-{
-	mysql_mutex_t thd_destructor_mutex;
-	mysql_cond_t thd_destructor_cond;
-
-	my_thread_init();
-	mysql_mutex_init(PSI_NOT_INSTRUMENTED, &thd_destructor_mutex, 0);
-	mysql_cond_init(PSI_NOT_INSTRUMENTED, &thd_destructor_cond, 0);
-
-	st_my_thread_var *myvar= _my_thread_var();
-	myvar->current_mutex = &thd_destructor_mutex;
-	myvar->current_cond = &thd_destructor_cond;
-
-	THD *thd= create_thd();
-	thd_proc_info(thd, "InnoDB shutdown handler");
 
 
-	mysql_mutex_lock(&thd_destructor_mutex);
-	srv_running.store(myvar, std::memory_order_relaxed);
-	/* wait until the server wakes the THD to abort and die */
-	while (!myvar->abort)
-		mysql_cond_wait(&thd_destructor_cond, &thd_destructor_mutex);
-	mysql_mutex_unlock(&thd_destructor_mutex);
-	srv_running.store(NULL, std::memory_order_relaxed);
-
-	while (srv_fast_shutdown == 0 &&
-	       (trx_sys.any_active_transactions() ||
-		THD_count::value() > srv_n_purge_threads + 1)) {
-		thd_proc_info(thd, "InnoDB slow shutdown wait");
-		os_thread_sleep(1000);
-	}
-
-	/* Some background threads might generate undo pages that will
-	need to be purged, so they have to be shut down before purge
-	threads if slow shutdown is requested.  */
-	srv_shutdown_bg_undo_sources();
-	srv_purge_shutdown();
-
-	destroy_thd(thd);
-	mysql_cond_destroy(&thd_destructor_cond);
-	mysql_mutex_destroy(&thd_destructor_mutex);
-	my_thread_end();
-	return 0;
-}
-
 /** Return the InnoDB ROW_FORMAT enum value
 @param[in]	row_format	row_format from "innodb_default_row_format"
 @return InnoDB ROW_FORMAT value from rec_format_t enum. */
@@ -450,14 +381,6 @@ TYPELIB innodb_flush_method_typelib = {
 	NULL
 };
 
-/* The following counter is used to convey information to InnoDB
-about server activity: in case of normal DML ops it is not
-sensible to call srv_active_wake_master_thread after each
-operation, we only do it every INNOBASE_WAKE_INTERVAL'th step. */
-
-#define INNOBASE_WAKE_INTERVAL	32
-static ulong	innobase_active_counter	= 0;
-
 /** Allowed values of innodb_change_buffering */
 static const char* innodb_change_buffering_names[] = {
 	"none",		/* IBUF_USE_NONE */
@@ -580,32 +503,17 @@ const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version,
 /* All RWLOCK used in Innodb are SX-locks */
 # define PSI_RWLOCK_KEY(n) {&n##_key, #n, PSI_RWLOCK_FLAG_SX}
 
-/* Keys to register pthread mutexes/cond in the current file with
+/* Keys to register pthread mutexes in the current file with
 performance schema */
-static mysql_pfs_key_t	commit_cond_mutex_key;
-static mysql_pfs_key_t	commit_cond_key;
 static mysql_pfs_key_t	pending_checkpoint_mutex_key;
-static mysql_pfs_key_t  thd_destructor_thread_key;
-
-static PSI_mutex_info	all_pthread_mutexes[] = {
-	PSI_KEY(commit_cond_mutex),
-	PSI_KEY(pending_checkpoint_mutex),
-};
-
-static PSI_cond_info	all_innodb_conds[] = {
-	PSI_KEY(commit_cond)
-};
 
 # ifdef UNIV_PFS_MUTEX
 /* all_innodb_mutexes array contains mutexes that are
 performance schema instrumented if "UNIV_PFS_MUTEX"
 is defined */
 static PSI_mutex_info all_innodb_mutexes[] = {
-#  ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
-	PSI_KEY(buffer_block_mutex),
-#  endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+	PSI_KEY(pending_checkpoint_mutex),
 	PSI_KEY(buf_pool_mutex),
-	PSI_KEY(buf_pool_zip_mutex),
 	PSI_KEY(dict_foreign_err_mutex),
 	PSI_KEY(dict_sys_mutex),
 	PSI_KEY(recalc_pool_mutex),
@@ -614,39 +522,30 @@ static PSI_mutex_info all_innodb_mutexes[] = {
 	PSI_KEY(fts_delete_mutex),
 	PSI_KEY(fts_doc_id_mutex),
 	PSI_KEY(log_flush_order_mutex),
-	PSI_KEY(hash_table_mutex),
 	PSI_KEY(ibuf_bitmap_mutex),
 	PSI_KEY(ibuf_mutex),
 	PSI_KEY(ibuf_pessimistic_insert_mutex),
 	PSI_KEY(index_online_log),
 	PSI_KEY(log_sys_mutex),
-	PSI_KEY(log_sys_write_mutex),
-	PSI_KEY(mutex_list_mutex),
 	PSI_KEY(page_zip_stat_per_index_mutex),
 	PSI_KEY(purge_sys_pq_mutex),
 	PSI_KEY(recv_sys_mutex),
-	PSI_KEY(recv_writer_mutex),
 	PSI_KEY(redo_rseg_mutex),
 	PSI_KEY(noredo_rseg_mutex),
 #  ifdef UNIV_DEBUG
 	PSI_KEY(rw_lock_debug_mutex),
 #  endif /* UNIV_DEBUG */
 	PSI_KEY(rw_lock_list_mutex),
-	PSI_KEY(rw_lock_mutex),
 	PSI_KEY(srv_innodb_monitor_mutex),
 	PSI_KEY(srv_misc_tmpfile_mutex),
 	PSI_KEY(srv_monitor_file_mutex),
 	PSI_KEY(buf_dblwr_mutex),
 	PSI_KEY(trx_pool_mutex),
 	PSI_KEY(trx_pool_manager_mutex),
-	PSI_KEY(srv_sys_mutex),
 	PSI_KEY(lock_mutex),
 	PSI_KEY(lock_wait_mutex),
 	PSI_KEY(trx_mutex),
 	PSI_KEY(srv_threads_mutex),
-#  ifndef PFS_SKIP_EVENT_MUTEX
-	PSI_KEY(event_mutex),
-#  endif /* PFS_SKIP_EVENT_MUTEX */
 	PSI_KEY(rtr_active_mutex),
 	PSI_KEY(rtr_match_mutex),
 	PSI_KEY(rtr_path_mutex),
@@ -660,21 +559,13 @@ performance schema instrumented if "UNIV_PFS_RWLOCK"
 is defined */
 static PSI_rwlock_info all_innodb_rwlocks[] = {
 	PSI_RWLOCK_KEY(btr_search_latch),
-#  ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
-	PSI_RWLOCK_KEY(buf_block_lock),
-#  endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
-#  ifdef UNIV_DEBUG
-	PSI_RWLOCK_KEY(buf_block_debug_latch),
-#  endif /* UNIV_DEBUG */
 	PSI_RWLOCK_KEY(dict_operation_lock),
 	PSI_RWLOCK_KEY(fil_space_latch),
-	PSI_RWLOCK_KEY(checkpoint_lock),
 	PSI_RWLOCK_KEY(fts_cache_rw_lock),
 	PSI_RWLOCK_KEY(fts_cache_init_rw_lock),
 	PSI_RWLOCK_KEY(trx_i_s_cache_lock),
 	PSI_RWLOCK_KEY(trx_purge_latch),
 	PSI_RWLOCK_KEY(index_tree_rw_lock),
-	PSI_RWLOCK_KEY(hash_table_locks)
 };
 # endif /* UNIV_PFS_RWLOCK */
 
@@ -683,23 +574,9 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
 performance schema instrumented if "UNIV_PFS_THREAD"
 is defined */
 static PSI_thread_info	all_innodb_threads[] = {
-	PSI_KEY(buf_dump_thread),
-	PSI_KEY(dict_stats_thread),
-	PSI_KEY(io_handler_thread),
-	PSI_KEY(io_ibuf_thread),
-	PSI_KEY(io_log_thread),
-	PSI_KEY(io_read_thread),
-	PSI_KEY(io_write_thread),
 	PSI_KEY(page_cleaner_thread),
-	PSI_KEY(recv_writer_thread),
-	PSI_KEY(srv_error_monitor_thread),
-	PSI_KEY(srv_lock_timeout_thread),
-	PSI_KEY(srv_master_thread),
-	PSI_KEY(srv_monitor_thread),
-	PSI_KEY(srv_purge_thread),
-	PSI_KEY(srv_worker_thread),
 	PSI_KEY(trx_rollback_clean_thread),
-	PSI_KEY(thd_destructor_thread),
+	PSI_KEY(thread_pool_thread)
 };
 # endif /* UNIV_PFS_THREAD */
 
@@ -904,7 +781,7 @@ innodb_tmpdir_validate(
 Maps a MySQL trx isolation level code to the InnoDB isolation level code
 @return	InnoDB isolation level */
 static inline
-ulint
+uint
 innobase_map_isolation_level(
 /*=========================*/
 	enum_tx_isolation	iso);	/*!< in: MySQL isolation level code */
@@ -986,6 +863,11 @@ static MYSQL_THDVAR_STR(tmpdir,
   innodb_tmpdir_validate, NULL, NULL);
 
 static SHOW_VAR innodb_status_variables[]= {
+#ifdef BTR_CUR_HASH_ADAPT
+  {"adaptive_hash_hash_searches", &btr_cur_n_sea, SHOW_SIZE_T},
+  {"adaptive_hash_non_hash_searches", &btr_cur_n_non_sea, SHOW_SIZE_T},
+#endif
+  {"background_log_sync", &srv_log_writes_and_flush, SHOW_SIZE_T},
   {"buffer_pool_dump_status",
   (char*) &export_vars.innodb_buffer_pool_dump_status,	  SHOW_CHAR},
   {"buffer_pool_load_status",
@@ -995,236 +877,195 @@ static SHOW_VAR innodb_status_variables[]= {
   {"buffer_pool_load_incomplete",
   &export_vars.innodb_buffer_pool_load_incomplete,        SHOW_BOOL},
   {"buffer_pool_pages_data",
-  (char*) &export_vars.innodb_buffer_pool_pages_data,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_pages_data, SHOW_SIZE_T},
   {"buffer_pool_bytes_data",
-  (char*) &export_vars.innodb_buffer_pool_bytes_data,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T},
   {"buffer_pool_pages_dirty",
-  (char*) &export_vars.innodb_buffer_pool_pages_dirty,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_pages_dirty, SHOW_SIZE_T},
   {"buffer_pool_bytes_dirty",
-  (char*) &export_vars.innodb_buffer_pool_bytes_dirty,	  SHOW_LONG},
-  {"buffer_pool_pages_flushed",
-  (char*) &export_vars.innodb_buffer_pool_pages_flushed,  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_SIZE_T},
+  {"buffer_pool_pages_flushed", &buf_flush_page_count, SHOW_SIZE_T},
   {"buffer_pool_pages_free",
-  (char*) &export_vars.innodb_buffer_pool_pages_free,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_pages_free, SHOW_SIZE_T},
 #ifdef UNIV_DEBUG
   {"buffer_pool_pages_latched",
-  (char*) &export_vars.innodb_buffer_pool_pages_latched,  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T},
 #endif /* UNIV_DEBUG */
+  {"buffer_pool_pages_made_not_young",
+   &export_vars.innodb_buffer_pool_pages_made_not_young, SHOW_SIZE_T},
+  {"buffer_pool_pages_made_young",
+   &export_vars.innodb_buffer_pool_pages_made_young, SHOW_SIZE_T},
   {"buffer_pool_pages_misc",
-  (char*) &export_vars.innodb_buffer_pool_pages_misc,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T},
+  {"buffer_pool_pages_old",
+   &export_vars.innodb_buffer_pool_pages_old, SHOW_SIZE_T},
   {"buffer_pool_pages_total",
-  (char*) &export_vars.innodb_buffer_pool_pages_total,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T},
+  {"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T},
   {"buffer_pool_read_ahead_rnd",
-  (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG},
+   &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_SIZE_T},
   {"buffer_pool_read_ahead",
-  (char*) &export_vars.innodb_buffer_pool_read_ahead,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_read_ahead, SHOW_SIZE_T},
   {"buffer_pool_read_ahead_evicted",
-  (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG},
+   &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_SIZE_T},
   {"buffer_pool_read_requests",
-  (char*) &export_vars.innodb_buffer_pool_read_requests,  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T},
   {"buffer_pool_reads",
-  (char*) &export_vars.innodb_buffer_pool_reads,	  SHOW_LONG},
-  {"buffer_pool_wait_free",
-  (char*) &export_vars.innodb_buffer_pool_wait_free,	  SHOW_LONG},
+   &export_vars.innodb_buffer_pool_reads, SHOW_SIZE_T},
+  {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T},
   {"buffer_pool_write_requests",
-  (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG},
-  {"data_fsyncs",
-  (char*) &export_vars.innodb_data_fsyncs,		  SHOW_LONG},
-  {"data_pending_fsyncs",
-  (char*) &export_vars.innodb_data_pending_fsyncs,	  SHOW_LONG},
-  {"data_pending_reads",
-  (char*) &export_vars.innodb_data_pending_reads,	  SHOW_LONG},
-  {"data_pending_writes",
-  (char*) &export_vars.innodb_data_pending_writes,	  SHOW_LONG},
-  {"data_read",
-  (char*) &export_vars.innodb_data_read,		  SHOW_LONG},
-  {"data_reads",
-  (char*) &export_vars.innodb_data_reads,		  SHOW_LONG},
-  {"data_writes",
-  (char*) &export_vars.innodb_data_writes,		  SHOW_LONG},
-  {"data_written",
-  (char*) &export_vars.innodb_data_written,		  SHOW_LONG},
-  {"dblwr_pages_written",
-  (char*) &export_vars.innodb_dblwr_pages_written,	  SHOW_LONG},
-  {"dblwr_writes",
-  (char*) &export_vars.innodb_dblwr_writes,		  SHOW_LONG},
-  {"log_waits",
-  (char*) &export_vars.innodb_log_waits,		  SHOW_LONG},
-  {"log_write_requests",
-  (char*) &export_vars.innodb_log_write_requests,	  SHOW_LONG},
-  {"log_writes",
-  (char*) &export_vars.innodb_log_writes,		  SHOW_LONG},
-  {"os_log_fsyncs",
-  (char*) &export_vars.innodb_os_log_fsyncs,		  SHOW_LONG},
-  {"os_log_pending_fsyncs",
-  (char*) &export_vars.innodb_os_log_pending_fsyncs,	  SHOW_LONG},
-  {"os_log_pending_writes",
-  (char*) &export_vars.innodb_os_log_pending_writes,	  SHOW_LONG},
-  {"os_log_written",
-  (char*) &export_vars.innodb_os_log_written,		  SHOW_LONGLONG},
-  {"page_size",
-  (char*) &export_vars.innodb_page_size,		  SHOW_LONG},
-  {"pages_created",
-  (char*) &export_vars.innodb_pages_created,		  SHOW_LONG},
-  {"pages_read",
-  (char*) &export_vars.innodb_pages_read,		  SHOW_LONG},
-  {"pages_written",
-  (char*) &export_vars.innodb_pages_written,		  SHOW_LONG},
-  {"row_lock_current_waits",
-  (char*) &export_vars.innodb_row_lock_current_waits,	  SHOW_LONG},
-  {"row_lock_time",
-  (char*) &export_vars.innodb_row_lock_time,		  SHOW_LONGLONG},
-  {"row_lock_time_avg",
-  (char*) &export_vars.innodb_row_lock_time_avg,	  SHOW_LONG},
-  {"row_lock_time_max",
-  (char*) &export_vars.innodb_row_lock_time_max,	  SHOW_LONG},
-  {"row_lock_waits",
-  (char*) &export_vars.innodb_row_lock_waits,		  SHOW_LONG},
-  {"rows_deleted",
-  (char*) &export_vars.innodb_rows_deleted,		  SHOW_LONG},
-  {"rows_inserted",
-  (char*) &export_vars.innodb_rows_inserted,		  SHOW_LONG},
-  {"rows_read",
-  (char*) &export_vars.innodb_rows_read,		  SHOW_LONG},
-  {"rows_updated",
-  (char*) &export_vars.innodb_rows_updated,		  SHOW_LONG},
-  {"system_rows_deleted",
-  (char*) &export_vars.innodb_system_rows_deleted, SHOW_LONG},
-  {"system_rows_inserted",
-  (char*) &export_vars.innodb_system_rows_inserted, SHOW_LONG},
-  {"system_rows_read",
-  (char*) &export_vars.innodb_system_rows_read, SHOW_LONG},
-  {"system_rows_updated",
-  (char*) &export_vars.innodb_system_rows_updated, SHOW_LONG},
-  {"num_open_files",
-  (char*) &export_vars.innodb_num_open_files,		  SHOW_LONG},
-  {"truncated_status_writes",
-  (char*) &export_vars.innodb_truncated_status_writes,	  SHOW_LONG},
-  {"available_undo_logs",
-  (char*) &export_vars.innodb_available_undo_logs,        SHOW_LONG},
-  {"undo_truncations",
-  (char*) &export_vars.innodb_undo_truncations,           SHOW_LONG},
+   &export_vars.innodb_buffer_pool_write_requests, SHOW_SIZE_T},
+  {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T},
+  {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T},
+  {"data_fsyncs", &export_vars.innodb_data_fsyncs, SHOW_SIZE_T},
+  {"data_pending_fsyncs", &export_vars.innodb_data_pending_fsyncs,SHOW_SIZE_T},
+  {"data_pending_reads", &export_vars.innodb_data_pending_reads, SHOW_SIZE_T},
+  {"data_pending_writes", &export_vars.innodb_data_pending_writes,SHOW_SIZE_T},
+  {"data_read", &export_vars.innodb_data_read, SHOW_SIZE_T},
+  {"data_reads", &export_vars.innodb_data_reads, SHOW_SIZE_T},
+  {"data_writes", &export_vars.innodb_data_writes, SHOW_SIZE_T},
+  {"data_written", &export_vars.innodb_data_written, SHOW_SIZE_T},
+  {"dblwr_pages_written", &export_vars.innodb_dblwr_pages_written,SHOW_SIZE_T},
+  {"dblwr_writes", &export_vars.innodb_dblwr_writes, SHOW_SIZE_T},
+  {"deadlocks", &srv_stats.lock_deadlock_count, SHOW_SIZE_T},
+  {"history_list_length", &export_vars.innodb_history_list_length,SHOW_SIZE_T},
+  {"ibuf_discarded_delete_marks", &ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK],
+   SHOW_SIZE_T},
+  {"ibuf_discarded_deletes", &ibuf.n_discarded_ops[IBUF_OP_DELETE],
+   SHOW_SIZE_T},
+  {"ibuf_discarded_inserts", &ibuf.n_discarded_ops[IBUF_OP_INSERT],
+   SHOW_SIZE_T},
+  {"ibuf_free_list", &ibuf.free_list_len, SHOW_SIZE_T},
+  {"ibuf_merged_delete_marks", &ibuf.n_merged_ops[IBUF_OP_DELETE_MARK],
+   SHOW_SIZE_T},
+  {"ibuf_merged_deletes", &ibuf.n_merged_ops[IBUF_OP_DELETE], SHOW_SIZE_T},
+  {"ibuf_merged_inserts", &ibuf.n_merged_ops[IBUF_OP_INSERT], SHOW_SIZE_T},
+  {"ibuf_merges", &ibuf.n_merges, SHOW_SIZE_T},
+  {"ibuf_segment_size", &ibuf.seg_size, SHOW_SIZE_T},
+  {"ibuf_size", &ibuf.size, SHOW_SIZE_T},
+  {"log_waits", &export_vars.innodb_log_waits, SHOW_SIZE_T},
+  {"log_write_requests", &export_vars.innodb_log_write_requests, SHOW_SIZE_T},
+  {"log_writes", &export_vars.innodb_log_writes, SHOW_SIZE_T},
+  {"lsn_current", &export_vars.innodb_lsn_current, SHOW_ULONGLONG},
+  {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG},
+  {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint,
+   SHOW_ULONGLONG},
+  {"master_thread_active_loops", &srv_main_active_loops, SHOW_SIZE_T},
+  {"master_thread_idle_loops", &srv_main_idle_loops, SHOW_SIZE_T},
+  {"max_trx_id", &export_vars.innodb_max_trx_id, SHOW_ULONGLONG},
+#ifdef BTR_CUR_HASH_ADAPT
+  {"mem_adaptive_hash", &export_vars.innodb_mem_adaptive_hash, SHOW_SIZE_T},
+#endif
+  {"mem_dictionary", &export_vars.innodb_mem_dictionary, SHOW_SIZE_T},
+  {"os_log_fsyncs", &export_vars.innodb_os_log_fsyncs, SHOW_SIZE_T},
+  {"os_log_pending_fsyncs", &export_vars.innodb_os_log_pending_fsyncs,
+   SHOW_SIZE_T},
+  {"os_log_pending_writes", &export_vars.innodb_os_log_pending_writes,
+   SHOW_SIZE_T},
+  {"os_log_written", &export_vars.innodb_os_log_written, SHOW_SIZE_T},
+  {"page_size", &srv_page_size, SHOW_ULONG},
+  {"pages_created", &buf_pool.stat.n_pages_created, SHOW_SIZE_T},
+  {"pages_read", &buf_pool.stat.n_pages_read, SHOW_SIZE_T},
+  {"pages_written", &buf_pool.stat.n_pages_written, SHOW_SIZE_T},
+  {"row_lock_current_waits", &export_vars.innodb_row_lock_current_waits,
+   SHOW_SIZE_T},
+  {"row_lock_time", &export_vars.innodb_row_lock_time, SHOW_LONGLONG},
+  {"row_lock_time_avg", &export_vars.innodb_row_lock_time_avg, SHOW_SIZE_T},
+  {"row_lock_time_max", &export_vars.innodb_row_lock_time_max, SHOW_SIZE_T},
+  {"row_lock_waits", &export_vars.innodb_row_lock_waits, SHOW_SIZE_T},
+  {"rows_deleted", &export_vars.innodb_rows_deleted, SHOW_SIZE_T},
+  {"rows_inserted", &export_vars.innodb_rows_inserted, SHOW_SIZE_T},
+  {"rows_read", &export_vars.innodb_rows_read, SHOW_SIZE_T},
+  {"rows_updated", &export_vars.innodb_rows_updated, SHOW_SIZE_T},
+  {"system_rows_deleted", &export_vars.innodb_system_rows_deleted,SHOW_SIZE_T},
+  {"system_rows_inserted", &export_vars.innodb_system_rows_inserted,
+   SHOW_SIZE_T},
+  {"system_rows_read", &export_vars.innodb_system_rows_read, SHOW_SIZE_T},
+  {"system_rows_updated", &export_vars.innodb_system_rows_updated,
+   SHOW_SIZE_T},
+  {"num_open_files", &fil_system.n_open, SHOW_SIZE_T},
+  {"truncated_status_writes", &export_vars.innodb_truncated_status_writes,
+   SHOW_SIZE_T},
+  {"available_undo_logs", &srv_available_undo_logs, SHOW_ULONG},
+  {"undo_truncations", &export_vars.innodb_undo_truncations, SHOW_ULONG},
 
   /* Status variables for page compression */
   {"page_compression_saved",
-   (char*) &export_vars.innodb_page_compression_saved,    SHOW_LONGLONG},
+   &export_vars.innodb_page_compression_saved, SHOW_LONGLONG},
   {"num_index_pages_written",
-   (char*) &export_vars.innodb_index_pages_written,       SHOW_LONGLONG},
+   &export_vars.innodb_index_pages_written, SHOW_LONGLONG},
   {"num_non_index_pages_written",
-   (char*) &export_vars.innodb_non_index_pages_written,       SHOW_LONGLONG},
+   &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG},
   {"num_pages_page_compressed",
-   (char*) &export_vars.innodb_pages_page_compressed,     SHOW_LONGLONG},
+   &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG},
   {"num_page_compressed_trim_op",
-   (char*) &export_vars.innodb_page_compressed_trim_op,     SHOW_LONGLONG},
+   &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG},
   {"num_pages_page_decompressed",
-   (char*) &export_vars.innodb_pages_page_decompressed,   SHOW_LONGLONG},
+   &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG},
   {"num_pages_page_compression_error",
-   (char*) &export_vars.innodb_pages_page_compression_error,   SHOW_LONGLONG},
+   &export_vars.innodb_pages_page_compression_error, SHOW_LONGLONG},
   {"num_pages_encrypted",
-   (char*) &export_vars.innodb_pages_encrypted,   SHOW_LONGLONG},
+   &export_vars.innodb_pages_encrypted, SHOW_LONGLONG},
   {"num_pages_decrypted",
-   (char*) &export_vars.innodb_pages_decrypted,   SHOW_LONGLONG},
-  {"have_lz4",
-  (char*) &innodb_have_lz4,		  SHOW_BOOL},
-  {"have_lzo",
-  (char*) &innodb_have_lzo,		  SHOW_BOOL},
-  {"have_lzma",
-  (char*) &innodb_have_lzma,		  SHOW_BOOL},
-  {"have_bzip2",
-  (char*) &innodb_have_bzip2,		  SHOW_BOOL},
-  {"have_snappy",
-  (char*) &innodb_have_snappy,		  SHOW_BOOL},
-  {"have_punch_hole",
-  (char*) &innodb_have_punch_hole,	  SHOW_BOOL},
+   &export_vars.innodb_pages_decrypted, SHOW_LONGLONG},
+  {"have_lz4", &innodb_have_lz4, SHOW_BOOL},
+  {"have_lzo", &innodb_have_lzo, SHOW_BOOL},
+  {"have_lzma", &innodb_have_lzma, SHOW_BOOL},
+  {"have_bzip2", &innodb_have_bzip2, SHOW_BOOL},
+  {"have_snappy", &innodb_have_snappy, SHOW_BOOL},
+  {"have_punch_hole", &innodb_have_punch_hole, SHOW_BOOL},
 
   /* Defragmentation */
   {"defragment_compression_failures",
-  (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG},
-  {"defragment_failures",
-  (char*) &export_vars.innodb_defragment_failures, SHOW_LONG},
-  {"defragment_count",
-  (char*) &export_vars.innodb_defragment_count, SHOW_LONG},
+   &export_vars.innodb_defragment_compression_failures, SHOW_SIZE_T},
+  {"defragment_failures", &export_vars.innodb_defragment_failures,SHOW_SIZE_T},
+  {"defragment_count", &export_vars.innodb_defragment_count, SHOW_SIZE_T},
 
   {"instant_alter_column",
-  (char*) &export_vars.innodb_instant_alter_column, SHOW_LONG},
+   &export_vars.innodb_instant_alter_column, SHOW_ULONG},
 
   /* Online alter table status variables */
   {"onlineddl_rowlog_rows",
-  (char*) &export_vars.innodb_onlineddl_rowlog_rows, SHOW_LONG},
+   &export_vars.innodb_onlineddl_rowlog_rows, SHOW_SIZE_T},
   {"onlineddl_rowlog_pct_used",
-  (char*) &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_LONG},
+   &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_SIZE_T},
   {"onlineddl_pct_progress",
-  (char*) &export_vars.innodb_onlineddl_pct_progress, SHOW_LONG},
+   &export_vars.innodb_onlineddl_pct_progress, SHOW_SIZE_T},
 
   /* Times secondary index lookup triggered cluster lookup and
   times prefix optimization avoided triggering cluster lookup */
   {"secondary_index_triggered_cluster_reads",
-  (char*) &export_vars.innodb_sec_rec_cluster_reads,	  SHOW_LONG},
+   &export_vars.innodb_sec_rec_cluster_reads, SHOW_SIZE_T},
   {"secondary_index_triggered_cluster_reads_avoided",
-  (char*) &export_vars.innodb_sec_rec_cluster_reads_avoided, SHOW_LONG},
+   &export_vars.innodb_sec_rec_cluster_reads_avoided, SHOW_SIZE_T},
 
   /* Encryption */
   {"encryption_rotation_pages_read_from_cache",
-   (char*) &export_vars.innodb_encryption_rotation_pages_read_from_cache,
-   SHOW_LONG},
+   &export_vars.innodb_encryption_rotation_pages_read_from_cache, SHOW_SIZE_T},
   {"encryption_rotation_pages_read_from_disk",
-  (char*) &export_vars.innodb_encryption_rotation_pages_read_from_disk,
-   SHOW_LONG},
+   &export_vars.innodb_encryption_rotation_pages_read_from_disk, SHOW_SIZE_T},
   {"encryption_rotation_pages_modified",
-  (char*) &export_vars.innodb_encryption_rotation_pages_modified,
-   SHOW_LONG},
+   &export_vars.innodb_encryption_rotation_pages_modified, SHOW_SIZE_T},
   {"encryption_rotation_pages_flushed",
-  (char*) &export_vars.innodb_encryption_rotation_pages_flushed,
-   SHOW_LONG},
+   &export_vars.innodb_encryption_rotation_pages_flushed, SHOW_SIZE_T},
   {"encryption_rotation_estimated_iops",
-  (char*) &export_vars.innodb_encryption_rotation_estimated_iops,
-   SHOW_LONG},
+   &export_vars.innodb_encryption_rotation_estimated_iops, SHOW_SIZE_T},
   {"encryption_key_rotation_list_length",
-  (char*)&export_vars.innodb_key_rotation_list_length,
-   SHOW_LONGLONG},
+   &export_vars.innodb_key_rotation_list_length, SHOW_LONGLONG},
   {"encryption_n_merge_blocks_encrypted",
-  (char*)&export_vars.innodb_n_merge_blocks_encrypted,
-   SHOW_LONGLONG},
+   &export_vars.innodb_n_merge_blocks_encrypted, SHOW_LONGLONG},
   {"encryption_n_merge_blocks_decrypted",
-  (char*)&export_vars.innodb_n_merge_blocks_decrypted,
-   SHOW_LONGLONG},
+   &export_vars.innodb_n_merge_blocks_decrypted, SHOW_LONGLONG},
   {"encryption_n_rowlog_blocks_encrypted",
-  (char*)&export_vars.innodb_n_rowlog_blocks_encrypted,
-   SHOW_LONGLONG},
+   &export_vars.innodb_n_rowlog_blocks_encrypted, SHOW_LONGLONG},
   {"encryption_n_rowlog_blocks_decrypted",
-  (char*)&export_vars.innodb_n_rowlog_blocks_decrypted,
-   SHOW_LONGLONG},
+   &export_vars.innodb_n_rowlog_blocks_decrypted, SHOW_LONGLONG},
   {"encryption_n_temp_blocks_encrypted",
-  (char*)&export_vars.innodb_n_temp_blocks_encrypted,
-   SHOW_LONGLONG},
+   &export_vars.innodb_n_temp_blocks_encrypted, SHOW_LONGLONG},
   {"encryption_n_temp_blocks_decrypted",
-  (char*)&export_vars.innodb_n_temp_blocks_decrypted,
-   SHOW_LONGLONG},
-
-  /* scrubing */
-  {"scrub_background_page_reorganizations",
-   (char*) &export_vars.innodb_scrub_page_reorganizations,
-   SHOW_LONG},
-  {"scrub_background_page_splits",
-   (char*) &export_vars.innodb_scrub_page_splits,
-   SHOW_LONG},
-  {"scrub_background_page_split_failures_underflow",
-   (char*) &export_vars.innodb_scrub_page_split_failures_underflow,
-   SHOW_LONG},
-  {"scrub_background_page_split_failures_out_of_filespace",
-   (char*) &export_vars.innodb_scrub_page_split_failures_out_of_filespace,
-   SHOW_LONG},
-  {"scrub_background_page_split_failures_missing_index",
-   (char*) &export_vars.innodb_scrub_page_split_failures_missing_index,
-   SHOW_LONG},
-  {"scrub_background_page_split_failures_unknown",
-   (char*) &export_vars.innodb_scrub_page_split_failures_unknown,
-   SHOW_LONG},
-  {"scrub_log",
-   (char*) &export_vars.innodb_scrub_log,
+   &export_vars.innodb_n_temp_blocks_decrypted, SHOW_LONGLONG},
+  {"encryption_num_key_requests", &export_vars.innodb_encryption_key_requests,
    SHOW_LONGLONG},
-  {"encryption_num_key_requests",
-   (char*) &export_vars.innodb_encryption_key_requests, SHOW_LONGLONG},
 
   {NullS, NullS, SHOW_LONG}
 };
@@ -1331,21 +1172,32 @@ innobase_release_savepoint(
 					savepoint should be released */
 	void*		savepoint);	/*!< in: savepoint data */
 
-static void innobase_checkpoint_request(handlerton *hton, void *cookie);
-
-/** @brief Initialize the default value of innodb_commit_concurrency.
+/** Request notification of log writes */
+static void innodb_log_flush_request(void *cookie);
 
-Once InnoDB is running, the innodb_commit_concurrency must not change
-from zero to nonzero. (Bug #42101)
+/** Requests for log flushes */
+struct log_flush_request
+{
+  /** earlier request (for a smaller LSN) */
+  log_flush_request *next;
+  /** parameter provided to innodb_log_flush_request() */
+  void *cookie;
+  /** log sequence number that is being waited for */
+  lsn_t lsn;
+};
 
-The initial default value is 0, and without this extra initialization,
-SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
-to 0, even if it was initially set to nonzero at the command line
-or configuration file. */
-static
-void
-innobase_commit_concurrency_init_default();
-/*=======================================*/
+/** Pending innodb_log_flush_request() requests, singly linked via next */
+MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) static
+struct
+{
+  /** first request */
+  std::atomic<log_flush_request*> start;
+  /** last request */
+  log_flush_request *end;
+  /** mutex protecting this object */
+  mysql_mutex_t mutex;
+}
+log_requests;
 
 /** @brief Adjust some InnoDB startup parameters based on file contents
 or innodb_page_size. */
@@ -1496,6 +1348,30 @@ innobase_show_status(
 	stat_print_fn*		stat_print,
 	enum ha_stat_type	stat_type);
 
+/** After ALTER TABLE, recompute statistics. */
+inline void ha_innobase::reload_statistics()
+{
+  if (dict_table_t *table= m_prebuilt ? m_prebuilt->table : nullptr)
+  {
+    if (table->is_readable())
+      dict_stats_init(table);
+    else
+      table->stat_initialized= 1;
+  }
+}
+
+/** Table definition change hook: reload statistics via the handler, if any. */
+static int innodb_notify_tabledef_changed(handlerton *,
+                                          LEX_CSTRING *, LEX_CSTRING *,
+                                          LEX_CUSTRING *, LEX_CUSTRING *,
+                                          handler *handler)
+{
+  DBUG_ENTER("innodb_notify_tabledef_changed");
+  if (handler)
+    static_cast<ha_innobase*>(handler)->reload_statistics();
+  DBUG_RETURN(0);
+}
+
 /****************************************************************//**
 Parse and enable InnoDB monitor counters during server startup.
 User can enable monitor counters/groups by specifying
@@ -1526,36 +1402,6 @@ innobase_fts_store_docid(
 }
 #endif
 
-/*************************************************************//**
-Check for a valid value of innobase_commit_concurrency.
-@return 0 for valid innodb_commit_concurrency */
-static
-int
-innobase_commit_concurrency_validate(
-/*=================================*/
-	THD*, st_mysql_sys_var*,
-	void*				save,	/*!< out: immediate result
-						for update function */
-	struct st_mysql_value*		value)	/*!< in: incoming string */
-{
-	long long	intbuf;
-	ulong		commit_concurrency;
-
-	DBUG_ENTER("innobase_commit_concurrency_validate");
-
-	if (value->val_int(value, &intbuf)) {
-		/* The value is NULL. That is invalid. */
-		DBUG_RETURN(1);
-	}
-
-	*reinterpret_cast<ulong*>(save) = commit_concurrency
-		= static_cast<ulong>(intbuf);
-
-	/* Allow the value to be updated, as long as it remains zero
-	or nonzero. */
-	DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
-}
-
 /*******************************************************************//**
 Function for constructing an InnoDB table handler instance. */
 static
@@ -1598,10 +1444,7 @@ innodb_page_size_validate(
 
 /******************************************************************//**
 Returns true if the thread is the replication thread on the slave
-server. Used in srv_conc_enter_innodb() to determine if the thread
-should be allowed to enter InnoDB - the replication thread is treated
-differently than other threads. Also used in
-srv_conc_force_exit_innodb().
+server.
 @return true if thd is the replication thread */
 ibool
 thd_is_replication_slave_thread(
@@ -1634,7 +1477,7 @@ MYSQL_THD
 innobase_create_background_thd(const char* name)
 /*============================*/
 {
-	MYSQL_THD thd= create_thd();
+	MYSQL_THD thd= create_background_thd();
 	thd_proc_info(thd, name);
 	THDVAR(thd, background_thread) = true;
 	return thd;
@@ -1652,7 +1495,7 @@ innobase_destroy_background_thd(
 	if innodb is in the PLUGIN_IS_DYING state */
 	innobase_close_connection(innodb_hton_ptr, thd);
 	thd_set_ha_data(thd, innodb_hton_ptr, NULL);
-	destroy_thd(thd);
+	destroy_background_thd(thd);
 }
 
 /** Close opened tables, free memory, delete items for a MYSQL_THD.
@@ -1687,86 +1530,7 @@ thd_trx_is_auto_commit(
 	       && !thd_test_options(
 		       thd,
 		       OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
-	       && thd_is_select(thd));
-}
-
-/** Enter InnoDB engine after checking the max number of user threads
-allowed, else the thread is put into sleep.
-@param[in,out]	prebuilt	row prebuilt handler */
-static inline void innobase_srv_conc_enter_innodb(row_prebuilt_t *prebuilt)
-{
-	trx_t* trx = prebuilt->trx;
-
-#ifdef WITH_WSREP
-	if (global_system_variables.wsrep_on &&
-	    (wsrep_thd_is_applying(trx->mysql_thd)
-	     || wsrep_thd_is_toi(trx->mysql_thd))) {
-		return;
-	}
-#endif /* WITH_WSREP */
-
-	if (srv_thread_concurrency) {
-		if (trx->n_tickets_to_enter_innodb > 0) {
-
-			/* If trx has 'free tickets' to enter the engine left,
-			then use one such ticket */
-
-			--trx->n_tickets_to_enter_innodb;
-
-		} else if (trx->mysql_thd != NULL
-			   && thd_is_replication_slave_thread(trx->mysql_thd)) {
-			const ulonglong end = my_interval_timer()
-				+ ulonglong(srv_replication_delay) * 1000000;
-			while ((srv_conc_get_active_threads()
-			        >= srv_thread_concurrency)
-			       && my_interval_timer() < end) {
-				os_thread_sleep(2000 /* 2 ms */);
-			}
-		} else {
-			srv_conc_enter_innodb(prebuilt);
-		}
-	}
-}
-
-/** Note that the thread wants to leave InnoDB only if it doesn't have
-any spare tickets.
-@param[in,out]	m_prebuilt	row prebuilt handler */
-static inline void innobase_srv_conc_exit_innodb(row_prebuilt_t *prebuilt)
-{
-	ut_ad(!sync_check_iterate(sync_check()));
-
-	trx_t* trx = prebuilt->trx;
-
-#ifdef WITH_WSREP
-	if (global_system_variables.wsrep_on &&
-	    (wsrep_thd_is_applying(trx->mysql_thd)
-	     || wsrep_thd_is_toi(trx->mysql_thd))) {
-		return;
-	}
-#endif /* WITH_WSREP */
-
-	/* This is to avoid making an unnecessary function call. */
-	if (trx->declared_to_be_inside_innodb
-	    && trx->n_tickets_to_enter_innodb == 0) {
-
-		srv_conc_force_exit_innodb(trx);
-	}
-}
-
-/******************************************************************//**
-Force a thread to leave InnoDB even if it has spare tickets. */
-static inline
-void
-innobase_srv_conc_force_exit_innodb(
-/*================================*/
-	trx_t*	trx)	/*!< in: transaction handle */
-{
-	ut_ad(!sync_check_iterate(sync_check()));
-
-	/* This is to avoid making an unnecessary function call. */
-	if (trx->declared_to_be_inside_innodb) {
-		srv_conc_force_exit_innodb(trx);
-	}
+	       && thd_sql_command(thd) == SQLCOM_SELECT);
 }
 
 /******************************************************************//**
@@ -1803,17 +1567,6 @@ thd_query_start_micro(
 }
 
 /******************************************************************//**
-Returns true if the thread is executing a SELECT statement.
-@return true if thd is executing SELECT */
-ibool
-thd_is_select(
-/*==========*/
-	const THD*	thd)	/*!< in: thread handle */
-{
-	return(thd_sql_command(thd) == SQLCOM_SELECT);
-}
-
-/******************************************************************//**
 Returns the lock wait timeout for the current connection.
 @return the lock wait timeout, in seconds */
 ulong
@@ -1875,6 +1628,7 @@ static void sst_disable_innodb_writes()
   srv_n_fil_crypt_threads= old_count;
 
   wsrep_sst_disable_writes= true;
+  dict_stats_shutdown();
   purge_sys.stop();
   /* We are holding a global MDL thanks to FLUSH TABLES WITH READ LOCK.
 
@@ -1902,6 +1656,7 @@ static void sst_enable_innodb_writes()
 {
   ut_ad(recv_no_log_write);
   ut_d(recv_no_log_write= false);
+  dict_stats_start();
   purge_sys.resume();
   wsrep_sst_disable_writes= false;
   fil_crypt_set_thread_cnt(srv_n_fil_crypt_threads);
@@ -1920,23 +1675,6 @@ static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid);
 static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid);
 #endif /* WITH_WSREP */
 /********************************************************************//**
-Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
-time calls srv_active_wake_master_thread. This function should be used
-when a single database operation may introduce a small need for
-server utility activity, like checkpointing. */
-inline
-void
-innobase_active_small(void)
-/*=======================*/
-{
-	innobase_active_counter++;
-
-	if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
-		srv_active_wake_master_thread();
-	}
-}
-
-/********************************************************************//**
 Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 about a possible transaction rollback inside InnoDB caused by a lock wait
 timeout or a deadlock.
@@ -2166,8 +1904,8 @@ void
 innobase_get_cset_width(
 /*====================*/
 	ulint	cset,		/*!< in: MySQL charset-collation code */
-	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
-	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
+	unsigned*mbminlen,	/*!< out: minimum length of a char (in bytes) */
+	unsigned*mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
 {
 	CHARSET_INFO*	cs;
 	ut_ad(cset <= MAX_CHAR_COLL_NUM);
@@ -2384,98 +2122,10 @@ static bool is_mysql_datadir_path(const char *path)
   if (!lower_case_file_system)
     return(memcmp(mysql_data_dir, path_dir, mysql_data_home_len));
 
-  return(files_charset_info->coll->strnncoll(files_charset_info,
-                                            (uchar *) path_dir, path_len,
-                                            (uchar *) mysql_data_dir,
-                                            mysql_data_home_len,
-                                            TRUE));
-}
-
-static int mysql_tmpfile_path(const char *path, const char *prefix)
-{
-  DBUG_ASSERT(path != NULL);
-  DBUG_ASSERT((strlen(path) + strlen(prefix)) <= FN_REFLEN);
-
-  char filename[FN_REFLEN];
-  File fd = create_temp_file(filename, path, prefix, O_BINARY | O_SEQUENTIAL,
-                             MYF(MY_WME | MY_TEMPORARY));
-  return fd;
-}
-
-/** Creates a temporary file in the location specified by the parameter
-path. If the path is NULL, then it will be created in tmpdir.
-@param[in]	path	location for creating temporary file
-@return temporary file descriptor, or < 0 on error */
-os_file_t
-innobase_mysql_tmpfile(
-	const char*	path)
-{
-	File	fd;
-
-	DBUG_EXECUTE_IF(
-		"innobase_tmpfile_creation_failure",
-		return(OS_FILE_CLOSED);
-	);
-
-	if (path == NULL) {
-		fd = mysql_tmpfile("ib");
-	} else {
-		fd = mysql_tmpfile_path(path, "ib");
-	}
-
-	if (fd < 0)
-		return OS_FILE_CLOSED;
-
-	/* Copy the file descriptor, so that the additional resources
-	allocated by create_temp_file() can be freed by invoking
-	my_close().
-
-	Because the file descriptor returned by this function
-	will be passed to fdopen(), it will be closed by invoking
-	fclose(), which in turn will invoke close() instead of
-	my_close(). */
-
-#ifdef _WIN32
-	/* Note that on Windows, the integer returned by mysql_tmpfile
-	has no relation to C runtime file descriptor. Here, we need
-	to call my_get_osfhandle to get the HANDLE and then convert it
-	to C runtime filedescriptor. */
-
-	HANDLE hFile = my_get_osfhandle(fd);
-	HANDLE hDup;
-	BOOL bOK = DuplicateHandle(
-			GetCurrentProcess(),
-			hFile, GetCurrentProcess(),
-			&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
-	my_close(fd, MYF(MY_WME));
-
-	if (!bOK) {
-		my_osmaperr(GetLastError());
-		goto error;
-	}
-	return hDup;
-#else
-#ifdef F_DUPFD_CLOEXEC
-	int fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
-#else
-	int fd2 = dup(fd);
-#endif
-	my_close(fd, MYF(MY_WME));
-	if (fd2 < 0) {
-		set_my_errno(errno);
-		goto error;
-	}
-	return fd2;
-#endif
-
-error:
-	char errbuf[MYSYS_STRERROR_SIZE];
-
-	my_error(EE_OUT_OF_FILERESOURCES,
-		MYF(0),
-		"ib*", errno,
-		my_strerror(errbuf, sizeof(errbuf), errno));
-	return (OS_FILE_CLOSED);
+  return(files_charset_info->strnncoll((uchar *) path_dir, path_len,
+                                       (uchar *) mysql_data_dir,
+                                       mysql_data_home_len,
+                                       TRUE));
 }
 
 /*********************************************************************//**
@@ -2827,18 +2477,6 @@ trx_is_registered_for_2pc(
 }
 
 /*********************************************************************//**
-Note that a transaction has been registered with MySQL 2PC coordinator. */
-static inline
-void
-trx_register_for_2pc(
-/*==================*/
-	trx_t*	trx)	/* in: transaction */
-{
-	trx->is_registered = 1;
-	ut_ad(!trx->active_commit_ordered);
-}
-
-/*********************************************************************//**
 Note that a transaction has been deregistered. */
 static inline
 void
@@ -2950,6 +2588,7 @@ ha_innobase::ha_innobase(
 			  | HA_CAN_FULLTEXT_HINTS
 		*/
 			  | HA_CAN_EXPORT
+                          | HA_ONLINE_ANALYZE
 			  | HA_CAN_RTREEKEYS
                           | HA_CAN_TABLES_WITHOUT_ROLLBACK
                           | HA_CAN_ONLINE_BACKUPS
@@ -3032,22 +2671,17 @@ innobase_register_trx(
 	THD*		thd,	/* in: MySQL thd (connection) object */
 	trx_t*		trx)	/* in: transaction to register */
 {
-	/* JAN: TODO: MySQL 5.7 PSI
-	const ulonglong	trx_id = static_cast<const ulonglong>(
-		trx_get_id_for_print(trx));
-
-	trans_register_ha(thd, FALSE, hton, &trx_id);
-	*/
-	trans_register_ha(thd, FALSE, hton);
+  ut_ad(!trx->active_commit_ordered);
+  const trx_id_t trx_id= trx->id;
 
-	if (!trx_is_registered_for_2pc(trx)
-	    && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+  trans_register_ha(thd, false, hton, trx_id);
 
-		//trans_register_ha(thd, TRUE, hton, &trx_id);
-		trans_register_ha(thd, TRUE, hton);
-	}
-
-	trx_register_for_2pc(trx);
+  if (!trx->is_registered)
+  {
+    trx->is_registered= true;
+    if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+      trans_register_ha(thd, true, hton, trx_id);
+  }
 }
 
 /*	BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
@@ -3217,8 +2851,6 @@ innobase_query_caching_of_table_permitted(
 		return(false);
 	}
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
 	    && trx->n_mysql_tables_in_use == 0) {
 		/* We are going to retrieve the query result from the query
@@ -3516,8 +3148,6 @@ ha_innobase::init_table_handle_for_HANDLER(void)
 	/* Initialize the m_prebuilt struct much like it would be inited in
 	external_lock */
 
-	innobase_srv_conc_force_exit_innodb(m_prebuilt->trx);
-
 	/* If the transaction is not started yet, start it */
 
 	trx_start_if_not_started_xa(m_prebuilt->trx, false);
@@ -3624,44 +3254,12 @@ static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
   2ULL << 20,
   LLONG_MAX, 1024*1024L);
 
-/** Deprecation message about innodb_idle_flush_pct */
-static const char*	deprecated_idle_flush_pct
-	= "innodb_idle_flush_pct is DEPRECATED and has no effect.";
-
 static const char*	deprecated_innodb_checksum_algorithm
 	= "Setting innodb_checksum_algorithm to values other than"
 	" crc32, full_crc32, strict_crc32 or strict_full_crc32"
 	" is UNSAFE and DEPRECATED."
 	" These deprecated values will be disallowed in MariaDB 10.6.";
 
-static ulong innodb_idle_flush_pct;
-
-/** If applicable, emit a message that log checksums cannot be disabled.
-@param[in,out]	thd	client session, or NULL if at startup
-@param[in]	check	whether redo log block checksums are enabled
-@return whether redo log block checksums are enabled */
-static inline
-bool
-innodb_log_checksums_func_update(THD* thd, bool check)
-{
-	static const char msg[] = "innodb_log_checksums is deprecated"
-		" and has no effect outside recovery";
-
-	ut_ad(!thd == !srv_was_started);
-
-	if (!check) {
-		if (thd) {
-			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
-					    HA_ERR_UNSUPPORTED, msg);
-			check = true;
-		} else {
-			sql_print_warning(msg);
-		}
-	}
-
-	return(check);
-}
-
 static void innodb_checksum_algorithm_update(THD *thd, st_mysql_sys_var*,
                                              void *, const void *save)
 {
@@ -3718,64 +3316,146 @@ static ulonglong innodb_prepare_commit_versioned(THD* thd, ulonglong *trx_id)
 /** Initialize and normalize innodb_buffer_pool_size. */
 static void innodb_buffer_pool_size_init()
 {
-	if (srv_buf_pool_size >= BUF_POOL_SIZE_THRESHOLD) {
-
-		if (srv_buf_pool_instances == srv_buf_pool_instances_default) {
-#if defined(_WIN32) && !defined(_WIN64)
-			/* Do not allocate too large of a buffer pool on
-			Windows 32-bit systems, which can have trouble
-			allocating larger single contiguous memory blocks. */
-			srv_buf_pool_size = ulint(
-				ut_uint64_align_up(srv_buf_pool_size,
-						   srv_buf_pool_chunk_unit));
-			srv_buf_pool_instances = std::min<ulong>(
-				MAX_BUFFER_POOLS,
-				ulong(srv_buf_pool_size
-				      / srv_buf_pool_chunk_unit));
-#else /* defined(_WIN32) && !defined(_WIN64) */
-			/* Default to 8 instances when size > 1GB. */
-			srv_buf_pool_instances = 8;
-#endif /* defined(_WIN32) && !defined(_WIN64) */
-		}
-	} else {
-		/* If buffer pool is less than 1 GiB, assume fewer
-		threads. Also use only one buffer pool instance. */
-		if (srv_buf_pool_instances != srv_buf_pool_instances_default
-		    && srv_buf_pool_instances != 1) {
-			/* We can't distinguish whether the user has explicitly
-			started mysqld with --innodb-buffer-pool-instances=0,
-			(srv_buf_pool_instances_default is 0) or has not
-			specified that option at all. Thus we have the
-			limitation that if the user started with =0, we
-			will not emit a warning here, but we should actually
-			do so. */
-			ib::info()
-				<< "Adjusting innodb_buffer_pool_instances"
-				" from " << srv_buf_pool_instances << " to 1"
-				" since innodb_buffer_pool_size is less than "
-				<< BUF_POOL_SIZE_THRESHOLD / (1024 * 1024)
-				<< " MiB";
-		}
-
-		srv_buf_pool_instances = 1;
-	}
-
-	if (srv_buf_pool_chunk_unit * srv_buf_pool_instances
-	    > srv_buf_pool_size) {
+	if (srv_buf_pool_chunk_unit > srv_buf_pool_size) {
 		/* Size unit of buffer pool is larger than srv_buf_pool_size.
 		adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */
-		srv_buf_pool_chunk_unit
-			= static_cast<ulong>(srv_buf_pool_size)
-			  / srv_buf_pool_instances;
-		if (srv_buf_pool_size % srv_buf_pool_instances != 0) {
-			++srv_buf_pool_chunk_unit;
-		}
+		srv_buf_pool_chunk_unit = ulong(srv_buf_pool_size);
 	}
 
 	srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size);
 	innobase_buffer_pool_size = srv_buf_pool_size;
 }
 
+namespace deprecated {
+/** Deprecated; no effect other than issuing a deprecation warning. */
+char* innodb_file_format;
+/** Deprecated; no effect other than issuing a deprecation warning. */
+char* innodb_large_prefix;
+
+/** Deprecated parameter with no effect */
+static my_bool innodb_log_checksums;
+/** Deprecation message for innodb_log_checksums */
+static const char* innodb_log_checksums_msg
+= "The parameter innodb_log_checksums is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static my_bool innodb_log_compressed_pages;
+/** Deprecation message for innodb_log_compressed_pages */
+static const char* innodb_log_compressed_pages_msg
+= "The parameter innodb_log_compressed_pages is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static my_bool	innodb_log_optimize_ddl;
+static const char* innodb_log_optimize_ddl_msg
+= "The parameter innodb_log_optimize_ddl is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static my_bool innodb_scrub_log;
+/** Deprecation message for innodb_scrub_log */
+static const char* innodb_scrub_log_msg
+= "The parameter innodb_scrub_log is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static ulonglong innodb_scrub_log_speed;
+/** Deprecation message for innodb_scrub_log_speed */
+static const char* innodb_scrub_log_speed_msg
+= "The parameter innodb_scrub_log_speed is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static ulong innodb_undo_logs;
+/** Deprecation message for innodb_undo_logs */
+static const char* innodb_undo_logs_msg
+= "The parameter innodb_undo_logs is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static ulong innodb_buffer_pool_instances;
+/** Deprecated parameter with no effect */
+static ulong innodb_page_cleaners;
+static const char* innodb_page_cleaners_msg
+= "The parameter innodb_page_cleaners is deprecated and has no effect.";
+
+ulong srv_n_log_files;
+static const char* srv_n_log_files_msg
+= "The parameter innodb_log_files_in_group is deprecated and has no effect.";
+
+static my_bool innodb_background_scrub_data_uncompressed;
+
+static const char* innodb_background_scrub_data_uncompressed_msg
+= "The parameter innodb_background_scrub_data_uncompressed is deprecated and"
+  " has no effect.";
+
+static my_bool innodb_background_scrub_data_compressed;
+
+static const char* innodb_background_scrub_data_compressed_msg
+= "The parameter innodb_background_scrub_data_compressed is deprecated and"
+  " has no effect.";
+
+static uint innodb_background_scrub_data_check_interval;
+
+static const char* innodb_background_scrub_data_check_interval_msg
+= "The parameter innodb_background_scrub_data_check_interval is deprecated and"
+  " has no effect.";
+
+static uint innodb_background_scrub_data_interval;
+
+static const char* innodb_background_scrub_data_interval_msg
+= "The parameter innodb_background_scrub_data_interval is deprecated and"
+  " has no effect.";
+
+uint replication_delay;
+uint thread_concurrency;
+uint commit_concurrency;
+uint concurrency_tickets;
+uint adaptive_max_sleep_delay;
+uint thread_sleep_delay;
+
+static const char * const replication_delay_msg
+= "The parameter innodb_replication_delay is deprecated and has no effect.";
+static const char * const thread_concurrency_msg
+= "The parameter innodb_thread_concurrency is deprecated and has no effect.";
+static const char * const commit_concurrency_msg
+= "The parameter innodb_commit_concurrency is deprecated and has no effect.";
+static const char * const concurrency_tickets_msg
+= "The parameter innodb_concurrency_tickets is deprecated and has no effect.";
+static const char * const adaptive_max_sleep_delay_msg
+= "The parameter innodb_adaptive_max_sleep_delay is deprecated and"
+  " has no effect.";
+static const char * const thread_sleep_delay_msg
+= "The parameter innodb_thread_sleep_delay is deprecated and has no effect.";
+
+static void replication_delay_warn(THD* thd, st_mysql_sys_var*, void*,
+                                   const void*)
+{
+  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+                      replication_delay_msg);
+}
+static void thread_concurrency_warn(THD* thd, st_mysql_sys_var*, void*,
+                                    const void*)
+{
+  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+                      thread_concurrency_msg);
+}
+static void commit_concurrency_warn(THD* thd, st_mysql_sys_var*, void*,
+                                    const void*)
+{
+  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+                      commit_concurrency_msg);
+}
+static void concurrency_tickets_warn(THD* thd, st_mysql_sys_var*, void*,
+                                     const void*)
+{
+  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+                      concurrency_tickets_msg);
+}
+static void adaptive_max_sleep_delay_warn(THD* thd, st_mysql_sys_var*, void*,
+                                          const void*)
+{
+  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+                      adaptive_max_sleep_delay_msg);
+}
+static void thread_sleep_delay_warn(THD* thd, st_mysql_sys_var*, void*,
+                                    const void*)
+{
+  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+                      thread_sleep_delay_msg);
+}
+
+} // namespace deprecated
+
 /** Initialize, validate and normalize the InnoDB startup parameters.
 @return failure code
 @retval 0 on success
@@ -3789,8 +3469,8 @@ static int innodb_init_params()
 	char		*default_path;
 	ulong		num_pll_degree;
 
-	if (innodb_large_prefix || innodb_file_format) {
-		const char* p = innodb_file_format
+	if (deprecated::innodb_large_prefix || deprecated::innodb_file_format) {
+		const char* p = deprecated::innodb_file_format
 			? "file_format"
 			: "large_prefix";
 		sql_print_warning("The parameter innodb_%s is deprecated"
@@ -3800,6 +3480,79 @@ static int innodb_init_params()
 				  "xtradbinnodb-file-format/", p);
 	}
 
+	if (UNIV_UNLIKELY(!deprecated::innodb_log_checksums)) {
+		sql_print_warning(deprecated::innodb_log_checksums_msg);
+		deprecated::innodb_log_checksums = TRUE;
+	}
+
+	if (UNIV_UNLIKELY(!deprecated::innodb_log_compressed_pages)) {
+		sql_print_warning(deprecated::innodb_log_compressed_pages_msg);
+		deprecated::innodb_log_compressed_pages = TRUE;
+	}
+
+	if (UNIV_UNLIKELY(deprecated::innodb_log_optimize_ddl)) {
+		sql_print_warning(deprecated::innodb_log_optimize_ddl_msg);
+		deprecated::innodb_log_optimize_ddl = FALSE;
+	}
+
+	if (UNIV_UNLIKELY(deprecated::innodb_scrub_log)) {
+		sql_print_warning(deprecated::innodb_scrub_log_msg);
+		deprecated::innodb_scrub_log = FALSE;
+	}
+
+	if (UNIV_UNLIKELY(deprecated::innodb_scrub_log_speed != 256)) {
+		sql_print_warning(deprecated::innodb_scrub_log_speed_msg);
+		deprecated::innodb_scrub_log_speed = 256;
+	}
+
+	if (UNIV_UNLIKELY(deprecated::innodb_buffer_pool_instances)) {
+		sql_print_warning("The parameter innodb_buffer_pool_instances"
+				  " is deprecated and has no effect.");
+	}
+
+	if (UNIV_UNLIKELY(deprecated::innodb_page_cleaners)) {
+		sql_print_warning(deprecated::innodb_page_cleaners_msg);
+	}
+
+	if (UNIV_UNLIKELY(deprecated::srv_n_log_files != 1)) {
+		sql_print_warning(deprecated::srv_n_log_files_msg);
+		deprecated::srv_n_log_files = 1;
+	}
+
+	deprecated::innodb_buffer_pool_instances = 1;
+
+	deprecated::innodb_page_cleaners = 1;
+
+	if (UNIV_UNLIKELY(deprecated::innodb_undo_logs != TRX_SYS_N_RSEGS)) {
+		sql_print_warning(deprecated::innodb_undo_logs_msg);
+		deprecated::innodb_undo_logs = TRX_SYS_N_RSEGS;
+	}
+
+	if (UNIV_UNLIKELY(deprecated::replication_delay)) {
+		sql_print_warning(deprecated::replication_delay_msg);
+		deprecated::replication_delay = 0;
+	}
+	if (UNIV_UNLIKELY(deprecated::thread_concurrency)) {
+		sql_print_warning(deprecated::thread_concurrency_msg);
+		deprecated::thread_concurrency = 0;
+	}
+	if (UNIV_UNLIKELY(deprecated::commit_concurrency)) {
+		sql_print_warning(deprecated::commit_concurrency_msg);
+		deprecated::commit_concurrency = 0;
+	}
+	if (UNIV_UNLIKELY(deprecated::concurrency_tickets)) {
+		sql_print_warning(deprecated::concurrency_tickets_msg);
+		deprecated::concurrency_tickets = 0;
+	}
+	if (UNIV_UNLIKELY(deprecated::adaptive_max_sleep_delay)) {
+		sql_print_warning(deprecated::adaptive_max_sleep_delay_msg);
+		deprecated::adaptive_max_sleep_delay = 0;
+	}
+	if (UNIV_UNLIKELY(deprecated::thread_sleep_delay)) {
+		sql_print_warning(deprecated::thread_sleep_delay_msg);
+		deprecated::thread_sleep_delay = 0;
+	}
+
 	/* Check that values don't overflow on 32-bit systems. */
 	if (sizeof(ulint) == 4) {
 		if (innobase_buffer_pool_size > UINT_MAX32) {
@@ -4031,15 +3784,6 @@ static int innodb_init_params()
 		DBUG_RETURN(HA_ERR_INITIALIZATION);
 	}
 
-	if (srv_n_log_files * srv_log_file_size >= log_group_max_size) {
-		/* Log group size is limited by the size of page number.
-		Remove this limitation when fil_io() is not used for
-		recovery log io. */
-		ib::error() << "Combined size of log files must be < "
-			<< log_group_max_size;
-		DBUG_RETURN(HA_ERR_INITIALIZATION);
-	}
-
 	DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL);
 
 	/* Check that interdependent parameters have sane values. */
@@ -4104,35 +3848,18 @@ static int innodb_init_params()
 
 	srv_buf_pool_size = ulint(innobase_buffer_pool_size);
 
-	if (!innobase_use_checksums) {
-		ib::warn() << "Setting innodb_checksums to OFF is DEPRECATED."
-			" This option was removed in MariaDB 10.5.";
-		srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
-	} else {
-		switch (srv_checksum_algorithm) {
-		case SRV_CHECKSUM_ALGORITHM_CRC32:
-		case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
-		case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
-		case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
-			break;
-		default:
-			ib::warn() << deprecated_innodb_checksum_algorithm;
-		}
+	switch (srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+		break;
+	default:
+		ib::warn() << deprecated_innodb_checksum_algorithm;
 	}
 
-	innodb_log_checksums = innodb_log_checksums_func_update(
-		NULL, innodb_log_checksums);
-
 	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
 
-	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
-	if (innobase_locks_unsafe_for_binlog) {
-		ib::warn() << "Using innodb_locks_unsafe_for_binlog is"
-			" DEPRECATED. This option may be removed in future"
-			" releases. Please use READ COMMITTED transaction"
-			" isolation level instead; " << SET_TRANSACTION_MSG;
-	}
-
 	if (innobase_open_files < 10) {
 		innobase_open_files = 300;
 		if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) {
@@ -4169,12 +3896,6 @@ static int innodb_init_params()
 
 	data_mysql_default_charset_coll = (ulint) default_charset_info->number;
 
-	innobase_commit_concurrency_init_default();
-
-	if (innodb_idle_flush_pct != 100) {
-		ib::warn() << deprecated_idle_flush_pct;
-	}
-
 #ifndef _WIN32
 	if (srv_use_atomic_writes && my_may_have_atomic_write) {
 		/*
@@ -4198,9 +3919,6 @@ static int innodb_init_params()
 	}
 
 #ifdef LINUX_NATIVE_AIO
-	if (srv_use_native_aio) {
-		ib::info() << "Using Linux native AIO";
-	}
 #elif !defined _WIN32
 	/* Currently native AIO is supported only on windows and linux
 	and that also when the support is compiled in. In all other
@@ -4226,12 +3944,6 @@ static int innodb_init_params()
 
 	innodb_buffer_pool_size_init();
 
-	if (srv_n_page_cleaners > srv_buf_pool_instances) {
-		/* limit of page_cleaner parallelizability
-		is number of buffer pool instances. */
-		srv_n_page_cleaners = srv_buf_pool_instances;
-	}
-
 	srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
 	DBUG_RETURN(0);
 }
@@ -4246,7 +3958,6 @@ static int innodb_init(void* p)
 	handlerton* innobase_hton= static_cast<handlerton*>(p);
 	innodb_hton_ptr = innobase_hton;
 
-	innobase_hton->state = SHOW_OPTION_YES;
 	innobase_hton->db_type = DB_TYPE_INNODB;
 	innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
 	innobase_hton->close_connection = innobase_close_connection;
@@ -4266,17 +3977,19 @@ static int innodb_init(void* p)
 	innobase_hton->recover = innobase_xa_recover;
 	innobase_hton->commit_by_xid = innobase_commit_by_xid;
 	innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
-	innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
+	innobase_hton->commit_checkpoint_request = innodb_log_flush_request;
 	innobase_hton->create = innobase_create_handler;
 
 	innobase_hton->drop_database = innobase_drop_database;
 	innobase_hton->panic = innobase_end;
+	innobase_hton->pre_shutdown = innodb_preshutdown;
 
 	innobase_hton->start_consistent_snapshot =
 		innobase_start_trx_and_assign_read_view;
 
 	innobase_hton->flush_logs = innobase_flush_logs;
 	innobase_hton->show_status = innobase_show_status;
+	innobase_hton->notify_tabledef_changed= innodb_notify_tabledef_changed;
 	innobase_hton->flags =
 		HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS |
 		HTON_NATIVE_SYS_VERSIONING |
@@ -4334,9 +4047,6 @@ static int innodb_init(void* p)
 	/* Register keys with MySQL performance schema */
 	int	count;
 
-	count = array_elements(all_pthread_mutexes);
-	mysql_mutex_register("innodb", all_pthread_mutexes, count);
-
 # ifdef UNIV_PFS_MUTEX
 	count = array_elements(all_innodb_mutexes);
 	mysql_mutex_register("innodb", all_innodb_mutexes, count);
@@ -4356,9 +4066,6 @@ static int innodb_init(void* p)
 	count = array_elements(all_innodb_files);
 	mysql_file_register("innodb", all_innodb_files, count);
 # endif /* UNIV_PFS_IO */
-
-	count = array_elements(all_innodb_conds);
-	mysql_cond_register("innodb", all_innodb_conds, count);
 #endif /* HAVE_PSI_INTERFACE */
 
 	bool	create_new_db = false;
@@ -4375,27 +4082,18 @@ static int innodb_init(void* p)
 	if (err != DB_SUCCESS) {
 		innodb_shutdown();
 		DBUG_RETURN(innodb_init_abort());
-	} else if (!srv_read_only_mode) {
-		mysql_thread_create(thd_destructor_thread_key,
-				    &thd_destructor_thread,
-				    NULL, thd_destructor_proxy, NULL);
-		while (!srv_running.load(std::memory_order_relaxed))
-			os_thread_sleep(20);
 	}
 
 	srv_was_started = true;
 	innodb_params_adjust();
 
-	innobase_old_blocks_pct = static_cast<uint>(
-		buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE));
+	innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+		innobase_old_blocks_pct, true);
 
 	ibuf_max_size_update(srv_change_buffer_max_size);
 
-	mysql_mutex_init(commit_cond_mutex_key,
-			 &commit_cond_m, MY_MUTEX_INIT_FAST);
-	mysql_cond_init(commit_cond_key, &commit_cond, 0);
 	mysql_mutex_init(pending_checkpoint_mutex_key,
-			 &pending_checkpoint_mutex,
+			 &log_requests.mutex,
 			 MY_MUTEX_INIT_FAST);
 #ifdef MYSQL_DYNAMIC_PLUGIN
 	if (innobase_hton != p) {
@@ -4455,23 +4153,10 @@ innobase_end(handlerton*, ha_panic_function)
 		 	}
 		}
 
-		if (auto r = srv_running.load(std::memory_order_relaxed)) {
-			ut_ad(!srv_read_only_mode);
-			if (!abort_loop) {
-				// may be UNINSTALL PLUGIN statement
-				mysql_mutex_lock(r->current_mutex);
-				r->abort = 1;
-				mysql_cond_broadcast(r->current_cond);
-				mysql_mutex_unlock(r->current_mutex);
-			}
-			pthread_join(thd_destructor_thread, NULL);
-		}
 
 		innodb_shutdown();
 
-		mysql_mutex_destroy(&commit_cond_m);
-		mysql_cond_destroy(&commit_cond);
-		mysql_mutex_destroy(&pending_checkpoint_mutex);
+		mysql_mutex_destroy(&log_requests.mutex);
 	}
 
 	DBUG_RETURN(0);
@@ -4529,8 +4214,6 @@ innobase_start_trx_and_assign_read_view(
 
 	trx_t*	trx = check_trx_exists(thd);
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* The transaction should not be active yet, start it */
 
 	ut_ad(!trx_is_started(trx));
@@ -4570,30 +4253,9 @@ innobase_commit_ordered_2(
 {
 	DBUG_ENTER("innobase_commit_ordered_2");
 
-	bool	read_only = trx->read_only || trx->id == 0;
+	const bool read_only = trx->read_only || trx->id == 0;
 
 	if (!read_only) {
-
-		while (innobase_commit_concurrency > 0) {
-
-			mysql_mutex_lock(&commit_cond_m);
-
-			++commit_threads;
-
-			if (commit_threads
-				<= innobase_commit_concurrency) {
-
-				mysql_mutex_unlock(&commit_cond_m);
-				break;
-			}
-
-			--commit_threads;
-
-			mysql_cond_wait(&commit_cond, &commit_cond_m);
-
-			mysql_mutex_unlock(&commit_cond_m);
-		}
-
 		/* The following call reads the binary log position of
 		the transaction being committed.
 
@@ -4625,19 +4287,8 @@ innobase_commit_ordered_2(
 	innobase_commit_low(trx);
 
 	if (!read_only) {
+		trx->mysql_log_file_name = NULL;
 		trx->flush_log_later = false;
-
-		if (innobase_commit_concurrency > 0) {
-
-			mysql_mutex_lock(&commit_cond_m);
-
-			ut_ad(commit_threads > 0);
-			--commit_threads;
-
-			mysql_cond_signal(&commit_cond);
-
-			mysql_mutex_unlock(&commit_cond_m);
-		}
 	}
 
 	DBUG_VOID_RETURN;
@@ -4777,8 +4428,6 @@ innobase_commit(
 	/* This is a statement level variable. */
 	trx->fts_next_doc_id = 0;
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	DBUG_RETURN(0);
 }
 
@@ -4806,8 +4455,6 @@ innobase_rollback(
 	ut_ad(trx->dict_operation_lock_mode == 0);
 	ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* Reset the number AUTO-INC rows required */
 
 	trx->n_autoinc_rows = 0;
@@ -4858,164 +4505,140 @@ innobase_rollback_trx(
 	DBUG_ENTER("innobase_rollback_trx");
 	DBUG_PRINT("trans", ("aborting transaction"));
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* If we had reserved the auto-inc lock for some table (if
 	we come here to roll back the latest SQL statement) we
 	release it now before a possibly lengthy rollback */
 	lock_unlock_table_autoinc(trx);
-
-	if (!trx->has_logged()) {
-		trx->will_lock = false;
-#ifdef WITH_WSREP
-		trx->wsrep= false;
-		trx->lock.was_chosen_as_wsrep_victim= false;
-#endif
-		DBUG_RETURN(0);
-	}
+	trx_deregister_from_2pc(trx);
 
 	DBUG_RETURN(convert_error_code_to_mysql(trx_rollback_for_mysql(trx),
 						0, trx->mysql_thd));
 }
 
+/** Invoke commit_checkpoint_notify_ha() on completed log flush requests.
+@param pending  log_requests.start
+@param lsn      log_sys.get_flushed_lsn() */
+static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&log_requests.mutex);
+  ut_ad(pending == log_requests.start.load(std::memory_order_relaxed));
+  log_flush_request *entry= pending, *last= nullptr;
+  /* Process the first requests that have been completed. Since
+  the list is not necessarily in ascending order of LSN, we may
+  fail to notify some requests that have already been completed.
+  But there is no harm in delaying notifications for those a bit.
+  And in practice, the list is unlikely to have more than one
+  element anyway, because the redo log would be flushed every
+  srv_flush_log_at_timeout seconds (1 by default). */
+  for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next);
+
+  if (!last)
+  {
+    mysql_mutex_unlock(&log_requests.mutex);
+    return;
+  }
 
-struct pending_checkpoint {
-	struct pending_checkpoint *next;
-	handlerton *hton;
-	void *cookie;
-	ib_uint64_t lsn;
-};
-static struct pending_checkpoint *pending_checkpoint_list;
-static struct pending_checkpoint *pending_checkpoint_list_end;
-
-/*****************************************************************//**
-Handle a commit checkpoint request from server layer.
-We put the request in a queue, so that we can notify upper layer about
-checkpoint complete when we have flushed the redo log.
-If we have already flushed all relevant redo log, we notify immediately.*/
-static
-void
-innobase_checkpoint_request(
-	handlerton *hton,
-	void *cookie)
-{
-	ib_uint64_t			lsn;
-	ib_uint64_t			flush_lsn;
-	struct pending_checkpoint *	entry;
-
-	/* Do the allocation outside of lock to reduce contention. The normal
-	case is that not everything is flushed, so we will need to enqueue. */
-	entry = static_cast<struct pending_checkpoint *>
-		(my_malloc(sizeof(*entry), MYF(MY_WME)));
-	if (!entry) {
-		sql_print_error("Failed to allocate %u bytes."
-				" Commit checkpoint will be skipped.",
-				static_cast<unsigned>(sizeof(*entry)));
-		return;
-	}
-
-	entry->next = NULL;
-	entry->hton = hton;
-	entry->cookie = cookie;
-
-	mysql_mutex_lock(&pending_checkpoint_mutex);
-	lsn = log_get_lsn();
-	flush_lsn = log_get_flush_lsn();
-	if (lsn > flush_lsn) {
-		/* Put the request in queue.
-		When the log gets flushed past the lsn, we will remove the
-		entry from the queue and notify the upper layer. */
-		entry->lsn = lsn;
-		if (pending_checkpoint_list_end) {
-			pending_checkpoint_list_end->next = entry;
-			/* There is no need to order the entries in the list
-			by lsn. The upper layer can accept notifications in
-			any order, and short delays in notifications do not
-			significantly impact performance. */
-		} else {
-			pending_checkpoint_list = entry;
-		}
-		pending_checkpoint_list_end = entry;
-		entry = NULL;
-	}
-	mysql_mutex_unlock(&pending_checkpoint_mutex);
+  /* Detach the head of the list that corresponds to persisted log writes. */
+  if (!entry)
+    log_requests.end= entry;
+  log_requests.start.store(entry, std::memory_order_relaxed);
+  mysql_mutex_unlock(&log_requests.mutex);
 
-	if (entry) {
-		/* We are already flushed. Notify the checkpoint immediately. */
-		commit_checkpoint_notify_ha(entry->hton, entry->cookie);
-		my_free(entry);
-	}
+  /* Now that we have released the mutex, notify the submitters
+  and free the head of the list. */
+  do
+  {
+    entry= pending;
+    pending= pending->next;
+    commit_checkpoint_notify_ha(entry->cookie);
+    my_free(entry);
+  }
+  while (entry != last);
 }
 
-/*****************************************************************//**
-Log code calls this whenever log has been written and/or flushed up
-to a new position. We use this to notify upper layer of a new commit
-checkpoint when necessary.*/
-UNIV_INTERN
-void
-innobase_mysql_log_notify(
-/*======================*/
-	ib_uint64_t	flush_lsn)	/*!< in: LSN flushed to disk */
+/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
+log writes have been completed. */
+void log_flush_notify(lsn_t flush_lsn)
 {
-	struct pending_checkpoint *	pending;
-	struct pending_checkpoint *	entry;
-	struct pending_checkpoint *	last_ready;
-
-	/* It is safe to do a quick check for NULL first without lock.
-	Even if we should race, we will at most skip one checkpoint and
-	take the next one, which is harmless. */
-	if (!pending_checkpoint_list)
-		return;
-
-	mysql_mutex_lock(&pending_checkpoint_mutex);
-	pending = pending_checkpoint_list;
-	if (!pending)
-	{
-		mysql_mutex_unlock(&pending_checkpoint_mutex);
-		return;
-	}
-
-	last_ready = NULL;
-	for (entry = pending; entry != NULL; entry = entry -> next)
-	{
-		/* Notify checkpoints up until the first entry that has not
-		been fully flushed to the redo log. Since we do not maintain
-		the list ordered, in principle there could be more entries
-		later than were also flushed. But there is no harm in
-		delaying notifications for those a bit. And in practise, the
-		list is unlikely to have more than one element anyway, as we
-		flush the redo log at least once every second. */
-		if (entry->lsn > flush_lsn)
-			break;
-		last_ready = entry;
-	}
-
-	if (last_ready)
-	{
-		/* We found some pending checkpoints that are now flushed to
-		disk. So remove them from the list. */
-		pending_checkpoint_list = entry;
-		if (!entry)
-			pending_checkpoint_list_end = NULL;
-	}
-
-	mysql_mutex_unlock(&pending_checkpoint_mutex);
+  if (auto pending= log_requests.start.load(std::memory_order_acquire))
+  {
+    mysql_mutex_lock(&log_requests.mutex);
+    pending= log_requests.start.load(std::memory_order_relaxed);
+    log_flush_notify_and_unlock(pending, flush_lsn);
+  }
+}
 
-	if (!last_ready)
-		return;
+/** Handle a commit checkpoint request from server layer.
+We put the request in a queue, so that we can notify the upper layer that
+the checkpoint is complete once we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately. */
+static void innodb_log_flush_request(void *cookie)
+{
+  lsn_t flush_lsn= log_sys.get_flushed_lsn();
+  /* Load lsn relaxed after flush_lsn was loaded from the same cache line */
+  const lsn_t lsn= log_sys.get_lsn();
+
+  if (flush_lsn >= lsn)
+    /* All log is already persistent. */;
+  else if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+    /* Normally, srv_master_callback() should periodically invoke
+    srv_sync_log_buffer_in_background(), which should initiate a log
+    flush about once every srv_flush_log_at_timeout seconds.  But,
+    starting with the innodb_force_recovery=2 level, that background
+    task will not run. */
+    log_write_up_to(flush_lsn= lsn, true);
+  else if (log_flush_request *req= static_cast<log_flush_request*>
+           (my_malloc(PSI_INSTRUMENT_ME, sizeof *req, MYF(MY_WME))))
+  {
+    req->next= nullptr;
+    req->cookie= cookie;
+    req->lsn= lsn;
+
+    log_flush_request *start= nullptr;
+
+    mysql_mutex_lock(&log_requests.mutex);
+    /* In order to prevent a race condition where log_flush_notify()
+    would skip a notification, we must update log_requests.start from
+    nullptr (empty) to the first req using std::memory_order_release. */
+    if (log_requests.start.compare_exchange_strong(start, req,
+                                                   std::memory_order_release,
+                                                   std::memory_order_relaxed))
+    {
+      ut_ad(!log_requests.end);
+      start= req;
+      /* In case log_flush_notify() executed
+      log_requests.start.load(std::memory_order_acquire) right before
+      our successful compare_exchange, we must re-read flush_lsn to
+      ensure that our request will be notified immediately if applicable. */
+      flush_lsn= log_sys.get_flushed_lsn();
+    }
+    else
+    {
+      /* Append the entry to the list. Because we determined req->lsn before
+      acquiring the mutex, this list may not be ordered by req->lsn,
+      even though log_flush_notify_and_unlock() assumes so. */
+      log_requests.end->next= req;
+    }
 
-	/* Now that we have released the lock, notify upper layer about all
-	commit checkpoints that have now completed. */
-	for (;;) {
-		entry = pending;
-		pending = pending->next;
+    log_requests.end= req;
 
-		commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+    /* This hopefully addresses the hang that was reported in MDEV-24302.
+    Upon receiving a new request, we will notify old requests of
+    completion. */
+    log_flush_notify_and_unlock(start, flush_lsn);
+    return;
+  }
+  else
+    sql_print_error("Failed to allocate %zu bytes."
+                    " Commit checkpoint will be skipped.", sizeof *req);
 
-		my_free(entry);
-		if (entry == last_ready)
-			break;
-	}
+  /* This hopefully addresses the hang that was reported in MDEV-24302.
+  Upon receiving a new request to notify of log writes becoming
+  persistent, we will notify old requests of completion. Note:
+  log_flush_notify() may skip some notifications because it is
+  basically assuming that the list is in ascending order of LSN. */
+  log_flush_notify(flush_lsn);
+  commit_checkpoint_notify_ha(cookie);
 }
 
 /*****************************************************************//**
@@ -5038,8 +4661,6 @@ innobase_rollback_to_savepoint(
 
 	trx_t*	trx = check_trx_exists(thd);
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* TODO: use provided savepoint data area to store savepoint data */
 
 	char	name[64];
@@ -5144,8 +4765,6 @@ innobase_savepoint(
 
 	trx_t*	trx = check_trx_exists(thd);
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* Cannot happen outside of transaction */
 	DBUG_ASSERT(trx_is_registered_for_2pc(trx));
 
@@ -5163,75 +4782,29 @@ innobase_savepoint(
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
-/*****************************************************************//**
-Frees a possible InnoDB trx object associated with the current THD.
-@return 0 or error number */
-static
-int
-innobase_close_connection(
-/*======================*/
-	handlerton*	hton,	/*!< in: innobase handlerton */
-	THD*		thd)	/*!< in: handle to the MySQL thread of the user
-				whose resources should be free'd */
-{
-
-	DBUG_ENTER("innobase_close_connection");
-	DBUG_ASSERT(hton == innodb_hton_ptr);
-
-	trx_t*	trx = thd_to_trx(thd);
 
-	/* During server initialization MySQL layer will try to open
-	some of the master-slave tables those residing in InnoDB.
-	After MySQL layer is done with needed checks these tables
-	are closed followed by invocation of close_connection on the
-	associated thd.
-
-	close_connection rolls back the trx and then frees it.
-	Once trx is freed thd should avoid maintaining reference to
-	it else it can be classified as stale reference.
-
-	Re-invocation of innodb_close_connection on same thd should
-	get trx as NULL. */
-
-	if (trx) {
-
-		thd_set_ha_data(thd, hton, NULL);
-		if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
-
-			sql_print_error("Transaction not registered for MariaDB 2PC, "
-				"but transaction is active");
-		}
+/**
+  Frees a possible InnoDB trx object associated with the current THD.
 
-		/* Disconnect causes rollback in the following cases:
-		- trx is not started, or
-		- trx is in *not* in PREPARED state, or
-		- trx has not updated any persistent data.
-		TODO/FIXME: it does not make sense to initiate rollback
-		in the 1st and 3rd case. */
-		if (trx_is_started(trx)) {
-			if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
-				if (trx->has_logged_persistent()) {
-					trx_disconnect_prepared(trx);
-				} else {
-					trx_deregister_from_2pc(trx);
-					goto rollback_and_free;
-				}
-			} else {
-			sql_print_warning(
-				"MariaDB is closing a connection that has an active "
-				"InnoDB transaction.  " TRX_ID_FMT " row modifications "
-				"will roll back.",
-					trx->undo_no);
-				goto rollback_and_free;
-			}
-		} else {
-rollback_and_free:
-			innobase_rollback_trx(trx);
-			trx->free();
-		}
-	}
+  @param hton  innobase handlerton
+  @param thd   server thread descriptor whose resources should be freed
 
-	DBUG_RETURN(0);
+  @return 0 always
+*/
+static int innobase_close_connection(handlerton *hton, THD *thd)
+{
+  DBUG_ASSERT(hton == innodb_hton_ptr);
+  if (auto trx= thd_to_trx(thd))
+  {
+    if (trx->state == TRX_STATE_PREPARED && trx->has_logged_persistent())
+    {
+      trx_disconnect_prepared(trx);
+      return 0;
+    }
+    innobase_rollback_trx(trx);
+    trx->free();
+  }
+  return 0;
 }
 
 void lock_cancel_waiting_and_release(lock_t *lock);
@@ -5493,13 +5066,6 @@ ha_innobase::table_cache_type()
 Determines if the primary key is clustered index.
 @return true */
 
-bool
-ha_innobase::primary_key_is_clustered()
-/*===================================*/
-{
-	return(true);
-}
-
 /** Normalizes a table name string.
 A normalized name consists of the database name catenated to '/'
 and table name. For example: test/mytable.
@@ -5770,13 +5336,12 @@ innobase_match_index_columns(
 	One hidden assumption here is that the index column sequences
 	are matched up between those in mysql and InnoDB. */
 	for (; key_part != key_end; ++key_part) {
-		ulint	col_type;
-		ibool	is_unsigned;
-		ulint	mtype = innodb_idx_fld->col->mtype;
+		unsigned is_unsigned;
+		auto mtype = innodb_idx_fld->col->mtype;
 
 		/* Need to translate to InnoDB column type before
 		comparison. */
-		col_type = get_innobase_type_from_mysql_type(
+		auto col_type = get_innobase_type_from_mysql_type(
 			&is_unsigned, key_part->field);
 
 		/* Ignore InnoDB specific system columns. */
@@ -6159,15 +5724,16 @@ initialize_auto_increment(dict_table_t* table, const Field* field)
 
 	table->autoinc_mutex.lock();
 
-	table->persistent_autoinc = 1
-		+ dict_table_get_nth_col_pos(table, col_no, NULL);
+	table->persistent_autoinc = static_cast<uint16_t>(
+		dict_table_get_nth_col_pos(table, col_no, NULL) + 1)
+		& dict_index_t::MAX_N_FIELDS;
 
 	if (table->autoinc) {
 		/* Already initialized. Our caller checked
 		table->persistent_autoinc without
 		autoinc_mutex protection, and there might be multiple
 		ha_innobase::open() executing concurrently. */
-	} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+	} else if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
 		/* If the recovery level is set so high that writes
 		are disabled we force the AUTOINC counter to 0
 		value effectively disabling writes to the table.
@@ -6197,14 +5763,6 @@ initialize_auto_increment(dict_table_t* table, const Field* field)
 int
 ha_innobase::open(const char* name, int, uint)
 {
-	/* TODO: If trx_rollback_recovered(bool all=false) is ever
-	removed, the first-time open() must hold (or acquire and release)
-	a table lock that conflicts with trx_resurrect_table_locks(),
-	to ensure that any recovered incomplete ALTER TABLE will have been
-	rolled back. Otherwise, dict_table_t::instant could be cleared by
-	the rollback invoking dict_index_t::clear_instant_alter() while
-	open table handles exist in client connections. */
-
 	char			norm_name[FN_REFLEN];
 
 	DBUG_ENTER("ha_innobase::open");
@@ -6427,7 +5985,7 @@ ha_innobase::open(const char* name, int, uint)
 	}
 
 	/* Index block size in InnoDB: used by MySQL in query optimization */
-	stats.block_size = srv_page_size;
+	stats.block_size = static_cast<uint>(srv_page_size);
 
 	const my_bool for_vc_purge = THDVAR(thd, background_thread);
 
@@ -6628,11 +6186,6 @@ ha_innobase::close()
 
 	MONITOR_INC(MONITOR_TABLE_CLOSE);
 
-	/* Tell InnoDB server that there might be work for
-	utility threads: */
-
-	srv_active_wake_master_thread();
-
 	DBUG_RETURN(0);
 }
 
@@ -6647,9 +6200,9 @@ wsrep_innobase_mysql_sort(
 	int		mysql_type,	/* in: MySQL type */
 	uint		charset_number,	/* in: number of the charset */
 	unsigned char*	str,		/* in: data field */
-	unsigned int	str_length,	/* in: data field length,
+	ulint		str_length,	/* in: data field length,
 					not UNIV_SQL_NULL */
-	unsigned int	buf_length)	/* in: total str buffer length */
+	ulint		buf_length)	/* in: total str buffer length */
 
 {
 	CHARSET_INFO*		charset;
@@ -6672,7 +6225,7 @@ wsrep_innobase_mysql_sort(
 	case MYSQL_TYPE_VARCHAR:
 	{
 		uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
-		uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
+		ulint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
 
 		/* Use the charset number to pick the right charset struct for
 		the comparison. Since the MySQL function get_charset may be
@@ -6698,23 +6251,23 @@ wsrep_innobase_mysql_sort(
 		ut_a(str_length <= tmp_length);
 		memcpy(tmp_str, str, str_length);
 
-		tmp_length = charset->coll->strnxfrm(charset, str, str_length,
-						     str_length, tmp_str,
-						     tmp_length, 0);
+		tmp_length = charset->strnxfrm(str, str_length,
+					       uint(str_length), tmp_str,
+					       tmp_length, 0);
 		DBUG_ASSERT(tmp_length <= str_length);
 		if (wsrep_protocol_version < 3) {
-			tmp_length = charset->coll->strnxfrm(
-				charset, str, str_length,
-				str_length, tmp_str, tmp_length, 0);
+			tmp_length = charset->strnxfrm(
+				str, str_length,
+				uint(str_length), tmp_str, tmp_length, 0);
 			DBUG_ASSERT(tmp_length <= str_length);
 		} else {
 			/* strnxfrm will expand the destination string,
 			   protocols < 3 truncated the sorted sring
 			   protocols >= 3 gets full sorted sring
 			*/
-			tmp_length = charset->coll->strnxfrm(
-				charset, str, buf_length,
-				str_length, tmp_str, str_length, 0);
+			tmp_length = charset->strnxfrm(
+				str, buf_length,
+				uint(str_length), tmp_str, str_length, 0);
 			DBUG_ASSERT(tmp_length <= buf_length);
 			ret_length = tmp_length;
 		}
@@ -6807,7 +6360,7 @@ innobase_strnxfrm(
 		return(0);
 	}
 
-	my_strnxfrm(cs, (uchar*) mystr, 2, str, len);
+	cs->strnxfrm((uchar*) mystr, 2, str, len);
 
 	value = mach_read_from_2(mystr);
 
@@ -6859,7 +6412,7 @@ innobase_fts_casedn_str(
 
 		return(strlen(dst));
 	} else {
-		return(cs->cset->casedn(cs, src, src_len, dst, dst_len));
+		return(cs->casedn(src, src_len, dst, dst_len));
 	}
 }
 
@@ -6896,8 +6449,7 @@ innobase_mysql_fts_get_token(
 
 		int	ctype;
 
-		mbl = cs->cset->ctype(
-			cs, &ctype, doc, (const uchar*) end);
+		mbl = cs->ctype(&ctype, doc, (const uchar*) end);
 
 		if (true_word_char(ctype, *doc)) {
 			break;
@@ -6915,8 +6467,7 @@ innobase_mysql_fts_get_token(
 
 		int	ctype;
 
-		mbl = cs->cset->ctype(
-			cs, &ctype, (uchar*) doc, (uchar*) end);
+		mbl = cs->ctype(&ctype, (uchar*) doc, (uchar*) end);
 		if (true_word_char(ctype, *doc)) {
 			mwc = 0;
 		} else if (!misc_word_char(*doc) || mwc) {
@@ -6943,22 +6494,18 @@ VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
 ENUM and SET, and unsigned integer types are 'unsigned types'
 @param[in]	f		MySQL Field
 @return DATA_BINARY, DATA_VARCHAR, ... */
-ulint
-get_innobase_type_from_mysql_type(
-	ulint*			unsigned_flag,
-	const void*		f)
+uint8_t
+get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field)
 {
-	const class Field* field = reinterpret_cast<const class Field*>(f);
-
 	/* The following asserts try to check that the MySQL type code fits in
 	8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
 	the type */
 
-	DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256);
-	DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256);
-	DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256);
-	DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256);
-	DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
+	static_assert(MYSQL_TYPE_STRING < 256, "compatibility");
+	static_assert(MYSQL_TYPE_VAR_STRING < 256, "compatibility");
+	static_assert(MYSQL_TYPE_DOUBLE < 256, "compatibility");
+	static_assert(MYSQL_TYPE_FLOAT < 256, "compatibility");
+	static_assert(MYSQL_TYPE_DECIMAL < 256, "compatibility");
 
 	if (field->flags & UNSIGNED_FLAG) {
 
@@ -6995,7 +6542,7 @@ get_innobase_type_from_mysql_type(
 		}
 	case MYSQL_TYPE_BIT:
 	case MYSQL_TYPE_STRING:
-		if (field->binary()) {
+		if (field->binary() || field->key_type() == HA_KEYTYPE_BINARY) {
 			return(DATA_FIXBINARY);
 		} else if (field->charset() == &my_charset_latin1) {
 			return(DATA_CHAR);
@@ -7063,8 +6610,8 @@ innobase_read_from_2_little_endian(
 /*******************************************************************//**
 Stores a key value for a row to a buffer.
 @return	key value length as stored in buff */
-UNIV_INTERN
-uint
+static
+uint16_t
 wsrep_store_key_val_for_row(
 /*=========================*/
 	THD* 		thd,
@@ -7074,7 +6621,7 @@ wsrep_store_key_val_for_row(
 				format) */
 	uint		buff_len,/*!< in: buffer length */
 	const uchar*	record,
-	ibool*          key_is_null)/*!< out: full key was null */
+	bool*		key_is_null)/*!< out: full key was null */
 {
 	KEY*		key_info	= table->key_info + keynr;
 	KEY_PART_INFO*	key_part	= key_info->key_part;
@@ -7082,24 +6629,23 @@ wsrep_store_key_val_for_row(
 	char*		buff_start	= buff;
 	enum_field_types mysql_type;
 	Field*		field;
-	uint buff_space = buff_len;
+	ulint buff_space = buff_len;
 
 	DBUG_ENTER("wsrep_store_key_val_for_row");
 
 	memset(buff, 0, buff_len);
-	*key_is_null = TRUE;
+	*key_is_null = true;
 
 	for (; key_part != end; key_part++) {
-
 		uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
-		ibool part_is_null = FALSE;
+		bool part_is_null = false;
 
 		if (key_part->null_bit) {
 			if (buff_space > 0) {
 				if (record[key_part->null_offset]
 				    & key_part->null_bit) {
 					*buff = 1;
-					part_is_null = TRUE;
+					part_is_null = true;
 				} else {
 					*buff = 0;
 				}
@@ -7110,7 +6656,7 @@ wsrep_store_key_val_for_row(
 					 wsrep_thd_query(thd));
 			}
 		}
-		if (!part_is_null)  *key_is_null = FALSE;
+		if (!part_is_null)  *key_is_null = false;
 
 		field = key_part->field;
 		mysql_type = field->type();
@@ -7364,7 +6910,7 @@ wsrep_store_key_val_for_row(
 
 	ut_a(buff <= buff_start + buff_len);
 
-	DBUG_RETURN((uint)(buff - buff_start));
+	DBUG_RETURN(static_cast<uint16_t>(buff - buff_start));
 }
 #endif /* WITH_WSREP */
 /**************************************************************//**
@@ -7711,7 +7257,8 @@ ha_innobase::build_template(
 
 	m_prebuilt->template_type = whole_row
 		? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
-	m_prebuilt->null_bitmap_len = table->s->null_bytes;
+	m_prebuilt->null_bitmap_len = table->s->null_bytes
+		& dict_index_t::MAX_N_FIELDS;
 
 	/* Prepare to build m_prebuilt->mysql_template[]. */
 	m_prebuilt->templ_contains_blob = FALSE;
@@ -8156,8 +7703,6 @@ ha_innobase::write_row(
 		build_template(true);
 	}
 
-	innobase_srv_conc_enter_innodb(m_prebuilt);
-
 	vers_set_fields = table->versioned_write(VERS_TRX_ID) ?
 		ROW_INS_VERSIONED : ROW_INS_NORMAL;
 
@@ -8221,8 +7766,6 @@ ha_innobase::write_row(
 					    wsrep_thd_query(m_user_thd));
 					error= DB_SUCCESS;
 					wsrep_thd_self_abort(m_user_thd);
-                                        innobase_srv_conc_exit_innodb(
-						m_prebuilt);
                                         /* jump straight to func exit over
                                          * later wsrep hooks */
                                         goto func_exit;
@@ -8279,8 +7822,6 @@ set_max_autoinc:
 		}
 	}
 
-	innobase_srv_conc_exit_innodb(m_prebuilt);
-
 report_error:
 	/* Cleanup and exit. */
 	if (error == DB_TABLESPACE_DELETED) {
@@ -8316,8 +7857,6 @@ report_error:
 	}
 
 func_exit:
-	innobase_active_small();
-
 	DBUG_RETURN(error_result);
 }
 
@@ -8405,7 +7944,7 @@ calc_row_difference(
 	ibool		changes_fts_doc_col = FALSE;
 	trx_t* const	trx = prebuilt->trx;
 	doc_id_t	doc_id = FTS_NULL_DOC_ID;
-	ulint		num_v = 0;
+	uint16_t	num_v = 0;
 	const bool skip_virtual = ha_innobase::omits_virtual_cols(*table->s);
 
 	ut_ad(!srv_read_only_mode);
@@ -8636,9 +8175,11 @@ calc_row_difference(
 				num_v++;
 				ut_ad(field != table->found_next_number_field);
 			} else {
-				ufield->field_no = dict_col_get_clust_pos(
-					&prebuilt->table->cols[i - num_v],
-					clust_index);
+				ufield->field_no = static_cast<uint16_t>(
+					dict_col_get_clust_pos(
+						&prebuilt->table->cols
+						[i - num_v],
+						clust_index));
 				ufield->old_v_val = NULL;
 				if (field != table->found_next_number_field
 				    || dfield_is_null(&ufield->new_val)) {
@@ -8770,29 +8311,24 @@ wsrep_calc_row_hash(
 					dictionary */
 	row_prebuilt_t*	prebuilt)	/*!< in: InnoDB prebuilt struct */
 {
-	ulint		len;
-	const byte*	ptr;
-
 	void *ctx = alloca(my_md5_context_size());
 	my_md5_init(ctx);
 
 	for (uint i = 0; i < table->s->fields; i++) {
 		byte null_byte=0;
 		byte true_byte=1;
-		ulint col_type;
-		ulint is_unsigned;
+		unsigned is_unsigned;
 
 		const Field* field = table->field[i];
 		if (!field->stored_in_db()) {
 			continue;
 		}
 
-		ptr = (const byte*) row + get_field_offset(table, field);
-		len = field->pack_length();
-		col_type = get_innobase_type_from_mysql_type(&is_unsigned, field);
-
-		switch (col_type) {
+		auto ptr = row + get_field_offset(table, field);
+		ulint len = field->pack_length();
 
+		switch (get_innobase_type_from_mysql_type(&is_unsigned,
+							  field)) {
 		case DATA_BLOB:
 			ptr = row_mysql_read_blob_ref(&len, ptr, len);
 
@@ -8880,7 +8416,7 @@ ha_innobase::update_row(
 			+ MAX_REF_PARTS * 3;
 
 		m_upd_buf = reinterpret_cast<uchar*>(
-			my_malloc(//PSI_INSTRUMENT_ME,
+			my_malloc(PSI_INSTRUMENT_ME,
                                   m_upd_buf_size,
 				MYF(MY_WME)));
 
@@ -8920,8 +8456,6 @@ ha_innobase::update_row(
 			if (error != DB_SUCCESS) {
 				goto func_exit;
 			}
-			innobase_srv_conc_exit_innodb(m_prebuilt);
-			innobase_active_small();
 		}
 		DBUG_RETURN(HA_ERR_RECORD_IS_THE_SAME);
 	} else {
@@ -8938,8 +8472,6 @@ ha_innobase::update_row(
 			? VERSIONED_DELETE
 			: NO_DELETE;
 
-		innobase_srv_conc_enter_innodb(m_prebuilt);
-
 		error = row_update_for_mysql(m_prebuilt);
 
 		if (error == DB_SUCCESS && vers_ins_row
@@ -8990,8 +8522,6 @@ ha_innobase::update_row(
 		}
 	}
 
-	innobase_srv_conc_exit_innodb(m_prebuilt);
-
 func_exit:
 	if (error == DB_FTS_INVALID_DOCID) {
 		err = HA_FTS_INVALID_DOCID;
@@ -9001,11 +8531,6 @@ func_exit:
 			error, m_prebuilt->table->flags, m_user_thd);
 	}
 
-	/* Tell InnoDB server that there might be work for
-	utility threads: */
-
-	innobase_active_small();
-
 #ifdef WITH_WSREP
 	if (error == DB_SUCCESS && trx->is_wsrep()
 	    && wsrep_thd_is_local(m_user_thd)
@@ -9061,17 +8586,8 @@ ha_innobase::delete_row(
 		? VERSIONED_DELETE
 		: PLAIN_DELETE;
 
-	innobase_srv_conc_enter_innodb(m_prebuilt);
-
 	error = row_update_for_mysql(m_prebuilt);
 
-	innobase_srv_conc_exit_innodb(m_prebuilt);
-
-	/* Tell the InnoDB server that there might be work for
-	utility threads: */
-
-	innobase_active_small();
-
 #ifdef WITH_WSREP
 	if (error == DB_SUCCESS && trx->is_wsrep()
 	    && wsrep_thd_is_local(m_user_thd)
@@ -9091,7 +8607,7 @@ ha_innobase::delete_row(
 /**********************************************************************//**
 Removes a new lock set on a row, if it was not read optimistically. This can
 be called after a row has been read in the processing of an UPDATE or a DELETE
-query, if the option innodb_locks_unsafe_for_binlog is set. */
+query. */
 
 void
 ha_innobase::unlock_row(void)
@@ -9107,11 +8623,8 @@ ha_innobase::unlock_row(void)
 
 	switch (m_prebuilt->row_read_type) {
 	case ROW_READ_WITH_LOCKS:
-		if (!srv_locks_unsafe_for_binlog
-		    && m_prebuilt->trx->isolation_level
-		    > TRX_ISO_READ_COMMITTED) {
+		if (m_prebuilt->trx->isolation_level > TRX_ISO_READ_COMMITTED)
 			break;
-		}
 		/* fall through */
 	case ROW_READ_TRY_SEMI_CONSISTENT:
 		row_unlock_for_mysql(m_prebuilt, FALSE);
@@ -9134,28 +8647,16 @@ ha_innobase::was_semi_consistent_read(void)
 }
 
 /* See handler.h and row0mysql.h for docs on this function. */
-
-void
-ha_innobase::try_semi_consistent_read(bool yes)
-/*===========================================*/
+void ha_innobase::try_semi_consistent_read(bool yes)
 {
-	ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
-
+	ut_ad(m_prebuilt->trx == thd_to_trx(ha_thd()));
 	/* Row read type is set to semi consistent read if this was
-	requested by the MySQL and either innodb_locks_unsafe_for_binlog
-	option is used or this session is using READ COMMITTED isolation
-	level. */
-
-	if (yes
-	    && (srv_locks_unsafe_for_binlog
-		|| m_prebuilt->trx->isolation_level
-		<= TRX_ISO_READ_COMMITTED)) {
-
-		m_prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
-
-	} else {
-		m_prebuilt->row_read_type = ROW_READ_WITH_LOCKS;
-	}
+	requested by the SQL layer and the transaction isolation level is
+	READ UNCOMMITTED or READ COMMITTED. */
+	m_prebuilt->row_read_type = yes
+		&& m_prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED
+		? ROW_READ_TRY_SEMI_CONSISTENT
+		: ROW_READ_WITH_LOCKS;
 }
 
 /******************************************************************//**
@@ -9381,20 +8882,8 @@ ha_innobase::index_read(
 
 	m_last_match_mode = (uint) match_mode;
 
-	dberr_t		ret;
-
-	if (mode != PAGE_CUR_UNSUPP) {
-
-		innobase_srv_conc_enter_innodb(m_prebuilt);
-
-		ret = row_search_mvcc(
-			buf, mode, m_prebuilt, match_mode, 0);
-
-		innobase_srv_conc_exit_innodb(m_prebuilt);
-	} else {
-
-		ret = DB_UNSUPPORTED;
-	}
+	dberr_t ret = mode == PAGE_CUR_UNSUPP ? DB_UNSUPPORTED
+		: row_search_mvcc(buf, mode, m_prebuilt, match_mode, 0);
 
 	DBUG_EXECUTE_IF("ib_select_query_failure", ret = DB_ERROR;);
 
@@ -9658,16 +9147,10 @@ ha_innobase::general_fetch(
 			    : HA_ERR_NO_SUCH_TABLE);
 	}
 
-	innobase_srv_conc_enter_innodb(m_prebuilt);
-
-	dberr_t	ret = row_search_mvcc(
-		buf, PAGE_CUR_UNSUPP, m_prebuilt, match_mode, direction);
-
-	innobase_srv_conc_exit_innodb(m_prebuilt);
-
 	int	error;
 
-	switch (ret) {
+	switch (dberr_t	ret = row_search_mvcc(buf, PAGE_CUR_UNSUPP, m_prebuilt,
+					      match_mode, direction)) {
 	case DB_SUCCESS:
 		error = 0;
 		table->status = 0;
@@ -9973,7 +9456,7 @@ ha_innobase::ft_init_ext(
 
 		buf_tmp_used = innobase_convert_string(
 			buf_tmp, sizeof(buf_tmp) - 1,
-			&my_charset_utf8_general_ci,
+			&my_charset_utf8mb3_general_ci,
 			query, query_len, (CHARSET_INFO*) char_set,
 			&num_errors);
 
@@ -10040,7 +9523,7 @@ ha_innobase::ft_init_ext(
 
 	/* Allocate FTS handler, and instantiate it before return */
 	fts_hdl = reinterpret_cast<NEW_FT_INFO*>(
-		my_malloc(/*PSI_INSTRUMENT_ME,*/ sizeof(NEW_FT_INFO), MYF(0)));
+		my_malloc(PSI_INSTRUMENT_ME, sizeof(NEW_FT_INFO), MYF(0)));
 
 	fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result);
 	fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result);
@@ -10178,16 +9661,11 @@ next_record:
 		tuple. */
 		innobase_fts_create_doc_id_key(tuple, index, &search_doc_id);
 
-		innobase_srv_conc_enter_innodb(m_prebuilt);
-
-		dberr_t ret = row_search_for_mysql(
-			(byte*) buf, PAGE_CUR_GE, m_prebuilt, ROW_SEL_EXACT, 0);
-
-		innobase_srv_conc_exit_innodb(m_prebuilt);
-
 		int	error;
 
-		switch (ret) {
+		switch (dberr_t ret = row_search_for_mysql(buf, PAGE_CUR_GE,
+							   m_prebuilt,
+							   ROW_SEL_EXACT, 0)) {
 		case DB_SUCCESS:
 			error = 0;
 			table->status = 0;
@@ -10301,7 +9779,7 @@ wsrep_append_foreign_key(
 
 	ulint rcode = DB_SUCCESS;
 	char  cache_key[513] = {'\0'};
-	int   cache_key_len=0;
+	size_t cache_key_len = 0;
 
 	if ( !((referenced) ?
 		foreign->referenced_table : foreign->foreign_table)) {
@@ -10561,12 +10039,11 @@ ha_innobase::wsrep_append_keys(
 	}
 
 	if (wsrep_protocol_version == 0) {
-		uint	len;
 		char 	keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
 		char 	*key 		= &keyval[0];
-		ibool    is_null;
+		bool    is_null;
 
-		len = wsrep_store_key_val_for_row(
+		auto len = wsrep_store_key_val_for_row(
 			thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH,
 			record0, &is_null);
 
@@ -10623,15 +10100,15 @@ ha_innobase::wsrep_append_keys(
 			      referenced_by_foreign_key2(tab, idx)) ||
 			     (!tab && referenced_by_foreign_key()))) {
 
-				ibool is_null0;
-				uint len0 = wsrep_store_key_val_for_row(
+				bool is_null0;
+				auto len0 = wsrep_store_key_val_for_row(
 					thd, table, i, key0,
 					WSREP_MAX_SUPPORTED_KEY_LENGTH,
 					record0, &is_null0);
 
 				if (record1) {
-					ibool is_null1;
-					uint len1 = wsrep_store_key_val_for_row(
+					bool is_null1;
+					auto len1= wsrep_store_key_val_for_row(
 						thd, table, i, key1,
 						WSREP_MAX_SUPPORTED_KEY_LENGTH,
 						record1, &is_null1);
@@ -10652,7 +10129,8 @@ ha_innobase::wsrep_append_keys(
 							keyval1,
 						    /* for len1+1 see keyval1
 						     initialization comment */
-							len1+1, key_type);
+							uint16_t(len1+1),
+							key_type);
 						    if (rcode)
 							DBUG_RETURN(rcode);
 						}
@@ -10664,7 +10142,8 @@ ha_innobase::wsrep_append_keys(
 						thd, trx, table_share,
 						/* for len0+1 see keyval0
 						   initialization comment */
-						keyval0, len0+1, key_type);
+						keyval0, uint16_t(len0+1),
+						key_type);
 					if (rcode)
 						DBUG_RETURN(rcode);
 
@@ -10764,20 +10243,17 @@ create_table_check_doc_id_col(
 					wrong type/name/size */
 {
 	for (ulint i = 0; i < form->s->fields; i++) {
-		const Field*	field;
-		ulint		col_type;
-		ulint		col_len;
-		ulint		unsigned_type;
-
-		field = form->field[i];
+		const Field* field = form->field[i];
 		if (!field->stored_in_db()) {
 			continue;
 		}
 
-		col_type = get_innobase_type_from_mysql_type(
+		unsigned unsigned_type;
+
+		auto col_type = get_innobase_type_from_mysql_type(
 			&unsigned_type, field);
 
-		col_len = field->pack_length();
+		auto col_len = field->pack_length();
 
 		if (innobase_strcasecmp(field->field_name.str,
 					FTS_DOC_ID_COL_NAME) == 0) {
@@ -10835,7 +10311,8 @@ prepare_vcol_for_base_setup(
 	bitmap_clear_all(&field->table->tmp_set);
 	field->vcol_info->expr->walk(
 		&Item::register_field_in_read_map, 1, field->table);
-	col->num_base= bitmap_bits_set(&field->table->tmp_set);
+	col->num_base= bitmap_bits_set(&field->table->tmp_set)
+		& dict_index_t::MAX_N_FIELDS;
 	if (col->num_base != 0) {
 		col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
 					table->heap, col->num_base * sizeof(
@@ -10855,7 +10332,7 @@ innodb_base_col_setup(
 	const Field*	field,
 	dict_v_col_t*	v_col)
 {
-	ulint n = 0;
+	uint16_t n = 0;
 
 	prepare_vcol_for_base_setup(table, field, v_col);
 
@@ -10880,7 +10357,7 @@ innodb_base_col_setup(
 			n++;
 		}
 	}
-	v_col->num_base= n;
+	v_col->num_base= n & dict_index_t::MAX_N_FIELDS;
 }
 
 /** Set up base columns for stored column
@@ -10932,10 +10409,8 @@ int
 create_table_info_t::create_table_def()
 {
 	dict_table_t*	table;
-	ulint		col_type;
-	ulint		col_len;
 	ulint		nulls_allowed;
-	ulint		unsigned_type;
+	unsigned	unsigned_type;
 	ulint		binary_type;
 	ulint		long_true_varchar;
 	ulint		charset_no;
@@ -11041,7 +10516,7 @@ create_table_info_t::create_table_def()
 			}
 		}
 
-		col_type = get_innobase_type_from_mysql_type(
+		auto col_type = get_innobase_type_from_mysql_type(
 			&unsigned_type, field);
 
 		if (!col_type) {
@@ -11054,7 +10529,10 @@ create_table_info_t::create_table_def()
 				" the table with an appropriate"
 				" column type.",
 				table->name.m_name, field->field_name.str);
-			goto err_col;
+err_col:
+			dict_mem_table_free(table);
+			ut_ad(trx_state_eq(m_trx, TRX_STATE_NOT_STARTED));
+			DBUG_RETURN(HA_ERR_GENERIC);
 		}
 
 		nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL;
@@ -11086,7 +10564,7 @@ create_table_info_t::create_table_def()
 			}
 		}
 
-		col_len = field->pack_length();
+		auto col_len = field->pack_length();
 
 		/* The MySQL pack length contains 1 or 2 bytes length field
 		for a true VARCHAR. Let us subtract that, so that the InnoDB
@@ -11108,10 +10586,7 @@ create_table_info_t::create_table_def()
 		if (dict_col_name_is_reserved(field->field_name.str)){
 			my_error(ER_WRONG_COLUMN_NAME, MYF(0),
 				 field->field_name.str);
-err_col:
-			dict_mem_table_free(table);
-			ut_ad(trx_state_eq(m_trx, TRX_STATE_NOT_STARTED));
-			DBUG_RETURN(HA_ERR_GENERIC);
+			goto err_col;
 		}
 
 		ulint is_virtual = !field->stored_in_db() ? DATA_VIRTUAL : 0;
@@ -11328,7 +10803,7 @@ create_index(
 		ind_type |= DICT_UNIQUE;
 	}
 
-	field_lengths = (ulint*) my_malloc(//PSI_INSTRUMENT_ME,
+	field_lengths = (ulint*) my_malloc(PSI_INSTRUMENT_ME,
 		key->user_defined_key_parts * sizeof *
 				field_lengths, MYF(MY_FAE));
 
@@ -11341,8 +10816,7 @@ create_index(
 	for (ulint i = 0; i < key->user_defined_key_parts; i++) {
 		KEY_PART_INFO*	key_part = key->key_part + i;
 		ulint		prefix_len;
-		ulint		col_type;
-		ulint		is_unsigned;
+		unsigned	is_unsigned;
 
 
 		/* (The flag HA_PART_KEY_SEG denotes in MySQL a
@@ -11362,7 +10836,7 @@ create_index(
 
 		const char*	field_name = key_part->field->field_name.str;
 
-		col_type = get_innobase_type_from_mysql_type(
+		auto col_type = get_innobase_type_from_mysql_type(
 			&is_unsigned, key_part->field);
 
 		if (DATA_LARGE_MTYPE(col_type)
@@ -12159,7 +11633,7 @@ index_bad:
 @param[in]	str	string which might include 'MERGE_THRESHOLD='
 @return	value parsed. 0 means not found or invalid value. */
 static
-ulint
+unsigned
 innobase_parse_merge_threshold(
 	THD*		thd,
 	const char*	str)
@@ -12179,7 +11653,7 @@ innobase_parse_merge_threshold(
 	lint	ret = atoi(pos);
 
 	if (ret > 0 && ret <= 50) {
-		return(static_cast<ulint>(ret));
+		return(static_cast<unsigned>(ret));
 	}
 
 	push_warning_printf(
@@ -12202,8 +11676,8 @@ innobase_parse_hint_from_comment(
 	dict_table_t*		table,
 	const TABLE_SHARE*	table_share)
 {
-	ulint	merge_threshold_table;
-	ulint	merge_threshold_index[MAX_KEY];
+	unsigned merge_threshold_table;
+	unsigned merge_threshold_index[MAX_KEY];
 	bool	is_found[MAX_KEY];
 
 	if (table_share->comment.str != NULL) {
@@ -12293,7 +11767,8 @@ innobase_parse_hint_from_comment(
 			/* x-lock index is needed to exclude concurrent
 			pessimistic tree operations */
 			rw_lock_x_lock(dict_index_get_lock(index));
-			index->merge_threshold = merge_threshold_table;
+			index->merge_threshold = merge_threshold_table
+				& ((1U << 6) - 1);
 			rw_lock_x_unlock(dict_index_get_lock(index));
 
 			continue;
@@ -12313,7 +11788,8 @@ innobase_parse_hint_from_comment(
 				pessimistic tree operations */
 				rw_lock_x_lock(dict_index_get_lock(index));
 				index->merge_threshold
-					= merge_threshold_index[i];
+					= merge_threshold_index[i]
+					& ((1U << 6) - 1);
 				rw_lock_x_unlock(dict_index_get_lock(index));
 				is_found[i] = true;
 
@@ -12460,6 +11936,590 @@ int create_table_info_t::prepare_create_table(const char* name, bool strict)
 	DBUG_RETURN(parse_table_name(name));
 }
 
+/** Report a foreign key index lookup failure to the SQL layer as a warning.
+@param[in]	trx		current transaction
+@param[in]	operation	operation text ("Create " or "Alter ")
+@param[in]	create_name	table name as specified in SQL
+@param[in]	fk_text		printable name or column list of the key
+@param[in]	columns		foreign key column names array
+@param[in]	index_error	index error code
+@param[in]	err_col		position of the column where the error happened
+@param[in]	err_index	index where the error happened
+@param[in]	table		table used to resolve the err_index column name */
+static void
+foreign_push_index_error(trx_t* trx, const char* operation,
+			 const char* create_name, const char* fk_text,
+			 const char** columns, fkerr_t index_error,
+			 ulint err_col, dict_index_t* err_index,
+			 dict_table_t* table)
+{
+	switch (index_error) {
+	case FK_INDEX_NOT_FOUND:
+		ib_foreign_warn(
+			trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+			"%s table %s with foreign key %s constraint"
+			" failed. There is no index in the referenced"
+			" table where the referenced columns appear"
+			" as the first columns.",
+			operation, create_name, fk_text);
+		return;
+	case FK_IS_PREFIX_INDEX:
+		ib_foreign_warn(trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+				"%s table %s with foreign key %s constraint"
+				" failed. There is only prefix index in the referenced"
+				" table where the referenced columns appear"
+				" as the first columns.",
+				operation, create_name, fk_text);
+		return;
+	case FK_COL_NOT_NULL:
+		ib_foreign_warn(trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+				"%s table %s with foreign key %s constraint"
+				" failed. You have defined a SET NULL condition but "
+				"column '%s' on index is defined as NOT NULL.",
+				operation, create_name, fk_text, columns[err_col]);
+		return;
+	case FK_COLS_NOT_EQUAL: {
+		/* Name the field of err_index that failed to match. */
+		dict_field_t* mismatch
+			= dict_index_get_nth_field(err_index, err_col);
+		const char* ref_name = "(null)";
+		if (!mismatch->col->is_virtual()) {
+			ref_name = dict_table_get_col_name(
+				table, dict_col_get_no(mismatch->col));
+		}
+		ib_foreign_warn(trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+				"%s table %s with foreign key %s constraint"
+				" failed. Field type or character set for column '%s' "
+				"does not match referenced column '%s'.",
+				operation, create_name, fk_text, columns[err_col],
+				ref_name);
+		return;
+	}
+	case FK_SUCCESS:
+		/* Nothing to report; control reaches the assertion below. */
+		break;
+	}
+	DBUG_ASSERT("unknown error" == 0);
+}
+
+/** Look up a regular or virtual column of a table by name, ignoring case.
+On a match the caller's string is overwritten with the stored column name.
+@param[in]	table	table whose columns are searched
+@param[in,out]	name	name to search for; rewritten in place when found
+@return whether a column with that name was found */
+static bool
+find_col(dict_table_t* table, const char** name)
+{
+	for (ulint pos = 0; pos < dict_table_get_n_cols(table); pos++) {
+		const char* stored = dict_table_get_col_name(table, pos);
+
+		if (innobase_strcasecmp(stored, *name) != 0) {
+			continue;
+		}
+
+		/* Hand back the spelling kept in the table definition. */
+		strcpy(const_cast<char*>(*name), stored);
+		return true;
+	}
+
+	for (ulint pos = 0; pos < dict_table_get_n_v_cols(table); pos++) {
+		const char* stored = dict_table_get_v_col_name(table, pos);
+
+		if (innobase_strcasecmp(stored, *name) != 0) {
+			continue;
+		}
+
+		strcpy(const_cast<char*>(*name), stored);
+		return true;
+	}
+	return false;
+}
+
+/** Formats a foreign key for error messages: the key name in backquotes if
+it has one, otherwise the key part list as (col1, col2, col3, ...) */
+class key_text
+{
+	static const size_t MAX_TEXT = 48;
+	char		    buf[MAX_TEXT + 1];
+
+public:
+	key_text(Key* key)
+	{
+		size_t pos = 0;
+		if (key->name.str) {
+			/* Truncate so that both backquotes still fit. */
+			size_t n = std::min(key->name.length, MAX_TEXT - 2);
+			buf[pos++] = '`';
+			memcpy(buf + pos, key->name.str, n);
+			pos += n;
+			buf[pos++] = '`';
+			buf[pos]   = '\0';
+			return;
+		}
+		buf[pos++] = '(';
+		List_iterator_fast<Key_part_spec> part_it(key->columns);
+		while (Key_part_spec* part = part_it++) {
+			const bool   more = part_it.peek() != NULL;
+			const size_t name_len = part->field_name.length;
+			/* Keep 3 bytes for a later "...", 1 for the closing
+			')' and 2 for ", " when another key part follows. */
+			const size_t needed
+				= (more ? 3 + 2 + 1 : 3 + 1) + name_len;
+			if (MAX_TEXT - pos < needed) {
+				ut_ad(pos <= MAX_TEXT - 4);
+				memcpy(buf + pos, "...", 3);
+				pos += 3;
+				break;
+			}
+			memcpy(buf + pos, part->field_name.str, name_len);
+			pos += name_len;
+			if (more) {
+				buf[pos++] = ',';
+				buf[pos++] = ' ';
+			}
+		}
+		buf[pos++] = ')';
+		buf[pos]   = '\0';
+	}
+	const char* str() { return buf; }
+};
+
+/** Create InnoDB foreign keys from the FOREIGN KEY entries of alter_info:
+collect them into local_fk_set, then add them to the system tables.
+@return		DB_SUCCESS or specific error code */
+dberr_t
+create_table_info_t::create_foreign_keys()
+{
+	dict_foreign_set      local_fk_set;
+	dict_foreign_set_free local_fk_set_free(local_fk_set);
+	dberr_t		      error;
+	ulint		      number	      = 1;
+	static const unsigned MAX_COLS_PER_FK = 500;
+	const char*	      column_names[MAX_COLS_PER_FK];
+	const char*	      ref_column_names[MAX_COLS_PER_FK];
+	char		      create_name[MAX_DATABASE_NAME_LEN + 1 +
+					  MAX_TABLE_NAME_LEN + 1];
+	dict_index_t*	      index	  = NULL;
+	fkerr_t		      index_error = FK_SUCCESS;
+	dict_index_t*	      err_index	  = NULL;
+	ulint		      err_col;
+	const bool	      tmp_table = m_flags2 & DICT_TF2_TEMPORARY;
+	const CHARSET_INFO*   cs	= thd_charset(m_thd);
+	const char*	      operation = "Create ";
+	const char*	      name	= m_table_name;
+
+	enum_sql_command sqlcom = enum_sql_command(thd_sql_command(m_thd));
+
+	if (sqlcom == SQLCOM_ALTER_TABLE) {
+		dict_table_t* table_to_alter;
+		mem_heap_t*   heap = mem_heap_create(10000);
+		ulint	      highest_id_so_far;
+		char*	      n = dict_get_referenced_table(
+			name, LEX_STRING_WITH_LEN(m_form->s->db),
+			LEX_STRING_WITH_LEN(m_form->s->table_name),
+			&table_to_alter, heap, cs);
+
+		/* Starting from 4.0.18 and 4.1.2, we generate foreign key id's
+		in the format databasename/tablename_ibfk_[number], where
+		[number] is local to the table; look for the highest [number]
+		for table_to_alter, so that we can assign to new constraints
+		higher numbers. */
+
+		/* If we are altering a temporary table, the table name after
+		ALTER TABLE does not correspond to the internal table name, and
+		table_to_alter is NULL. TODO: should we fix this somehow? */
+
+		if (table_to_alter) {
+			n		  = table_to_alter->name.m_name;
+			highest_id_so_far = dict_table_get_highest_foreign_id(
+				table_to_alter);
+		} else {
+			highest_id_so_far = 0;
+		}
+		/* Reserve the last byte of create_name for the NUL. */
+		char* bufend = innobase_convert_name(
+			create_name, sizeof create_name - 1, n, strlen(n), m_thd);
+		create_name[bufend - create_name] = '\0';
+		number				  = highest_id_so_far + 1;
+		mem_heap_free(heap);
+		operation = "Alter ";
+	} else if (strstr(name, "#P#") || strstr(name, "#p#")) {
+		/* Partitioned table */
+		create_name[0] = '\0';
+	} else {
+		/* As above: reserve the last byte for the terminator. */
+		char* bufend = innobase_convert_name(
+			create_name, sizeof create_name - 1, name,
+			strlen(name), m_thd);
+		create_name[bufend - create_name] = '\0';
+	}
+
+	Alter_info* alter_info = m_create_info->alter_info;
+	ut_ad(alter_info);
+	List_iterator_fast<Key> key_it(alter_info->key_list);
+
+	dict_table_t* table = dict_table_get_low(name);
+	if (!table) {
+		ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+				"%s table %s foreign key constraint"
+				" failed. Table not found.",
+				operation, create_name);
+
+		return (DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	while (Key* key = key_it++) {
+		if (key->type != Key::FOREIGN_KEY)
+			continue;
+
+		if (tmp_table) {
+			ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT,
+					create_name,
+					"%s table `%s`.`%s` with foreign key "
+					"constraint failed. "
+					"Temporary tables can't have "
+					"foreign key constraints.",
+					operation, m_form->s->db.str,
+					m_form->s->table_name.str);
+
+			return (DB_CANNOT_ADD_CONSTRAINT);
+		} else if (!*create_name) {
+			ut_ad("should be unreachable" == 0);
+			return DB_CANNOT_ADD_CONSTRAINT;
+		}
+
+		Foreign_key*   fk = static_cast<Foreign_key*>(key);
+		Key_part_spec* col;
+		bool	       success;
+
+		dict_foreign_t* foreign = dict_mem_foreign_create();
+		if (!foreign) {
+			return (DB_OUT_OF_MEMORY);
+		}
+
+		List_iterator_fast<Key_part_spec> col_it(fk->columns);
+		unsigned			  i = 0, j = 0;
+		while ((col = col_it++)) {
+			column_names[i] = mem_heap_strdupl(
+				foreign->heap, col->field_name.str,
+				col->field_name.length);
+			success = find_col(table, column_names + i);
+			if (!success) {
+				key_text k(fk);
+				ib_foreign_warn(
+					m_trx, DB_CANNOT_ADD_CONSTRAINT,
+					create_name,
+					"%s table %s foreign key %s constraint"
+					" failed. Column %s was not found.",
+					operation, create_name, k.str(),
+					column_names[i]);
+				dict_foreign_free(foreign);
+				return (DB_CANNOT_ADD_CONSTRAINT);
+			}
+			++i;
+			if (i >= MAX_COLS_PER_FK) {
+				key_text k(fk);
+				ib_foreign_warn(
+					m_trx, DB_CANNOT_ADD_CONSTRAINT,
+					create_name,
+					"%s table %s foreign key %s constraint"
+					" failed. Too many columns: %u (%u "
+					"allowed).",
+					operation, create_name, k.str(), i,
+					MAX_COLS_PER_FK);
+				dict_foreign_free(foreign);
+				return (DB_CANNOT_ADD_CONSTRAINT);
+			}
+		}
+
+		index = dict_foreign_find_index(
+			table, NULL, column_names, i, NULL, TRUE, FALSE,
+			&index_error, &err_col, &err_index);
+
+		if (!index) {
+			key_text k(fk);
+			foreign_push_index_error(m_trx, operation, create_name,
+						 k.str(), column_names,
+						 index_error, err_col,
+						 err_index, table);
+			dict_foreign_free(foreign);
+			return (DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		if (fk->constraint_name.str) {
+			ulint db_len;
+
+			/* Catenate 'databasename/' to the constraint name
+			specified by the user: we conceive the constraint as
+			belonging to the same MySQL 'database' as the table
+			itself. We store the name to foreign->id. */
+
+			db_len = dict_get_db_name_len(table->name.m_name);
+
+			foreign->id = static_cast<char*>(mem_heap_alloc(
+				foreign->heap,
+				db_len + fk->constraint_name.length + 2));
+
+			memcpy(foreign->id, table->name.m_name, db_len);
+			foreign->id[db_len] = '/';
+			strcpy(foreign->id + db_len + 1,
+			       fk->constraint_name.str);
+		}
+
+		if (foreign->id == NULL) {
+			error = dict_create_add_foreign_id(
+				&number, table->name.m_name, foreign);
+			if (error != DB_SUCCESS) {
+				dict_foreign_free(foreign);
+				return (error);
+			}
+		}
+
+		std::pair<dict_foreign_set::iterator, bool> ret
+			= local_fk_set.insert(foreign);
+
+		if (!ret.second) {
+			/* A duplicate foreign key name has been found */
+			dict_foreign_free(foreign);
+			return (DB_CANNOT_ADD_CONSTRAINT);
+		}
+		/* local_fk_set now holds foreign: no explicit free below. */
+		foreign->foreign_table = table;
+		foreign->foreign_table_name
+			= mem_heap_strdup(foreign->heap, table->name.m_name);
+		if (!foreign->foreign_table_name) {
+			return (DB_OUT_OF_MEMORY);
+		}
+
+		dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+		foreign->foreign_index = index;
+		foreign->n_fields      = i & dict_index_t::MAX_N_FIELDS;
+
+		foreign->foreign_col_names = static_cast<const char**>(
+			mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+		if (!foreign->foreign_col_names) {
+			return (DB_OUT_OF_MEMORY);
+		}
+
+		memcpy(foreign->foreign_col_names, column_names,
+		       i * sizeof(void*));
+
+		foreign->referenced_table_name = dict_get_referenced_table(
+			name, LEX_STRING_WITH_LEN(fk->ref_db),
+			LEX_STRING_WITH_LEN(fk->ref_table),
+			&foreign->referenced_table, foreign->heap, cs);
+
+		if (!foreign->referenced_table_name) {
+			return (DB_OUT_OF_MEMORY);
+		}
+
+		if (!foreign->referenced_table && m_trx->check_foreigns) {
+			char  buf[MAX_TABLE_NAME_LEN + 1] = "";
+			char* bufend;
+
+			bufend = innobase_convert_name(
+				buf, MAX_TABLE_NAME_LEN,
+				foreign->referenced_table_name,
+				strlen(foreign->referenced_table_name), m_thd);
+			buf[bufend - buf] = '\0';
+			key_text k(fk);
+			ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT,
+					create_name,
+					"%s table %s with foreign key %s "
+					"constraint failed. Referenced table "
+					"%s not found in the data dictionary.",
+					operation, create_name, k.str(), buf);
+			return (DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		/* Don't allow foreign keys on partitioned tables yet. */
+		if (foreign->referenced_table
+		    && dict_table_is_partition(foreign->referenced_table)) {
+			/* How could one make a referenced table to be a
+			 * partition? */
+			ut_ad(0);
+			my_error(ER_FEATURE_NOT_SUPPORTED_WITH_PARTITIONING,
+				 MYF(0), "FOREIGN KEY");
+			return (DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		col_it.init(fk->ref_columns);
+		while ((col = col_it++)) {
+			ref_column_names[j] = mem_heap_strdupl(
+				foreign->heap, col->field_name.str,
+				col->field_name.length);
+			if (foreign->referenced_table) {
+				success = find_col(foreign->referenced_table,
+						   ref_column_names + j);
+				if (!success) {
+					key_text k(fk);
+					ib_foreign_warn(
+						m_trx,
+						DB_CANNOT_ADD_CONSTRAINT,
+						create_name,
+						"%s table %s foreign key %s "
+						"constraint failed. "
+						"Column %s was not found.",
+						operation, create_name,
+						k.str(), ref_column_names[j]);
+
+					return (DB_CANNOT_ADD_CONSTRAINT);
+				}
+			}
+			++j;
+		}
+		/* See ER_WRONG_FK_DEF in mysql_prepare_create_table() */
+		ut_ad(i == j);
+
+		/* Try to find an index which contains the columns as the first
+		fields and in the right order, and the types are the same as in
+		foreign->foreign_index */
+
+		if (foreign->referenced_table) {
+			index = dict_foreign_find_index(
+				foreign->referenced_table, NULL,
+				ref_column_names, i, foreign->foreign_index,
+				TRUE, FALSE, &index_error, &err_col,
+				&err_index);
+
+			if (!index) {
+				key_text k(fk);
+				foreign_push_index_error(
+					m_trx, operation, create_name, k.str(),
+					column_names, index_error, err_col,
+					err_index, foreign->referenced_table);
+
+				return (DB_CANNOT_ADD_CONSTRAINT);
+			}
+		} else {
+			ut_a(m_trx->check_foreigns == FALSE);
+			index = NULL;
+		}
+
+		foreign->referenced_index = index;
+		dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+		foreign->referenced_col_names = static_cast<const char**>(
+			mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+		if (!foreign->referenced_col_names) {
+			return (DB_OUT_OF_MEMORY);
+		}
+
+		memcpy(foreign->referenced_col_names, ref_column_names,
+		       i * sizeof(void*));
+
+		if (fk->delete_opt == FK_OPTION_SET_NULL
+		    || fk->update_opt == FK_OPTION_SET_NULL) {
+			for (j = 0; j < foreign->n_fields; j++) {
+				if ((dict_index_get_nth_col(
+					     foreign->foreign_index, j)
+					     ->prtype)
+				    & DATA_NOT_NULL) {
+					const dict_col_t* col
+						= dict_index_get_nth_col(
+							foreign->foreign_index,
+							j);
+					const char* col_name
+						= dict_table_get_col_name(
+							foreign->foreign_index
+								->table,
+							dict_col_get_no(col));
+
+					/* It is not sensible to define
+					SET NULL if the column is not
+					allowed to be NULL! Name the
+					offending column in the warning. */
+					key_text k(fk);
+					ib_foreign_warn(
+						m_trx,
+						DB_CANNOT_ADD_CONSTRAINT,
+						create_name,
+						"%s table %s with foreign key "
+						"%s constraint failed. You have"
+						" defined a SET NULL condition "
+						"but column '%s' is defined as "
+						"NOT NULL.",
+						operation, create_name,
+						k.str(), col_name);
+
+					return (DB_CANNOT_ADD_CONSTRAINT);
+				}
+			}
+		}
+
+		switch (fk->delete_opt) {
+		case FK_OPTION_UNDEF:
+		case FK_OPTION_RESTRICT:
+			break;
+		case FK_OPTION_CASCADE:
+			foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE;
+			break;
+		case FK_OPTION_SET_NULL:
+			foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL;
+			break;
+		case FK_OPTION_NO_ACTION:
+			foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION;
+			break;
+		case FK_OPTION_SET_DEFAULT:
+			// TODO: MDEV-10393 Foreign keys SET DEFAULT action
+			break;
+		default:
+			ut_ad(0);
+			break;
+		}
+
+		switch (fk->update_opt) {
+		case FK_OPTION_UNDEF:
+		case FK_OPTION_RESTRICT:
+			break;
+		case FK_OPTION_CASCADE:
+			foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+			break;
+		case FK_OPTION_SET_NULL:
+			foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+			break;
+		case FK_OPTION_NO_ACTION:
+			foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+			break;
+		case FK_OPTION_SET_DEFAULT:
+			// TODO: MDEV-10393 Foreign keys SET DEFAULT action
+			break;
+		default:
+			ut_ad(0);
+			break;
+		}
+	}
+
+	if (dict_foreigns_has_s_base_col(local_fk_set, table)) {
+		return (DB_NO_FK_ON_S_BASE_COL);
+	}
+
+	/**********************************************************/
+	/* The following call adds the foreign key constraints
+	to the data dictionary system tables on disk */
+	m_trx->op_info = "adding foreign keys";
+
+	trx_start_if_not_started_xa(m_trx, true);
+
+	trx_set_dict_operation(m_trx, TRX_DICT_OP_TABLE);
+
+	error = dict_create_add_foreigns_to_dictionary(local_fk_set, table,
+						       m_trx);
+
+	if (error == DB_SUCCESS) {
+
+		table->foreign_set.insert(local_fk_set.begin(),
+					  local_fk_set.end());
+		std::for_each(local_fk_set.begin(), local_fk_set.end(),
+			      dict_foreign_add_to_referenced_table());
+		local_fk_set.clear();
+
+		dict_mem_table_fill_foreign_vcol_set(table);
+	}
+	return (error);
+}
+
 /** Create the internal innodb table.
 @param create_fk	whether to add FOREIGN KEY constraints */
 int create_table_info_t::create_table(bool create_fk)
@@ -12579,64 +12639,58 @@ int create_table_info_t::create_table(bool create_fk)
 		dict_table_get_all_fts_indexes(m_table, fts->indexes);
 	}
 
-	size_t stmt_len;
-	if (const char* stmt = innobase_get_stmt_unsafe(m_thd, &stmt_len)) {
-		dberr_t err = create_fk
-			? dict_create_foreign_constraints(
-				m_trx, stmt, stmt_len, m_table_name,
-				m_flags2 & DICT_TF2_TEMPORARY)
-			: DB_SUCCESS;
-		if (err == DB_SUCCESS) {
-			/* Check that also referencing constraints are ok */
-			dict_names_t	fk_tables;
-			err = dict_load_foreigns(m_table_name, NULL,
-						 false, true,
-						 DICT_ERR_IGNORE_NONE,
-						 fk_tables);
-			while (err == DB_SUCCESS && !fk_tables.empty()) {
-				dict_load_table(fk_tables.front(),
-						DICT_ERR_IGNORE_NONE);
-				fk_tables.pop_front();
-			}
+	dberr_t err = create_fk ? create_foreign_keys() : DB_SUCCESS;
+
+	if (err == DB_SUCCESS) {
+		/* Check that also referencing constraints are ok */
+		dict_names_t	fk_tables;
+		err = dict_load_foreigns(m_table_name, NULL,
+						false, true,
+						DICT_ERR_IGNORE_NONE,
+						fk_tables);
+		while (err == DB_SUCCESS && !fk_tables.empty()) {
+			dict_load_table(fk_tables.front(),
+					DICT_ERR_IGNORE_NONE);
+			fk_tables.pop_front();
 		}
+	}
 
-		switch (err) {
-		case DB_PARENT_NO_INDEX:
-			push_warning_printf(
-				m_thd, Sql_condition::WARN_LEVEL_WARN,
-				HA_ERR_CANNOT_ADD_FOREIGN,
-				"Create table '%s' with foreign key constraint"
-				" failed. There is no index in the referenced"
-				" table where the referenced columns appear"
-				" as the first columns.\n", m_table_name);
-			break;
+	switch (err) {
+	case DB_PARENT_NO_INDEX:
+		push_warning_printf(
+			m_thd, Sql_condition::WARN_LEVEL_WARN,
+			HA_ERR_CANNOT_ADD_FOREIGN,
+			"Create table '%s' with foreign key constraint"
+			" failed. There is no index in the referenced"
+			" table where the referenced columns appear"
+			" as the first columns.\n", m_table_name);
+		break;
 
-		case DB_CHILD_NO_INDEX:
-			push_warning_printf(
-				m_thd, Sql_condition::WARN_LEVEL_WARN,
-				HA_ERR_CANNOT_ADD_FOREIGN,
-				"Create table '%s' with foreign key constraint"
-				" failed. There is no index in the referencing"
-				" table where referencing columns appear"
-				" as the first columns.\n", m_table_name);
-			break;
-		case DB_NO_FK_ON_S_BASE_COL:
-			push_warning_printf(
-				m_thd, Sql_condition::WARN_LEVEL_WARN,
-				HA_ERR_CANNOT_ADD_FOREIGN,
-				"Create table '%s' with foreign key constraint"
-				" failed. Cannot add foreign key constraint"
-				" placed on the base column of stored"
-				" column. \n",
-				m_table_name);
-		default:
-			break;
-		}
+	case DB_CHILD_NO_INDEX:
+		push_warning_printf(
+			m_thd, Sql_condition::WARN_LEVEL_WARN,
+			HA_ERR_CANNOT_ADD_FOREIGN,
+			"Create table '%s' with foreign key constraint"
+			" failed. There is no index in the referencing"
+			" table where referencing columns appear"
+			" as the first columns.\n", m_table_name);
+		break;
+	case DB_NO_FK_ON_S_BASE_COL:
+		push_warning_printf(
+			m_thd, Sql_condition::WARN_LEVEL_WARN,
+			HA_ERR_CANNOT_ADD_FOREIGN,
+			"Create table '%s' with foreign key constraint"
+			" failed. Cannot add foreign key constraint"
+			" placed on the base column of stored"
+			" column. \n",
+			m_table_name);
+	default:
+		break;
+	}
 
-		if (err != DB_SUCCESS) {
-			DBUG_RETURN(convert_error_code_to_mysql(
-					    err, m_flags, NULL));
-		}
+	if (err != DB_SUCCESS) {
+		DBUG_RETURN(convert_error_code_to_mysql(
+					err, m_flags, NULL));
 	}
 
 	/* In TRUNCATE TABLE, we will merely warn about the maximum
@@ -12913,7 +12967,6 @@ create_table_info_t::create_table_update_dict()
 	if (m_flags2 & DICT_TF2_FTS) {
 		if (!innobase_fts_load_stopword(innobase_table, NULL, m_thd)) {
 			dict_table_close(innobase_table, FALSE, FALSE);
-			srv_active_wake_master_thread();
 			DBUG_RETURN(-1);
 		}
 
@@ -12941,9 +12994,12 @@ create_table_info_t::create_table_update_dict()
 		} else {
 			const unsigned	col_no = innodb_col_no(ai);
 
-			innobase_table->persistent_autoinc = 1
-				+ dict_table_get_nth_col_pos(
-					innobase_table, col_no, NULL);
+			innobase_table->persistent_autoinc
+				= static_cast<uint16_t>(
+					dict_table_get_nth_col_pos(
+						innobase_table, col_no, NULL)
+					+ 1)
+				& dict_index_t::MAX_N_FIELDS;
 
 			/* Persist the "last used" value, which
 			typically is AUTO_INCREMENT - 1.
@@ -13060,11 +13116,6 @@ func_exit:
 		trx->free();
 	}
 
-	/* Tell the InnoDB server that there might be work for
-	utility threads: */
-
-	srv_active_wake_master_thread();
-
 	DBUG_RETURN(error);
 }
 
@@ -13205,6 +13256,19 @@ ha_innobase::discard_or_import_tablespace(
 }
 
 /**
+   @return true if the frm file exists
+   @return false if it does not exist
+*/
+
+static bool frm_file_exists(const char *path)
+{ /* Builds path + reg_ext and probes the result with access(F_OK). */
+  char buff[FN_REFLEN]; /* strxnmov() copies <= len bytes, then adds NUL */
+  strxnmov(buff, sizeof(buff) - 1, path, reg_ext, NullS);
+  return !access(buff, F_OK);
+}
+
+
+/**
 Drops a table from an InnoDB database. Before calling this function,
 MySQL calls innobase_commit to commit the transaction of the current user.
 Then the current user cannot have locks set on the table. Drop table
@@ -13298,7 +13362,9 @@ inline int ha_innobase::delete_table(const char* name, enum_sql_command sqlcom)
 		}
 	}
 
-	if (err == DB_TABLE_NOT_FOUND) {
+	if (err == DB_TABLE_NOT_FOUND &&
+            frm_file_exists(name))
+        {
 		/* Test to drop all tables which matches db/tablename + '#'.
 		Only partitions can have '#' as non-first character in
 		the table name!
@@ -13432,7 +13498,7 @@ innobase_drop_database(
 	}
 
 	ptr++;
-	namebuf = (char*) my_malloc(/*PSI_INSTRUMENT_ME,*/ (uint) len + 2, MYF(0));
+	namebuf = (char*) my_malloc(PSI_INSTRUMENT_ME, (uint) len + 2, MYF(0));
 
 	memcpy(namebuf, ptr, len);
 	namebuf[len] = '/';
@@ -13495,40 +13561,6 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
 		row_mysql_lock_data_dictionary(trx);
 	}
 
-	dict_table_t*   table = dict_table_open_on_name(
-		norm_from, TRUE, FALSE, DICT_ERR_IGNORE_FK_NOKEY);
-
-	/* Since DICT_BG_YIELD has sleep for 250 milliseconds,
-	Convert lock_wait_timeout unit from second to 250 milliseconds */
-	long int lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd) * 4;
-	if (table != NULL) {
-		if (commit) {
-			dict_stats_wait_bg_to_stop_using_table(table, trx);
-		}
-		for (dict_index_t* index = dict_table_get_first_index(table);
-		     index != NULL;
-		     index = dict_table_get_next_index(index)) {
-
-			if (index->type & DICT_FTS) {
-				/* Found */
-				while (index->index_fts_syncing
-					&& !trx_is_interrupted(trx)
-					&& (lock_wait_timeout--) > 0) {
-					DICT_BG_YIELD(trx);
-				}
-			}
-		}
-		if (!commit) {
-			dict_table_close(table, TRUE, FALSE);
-		}
-	}
-
-	/* FTS sync is in progress. We shall timeout this operation */
-	if (lock_wait_timeout < 0) {
-		error = DB_LOCK_WAIT_TIMEOUT;
-		goto func_exit;
-	}
-
 	error = row_rename_table_for_mysql(norm_from, norm_to, trx, commit,
 					   commit);
 
@@ -13580,12 +13612,7 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
 		}
 	}
 
-func_exit:
 	if (commit) {
-		if (table) {
-			table->stats_bg_flag &= ~BG_STAT_SHOULD_QUIT;
-			dict_table_close(table, TRUE, FALSE);
-		}
 		row_mysql_unlock_data_dictionary(trx);
 	}
 
@@ -13614,8 +13641,8 @@ int ha_innobase::truncate()
 	HA_CREATE_INFO	info;
 	mem_heap_t*	heap = mem_heap_create(1000);
 	dict_table_t*	ib_table = m_prebuilt->table;
-	const time_t	update_time = ib_table->update_time;
-	const ulint	stored_lock = m_prebuilt->stored_select_lock_type;
+	const auto	update_time = ib_table->update_time;
+	const auto	stored_lock = m_prebuilt->stored_select_lock_type;
 	info.init();
 	update_create_info_from_table(&info, table);
 
@@ -13785,10 +13812,11 @@ ha_rows
 ha_innobase::records_in_range(
 /*==========================*/
 	uint			keynr,		/*!< in: index number */
-	key_range		*min_key,	/*!< in: start key value of the
+	const key_range		*min_key,	/*!< in: start key value of the
 						range, may also be 0 */
-	key_range		*max_key)	/*!< in: range end key val, may
+	const key_range		*max_key,	/*!< in: range end key val, may
 						also be 0 */
+        page_range              *pages)
 {
 	KEY*		key;
 	dict_index_t*	index;
@@ -13877,8 +13905,12 @@ ha_innobase::records_in_range(
 			n_rows = rtr_estimate_n_rows_in_range(
 				index, range_start, mode1);
 		} else {
+                        btr_pos_t tuple1(range_start, mode1, pages->first_page);
+                        btr_pos_t tuple2(range_end,   mode2, pages->last_page);
 			n_rows = btr_estimate_n_rows_in_range(
-				index, range_start, mode1, range_end, mode2);
+                                 index, &tuple1, &tuple2);
+                        pages->first_page= tuple1.page_id.raw();
+                        pages->last_page=  tuple2.page_id.raw();
 		}
 	} else {
 
@@ -14176,11 +14208,12 @@ innodb_rec_per_key(
 			rec_per_key calculation */
 			rec_per_key
 				= static_cast<rec_per_key_t>(records - n_null)
-				/ (n_diff - n_null);
+				/ static_cast<rec_per_key_t>(n_diff - n_null);
 		}
 	} else {
 		DEBUG_SYNC_C("after_checking_for_0");
-		rec_per_key = static_cast<rec_per_key_t>(records) / n_diff;
+		rec_per_key = static_cast<rec_per_key_t>(records)
+			/ static_cast<rec_per_key_t>(n_diff);
 	}
 
 	if (rec_per_key < 1.0) {
@@ -14405,7 +14438,6 @@ ha_innobase::info_low(
 	}
 
 	if (flag & HA_STATUS_CONST) {
-		ulong	i;
 		/* Verify the number of index in InnoDB and MySQL
 		matches up. If m_prebuilt->clust_index_was_generated
 		holds, InnoDB defines GEN_CLUST_INDEX internally */
@@ -14470,7 +14502,7 @@ ha_innobase::info_low(
 
 		ut_a(ib_table->stat_initialized);
 
-		for (i = 0; i < table->s->keys; i++) {
+		for (uint i = 0; i < table->s->keys; i++) {
 			ulong	j;
 
 			dict_index_t* index = innobase_get_index(i);
@@ -14547,7 +14579,7 @@ ha_innobase::info_low(
 		}
 	}
 
-	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+	if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
 
 		goto func_exit;
 
@@ -14770,7 +14802,6 @@ ha_innobase::check(
 	ulint		n_rows;
 	ulint		n_rows_in_table	= ULINT_UNDEFINED;
 	bool		is_ok		= true;
-	ulint		old_isolation_level;
 	dberr_t		ret;
 
 	DBUG_ENTER("ha_innobase::check");
@@ -14833,7 +14864,7 @@ ha_innobase::check(
 		DBUG_RETURN(HA_ADMIN_CORRUPT);
 	}
 
-	old_isolation_level = m_prebuilt->trx->isolation_level;
+	uint old_isolation_level = m_prebuilt->trx->isolation_level;
 
 	/* We must run the index record counts at an isolation level
 	>= READ COMMITTED, because a dirty read can see a wrong number
@@ -15031,14 +15062,8 @@ ha_innobase::get_foreign_key_create_info(void)
 	m_prebuilt->trx->op_info = "";
 
 	/* Allocate buffer for the string */
-	char* fk_str = (char*) my_malloc(str.length() + 1, MYF(0));
-
-	/* JAN: TODO: MySQL 5.7
-	fk_str = reinterpret_cast<char*>(
+	char *fk_str = reinterpret_cast<char*>(
 			my_malloc(PSI_INSTRUMENT_ME, str.length() + 1, MYF(0)));
-	*/
-
-
 
 	if (fk_str) {
 		memcpy(fk_str, str.c_str(), str.length());
@@ -15056,8 +15081,8 @@ static
 FOREIGN_KEY_INFO*
 get_foreign_key_info(
 /*=================*/
-	THD*			thd,	/*!< in: user thread handle */
-	dict_foreign_t*		foreign)/*!< in: foreign key constraint */
+	THD*		thd,	/*!< in: user thread handle */
+	dict_foreign_t* foreign)/*!< in: foreign key constraint */
 {
 	FOREIGN_KEY_INFO	f_key_info;
 	FOREIGN_KEY_INFO*	pf_key_info;
@@ -15070,8 +15095,8 @@ get_foreign_key_info(
 	LEX_CSTRING*		name = NULL;
 
 	if (dict_table_t::is_temporary_name(foreign->foreign_table_name)) {
-		return NULL;
-	}
+ 		return NULL;
+ 	}
 
 	ptr = dict_remove_db_name(foreign->id);
 	f_key_info.foreign_id = thd_make_lex_string(
@@ -15082,7 +15107,7 @@ get_foreign_key_info(
 	/* Referenced (parent) database name */
 	len = dict_get_db_name_len(foreign->referenced_table_name);
 	ut_a(len < sizeof(tmp_buff));
-	ut_memcpy(tmp_buff, foreign->referenced_table_name, len);
+	memcpy(tmp_buff, foreign->referenced_table_name, len);
 	tmp_buff[len] = 0;
 
 	len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
@@ -15098,7 +15123,7 @@ get_foreign_key_info(
 	/* Dependent (child) database name */
 	len = dict_get_db_name_len(foreign->foreign_table_name);
 	ut_a(len < sizeof(tmp_buff));
-	ut_memcpy(tmp_buff, foreign->foreign_table_name, len);
+	memcpy(tmp_buff, foreign->foreign_table_name, len);
 	tmp_buff[len] = 0;
 
 	len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
@@ -15162,7 +15187,7 @@ get_foreign_key_info(
 					<< foreign->referenced_table_name
 					<< " not found for foreign table "
 					<< foreign->foreign_table_name;
-			}
+ 			}
 		} else {
 
 			dict_table_close(ref_table, TRUE, FALSE);
@@ -15464,8 +15489,6 @@ ha_innobase::start_stmt(
 
 	trx = m_prebuilt->trx;
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* Reset the AUTOINC statement level counter for multi-row INSERTs. */
 	trx->n_autoinc_rows = 0;
 
@@ -15542,7 +15565,7 @@ ha_innobase::start_stmt(
 Maps a MySQL trx isolation level code to the InnoDB isolation level code
 @return InnoDB isolation level */
 static inline
-ulint
+uint
 innobase_map_isolation_level(
 /*=========================*/
 	enum_tx_isolation	iso)	/*!< in: MySQL isolation level code */
@@ -15771,8 +15794,6 @@ ha_innobase::external_lock(
 	trx->n_mysql_tables_in_use--;
 	m_mysql_has_locked = false;
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* If the MySQL lock count drops to zero we know that the current SQL
 	statement has ended */
 
@@ -15847,10 +15868,6 @@ innodb_show_status(
 
 	srv_wake_purge_thread_if_not_active();
 
-	trx_t*	trx = check_trx_exists(thd);
-
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	/* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE
 	bytes of text. */
 
@@ -15882,7 +15899,7 @@ innodb_show_status(
 	/* allocate buffer for the string, and
 	read the contents of the temporary file */
 
-	if (!(str = (char*) my_malloc(//PSI_INSTRUMENT_ME,
+	if (!(str = (char*) my_malloc(PSI_INSTRUMENT_ME,
 		      usable_len + 1, MYF(0)))) {
 		mutex_exit(&srv_monitor_file_mutex);
 		DBUG_RETURN(1);
@@ -16151,7 +16168,7 @@ innodb_show_rwlock_status(
 {
 	DBUG_ENTER("innodb_show_rwlock_status");
 
-	rw_lock_t*	block_rwlock = NULL;
+	const rw_lock_t* block_rwlock= nullptr;
 	ulint		block_rwlock_oswait_count = 0;
 	uint		hton_name_len = (uint) strlen(innobase_hton_name);
 
@@ -16159,36 +16176,34 @@ innodb_show_rwlock_status(
 
 	mutex_enter(&rw_lock_list_mutex);
 
-	for (rw_lock_t* rw_lock = UT_LIST_GET_FIRST(rw_lock_list);
-	     rw_lock != NULL;
-	     rw_lock = UT_LIST_GET_NEXT(list, rw_lock)) {
+	for (const rw_lock_t& rw_lock : rw_lock_list) {
 
-		if (rw_lock->count_os_wait == 0) {
+		if (rw_lock.count_os_wait == 0) {
 			continue;
 		}
 
 		int		buf1len;
 		char		buf1[IO_SIZE];
 
-		if (rw_lock->is_block_lock) {
+		if (rw_lock.is_block_lock) {
 
-			block_rwlock = rw_lock;
-			block_rwlock_oswait_count += rw_lock->count_os_wait;
+			block_rwlock = &rw_lock;
+			block_rwlock_oswait_count += rw_lock.count_os_wait;
 
 			continue;
 		}
 
 		buf1len = snprintf(
 			buf1, sizeof buf1, "rwlock: %s:%u",
-			innobase_basename(rw_lock->cfile_name),
-			rw_lock->cline);
+			innobase_basename(rw_lock.cfile_name),
+			rw_lock.cline);
 
 		int		buf2len;
 		char		buf2[IO_SIZE];
 
 		buf2len = snprintf(
 			buf2, sizeof buf2, "waits=%u",
-			rw_lock->count_os_wait);
+			rw_lock.count_os_wait);
 
 		if (stat_print(thd, innobase_hton_name,
 			       hton_name_len,
@@ -16286,6 +16301,7 @@ innobase_show_status(
 	/* Success */
 	return(false);
 }
+
 /*********************************************************************//**
 Returns number of THR_LOCK locks used for one instance of InnoDB table.
 InnoDB no longer relies on THR_LOCK locks so 0 value is returned.
@@ -16440,9 +16456,7 @@ ha_innobase::store_lock(
 		if (sql_command == SQLCOM_CHECKSUM
 		    || sql_command == SQLCOM_CREATE_SEQUENCE
 		    || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ)
-		    || ((srv_locks_unsafe_for_binlog
-			|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
-			&& trx->isolation_level != TRX_ISO_SERIALIZABLE
+		    || (trx->isolation_level <= TRX_ISO_READ_COMMITTED
 			&& (lock_type == TL_READ
 			    || lock_type == TL_READ_NO_INSERT)
 			&& (sql_command == SQLCOM_INSERT_SELECT
@@ -16451,10 +16465,8 @@ ha_innobase::store_lock(
 			    || sql_command == SQLCOM_CREATE_SEQUENCE
 			    || sql_command == SQLCOM_CREATE_TABLE))) {
 
-			/* If we either have innobase_locks_unsafe_for_binlog
-			option set or this session is using READ COMMITTED
-			isolation level and isolation level of the transaction
-			is not set to serializable and MySQL is doing
+			/* If the transaction isolation level is
+			READ UNCOMMITTED or READ COMMITTED and we are executing
 			INSERT INTO...SELECT or REPLACE INTO...SELECT
 			or UPDATE ... = (SELECT ...) or CREATE  ...
 			SELECT... without FOR UPDATE or IN SHARE
@@ -16908,7 +16920,7 @@ innobase_get_at_most_n_mbchars(
 	character. */
 
 	if (charset->mbmaxlen > 1) {
-		/* my_charpos() returns the byte length of the first n_chars
+		/* charpos() returns the byte length of the first n_chars
 		characters, or a value bigger than the length of str, if
 		there were not enough full characters in str.
 
@@ -16926,7 +16938,7 @@ innobase_get_at_most_n_mbchars(
 		characters, and we can store in the column prefix index the
 		whole string. */
 
-		char_length= my_charpos(charset, str, str + data_len, n_chars);
+		char_length= charset->charpos(str, str + data_len, n_chars);
 		if (char_length > data_len) {
 			char_length = data_len;
 		}
@@ -16963,8 +16975,6 @@ innobase_xa_prepare(
 
 	thd_get_xid(thd, (MYSQL_XID*) trx->xid);
 
-	innobase_srv_conc_force_exit_innodb(trx);
-
 	if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
 
 		sql_print_error("Transaction not registered for MariaDB 2PC,"
@@ -17051,6 +17061,9 @@ innobase_commit_by_xid(
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
+	DBUG_EXECUTE_IF("innobase_xa_fail",
+			return XAER_RMFAIL;);
+
 	if (high_level_read_only) {
 		return(XAER_RMFAIL);
 	}
@@ -17080,6 +17093,9 @@ int innobase_rollback_by_xid(handlerton* hton, XID* xid)
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
+	DBUG_EXECUTE_IF("innobase_xa_fail",
+			return XAER_RMFAIL;);
+
 	if (high_level_read_only) {
 		return(XAER_RMFAIL);
 	}
@@ -17094,7 +17110,6 @@ int innobase_rollback_by_xid(handlerton* hton, XID* xid)
 		}
 #endif /* WITH_WSREP */
 		int ret = innobase_rollback_trx(trx);
-		trx_deregister_from_2pc(trx);
 		ut_ad(!trx->will_lock);
 		trx->free();
 
@@ -17246,6 +17261,12 @@ innodb_max_dirty_pages_pct_update(
 	}
 
 	srv_max_buf_pool_modified_pct = in_val;
+
+	mysql_mutex_unlock(&LOCK_global_system_variables);
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	buf_pool.page_cleaner_wakeup();
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+	mysql_mutex_lock(&LOCK_global_system_variables);
 }
 
 /****************************************************************//**
@@ -17276,6 +17297,12 @@ innodb_max_dirty_pages_pct_lwm_update(
 	}
 
 	srv_max_dirty_pages_pct_lwm = in_val;
+
+	mysql_mutex_unlock(&LOCK_global_system_variables);
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	buf_pool.page_cleaner_wakeup();
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+	mysql_mutex_lock(&LOCK_global_system_variables);
 }
 
 /*************************************************************//**
@@ -17300,7 +17327,7 @@ fast_shutdown_validate(
 	uint new_val = *reinterpret_cast<uint*>(save);
 
 	if (srv_fast_shutdown && !new_val
-	    && !srv_running.load(std::memory_order_relaxed)) {
+	    && !srv_read_only_mode && abort_loop) {
 		return(1);
 	}
 
@@ -17356,6 +17383,8 @@ innodb_stopword_table_validate(
 	return(ret);
 }
 
+extern void buf_resize_start();
+
 /** Update the system variable innodb_buffer_pool_size using the "saved"
 value. This function is registered as a callback with MySQL.
 @param[in]	save	immediate result from check function */
@@ -17369,7 +17398,7 @@ innodb_buffer_pool_size_update(THD*,st_mysql_sys_var*,void*, const void* save)
 	        sizeof(export_vars.innodb_buffer_pool_resize_status),
 		"Requested to resize buffer pool.");
 
-	os_event_set(srv_buf_resize_event);
+	buf_resize_start();
 
 	ib::info() << export_vars.innodb_buffer_pool_resize_status
 		<< " (new size: " << in_val << " bytes)";
@@ -17484,20 +17513,8 @@ innodb_change_buffer_max_size_update(THD*, st_mysql_sys_var*, void*,
 }
 
 #ifdef UNIV_DEBUG
-static ulong srv_fil_make_page_dirty_debug = 0;
-static ulong srv_saved_page_number_debug = 0;
-
-/****************************************************************//**
-Save an InnoDB page number. */
-static
-void
-innodb_save_page_no(THD*, st_mysql_sys_var*, void*, const void* save)
-{
-	srv_saved_page_number_debug = *static_cast<const ulong*>(save);
-
-	ib::info() << "Saving InnoDB page number: "
-		<< srv_saved_page_number_debug;
-}
+static uint srv_fil_make_page_dirty_debug = 0;
+static uint srv_saved_page_number_debug;
 
 /****************************************************************//**
 Make the first page of given user tablespace dirty. */
@@ -17506,9 +17523,9 @@ void
 innodb_make_page_dirty(THD*, st_mysql_sys_var*, void*, const void* save)
 {
 	mtr_t		mtr;
-	ulong		space_id = *static_cast<const ulong*>(save);
+	uint		space_id = *static_cast<const uint*>(save);
 	mysql_mutex_unlock(&LOCK_global_system_variables);
-	fil_space_t*	space = fil_space_acquire_silent(space_id);
+	fil_space_t*	space = fil_space_t::get(space_id);
 
 	if (space == NULL) {
 func_exit_no_space:
@@ -17530,45 +17547,16 @@ func_exit:
 		space->zip_size(), RW_X_LATCH, &mtr);
 
 	if (block != NULL) {
-		byte*	page = block->frame;
-
-		ib::info() << "Dirtying page: " << page_id_t(
-			page_get_space_id(page), page_get_page_no(page));
-
-		mlog_write_ulint(page + FIL_PAGE_TYPE,
-				 fil_page_get_type(page),
-				 MLOG_2BYTES, &mtr);
+		ib::info() << "Dirtying page: " << block->page.id();
+		mtr.write<1,mtr_t::FORCED>(*block,
+					   block->frame + FIL_PAGE_SPACE_ID,
+					   block->frame[FIL_PAGE_SPACE_ID]);
 	}
 	mtr.commit();
+	log_write_up_to(mtr.commit_lsn(), true);
 	goto func_exit;
 }
 #endif // UNIV_DEBUG
-/*************************************************************//**
-Just emit a warning that the usage of the variable is deprecated.
-@return 0 */
-static
-void
-innodb_stats_sample_pages_update(
-/*=============================*/
-	THD*				thd,	/*!< in: thread handle */
-	st_mysql_sys_var*, void*,
-	const void*			save)	/*!< in: immediate result
-						from check function */
-{
-
-	const char*	STATS_SAMPLE_PAGES_DEPRECATED_MSG =
-		"Using innodb_stats_sample_pages is deprecated and"
-		" the variable may be removed in future releases."
-		" Please use innodb_stats_transient_sample_pages instead.";
-
-	push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
-		     HA_ERR_WRONG_COMMAND, STATS_SAMPLE_PAGES_DEPRECATED_MSG);
-
-	ib::warn() << STATS_SAMPLE_PAGES_DEPRECATED_MSG;
-
-	srv_stats_transient_sample_pages =
-		*static_cast<const unsigned long long*>(save);
-}
 
 /****************************************************************//**
 Update the monitor counter according to the "set_option",  turn
@@ -17848,7 +17836,7 @@ innodb_monitor_validate(
 	by InnoDB, so we can access it in another callback
 	function innodb_monitor_update() and free it appropriately */
 	if (name) {
-		monitor_name = my_strdup(//PSI_INSTRUMENT_ME,
+		monitor_name = my_strdup(PSI_INSTRUMENT_ME,
                                          name, MYF(0));
 	} else {
 		return(1);
@@ -18027,35 +18015,27 @@ static bool innodb_buffer_pool_evict_uncompressed()
 {
 	bool	all_evicted = true;
 
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool = &buf_pool_ptr[i];
+	mysql_mutex_lock(&buf_pool.mutex);
 
-		buf_pool_mutex_enter(buf_pool);
+	for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+	     block != NULL; ) {
+		buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+		ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+		ut_ad(block->in_unzip_LRU_list);
+		ut_ad(block->page.in_LRU_list);
 
-		for (buf_block_t* block = UT_LIST_GET_LAST(
-			     buf_pool->unzip_LRU);
-		     block != NULL; ) {
-			buf_block_t*	prev_block = UT_LIST_GET_PREV(
-				unzip_LRU, block);
-			ut_ad(buf_block_get_state(block)
-			      == BUF_BLOCK_FILE_PAGE);
-			ut_ad(block->in_unzip_LRU_list);
-			ut_ad(block->page.in_LRU_list);
-
-			if (!buf_LRU_free_page(&block->page, false)) {
-				all_evicted = false;
-				block = prev_block;
-			} else {
-				/* Because buf_LRU_free_page() may release
-				and reacquire buf_pool_t::mutex, prev_block
-				may be invalid. */
-				block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
-			}
+		if (!buf_LRU_free_page(&block->page, false)) {
+			all_evicted = false;
+			block = prev_block;
+		} else {
+			/* Because buf_LRU_free_page() may release
+			and reacquire buf_pool.mutex, prev_block
+			may be invalid. */
+			block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
 		}
-
-		buf_pool_mutex_exit(buf_pool);
 	}
 
+	mysql_mutex_unlock(&buf_pool.mutex);
 	return(all_evicted);
 }
 
@@ -18350,18 +18330,17 @@ checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
 	if (*(my_bool*) save) {
 		mysql_mutex_unlock(&LOCK_global_system_variables);
 
-		while (log_sys.last_checkpoint_lsn
-		       + SIZE_OF_MLOG_CHECKPOINT
-		       + (log_sys.append_on_checkpoint != NULL
-			  ? log_sys.append_on_checkpoint->size() : 0)
-		       < log_sys.lsn) {
+		lsn_t lsn;
+
+		while (log_sys.last_checkpoint_lsn.load(
+			       std::memory_order_acquire)
+		       + SIZE_OF_FILE_CHECKPOINT
+		       < (lsn= log_sys.get_lsn(std::memory_order_acquire))) {
 			log_make_checkpoint();
-			fil_flush_file_spaces(FIL_TYPE_LOG);
+			log_sys.log.flush();
 		}
 
-		dberr_t err = fil_write_flushed_lsn(log_sys.lsn);
-
-		if (err != DB_SUCCESS) {
+		if (dberr_t err = fil_write_flushed_lsn(lsn)) {
 			ib::warn() << "Checkpoint set failed " << err;
 		}
 
@@ -18377,7 +18356,7 @@ buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
 {
 	if (*(my_bool*) save) {
 		mysql_mutex_unlock(&LOCK_global_system_variables);
-		buf_flush_sync_all_buf_pools();
+		buf_flush_sync();
 		mysql_mutex_lock(&LOCK_global_system_variables);
 	}
 }
@@ -18566,12 +18545,12 @@ static
 void
 innodb_status_output_update(THD*,st_mysql_sys_var*,void*var,const void*save)
 {
-  *static_cast<my_bool*>(var)= *static_cast<const my_bool*>(save);
-  if (srv_monitor_event)
+  if (srv_monitor_timer)
   {
+    *static_cast<my_bool*>(var)= *static_cast<const my_bool*>(save);
     mysql_mutex_unlock(&LOCK_global_system_variables);
-    /* Wakeup server monitor thread. */
-    os_event_set(srv_monitor_event);
+    /* Wakeup server monitor. */
+    srv_monitor_timer_schedule_now();
     mysql_mutex_lock(&LOCK_global_system_variables);
   }
 }
@@ -18622,45 +18601,106 @@ innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save)
 	mysql_mutex_lock(&LOCK_global_system_variables);
 }
 
-/** Update the innodb_log_checksums parameter.
-@param[in,out]	thd	client connection
-@param[out]	var_ptr	current value
-@param[in]	save	immediate result from check function */
-static
-void
-innodb_log_checksums_update(THD* thd, st_mysql_sys_var*, void* var_ptr,
-			    const void* save)
+/** Issue a deprecation warning for SET GLOBAL innodb_log_checksums.
+@param[in,out]	thd	client connection */
+static void
+innodb_log_checksums_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
 {
-	*static_cast<my_bool*>(var_ptr) = innodb_log_checksums_func_update(
-		thd, *static_cast<const my_bool*>(save));
+	/* No format arguments are passed: push the message verbatim. */
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		     deprecated::innodb_log_checksums_msg);
 }
 
-#ifdef UNIV_DEBUG
-static
-void
-innobase_debug_sync_callback(srv_slot_t *slot, const void *value)
+/** Issue a deprecation warning for SET GLOBAL innodb_log_compressed_pages.
+@param[in,out]	thd	client connection */
+static void
+innodb_log_compressed_pages_warn(THD* thd, st_mysql_sys_var*, void*,
+				 const void*)
+{
+	/* No format arguments are passed: push the message verbatim. */
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		     deprecated::innodb_log_compressed_pages_msg);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_log_optimize_ddl.
+@param[in,out]	thd	client connection */
+static void
+innodb_log_optimize_ddl_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
 {
-	const char *value_str = *static_cast<const char* const*>(value);
-	size_t len = strlen(value_str) + 1;
+	/* No format arguments are passed: push the message verbatim. */
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		     deprecated::innodb_log_optimize_ddl_msg);
+}
 
+/** Issue a deprecation warning for SET GLOBAL innodb_page_cleaners.
+@param[in,out]	thd	client connection */
+static void
+innodb_page_cleaners_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+	/* No format arguments are passed: push the message verbatim. */
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		     deprecated::innodb_page_cleaners_msg);
+}
 
-	// One allocation for list node object and value.
-	void *buf = ut_malloc_nokey(sizeof(srv_slot_t::debug_sync_t) + len-1);
-	srv_slot_t::debug_sync_t *sync = new(buf) srv_slot_t::debug_sync_t();
-	strcpy(sync->str, value_str);
+/** Issue a deprecation warning for SET GLOBAL innodb_undo_logs.
+@param[in,out]	thd	client connection */
+static void
+innodb_undo_logs_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+	/* No format arguments are passed: push the message verbatim. */
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		     deprecated::innodb_undo_logs_msg);
+}
 
-	rw_lock_x_lock(&slot->debug_sync_lock);
-	UT_LIST_ADD_LAST(slot->debug_sync, sync);
-	rw_lock_x_unlock(&slot->debug_sync_lock);
+/** Issue a deprecation warning for SET GLOBAL innodb_scrub_log_speed.
+@param[in,out]	thd	client connection */
+static void
+innodb_scrub_log_speed_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+	/* No format arguments are passed: push the message verbatim. */
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		     deprecated::innodb_scrub_log_speed_msg);
 }
-static
-void
-innobase_debug_sync_set(THD *thd, st_mysql_sys_var*, void *, const void *value)
+
+/** Push a deprecation warning for innodb_background_scrub_data_uncompressed.
+@param[in,out]	thd	client connection */
+static void
+innodb_background_scrub_data_uncompressed_warn(THD* thd, st_mysql_sys_var*,
+					       void*, const void*)
 {
-	srv_for_each_thread(SRV_WORKER, innobase_debug_sync_callback, value);
-	srv_for_each_thread(SRV_PURGE, innobase_debug_sync_callback, value);
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		deprecated::innodb_background_scrub_data_uncompressed_msg);
+}
+
+/** Push a deprecation warning for innodb_background_scrub_data_compressed.
+@param[in,out]	thd	client connection */
+static void
+innodb_background_scrub_data_compressed_warn(THD* thd, st_mysql_sys_var*,
+					     void*, const void*)
+{
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		deprecated::innodb_background_scrub_data_compressed_msg);
+}
+
+/** Push a deprecation warning for innodb_background_scrub_data_check_interval.
+@param[in,out]	thd	client connection */
+static void
+innodb_background_scrub_data_check_interval_warn(
+	THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		deprecated::innodb_background_scrub_data_check_interval_msg);
+}
+
+/** Push a deprecation warning for innodb_background_scrub_data_interval.
+@param[in,out]	thd	client connection */
+static void
+innodb_background_scrub_data_interval_warn(
+	THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+		deprecated::innodb_background_scrub_data_interval_msg);
 }
-#endif
 
 static SHOW_VAR innodb_status_variables_export[]= {
 	{"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
@@ -18838,7 +18878,6 @@ wsrep_abort_transaction(
     wsrep_kill_victim(bf_thd, victim_thd, victim_trx, signal);
     lock_mutex_exit();
     trx_mutex_exit(victim_trx);
-    wsrep_srv_conc_cancel_wait(victim_trx);
   }
   else
   {
@@ -18878,14 +18917,6 @@ innobase_wsrep_get_checkpoint(
 }
 #endif /* WITH_WSREP */
 
-static void innodb_idle_flush_pct_update(THD *thd, st_mysql_sys_var *var,
-                                         void*, const void *save)
-{
-  innodb_idle_flush_pct = *static_cast<const ulong*>(save);
-  push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
-               HA_ERR_WRONG_COMMAND, deprecated_idle_flush_pct);
-}
-
 /* plugin options */
 
 static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
@@ -18914,21 +18945,16 @@ static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
   " Files updated when this option is set to crc32 or strict_crc32 will"
   " not be readable by MariaDB versions older than 10.0.4;"
   " new files created with full_crc32 are readable by MariaDB 10.4.3+",
-  NULL, innodb_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_CRC32,
+  NULL, innodb_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
   &innodb_checksum_algorithm_typelib);
 
-static MYSQL_SYSVAR_BOOL(log_checksums, innodb_log_checksums,
-  PLUGIN_VAR_RQCMDARG,
-  "DEPRECATED. Whether to require checksums for InnoDB redo log blocks.",
-  NULL, innodb_log_checksums_update, TRUE);
+/** Description of deprecated and ignored parameters */
+static const char* innodb_deprecated_ignored
+= "Deprecated parameter with no effect.";
 
-static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "DEPRECATED. Use innodb_checksum_algorithm=NONE instead of setting"
-  " this to OFF."
-  " Enable InnoDB checksums validation (enabled by default)."
-  " Disable with --skip-innodb-checksums.",
-  NULL, NULL, TRUE);
+static MYSQL_SYSVAR_BOOL(log_checksums, deprecated::innodb_log_checksums,
+  PLUGIN_VAR_RQCMDARG,
+  innodb_deprecated_ignored, NULL, innodb_log_checksums_warn, TRUE);
 
 static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
   PLUGIN_VAR_READONLY,
@@ -18973,11 +18999,6 @@ static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
   SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100,
   SRV_MAX_IO_CAPACITY_LIMIT, 0);
 
-static MYSQL_SYSVAR_ULONG(idle_flush_pct, innodb_idle_flush_pct,
-  PLUGIN_VAR_RQCMDARG,
-  "DEPRECATED. This setting has no effect.",
-  NULL, innodb_idle_flush_pct_update, 100, 0, 100, 0);
-
 #ifdef UNIV_DEBUG
 static MYSQL_SYSVAR_BOOL(background_drop_list_empty,
   innodb_background_drop_list_empty,
@@ -19012,14 +19033,10 @@ static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
   1,			/* Minimum value */
   5000, 0);		/* Maximum value */
 
-static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+static MYSQL_SYSVAR_UINT(purge_threads, srv_n_purge_threads,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Purge threads can be from 1 to 32. Default is 4.",
-  NULL, NULL,
-  4,			/* Default setting */
-  1,			/* Minimum value */
-  srv_max_purge_threads,/* Maximum value */
-  0);
+  "Number of tasks for purging transaction history",
+  NULL, NULL, 4, 1, innodb_purge_threads_MAX, 0);
 
 static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
@@ -19072,54 +19089,36 @@ static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method,
   NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_FSYNC),
   &innodb_flush_method_typelib);
 
-static MYSQL_SYSVAR_STR(file_format, innodb_file_format,
+static MYSQL_SYSVAR_STR(file_format, deprecated::innodb_file_format,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Deprecated parameter with no effect.", NULL, NULL, NULL);
-
-static MYSQL_SYSVAR_STR(large_prefix, innodb_large_prefix,
+  innodb_deprecated_ignored, NULL, NULL, NULL);
+static MYSQL_SYSVAR_STR(large_prefix, deprecated::innodb_large_prefix,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Deprecated parameter with no effect.", NULL, NULL, NULL);
+  innodb_deprecated_ignored, NULL, NULL, NULL);
 
 static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
   "Force InnoDB to load metadata of corrupted table.",
   NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "DEPRECATED. This option may be removed in future releases."
-  " Please use READ COMMITTED transaction isolation level instead."
-  " Force InnoDB to not use next-key locking, to use only row-level locking.",
-  NULL, NULL, FALSE);
-
 static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Path to InnoDB log files.", NULL, NULL, NULL);
 
-/** Update innodb_page_cleaners.
-@param[in]	save	the new value of innodb_page_cleaners */
-static
-void
-innodb_page_cleaners_threads_update(THD*, struct st_mysql_sys_var*, void*, const void *save)
-{
-	buf_flush_set_page_cleaner_thread_cnt(*static_cast<const ulong*>(save));
-}
-
-static MYSQL_SYSVAR_ULONG(page_cleaners, srv_n_page_cleaners,
+static MYSQL_SYSVAR_ULONG(page_cleaners, deprecated::innodb_page_cleaners,
   PLUGIN_VAR_RQCMDARG,
-  "Page cleaner threads can be from 1 to 64. Default is 4.",
-  NULL,
-  innodb_page_cleaners_threads_update, 4, 1, 64, 0);
+  innodb_deprecated_ignored, NULL, innodb_page_cleaners_warn, 0, 0, 64, 0);
 
 static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
   PLUGIN_VAR_RQCMDARG,
   "Percentage of dirty pages allowed in bufferpool.",
-  NULL, innodb_max_dirty_pages_pct_update, 75.0, 0, 99.999, 0);
+  NULL, innodb_max_dirty_pages_pct_update, 90.0, 0, 99.999, 0);
 
 static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct_lwm,
   srv_max_dirty_pages_pct_lwm,
   PLUGIN_VAR_RQCMDARG,
-  "Percentage of dirty pages at which flushing kicks in.",
+  "Percentage of dirty pages at which flushing kicks in. "
+  "The value 0 (default) means 'refer to innodb_max_dirty_pages_pct'.",
   NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99.999, 0);
 
 static MYSQL_SYSVAR_DOUBLE(adaptive_flushing_lwm,
@@ -19178,11 +19177,6 @@ static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
   " SHOW TABLE STATUS for tables that use transient statistics (off by default)",
   NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_transient_sample_pages,
-  PLUGIN_VAR_RQCMDARG,
-  "Deprecated, use innodb_stats_transient_sample_pages instead",
-  NULL, innodb_stats_sample_pages_update, 8, 1, ~0ULL, 0);
-
 static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages,
   srv_stats_transient_sample_pages,
   PLUGIN_VAR_RQCMDARG,
@@ -19224,9 +19218,8 @@ static MYSQL_SYSVAR_BOOL(stats_traditional, srv_stats_sample_traditional,
 #ifdef BTR_CUR_HASH_ADAPT
 static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
   PLUGIN_VAR_OPCMDARG,
-  "Enable InnoDB adaptive hash index (enabled by default). "
-  " Disable with --skip-innodb-adaptive-hash-index.",
-  NULL, innodb_adaptive_hash_index_update, true);
+  "Enable InnoDB adaptive hash index (disabled by default).",
+  NULL, innodb_adaptive_hash_index_update, false);
 
 /** Number of distinct partitions of AHI.
 Each partition is protected by its own latch and so we have parts number
@@ -19237,11 +19230,10 @@ static MYSQL_SYSVAR_ULONG(adaptive_hash_index_parts, btr_ahi_parts,
   NULL, NULL, 8, 1, 512, 0);
 #endif /* BTR_CUR_HASH_ADAPT */
 
-static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
+static MYSQL_SYSVAR_UINT(replication_delay, deprecated::replication_delay,
   PLUGIN_VAR_RQCMDARG,
-  "Replication thread delay (ms) on the slave server if"
-  " innodb_thread_concurrency is reached (0 by default)",
-  NULL, NULL, 0, 0, ~0UL, 0);
+  innodb_deprecated_ignored, nullptr, deprecated::replication_delay_warn,
+   0, 0, ~0U, 0);
 
 static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
   PLUGIN_VAR_RQCMDARG,
@@ -19249,48 +19241,29 @@ static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
   ", 1 is fastest, 9 is best compression and default is 6.",
   NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
 
-static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages,
-       PLUGIN_VAR_OPCMDARG,
-  "Enables/disables the logging of entire compressed page images."
-  " InnoDB logs the compressed pages to prevent corruption if"
-  " the zlib compression algorithm changes."
-  " When turned OFF, InnoDB will assume that the zlib"
-  " compression algorithm doesn't change.",
-  NULL, NULL, TRUE);
+static MYSQL_SYSVAR_BOOL(log_compressed_pages,
+  deprecated::innodb_log_compressed_pages,
+  PLUGIN_VAR_OPCMDARG,
+  innodb_deprecated_ignored, NULL, innodb_log_compressed_pages_warn, TRUE);
 
-static MYSQL_SYSVAR_BOOL(log_optimize_ddl, innodb_log_optimize_ddl,
+static MYSQL_SYSVAR_BOOL(log_optimize_ddl, deprecated::innodb_log_optimize_ddl,
   PLUGIN_VAR_OPCMDARG,
-  "DEPRECATED. Ignored in MariaDB 10.5."
-  " Reduce redo logging when natively creating indexes or rebuilding tables."
-  " Enabling this may slow down backup and cause delay due to page flushing.",
-  NULL, NULL, FALSE);
+  innodb_deprecated_ignored, NULL, innodb_log_optimize_ddl_warn, FALSE);
 
-static MYSQL_SYSVAR_ULONG(autoextend_increment,
+static MYSQL_SYSVAR_UINT(autoextend_increment,
   sys_tablespace_auto_extend_increment,
   PLUGIN_VAR_RQCMDARG,
   "Data file autoextend increment in megabytes",
-  NULL, NULL, 64L, 1L, 1000L, 0);
+  NULL, NULL, 64, 1, 1000, 0);
 
 static MYSQL_SYSVAR_ULONG(buffer_pool_chunk_size, srv_buf_pool_chunk_unit,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Size of a single memory chunk within each buffer pool instance"
+  "Size of a single memory chunk"
   " for resizing buffer pool. Online buffer pool resizing happens"
   " at this granularity. 0 means disable resizing buffer pool.",
   NULL, NULL,
   128 * 1024 * 1024, 1024 * 1024, LONG_MAX, 1024 * 1024);
 
-#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
-static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks,
-  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2",
-  NULL, NULL, 16, 1, MAX_PAGE_HASH_LOCKS, 0);
-
-static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
-  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Number of pages reserved in doublewrite buffer for batch flushing",
-  NULL, NULL, 120, 1, 127, 0);
-#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
-
 static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "The algorithm Innodb uses for deciding which locks to grant next when"
@@ -19303,10 +19276,10 @@ static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm
   NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
   &innodb_lock_schedule_algorithm_typelib);
 
-static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances,
+static MYSQL_SYSVAR_ULONG(buffer_pool_instances,
+  deprecated::innodb_buffer_pool_instances,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
-  NULL, NULL, srv_buf_pool_instances_default, 0, MAX_BUFFER_POOLS, 0);
+  innodb_deprecated_ignored, NULL, NULL, 0, 0, 64, 0);
 
 static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
@@ -19414,7 +19387,12 @@ static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
 static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
   PLUGIN_VAR_RQCMDARG,
   "How deep to scan LRU to keep it clean",
-  NULL, NULL, 1024, 100, ~0UL, 0);
+  NULL, NULL, 1536, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size,
+  PLUGIN_VAR_RQCMDARG,
+  "How many pages to flush on LRU eviction",
+  NULL, NULL, 32, 1, SIZE_T_MAX, 0);
 
 static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
   PLUGIN_VAR_OPCMDARG,
@@ -19424,15 +19402,15 @@ static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
   " when flushing a block",
   NULL, NULL, 1, 0, 2, 0);
 
-static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
+static MYSQL_SYSVAR_UINT(commit_concurrency, deprecated::commit_concurrency,
   PLUGIN_VAR_RQCMDARG,
-  "Helps in performance tuning in heavily concurrent environments.",
-  innobase_commit_concurrency_validate, NULL, 0, 0, 1000, 0);
+  innodb_deprecated_ignored, nullptr, deprecated::commit_concurrency_warn,
+  0, 0, 1000, 0);
 
-static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
+static MYSQL_SYSVAR_UINT(concurrency_tickets, deprecated::concurrency_tickets,
   PLUGIN_VAR_RQCMDARG,
-  "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
-  NULL, NULL, 5000L, 1L, ~0UL, 0);
+  innodb_deprecated_ignored, nullptr, deprecated::concurrency_tickets_warn,
+  0, 0, ~0U, 0);
 
 static MYSQL_SYSVAR_BOOL(deadlock_detect, innobase_deadlock_detect,
   PLUGIN_VAR_NOCMDARG,
@@ -19531,19 +19509,19 @@ static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only,
   "Only optimize the Fulltext index of the table",
   NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_ULONG(read_io_threads, srv_n_read_io_threads,
+static MYSQL_SYSVAR_UINT(read_io_threads, srv_n_read_io_threads,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of background read I/O threads in InnoDB.",
   NULL, NULL, 4, 1, 64, 0);
 
-static MYSQL_SYSVAR_ULONG(write_io_threads, srv_n_write_io_threads,
+static MYSQL_SYSVAR_UINT(write_io_threads, srv_n_write_io_threads,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of background write I/O threads in InnoDB.",
-  NULL, NULL, 4, 1, 64, 0);
+  NULL, NULL, 4, 2, 64, 0);
 
 static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Helps to save your data in case the disk image of the database becomes corrupt.",
+  "Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data.",
   NULL, NULL, 0, 0, 6, 0);
 
 static MYSQL_SYSVAR_ULONG(page_size, srv_page_size,
@@ -19560,14 +19538,12 @@ static MYSQL_SYSVAR_ULONG(log_buffer_size, srv_log_buffer_size,
 static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Size of each log file in a log group.",
-  NULL, NULL, 48 << 20, 1 << 20, log_group_max_size, UNIV_PAGE_SIZE_MAX);
-/* OS_FILE_LOG_BLOCK_SIZE would be more appropriate than UNIV_PAGE_SIZE_MAX,
-but fil_space_t is being used for the redo log, and it uses data pages. */
+  NULL, NULL, 96 << 20, 1 << 20, std::numeric_limits<ulonglong>::max(),
+  UNIV_PAGE_SIZE_MAX);
 
-static MYSQL_SYSVAR_ULONG(log_files_in_group, srv_n_log_files,
+static MYSQL_SYSVAR_ULONG(log_files_in_group, deprecated::srv_n_log_files,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Number of log files in the log group. InnoDB writes to the files in a circular fashion.",
-  NULL, NULL, 2, 1, SRV_N_LOG_FILES_MAX, 0);
+  innodb_deprecated_ignored, NULL, NULL, 1, 1, 100, 0);
 
 static MYSQL_SYSVAR_ULONG(log_write_ahead_size, srv_log_write_ahead_size,
   PLUGIN_VAR_RQCMDARG,
@@ -19603,19 +19579,16 @@ static MYSQL_SYSVAR_UINT(spin_wait_delay, srv_spin_wait_delay,
   "Maximum delay between polling for a spin lock (4 by default)",
   NULL, NULL, 4, 0, 6000, 0);
 
-static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency,
+static MYSQL_SYSVAR_UINT(thread_concurrency, deprecated::thread_concurrency,
   PLUGIN_VAR_RQCMDARG,
-  "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.",
-  NULL, NULL, 0, 0, 1000, 0);
+  innodb_deprecated_ignored, nullptr, deprecated::thread_concurrency_warn,
+  0, 0, 1000, 0);
 
-static MYSQL_SYSVAR_ULONG(
-  adaptive_max_sleep_delay, srv_adaptive_max_sleep_delay,
+static MYSQL_SYSVAR_UINT(
+  adaptive_max_sleep_delay, deprecated::adaptive_max_sleep_delay,
   PLUGIN_VAR_RQCMDARG,
-  "The upper limit of the sleep delay in usec. Value of 0 disables it.",
-  NULL, NULL,
-  150000,			/* Default setting */
-  0,				/* Minimum value */
-  1000000, 0);			/* Maximum value */
+  innodb_deprecated_ignored,
+  nullptr, deprecated::adaptive_max_sleep_delay_warn, 0, 0, 1000000, 0);
 
 static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization,
   srv_prefix_index_cluster_optimization,
@@ -19623,14 +19596,10 @@ static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization,
   "Enable prefix optimization to sometimes avoid cluster index lookups.",
   NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
+static MYSQL_SYSVAR_UINT(thread_sleep_delay, deprecated::thread_sleep_delay,
   PLUGIN_VAR_RQCMDARG,
-  "Time of innodb thread sleeping before joining InnoDB queue (usec)."
-  " Value 0 disable a sleep",
-  NULL, NULL,
-  10000L,
-  0L,
-  1000000L, 0);
+  innodb_deprecated_ignored, nullptr, deprecated::thread_sleep_delay_warn,
+  0, 0, 1000000, 0);
 
 static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
@@ -19655,13 +19624,10 @@ static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
   0L,			/* Minimum value */
   TRX_SYS_MAX_UNDO_SPACES, 0); /* Maximum value */
 
-static MYSQL_SYSVAR_ULONG(undo_logs, srv_undo_logs,
+static MYSQL_SYSVAR_ULONG(undo_logs, deprecated::innodb_undo_logs,
   PLUGIN_VAR_OPCMDARG,
-  "Number of undo logs to use.",
-  NULL, NULL,
-  TRX_SYS_N_RSEGS,	/* Default setting */
-  1,			/* Minimum value */
-  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+  innodb_deprecated_ignored, NULL, innodb_undo_logs_warn,
+  TRX_SYS_N_RSEGS, 0, TRX_SYS_N_RSEGS, 0);
 
 static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size,
   PLUGIN_VAR_OPCMDARG,
@@ -19682,15 +19648,6 @@ static MYSQL_SYSVAR_BOOL(undo_log_truncate, srv_undo_log_truncate,
   "Enable or Disable Truncate of UNDO tablespace.",
   NULL, NULL, FALSE);
 
-/* Alias for innodb_undo_logs, this config variable is deprecated. */
-static MYSQL_SYSVAR_ULONG(rollback_segments, srv_undo_logs,
-  PLUGIN_VAR_OPCMDARG,
-  "Number of undo logs to use (deprecated).",
-  NULL, NULL,
-  TRX_SYS_N_RSEGS,	/* Default setting */
-  1,			/* Minimum value */
-  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
-
 static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "The AUTOINC lock modes supported by InnoDB:"
@@ -19721,7 +19678,7 @@ static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave,
 static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering,
   PLUGIN_VAR_RQCMDARG,
   "Buffer changes to secondary indexes.",
-  NULL, NULL, IBUF_USE_ALL, &innodb_change_buffering_typelib);
+  NULL, NULL, IBUF_USE_NONE, &innodb_change_buffering_typelib);
 
 static MYSQL_SYSVAR_UINT(change_buffer_max_size,
   srv_change_buffer_max_size,
@@ -19748,12 +19705,6 @@ static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
   PLUGIN_VAR_RQCMDARG,
   "Debug flags for InnoDB change buffering (0=none, 1=try to buffer)",
   NULL, NULL, 0, 0, 1, 0);
-
-static MYSQL_SYSVAR_BOOL(disable_background_merge,
-  srv_ibuf_disable_background_merge,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_RQCMDARG,
-  "Disable change buffering merges by the master thread",
-  NULL, NULL, FALSE);
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
 static MYSQL_SYSVAR_ULONG(buf_dump_status_frequency, srv_buf_dump_status_frequency,
@@ -19874,15 +19825,15 @@ static MYSQL_SYSVAR_UINT(data_file_size_debug,
   "InnoDB system tablespace size to be set in recovery.",
   NULL, NULL, 0, 0, 256U << 20, 0);
 
-static MYSQL_SYSVAR_ULONG(fil_make_page_dirty_debug,
+static MYSQL_SYSVAR_UINT(fil_make_page_dirty_debug,
   srv_fil_make_page_dirty_debug, PLUGIN_VAR_OPCMDARG,
   "Make the first page of the given tablespace dirty.",
   NULL, innodb_make_page_dirty, 0, 0, UINT_MAX32, 0);
 
-static MYSQL_SYSVAR_ULONG(saved_page_number_debug,
+static MYSQL_SYSVAR_UINT(saved_page_number_debug,
   srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG,
   "An InnoDB page number.",
-  NULL, innodb_save_page_no, 0, 0, UINT_MAX32, 0);
+  NULL, NULL, 0, 0, UINT_MAX32, 0);
 
 static MYSQL_SYSVAR_BOOL(disable_resize_buffer_pool_debug,
   buf_disable_resize_buffer_pool_debug, PLUGIN_VAR_NOCMDARG,
@@ -19890,10 +19841,9 @@ static MYSQL_SYSVAR_BOOL(disable_resize_buffer_pool_debug,
   NULL, NULL, TRUE);
 
 static MYSQL_SYSVAR_BOOL(page_cleaner_disabled_debug,
-  innodb_page_cleaner_disabled_debug,
-  PLUGIN_VAR_OPCMDARG,
+  innodb_page_cleaner_disabled_debug, PLUGIN_VAR_OPCMDARG,
   "Disable page cleaner",
-  NULL, buf_flush_page_cleaner_disabled_debug_update, FALSE);
+  NULL, NULL, FALSE);
 
 static MYSQL_SYSVAR_BOOL(sync_debug, srv_sync_debug,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
@@ -19916,7 +19866,7 @@ static MYSQL_SYSVAR_BOOL(master_thread_disabled_debug,
 static MYSQL_SYSVAR_BOOL(force_primary_key,
   srv_force_primary_key,
   PLUGIN_VAR_OPCMDARG,
-  "Do not allow to create table without primary key (off by default)",
+  "Do not allow creating a table without primary key (off by default)",
   NULL, NULL, FALSE);
 
 static const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", "snappy", 0 };
@@ -19960,8 +19910,7 @@ static MYSQL_SYSVAR_ENUM(encrypt_tables, srv_encrypt_tables,
 
 static MYSQL_SYSVAR_UINT(encryption_threads, srv_n_fil_crypt_threads,
 			 PLUGIN_VAR_RQCMDARG,
-			 "Number of threads performing background key rotation and "
-			 "scrubbing",
+			 "Number of threads performing background key rotation ",
 			 NULL,
 			 innodb_encryption_threads_update,
 			 0, 0, 255, 0);
@@ -19984,18 +19933,15 @@ static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops,
 			 innodb_encryption_rotation_iops_update,
 			 srv_n_fil_crypt_iops, 0, UINT_MAX32, 0);
 
-static MYSQL_SYSVAR_BOOL(scrub_log, srv_scrub_log,
+static MYSQL_SYSVAR_BOOL(scrub_log, deprecated::innodb_scrub_log,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Enable background redo log (ib_logfile0, ib_logfile1...) scrubbing",
+  innodb_deprecated_ignored,
   0, 0, 0);
 
-static MYSQL_SYSVAR_ULONGLONG(scrub_log_speed, innodb_scrub_log_speed,
+static MYSQL_SYSVAR_ULONGLONG(scrub_log_speed, deprecated::innodb_scrub_log_speed,
   PLUGIN_VAR_OPCMDARG,
-  "Background redo log scrubbing speed in bytes/sec",
-  NULL, NULL,
-  256,              /* 256 bytes/sec, corresponds to 2000 ms scrub_log_interval */
-  1,                /* min */
-  50000, 0);        /* 50Kbyte/sec, corresponds to 10 ms scrub_log_interval */
+  innodb_deprecated_ignored, NULL, innodb_scrub_log_speed_warn,
+  256, 1, 50000, 0);
 
 static MYSQL_SYSVAR_BOOL(encrypt_log, srv_encrypt_log,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
@@ -20009,57 +19955,24 @@ static MYSQL_SYSVAR_BOOL(immediate_scrub_data_uncompressed,
 			 NULL, NULL, FALSE);
 
 static MYSQL_SYSVAR_BOOL(background_scrub_data_uncompressed,
-			 srv_background_scrub_data_uncompressed,
-			 0,
-			 "Enable scrubbing of uncompressed data by "
-			 "background threads (same as encryption_threads)",
-			 NULL, NULL, FALSE);
+  deprecated::innodb_background_scrub_data_uncompressed,
+  PLUGIN_VAR_OPCMDARG, innodb_deprecated_ignored, NULL,
+  innodb_background_scrub_data_uncompressed_warn, FALSE);
 
 static MYSQL_SYSVAR_BOOL(background_scrub_data_compressed,
-			 srv_background_scrub_data_compressed,
-			 0,
-			 "Enable scrubbing of compressed data by "
-			 "background threads (same as encryption_threads)",
-			 NULL, NULL, FALSE);
+  deprecated::innodb_background_scrub_data_compressed,
+  PLUGIN_VAR_OPCMDARG, innodb_deprecated_ignored, NULL,
+  innodb_background_scrub_data_compressed_warn, FALSE);
 
 static MYSQL_SYSVAR_UINT(background_scrub_data_check_interval,
-			 srv_background_scrub_data_check_interval,
-			 0,
-			 "check if spaces needs scrubbing every "
-			 "innodb_background_scrub_data_check_interval "
-			 "seconds",
-			 NULL, NULL,
-			 srv_background_scrub_data_check_interval,
-			 1,
-			 UINT_MAX32, 0);
+  deprecated::innodb_background_scrub_data_check_interval,
+  0, innodb_deprecated_ignored, NULL,
+  innodb_background_scrub_data_check_interval_warn, 0, 0, 0, 0);
 
 static MYSQL_SYSVAR_UINT(background_scrub_data_interval,
-			 srv_background_scrub_data_interval,
-			 0,
-			 "scrub spaces that were last scrubbed longer than "
-			 " innodb_background_scrub_data_interval seconds ago",
-			 NULL, NULL,
-			 srv_background_scrub_data_interval,
-			 1,
-			 UINT_MAX32, 0);
-
-#ifdef UNIV_DEBUG
-static MYSQL_SYSVAR_BOOL(debug_force_scrubbing,
-			 srv_scrub_force_testing,
-			 0,
-			 "Perform extra scrubbing to increase test exposure",
-			 NULL, NULL, FALSE);
-
-char *innobase_debug_sync;
-static MYSQL_SYSVAR_STR(debug_sync, innobase_debug_sync,
-			PLUGIN_VAR_NOCMDARG,
-			"debug_sync for innodb purge threads. "
-			"Use it to set up sync points for all purge threads "
-			"at once. The commands will be applied sequentially at"
-			" the beginning of purging the next undo record.",
-			NULL,
-			innobase_debug_sync_set, NULL);
-#endif /* UNIV_DEBUG */
+  deprecated::innodb_background_scrub_data_interval,
+  0, innodb_deprecated_ignored, NULL,
+  innodb_background_scrub_data_interval_warn, 0, 0, 0, 0);
 
 static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tables,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
@@ -20091,10 +20004,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(defragment_fill_factor_n_recs),
   MYSQL_SYSVAR(defragment_frequency),
   MYSQL_SYSVAR(lru_scan_depth),
+  MYSQL_SYSVAR(lru_flush_size),
   MYSQL_SYSVAR(flush_neighbors),
   MYSQL_SYSVAR(checksum_algorithm),
   MYSQL_SYSVAR(log_checksums),
-  MYSQL_SYSVAR(checksums),
   MYSQL_SYSVAR(commit_concurrency),
   MYSQL_SYSVAR(concurrency_tickets),
   MYSQL_SYSVAR(compression_level),
@@ -20125,7 +20038,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(large_prefix), /* deprecated in MariaDB 10.2; no effect */
   MYSQL_SYSVAR(force_load_corrupted),
   MYSQL_SYSVAR(lock_schedule_algorithm),
-  MYSQL_SYSVAR(locks_unsafe_for_binlog),
   MYSQL_SYSVAR(lock_wait_timeout),
   MYSQL_SYSVAR(deadlock_detect),
   MYSQL_SYSVAR(page_size),
@@ -20156,7 +20068,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(ft_user_stopword_table),
   MYSQL_SYSVAR(disable_sort_file_cache),
   MYSQL_SYSVAR(stats_on_metadata),
-  MYSQL_SYSVAR(stats_sample_pages),
   MYSQL_SYSVAR(stats_transient_sample_pages),
   MYSQL_SYSVAR(stats_persistent),
   MYSQL_SYSVAR(stats_persistent_sample_pages),
@@ -20192,7 +20103,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
   MYSQL_SYSVAR(change_buffer_dump),
   MYSQL_SYSVAR(change_buffering_debug),
-  MYSQL_SYSVAR(disable_background_merge),
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
   MYSQL_SYSVAR(random_read_ahead),
   MYSQL_SYSVAR(read_ahead_threshold),
@@ -20201,7 +20111,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(io_capacity),
   MYSQL_SYSVAR(io_capacity_max),
   MYSQL_SYSVAR(page_cleaners),
-  MYSQL_SYSVAR(idle_flush_pct),
   MYSQL_SYSVAR(monitor_enable),
   MYSQL_SYSVAR(monitor_disable),
   MYSQL_SYSVAR(monitor_reset),
@@ -20214,10 +20123,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(buf_flush_list_now),
   MYSQL_SYSVAR(merge_threshold_set_all_debug),
 #endif /* UNIV_DEBUG */
-#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
-  MYSQL_SYSVAR(page_hash_locks),
-  MYSQL_SYSVAR(doublewrite_batch_size),
-#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
   MYSQL_SYSVAR(status_output),
   MYSQL_SYSVAR(status_output_locks),
   MYSQL_SYSVAR(print_all_deadlocks),
@@ -20226,7 +20131,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(max_undo_log_size),
   MYSQL_SYSVAR(purge_rseg_truncate_frequency),
   MYSQL_SYSVAR(undo_log_truncate),
-  MYSQL_SYSVAR(rollback_segments),
   MYSQL_SYSVAR(undo_directory),
   MYSQL_SYSVAR(undo_tablespaces),
   MYSQL_SYSVAR(sync_array_size),
@@ -20267,10 +20171,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(background_scrub_data_compressed),
   MYSQL_SYSVAR(background_scrub_data_interval),
   MYSQL_SYSVAR(background_scrub_data_check_interval),
-#ifdef UNIV_DEBUG
-  MYSQL_SYSVAR(debug_force_scrubbing),
-  MYSQL_SYSVAR(debug_sync),
-#endif
   MYSQL_SYSVAR(buf_dump_status_frequency),
   MYSQL_SYSVAR(background_thread),
   MYSQL_SYSVAR(encrypt_temporary_tables),
@@ -20325,28 +20225,9 @@ i_s_innodb_sys_datafiles,
 i_s_innodb_sys_virtual,
 i_s_innodb_mutexes,
 i_s_innodb_sys_semaphore_waits,
-i_s_innodb_tablespaces_encryption,
-i_s_innodb_tablespaces_scrubbing
+i_s_innodb_tablespaces_encryption
 maria_declare_plugin_end;
 
-/** @brief Initialize the default value of innodb_commit_concurrency.
-
-Once InnoDB is running, the innodb_commit_concurrency must not change
-from zero to nonzero. (Bug #42101)
-
-The initial default value is 0, and without this extra initialization,
-SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
-to 0, even if it was initially set to nonzero at the command line
-or configuration file. */
-static
-void
-innobase_commit_concurrency_init_default()
-/*======================================*/
-{
-	MYSQL_SYSVAR_NAME(commit_concurrency).def_val
-		= innobase_commit_concurrency;
-}
-
 /** @brief Adjust some InnoDB startup parameters based on file contents
 or innodb_page_size. */
 static
@@ -20440,126 +20321,6 @@ ha_innobase::multi_range_read_explain_info(
 	return m_ds_mrr.dsmrr_explain_info(mrr_mode, str, size);
 }
 
-/** Parse the table file name into table name and database name.
-@param[in]	tbl_name	InnoDB table name
-@param[out]	dbname		database name buffer (NAME_LEN + 1 bytes)
-@param[out]	tblname		table name buffer (NAME_LEN + 1 bytes)
-@param[out]	dbnamelen	database name length
-@param[out]	tblnamelen	table name length
-@return true if the table name is parsed properly. */
-static bool table_name_parse(
-	const table_name_t&	tbl_name,
-	char*			dbname,
-	char*			tblname,
-	ulint&			dbnamelen,
-	ulint&			tblnamelen)
-{
-	dbnamelen = dict_get_db_name_len(tbl_name.m_name);
-	char db_buf[MAX_DATABASE_NAME_LEN  + 1];
-	char tbl_buf[MAX_TABLE_NAME_LEN + 1];
-
-	ut_ad(dbnamelen > 0);
-	ut_ad(dbnamelen <= MAX_DATABASE_NAME_LEN);
-
-	memcpy(db_buf, tbl_name.m_name, dbnamelen);
-	db_buf[dbnamelen] = 0;
-
-	tblnamelen = strlen(tbl_name.m_name + dbnamelen + 1);
-	memcpy(tbl_buf, tbl_name.m_name + dbnamelen + 1, tblnamelen);
-	tbl_buf[tblnamelen] = 0;
-
-	dbnamelen = filename_to_tablename(db_buf, dbname, MAX_DATABASE_NAME_LEN + 1, true);
-
-	if (tblnamelen > TEMP_FILE_PREFIX_LENGTH
-	    && !strncmp(tbl_buf, TEMP_FILE_PREFIX, TEMP_FILE_PREFIX_LENGTH)) {
-		return false;
-	}
-
-	if (char *is_part = strchr(tbl_buf, '#')) {
-		*is_part = '\0';
-		tblnamelen = is_part - tbl_buf;
-	}
-
-	tblnamelen = filename_to_tablename(tbl_buf, tblname, MAX_TABLE_NAME_LEN + 1, true);
-	return true;
-}
-
-
-/** Acquire metadata lock and MariaDB table handle for an InnoDB table.
-@param[in,out]	thd	thread handle
-@param[in,out]	table	InnoDB table
-@return MariaDB table handle
-@retval NULL if the table does not exist, is unaccessible or corrupted. */
-static TABLE* innodb_acquire_mdl(THD* thd, dict_table_t* table)
-{
-	char	db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
-	char	tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
-	ulint	db_buf_len, db_buf1_len;
-	ulint	tbl_buf_len, tbl_buf1_len;
-
-	if (!table_name_parse(table->name, db_buf, tbl_buf,
-			      db_buf_len, tbl_buf_len)) {
-		table->release();
-		return NULL;
-	}
-
-	DEBUG_SYNC(thd, "ib_purge_virtual_latch_released");
-
-	const table_id_t table_id = table->id;
-retry_mdl:
-	const bool unaccessible = !table->is_readable() || table->corrupted;
-	table->release();
-
-	if (unaccessible) {
-		return NULL;
-	}
-
-	TABLE*	mariadb_table = open_purge_table(thd, db_buf, db_buf_len,
-						 tbl_buf, tbl_buf_len);
-	if (!mariadb_table)
-		thd_clear_error(thd);
-
-	DEBUG_SYNC(thd, "ib_purge_virtual_got_no_such_table");
-
-	table = dict_table_open_on_id(table_id, false, DICT_TABLE_OP_NORMAL);
-
-	if (table == NULL) {
-		/* Table is dropped. */
-		goto fail;
-	}
-
-	if (!fil_table_accessible(table)) {
-release_fail:
-		table->release();
-fail:
-		if (mariadb_table) {
-			close_thread_tables(thd);
-		}
-
-		return NULL;
-	}
-
-	if (!table_name_parse(table->name, db_buf1, tbl_buf1,
-			      db_buf1_len, tbl_buf1_len)) {
-		goto release_fail;
-	}
-
-	if (!mariadb_table) {
-	} else if (!strcmp(db_buf, db_buf1) && !strcmp(tbl_buf, tbl_buf1)) {
-		return mariadb_table;
-	} else {
-		/* Table is renamed. So release MDL for old name and try
-		to acquire the MDL for new table name. */
-		close_thread_tables(thd);
-	}
-
-	strcpy(tbl_buf, tbl_buf1);
-	strcpy(db_buf, db_buf1);
-	tbl_buf_len = tbl_buf1_len;
-	db_buf_len = db_buf1_len;
-	goto retry_mdl;
-}
-
 /** Find or open a table handle for the virtual column template
 @param[in]	thd	thread handle
 @param[in,out]	table	InnoDB table whose virtual column template
@@ -20569,26 +20330,13 @@ fail:
 for purge thread */
 static TABLE* innodb_find_table_for_vc(THD* thd, dict_table_t* table)
 {
-	DBUG_EXECUTE_IF(
-		"ib_purge_virtual_mdev_16222_1",
-		DBUG_ASSERT(!debug_sync_set_action(
-			    thd,
-			    STRING_WITH_LEN("ib_purge_virtual_latch_released "
-					    "SIGNAL latch_released "
-					    "WAIT_FOR drop_started"))););
-	DBUG_EXECUTE_IF(
-		"ib_purge_virtual_mdev_16222_2",
-		DBUG_ASSERT(!debug_sync_set_action(
-			    thd,
-			    STRING_WITH_LEN("ib_purge_virtual_got_no_such_table "
-					    "SIGNAL got_no_such_table"))););
-
-	if (THDVAR(thd, background_thread)) {
-		/* Purge thread acquires dict_sys.latch while
-		processing undo log record. Release it
-		before acquiring MDL on the table. */
-		rw_lock_s_unlock(&dict_sys.latch);
-		return innodb_acquire_mdl(thd, table);
+	TABLE *mysql_table;
+	const bool  bg_thread = THDVAR(thd, background_thread);
+
+	if (bg_thread) {
+		if ((mysql_table = get_purge_table(thd))) {
+			return mysql_table;
+		}
 	} else {
 		if (table->vc_templ->mysql_table_query_id
 		    == thd_get_query_id(thd)) {
@@ -20600,15 +20348,17 @@ static TABLE* innodb_find_table_for_vc(THD* thd, dict_table_t* table)
 	char	tbl_buf[NAME_LEN + 1];
 	ulint	db_buf_len, tbl_buf_len;
 
-	if (!table_name_parse(table->name, db_buf, tbl_buf,
-			      db_buf_len, tbl_buf_len)) {
-		ut_ad(!"invalid table name");
+	if (!table->parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len)) {
 		return NULL;
 	}
 
-	TABLE* mysql_table = find_fk_open_table(thd, db_buf, db_buf_len,
-						tbl_buf, tbl_buf_len);
+	if (bg_thread) {
+		return open_purge_table(thd, db_buf, db_buf_len,
+					tbl_buf, tbl_buf_len);
+	}
 
+	mysql_table = find_fk_open_table(thd, db_buf, db_buf_len,
+					 tbl_buf, tbl_buf_len);
 	table->vc_templ->mysql_table = mysql_table;
 	table->vc_templ->mysql_table_query_id = thd_get_query_id(thd);
 	return mysql_table;
@@ -20825,8 +20575,9 @@ innobase_get_computed_value(
 		if (update) {
 			ulint clust_no = dict_col_get_clust_pos(base_col,
 								clust_index);
+			ut_ad(clust_no != ULINT_UNDEFINED);
 			if (const upd_field_t *uf = upd_get_field_by_field_no(
-				    update, clust_no, false)) {
+				    update, uint16_t(clust_no), false)) {
 				row_field = &uf->new_val;
 			}
 		}
@@ -20849,8 +20600,15 @@ innobase_get_computed_value(
 		}
 
 		if (len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
                         mysql_rec[templ->mysql_null_byte_offset]
                                 |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
                         memcpy(mysql_rec + templ->mysql_col_offset,
                                static_cast<const byte*>(
 					index->table->vc_templ->default_rec
@@ -20867,7 +20625,8 @@ innobase_get_computed_value(
 				/* It is a nullable column with a
 				non-NULL value */
 				mysql_rec[templ->mysql_null_byte_offset]
-					&= ~(byte) templ->mysql_null_bit_mask;
+					&= static_cast<byte>(
+						~templ->mysql_null_bit_mask);
 			}
 		}
 	}
@@ -21410,38 +21169,27 @@ innodb_buffer_pool_size_validate(
 #endif /* UNIV_DEBUG */
 
 
-	buf_pool_mutex_enter_all();
+	mysql_mutex_lock(&buf_pool.mutex);
 
 	if (srv_buf_pool_old_size != srv_buf_pool_size) {
-		buf_pool_mutex_exit_all();
+		mysql_mutex_unlock(&buf_pool.mutex);
 		my_printf_error(ER_WRONG_ARGUMENTS,
 			"Another buffer pool resize is already in progress.", MYF(0));
 		return(1);
 	}
 
-	if (srv_buf_pool_instances > 1 && intbuf < BUF_POOL_SIZE_THRESHOLD) {
-		buf_pool_mutex_exit_all();
-
-		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
-				    ER_WRONG_ARGUMENTS,
-				    "Cannot update innodb_buffer_pool_size"
-				    " to less than 1GB if"
-				    " innodb_buffer_pool_instances > 1.");
-		return(1);
-	}
-
 	ulint	requested_buf_pool_size = buf_pool_size_align(ulint(intbuf));
 
 	*static_cast<ulonglong*>(save) = requested_buf_pool_size;
 
 	if (srv_buf_pool_size == ulint(intbuf)) {
-		buf_pool_mutex_exit_all();
+		mysql_mutex_unlock(&buf_pool.mutex);
 		/* nothing to do */
 		return(0);
 	}
 
 	if (srv_buf_pool_size == requested_buf_pool_size) {
-		buf_pool_mutex_exit_all();
+		mysql_mutex_unlock(&buf_pool.mutex);
 		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 				    ER_WRONG_ARGUMENTS,
 				    "innodb_buffer_pool_size must be at least"
@@ -21452,7 +21200,7 @@ innodb_buffer_pool_size_validate(
 	}
 
 	srv_buf_pool_size = requested_buf_pool_size;
-	buf_pool_mutex_exit_all();
+	mysql_mutex_unlock(&buf_pool.mutex);
 
 	if (intbuf != static_cast<longlong>(requested_buf_pool_size)) {
 		char	buf[64];
@@ -21588,6 +21336,8 @@ static void innodb_remember_check_sysvar_funcs()
 	check_sysvar_int = MYSQL_SYSVAR_NAME(flush_log_at_timeout).check;
 }
 
+static const size_t MAX_BUF_SIZE = 4 * 1024;
+
 /********************************************************************//**
 Helper function to push warnings from InnoDB internals to SQL-layer. */
 UNIV_INTERN
@@ -21602,16 +21352,15 @@ ib_push_warning(
 		THD *thd = (THD *)trx->mysql_thd;
 		va_list args;
 		char *buf;
-#define MAX_BUF_SIZE 4*1024
 
 		va_start(args, format);
-		buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
+		buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
 		buf[MAX_BUF_SIZE - 1] = 0;
 		vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
-		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
-				    uint(convert_error_code_to_mysql(error, 0,
-								     thd)),
-				    buf);
+
+		push_warning_printf(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			uint(convert_error_code_to_mysql(error, 0, thd)), buf);
 		my_free(buf);
 		va_end(args);
 	}
@@ -21630,7 +21379,6 @@ ib_push_warning(
 	va_list args;
 	THD *thd = (THD *)ithd;
 	char *buf;
-#define MAX_BUF_SIZE 4*1024
 
 	if (ithd == NULL) {
 		thd = current_thd;
@@ -21638,7 +21386,7 @@ ib_push_warning(
 
 	if (thd) {
 		va_start(args, format);
-		buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
+		buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
 		buf[MAX_BUF_SIZE - 1] = 0;
 		vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
 
@@ -21650,6 +21398,52 @@ ib_push_warning(
 	}
 }
 
+/** Report a foreign key constraint problem: format the message, write it
+to dict_foreign_err_file and, if a session is attached, push it as a warning.
+@param[in]	trx		transaction; if NULL or without mysql_thd, no warning
+@param[in]	error		error code to push as warning
+@param[in]	table_name	table name printed in the error file header
+@param[in]	format		printf-style warning message
+@param[in]	...		arguments for format */
+UNIV_INTERN
+void
+ib_foreign_warn(trx_t* trx, dberr_t error, const char* table_name,
+		const char* format, ...)
+{
+	static const size_t buf_size = 4 * 1024;
+	va_list		    args;
+	char* buf = (char*)my_malloc(PSI_INSTRUMENT_ME, buf_size, MYF(MY_WME));
+	if (!buf) {
+		return;
+	}
+
+	/* Bounded: an over-long message is truncated, never overflowed. */
+	va_start(args, format);
+	vsnprintf(buf, buf_size, format, args);
+	va_end(args);
+
+	mutex_enter(&dict_foreign_err_mutex);
+	/* Read the global on every call; a function-local static would
+	keep whatever the pointer was on the first call. */
+	FILE* ef = dict_foreign_err_file;
+	rewind(ef);
+	ut_print_timestamp(ef);
+	fprintf(ef, " Error in foreign key constraint of table %s:\n",
+		table_name);
+	fputs(buf, ef);
+	mutex_exit(&dict_foreign_err_mutex);
+
+	if (trx && trx->mysql_thd) {
+		THD* thd = (THD*)trx->mysql_thd;
+		/* buf is already formatted text, so pass it as data. */
+		push_warning_printf(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			uint(convert_error_code_to_mysql(error, 0, thd)),
+			"%s", buf);
+	}
+	my_free(buf);
+}
+
 /********************************************************************//**
 Helper function to push frm mismatch error to error log and
 if needed to sql-layer. */
@@ -21808,7 +21602,7 @@ ulint
 buf_pool_size_align(
 	ulint	size)
 {
-  const ib_uint64_t	m = ((ib_uint64_t)srv_buf_pool_instances) * srv_buf_pool_chunk_unit;
+  const ulong	m = srv_buf_pool_chunk_unit;
   size = ut_max((size_t) size, (size_t) MYSQL_SYSVAR_NAME(buffer_pool_size).min_val);
 
   if (size % m == 0) {
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 29f7902b477..948d231f6a6 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -53,7 +53,7 @@ struct ha_table_option_struct
 };
 
 /** The class defining a handle to an Innodb table */
-class ha_innobase final: public handler
+class ha_innobase final : public handler
 {
 public:
 	ha_innobase(handlerton* hton, TABLE_SHARE* table_arg);
@@ -177,9 +177,10 @@ public:
 	int start_stmt(THD *thd, thr_lock_type lock_type) override;
 
 	ha_rows records_in_range(
-		uint			inx,
-		key_range*		min_key,
-		key_range*		max_key) override;
+                uint                    inx,
+                const key_range*        min_key,
+                const key_range*        max_key,
+                page_range*             pages) override;
 
 	ha_rows estimate_rows_upper_bound() override;
 
@@ -207,6 +208,8 @@ public:
 	inline int defragment_table(const char* name);
 	int check(THD* thd, HA_CHECK_OPT* check_opt) override;
 
+	inline void reload_statistics();
+
 	char* get_foreign_key_create_info() override;
 
         int get_foreign_key_list(THD *thd,
@@ -254,8 +257,6 @@ public:
 		qc_engine_callback*	call_back,
 		ulonglong*		engine_data) override;
 
-	bool primary_key_is_clustered() override;
-
 	int cmp_ref(const uchar* ref1, const uchar* ref2) override;
 
 	/** On-line ALTER TABLE interface @see handler0alter.cc @{ */
@@ -643,6 +644,9 @@ public:
 	/** Set m_tablespace_type. */
 	void set_tablespace_type(bool table_being_altered_is_file_per_table);
 
+	/** Create InnoDB foreign keys from MySQL alter_info. */
+	dberr_t create_foreign_keys();
+
 	/** Create the internal innodb table.
 	@param create_fk	whether to add FOREIGN KEY constraints */
 	int create_table(bool create_fk = true);
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index e79d9d67dbf..939cf5b06a5 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -46,7 +46,6 @@ Smart ALTER TABLE
 #include "row0row.h"
 #include "row0upd.h"
 #include "trx0trx.h"
-#include "trx0roll.h"
 #include "handler0alter.h"
 #include "srv0mon.h"
 #include "srv0srv.h"
@@ -115,6 +114,7 @@ static const alter_table_operations INNOBASE_INPLACE_IGNORE
 	| ALTER_VIRTUAL_GCOL_EXPR
 	| ALTER_DROP_CHECK_CONSTRAINT
 	| ALTER_RENAME
+	| ALTER_INDEX_ORDER
 	| ALTER_COLUMN_INDEX_LENGTH
 	| ALTER_CHANGE_INDEX_COMMENT;
 
@@ -153,30 +153,6 @@ static const alter_table_operations INNOBASE_ALTER_INSTANT
 	| ALTER_RENAME_INDEX
 	| ALTER_DROP_VIRTUAL_COLUMN;
 
-/** Acquire a page latch on the possible metadata record,
-to prevent concurrent invocation of dict_index_t::clear_instant_alter()
-by purge when the table turns out to be empty.
-@param[in,out]	index	clustered index
-@param[in,out]	mtr	mini-transaction */
-static void instant_metadata_lock(dict_index_t& index, mtr_t& mtr)
-{
-	DBUG_ASSERT(index.is_primary());
-
-	if (!index.is_instant()) {
-		/* dict_index_t::clear_instant_alter() cannot be called.
-		No need for a latch. */
-		return;
-	}
-
-	btr_cur_t btr_cur;
-	btr_cur_open_at_index_side(true, &index, BTR_SEARCH_LEAF,
-				   &btr_cur, 0, &mtr);
-	ut_ad(page_cur_is_before_first(btr_cur_get_page_cur(&btr_cur)));
-	ut_ad(page_is_leaf(btr_cur_get_page(&btr_cur)));
-	ut_ad(!page_has_prev(btr_cur_get_page(&btr_cur)));
-	ut_ad(!buf_block_get_page_zip(btr_cur_get_block(&btr_cur)));
-}
-
 /** Initialize instant->field_map.
 @param[in]	table	table definition to copy from */
 inline void dict_table_t::init_instant(const dict_table_t& table)
@@ -251,16 +227,10 @@ inline void dict_table_t::prepare_instant(const dict_table_t& old,
 	If that is the case, the instant ALTER TABLE would keep
 	the InnoDB table in its current format. */
 
-	dict_index_t& oindex = *old.indexes.start;
+	const dict_index_t& oindex = *old.indexes.start;
 	dict_index_t& index = *indexes.start;
 	first_alter_pos = 0;
 
-	mtr_t mtr;
-	mtr.start();
-	/* Protect oindex.n_core_fields and others, so that
-	purge cannot invoke dict_index_t::clear_instant_alter(). */
-	instant_metadata_lock(oindex, mtr);
-
 	for (unsigned i = 0; i + DATA_N_SYS_COLS < old.n_cols; i++) {
 		if (col_map[i] != i) {
 			first_alter_pos = 1 + i;
@@ -326,12 +296,12 @@ add_metadata:
 			DBUG_ASSERT(instant->dropped[i].is_dropped());
 		}
 #endif
-		const uint n_fields = index.n_fields + n_dropped();
+		const unsigned n_fields = index.n_fields + n_dropped();
 
 		DBUG_ASSERT(n_fields >= oindex.n_fields);
 		dict_field_t* fields = static_cast<dict_field_t*>(
 			mem_heap_zalloc(heap, n_fields * sizeof *fields));
-		uint i = 0, j = 0, n_nullable = 0;
+		unsigned i = 0, j = 0, n_nullable = 0;
 		ut_d(uint core_null = 0);
 		for (; i < oindex.n_fields; i++) {
 			DBUG_ASSERT(j <= i);
@@ -374,7 +344,7 @@ found_nullable:
 							goto found_j;
 						}
 					}
-					DBUG_ASSERT(!"no such col");
+					DBUG_ASSERT("no such col" == 0);
 found_j:
 					std::swap(index.fields[j],
 						  index.fields[k]);
@@ -421,11 +391,12 @@ found_j:
 				    == fields[i].col->name(*this));
 		}
 		DBUG_ASSERT(j == index.n_fields);
-		index.n_fields = index.n_def = n_fields;
+		index.n_fields = index.n_def = n_fields
+			& dict_index_t::MAX_N_FIELDS;
 		index.fields = fields;
 		DBUG_ASSERT(n_nullable >= index.n_nullable);
 		DBUG_ASSERT(n_nullable >= oindex.n_nullable);
-		index.n_nullable = n_nullable;
+		index.n_nullable = n_nullable & dict_index_t::MAX_N_FIELDS;
 		goto set_core_fields;
 	}
 
@@ -433,7 +404,6 @@ found_j:
 	DBUG_ASSERT(n_dropped() >= old.n_dropped());
 	DBUG_ASSERT(index.n_core_fields == oindex.n_core_fields);
 	DBUG_ASSERT(index.n_core_null_bytes == oindex.n_core_null_bytes);
-	mtr.commit();
 }
 
 /** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
@@ -453,15 +423,8 @@ inline void dict_index_t::instant_add_field(const dict_index_t& instant)
 	DBUG_ASSERT(n_uniq == instant.n_uniq);
 	DBUG_ASSERT(instant.n_fields >= n_fields);
 	DBUG_ASSERT(instant.n_nullable >= n_nullable);
-	/* dict_table_t::prepare_instant() initialized n_core_fields
-	to be equal. However, after that purge could have emptied the
-	table and invoked dict_index_t::clear_instant_alter(). */
-	DBUG_ASSERT(instant.n_core_fields <= n_core_fields);
-	DBUG_ASSERT(instant.n_core_null_bytes <= n_core_null_bytes);
-	DBUG_ASSERT(instant.n_core_fields == n_core_fields
-		    || (!is_instant() && instant.is_instant()));
-	DBUG_ASSERT(instant.n_core_null_bytes == n_core_null_bytes
-		    || (!is_instant() && instant.is_instant()));
+	DBUG_ASSERT(instant.n_core_fields == n_core_fields);
+	DBUG_ASSERT(instant.n_core_null_bytes == n_core_null_bytes);
 
 	/* instant will have all fields (including ones for columns
 	that have been or are being instantly dropped) in the same position
@@ -541,7 +504,7 @@ inline bool dict_table_t::instant_column(const dict_table_t& table,
 
 	/* Preserve the default values of previously instantly added
 	columns, or copy the new default values to this->heap. */
-	for (ulint i = 0; i < ulint(table.n_cols); i++) {
+	for (uint16_t i = 0; i < table.n_cols; i++) {
 		dict_col_t& c = cols[i];
 
 		if (const dict_col_t* o = find(old_cols, col_map, n_cols, i)) {
@@ -555,16 +518,16 @@ inline bool dict_table_t::instant_column(const dict_table_t& table,
 
 			if (o->vers_sys_start()) {
 				ut_ad(o->ind == vers_start);
-				vers_start = i;
+				vers_start = i & dict_index_t::MAX_N_FIELDS;
 			} else if (o->vers_sys_end()) {
 				ut_ad(o->ind == vers_end);
-				vers_end = i;
+				vers_end = i & dict_index_t::MAX_N_FIELDS;
 			}
 			continue;
 		}
 
 		DBUG_ASSERT(c.is_added());
-		if (c.def_val.len <= sizeof field_ref_zero
+		if (c.def_val.len <= UNIV_PAGE_SIZE_MAX
 		    && (!c.def_val.len
 			|| !memcmp(c.def_val.data, field_ref_zero,
 				   c.def_val.len))) {
@@ -576,8 +539,10 @@ inline bool dict_table_t::instant_column(const dict_table_t& table,
 		}
 	}
 
-	n_t_def += table.n_cols - n_cols;
-	n_t_cols += table.n_cols - n_cols;
+	n_t_def = (n_t_def + (table.n_cols - n_cols))
+		& dict_index_t::MAX_N_FIELDS;
+	n_t_cols = (n_t_cols + (table.n_cols - n_cols))
+		& dict_index_t::MAX_N_FIELDS;
 	n_def = table.n_cols;
 
 	const dict_v_col_t* const old_v_cols = v_cols;
@@ -602,8 +567,10 @@ inline bool dict_table_t::instant_column(const dict_table_t& table,
 		v_cols = NULL;
 	}
 
-	n_t_def += table.n_v_cols - n_v_cols;
-	n_t_cols += table.n_v_cols - n_v_cols;
+	n_t_def = (n_t_def + (table.n_v_cols - n_v_cols))
+		& dict_index_t::MAX_N_FIELDS;
+	n_t_cols = (n_t_cols + (table.n_v_cols - n_v_cols))
+		& dict_index_t::MAX_N_FIELDS;
 	n_v_def = table.n_v_cols;
 
 	for (unsigned i = 0; i < n_v_def; i++) {
@@ -774,11 +741,6 @@ inline void dict_table_t::rollback_instant(
 	}
 
 	dict_index_t* index = indexes.start;
-	mtr_t mtr;
-	mtr.start();
-	/* Prevent concurrent execution of dict_index_t::clear_instant_alter()
-	by acquiring a latch on the leftmost leaf page. */
-	instant_metadata_lock(*index, mtr);
 	/* index->is_instant() does not necessarily hold here, because
 	the table may have been emptied */
 	DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS);
@@ -802,12 +764,14 @@ inline void dict_table_t::rollback_instant(
 		v_cols[i].~dict_v_col_t();
 	}
 
-	index->n_core_fields = (index->n_fields == index->n_core_fields)
-		? old_n_fields
-		: old_n_core_fields;
-	index->n_def = index->n_fields = old_n_fields;
-	index->n_core_null_bytes = UT_BITS_IN_BYTES(
-		index->get_n_nullable(index->n_core_fields));
+	index->n_core_fields = ((index->n_fields == index->n_core_fields)
+				? old_n_fields
+				: old_n_core_fields)
+		& dict_index_t::MAX_N_FIELDS;
+	index->n_def = index->n_fields = old_n_fields
+		& dict_index_t::MAX_N_FIELDS;
+	index->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(index->get_n_nullable(index->n_core_fields)));
 
 	const dict_col_t* const new_cols = cols;
 	const dict_col_t* const new_cols_end __attribute__((unused)) = cols + n_cols;
@@ -818,22 +782,21 @@ inline void dict_table_t::rollback_instant(
 	col_names = old_col_names;
 	v_cols = old_v_cols;
 	v_col_names = old_v_col_names;
-	n_def = n_cols = old_n_cols;
-	n_v_def = n_v_cols = old_n_v_cols;
-	n_t_def = n_t_cols = n_cols + n_v_cols;
+	n_def = n_cols = old_n_cols & dict_index_t::MAX_N_FIELDS;
+	n_v_def = n_v_cols = old_n_v_cols & dict_index_t::MAX_N_FIELDS;
+	n_t_def = n_t_cols = (n_cols + n_v_cols) & dict_index_t::MAX_N_FIELDS;
 
 	if (versioned()) {
 		for (unsigned i = 0; i < n_cols; ++i) {
 			if (cols[i].vers_sys_start()) {
-				vers_start = i;
+				vers_start = i & dict_index_t::MAX_N_FIELDS;
 			} else if (cols[i].vers_sys_end()) {
-				vers_end = i;
+				vers_end = i & dict_index_t::MAX_N_FIELDS;
 			}
 		}
 	}
 
 	index->fields = old_fields;
-	mtr.commit();
 
 	while ((index = dict_table_get_next_index(index)) != NULL) {
 		if (index->to_be_dropped) {
@@ -924,12 +887,12 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
 	/** whether the order of the clustered index is unchanged */
 	bool		skip_pk_sort;
 	/** number of virtual columns to be added */
-	ulint		num_to_add_vcol;
+	unsigned	num_to_add_vcol;
 	/** virtual columns to be added */
 	dict_v_col_t*	add_vcol;
 	const char**	add_vcol_name;
 	/** number of virtual columns to be dropped */
-	ulint		num_to_drop_vcol;
+	unsigned	num_to_drop_vcol;
 	/** virtual columns to be dropped */
 	dict_v_col_t*	drop_vcol;
 	const char**	drop_vcol_name;
@@ -1992,8 +1955,7 @@ next_page:
 
     next_page= false;
     block= page_cur_get_block(cur);
-    block= btr_block_get(page_id_t(block->page.id.space(), next_page_no),
-                         block->page.zip_size(), BTR_SEARCH_LEAF, clust_index,
+    block= btr_block_get(*clust_index, next_page_no, BTR_SEARCH_LEAF, false,
                          &mtr);
     btr_leaf_page_release(page_cur_get_block(cur), BTR_SEARCH_LEAF, &mtr);
     page_cur_set_before_first(block, cur);
@@ -2208,7 +2170,7 @@ innodb_instant_alter_column_allowed_reason:
 		const Field*		field = table->field[i];
 		const dict_col_t*	col = dict_table_get_nth_col(
 			m_prebuilt->table, icol);
-		ulint			unsigned_flag;
+		unsigned unsigned_flag;
 
 		if (!field->stored_in_db()) {
 			continue;
@@ -2507,7 +2469,8 @@ next_column:
 			   | ALTER_ADD_UNIQUE_INDEX
 		*/
 			   | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX
-			   | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX);
+			   | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX
+			   | ALTER_INDEX_ORDER);
 		if (supports_instant) {
 			flags &= ~(ALTER_DROP_STORED_COLUMN
 #if 0 /* MDEV-17468: remove check_v_col_in_order() and fix the code */
@@ -2696,7 +2659,7 @@ innobase_init_foreign(
                 foreign->id = static_cast<char*>(mem_heap_alloc(
                         foreign->heap, db_len + strlen(constraint_name) + 2));
 
-                ut_memcpy(foreign->id, table->name.m_name, db_len);
+                memcpy(foreign->id, table->name.m_name, db_len);
                 foreign->id[db_len] = '/';
                 strcpy(foreign->id + db_len + 1, constraint_name);
 
@@ -2715,7 +2678,8 @@ innobase_init_foreign(
         dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
 
         foreign->foreign_index = index;
-        foreign->n_fields = (unsigned int) num_field;
+        foreign->n_fields = static_cast<unsigned>(num_field)
+		& dict_index_t::MAX_N_FIELDS;
 
         foreign->foreign_col_names = static_cast<const char**>(
                 mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
@@ -3019,6 +2983,7 @@ innobase_get_foreign_key_info(
 	char*		referenced_table_name = NULL;
 	ulint		num_fk = 0;
 	Alter_info*	alter_info = ha_alter_info->alter_info;
+	const CHARSET_INFO*	cs = thd_charset(trx->mysql_thd);
 
 	DBUG_ENTER("innobase_get_foreign_key_info");
 
@@ -3036,12 +3001,6 @@ innobase_get_foreign_key_info(
 		ulint		num_col = 0;
 		ulint		referenced_num_col = 0;
 		bool		correct_option;
-		char*		db_namep = NULL;
-		char*		tbl_namep = NULL;
-		ulint		db_name_len = 0;
-		ulint		tbl_name_len = 0;
-		char		db_name[MAX_DATABASE_NAME_LEN];
-		char		tbl_name[MAX_TABLE_NAME_LEN];
 
 		Foreign_key* fk_key = static_cast<Foreign_key*>(&key);
 
@@ -3089,45 +3048,14 @@ innobase_get_foreign_key_info(
 
 		add_fk[num_fk] = dict_mem_foreign_create();
 
-#ifndef _WIN32
-		if (fk_key->ref_db.str) {
-			tablename_to_filename(fk_key->ref_db.str, db_name,
-					      MAX_DATABASE_NAME_LEN);
-			db_namep = db_name;
-			db_name_len = strlen(db_name);
-		}
-		if (fk_key->ref_table.str) {
-			tablename_to_filename(fk_key->ref_table.str, tbl_name,
-					      MAX_TABLE_NAME_LEN);
-			tbl_namep = tbl_name;
-			tbl_name_len = strlen(tbl_name);
-		}
-#else
-		ut_ad(fk_key->ref_table.str);
-		tablename_to_filename(fk_key->ref_table.str, tbl_name,
-				      MAX_TABLE_NAME_LEN);
-		innobase_casedn_str(tbl_name);
-		tbl_name_len = strlen(tbl_name);
-		tbl_namep = &tbl_name[0];
-
-		if (fk_key->ref_db.str != NULL) {
-			tablename_to_filename(fk_key->ref_db.str, db_name,
-					      MAX_DATABASE_NAME_LEN);
-			innobase_casedn_str(db_name);
-			db_name_len = strlen(db_name);
-			db_namep = &db_name[0];
-		}
-#endif
 		mutex_enter(&dict_sys.mutex);
 
 		referenced_table_name = dict_get_referenced_table(
 			table->name.m_name,
-			db_namep,
-			db_name_len,
-			tbl_namep,
-			tbl_name_len,
+			LEX_STRING_WITH_LEN(fk_key->ref_db),
+			LEX_STRING_WITH_LEN(fk_key->ref_table),
 			&referenced_table,
-			add_fk[num_fk]->heap);
+			add_fk[num_fk]->heap, cs);
 
 		/* Test the case when referenced_table failed to
 		open, if trx->check_foreigns is not set, we should
@@ -3138,7 +3066,7 @@ innobase_get_foreign_key_info(
 		if (!referenced_table && trx->check_foreigns) {
 			mutex_exit(&dict_sys.mutex);
 			my_error(ER_FK_CANNOT_OPEN_PARENT,
-				 MYF(0), tbl_namep);
+				 MYF(0), fk_key->ref_table.str);
 
 			goto err_exit;
 		}
@@ -3173,7 +3101,7 @@ innobase_get_foreign_key_info(
 					my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
 						 fk_key->name.str
 						 ? fk_key->name.str : "",
-						 tbl_namep);
+						 fk_key->ref_table.str);
 					goto err_exit;
 				}
 			} else {
@@ -3185,7 +3113,8 @@ innobase_get_foreign_key_info(
 			/* Not possible to add a foreign key without a
 			referenced column */
 			mutex_exit(&dict_sys.mutex);
-			my_error(ER_CANNOT_ADD_FOREIGN, MYF(0), tbl_namep);
+			my_error(ER_CANNOT_ADD_FOREIGN, MYF(0),
+				 fk_key->ref_table.str);
 			goto err_exit;
 		}
 
@@ -3560,6 +3489,14 @@ innobase_check_index_keys(
 				}
 			}
 
+			for (const Alter_inplace_info::Rename_key_pair& pair :
+			     info->rename_keys) {
+				if (0 == strcmp(key.name.str,
+                                                pair.old_key->name.str)) {
+					goto name_ok;
+				}
+			}
+
 			my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
                                  key.name.str);
 			return(ER_WRONG_NAME_FOR_INDEX);
@@ -3571,7 +3508,7 @@ name_ok:
 				= key.key_part[i];
 			const Field*		field
 				= key_part1.field;
-			ibool			is_unsigned;
+			unsigned		is_unsigned;
 
 			switch (get_innobase_type_from_mysql_type(
 					&is_unsigned, field)) {
@@ -3640,9 +3577,8 @@ innobase_create_index_field_def(
 	index_field_t*		index_field)
 {
 	const Field*	field;
-	ibool		is_unsigned;
-	ulint		col_type;
-	ulint		num_v = 0;
+	unsigned	is_unsigned;
+	unsigned	num_v = 0;
 
 	DBUG_ENTER("innobase_create_index_field_def");
 
@@ -3656,7 +3592,7 @@ innobase_create_index_field_def(
 		}
 	}
 
-	col_type = get_innobase_type_from_mysql_type(
+	auto col_type = get_innobase_type_from_mysql_type(
 		&is_unsigned, field);
 
 	if ((index_field->is_v_col = !field->stored_in_db())) {
@@ -3733,8 +3669,8 @@ innobase_create_index_def(
 
 		if (key->flags & HA_USES_PARSER) {
 			for (ulint j = 0; j < altered_table->s->keys; j++) {
-				if (ut_strcmp(altered_table->key_info[j].name.str,
-					      key->name.str) == 0) {
+				if (!strcmp(altered_table->key_info[j].name.str,
+					    key->name.str)) {
 					ut_ad(altered_table->key_info[j].flags
 					      & HA_USES_PARSER);
 
@@ -4513,13 +4449,14 @@ innobase_build_col_map(
 				}
 
 				col_map[old_i - num_old_v] = i;
-				if (old_table->versioned()
-				    && altered_table->versioned()) {
-					if (old_i == old_table->vers_start) {
-						new_table->vers_start = i + num_v;
-					} else if (old_i == old_table->vers_end) {
-						new_table->vers_end = i + num_v;
-					}
+				if (!old_table->versioned()
+				    || !altered_table->versioned()) {
+				} else if (old_i == old_table->vers_start) {
+					new_table->vers_start = (i + num_v)
+						& dict_index_t::MAX_N_FIELDS;
+				} else if (old_i == old_table->vers_end) {
+					new_table->vers_end = (i + num_v)
+						& dict_index_t::MAX_N_FIELDS;
 				}
 				goto found_col;
 			}
@@ -4997,8 +4934,7 @@ prepare_inplace_add_virtual(
 	const TABLE*		table)
 {
 	ha_innobase_inplace_ctx*	ctx;
-	ulint				i = 0;
-	ulint				j = 0;
+	uint16_t i = 0, j = 0;
 
 	ctx = static_cast<ha_innobase_inplace_ctx*>
 		(ha_alter_info->handler_ctx);
@@ -5021,14 +4957,12 @@ prepare_inplace_add_virtual(
 			continue;
 		}
 
-		ulint	is_unsigned;
-		ulint	charset_no;
-		ulint	col_type
-				= get_innobase_type_from_mysql_type(
-					&is_unsigned, field);
+		unsigned is_unsigned;
+		auto col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, field);
 
-		ulint col_len = field->pack_length();
-		ulint field_type = (ulint) field->type();
+		auto col_len = field->pack_length();
+		unsigned field_type = field->type() | is_unsigned;
 
 		if (!field->real_maybe_null()) {
 			field_type |= DATA_NOT_NULL;
@@ -5038,12 +4972,10 @@ prepare_inplace_add_virtual(
 			field_type |= DATA_BINARY_TYPE;
 		}
 
-		if (is_unsigned) {
-			field_type |= DATA_UNSIGNED;
-		}
+		unsigned charset_no;
 
 		if (dtype_is_string_type(col_type)) {
-			charset_no = (ulint) field->charset()->number;
+			charset_no = field->charset()->number;
 
 			DBUG_EXECUTE_IF(
 				"ib_alter_add_virtual_fail",
@@ -5078,14 +5010,16 @@ prepare_inplace_add_virtual(
 
 		ctx->add_vcol[j].m_col.mtype = col_type;
 
-		ctx->add_vcol[j].m_col.len = col_len;
+		ctx->add_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
 
-		ctx->add_vcol[j].m_col.ind = i - 1;
+		ctx->add_vcol[j].m_col.ind = (i - 1)
+			& dict_index_t::MAX_N_FIELDS;
 		ctx->add_vcol[j].num_base = 0;
 		ctx->add_vcol_name[j] = field->field_name.str;
 		ctx->add_vcol[j].base_col = NULL;
-		ctx->add_vcol[j].v_pos = ctx->old_table->n_v_cols
-					 - ctx->num_to_drop_vcol + j;
+		ctx->add_vcol[j].v_pos = (ctx->old_table->n_v_cols
+					  - ctx->num_to_drop_vcol + j)
+			& dict_index_t::MAX_N_FIELDS;
 
 		/* MDEV-17468: Do this on ctx->instant_table later */
 		innodb_base_col_setup(ctx->old_table, field, &ctx->add_vcol[j]);
@@ -5107,8 +5041,7 @@ prepare_inplace_drop_virtual(
 	const TABLE*		table)
 {
 	ha_innobase_inplace_ctx*	ctx;
-	ulint				i = 0;
-	ulint				j = 0;
+	unsigned i = 0, j = 0;
 
 	ctx = static_cast<ha_innobase_inplace_ctx*>
 		(ha_alter_info->handler_ctx);
@@ -5134,17 +5067,13 @@ prepare_inplace_drop_virtual(
 			continue;
 		}
 
-		ulint	col_len;
-		ulint	is_unsigned;
-		ulint	field_type;
-		ulint	charset_no;
+		unsigned is_unsigned;
 
-		ulint           col_type
-                                = get_innobase_type_from_mysql_type(
-                                        &is_unsigned, field);
+		auto col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, field);
 
-		col_len = field->pack_length();
-		field_type = (ulint) field->type();
+		auto col_len = field->pack_length();
+		unsigned field_type = field->type() | is_unsigned;
 
 		if (!field->real_maybe_null()) {
 			field_type |= DATA_NOT_NULL;
@@ -5154,12 +5083,10 @@ prepare_inplace_drop_virtual(
 			field_type |= DATA_BINARY_TYPE;
 		}
 
-		if (is_unsigned) {
-			field_type |= DATA_UNSIGNED;
-		}
+		unsigned charset_no = 0;
 
 		if (dtype_is_string_type(col_type)) {
-			charset_no = (ulint) field->charset()->number;
+			charset_no = field->charset()->number;
 
 			DBUG_EXECUTE_IF(
 				"ib_alter_add_virtual_fail",
@@ -5194,9 +5121,9 @@ prepare_inplace_drop_virtual(
 
 		ctx->drop_vcol[j].m_col.mtype = col_type;
 
-		ctx->drop_vcol[j].m_col.len = col_len;
+		ctx->drop_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
 
-		ctx->drop_vcol[j].m_col.ind = i;
+		ctx->drop_vcol[j].m_col.ind = i & dict_index_t::MAX_N_FIELDS;
 
 		ctx->drop_vcol_name[j] = field->field_name.str;
 
@@ -5328,7 +5255,7 @@ static bool innobase_add_one_virtual(
 		return true;
 	}
 
-	for (ulint i = 0; i < unsigned{vcol->num_base}; i++) {
+	for (unsigned i = 0; i < vcol->num_base; i++) {
 		if (innobase_insert_sys_virtual(
 			    table, pos, vcol->base_col[i]->ind, trx)) {
 			return true;
@@ -5601,7 +5528,7 @@ innobase_drop_virtual_try(
 	ctx = static_cast<ha_innobase_inplace_ctx*>
 		(ha_alter_info->handler_ctx);
 
-	for (ulint i = 0; i < ctx->num_to_drop_vcol; i++) {
+	for (unsigned i = 0; i < ctx->num_to_drop_vcol; i++) {
 
 		ulint	pos = dict_create_v_col_pos(
 			ctx->drop_vcol[i].v_pos - i,
@@ -5757,12 +5684,6 @@ static bool innobase_instant_try(
 	dict_table_t* user_table = ctx->old_table;
 
 	dict_index_t* index = dict_table_get_first_index(user_table);
-	mtr_t mtr;
-	mtr.start();
-	/* Prevent purge from calling dict_index_t::clear_instant_add(),
-	to protect index->n_core_fields, index->table->instant and others
-	from changing during ctx->instant_column(). */
-	instant_metadata_lock(*index, mtr);
 	const unsigned n_old_fields = index->n_fields;
 	const dict_col_t* old_cols = user_table->cols;
 	DBUG_ASSERT(user_table->n_cols == ctx->old_n_cols);
@@ -5770,11 +5691,6 @@ static bool innobase_instant_try(
 	const bool metadata_changed = ctx->instant_column();
 
 	DBUG_ASSERT(index->n_fields >= n_old_fields);
-	/* Release the page latch. Between this and the next
-	btr_pcur_open_at_index_side(), data fields such as
-	index->n_core_fields and index->table->instant could change,
-	but we would handle that in empty_table: below. */
-	mtr.commit();
 	/* The table may have been emptied and may have lost its
 	'instantness' during this ALTER TABLE. */
 
@@ -5944,6 +5860,7 @@ add_all_virtual:
 	memset(roll_ptr, 0, sizeof roll_ptr);
 
 	dtuple_t* entry = index->instant_metadata(*row, ctx->heap);
+	mtr_t	mtr;
 	mtr.start();
 	index->set_modified(mtr);
 	btr_pcur_t pcur;
@@ -5959,7 +5876,7 @@ add_all_virtual:
 	const rec_t* rec = btr_pcur_get_rec(&pcur);
 	que_thr_t* thr = pars_complete_graph_for_exec(
 		NULL, trx, ctx->heap, NULL);
-	const bool is_root = block->page.id.page_no() == index->page;
+	const bool is_root = block->page.id().page_no() == index->page;
 
 	dberr_t err = DB_SUCCESS;
 	if (rec_is_metadata(rec, *index)) {
@@ -5981,7 +5898,7 @@ add_all_virtual:
 						       &mtr);
 		DBUG_ASSERT(root);
 		if (fil_page_get_type(root->frame) != FIL_PAGE_TYPE_INSTANT) {
-			DBUG_ASSERT(!"wrong page type");
+			DBUG_ASSERT("wrong page type" == 0);
 			err = DB_CORRUPTION;
 			goto func_exit;
 		}
@@ -5993,7 +5910,7 @@ add_all_virtual:
 		/* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any
 		non-updated off-page columns in case they are moved off
 		page as a result of the update. */
-		const unsigned f = user_table->instant != NULL;
+		const uint16_t f = user_table->instant != NULL;
 		upd_t* update = upd_create(index->n_fields + f, ctx->heap);
 		update->n_fields = n + f;
 		update->info_bits = f
@@ -6012,7 +5929,7 @@ add_all_virtual:
 
 		for (unsigned k = n_old_fields; k < index->n_fields; k++) {
 			upd_field_t* uf = upd_get_nth_field(update, j++);
-			uf->field_no = k + f;
+			uf->field_no = static_cast<uint16_t>(k + f);
 			uf->new_val = entry->fields[k + f];
 
 			ut_ad(j <= n + f);
@@ -6053,7 +5970,7 @@ empty_table:
 		/* The table is empty. */
 		ut_ad(fil_page_index_page_check(block->frame));
 		ut_ad(!page_has_siblings(block->frame));
-		ut_ad(block->page.id.page_no() == index->page);
+		ut_ad(block->page.id().page_no() == index->page);
 		/* MDEV-17383: free metadata BLOBs! */
 		btr_page_empty(block, NULL, index, 0, &mtr);
 		if (index->is_instant()) {
@@ -6071,7 +5988,7 @@ empty_table:
 	index->set_modified(mtr);
 	if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr)) {
 		if (fil_page_get_type(root->frame) != FIL_PAGE_INDEX) {
-			DBUG_ASSERT(!"wrong page type");
+			DBUG_ASSERT("wrong page type" == 0);
 			goto err_exit;
 		}
 
@@ -6308,11 +6225,11 @@ prepare_inplace_alter_table_dict(
 			for (ulint i = 0; i < ctx->num_to_add_vcol; i++) {
 				/* Set mbminmax for newly added column */
 				dict_col_t& col = ctx->add_vcol[i].m_col;
-				ulint mbminlen, mbmaxlen;
+				unsigned mbminlen, mbmaxlen;
 				dtype_get_mblen(col.mtype, col.prtype,
 						&mbminlen, &mbmaxlen);
-				col.mbminlen = mbminlen;
-				col.mbmaxlen = mbmaxlen;
+				col.mbminlen = mbminlen & 7;
+				col.mbmaxlen = mbmaxlen & 7;
 			}
 			add_v = static_cast<dict_add_v_col_t*>(
 				mem_heap_alloc(ctx->heap, sizeof *add_v));
@@ -6419,7 +6336,7 @@ prepare_inplace_alter_table_dict(
 			    user_table, ctx->drop_fk, ctx->num_to_drop_fk)) {
 new_clustered_failed:
 			DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx);
-			trx_rollback_to_savepoint(ctx->trx, NULL);
+			ctx->trx->rollback();
 
 			ut_ad(user_table->get_ref_count() == 1);
 
@@ -6506,15 +6423,11 @@ new_clustered_failed:
 
 		for (uint i = 0; i < altered_table->s->fields; i++) {
 			const Field*	field = altered_table->field[i];
-			ulint		is_unsigned;
-			ulint		field_type
-				= (ulint) field->type();
-			ulint		col_type
-				= get_innobase_type_from_mysql_type(
-					&is_unsigned, field);
-			ulint		charset_no;
-			ulint		col_len;
-			const bool	is_virtual = !field->stored_in_db();
+			unsigned is_unsigned;
+			auto col_type = get_innobase_type_from_mysql_type(
+				&is_unsigned, field);
+			unsigned field_type = field->type() | is_unsigned;
+			const bool is_virtual = !field->stored_in_db();
 
 			/* we assume in dtype_form_prtype() that this
 			fits in two bytes */
@@ -6528,10 +6441,6 @@ new_clustered_failed:
 				field_type |= DATA_BINARY_TYPE;
 			}
 
-			if (is_unsigned) {
-				field_type |= DATA_UNSIGNED;
-			}
-
 			if (altered_table->versioned()) {
 				if (i == altered_table->s->vers.start_fieldno) {
 					field_type |= DATA_VERS_START;
@@ -6544,8 +6453,10 @@ new_clustered_failed:
 				}
 			}
 
+			unsigned charset_no;
+
 			if (dtype_is_string_type(col_type)) {
-				charset_no = (ulint) field->charset()->number;
+				charset_no = field->charset()->number;
 
 				if (charset_no > MAX_CHAR_COLL_NUM) {
 					my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
@@ -6556,7 +6467,7 @@ new_clustered_failed:
 				charset_no = 0;
 			}
 
-			col_len = field->pack_length();
+			auto col_len = field->pack_length();
 
 			/* The MySQL pack length contains 1 or 2 bytes
 			length field for a true VARCHAR. Let us
@@ -7008,9 +6919,10 @@ error_handling_drop_uncached_1:
 		if (const Field* ai = altered_table->found_next_number_field) {
 			const unsigned	col_no = innodb_col_no(ai);
 
-			ctx->new_table->persistent_autoinc = 1
-				+ dict_table_get_nth_col_pos(
-					ctx->new_table, col_no, NULL);
+			ctx->new_table->persistent_autoinc =
+				(dict_table_get_nth_col_pos(
+					ctx->new_table, col_no, NULL) + 1)
+				& dict_index_t::MAX_N_FIELDS;
 
 			/* Initialize the AUTO_INCREMENT sequence
 			to the rebuilt table from the old one. */
@@ -7230,7 +7142,11 @@ op_ok:
 	row_mysql_unlock_data_dictionary(ctx->trx);
 	dict_locked = false;
 
-	ut_a(ctx->trx->lock.n_active_thrs == 0);
+	ut_ad(!ctx->trx->lock.n_active_thrs);
+
+	if (ctx->old_table->fts) {
+		fts_sync_during_ddl(ctx->old_table);
+	}
 
 error_handling:
 	/* After an error, remove all those index definitions from the
@@ -8789,6 +8705,26 @@ innobase_rollback_sec_index(
 	}
 }
 
+/** Count the uncommitted FTS indexes of a table during a rollback
+operation.
+@param[in]	table	table whose ALTER TABLE is being rolled back
+@return number of uncommitted FTS indexes */
+static
+ulint innobase_get_uncommitted_fts_indexes(const dict_table_t* table)
+{
+  ut_ad(mutex_own(&dict_sys.mutex));
+  dict_index_t*	index = dict_table_get_first_index(table);
+  ulint n_uncommitted_fts = 0;
+
+  for (; index ; index = dict_table_get_next_index(index))
+  {
+    if (index->type & DICT_FTS && !index->is_committed())
+      n_uncommitted_fts++;
+  }
+
+  return n_uncommitted_fts;
+}
+
 /** Roll back the changes made during prepare_inplace_alter_table()
 and inplace_alter_table() inside the storage engine. Note that the
 allowed level of concurrency during this operation will be the same as
@@ -8871,6 +8807,19 @@ rollback_inplace_alter_table(
 			      & ALTER_ADD_PK_INDEX));
 		DBUG_ASSERT(ctx->new_table == prebuilt->table);
 
+		/* Remove the FTS table from fts_optimize_wq if
+		only one FTS index exists. */
+		if (prebuilt->table->fts
+		    && innobase_get_uncommitted_fts_indexes(
+					prebuilt->table) == 1
+		    && (ib_vector_is_empty(prebuilt->table->fts->indexes)
+			|| ib_vector_size(prebuilt->table->fts->indexes)
+			   == 1)) {
+			row_mysql_unlock_data_dictionary(ctx->trx);
+			fts_optimize_remove_table(prebuilt->table);
+			row_mysql_lock_data_dictionary(ctx->trx);
+		}
+
 		innobase_rollback_sec_index(
 			prebuilt->table, table,
 			(ha_alter_info->alter_info->requested_lock
@@ -9295,14 +9244,15 @@ processed_field:
 }
 
 /** Convert field type and length to InnoDB format */
-static void get_type(const Field& f, ulint& prtype, ulint& mtype, ulint& len)
+static void get_type(const Field& f, uint& prtype, uint8_t& mtype,
+                     uint16_t& len)
 {
 	mtype = get_innobase_type_from_mysql_type(&prtype, &f);
-	len = f.pack_length();
+	len = static_cast<uint16_t>(f.pack_length());
 	prtype |= f.type();
 	if (f.type() == MYSQL_TYPE_VARCHAR) {
 		auto l = static_cast<const Field_varstring&>(f).length_bytes;
-		len -= l;
+		len = static_cast<uint16_t>(len - l);
 		if (l == 2) prtype |= DATA_LONG_TRUE_VARCHAR;
 	}
 	if (!f.real_maybe_null()) prtype |= DATA_NOT_NULL;
@@ -9319,7 +9269,7 @@ static void get_type(const Field& f, ulint& prtype, ulint& mtype, ulint& len)
 	if (!f.stored_in_db()) prtype |= DATA_VIRTUAL;
 
 	if (dtype_is_string_type(mtype)) {
-		prtype |= ulint(f.charset()->number) << 16;
+		prtype |= f.charset()->number << 16;
 	}
 }
 
@@ -9364,7 +9314,9 @@ innobase_rename_or_enlarge_column_try(
 		n_base = 0;
 	}
 
-	ulint prtype, mtype, len;
+	unsigned prtype;
+	uint8_t mtype;
+	uint16_t len;
 	get_type(f, prtype, mtype, len);
 	DBUG_ASSERT(!dtype_is_string_type(col->mtype)
 		    || col->mbminlen == f.charset()->mbminlen);
@@ -9518,7 +9470,9 @@ innobase_rename_or_enlarge_columns_cache(
 			DBUG_ASSERT(col->mbminlen
 				    == (is_string
 					? (*af)->charset()->mbminlen : 0));
-			ulint prtype, mtype, len;
+			unsigned prtype;
+			uint8_t mtype;
+			uint16_t len;
 			get_type(**af, prtype, mtype, len);
 			DBUG_ASSERT(is_string == dtype_is_string_type(mtype));
 
@@ -9526,7 +9480,7 @@ innobase_rename_or_enlarge_columns_cache(
 			col->mtype = mtype;
 			col->len = len;
 			col->mbmaxlen = is_string
-				? (*af)->charset()->mbmaxlen : 0;
+				? (*af)->charset()->mbmaxlen & 7: 0;
 
 			if ((*fp)->flags & FIELD_IS_RENAMED) {
 				dict_mem_table_col_rename(
@@ -10082,13 +10036,16 @@ commit_try_rebuild(
 
 	/* We can now rename the old table as a temporary table,
 	rename the new temporary table as the old table and drop the
-	old table. First, we only do this in the data dictionary
-	tables. The actual renaming will be performed in
-	commit_cache_rebuild(), once the data dictionary transaction
-	has been successfully committed. */
+	old table. */
+	char* old_name= mem_heap_strdup(ctx->heap, user_table->name.m_name);
 
-	error = row_merge_rename_tables_dict(
-		user_table, rebuilt_table, ctx->tmp_name, trx);
+	error = row_rename_table_for_mysql(user_table->name.m_name,
+					   ctx->tmp_name, trx, false, false);
+	if (error == DB_SUCCESS) {
+		error = row_rename_table_for_mysql(rebuilt_table->name.m_name,
+						   old_name, trx,
+						   false, false);
+	}
 
 	/* We must be still holding a table handle. */
 	DBUG_ASSERT(user_table->get_ref_count() == 1);
@@ -10145,38 +10102,6 @@ rename_indexes_try(
 	return false;
 }
 
-/** Apply the changes made during commit_try_rebuild(),
-to the data dictionary cache and the file system.
-@param ctx In-place ALTER TABLE context */
-inline MY_ATTRIBUTE((nonnull))
-void
-commit_cache_rebuild(
-/*=================*/
-	ha_innobase_inplace_ctx*	ctx)
-{
-	dberr_t		error;
-
-	DBUG_ENTER("commit_cache_rebuild");
-	DEBUG_SYNC_C("commit_cache_rebuild");
-	DBUG_ASSERT(ctx->need_rebuild());
-	DBUG_ASSERT(!ctx->old_table->space == !ctx->new_table->space);
-
-	const char* old_name = mem_heap_strdup(
-		ctx->heap, ctx->old_table->name.m_name);
-
-	/* We already committed and redo logged the renames,
-	so this must succeed. */
-	error = dict_table_rename_in_cache(
-		ctx->old_table, ctx->tmp_name, false);
-	ut_a(error == DB_SUCCESS);
-
-	error = dict_table_rename_in_cache(
-		ctx->new_table, old_name, false);
-	ut_a(error == DB_SUCCESS);
-
-	DBUG_VOID_RETURN;
-}
-
 /** Set of column numbers */
 typedef std::set<ulint, std::less<ulint>, ut_allocator<ulint> >	col_set;
 
@@ -10291,20 +10216,30 @@ dict_stats_try_drop_table(THD *thd, const table_name_t &name,
 }
 
 /** Evict the table from cache and reopen it. Drop outdated statistics.
-  @param thd                 mariadb THD entity
-  @param table               innodb table
-  @param maria_table_name    user-friendly table name for errors
-  @return newly opened table */
-static
-dict_table_t*
-innobase_reload_table(THD *thd, dict_table_t *table,
-                      const LEX_CSTRING &table_name)
+@param thd           mariadb THD entity
+@param table         innodb table
+@param table_name    user-friendly table name for errors
+@param ctx           ALTER TABLE context
+@return newly opened table */
+static dict_table_t *innobase_reload_table(THD *thd, dict_table_t *table,
+                                           const LEX_CSTRING &table_name,
+                                           ha_innobase_inplace_ctx &ctx)
 {
   char *tb_name= strdup(table->name.m_name);
   dict_table_close(table, true, false);
+
+  if (ctx.is_instant())
+  {
+    for (auto i = ctx.old_n_v_cols; i--; )
+    {
+      ctx.old_v_cols[i].~dict_v_col_t();
+      const_cast<unsigned&>(ctx.old_n_v_cols) = 0;
+    }
+  }
+
   dict_sys.remove(table);
   table= dict_table_open_on_name(tb_name, TRUE, TRUE,
-                                  DICT_ERR_IGNORE_FK_NOKEY);
+                                 DICT_ERR_IGNORE_FK_NOKEY);
 
   /* Drop outdated table stats. */
   dict_stats_try_drop_table(thd, table->name, table_name);
@@ -10470,9 +10405,9 @@ commit_try_norebuild(
 			DBUG_RETURN(true);
 		}
 
-		ulint	n_col = unsigned(ctx->old_table->n_cols)
+		unsigned n_col = ctx->old_table->n_cols
 			- DATA_N_SYS_COLS;
-		ulint	n_v_col = unsigned(ctx->old_table->n_v_cols)
+		unsigned n_v_col = ctx->old_table->n_v_cols
 			+ ctx->num_to_add_vcol - ctx->num_to_drop_vcol;
 
 		if (innodb_update_cols(
@@ -10515,11 +10450,22 @@ commit_cache_norebuild(
 
 	if (ctx->page_compression_level) {
 		DBUG_ASSERT(ctx->new_table->space != fil_system.sys_space);
-		ctx->new_table->flags &=
-			~(0xFU << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
-		ctx->new_table->flags |= 1 << DICT_TF_POS_PAGE_COMPRESSION
-			| (ctx->page_compression_level
-			   << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+		ctx->new_table->flags
+			= static_cast<uint16_t>(
+				(ctx->new_table->flags
+				 & ~(0xFU
+				     << DICT_TF_POS_PAGE_COMPRESSION_LEVEL))
+				| 1 << DICT_TF_POS_PAGE_COMPRESSION
+				| (ctx->page_compression_level & 0xF)
+				<< DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+			& ((1U << DICT_TF_BITS) - 1);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 
 		if (fil_space_t* space = ctx->new_table->space) {
 			bool update = !(space->flags
@@ -10555,13 +10501,15 @@ commit_cache_norebuild(
 					    page_id_t(space->id, 0),
 					    space->zip_size(),
 					    RW_X_LATCH, &mtr)) {
-					mtr.set_named_space(space);
-					mlog_write_ulint(
-						FSP_HEADER_OFFSET
-						+ FSP_SPACE_FLAGS + b->frame,
-						space->flags
-						& ~FSP_FLAGS_MEM_MASK,
-						MLOG_4BYTES, &mtr);
+					byte* f = FSP_HEADER_OFFSET
+						+ FSP_SPACE_FLAGS + b->frame;
+					const auto sf = space->flags
+						& ~FSP_FLAGS_MEM_MASK;
+					if (mach_read_from_4(f) != sf) {
+						mtr.set_named_space(space);
+						mtr.write<4,mtr_t::FORCED>(
+							*b, f, sf);
+					}
 				}
 				mtr.commit();
 			}
@@ -10998,7 +10946,6 @@ ha_innobase::commit_inplace_alter_table(
 	bool			commit)
 {
 	ha_innobase_inplace_ctx*ctx0;
-	struct mtr_buf_copy_t	logs;
 
 	ctx0 = static_cast<ha_innobase_inplace_ctx*>
 		(ha_alter_info->handler_ctx);
@@ -11118,6 +11065,7 @@ ha_innobase::commit_inplace_alter_table(
 		if (ctx->new_table->fts) {
 			ut_ad(!ctx->new_table->fts->add_wq);
 			fts_optimize_remove_table(ctx->new_table);
+			fts_sync_during_ddl(ctx->new_table);
 		}
 
 		/* Apply the online log of the table before acquiring
@@ -11144,8 +11092,6 @@ ha_innobase::commit_inplace_alter_table(
 	or lock waits can happen in it during the data dictionary operation. */
 	row_mysql_lock_data_dictionary(trx);
 
-	ut_ad(log_append_on_checkpoint(NULL) == NULL);
-
 	/* Prevent the background statistics collection from accessing
 	the tables. */
 	for (;;) {
@@ -11175,44 +11121,6 @@ ha_innobase::commit_inplace_alter_table(
 		DICT_BG_YIELD(trx);
 	}
 
-	/* Make a concurrent Drop fts Index to wait until sync of that
-	fts index is happening in the background */
-	for (int retry_count = 0;;) {
-		bool    retry = false;
-
-		for (inplace_alter_handler_ctx** pctx = ctx_array;
-		    *pctx; pctx++) {
-			ha_innobase_inplace_ctx*        ctx
-				= static_cast<ha_innobase_inplace_ctx*>(*pctx);
-			DBUG_ASSERT(new_clustered == ctx->need_rebuild());
-
-			if (dict_fts_index_syncing(ctx->old_table)) {
-				retry = true;
-				break;
-			}
-
-			if (new_clustered && dict_fts_index_syncing(ctx->new_table)) {
-				retry = true;
-				break;
-			}
-		}
-
-		if (!retry) {
-			 break;
-		}
-
-		/* Print a message if waiting for a long time. */
-		if (retry_count < 100) {
-			retry_count++;
-		} else {
-			ib::info() << "Drop index waiting for background sync"
-				" to finish";
-			retry_count = 0;
-		}
-
-		DICT_BG_YIELD(trx);
-	}
-
 	/* Apply the changes to the data dictionary tables, for all
 	partitions. */
 
@@ -11279,33 +11187,6 @@ ha_innobase::commit_inplace_alter_table(
 	} else if (!new_clustered) {
 		trx_commit_for_mysql(trx);
 	} else {
-		mtr_t	mtr;
-		mtr_start(&mtr);
-
-		for (inplace_alter_handler_ctx** pctx = ctx_array;
-		     *pctx; pctx++) {
-			ha_innobase_inplace_ctx*	ctx
-				= static_cast<ha_innobase_inplace_ctx*>(*pctx);
-
-			DBUG_ASSERT(ctx->need_rebuild());
-			/* Check for any possible problems for any
-			file operations that will be performed in
-			commit_cache_rebuild(), and if none, generate
-			the redo log for these operations. */
-			dberr_t error = fil_mtr_rename_log(
-				ctx->old_table, ctx->new_table, ctx->tmp_name,
-				&mtr);
-			if (error != DB_SUCCESS) {
-				/* Out of memory or a problem will occur
-				when renaming files. */
-				fail = true;
-				my_error_innodb(error, ctx->old_table->name.m_name,
-						ctx->old_table->flags);
-			}
-			DBUG_INJECT_CRASH("ib_commit_inplace_crash",
-					  crash_inject_count++);
-		}
-
 		/* Test what happens on crash if the redo logs
 		are flushed to disk here. The log records
 		about the rename should not be committed, and
@@ -11317,39 +11198,11 @@ ha_innobase::commit_inplace_alter_table(
 		ut_ad(!trx->fts_trx);
 
 		if (fail) {
-			mtr.set_log_mode(MTR_LOG_NO_REDO);
-			mtr_commit(&mtr);
 			trx_rollback_for_mysql(trx);
 		} else {
 			ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
 			ut_ad(trx->has_logged());
-
-			if (mtr.get_log()->size() > 0) {
-				ut_ad(*mtr.get_log()->front()->begin()
-				      == MLOG_FILE_RENAME2);
-
-				/* Append the MLOG_FILE_RENAME2
-				records on checkpoint, as a separate
-				mini-transaction before the one that
-				contains the MLOG_CHECKPOINT marker. */
-				static const byte	multi
-					= MLOG_MULTI_REC_END;
-
-				mtr.get_log()->for_each_block(logs);
-				logs.m_buf.push(&multi, sizeof multi);
-
-				log_append_on_checkpoint(&logs.m_buf);
-			}
-
-			/* The following call commits the
-			mini-transaction, making the data dictionary
-			transaction committed at mtr.end_lsn. The
-			transaction becomes 'durable' by the time when
-			log_buffer_flush_to_disk() returns. In the
-			logical sense the commit in the file-based
-			data structures happens here. */
-
-			trx->commit_low(&mtr);
+			trx->commit();
 		}
 
 		/* If server crashes here, the dictionary in
@@ -11426,9 +11279,6 @@ ha_innobase::commit_inplace_alter_table(
 			DBUG_PRINT("to_be_dropped",
 				   ("table: %s", ctx->old_table->name.m_name));
 
-			/* Rename the tablespace files. */
-			commit_cache_rebuild(ctx);
-
 			if (innobase_update_foreign_cache(ctx, m_user_thd)
 			    != DB_SUCCESS
 			    && m_prebuilt->trx->check_foreigns) {
@@ -11462,13 +11312,6 @@ foreign_fail:
 				  crash_inject_count++);
 	}
 
-	log_append_on_checkpoint(NULL);
-
-	/* Tell the InnoDB server that there might be work for
-	utility threads: */
-
-	srv_active_wake_master_thread();
-
 	if (fail) {
 		for (inplace_alter_handler_ctx** pctx = ctx_array;
 		     *pctx; pctx++) {
@@ -11527,19 +11370,19 @@ foreign_fail:
 		&& m_prebuilt->table->n_v_cols
 		&& ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER)) {
 		DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1);
+		ut_ad(ctx0->prebuilt == m_prebuilt);
 		trx_commit_for_mysql(m_prebuilt->trx);
 
-		if (ctx0->is_instant()) {
-			for (unsigned i = ctx0->old_n_v_cols; i--; ) {
-				ctx0->old_v_cols[i].~dict_v_col_t();
-			}
-			const_cast<unsigned&>(ctx0->old_n_v_cols) = 0;
+		for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx;
+		     pctx++) {
+			auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+			ctx->prebuilt->table = innobase_reload_table(
+				m_user_thd, ctx->prebuilt->table,
+				table->s->table_name, *ctx);
+			innobase_copy_frm_flags_from_table_share(
+				ctx->prebuilt->table, altered_table->s);
 		}
 
-		m_prebuilt->table = innobase_reload_table(m_user_thd,
-                                                          m_prebuilt->table,
-                                                          table->s->table_name);
-
 		row_mysql_unlock_data_dictionary(trx);
 		trx->free();
 		MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index d21c0cefec0..8a15bb64c9f 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2021, MariaDB Corporation.
+Copyright (c) 2014, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -115,27 +115,23 @@ currently cached in the buffer pool. It will be used to populate
 table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */
 struct buf_page_info_t{
 	ulint		block_id;	/*!< Buffer Pool block ID */
-	unsigned	space_id:32;	/*!< Tablespace ID */
-	unsigned	page_num:32;	/*!< Page number/offset */
+	/** page identifier */
+	page_id_t	id;
 	unsigned	access_time:32;	/*!< Time of first access */
-	unsigned	pool_id:MAX_BUFFER_POOLS_BITS;
-					/*!< Buffer Pool ID. Must be less than
-					MAX_BUFFER_POOLS */
-	unsigned	flush_type:2;	/*!< Flush type */
 	unsigned	io_fix:2;	/*!< type of pending I/O operation */
-	unsigned	fix_count:19;	/*!< Count of how manyfold this block
+	uint32_t	fix_count;	/*!< Count of how manyfold this block
 					is bufferfixed */
 #ifdef BTR_CUR_HASH_ADAPT
 	unsigned	hashed:1;	/*!< Whether hash index has been
 					built on this page */
 #endif /* BTR_CUR_HASH_ADAPT */
 	unsigned	is_old:1;	/*!< TRUE if the block is in the old
-					blocks in buf_pool->LRU_old */
+					blocks in buf_pool.LRU_old */
 	unsigned	freed_page_clock:31; /*!< the value of
-					buf_pool->freed_page_clock */
+					buf_pool.freed_page_clock */
 	unsigned	zip_ssize:PAGE_ZIP_SSIZE_BITS;
 					/*!< Compressed page size */
-	unsigned	page_state:BUF_PAGE_STATE_BITS; /*!< Page state */
+	unsigned	page_state:3; /*!< Page state */
 	unsigned	page_type:I_S_PAGE_TYPE_BITS;	/*!< Page type */
 	unsigned	num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2;
 					/*!< Number of records on Page */
@@ -266,131 +262,101 @@ field_store_string(
 	return field->store(str, uint(strlen(str)), system_charset_info);
 }
 
-/*******************************************************************//**
-Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
-If the value is ULINT_UNDEFINED then the field is set to NULL.
-@return 0 on success */
-int
-field_store_ulint(
-/*==============*/
-	Field*	field,	/*!< in/out: target field for storage */
-	ulint	n)	/*!< in: value to store */
-{
-	int	ret;
-
-	if (n != ULINT_UNDEFINED) {
-
-		ret = field->store(longlong(n), true);
-		field->set_notnull();
-	} else {
-
-		ret = 0; /* success */
-		field->set_null();
-	}
-
-	return(ret);
-}
-
 #ifdef BTR_CUR_HASH_ADAPT
 # define I_S_AHI 1 /* Include the IS_HASHED column */
 #else
 # define I_S_AHI 0 /* Omit the IS_HASHED column */
 #endif
 
+static const LEX_CSTRING isolation_level_values[] =
+{
+	{ STRING_WITH_LEN("READ UNCOMMITTED") },
+	{ STRING_WITH_LEN("READ COMMITTED") },
+	{ STRING_WITH_LEN("REPEATABLE READ") },
+	{ STRING_WITH_LEN("SERIALIZABLE") }
+};
+
+static TypelibBuffer<4> isolation_level_values_typelib(isolation_level_values);
+
+namespace Show {
+
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */
 static ST_FIELD_INFO innodb_trx_fields_info[]=
 {
 #define IDX_TRX_ID		0
-  {"trx_id", TRX_ID_MAX_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_id", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_STATE		1
-  {"trx_state", TRX_QUE_STATE_STR_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_state", Varchar(TRX_QUE_STATE_STR_MAX_LEN + 1), NOT_NULL),
 
 #define IDX_TRX_STARTED		2
-  {"trx_started", 0, MYSQL_TYPE_DATETIME, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_started", Datetime(0), NOT_NULL),
 
 #define IDX_TRX_REQUESTED_LOCK_ID	3
-  {"trx_requested_lock_id", TRX_I_S_LOCK_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("trx_requested_lock_id",
+         Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NULLABLE),
 
 #define IDX_TRX_WAIT_STARTED	4
-  {"trx_wait_started", 0, MYSQL_TYPE_DATETIME,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+ Column("trx_wait_started", Datetime(0), NULLABLE),
 
 #define IDX_TRX_WEIGHT		5
-  {"trx_weight",MY_INT64_NUM_DECIMAL_DIGITS,MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+ Column("trx_weight", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_MYSQL_THREAD_ID	6
-  {"trx_mysql_thread_id",MY_INT64_NUM_DECIMAL_DIGITS,MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_mysql_thread_id", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_QUERY		7
-  {"trx_query", TRX_I_S_TRX_QUERY_MAX_LEN, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("trx_query", Varchar(TRX_I_S_TRX_QUERY_MAX_LEN), NULLABLE),
 
 #define IDX_TRX_OPERATION_STATE	8
-  {"trx_operation_state", TRX_I_S_TRX_OP_STATE_MAX_LEN, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("trx_operation_state", Varchar(64), NULLABLE),
 
 #define IDX_TRX_TABLES_IN_USE	9
-  {"trx_tables_in_use", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_tables_in_use", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_TABLES_LOCKED	10
-  {"trx_tables_locked", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_tables_locked", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_LOCK_STRUCTS	11
-  {"trx_lock_structs", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_lock_structs", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_LOCK_MEMORY_BYTES	12
-  {"trx_lock_memory_bytes", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_lock_memory_bytes", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_ROWS_LOCKED	13
-  {"trx_rows_locked", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_rows_locked", ULonglong(), NOT_NULL),
 
-#define IDX_TRX_ROWS_MODIFIED		14
-  {"trx_rows_modified", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+#define IDX_TRX_ROWS_MODIFIED	14
+  Column("trx_rows_modified", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_CONNCURRENCY_TICKETS	15
-  {"trx_concurrency_tickets", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("trx_concurrency_tickets", ULonglong(), NOT_NULL),
 
 #define IDX_TRX_ISOLATION_LEVEL	16
-  {"trx_isolation_level", TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN,
-   MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_isolation_level",
+         Enum(&isolation_level_values_typelib), NOT_NULL),
 
 #define IDX_TRX_UNIQUE_CHECKS	17
-  {"trx_unique_checks", 1, MYSQL_TYPE_LONG,
-   1, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_unique_checks", SLong(1), NOT_NULL),
 
 #define IDX_TRX_FOREIGN_KEY_CHECKS	18
-  {"trx_foreign_key_checks", 1, MYSQL_TYPE_LONG,
-   1, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_foreign_key_checks", SLong(1), NOT_NULL),
 
 #define IDX_TRX_LAST_FOREIGN_KEY_ERROR	19
-  {"trx_last_foreign_key_error", TRX_I_S_TRX_FK_ERROR_MAX_LEN,
-   MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("trx_last_foreign_key_error",
+         Varchar(TRX_I_S_TRX_FK_ERROR_MAX_LEN),NULLABLE),
 
 #define IDX_TRX_READ_ONLY		20
-  {"trx_is_read_only", 1, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_is_read_only", SLong(1), NOT_NULL),
 
 #define IDX_TRX_AUTOCOMMIT_NON_LOCKING	21
-  {"trx_autocommit_non_locking", 1, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("trx_autocommit_non_locking", SLong(1), NOT_NULL),
 
-  END_OF_ST_FIELD_INFO
+  CEnd()
 };
 
+} // namespace Show
+
 /*******************************************************************//**
 Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx
 table with it.
@@ -419,15 +385,13 @@ fill_innodb_trx_from_cache(
 	for (i = 0; i < rows_num; i++) {
 
 		i_s_trx_row_t*	row;
-		char		trx_id[TRX_ID_MAX_LEN + 1];
 
 		row = (i_s_trx_row_t*)
 			trx_i_s_cache_get_nth_row(
 				cache, I_S_INNODB_TRX, i);
 
 		/* trx_id */
-		snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, row->trx_id);
-		OK(field_store_string(fields[IDX_TRX_ID], trx_id));
+		OK(fields[IDX_TRX_ID]->store(row->trx_id, true));
 
 		/* trx_state */
 		OK(field_store_string(fields[IDX_TRX_STATE],
@@ -507,12 +471,11 @@ fill_innodb_trx_from_cache(
 			   row->trx_rows_modified, true));
 
 		/* trx_concurrency_tickets */
-		OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store(
-			   row->trx_concurrency_tickets, true));
+		OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store(0, true));
 
 		/* trx_isolation_level */
-		OK(field_store_string(fields[IDX_TRX_ISOLATION_LEVEL],
-				      row->trx_isolation_level));
+		OK(fields[IDX_TRX_ISOLATION_LEVEL]->store(
+			   1 + row->trx_isolation_level, true));
 
 		/* trx_unique_checks */
 		OK(fields[IDX_TRX_UNIQUE_CHECKS]->store(
@@ -532,8 +495,7 @@ fill_innodb_trx_from_cache(
 
 		/* trx_is_autocommit_non_locking */
 		OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store(
-			   (longlong) row->trx_is_autocommit_non_locking,
-			   true));
+			   row->trx_is_autocommit_non_locking, true));
 
 		OK(schema_table_store_record(thd, table));
 	}
@@ -556,7 +518,7 @@ innodb_trx_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_trx_fields_info;
+	schema->fields_info = Show::innodb_trx_fields_info;
 	schema->fill_table = trx_i_s_common_fill_table;
 
 	DBUG_RETURN(0);
@@ -616,53 +578,65 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_trx =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+static const LEX_CSTRING lock_mode_values[] =
+{
+	{ STRING_WITH_LEN("S") },
+	{ STRING_WITH_LEN("S,GAP") },
+	{ STRING_WITH_LEN("X") },
+	{ STRING_WITH_LEN("X,GAP") },
+	{ STRING_WITH_LEN("IS") },
+	{ STRING_WITH_LEN("IS,GAP") },
+	{ STRING_WITH_LEN("IX") },
+	{ STRING_WITH_LEN("IX,GAP") },
+	{ STRING_WITH_LEN("AUTO_INC") }
+};
+
+static TypelibBuffer<9> lock_mode_values_typelib(lock_mode_values);
+
+static const LEX_CSTRING lock_type_values[] =
+{
+	{ STRING_WITH_LEN("RECORD") },
+	{ STRING_WITH_LEN("TABLE") }
+};
+
+static TypelibBuffer<2> lock_type_values_typelib(lock_type_values);
+
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
 static ST_FIELD_INFO innodb_locks_fields_info[]=
 {
 #define IDX_LOCK_ID		0
-  {"lock_id", TRX_I_S_LOCK_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("lock_id",     Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1),  NOT_NULL),
 
 #define IDX_LOCK_TRX_ID		1
-  {"lock_trx_id", TRX_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("lock_trx_id", ULonglong(), NOT_NULL),
 
 #define IDX_LOCK_MODE		2
-  {"lock_mode",
-   /* S[,GAP] X[,GAP] IS[,GAP] IX[,GAP] AUTO_INC UNKNOWN */
-   32, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("lock_mode",   Enum(&lock_mode_values_typelib), NOT_NULL),
 
 #define IDX_LOCK_TYPE		3
-  {"lock_type", 32 /* RECORD|TABLE|UNKNOWN */, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("lock_type",   Enum(&lock_type_values_typelib), NOT_NULL),
 
 #define IDX_LOCK_TABLE		4
-  {"lock_table", 1024, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("lock_table",  Varchar(1024), NOT_NULL),
 
 #define IDX_LOCK_INDEX		5
-  {"lock_index", 1024, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("lock_index",  Varchar(1024), NULLABLE),
 
 #define IDX_LOCK_SPACE		6
-  {"lock_space", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("lock_space",  ULong(),   NULLABLE),
 
 #define IDX_LOCK_PAGE		7
-  {"lock_page", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("lock_page",   ULong(),   NULLABLE),
 
 #define IDX_LOCK_REC		8
-  {"lock_rec", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("lock_rec",    ULong(),   NULLABLE),
 
 #define IDX_LOCK_DATA		9
-  {"lock_data", TRX_I_S_LOCK_DATA_MAX_LEN, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-
-  END_OF_ST_FIELD_INFO
+  Column("lock_data",   Varchar(TRX_I_S_LOCK_DATA_MAX_LEN), NULLABLE),
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks
@@ -694,8 +668,6 @@ fill_innodb_locks_from_cache(
 		char			buf[MAX_FULL_NAME_LEN + 1];
 		const char*		bufend;
 
-		char			lock_trx_id[TRX_ID_MAX_LEN + 1];
-
 		row = (i_s_locks_row_t*)
 			trx_i_s_cache_get_nth_row(
 				cache, I_S_INNODB_LOCKS, i);
@@ -706,17 +678,14 @@ fill_innodb_locks_from_cache(
 				      lock_id));
 
 		/* lock_trx_id */
-		snprintf(lock_trx_id, sizeof(lock_trx_id),
-			 TRX_ID_FMT, row->lock_trx_id);
-		OK(field_store_string(fields[IDX_LOCK_TRX_ID], lock_trx_id));
+		OK(fields[IDX_LOCK_TRX_ID]->store(row->lock_trx_id, true));
 
 		/* lock_mode */
-		OK(field_store_string(fields[IDX_LOCK_MODE],
-				      row->lock_mode));
+		OK(fields[IDX_LOCK_MODE]->store(row->lock_mode, true));
 
 		/* lock_type */
-		OK(field_store_string(fields[IDX_LOCK_TYPE],
-				      row->lock_type));
+		OK(fields[IDX_LOCK_TYPE]->store(
+			   row->lock_index ? 1 : 2, true));
 
 		/* lock_table */
 		bufend = innobase_convert_name(buf, sizeof(buf),
@@ -726,25 +695,29 @@ fill_innodb_locks_from_cache(
 		OK(fields[IDX_LOCK_TABLE]->store(
 			buf, uint(bufend - buf), system_charset_info));
 
-		/* lock_index */
-		OK(field_store_string(fields[IDX_LOCK_INDEX],
-				      row->lock_index));
-
-		/* lock_space */
-		OK(field_store_ulint(fields[IDX_LOCK_SPACE],
-				     row->lock_space));
-
-		/* lock_page */
-		OK(field_store_ulint(fields[IDX_LOCK_PAGE],
-				     row->lock_page));
-
-		/* lock_rec */
-		OK(field_store_ulint(fields[IDX_LOCK_REC],
-				     row->lock_rec));
-
-		/* lock_data */
-		OK(field_store_string(fields[IDX_LOCK_DATA],
-				      row->lock_data));
+		if (row->lock_index) {
+			/* record lock */
+			OK(field_store_string(fields[IDX_LOCK_INDEX],
+					      row->lock_index));
+			OK(fields[IDX_LOCK_SPACE]->store(
+				   row->lock_page.space(), true));
+			fields[IDX_LOCK_SPACE]->set_notnull();
+			OK(fields[IDX_LOCK_PAGE]->store(
+				   row->lock_page.page_no(), true));
+			fields[IDX_LOCK_PAGE]->set_notnull();
+			OK(fields[IDX_LOCK_REC]->store(
+				   row->lock_rec, true));
+			fields[IDX_LOCK_REC]->set_notnull();
+			OK(field_store_string(fields[IDX_LOCK_DATA],
+					      row->lock_data));
+		} else {
+			/* table lock: no record-level attributes */
+			fields[IDX_LOCK_INDEX]->set_null();
+			fields[IDX_LOCK_SPACE]->set_null();
+			fields[IDX_LOCK_PAGE]->set_null();
+			fields[IDX_LOCK_REC]->set_null();
+			fields[IDX_LOCK_DATA]->set_null();
+		}
 
 		OK(schema_table_store_record(thd, table));
 	}
@@ -767,7 +738,7 @@ innodb_locks_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_locks_fields_info;
+	schema->fields_info = Show::innodb_locks_fields_info;
 	schema->fill_table = trx_i_s_common_fill_table;
 
 	DBUG_RETURN(0);
@@ -822,27 +793,25 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_locks =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
 static ST_FIELD_INFO innodb_lock_waits_fields_info[]=
 {
 #define IDX_REQUESTING_TRX_ID	0
-  {"requesting_trx_id", TRX_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("requesting_trx_id", ULonglong(), NOT_NULL),
 
 #define IDX_REQUESTED_LOCK_ID	1
-  {"requested_lock_id", TRX_I_S_LOCK_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("requested_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL),
 
 #define IDX_BLOCKING_TRX_ID	2
-  {"blocking_trx_id", TRX_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("blocking_trx_id",   ULonglong(), NOT_NULL),
 
 #define IDX_BLOCKING_LOCK_ID	3
-  {"blocking_lock_id", TRX_I_S_LOCK_ID_MAX_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
-
-  END_OF_ST_FIELD_INFO
+  Column("blocking_lock_id",  Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL),
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Read data from cache buffer and fill the
@@ -874,18 +843,13 @@ fill_innodb_lock_waits_from_cache(
 
 		i_s_lock_waits_row_t*	row;
 
-		char	requesting_trx_id[TRX_ID_MAX_LEN + 1];
-		char	blocking_trx_id[TRX_ID_MAX_LEN + 1];
-
 		row = (i_s_lock_waits_row_t*)
 			trx_i_s_cache_get_nth_row(
 				cache, I_S_INNODB_LOCK_WAITS, i);
 
 		/* requesting_trx_id */
-		snprintf(requesting_trx_id, sizeof(requesting_trx_id),
-			 TRX_ID_FMT, row->requested_lock_row->lock_trx_id);
-		OK(field_store_string(fields[IDX_REQUESTING_TRX_ID],
-				      requesting_trx_id));
+		OK(fields[IDX_REQUESTING_TRX_ID]->store(
+				      row->requested_lock_row->lock_trx_id, true));
 
 		/* requested_lock_id */
 		OK(field_store_string(
@@ -896,10 +860,8 @@ fill_innodb_lock_waits_from_cache(
 				   sizeof(requested_lock_id))));
 
 		/* blocking_trx_id */
-		snprintf(blocking_trx_id, sizeof(blocking_trx_id),
-			 TRX_ID_FMT, row->blocking_lock_row->lock_trx_id);
-		OK(field_store_string(fields[IDX_BLOCKING_TRX_ID],
-				      blocking_trx_id));
+		OK(fields[IDX_BLOCKING_TRX_ID]->store(
+				      row->blocking_lock_row->lock_trx_id, true));
 
 		/* blocking_lock_id */
 		OK(field_store_string(
@@ -930,7 +892,7 @@ innodb_lock_waits_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_lock_waits_fields_info;
+	schema->fields_info = Show::innodb_lock_waits_fields_info;
 	schema->fill_table = trx_i_s_common_fill_table;
 
 	DBUG_RETURN(0);
@@ -1084,23 +1046,22 @@ trx_i_s_common_fill_table(
 #endif
 }
 
+namespace Show {
 /* Fields of the dynamic table information_schema.innodb_cmp. */
-static ST_FIELD_INFO i_s_cmp_fields_info[]=
-{
-  {"page_size", 5, MYSQL_TYPE_LONG,
-   0, 0, "Compressed Page Size", SKIP_OPEN_TABLE},
-  {"compress_ops", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Total Number of Compressions", SKIP_OPEN_TABLE},
-  {"compress_ops_ok", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Total Number of Successful Compressions", SKIP_OPEN_TABLE},
-  {"compress_time", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Total Duration of Compressions, in Seconds", SKIP_OPEN_TABLE},
-  {"uncompress_ops", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Total Number of Decompressions", SKIP_OPEN_TABLE},
-  {"uncompress_time", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Total Duration of Decompressions, in Seconds", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+static ST_FIELD_INFO i_s_cmp_fields_info[] =
+{
+  Column("page_size",      SLong(5),NOT_NULL, "Compressed Page Size"),
+  Column("compress_ops",   SLong(), NOT_NULL, "Total Number of Compressions"),
+  Column("compress_ops_ok",SLong(), NOT_NULL, "Total Number of "
+                                              "Successful Compressions"),
+  Column("compress_time",  SLong(), NOT_NULL, "Total Duration of "
+                                              "Compressions, in Seconds"),
+  Column("uncompress_ops", SLong(), NOT_NULL, "Total Number of Decompressions"),
+  Column("uncompress_time",SLong(), NOT_NULL, "Total Duration of "
+                                              "Decompressions, in Seconds"),
+  CEnd(),
 };
+} // namespace Show
 
 
 /*******************************************************************//**
@@ -1201,7 +1162,7 @@ i_s_cmp_init(
 	DBUG_ENTER("i_s_cmp_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_cmp_fields_info;
+	schema->fields_info = Show::i_s_cmp_fields_info;
 	schema->fill_table = i_s_cmp_fill;
 
 	DBUG_RETURN(0);
@@ -1219,7 +1180,7 @@ i_s_cmp_reset_init(
 	DBUG_ENTER("i_s_cmp_reset_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_cmp_fields_info;
+	schema->fields_info = Show::i_s_cmp_fields_info;
 	schema->fill_table = i_s_cmp_reset_fill;
 
 	DBUG_RETURN(0);
@@ -1324,46 +1285,42 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmp_reset =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+
+namespace Show {
 /* Fields of the dynamic tables
 information_schema.innodb_cmp_per_index and
 information_schema.innodb_cmp_per_index_reset. */
 static ST_FIELD_INFO i_s_cmp_per_index_fields_info[]=
 {
 #define IDX_DATABASE_NAME	0
-  {"database_name", 192, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("database_name",   Varchar(NAME_CHAR_LEN), NOT_NULL),
 
-#define IDX_TABLE_NAME		1
-  {"table_name", 192, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+#define IDX_TABLE_NAME		1 /* FIXME: this is in my_charset_filename! */
+  Column("table_name",      Varchar(NAME_CHAR_LEN), NOT_NULL),
 
 #define IDX_INDEX_NAME		2
-  {"index_name", 192, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("index_name",      Varchar(NAME_CHAR_LEN), NOT_NULL),
 
 #define IDX_COMPRESS_OPS	3
-  {"compress_ops", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("compress_ops",    SLong(),      NOT_NULL),
 
 #define IDX_COMPRESS_OPS_OK	4
-  {"compress_ops_ok", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("compress_ops_ok", SLong(),      NOT_NULL),
 
 #define IDX_COMPRESS_TIME	5
-  {"compress_time", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("compress_time",   SLong(),      NOT_NULL),
 
 #define IDX_UNCOMPRESS_OPS	6
-  {"uncompress_ops", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("uncompress_ops",  SLong(),      NOT_NULL),
 
 #define IDX_UNCOMPRESS_TIME	7
-  {"uncompress_time", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("uncompress_time", SLong(),      NOT_NULL),
 
-  END_OF_ST_FIELD_INFO
+  CEnd()
 };
 
+} // namespace Show
+
 /*******************************************************************//**
 Fill the dynamic table
 information_schema.innodb_cmp_per_index or
@@ -1511,7 +1468,7 @@ i_s_cmp_per_index_init(
 	DBUG_ENTER("i_s_cmp_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_cmp_per_index_fields_info;
+	schema->fields_info = Show::i_s_cmp_per_index_fields_info;
 	schema->fill_table = i_s_cmp_per_index_fill;
 
 	DBUG_RETURN(0);
@@ -1529,7 +1486,7 @@ i_s_cmp_per_index_reset_init(
 	DBUG_ENTER("i_s_cmp_reset_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_cmp_per_index_fields_info;
+	schema->fields_info = Show::i_s_cmp_per_index_fields_info;
 	schema->fill_table = i_s_cmp_per_index_reset_fill;
 
 	DBUG_RETURN(0);
@@ -1634,23 +1591,21 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmp_per_index_reset =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+
+namespace Show {
 /* Fields of the dynamic table information_schema.innodb_cmpmem. */
-static ST_FIELD_INFO i_s_cmpmem_fields_info[]=
-{
-  {"page_size", 5, MYSQL_TYPE_LONG,
-   0, 0, "Buddy Block Size", SKIP_OPEN_TABLE},
-  {"buffer_pool_instance", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Buffer Pool Id", SKIP_OPEN_TABLE},
-  {"pages_used", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Currently in Use", SKIP_OPEN_TABLE},
-  {"pages_free", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Currently Available", SKIP_OPEN_TABLE},
-  {"relocation_ops", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, 0, "Total Number of Relocations", SKIP_OPEN_TABLE},
-  {"relocation_time", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "Total Duration of Relocations, in Seconds", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+static ST_FIELD_INFO i_s_cmpmem_fields_info[] =
+{
+  Column("page_size",           SLong(5), NOT_NULL, "Buddy Block Size"),
+  Column("buffer_pool_instance", SLong(), NOT_NULL, "Buffer Pool Id"),
+  Column("pages_used",           SLong(), NOT_NULL, "Currently in Use"),
+  Column("pages_free",           SLong(), NOT_NULL, "Currently Available"),
+  Column("relocation_ops",   SLonglong(), NOT_NULL, "Total Number of Relocations"),
+  Column("relocation_time",      SLong(), NOT_NULL, "Total Duration of Relocations,"
+                                                    " in Seconds"),
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Fill the dynamic table information_schema.innodb_cmpmem or
@@ -1665,7 +1620,6 @@ i_s_cmpmem_fill_low(
 	Item*		,	/*!< in: condition (ignored) */
 	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
 {
-	int		status = 0;
 	TABLE*	table	= (TABLE*) tables->table;
 
 	DBUG_ENTER("i_s_cmpmem_fill_low");
@@ -1678,57 +1632,45 @@ i_s_cmpmem_fill_low(
 
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
 
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*		buf_pool;
-		ulint			zip_free_len_local[BUF_BUDDY_SIZES_MAX + 1];
-		buf_buddy_stat_t	buddy_stat_local[BUF_BUDDY_SIZES_MAX + 1];
+	ulint			zip_free_len_local[BUF_BUDDY_SIZES_MAX + 1];
+	buf_buddy_stat_t	buddy_stat_local[BUF_BUDDY_SIZES_MAX + 1];
 
-		status	= 0;
+	/* Save buddy stats for buffer pool in local variables. */
+	mysql_mutex_lock(&buf_pool.mutex);
 
-		buf_pool = buf_pool_from_array(i);
+	for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+		zip_free_len_local[x] = (x < BUF_BUDDY_SIZES) ?
+			UT_LIST_GET_LEN(buf_pool.zip_free[x]) : 0;
 
-		/* Save buddy stats for buffer pool in local variables. */
-		buf_pool_mutex_enter(buf_pool);
-		for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+		buddy_stat_local[x] = buf_pool.buddy_stat[x];
 
-			zip_free_len_local[x] = (x < BUF_BUDDY_SIZES) ?
-				UT_LIST_GET_LEN(buf_pool->zip_free[x]) : 0;
-
-			buddy_stat_local[x] = buf_pool->buddy_stat[x];
-
-			if (reset) {
-				/* This is protected by buf_pool->mutex. */
-				buf_pool->buddy_stat[x].relocated = 0;
-				buf_pool->buddy_stat[x].relocated_usec = 0;
-			}
+		if (reset) {
+			/* This is protected by buf_pool.mutex. */
+			buf_pool.buddy_stat[x].relocated = 0;
+			buf_pool.buddy_stat[x].relocated_usec = 0;
 		}
-		buf_pool_mutex_exit(buf_pool);
+	}
 
-		for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
-			buf_buddy_stat_t*	buddy_stat;
+	mysql_mutex_unlock(&buf_pool.mutex);
 
-			buddy_stat = &buddy_stat_local[x];
+	for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+		buf_buddy_stat_t* buddy_stat = &buddy_stat_local[x];
 
-			table->field[0]->store(BUF_BUDDY_LOW << x);
-			table->field[1]->store(i, true);
-			table->field[2]->store(buddy_stat->used, true);
-			table->field[3]->store(zip_free_len_local[x], true);
-			table->field[4]->store(buddy_stat->relocated, true);
-			table->field[5]->store(
-				buddy_stat->relocated_usec / 1000000, true);
+		Field **field = table->field;
 
-			if (schema_table_store_record(thd, table)) {
-				status = 1;
-				break;
-			}
-		}
+		(*field++)->store(BUF_BUDDY_LOW << x);
+		(*field++)->store(0, true);
+		(*field++)->store(buddy_stat->used, true);
+		(*field++)->store(zip_free_len_local[x], true);
+		(*field++)->store(buddy_stat->relocated, true);
+		(*field)->store(buddy_stat->relocated_usec / 1000000, true);
 
-		if (status) {
-			break;
+		if (schema_table_store_record(thd, table)) {
+			DBUG_RETURN(1);
 		}
 	}
 
-	DBUG_RETURN(status);
+	DBUG_RETURN(0);
 }
 
 /*******************************************************************//**
@@ -1771,7 +1713,7 @@ i_s_cmpmem_init(
 	DBUG_ENTER("i_s_cmpmem_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_cmpmem_fields_info;
+	schema->fields_info = Show::i_s_cmpmem_fields_info;
 	schema->fill_table = i_s_cmpmem_fill;
 
 	DBUG_RETURN(0);
@@ -1789,7 +1731,7 @@ i_s_cmpmem_reset_init(
 	DBUG_ENTER("i_s_cmpmem_reset_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_cmpmem_fields_info;
+	schema->fields_info = Show::i_s_cmpmem_fields_info;
 	schema->fill_table = i_s_cmpmem_reset_fill;
 
 	DBUG_RETURN(0);
@@ -1894,61 +1836,75 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmpmem_reset =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+
+static const LEX_CSTRING metric_type_values[] =
+{
+	{ STRING_WITH_LEN("value") },
+	{ STRING_WITH_LEN("status_counter") },
+	{ STRING_WITH_LEN("set_owner") },
+	{ STRING_WITH_LEN("set_member") },
+	{ STRING_WITH_LEN("counter") }
+};
+
+static const TypelibBuffer<5> metric_type_values_typelib(metric_type_values);
+
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */
 static ST_FIELD_INFO innodb_metrics_fields_info[]=
 {
 #define	METRIC_NAME		0
-  {"NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME",            Varchar(NAME_LEN + 1),       NOT_NULL),
+
 #define	METRIC_SUBSYS		1
-  {"SUBSYSTEM", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("SUBSYSTEM",       Varchar(NAME_LEN + 1),       NOT_NULL),
+
 #define	METRIC_VALUE_START	2
-  {"COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("COUNT",           SLonglong(),                 NOT_NULL),
+
 #define	METRIC_MAX_VALUE_START	3
-  {"MAX_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("MAX_COUNT",       SLonglong(),                 NULLABLE),
+
 #define	METRIC_MIN_VALUE_START	4
-  {"MIN_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("MIN_COUNT",       SLonglong(),                 NULLABLE),
+
 #define	METRIC_AVG_VALUE_START	5
-  {"AVG_COUNT", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("AVG_COUNT",       Float(MAX_FLOAT_STR_LENGTH), NULLABLE),
+
 #define	METRIC_VALUE_RESET	6
-  {"COUNT_RESET", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("COUNT_RESET",     SLonglong(),                 NOT_NULL),
+
 #define	METRIC_MAX_VALUE_RESET	7
-  {"MAX_COUNT_RESET", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("MAX_COUNT_RESET", SLonglong(),                 NULLABLE),
+
 #define	METRIC_MIN_VALUE_RESET	8
-  {"MIN_COUNT_RESET", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("MIN_COUNT_RESET", SLonglong(),                 NULLABLE),
+
 #define	METRIC_AVG_VALUE_RESET	9
-  {"AVG_COUNT_RESET", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("AVG_COUNT_RESET", Float(MAX_FLOAT_STR_LENGTH), NULLABLE),
+
 #define	METRIC_START_TIME	10
-  {"TIME_ENABLED", 0, MYSQL_TYPE_DATETIME,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("TIME_ENABLED",    Datetime(0),                 NULLABLE),
+
 #define	METRIC_STOP_TIME	11
-  {"TIME_DISABLED", 0, MYSQL_TYPE_DATETIME,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("TIME_DISABLED",   Datetime(0),                 NULLABLE),
+
 #define	METRIC_TIME_ELAPSED	12
-  {"TIME_ELAPSED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("TIME_ELAPSED",    SLonglong(),                 NULLABLE),
+
 #define	METRIC_RESET_TIME	13
-  {"TIME_RESET", 0, MYSQL_TYPE_DATETIME,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("TIME_RESET",      Datetime(0),                 NULLABLE),
+
 #define	METRIC_STATUS		14
-  {"STATUS", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("ENABLED", SLong(1), NOT_NULL),
+
 #define	METRIC_TYPE		15
-  {"TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
-#define	METRIC_DESC		16
-  {"COMMENT", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("TYPE",    Enum(&metric_type_values_typelib), NOT_NULL),
 
-  END_OF_ST_FIELD_INFO
+#define	METRIC_DESC		16
+  Column("COMMENT",         Varchar(NAME_LEN + 1),       NOT_NULL),
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Fill the information schema metrics table.
@@ -2156,7 +2112,8 @@ i_s_metrics_fill(
 			if (time_diff != 0) {
 				OK(fields[METRIC_AVG_VALUE_RESET]->store(
 					static_cast<double>(
-						MONITOR_VALUE(count) / time_diff)));
+						MONITOR_VALUE(count))
+					/ time_diff));
 				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
 			} else {
 				fields[METRIC_AVG_VALUE_RESET]->set_null();
@@ -2166,7 +2123,6 @@ i_s_metrics_fill(
 			fields[METRIC_AVG_VALUE_RESET]->set_null();
 		}
 
-
 		if (MONITOR_IS_ON(count)) {
 			/* If monitor is on, the stop time will set to NULL */
 			fields[METRIC_STOP_TIME]->set_null();
@@ -2183,9 +2139,7 @@ i_s_metrics_fill(
 				fields[METRIC_RESET_TIME]->set_null();
 			}
 
-			/* Display the monitor status as "enabled" */
-			OK(field_store_string(fields[METRIC_STATUS],
-					      "enabled"));
+			OK(fields[METRIC_STATUS]->store(1, true));
 		} else {
 			if (MONITOR_FIELD(count, mon_stop_time)) {
 				OK(field_store_time_t(fields[METRIC_STOP_TIME],
@@ -2197,27 +2151,25 @@ i_s_metrics_fill(
 
 			fields[METRIC_RESET_TIME]->set_null();
 
-			OK(field_store_string(fields[METRIC_STATUS],
-					      "disabled"));
+			OK(fields[METRIC_STATUS]->store(0, true));
 		}
 
+		uint metric_type;
+
 		if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
-			OK(field_store_string(fields[METRIC_TYPE],
-					      "value"));
+			metric_type = 1; /* "value" */
 		} else if (monitor_info->monitor_type & MONITOR_EXISTING) {
-			OK(field_store_string(fields[METRIC_TYPE],
-					      "status_counter"));
+			metric_type = 2; /* "status_counter" */
 		} else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
-			OK(field_store_string(fields[METRIC_TYPE],
-					      "set_owner"));
-		} else if ( monitor_info->monitor_type & MONITOR_SET_MEMBER) {
-			OK(field_store_string(fields[METRIC_TYPE],
-					      "set_member"));
+			metric_type = 3; /* "set_owner" */
+		} else if (monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+			metric_type = 4; /* "set_member" */
 		} else {
-			OK(field_store_string(fields[METRIC_TYPE],
-					      "counter"));
+			metric_type = 5; /* "counter" */
 		}
 
+		OK(fields[METRIC_TYPE]->store(metric_type, true));
+
 		OK(schema_table_store_record(thd, table_to_fill));
 	}
 
@@ -2261,7 +2213,7 @@ innodb_metrics_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_metrics_fields_info;
+	schema->fields_info = Show::innodb_metrics_fields_info;
 	schema->fill_table = i_s_metrics_fill_table;
 
 	DBUG_RETURN(0);
@@ -2315,13 +2267,16 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_metrics =
 	INNODB_VERSION_STR,
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
+
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */
 static ST_FIELD_INFO i_s_stopword_fields_info[]=
 {
 #define STOPWORD_VALUE	0
-  {"value", TRX_ID_MAX_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("value", Varchar(TRX_ID_MAX_LEN + 1), NOT_NULL),
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Fill the dynamic table information_schema.innodb_ft_default_stopword.
@@ -2367,7 +2322,7 @@ i_s_stopword_init(
 	DBUG_ENTER("i_s_stopword_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_stopword_fields_info;
+	schema->fields_info = Show::i_s_stopword_fields_info;
 	schema->fill_table = i_s_stopword_fill;
 
 	DBUG_RETURN(0);
@@ -2422,15 +2377,16 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_default_stopword =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
 INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED */
 static ST_FIELD_INFO i_s_fts_doc_fields_info[]=
 {
 #define	I_S_FTS_DOC_ID			0
-  {"DOC_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("DOC_ID", ULonglong(), NOT_NULL),
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
@@ -2541,7 +2497,7 @@ i_s_fts_deleted_init(
 	DBUG_ENTER("i_s_fts_deleted_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fields_info = Show::i_s_fts_doc_fields_info;
 	schema->fill_table = i_s_fts_deleted_fill;
 
 	DBUG_RETURN(0);
@@ -2624,7 +2580,7 @@ i_s_fts_being_deleted_init(
 	DBUG_ENTER("i_s_fts_deleted_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fields_info = Show::i_s_fts_doc_fields_info;
 	schema->fill_table = i_s_fts_being_deleted_fill;
 
 	DBUG_RETURN(0);
@@ -2679,30 +2635,32 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_being_deleted =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
 INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
 static ST_FIELD_INFO i_s_fts_index_fields_info[]=
 {
 #define	I_S_FTS_WORD			0
-  {"WORD", FTS_MAX_WORD_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("WORD",         Varchar(FTS_MAX_WORD_LEN + 1), NOT_NULL),
+
 #define	I_S_FTS_FIRST_DOC_ID		1
-  {"FIRST_DOC_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FIRST_DOC_ID", ULonglong(),                   NOT_NULL),
+
 #define	I_S_FTS_LAST_DOC_ID		2
-  {"LAST_DOC_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("LAST_DOC_ID",  ULonglong(),                   NOT_NULL),
+
 #define	I_S_FTS_DOC_COUNT		3
-  {"DOC_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("DOC_COUNT",    ULonglong(),                   NOT_NULL),
+
 #define	I_S_FTS_ILIST_DOC_ID		4
-  {"DOC_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define	I_S_FTS_ILIST_DOC_POS		5
-  {"POSITION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("DOC_ID",       ULonglong(),                   NOT_NULL),
 
-  END_OF_ST_FIELD_INFO
+#define	I_S_FTS_ILIST_DOC_POS		5
+  Column("POSITION",     ULonglong(),                   NOT_NULL),
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Go through the Doc Node and its ilist, fill the dynamic table
@@ -2890,7 +2848,7 @@ i_s_fts_index_cache_init(
 	DBUG_ENTER("i_s_fts_index_cache_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_fts_index_fields_info;
+	schema->fields_info = Show::i_s_fts_index_fields_info;
 	schema->fill_table = i_s_fts_index_cache_fill;
 
 	DBUG_RETURN(0);
@@ -3327,7 +3285,7 @@ i_s_fts_index_table_init(
 	DBUG_ENTER("i_s_fts_index_table_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_fts_index_fields_info;
+	schema->fields_info = Show::i_s_fts_index_fields_info;
 	schema->fill_table = i_s_fts_index_table_fill;
 
 	DBUG_RETURN(0);
@@ -3382,15 +3340,20 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_index_table =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */
 static ST_FIELD_INFO i_s_fts_config_fields_info[]=
 {
 #define	FTS_CONFIG_KEY			0
-  {"KEY", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("KEY",   Varchar(NAME_LEN + 1),  NOT_NULL),
+
 #define	FTS_CONFIG_VALUE		1
-  {"VALUE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("VALUE", Varchar(NAME_LEN + 1),  NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 static const char* fts_config_key[] = {
 	FTS_OPTIMIZE_LIMIT_IN_SECS,
@@ -3521,7 +3484,7 @@ i_s_fts_config_init(
 	DBUG_ENTER("i_s_fts_config_init");
 	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_fts_config_fields_info;
+	schema->fields_info = Show::i_s_fts_config_fields_info;
 	schema->fill_table = i_s_fts_config_fill;
 
 	DBUG_RETURN(0);
@@ -3576,295 +3539,237 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_config =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. */
 static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[]=
 {
 #define IDX_BUF_STATS_POOL_ID		0
-  {"POOL_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("POOL_ID", ULong(), NOT_NULL),
+
 #define IDX_BUF_STATS_POOL_SIZE		1
-  {"POOL_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("POOL_SIZE", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_FREE_BUFFERS	2
-  {"FREE_BUFFERS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FREE_BUFFERS", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_LRU_LEN		3
-  {"DATABASE_PAGES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("DATABASE_PAGES", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_OLD_LRU_LEN	4
-  {"OLD_DATABASE_PAGES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("OLD_DATABASE_PAGES", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_FLUSH_LIST_LEN	5
-  {"MODIFIED_DATABASE_PAGES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("MODIFIED_DATABASE_PAGES", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_PENDING_ZIP	6
-  {"PENDING_DECOMPRESS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PENDING_DECOMPRESS", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_PENDING_READ	7
-  {"PENDING_READS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PENDING_READS",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_FLUSH_LRU		8
-  {"PENDING_FLUSH_LRU", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PENDING_FLUSH_LRU",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_FLUSH_LIST	9
-  {"PENDING_FLUSH_LIST", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PENDING_FLUSH_LIST", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_PAGE_YOUNG	10
-  {"PAGES_MADE_YOUNG", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PAGES_MADE_YOUNG",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_PAGE_NOT_YOUNG	11
-  {"PAGES_NOT_MADE_YOUNG", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PAGES_NOT_MADE_YOUNG",ULonglong(), NOT_NULL),
+
 #define	IDX_BUF_STATS_PAGE_YOUNG_RATE	12
-  {"PAGES_MADE_YOUNG_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PAGES_MADE_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define	IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13
-  {"PAGES_MADE_NOT_YOUNG_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PAGES_MADE_NOT_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define IDX_BUF_STATS_PAGE_READ		14
-  {"NUMBER_PAGES_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_PAGES_READ",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_PAGE_CREATED	15
-  {"NUMBER_PAGES_CREATED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_PAGES_CREATED",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_PAGE_WRITTEN	16
-  {"NUMBER_PAGES_WRITTEN", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_PAGES_WRITTEN",ULonglong(), NOT_NULL),
+
 #define	IDX_BUF_STATS_PAGE_READ_RATE	17
-  {"PAGES_READ_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PAGES_READ_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define	IDX_BUF_STATS_PAGE_CREATE_RATE	18
-  {"PAGES_CREATE_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PAGES_CREATE_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define	IDX_BUF_STATS_PAGE_WRITTEN_RATE	19
-  {"PAGES_WRITTEN_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PAGES_WRITTEN_RATE",Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define IDX_BUF_STATS_GET		20
-  {"NUMBER_PAGES_GET", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_PAGES_GET", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_HIT_RATE		21
-  {"HIT_RATE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("HIT_RATE", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_MADE_YOUNG_PCT	22
-  {"YOUNG_MAKE_PER_THOUSAND_GETS", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23
-  {"NOT_YOUNG_MAKE_PER_THOUSAND_GETS", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define IDX_BUF_STATS_READ_AHREAD	24
-  {"NUMBER_PAGES_READ_AHEAD", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NOT_YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_READ_AHEAD	24
+  Column("NUMBER_PAGES_READ_AHEAD", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_READ_AHEAD_EVICTED 25
-  {"NUMBER_READ_AHEAD_EVICTED", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_READ_AHEAD_EVICTED", ULonglong(), NOT_NULL),
+
 #define	IDX_BUF_STATS_READ_AHEAD_RATE	26
-  {"READ_AHEAD_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("READ_AHEAD_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define	IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27
-  {"READ_AHEAD_EVICTED_RATE", MAX_FLOAT_STR_LENGTH, MYSQL_TYPE_FLOAT,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("READ_AHEAD_EVICTED_RATE",Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
 #define IDX_BUF_STATS_LRU_IO_SUM	28
-  {"LRU_IO_TOTAL", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("LRU_IO_TOTAL", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_LRU_IO_CUR	29
-  {"LRU_IO_CURRENT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("LRU_IO_CURRENT", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_UNZIP_SUM		30
-  {"UNCOMPRESS_TOTAL", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("UNCOMPRESS_TOTAL",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_STATS_UNZIP_CUR		31
-  {"UNCOMPRESS_CURRENT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("UNCOMPRESS_CURRENT", ULonglong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
-/*******************************************************************//**
-Fill Information Schema table INNODB_BUFFER_POOL_STATS for a particular
-buffer pool
+/** Fill INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS
+@param[in,out]	thd	connection
+@param[in,out]	tables	tables to fill
 @return 0 on success, 1 on failure */
-static
-int
-i_s_innodb_stats_fill(
-/*==================*/
-	THD*			thd,		/*!< in: thread */
-	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
-	const buf_pool_info_t*	info)		/*!< in: buffer pool
-						information */
+static int i_s_innodb_stats_fill(THD *thd, TABLE_LIST * tables, Item *)
 {
-	TABLE*			table;
-	Field**			fields;
+	TABLE*		table;
+	Field**		fields;
+	buf_pool_info_t	info;
 
 	DBUG_ENTER("i_s_innodb_stats_fill");
 
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* Only allow the PROCESS privilege holder to access the stats */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	buf_stats_get_pool_info(&info);
+
 	table = tables->table;
 
 	fields = table->field;
 
-	OK(fields[IDX_BUF_STATS_POOL_ID]->store(
-		   info->pool_unique_id, true));
+	OK(fields[IDX_BUF_STATS_POOL_ID]->store(0, true));
 
-	OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(
-		   info->pool_size, true));
+	OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(info.pool_size, true));
 
-	OK(fields[IDX_BUF_STATS_LRU_LEN]->store(
-		   info->lru_len, true));
+	OK(fields[IDX_BUF_STATS_LRU_LEN]->store(info.lru_len, true));
 
-	OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(
-		   info->old_lru_len, true));
+	OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(info.old_lru_len, true));
 
 	OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store(
-		   info->free_list_len, true));
+		   info.free_list_len, true));
 
 	OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store(
-		   info->flush_list_len, true));
+		   info.flush_list_len, true));
 
-	OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(
-		   info->n_pend_unzip, true));
+	OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(info.n_pend_unzip, true));
 
-	OK(fields[IDX_BUF_STATS_PENDING_READ]->store(
-		   info->n_pend_reads, true));
+	OK(fields[IDX_BUF_STATS_PENDING_READ]->store(info.n_pend_reads, true));
 
 	OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store(
-		   info->n_pending_flush_lru, true));
+		   info.n_pending_flush_lru, true));
 
 	OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store(
-		   info->n_pending_flush_list, true));
+		   info.n_pending_flush_list, true));
 
 	OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store(
-		   info->n_pages_made_young, true));
+		   info.n_pages_made_young, true));
 
 	OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store(
-		   info->n_pages_not_made_young, true));
+		   info.n_pages_not_made_young, true));
 
 	OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store(
-		   info->page_made_young_rate));
+		   info.page_made_young_rate));
 
 	OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store(
-		   info->page_not_made_young_rate));
+		   info.page_not_made_young_rate));
 
-	OK(fields[IDX_BUF_STATS_PAGE_READ]->store(
-		   info->n_pages_read, true));
+	OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info.n_pages_read, true));
 
 	OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store(
-		   info->n_pages_created, true));
+		   info.n_pages_created, true));
 
 	OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(
-		   info->n_pages_written, true));
+		   info.n_pages_written, true));
 
-	OK(fields[IDX_BUF_STATS_GET]->store(
-		   info->n_page_gets, true));
+	OK(fields[IDX_BUF_STATS_GET]->store(info.n_page_gets, true));
 
 	OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store(
-		   info->pages_read_rate));
+		   info.pages_read_rate));
 
 	OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store(
-		   info->pages_created_rate));
+		   info.pages_created_rate));
 
 	OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store(
-		   info->pages_written_rate));
+		   info.pages_written_rate));
 
-	if (info->n_page_get_delta) {
-		if (info->page_read_delta <= info->n_page_get_delta) {
+	if (info.n_page_get_delta) {
+		if (info.page_read_delta <= info.n_page_get_delta) {
 			OK(fields[IDX_BUF_STATS_HIT_RATE]->store(
 				static_cast<double>(
-					1000 - (1000 * info->page_read_delta
-					/ info->n_page_get_delta))));
+					1000 - (1000 * info.page_read_delta
+					/ info.n_page_get_delta))));
 		} else {
 			OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0));
 		}
 
 		OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(
-			   1000 * info->young_making_delta
-			   / info->n_page_get_delta, true));
+			   1000 * info.young_making_delta
+			   / info.n_page_get_delta, true));
 
 		OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(
-			   1000 * info->not_young_making_delta
-			   / info->n_page_get_delta, true));
+			   1000 * info.not_young_making_delta
+			   / info.n_page_get_delta, true));
 	} else {
 		OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0, true));
 		OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0, true));
 		OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0, true));
 	}
 
-	OK(fields[IDX_BUF_STATS_READ_AHREAD]->store(
-		   info->n_ra_pages_read, true));
+	OK(fields[IDX_BUF_STATS_READ_AHEAD]->store(
+		   info.n_ra_pages_read, true));
 
 	OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store(
-		   info->n_ra_pages_evicted, true));
+		   info.n_ra_pages_evicted, true));
 
 	OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store(
-		   info->pages_readahead_rate));
+		   info.pages_readahead_rate));
 
 	OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store(
-		   info->pages_evicted_rate));
+		   info.pages_evicted_rate));
 
-	OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(
-		   info->io_sum, true));
+	OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(info.io_sum, true));
 
-	OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(
-		   info->io_cur, true));
+	OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(info.io_cur, true));
 
-	OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(
-		   info->unzip_sum, true));
+	OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(info.unzip_sum, true));
 
-	OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(
-		   info->unzip_cur, true));
+	OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(info.unzip_cur, true));
 
 	DBUG_RETURN(schema_table_store_record(thd, table));
 }
 
 /*******************************************************************//**
-This is the function that loops through each buffer pool and fetch buffer
-pool stats to information schema  table: I_S_INNODB_BUFFER_POOL_STATS
-@return 0 on success, 1 on failure */
-static
-int
-i_s_innodb_buffer_stats_fill_table(
-/*===============================*/
-	THD*		thd,		/*!< in: thread */
-	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
-	Item*		)		/*!< in: condition (ignored) */
-{
-	int			status	= 0;
-	buf_pool_info_t*	pool_info;
-
-	DBUG_ENTER("i_s_innodb_buffer_fill_general");
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
-	/* Only allow the PROCESS privilege holder to access the stats */
-	if (check_global_access(thd, PROCESS_ACL)) {
-		DBUG_RETURN(0);
-	}
-
-	pool_info = (buf_pool_info_t*) ut_zalloc_nokey(
-		srv_buf_pool_instances *  sizeof *pool_info);
-
-	/* Walk through each buffer pool */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*		buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		/* Fetch individual buffer pool info */
-		buf_stats_get_pool_info(buf_pool, i, pool_info);
-
-		status = i_s_innodb_stats_fill(thd, tables, &pool_info[i]);
-
-		/* If something goes wrong, break and return */
-		if (status) {
-			break;
-		}
-	}
-
-	ut_free(pool_info);
-
-	DBUG_RETURN(status);
-}
-
-/*******************************************************************//**
 Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS.
 @return 0 on success, 1 on failure */
 static
@@ -3879,8 +3784,8 @@ i_s_innodb_buffer_pool_stats_init(
 
 	schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
 
-	schema->fields_info = i_s_innodb_buffer_stats_fields_info;
-	schema->fill_table = i_s_innodb_buffer_stats_fill_table;
+	schema->fields_info = Show::i_s_innodb_buffer_stats_fields_info;
+	schema->fill_table = i_s_innodb_stats_fill;
 
 	DBUG_RETURN(0);
 }
@@ -3934,73 +3839,96 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_stats =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+/** These must correspond to the first values of buf_page_state */
+static const LEX_CSTRING page_state_values[] =
+{
+  { STRING_WITH_LEN("NOT_USED") },
+  { STRING_WITH_LEN("MEMORY") },
+  { STRING_WITH_LEN("REMOVE_HASH") },
+  { STRING_WITH_LEN("FILE_PAGE") },
+};
+
+static const TypelibBuffer<4> page_state_values_typelib(page_state_values);
+
+static const LEX_CSTRING io_values[] =
+{
+	{ STRING_WITH_LEN("IO_NONE") },
+	{ STRING_WITH_LEN("IO_READ") },
+	{ STRING_WITH_LEN("IO_WRITE") },
+	{ STRING_WITH_LEN("IO_PIN") }
+};
+
+
+static TypelibBuffer<4> io_values_typelib(io_values);
+
+namespace Show {
 /* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. */
 static ST_FIELD_INFO i_s_innodb_buffer_page_fields_info[]=
 {
 #define IDX_BUFFER_POOL_ID		0
-  {"POOL_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("POOL_ID", ULong(), NOT_NULL),
+
 #define IDX_BUFFER_BLOCK_ID		1
-  {"BLOCK_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("BLOCK_ID", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_SPACE		2
-  {"SPACE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("SPACE", ULong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_NUM		3
-  {"PAGE_NUMBER", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PAGE_NUMBER", ULong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_TYPE		4
-  {"PAGE_TYPE", 64, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("PAGE_TYPE", Varchar(64), NULLABLE),
+
 #define IDX_BUFFER_PAGE_FLUSH_TYPE	5
-  {"FLUSH_TYPE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FLUSH_TYPE", ULong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_FIX_COUNT	6
-  {"FIX_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FIX_COUNT", ULong(), NOT_NULL),
+
 #ifdef BTR_CUR_HASH_ADAPT
-# define IDX_BUFFER_PAGE_HASHED		7
-  {"IS_HASHED", 3, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+#define IDX_BUFFER_PAGE_HASHED		7
+  Column("IS_HASHED", SLong(1), NOT_NULL),
 #endif /* BTR_CUR_HASH_ADAPT */
 #define IDX_BUFFER_PAGE_NEWEST_MOD	7 + I_S_AHI
-  {"NEWEST_MODIFICATION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NEWEST_MODIFICATION", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_OLDEST_MOD	8 + I_S_AHI
-  {"OLDEST_MODIFICATION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("OLDEST_MODIFICATION", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_ACCESS_TIME	9 + I_S_AHI
-  {"ACCESS_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("ACCESS_TIME", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_TABLE_NAME	10 + I_S_AHI
-  {"TABLE_NAME", 1024, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("TABLE_NAME", Varchar(1024), NULLABLE),
+
 #define IDX_BUFFER_PAGE_INDEX_NAME	11 + I_S_AHI
-  {"INDEX_NAME", 1024, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE),
+
 #define IDX_BUFFER_PAGE_NUM_RECS	12 + I_S_AHI
-  {"NUMBER_RECORDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_RECORDS", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_DATA_SIZE	13 + I_S_AHI
-  {"DATA_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("DATA_SIZE", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_ZIP_SIZE	14 + I_S_AHI
-  {"COMPRESSED_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("COMPRESSED_SIZE", ULonglong(), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_STATE		15 + I_S_AHI
-  {"PAGE_STATE", 64, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("PAGE_STATE", Enum(&page_state_values_typelib), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_IO_FIX		16 + I_S_AHI
-  {"IO_FIX", 64, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_IS_OLD		17 + I_S_AHI
-  {"IS_OLD", 3, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("IS_OLD", SLong(1), NOT_NULL),
+
 #define IDX_BUFFER_PAGE_FREE_CLOCK	18 + I_S_AHI
-  {"FREE_PAGE_CLOCK", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Fill Information Schema table INNODB_BUFFER_PAGE with information
@@ -4033,38 +3961,32 @@ i_s_innodb_buffer_page_fill(
 		const buf_page_info_t*	page_info;
 		char			table_name[MAX_FULL_NAME_LEN + 1];
 		const char*		table_name_end = NULL;
-		const char*		state_str;
-		enum buf_page_state	state;
 
 		page_info = info_array + i;
 
-		state_str = NULL;
-
-		OK(fields[IDX_BUFFER_POOL_ID]->store(
-			   page_info->pool_id, true));
+		OK(fields[IDX_BUFFER_POOL_ID]->store(0, true));
 
 		OK(fields[IDX_BUFFER_BLOCK_ID]->store(
 			   page_info->block_id, true));
 
 		OK(fields[IDX_BUFFER_PAGE_SPACE]->store(
-			   page_info->space_id, true));
+			   page_info->id.space(), true));
 
 		OK(fields[IDX_BUFFER_PAGE_NUM]->store(
-			   page_info->page_num, true));
+			   page_info->id.page_no(), true));
 
 		OK(field_store_string(
 			   fields[IDX_BUFFER_PAGE_TYPE],
 			   i_s_page_type[page_info->page_type].type_str));
 
-		OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(
-			   page_info->flush_type, true));
+		OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(0, true));
 
 		OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store(
 			   page_info->fix_count, true));
 
 #ifdef BTR_CUR_HASH_ADAPT
-		OK(field_store_string(fields[IDX_BUFFER_PAGE_HASHED],
-				      page_info->hashed ? "YES" : "NO"));
+		OK(fields[IDX_BUFFER_PAGE_HASHED]->store(
+			   page_info->hashed, true));
 #endif /* BTR_CUR_HASH_ADAPT */
 
 		OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store(
@@ -4132,58 +4054,16 @@ i_s_innodb_buffer_page_fill(
 			   page_info->zip_ssize
 			   ? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize
 			   : 0, true));
-		compile_time_assert(BUF_PAGE_STATE_BITS == 3);
-		state = static_cast<enum buf_page_state>(page_info->page_state);
-
-		switch (state) {
-		/* First three states are for compression pages and
-		are not states we would get as we scan pages through
-		buffer blocks */
-		case BUF_BLOCK_POOL_WATCH:
-		case BUF_BLOCK_ZIP_PAGE:
-		case BUF_BLOCK_ZIP_DIRTY:
-			state_str = NULL;
-			break;
-		case BUF_BLOCK_NOT_USED:
-			state_str = "NOT_USED";
-			break;
-		case BUF_BLOCK_READY_FOR_USE:
-			state_str = "READY_FOR_USE";
-			break;
-		case BUF_BLOCK_FILE_PAGE:
-			state_str = "FILE_PAGE";
-			break;
-		case BUF_BLOCK_MEMORY:
-			state_str = "MEMORY";
-			break;
-		case BUF_BLOCK_REMOVE_HASH:
-			state_str = "REMOVE_HASH";
-			break;
-		};
 
-		OK(field_store_string(fields[IDX_BUFFER_PAGE_STATE],
-				      state_str));
+		OK(fields[IDX_BUFFER_PAGE_STATE]->store(
+			   1 + std::min<unsigned>(page_info->page_state,
+						  BUF_BLOCK_FILE_PAGE), true));
 
-		switch (page_info->io_fix) {
-		case BUF_IO_NONE:
-			state_str = "IO_NONE";
-			break;
-		case BUF_IO_READ:
-			state_str = "IO_READ";
-			break;
-		case BUF_IO_WRITE:
-			state_str = "IO_WRITE";
-			break;
-		case BUF_IO_PIN:
-			state_str = "IO_PIN";
-			break;
-		}
-
-		OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
-				      state_str));
+		OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(
+			   1 + page_info->io_fix, true));
 
-		OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],
-				      (page_info->is_old) ? "YES" : "NO"));
+		OK(fields[IDX_BUFFER_PAGE_IS_OLD]->store(
+			   page_info->is_old, true));
 
 		OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store(
 			   page_info->freed_page_clock, true));
@@ -4202,9 +4082,10 @@ i_s_innodb_set_page_type(
 /*=====================*/
 	buf_page_info_t*page_info,	/*!< in/out: structure to fill with
 					scanned info */
-	ulint		page_type,	/*!< in: page type */
 	const byte*	frame)		/*!< in: buffer frame */
 {
+	uint16_t page_type = fil_page_get_type(frame);
+
 	if (fil_page_type_is_index(page_type)) {
 		const page_t*	page = (const page_t*) frame;
 
@@ -4228,13 +4109,13 @@ i_s_innodb_set_page_type(
 			page_info->page_type = I_S_PAGE_TYPE_INDEX;
 		}
 
-		page_info->data_size = unsigned(page_header_get_field(
+		page_info->data_size = uint16_t(page_header_get_field(
 			page, PAGE_HEAP_TOP) - (page_is_comp(page)
 						? PAGE_NEW_SUPREMUM_END
 						: PAGE_OLD_SUPREMUM_END)
 			- page_header_get_field(page, PAGE_GARBAGE));
 
-		page_info->num_recs = page_get_n_recs(page);
+		page_info->num_recs = page_get_n_recs(page) & ((1U << 14) - 1);
 	} else if (page_type > FIL_PAGE_TYPE_LAST) {
 		/* Encountered an unknown page type */
 		page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
@@ -4243,15 +4124,7 @@ i_s_innodb_set_page_type(
 		i_s_page_type[] array */
 		ut_a(page_type == i_s_page_type[page_type].type_value);
 
-		page_info->page_type = page_type;
-	}
-
-	if (page_info->page_type == FIL_PAGE_TYPE_ZBLOB
-	    || page_info->page_type == FIL_PAGE_TYPE_ZBLOB2) {
-		page_info->page_num = mach_read_from_4(
-			frame + FIL_PAGE_OFFSET);
-		page_info->space_id = mach_read_from_4(
-			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+		page_info->page_type = page_type & 0xf;
 	}
 }
 /*******************************************************************//**
@@ -4263,61 +4136,59 @@ void
 i_s_innodb_buffer_page_get_info(
 /*============================*/
 	const buf_page_t*bpage,		/*!< in: buffer pool page to scan */
-	ulint		pool_id,	/*!< in: buffer pool id */
 	ulint		pos,		/*!< in: buffer block position in
 					buffer pool or in the LRU list */
 	buf_page_info_t*page_info)	/*!< in: zero filled info structure;
 					out: structure filled with scanned
 					info */
 {
-	ut_ad(pool_id < MAX_BUFFER_POOLS);
-
-	page_info->pool_id = pool_id;
-
 	page_info->block_id = pos;
 
-	page_info->page_state = buf_page_get_state(bpage);
-
-	/* Only fetch information for buffers that map to a tablespace,
-	that is, buffer page with state BUF_BLOCK_ZIP_PAGE,
-	BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_FILE_PAGE */
-	if (buf_page_in_file(bpage)) {
-		const byte*	frame;
-		ulint		page_type;
+	compile_time_assert(BUF_BLOCK_NOT_USED == 0);
+	compile_time_assert(BUF_BLOCK_MEMORY == 1);
+	compile_time_assert(BUF_BLOCK_REMOVE_HASH == 2);
+	compile_time_assert(BUF_BLOCK_FILE_PAGE == 3);
+	compile_time_assert(BUF_BLOCK_ZIP_PAGE == 4);
 
-		page_info->space_id = bpage->id.space();
+	auto state = bpage->state();
+	page_info->page_state= int{state} & 7;
 
-		page_info->page_num = bpage->id.page_no();
-
-		page_info->flush_type = bpage->flush_type;
+	switch (state) {
+	default:
+		page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+	case BUF_BLOCK_ZIP_PAGE:
+		const byte*	frame;
 
-		page_info->fix_count = bpage->buf_fix_count;
+		page_info->id = bpage->id();
 
-		page_info->newest_mod = bpage->newest_modification;
+		page_info->fix_count = bpage->buf_fix_count();
 
-		page_info->oldest_mod = bpage->oldest_modification;
+		page_info->oldest_mod = bpage->oldest_modification();
 
 		page_info->access_time = bpage->access_time;
 
 		page_info->zip_ssize = bpage->zip.ssize;
 
-		page_info->io_fix = bpage->io_fix;
+		page_info->io_fix = bpage->io_fix() & 3;
 
 		page_info->is_old = bpage->old;
 
 		page_info->freed_page_clock = bpage->freed_page_clock;
 
-		switch (buf_page_get_io_fix(bpage)) {
+		switch (bpage->io_fix()) {
 		case BUF_IO_NONE:
 		case BUF_IO_WRITE:
 		case BUF_IO_PIN:
 			break;
 		case BUF_IO_READ:
 			page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+			page_info->newest_mod = 0;
 			return;
 		}
 
-		if (page_info->page_state == BUF_BLOCK_FILE_PAGE) {
+		if (state == BUF_BLOCK_FILE_PAGE) {
 			const buf_block_t*block;
 
 			block = reinterpret_cast<const buf_block_t*>(bpage);
@@ -4334,38 +4205,35 @@ i_s_innodb_buffer_page_get_info(
 			frame = bpage->zip.data;
 		}
 
-		page_type = fil_page_get_type(frame);
-
-		i_s_innodb_set_page_type(page_info, page_type, frame);
-	} else {
-		page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+		page_info->newest_mod = mach_read_from_8(FIL_PAGE_LSN + frame);
+		i_s_innodb_set_page_type(page_info, frame);
 	}
 }
 
 /*******************************************************************//**
 This is the function that goes through each block of the buffer pool
 and fetch information to information schema tables: INNODB_BUFFER_PAGE.
+@param[in,out]	thd	connection
+@param[in,out]	tables	tables to fill
 @return 0 on success, 1 on failure */
-static
-int
-i_s_innodb_fill_buffer_pool(
-/*========================*/
-	THD*			thd,		/*!< in: thread */
-	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
-	buf_pool_t*		buf_pool,	/*!< in: buffer pool to scan */
-	const ulint		pool_id)	/*!< in: buffer pool id */
+static int i_s_innodb_buffer_page_fill(THD *thd, TABLE_LIST *tables, Item *)
 {
 	int			status	= 0;
 	mem_heap_t*		heap;
 
-	DBUG_ENTER("i_s_innodb_fill_buffer_pool");
+	DBUG_ENTER("i_s_innodb_buffer_page_fill");
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to user without PROCESS privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
 
 	heap = mem_heap_create(10000);
 
-	/* Go through each chunk of buffer pool. Currently, we only
-	have one single chunk for each buffer pool */
 	for (ulint n = 0;
-	     n < ut_min(buf_pool->n_chunks, buf_pool->n_chunks_new); n++) {
+	     n < ut_min(buf_pool.n_chunks, buf_pool.n_chunks_new); n++) {
 		const buf_block_t*	block;
 		ulint			n_blocks;
 		buf_page_info_t*	info_buffer;
@@ -4376,7 +4244,8 @@ i_s_innodb_fill_buffer_pool(
 		ulint			block_id = 0;
 
 		/* Get buffer block of the nth chunk */
-		block = buf_get_nth_chunk_block(buf_pool, n, &chunk_size);
+		block = buf_pool.chunks[n].blocks;
+		chunk_size = buf_pool.chunks[n].size;
 		num_page = 0;
 
 		while (chunk_size > 0) {
@@ -4397,18 +4266,18 @@ i_s_innodb_fill_buffer_pool(
 			buffer pool info printout, we are not required to
 			preserve the overall consistency, so we can
 			release mutex periodically */
-			buf_pool_mutex_enter(buf_pool);
+			mysql_mutex_lock(&buf_pool.mutex);
 
 			/* GO through each block in the chunk */
 			for (n_blocks = num_to_process; n_blocks--; block++) {
 				i_s_innodb_buffer_page_get_info(
-					&block->page, pool_id, block_id,
+					&block->page, block_id,
 					info_buffer + num_page);
 				block_id++;
 				num_page++;
 			}
 
-			buf_pool_mutex_exit(buf_pool);
+			mysql_mutex_unlock(&buf_pool.mutex);
 
 			/* Fill in information schema table with information
 			just collected from the buffer chunk scan */
@@ -4433,48 +4302,6 @@ i_s_innodb_fill_buffer_pool(
 }
 
 /*******************************************************************//**
-Fill page information for pages in InnoDB buffer pool to the
-dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE
-@return 0 on success, 1 on failure */
-static
-int
-i_s_innodb_buffer_page_fill_table(
-/*==============================*/
-	THD*		thd,		/*!< in: thread */
-	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
-	Item*		)		/*!< in: condition (ignored) */
-{
-	int	status	= 0;
-
-	DBUG_ENTER("i_s_innodb_buffer_page_fill_table");
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
-	/* deny access to user without PROCESS privilege */
-	if (check_global_access(thd, PROCESS_ACL)) {
-		DBUG_RETURN(0);
-	}
-
-	/* Walk through each buffer pool */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		/* Fetch information from pages in this buffer pool,
-		and fill the corresponding I_S table */
-		status = i_s_innodb_fill_buffer_pool(thd, tables, buf_pool, i);
-
-		/* If something wrong, break and return */
-		if (status) {
-			break;
-		}
-	}
-
-	DBUG_RETURN(status);
-}
-
-/*******************************************************************//**
 Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE.
 @return 0 on success, 1 on failure */
 static
@@ -4489,8 +4316,8 @@ i_s_innodb_buffer_page_init(
 
 	schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
 
-	schema->fields_info = i_s_innodb_buffer_page_fields_info;
-	schema->fill_table = i_s_innodb_buffer_page_fill_table;
+	schema->fields_info = Show::i_s_innodb_buffer_page_fields_info;
+	schema->fill_table = i_s_innodb_buffer_page_fill;
 
 	DBUG_RETURN(0);
 }
@@ -4544,72 +4371,73 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_page =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
-static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[]=
+namespace Show {
+static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] =
 {
 #define IDX_BUF_LRU_POOL_ID		0
-  {"POOL_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("POOL_ID", ULong(), NOT_NULL),
+
 #define IDX_BUF_LRU_POS			1
-  {"LRU_POSITION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("LRU_POSITION", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_SPACE		2
-  {"SPACE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("SPACE", ULong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_NUM		3
-  {"PAGE_NUMBER", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PAGE_NUMBER", ULong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_TYPE		4
-  {"PAGE_TYPE", 64, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("PAGE_TYPE", Varchar(64), NULLABLE),
+
 #define IDX_BUF_LRU_PAGE_FLUSH_TYPE	5
-  {"FLUSH_TYPE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FLUSH_TYPE", ULong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_FIX_COUNT	6
-  {"FIX_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FIX_COUNT", ULong(), NOT_NULL),
+
 #ifdef BTR_CUR_HASH_ADAPT
-# define IDX_BUF_LRU_PAGE_HASHED		7
-  {"IS_HASHED", 3, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+#define IDX_BUF_LRU_PAGE_HASHED		7
+  Column("IS_HASHED", SLong(1), NOT_NULL),
 #endif /* BTR_CUR_HASH_ADAPT */
 #define IDX_BUF_LRU_PAGE_NEWEST_MOD	7 + I_S_AHI
-  {"NEWEST_MODIFICATION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NEWEST_MODIFICATION",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_OLDEST_MOD	8 + I_S_AHI
-  {"OLDEST_MODIFICATION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("OLDEST_MODIFICATION",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_ACCESS_TIME	9 + I_S_AHI
-  {"ACCESS_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("ACCESS_TIME",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_TABLE_NAME	10 + I_S_AHI
-  {"TABLE_NAME", 1024, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("TABLE_NAME", Varchar(1024), NULLABLE),
+
 #define IDX_BUF_LRU_PAGE_INDEX_NAME	11 + I_S_AHI
-  {"INDEX_NAME", 1024, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE),
+
 #define IDX_BUF_LRU_PAGE_NUM_RECS	12 + I_S_AHI
-  {"NUMBER_RECORDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUMBER_RECORDS", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_DATA_SIZE	13 + I_S_AHI
-  {"DATA_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("DATA_SIZE", ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_ZIP_SIZE	14 + I_S_AHI
-  {"COMPRESSED_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("COMPRESSED_SIZE",ULonglong(), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_STATE		15 + I_S_AHI
-  {"COMPRESSED", 3, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("COMPRESSED", SLong(1), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_IO_FIX		16 + I_S_AHI
-  {"IO_FIX", 64, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL),
+
 #define IDX_BUF_LRU_PAGE_IS_OLD		17 + I_S_AHI
-  {"IS_OLD", 3, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("IS_OLD", SLong(1), NULLABLE),
+
 #define IDX_BUF_LRU_PAGE_FREE_CLOCK	18 + I_S_AHI
-  {"FREE_PAGE_CLOCK", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information
@@ -4636,38 +4464,32 @@ i_s_innodb_buf_page_lru_fill(
 		const buf_page_info_t*	page_info;
 		char			table_name[MAX_FULL_NAME_LEN + 1];
 		const char*		table_name_end = NULL;
-		const char*		state_str;
-		enum buf_page_state	state;
-
-		state_str = NULL;
 
 		page_info = info_array + i;
 
-		OK(fields[IDX_BUF_LRU_POOL_ID]->store(
-			   page_info->pool_id, true));
+		OK(fields[IDX_BUF_LRU_POOL_ID]->store(0, true));
 
 		OK(fields[IDX_BUF_LRU_POS]->store(
 			   page_info->block_id, true));
 
 		OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store(
-			   page_info->space_id, true));
+			   page_info->id.space(), true));
 
 		OK(fields[IDX_BUF_LRU_PAGE_NUM]->store(
-			   page_info->page_num, true));
+			   page_info->id.page_no(), true));
 
 		OK(field_store_string(
 			   fields[IDX_BUF_LRU_PAGE_TYPE],
 			   i_s_page_type[page_info->page_type].type_str));
 
-		OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(
-			   page_info->flush_type, true));
+		OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(0, true));
 
 		OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store(
 			   page_info->fix_count, true));
 
 #ifdef BTR_CUR_HASH_ADAPT
-		OK(field_store_string(fields[IDX_BUF_LRU_PAGE_HASHED],
-				      page_info->hashed ? "YES" : "NO"));
+		OK(fields[IDX_BUF_LRU_PAGE_HASHED]->store(
+			   page_info->hashed, true));
 #endif /* BTR_CUR_HASH_ADAPT */
 
 		OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store(
@@ -4735,51 +4557,15 @@ i_s_innodb_buf_page_lru_fill(
 			   page_info->zip_ssize
 			   ? 512 << page_info->zip_ssize : 0, true));
 
-		state = static_cast<enum buf_page_state>(page_info->page_state);
-
-		switch (state) {
-		/* Compressed page */
-		case BUF_BLOCK_ZIP_PAGE:
-		case BUF_BLOCK_ZIP_DIRTY:
-			state_str = "YES";
-			break;
-		/* Uncompressed page */
-		case BUF_BLOCK_FILE_PAGE:
-			state_str = "NO";
-			break;
-		/* We should not see following states */
-		case BUF_BLOCK_POOL_WATCH:
-		case BUF_BLOCK_READY_FOR_USE:
-		case BUF_BLOCK_NOT_USED:
-		case BUF_BLOCK_MEMORY:
-		case BUF_BLOCK_REMOVE_HASH:
-			state_str = NULL;
-			break;
-		};
-
-		OK(field_store_string(fields[IDX_BUF_LRU_PAGE_STATE],
-				      state_str));
-
-		switch (page_info->io_fix) {
-		case BUF_IO_NONE:
-			state_str = "IO_NONE";
-			break;
-		case BUF_IO_READ:
-			state_str = "IO_READ";
-			break;
-		case BUF_IO_WRITE:
-			state_str = "IO_WRITE";
-			break;
-		case BUF_IO_PIN:
-			state_str = "IO_PIN";
-			break;
-		}
+		OK(fields[IDX_BUF_LRU_PAGE_STATE]->store(
+			   page_info->page_state == BUF_BLOCK_ZIP_PAGE,
+			   true));
 
-		OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
-				      state_str));
+		OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(
+			   1 + page_info->io_fix, true));
 
-		OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IS_OLD],
-				      page_info->is_old ? "YES" : "NO"));
+		OK(fields[IDX_BUF_LRU_PAGE_IS_OLD]->store(
+			   page_info->is_old, true));
 
 		OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store(
 			   page_info->freed_page_clock, true));
@@ -4790,18 +4576,11 @@ i_s_innodb_buf_page_lru_fill(
 	DBUG_RETURN(0);
 }
 
-/*******************************************************************//**
-This is the function that goes through buffer pool's LRU list
-and fetch information to INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+/** Fill the table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@param[in]	thd		thread
+@param[in,out]	tables		tables to fill
 @return 0 on success, 1 on failure */
-static
-int
-i_s_innodb_fill_buffer_lru(
-/*=======================*/
-	THD*			thd,		/*!< in: thread */
-	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
-	buf_pool_t*		buf_pool,	/*!< in: buffer pool to scan */
-	const ulint		pool_id)	/*!< in: buffer pool id */
+static int i_s_innodb_fill_buffer_lru(THD *thd, TABLE_LIST *tables, Item *)
 {
 	int			status = 0;
 	buf_page_info_t*	info_buffer;
@@ -4811,35 +4590,36 @@ i_s_innodb_fill_buffer_lru(
 
 	DBUG_ENTER("i_s_innodb_fill_buffer_lru");
 
-	/* Obtain buf_pool mutex before allocate info_buffer, since
-	UT_LIST_GET_LEN(buf_pool->LRU) could change */
-	buf_pool_mutex_enter(buf_pool);
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
 
-	lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+	/* deny access to any users that do not hold PROCESS_ACL */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	/* Acquire the mutex before allocating info_buffer, since
+	UT_LIST_GET_LEN(buf_pool.LRU) could change */
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
 
 	/* Print error message if malloc fail */
-	info_buffer = (buf_page_info_t*) my_malloc(
-		lru_len * sizeof *info_buffer, MYF(MY_WME));
-	/* JAN: TODO: MySQL 5.7 PSI
 	info_buffer = (buf_page_info_t*) my_malloc(PSI_INSTRUMENT_ME,
-		lru_len * sizeof *info_buffer, MYF(MY_WME));
-	*/
+		lru_len * sizeof *info_buffer, MYF(MY_WME | MY_ZEROFILL));
 
 	if (!info_buffer) {
 		status = 1;
 		goto exit;
 	}
 
-	memset(info_buffer, 0, lru_len * sizeof *info_buffer);
-
 	/* Walk through Pool's LRU list and print the buffer page
 	information */
-	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	bpage = UT_LIST_GET_LAST(buf_pool.LRU);
 
 	while (bpage != NULL) {
 		/* Use the same function that collect buffer info for
 		INNODB_BUFFER_PAGE to get buffer page info */
-		i_s_innodb_buffer_page_get_info(bpage, pool_id, lru_pos,
+		i_s_innodb_buffer_page_get_info(bpage, lru_pos,
 						(info_buffer + lru_pos));
 
 		bpage = UT_LIST_GET_PREV(LRU, bpage);
@@ -4848,10 +4628,10 @@ i_s_innodb_fill_buffer_lru(
 	}
 
 	ut_ad(lru_pos == lru_len);
-	ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool->LRU));
+	ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool.LRU));
 
 exit:
-	buf_pool_mutex_exit(buf_pool);
+	mysql_mutex_unlock(&buf_pool.mutex);
 
 	if (info_buffer) {
 		status = i_s_innodb_buf_page_lru_fill(
@@ -4864,48 +4644,6 @@ exit:
 }
 
 /*******************************************************************//**
-Fill page information for pages in InnoDB buffer pool to the
-dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU
-@return 0 on success, 1 on failure */
-static
-int
-i_s_innodb_buf_page_lru_fill_table(
-/*===============================*/
-	THD*		thd,		/*!< in: thread */
-	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
-	Item*		)		/*!< in: condition (ignored) */
-{
-	int	status	= 0;
-
-	DBUG_ENTER("i_s_innodb_buf_page_lru_fill_table");
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
-	/* deny access to any users that do not hold PROCESS_ACL */
-	if (check_global_access(thd, PROCESS_ACL)) {
-		DBUG_RETURN(0);
-	}
-
-	/* Walk through each buffer pool */
-	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		/* Fetch information from pages in this buffer pool's LRU list,
-		and fill the corresponding I_S table */
-		status = i_s_innodb_fill_buffer_lru(thd, tables, buf_pool, i);
-
-		/* If something wrong, break and return */
-		if (status) {
-			break;
-		}
-	}
-
-	DBUG_RETURN(status);
-}
-
-/*******************************************************************//**
 Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
 @return 0 on success, 1 on failure */
 static
@@ -4920,8 +4658,8 @@ i_s_innodb_buffer_page_lru_init(
 
 	schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
 
-	schema->fields_info = i_s_innodb_buf_page_lru_fields_info;
-	schema->fill_table = i_s_innodb_buf_page_lru_fill_table;
+	schema->fields_info = Show::i_s_innodb_buf_page_lru_fields_info;
+	schema->fill_table = i_s_innodb_fill_buffer_lru;
 
 	DBUG_RETURN(0);
 }
@@ -4987,36 +4725,56 @@ static int i_s_common_deinit(void*)
 	DBUG_RETURN(0);
 }
 
+static const LEX_CSTRING row_format_values[] =
+{
+  { STRING_WITH_LEN("Redundant") },
+  { STRING_WITH_LEN("Compact") },
+  { STRING_WITH_LEN("Compressed") },
+  { STRING_WITH_LEN("Dynamic") }
+};
+
+static TypelibBuffer<4> row_format_values_typelib(row_format_values);
+
+static const LEX_CSTRING space_type_values[] =
+{
+	{ STRING_WITH_LEN("Single") },
+	{ STRING_WITH_LEN("System") }
+};
+
+static TypelibBuffer<2> space_type_values_typelib(space_type_values);
+
+namespace Show {
 /**  SYS_TABLES  ***************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */
 static ST_FIELD_INFO innodb_sys_tables_fields_info[]=
 {
 #define SYS_TABLES_ID			0
-  {"TABLE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
 #define SYS_TABLES_NAME			1
-  {"NAME", MAX_FULL_NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL),
+
 #define SYS_TABLES_FLAG			2
-  {"FLAG", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("FLAG", SLong(), NOT_NULL),
+
 #define SYS_TABLES_NUM_COLUMN		3
-  {"N_COLS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("N_COLS", ULong(), NOT_NULL),
+
 #define SYS_TABLES_SPACE		4
-  {"SPACE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("SPACE", ULong(), NOT_NULL),
+
 #define SYS_TABLES_ROW_FORMAT		5
-  {"ROW_FORMAT", 12, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("ROW_FORMAT", Enum(&row_format_values_typelib), NULLABLE),
+
 #define SYS_TABLES_ZIP_PAGE_SIZE	6
-  {"ZIP_PAGE_SIZE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define SYS_TABLES_SPACE_TYPE	7
-  {"SPACE_TYPE", 10, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLES_SPACE_TYPE		7
+  Column("SPACE_TYPE", Enum(&space_type_values_typelib), NULLABLE),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Populate information_schema.innodb_sys_tables table with information
@@ -5157,7 +4915,7 @@ innodb_sys_tables_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_tables_fields_info;
+	schema->fields_info = Show::innodb_sys_tables_fields_info;
 	schema->fill_table = i_s_sys_tables_fill_table;
 
 	DBUG_RETURN(0);
@@ -5212,40 +4970,41 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tables =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_TABLESTATS  ***********************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */
 static ST_FIELD_INFO innodb_sys_tablestats_fields_info[]=
 {
 #define SYS_TABLESTATS_ID		0
-  {"TABLE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
 #define SYS_TABLESTATS_NAME		1
-  {"NAME", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
 #define SYS_TABLESTATS_INIT		2
-  {"STATS_INITIALIZED", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("STATS_INITIALIZED", SLong(1), NOT_NULL), /* integer flag: table->stat_initialized */
+
 #define SYS_TABLESTATS_NROW		3
-  {"NUM_ROWS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("NUM_ROWS", ULonglong(), NOT_NULL),
+
 #define SYS_TABLESTATS_CLUST_SIZE	4
-  {"CLUST_INDEX_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("CLUST_INDEX_SIZE", ULonglong(), NOT_NULL),
+
 #define SYS_TABLESTATS_INDEX_SIZE	5
-  {"OTHER_INDEX_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("OTHER_INDEX_SIZE", ULonglong(), NOT_NULL),
+
 #define SYS_TABLESTATS_MODIFIED		6
-  {"MODIFIED_COUNTER", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("MODIFIED_COUNTER", ULonglong(), NOT_NULL),
+
 #define SYS_TABLESTATS_AUTONINC		7
-  {"AUTOINC", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("AUTOINC", ULonglong(), NOT_NULL),
+
 #define SYS_TABLESTATS_TABLE_REF_COUNT	8
-  {"REF_COUNT", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("REF_COUNT", SLong(), NOT_NULL),
 
-  END_OF_ST_FIELD_INFO
+  CEnd()
 };
+} // namespace Show
 
 /** Populate information_schema.innodb_sys_tablestats table with information
 from SYS_TABLES.
@@ -5280,10 +5039,10 @@ i_s_dict_fill_sys_tablestats(
 			~Locking() { mutex_exit(&dict_sys.mutex); }
 		} locking;
 
-		if (table->stat_initialized) {
-			OK(field_store_string(fields[SYS_TABLESTATS_INIT],
-					      "Initialized"));
+		OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized,
+						      true)); /* stored once, ahead of the if/else, so both branches share it */
 
+		if (table->stat_initialized) {
 			OK(fields[SYS_TABLESTATS_NROW]->store(
 				   table->stat_n_rows, true));
 
@@ -5297,9 +5056,6 @@ i_s_dict_fill_sys_tablestats(
 			OK(fields[SYS_TABLESTATS_MODIFIED]->store(
 				   table->stat_modified_counter, true));
 		} else {
-			OK(field_store_string(fields[SYS_TABLESTATS_INIT],
-					      "Uninitialized"));
-
 			OK(fields[SYS_TABLESTATS_NROW]->store(0, true));
 
 			OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0, true));
@@ -5414,7 +5170,7 @@ innodb_sys_tablestats_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_tablestats_fields_info;
+	schema->fields_info = Show::innodb_sys_tablestats_fields_info;
 	schema->fill_table = i_s_sys_tables_fill_table_stats;
 
 	DBUG_RETURN(0);
@@ -5469,36 +5225,38 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tablestats =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_INDEXES  **************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */
 static ST_FIELD_INFO innodb_sysindex_fields_info[]=
 {
 #define SYS_INDEX_ID		0
-  {"INDEX_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("INDEX_ID", ULonglong(), NOT_NULL),
+
 #define SYS_INDEX_NAME		1
-  {"NAME", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
 #define SYS_INDEX_TABLE_ID	2
-  {"TABLE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
 #define SYS_INDEX_TYPE		3
-  {"TYPE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("TYPE", SLong(), NOT_NULL),
+
 #define SYS_INDEX_NUM_FIELDS	4
-  {"N_FIELDS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("N_FIELDS", SLong(), NOT_NULL),
+
 #define SYS_INDEX_PAGE_NO	5
-  {"PAGE_NO", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PAGE_NO", SLong(), NOT_NULL),
+
 #define SYS_INDEX_SPACE		6
-  {"SPACE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("SPACE", SLong(), NOT_NULL),
+
 #define SYS_INDEX_MERGE_THRESHOLD 7
-  {"MERGE_THRESHOLD", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("MERGE_THRESHOLD", SLong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to populate the information_schema.innodb_sys_indexes table with
@@ -5651,7 +5409,7 @@ innodb_sys_indexes_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sysindex_fields_info;
+	schema->fields_info = Show::innodb_sysindex_fields_info;
 	schema->fill_table = i_s_sys_indexes_fill_table;
 
 	DBUG_RETURN(0);
@@ -5706,30 +5464,32 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_indexes =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_COLUMNS  **************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */
 static ST_FIELD_INFO innodb_sys_columns_fields_info[]=
 {
 #define SYS_COLUMN_TABLE_ID		0
-  {"TABLE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
 #define SYS_COLUMN_NAME		1
-  {"NAME", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
 #define SYS_COLUMN_POSITION	2
-  {"POS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("POS", ULonglong(), NOT_NULL),
+
 #define SYS_COLUMN_MTYPE		3
-  {"MTYPE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("MTYPE", SLong(), NOT_NULL),
+
 #define SYS_COLUMN__PRTYPE	4
-  {"PRTYPE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("PRTYPE", SLong(), NOT_NULL),
+
 #define SYS_COLUMN_COLUMN_LEN	5
-  {"LEN", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, 0, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("LEN", SLong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to populate the information_schema.innodb_sys_columns with
@@ -5861,7 +5621,7 @@ innodb_sys_columns_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_columns_fields_info;
+	schema->fields_info = Show::innodb_sys_columns_fields_info;
 	schema->fill_table = i_s_sys_columns_fill_table;
 
 	DBUG_RETURN(0);
@@ -5916,21 +5676,23 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_columns =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_VIRTUAL **************************************************/
 /** Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_VIRTUAL */
 static ST_FIELD_INFO innodb_sys_virtual_fields_info[]=
 {
 #define SYS_VIRTUAL_TABLE_ID		0
-  {"TABLE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
 #define SYS_VIRTUAL_POS			1
-  {"POS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("POS", ULong(), NOT_NULL),
+
 #define SYS_VIRTUAL_BASE_POS		2
-  {"BASE_POS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("BASE_POS", ULong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /** Function to populate the information_schema.innodb_sys_virtual with
 related information
@@ -6046,7 +5808,7 @@ innodb_sys_virtual_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_virtual_fields_info;
+	schema->fields_info = Show::innodb_sys_virtual_fields_info;
 	schema->fill_table = i_s_sys_virtual_fill_table;
 
 	DBUG_RETURN(0);
@@ -6100,21 +5862,25 @@ struct st_maria_plugin	i_s_innodb_sys_virtual =
 	INNODB_VERSION_STR,
 	MariaDB_PLUGIN_MATURITY_STABLE,
 };
+
+
+namespace Show {
 /**  SYS_FIELDS  ***************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */
 static ST_FIELD_INFO innodb_sys_fields_fields_info[]=
 {
 #define SYS_FIELD_INDEX_ID	0
-  {"INDEX_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("INDEX_ID", ULonglong(), NOT_NULL),
+
 #define SYS_FIELD_NAME		1
-  {"NAME", NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
 #define SYS_FIELD_POS		2
-  {"POS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("POS", ULong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to fill information_schema.innodb_sys_fields with information
@@ -6237,7 +6003,7 @@ innodb_sys_fields_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_fields_fields_info;
+	schema->fields_info = Show::innodb_sys_fields_fields_info;
 	schema->fill_table = i_s_sys_fields_fill_table;
 
 	DBUG_RETURN(0);
@@ -6292,24 +6058,29 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_fields =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_FOREIGN        ********************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */
 static ST_FIELD_INFO innodb_sys_foreign_fields_info[]=
 {
 #define SYS_FOREIGN_ID		0
-  {"ID", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("ID", Varchar(NAME_LEN + 1), NOT_NULL),
+
 #define SYS_FOREIGN_FOR_NAME	1
-  {"FOR_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("FOR_NAME", Varchar(NAME_LEN + 1), NOT_NULL),
+
 #define SYS_FOREIGN_REF_NAME	2
-  {"REF_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("REF_NAME", Varchar(NAME_LEN + 1), NOT_NULL),
+
 #define SYS_FOREIGN_NUM_COL	3
-  {"N_COLS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("N_COLS", ULong(), NOT_NULL),
+
 #define SYS_FOREIGN_TYPE	4
-  {"TYPE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("TYPE", ULong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to fill information_schema.innodb_sys_foreign with information
@@ -6429,7 +6200,7 @@ innodb_sys_foreign_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_foreign_fields_info;
+	schema->fields_info = Show::innodb_sys_foreign_fields_info;
 	schema->fill_table = i_s_sys_foreign_fill_table;
 
 	DBUG_RETURN(0);
@@ -6484,21 +6255,26 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_FOREIGN_COLS   ********************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */
 static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[]=
 {
 #define SYS_FOREIGN_COL_ID		0
-  {"ID", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("ID", Varchar(NAME_LEN + 1), NOT_NULL),
+
 #define SYS_FOREIGN_COL_FOR_NAME	1
-  {"FOR_COL_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("FOR_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
 #define SYS_FOREIGN_COL_REF_NAME	2
-  {"REF_COL_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("REF_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
 #define SYS_FOREIGN_COL_POS		3
-  {"POS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("POS", ULong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to fill information_schema.innodb_sys_foreign_cols with information
@@ -6619,7 +6395,7 @@ innodb_sys_foreign_cols_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_foreign_cols_fields_info;
+	schema->fields_info = Show::innodb_sys_foreign_cols_fields_info;
 	schema->fill_table = i_s_sys_foreign_cols_fill_table;
 
 	DBUG_RETURN(0);
@@ -6674,42 +6450,41 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign_cols =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_TABLESPACES    ********************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */
 static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[]=
 {
 #define SYS_TABLESPACES_SPACE		0
-  {"SPACE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("SPACE", ULong(), NOT_NULL),
+
 #define SYS_TABLESPACES_NAME		1
-  {"NAME", MAX_FULL_NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL),
+
 #define SYS_TABLESPACES_FLAGS		2
-  {"FLAG", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("FLAG", ULong(), NOT_NULL),
+
 #define SYS_TABLESPACES_ROW_FORMAT	3
-  {"ROW_FORMAT", 22, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("ROW_FORMAT", Varchar(22), NULLABLE),
+
 #define SYS_TABLESPACES_PAGE_SIZE	4
-  {"PAGE_SIZE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("PAGE_SIZE", ULong(), NOT_NULL),
+
 #define SYS_TABLESPACES_ZIP_PAGE_SIZE	5
-  {"ZIP_PAGE_SIZE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define SYS_TABLESPACES_SPACE_TYPE	6
-  {"SPACE_TYPE", 10, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-#define SYS_TABLESPACES_FS_BLOCK_SIZE	7
-  {"FS_BLOCK_SIZE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define SYS_TABLESPACES_FILE_SIZE	8
-  {"FILE_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define SYS_TABLESPACES_ALLOC_SIZE	9
-  {"ALLOCATED_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FS_BLOCK_SIZE	6
+  Column("FS_BLOCK_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FILE_SIZE	7
+  Column("FILE_SIZE", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESPACES_ALLOC_SIZE	8
+  Column("ALLOCATED_SIZE", ULonglong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 
 extern size_t os_file_get_fs_block_size(const char *path);
@@ -6723,7 +6498,7 @@ int
 i_s_dict_fill_sys_tablespaces(
 /*==========================*/
 	THD*		thd,		/*!< in: thread */
-	ulint		space,		/*!< in: space ID */
+	uint32_t	space,		/*!< in: space ID */
 	const char*	name,		/*!< in: tablespace name */
 	ulint		flags,		/*!< in: tablespace flags */
 	TABLE*		table_to_fill)	/*!< in/out: fill this table */
@@ -6756,10 +6531,6 @@ i_s_dict_fill_sys_tablespaces(
 
 	OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format));
 
-	OK(field_store_string(fields[SYS_TABLESPACES_SPACE_TYPE],
-			      is_system_tablespace(space)
-			      ? "System" : "Single"));
-
 	ulint cflags = fil_space_t::is_valid_flags(flags, space)
 		? flags : fsp_flags_convert_from_101(flags);
 	if (cflags == ULINT_UNDEFINED) {
@@ -6783,7 +6554,7 @@ i_s_dict_fill_sys_tablespaces(
 
 	memset(&file, 0xff, sizeof(file));
 
-	if (fil_space_t* s = fil_space_acquire_silent(space)) {
+	if (fil_space_t* s = fil_space_t::get(space)) { /* non-null result is paired with s->release() below */
 		const char *filepath = s->chain.start
 			? s->chain.start->name : NULL;
 		if (!filepath) {
@@ -6797,7 +6568,7 @@ file_done:
 		s->release();
 	}
 
-	if (file.m_total_size == static_cast<os_offset_t>(~0)) {
+	if (file.m_total_size == os_offset_t(~0)) { /* all-ones: presumably untouched since memset(&file, 0xff, ...) above -- confirm */
 		fs_block_size = 0;
 		file.m_total_size = 0;
 		file.m_alloc_size = 0;
@@ -6849,7 +6620,7 @@ i_s_sys_tablespaces_fill_table(
 	     rec = dict_getnext_system(&pcur, &mtr)) {
 
 		const char*	err_msg;
-		ulint		space;
+		uint32_t	space;
 		const char*	name;
 		ulint		flags;
 
@@ -6898,7 +6669,7 @@ innodb_sys_tablespaces_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_tablespaces_fields_info;
+	schema->fields_info = Show::innodb_sys_tablespaces_fields_info;
 	schema->fill_table = i_s_sys_tablespaces_fill_table;
 
 	DBUG_RETURN(0);
@@ -6953,17 +6724,20 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tablespaces =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  SYS_DATAFILES  ************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */
 static ST_FIELD_INFO innodb_sys_datafiles_fields_info[]=
 {
 #define SYS_DATAFILES_SPACE		0
-  {"SPACE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("SPACE", ULong(), NOT_NULL),
+
 #define SYS_DATAFILES_PATH		1
-  {"PATH", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("PATH", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information
@@ -6974,7 +6748,7 @@ int
 i_s_dict_fill_sys_datafiles(
 /*========================*/
 	THD*		thd,		/*!< in: thread */
-	ulint		space,		/*!< in: space ID */
+	uint32_t	space,		/*!< in: space ID */
 	const char*	path,		/*!< in: absolute path */
 	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
@@ -6984,7 +6758,7 @@ i_s_dict_fill_sys_datafiles(
 
 	fields = table_to_fill->field;
 
-	OK(field_store_ulint(fields[SYS_DATAFILES_SPACE], space));
+	OK(fields[SYS_DATAFILES_SPACE]->store(space, true));
 
 	OK(field_store_string(fields[SYS_DATAFILES_PATH], path));
 
@@ -7026,7 +6800,7 @@ i_s_sys_datafiles_fill_table(
 
 	while (rec) {
 		const char*	err_msg;
-		ulint		space;
+		uint32_t	space;
 		const char*	path;
 
 		/* Extract necessary information from a SYS_DATAFILES row */
@@ -7074,7 +6848,7 @@ innodb_sys_datafiles_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_datafiles_fields_info;
+	schema->fields_info = Show::innodb_sys_datafiles_fields_info;
 	schema->fill_table = i_s_sys_datafiles_fill_table;
 
 	DBUG_RETURN(0);
@@ -7129,44 +6903,44 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_datafiles =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
+namespace Show {
 /**  TABLESPACES_ENCRYPTION    ********************************************/
 /* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION */
 static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[]=
 {
 #define TABLESPACES_ENCRYPTION_SPACE	0
-  {"SPACE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("SPACE", ULong(), NOT_NULL),
+
 #define TABLESPACES_ENCRYPTION_NAME		1
-  {"NAME", MAX_FULL_NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NULLABLE),
+
 #define TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME	2
-  {"ENCRYPTION_SCHEME", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("ENCRYPTION_SCHEME", ULong(), NOT_NULL),
+
 #define TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS	3
-  {"KEYSERVER_REQUESTS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("KEYSERVER_REQUESTS", ULong(), NOT_NULL),
+
 #define TABLESPACES_ENCRYPTION_MIN_KEY_VERSION	4
-  {"MIN_KEY_VERSION", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("MIN_KEY_VERSION", ULong(), NOT_NULL),
+
 #define TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION	5
-  {"CURRENT_KEY_VERSION", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("CURRENT_KEY_VERSION", ULong(), NOT_NULL),
+
 #define TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER	6
-  {"KEY_ROTATION_PAGE_NUMBER", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("KEY_ROTATION_PAGE_NUMBER", ULonglong(), NULLABLE),
+
 #define TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER 7
-  {"KEY_ROTATION_MAX_PAGE_NUMBER", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
+  Column("KEY_ROTATION_MAX_PAGE_NUMBER", ULonglong(), NULLABLE),
+
 #define TABLESPACES_ENCRYPTION_CURRENT_KEY_ID	8
-  {"CURRENT_KEY_ID", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("CURRENT_KEY_ID", ULong(), NOT_NULL),
+
 #define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9
-  {"ROTATING_OR_FLUSHING", 1, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("ROTATING_OR_FLUSHING", SLong(1), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /**********************************************************************//**
 Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
@@ -7213,7 +6987,7 @@ i_s_dict_fill_tablespaces_encryption(
 	OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_ID]->store(
 		   status.key_id, true));
 	OK(fields[TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING]->store(
-		   status.rotating || status.flushing, true));
+			   status.rotating || status.flushing, true));
 
 	if (status.rotating) {
 		fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->set_notnull();
@@ -7255,25 +7029,29 @@ i_s_tablespaces_encryption_fill_table(
 		DBUG_RETURN(0);
 	}
 
+	int err = 0; /* result of the most recent per-tablespace fill call */
 	mutex_enter(&fil_system.mutex);
+	fil_system.freeze_space_list++; /* NOTE(review): presumably keeps space_list stable while the mutex is dropped in the loop -- confirm */
 
 	for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
 	     space; space = UT_LIST_GET_NEXT(space_list, space)) {
 		if (space->purpose == FIL_TYPE_TABLESPACE
-		    && space->acquire()) {
+		    && !space->is_stopping()) {
+			space->reacquire(); /* presumably pins the space across the unlocked call; paired with release() below */
 			mutex_exit(&fil_system.mutex);
-			if (int err = i_s_dict_fill_tablespaces_encryption(
-				    thd, space, tables->table)) {
-				space->release();
-				DBUG_RETURN(err);
-			}
+			err = i_s_dict_fill_tablespaces_encryption(
+				thd, space, tables->table);
 			mutex_enter(&fil_system.mutex);
 			space->release();
+			if (err) {
+				break; /* leave through the common exit so the counter and mutex are restored */
+			}
 		}
 	}
 
+	fil_system.freeze_space_list--; /* undo the increment made before the loop */
 	mutex_exit(&fil_system.mutex);
-	DBUG_RETURN(0);
+	DBUG_RETURN(err); /* 0 unless a fill call returned non-zero */
 }
 /*******************************************************************//**
 Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
@@ -7290,7 +7068,7 @@ innodb_tablespaces_encryption_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_tablespaces_encryption_fields_info;
+	schema->fields_info = Show::innodb_tablespaces_encryption_fields_info;
 	schema->fill_table = i_s_tablespaces_encryption_fill_table;
 
 	DBUG_RETURN(0);
@@ -7345,243 +7123,26 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_tablespaces_encryption =
 	MariaDB_PLUGIN_MATURITY_STABLE
 };
 
-/**  TABLESPACES_SCRUBBING    ********************************************/
-/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING */
-static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[]=
-{
-#define TABLESPACES_SCRUBBING_SPACE	0
-  {"SPACE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_NAME		1
-  {"NAME", MAX_FULL_NAME_LEN + 1, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_COMPRESSED	2
-  {"COMPRESSED", 1, MYSQL_TYPE_LONG, 0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED	3
-  {"LAST_SCRUB_COMPLETED", 0, MYSQL_TYPE_DATETIME,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_STARTED	4
-  {"CURRENT_SCRUB_STARTED", 0, MYSQL_TYPE_DATETIME,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS	5
-  {"CURRENT_SCRUB_ACTIVE_THREADS", MY_INT32_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER	6
-  {"CURRENT_SCRUB_PAGE_NUMBER", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER	7
-  {"CURRENT_SCRUB_MAX_PAGE_NUMBER", MY_INT64_NUM_DECIMAL_DIGITS,
-   MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-#define TABLESPACES_SCRUBBING_ON_SSD	8
-  {"ON_SSD", 1, MYSQL_TYPE_LONG, 0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
-};
-
-/**********************************************************************//**
-Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING
-with information collected by scanning SYS_TABLESPACES table and
-fil_space.
-@param[in]	thd		Thread handle
-@param[in]	space		Tablespace
-@param[in]	table_to_fill	I_S table
-@return 0 on success */
-static
-int
-i_s_dict_fill_tablespaces_scrubbing(
-	THD*		thd,
-	fil_space_t*	space,
-	TABLE*		table_to_fill)
-{
-	Field**	fields;
-        struct fil_space_scrub_status_t status;
-
-	DBUG_ENTER("i_s_dict_fill_tablespaces_scrubbing");
-
-	fields = table_to_fill->field;
-
-	fil_space_get_scrub_status(space, &status);
-
-	OK(fields[TABLESPACES_SCRUBBING_SPACE]->store(space->id, true));
-
-	OK(field_store_string(fields[TABLESPACES_SCRUBBING_NAME],
-			      space->name));
-
-	OK(fields[TABLESPACES_SCRUBBING_COMPRESSED]->store(
-		   status.compressed ? 1 : 0, true));
-
-	if (status.last_scrub_completed == 0) {
-		fields[TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED]->set_null();
-	} else {
-		fields[TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED]
-			->set_notnull();
-		OK(field_store_time_t(
-			   fields[TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED],
-			   status.last_scrub_completed));
-	}
-
-	int field_numbers[] = {
-		TABLESPACES_SCRUBBING_CURRENT_SCRUB_STARTED,
-		TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS,
-		TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER,
-		TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER };
-
-	if (status.scrubbing) {
-		for (uint i = 0; i < array_elements(field_numbers); i++) {
-			fields[field_numbers[i]]->set_notnull();
-		}
-
-		OK(field_store_time_t(
-			   fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_STARTED],
-			   status.current_scrub_started));
-		OK(fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS]
-		   ->store(status.current_scrub_active_threads, true));
-		OK(fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER]
-		   ->store(status.current_scrub_page_number, true));
-		OK(fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER]
-		   ->store(status.current_scrub_max_page_number, true));
-	} else {
-		for (uint i = 0; i < array_elements(field_numbers); i++) {
-			fields[field_numbers[i]]->set_null();
-		}
-	}
-
-	OK(fields[TABLESPACES_SCRUBBING_ON_SSD]->store(!space->is_rotational(),
-						       true));
-	OK(schema_table_store_record(thd, table_to_fill));
-
-	DBUG_RETURN(0);
-}
-/*******************************************************************//**
-Function to populate INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING table.
-Loop through each record in TABLESPACES_SCRUBBING, and extract the column
-information and fill the INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING table.
-@return 0 on success */
-static
-int
-i_s_tablespaces_scrubbing_fill_table(
-/*===========================*/
-	THD*		thd,	/*!< in: thread */
-	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	Item*		)	/*!< in: condition (not used) */
-{
-	DBUG_ENTER("i_s_tablespaces_scrubbing_fill_table");
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
-	/* deny access to user without SUPER_ACL privilege */
-	if (check_global_access(thd, SUPER_ACL)) {
-		DBUG_RETURN(0);
-	}
-
-	mutex_enter(&fil_system.mutex);
-
-	for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
-	     space; space = UT_LIST_GET_NEXT(space_list, space)) {
-		if (space->purpose == FIL_TYPE_TABLESPACE
-		    && space->acquire()) {
-			mutex_exit(&fil_system.mutex);
-			if (int err = i_s_dict_fill_tablespaces_scrubbing(
-				    thd, space, tables->table)) {
-				space->release();
-				DBUG_RETURN(err);
-			}
-			mutex_enter(&fil_system.mutex);
-			space->release();
-		}
-	}
-
-	mutex_exit(&fil_system.mutex);
-	DBUG_RETURN(0);
-}
-/*******************************************************************//**
-Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING
-@return 0 on success */
-static
-int
-innodb_tablespaces_scrubbing_init(
-/*========================*/
-	void*	p)	/*!< in/out: table schema object */
-{
-	ST_SCHEMA_TABLE*	schema;
-
-	DBUG_ENTER("innodb_tablespaces_scrubbing_init");
-
-	schema = (ST_SCHEMA_TABLE*) p;
-
-	schema->fields_info = innodb_tablespaces_scrubbing_fields_info;
-	schema->fill_table = i_s_tablespaces_scrubbing_fill_table;
-
-	DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_tablespaces_scrubbing =
-{
-	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
-	/* int */
-	MYSQL_INFORMATION_SCHEMA_PLUGIN,
-
-	/* pointer to type-specific plugin descriptor */
-	/* void* */
-	&i_s_info,
-
-	/* plugin name */
-	/* const char* */
-	"INNODB_TABLESPACES_SCRUBBING",
-
-	/* plugin author (for SHOW PLUGINS) */
-	/* const char* */
-	"Google Inc",
-
-	/* general descriptive text (for SHOW PLUGINS) */
-	/* const char* */
-	"InnoDB TABLESPACES_SCRUBBING",
-
-	/* the plugin license (PLUGIN_LICENSE_XXX) */
-	/* int */
-	PLUGIN_LICENSE_BSD,
-
-	/* the function to invoke when plugin is loaded */
-	/* int (*)(void*); */
-	innodb_tablespaces_scrubbing_init,
-
-	/* the function to invoke when plugin is unloaded */
-	/* int (*)(void*); */
-	i_s_common_deinit,
-
-	/* plugin version (for SHOW PLUGINS) */
-	/* unsigned int */
-	INNODB_VERSION_SHORT,
-
-	/* struct st_mysql_show_var* */
-	NULL,
-
-	/* struct st_mysql_sys_var** */
-	NULL,
-
-	/* Maria extension */
-	INNODB_VERSION_STR,
-	MariaDB_PLUGIN_MATURITY_STABLE
-};
-
+namespace Show {
 /**  INNODB_MUTEXES  *********************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES */
 static ST_FIELD_INFO innodb_mutexes_fields_info[]=
 {
 #define MUTEXES_NAME			0
-  {"NAME", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING, 0, 0, "", SKIP_OPEN_TABLE},
+  Column("NAME", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
+
 #define MUTEXES_CREATE_FILE		1
-  {"CREATE_FILE", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING,
-   0, 0, "", SKIP_OPEN_TABLE},
+  Column("CREATE_FILE", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
+
 #define MUTEXES_CREATE_LINE		2
-  {"CREATE_LINE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
+  Column("CREATE_LINE", ULong(), NOT_NULL),
+
 #define MUTEXES_OS_WAITS		3
-  {"OS_WAITS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+  Column("OS_WAITS", ULonglong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 /*******************************************************************//**
 Function to populate INFORMATION_SCHEMA.INNODB_MUTEXES table.
@@ -7596,9 +7157,8 @@ i_s_innodb_mutexes_fill_table(
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
 	Item*		)	/*!< in: condition (not used) */
 {
-	rw_lock_t*	lock;
 	ulint		block_lock_oswait_count = 0;
-	rw_lock_t*	block_lock = NULL;
+	const rw_lock_t* block_lock= nullptr;
 	Field**		fields = tables->table->field;
 
 	DBUG_ENTER("i_s_innodb_mutexes_fill_table");
@@ -7607,54 +7167,7 @@ i_s_innodb_mutexes_fill_table(
 	/* deny access to user without PROCESS_ACL privilege */
 	if (check_global_access(thd, PROCESS_ACL)) {
 		DBUG_RETURN(0);
-	}
-
-	// mutex_enter(&mutex_list_mutex);
-
-#ifdef JAN_TODO_FIXME
-	ib_mutex_t*	mutex;
-	ulint		block_mutex_oswait_count = 0;
-	ib_mutex_t*	block_mutex = NULL;
-	for (mutex = UT_LIST_GET_FIRST(os_mutex_list); mutex != NULL;
-	     mutex = UT_LIST_GET_NEXT(list, mutex)) {
-		if (mutex->count_os_wait == 0) {
-			continue;
-		}
-
-		if (buf_pool_is_block_mutex(mutex)) {
-			block_mutex = mutex;
-			block_mutex_oswait_count += mutex->count_os_wait;
-			continue;
-		}
-
-		OK(field_store_string(fields[MUTEXES_NAME], mutex->cmutex_name));
-		OK(field_store_string(fields[MUTEXES_CREATE_FILE],
-				      innobase_basename(mutex->cfile_name)));
-		OK(fields[MUTEXES_CREATE_LINE]->store(lock->cline, true));
-		fields[MUTEXES_CREATE_LINE]->set_notnull();
-		OK(fields[MUTEXES_OS_WAITS]->store(lock->count_os_wait, true));
-		fields[MUTEXES_OS_WAITS]->set_notnull();
-		OK(schema_table_store_record(thd, tables->table));
-	}
-
-	if (block_mutex) {
-		char buf1[IO_SIZE];
-
-		snprintf(buf1, sizeof buf1, "combined %s",
-			 innobase_basename(block_mutex->cfile_name));
-
-		OK(field_store_string(fields[MUTEXES_NAME], block_mutex->cmutex_name));
-		OK(field_store_string(fields[MUTEXES_CREATE_FILE], buf1));
-		OK(fields[MUTEXES_CREATE_LINE]->store(block_mutex->cline, true));
-		fields[MUTEXES_CREATE_LINE]->set_notnull();
-		OK(field_store_ulint(fields[MUTEXES_OS_WAITS], (longlong)block_mutex_oswait_count));
-		OK(schema_table_store_record(thd, tables->table));
-	}
-
-	mutex_exit(&mutex_list_mutex);
-#endif /* JAN_TODO_FIXME */
-
-	{
+	} else {
 		struct Locking
 		{
 			Locking() { mutex_enter(&rw_lock_list_mutex); }
@@ -7663,32 +7176,31 @@ i_s_innodb_mutexes_fill_table(
 
 		char lock_name[sizeof "buf0dump.cc:12345"];
 
-		for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
-		     lock = UT_LIST_GET_NEXT(list, lock)) {
-			if (lock->count_os_wait == 0) {
+		for (const rw_lock_t& lock : rw_lock_list) {
+			if (lock.count_os_wait == 0) {
 				continue;
 			}
 
-			if (buf_pool_is_block_lock(lock)) {
-				block_lock = lock;
-				block_lock_oswait_count += lock->count_os_wait;
+			if (buf_pool.is_block_lock(&lock)) {
+				block_lock = &lock;
+				block_lock_oswait_count += lock.count_os_wait;
 				continue;
 			}
 
 			const char* basename = innobase_basename(
-				lock->cfile_name);
+				lock.cfile_name);
 
 			snprintf(lock_name, sizeof lock_name, "%s:%u",
-				 basename, lock->cline);
+				 basename, lock.cline);
 
 			OK(field_store_string(fields[MUTEXES_NAME],
 					      lock_name));
 			OK(field_store_string(fields[MUTEXES_CREATE_FILE],
 					      basename));
-			OK(fields[MUTEXES_CREATE_LINE]->store(lock->cline,
+			OK(fields[MUTEXES_CREATE_LINE]->store(lock.cline,
 							      true));
 			fields[MUTEXES_CREATE_LINE]->set_notnull();
-			OK(fields[MUTEXES_OS_WAITS]->store(lock->count_os_wait,
+			OK(fields[MUTEXES_OS_WAITS]->store(lock.count_os_wait,
 							   true));
 			fields[MUTEXES_OS_WAITS]->set_notnull();
 			OK(schema_table_store_record(thd, tables->table));
@@ -7732,7 +7244,7 @@ innodb_mutexes_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_mutexes_fields_info;
+	schema->fields_info = Show::innodb_mutexes_fields_info;
 	schema->fill_table = i_s_innodb_mutexes_fill_table;
 
 	DBUG_RETURN(0);
@@ -7787,52 +7299,73 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_mutexes =
         MariaDB_PLUGIN_MATURITY_STABLE,
 };
 
-/**  SYS_SEMAPHORE_WAITS  ************************************************/
+namespace Show {
 /* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS */
 static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[]=
 {
-  {"THREAD_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"OBJECT_NAME", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"FILE", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"LINE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"WAIT_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"WAIT_OBJECT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"WAIT_TYPE", 16, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"HOLDER_THREAD_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"HOLDER_FILE", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"HOLDER_LINE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"CREATED_FILE", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"CREATED_LINE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"WRITER_THREAD", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"RESERVATION_MODE", 16, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"READERS", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"WAITERS_FLAG", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"LOCK_WORD", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"LAST_WRITER_FILE", OS_FILE_MAX_PATH, MYSQL_TYPE_STRING,
-   0, MY_I_S_MAYBE_NULL, "", SKIP_OPEN_TABLE},
-  {"LAST_WRITER_LINE", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  {"OS_WAIT_COUNT", MY_INT32_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG,
-   0, MY_I_S_UNSIGNED, "", SKIP_OPEN_TABLE},
-  END_OF_ST_FIELD_INFO
+	// SYS_SEMAPHORE_WAITS_THREAD_ID	0
+  Column("THREAD_ID", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_OBJECT_NAME	1
+  Column("OBJECT_NAME", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_FILE	2
+  Column("FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_LINE	3
+  Column("LINE", ULong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_WAIT_TIME	4
+  Column("WAIT_TIME", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_WAIT_OBJECT	5
+  Column("WAIT_OBJECT", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_WAIT_TYPE	6
+  Column("WAIT_TYPE", Varchar(16), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID	7
+  Column("HOLDER_THREAD_ID", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
+  Column("HOLDER_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
+  Column("HOLDER_LINE", ULong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_CREATED_FILE 10
+  Column("CREATED_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_CREATED_LINE 11
+  Column("CREATED_LINE", ULong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
+  Column("WRITER_THREAD", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
+  Column("RESERVATION_MODE", Varchar(16), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_READERS	14
+  Column("READERS", ULong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
+  Column("WAITERS_FLAG", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_LOCK_WORD	16
+  Column("LOCK_WORD", ULonglong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17
+  Column("LAST_WRITER_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+	// SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18
+  Column("LAST_WRITER_LINE", ULong(), NOT_NULL),
+
+	// SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19
+  Column("OS_WAIT_COUNT", ULong(), NOT_NULL),
+
+  CEnd()
 };
+} // namespace Show
 
 
 /*******************************************************************//**
@@ -7850,7 +7383,7 @@ innodb_sys_semaphore_waits_init(
 
 	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = innodb_sys_semaphore_waits_fields_info;
+	schema->fields_info = Show::innodb_sys_semaphore_waits_fields_info;
 	schema->fill_table = sync_arr_fill_sys_semphore_waits_table;
 
 	DBUG_RETURN(0);
diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h
index b4319c75abb..87799e7669c 100644
--- a/storage/innobase/handler/i_s.h
+++ b/storage/innobase/handler/i_s.h
@@ -63,7 +63,6 @@ extern struct st_maria_plugin	i_s_innodb_sys_datafiles;
 extern struct st_maria_plugin	i_s_innodb_mutexes;
 extern struct st_maria_plugin	i_s_innodb_sys_virtual;
 extern struct st_maria_plugin	i_s_innodb_tablespaces_encryption;
-extern struct st_maria_plugin	i_s_innodb_tablespaces_scrubbing;
 extern struct st_maria_plugin	i_s_innodb_sys_semaphore_waits;
 
 /** The latest successfully looked up innodb_fts_aux_table */
@@ -119,16 +118,6 @@ HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
 #define SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19
 
 /*******************************************************************//**
-Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
-If the value is ULINT_UNDEFINED then the field it set to NULL.
-@return	0 on success */
-int
-field_store_ulint(
-/*==============*/
-	Field*	field,	/*!< in/out: target field for storage */
-	ulint	n);	/*!< in: value to store */
-
-/*******************************************************************//**
 Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
 @return	0 on success */
 int
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index 77481dd8ad8..9e2e75ed6dd 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -28,12 +28,6 @@ Created 7/19/1997 Heikki Tuuri
 #include "sync0sync.h"
 #include "btr0sea.h"
 
-using st_::span;
-
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-my_bool	srv_ibuf_disable_background_merge;
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
 /** Number of bits describing a single page */
 #define IBUF_BITS_PER_PAGE	4
 /** The start address for an insert buffer bitmap page bitmap */
@@ -194,7 +188,7 @@ uint	ibuf_debug;
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
 /** The insert buffer control structure */
-ibuf_t*	ibuf			= NULL;
+ibuf_t	ibuf;
 
 /** @name Offsets to the per-page bits in the insert buffer bitmap */
 /* @{ */
@@ -261,17 +255,7 @@ const ulint		IBUF_MERGE_THRESHOLD = 4;
 batch, in order to merge the entries for them in the insert buffer */
 const ulint		IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA;
 
-/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
-many pages, we start to contract it in connection to inserts there, using
-non-synchronous contract */
-const ulint		IBUF_CONTRACT_ON_INSERT_NON_SYNC = 0;
-
-/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
-many pages, we start to contract it in connection to inserts there, using
-synchronous contract */
-const ulint		IBUF_CONTRACT_ON_INSERT_SYNC = 5;
-
-/** If the combined size of the ibuf trees exceeds ibuf->max_size by
+/** If the combined size of the ibuf trees exceeds ibuf.max_size by
 this many pages, we start to contract it synchronous contract, but do
 not insert */
 const ulint		IBUF_CONTRACT_DO_NOT_INSERT = 10;
@@ -348,22 +332,17 @@ ibuf_header_page_get(
 	return page;
 }
 
-/******************************************************************//**
-Gets the root page and sx-latches it.
-@return insert buffer tree root page */
-static
-page_t*
-ibuf_tree_root_get(
-/*===============*/
-	mtr_t*		mtr)	/*!< in: mtr */
+/** Acquire the change buffer root page.
+@param[in,out]  mtr     mini-transaction
+@return change buffer root page, SX-latched */
+static buf_block_t *ibuf_tree_root_get(mtr_t *mtr)
 {
 	buf_block_t*	block;
-	page_t*		root;
 
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(mutex_own(&ibuf_mutex));
 
-	mtr_sx_lock_index(ibuf->index, mtr);
+	mtr_sx_lock_index(ibuf.index, mtr);
 
 	/* only segment list access is exclusive each other */
 	block = buf_page_get(
@@ -372,13 +351,11 @@ ibuf_tree_root_get(
 
 	buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
 
-	root = buf_block_get_frame(block);
-
-	ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
-	ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
-	ut_ad(ibuf->empty == page_is_empty(root));
+	ut_ad(page_get_space_id(block->frame) == IBUF_SPACE_ID);
+	ut_ad(page_get_page_no(block->frame) == FSP_IBUF_TREE_ROOT_PAGE_NO);
+	ut_ad(ibuf.empty == page_is_empty(block->frame));
 
-	return(root);
+	return block;
 }
 
 /******************************************************************//**
@@ -387,7 +364,7 @@ void
 ibuf_close(void)
 /*============*/
 {
-	if (ibuf == NULL) {
+	if (!ibuf.index) {
 		return;
 	}
 
@@ -397,13 +374,11 @@ ibuf_close(void)
 
 	mutex_free(&ibuf_bitmap_mutex);
 
-	dict_table_t*	ibuf_table = ibuf->index->table;
-	rw_lock_free(&ibuf->index->lock);
-	dict_mem_index_free(ibuf->index);
+	dict_table_t*	ibuf_table = ibuf.index->table;
+	rw_lock_free(&ibuf.index->lock);
+	dict_mem_index_free(ibuf.index);
 	dict_mem_table_free(ibuf_table);
-
-	ut_free(ibuf);
-	ibuf = NULL;
+	ibuf.index = NULL;
 }
 
 /******************************************************************//**
@@ -417,13 +392,13 @@ ibuf_size_update(
 {
 	ut_ad(mutex_own(&ibuf_mutex));
 
-	ibuf->free_list_len = flst_get_len(root + PAGE_HEADER
+	ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
 					   + PAGE_BTR_IBUF_FREE_LIST);
 
-	ibuf->height = 1 + btr_page_get_level(root);
+	ibuf.height = 1 + btr_page_get_level(root);
 
 	/* the '1 +' is the ibuf header page */
-	ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
+	ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len);
 }
 
 /******************************************************************//**
@@ -435,19 +410,29 @@ ibuf_init_at_db_start(void)
 /*=======================*/
 {
 	page_t*		root;
-	mtr_t		mtr;
 	ulint		n_used;
-	page_t*		header_page;
-	dberr_t		error= DB_SUCCESS;
 
-	ibuf = static_cast<ibuf_t*>(ut_zalloc_nokey(sizeof(ibuf_t)));
+	ut_ad(!ibuf.index);
+	mtr_t mtr;
+	mtr.start();
+	compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
+	compile_time_assert(IBUF_SPACE_ID == 0);
+	mtr_x_lock_space(fil_system.sys_space, &mtr);
+	buf_block_t* header_page = buf_page_get(
+		page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+		0, RW_X_LATCH, &mtr);
+
+	if (!header_page) {
+		mtr.commit();
+		return DB_DECRYPTION_FAILED;
+	}
 
 	/* At startup we intialize ibuf to have a maximum of
 	CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
 	buffer pool size. Once ibuf struct is initialized this
 	value is updated with the user supplied size by calling
 	ibuf_max_size_update(). */
-	ibuf->max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
+	ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
 			  * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
 
 	mutex_create(LATCH_ID_IBUF, &ibuf_mutex);
@@ -457,26 +442,15 @@ ibuf_init_at_db_start(void)
 	mutex_create(LATCH_ID_IBUF_PESSIMISTIC_INSERT,
 		     &ibuf_pessimistic_insert_mutex);
 
-	mtr_start(&mtr);
-
-	compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
-	compile_time_assert(IBUF_SPACE_ID == 0);
-	mtr_x_lock_space(fil_system.sys_space, &mtr);
-
 	mutex_enter(&ibuf_mutex);
 
-	header_page = ibuf_header_page_get(&mtr);
-
-	if (!header_page) {
-		return (DB_DECRYPTION_FAILED);
-	}
-
-	fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
-			      &n_used, &mtr);
+	fseg_n_reserved_pages(*header_page,
+			      IBUF_HEADER + IBUF_TREE_SEG_HEADER
+			      + header_page->frame, &n_used, &mtr);
 
 	ut_ad(n_used >= 2);
 
-	ibuf->seg_size = n_used;
+	ibuf.seg_size = n_used;
 
 	{
 		buf_block_t*	block;
@@ -493,33 +467,33 @@ ibuf_init_at_db_start(void)
 	ibuf_size_update(root);
 	mutex_exit(&ibuf_mutex);
 
-	ibuf->empty = page_is_empty(root);
+	ibuf.empty = page_is_empty(root);
 	mtr.commit();
 
-	ibuf->index = dict_mem_index_create(
+	ibuf.index = dict_mem_index_create(
 		dict_mem_table_create("innodb_change_buffer",
 				      fil_system.sys_space, 1, 0, 0, 0),
 		"CLUST_IND",
 		DICT_CLUSTERED | DICT_IBUF, 1);
-	ibuf->index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
-	ibuf->index->n_uniq = REC_MAX_N_FIELDS;
-	rw_lock_create(index_tree_rw_lock_key, &ibuf->index->lock,
+	ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
+	ibuf.index->n_uniq = REC_MAX_N_FIELDS;
+	rw_lock_create(index_tree_rw_lock_key, &ibuf.index->lock,
 		       SYNC_IBUF_INDEX_TREE);
 #ifdef BTR_CUR_ADAPT
-	ibuf->index->search_info = btr_search_info_create(ibuf->index->heap);
+	ibuf.index->search_info = btr_search_info_create(ibuf.index->heap);
 #endif /* BTR_CUR_ADAPT */
-	ibuf->index->page = FSP_IBUF_TREE_ROOT_PAGE_NO;
-	ut_d(ibuf->index->cached = TRUE);
+	ibuf.index->page = FSP_IBUF_TREE_ROOT_PAGE_NO;
+	ut_d(ibuf.index->cached = TRUE);
 
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
 	if (!ibuf_dump) {
-		return error;
+		return DB_SUCCESS;
 	}
 	ib::info() << "Dumping the change buffer";
 	ibuf_mtr_start(&mtr);
 	btr_pcur_t pcur;
 	if (DB_SUCCESS == btr_pcur_open_at_index_side(
-		    true, ibuf->index, BTR_SEARCH_LEAF, &pcur,
+		    true, ibuf.index, BTR_SEARCH_LEAF, &pcur,
 		    true, 0, &mtr)) {
 		while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
 			rec_print_old(stderr, btr_pcur_get_rec(&pcur));
@@ -529,7 +503,7 @@ ibuf_init_at_db_start(void)
 	ib::info() << "Dumped the change buffer";
 #endif
 
-	return (error);
+	return DB_SUCCESS;
 }
 
 /*********************************************************************//**
@@ -543,29 +517,10 @@ ibuf_max_size_update(
 	ulint	new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
 			    * new_val) / 100;
 	mutex_enter(&ibuf_mutex);
-	ibuf->max_size = new_size;
+	ibuf.max_size = new_size;
 	mutex_exit(&ibuf_mutex);
 }
 
-
-/** Apply MLOG_IBUF_BITMAP_INIT when crash-upgrading */
-ATTRIBUTE_COLD void ibuf_bitmap_init_apply(buf_block_t* block)
-{
-	page_t*	page;
-	ulint	byte_offset;
-
-	page = buf_block_get_frame(block);
-	fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP);
-
-	/* Write all zeros to the bitmap */
-	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
-
-	byte_offset = UT_BITS_IN_BYTES(block->physical_size()
-				       * IBUF_BITS_PER_PAGE);
-
-	memset(page + IBUF_BITMAP, 0, byte_offset);
-}
-
 # ifdef UNIV_DEBUG
 /** Gets the desired bits for a given page from a bitmap page.
 @param[in]	page		bitmap page
@@ -621,7 +576,7 @@ ibuf_bitmap_page_get_bits_low(
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(bit < IBUF_BITS_PER_PAGE);
 	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
-	ut_ad(mtr_memo_contains_page(mtr, page, latch_type));
+	ut_ad(mtr->memo_contains_page_flagged(page, latch_type));
 
 	bit_offset = (page_id.page_no() & (size - 1))
 		* IBUF_BITS_PER_PAGE + bit;
@@ -645,29 +600,27 @@ ibuf_bitmap_page_get_bits_low(
 }
 
 /** Sets the desired bit for a given page in a bitmap page.
-@param[in,out]	page		bitmap page
+@tparam bit	IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param[in,out]	block		bitmap page
 @param[in]	page_id		page id whose bits to set
 @param[in]	physical_size	page size
-@param[in]	bit		IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
 @param[in]	val		value to set
 @param[in,out]	mtr		mtr containing an x-latch to the bitmap page */
-static
-void
+template<ulint bit>
+static void
 ibuf_bitmap_page_set_bits(
-	page_t*			page,
+	buf_block_t*		block,
 	const page_id_t		page_id,
 	ulint			physical_size,
-	ulint			bit,
 	ulint			val,
 	mtr_t*			mtr)
 {
 	ulint	byte_offset;
 	ulint	bit_offset;
-	ulint	map_byte;
 
-	ut_ad(bit < IBUF_BITS_PER_PAGE);
+	static_assert(bit < IBUF_BITS_PER_PAGE, "wrong bit");
 	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
-	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(mtr->is_named_space(page_id.space()));
 
 	bit_offset = (page_id.page_no() % physical_size)
@@ -678,21 +631,29 @@ ibuf_bitmap_page_set_bits(
 
 	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
 
-	map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+	byte* map_byte = &block->frame[IBUF_BITMAP + byte_offset];
+	byte b = *map_byte;
 
 	if (bit == IBUF_BITMAP_FREE) {
 		ut_ad(bit_offset + 1 < 8);
 		ut_ad(val <= 3);
-
-		map_byte = ut_bit_set_nth(map_byte, bit_offset, val / 2);
-		map_byte = ut_bit_set_nth(map_byte, bit_offset + 1, val % 2);
+		b &= static_cast<byte>(~(3U << bit_offset));
+		b |= static_cast<byte>(((val & 2) >> 1) << bit_offset
+				       | (val & 1) << (bit_offset + 1));
 	} else {
 		ut_ad(val <= 1);
-		map_byte = ut_bit_set_nth(map_byte, bit_offset, val);
+		b &= static_cast<byte>(~(1U << bit_offset));
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+		b |= static_cast<byte>(val << bit_offset);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 	}
 
-	mlog_write_ulint(page + IBUF_BITMAP + byte_offset, map_byte,
-			 MLOG_1BYTE, mtr);
+	mtr->write<1,mtr_t::MAYBE_NOP>(*block, map_byte, b);
 }
 
 /** Calculates the bitmap page number for a given page number.
@@ -701,10 +662,11 @@ ibuf_bitmap_page_set_bits(
 @return the bitmap page id where the file page is mapped */
 inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size)
 {
-	if (!size) size = srv_page_size;
+  if (!size)
+    size= srv_page_size;
 
-	return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET
-			 + (page_id.page_no() & ~(size - 1)));
+  return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET
+		   + uint32_t(page_id.page_no() & ~(size - 1)));
 }
 
 /** Gets the ibuf bitmap page where the bits describing a given file page are
@@ -718,7 +680,7 @@ stored.
 page containing the descriptor bits for the file page; the bitmap page
 is x-latched */
 static
-page_t*
+buf_block_t*
 ibuf_bitmap_get_map_page_func(
 	const page_id_t		page_id,
 	ulint			zip_size,
@@ -726,21 +688,16 @@ ibuf_bitmap_get_map_page_func(
 	unsigned		line,
 	mtr_t*			mtr)
 {
-	buf_block_t*	block = NULL;
-	dberr_t		err = DB_SUCCESS;
-
-	block = buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, zip_size),
-				 zip_size, RW_X_LATCH, NULL, BUF_GET,
-				 file, line, mtr, &err);
+	buf_block_t* block = buf_page_get_gen(
+		ibuf_bitmap_page_no_calc(page_id, zip_size),
+		zip_size, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
+		file, line, mtr);
 
-	if (err != DB_SUCCESS) {
-		return NULL;
+	if (block) {
+		buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
 	}
 
-
-	buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
-
-	return(buf_block_get_frame(block));
+	return block;
 }
 
 /** Gets the ibuf bitmap page where the bits describing a given file page are
@@ -770,31 +727,22 @@ ibuf_set_free_bits_low(
 	ulint			val,	/*!< in: value to set: < 4 */
 	mtr_t*			mtr)	/*!< in/out: mtr */
 {
-	page_t*	bitmap_page;
-	buf_frame_t* frame;
-
-	ut_ad(mtr->is_named_space(block->page.id.space()));
-
-	if (!block) {
-		return;
-	}
-
-	frame = buf_block_get_frame(block);
-
-	if (!frame || !page_is_leaf(frame)) {
+	ut_ad(mtr->is_named_space(block->page.id().space()));
+	if (!page_is_leaf(block->frame)) {
 		return;
 	}
 
-	bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
-					       block->zip_size(), mtr);
-
 #ifdef UNIV_IBUF_DEBUG
 	ut_a(val <= ibuf_index_page_calc_free(block));
 #endif /* UNIV_IBUF_DEBUG */
+	const page_id_t id(block->page.id());
 
-	ibuf_bitmap_page_set_bits(
-		bitmap_page, block->page.id, block->physical_size(),
-		IBUF_BITMAP_FREE, val, mtr);
+	if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+			id, block->zip_size(), mtr)) {
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, id, block->physical_size(),
+			val, mtr);
+	}
 }
 
 /************************************************************************//**
@@ -814,34 +762,22 @@ ibuf_set_free_bits_func(
 #endif /* UNIV_IBUF_DEBUG */
 	ulint		val)	/*!< in: value to set: < 4 */
 {
-	mtr_t	mtr;
-	page_t*	page;
-	page_t*	bitmap_page;
-
-	page = buf_block_get_frame(block);
-
-	if (!page_is_leaf(page)) {
-
+	if (!page_is_leaf(block->frame)) {
 		return;
 	}
 
-	mtr_start(&mtr);
-	const fil_space_t* space = mtr.set_named_space_id(
-		block->page.id.space());
+	mtr_t	mtr;
+	mtr.start();
+	const page_id_t id(block->page.id());
 
-	bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
-					       block->zip_size(), &mtr);
+	const fil_space_t* space = mtr.set_named_space_id(id.space());
 
-	switch (space->purpose) {
-	case FIL_TYPE_LOG:
-		ut_ad(0);
-		break;
-	case FIL_TYPE_TABLESPACE:
-		break;
-		/* fall through */
-	case FIL_TYPE_TEMPORARY:
-	case FIL_TYPE_IMPORT:
-		mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+	buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(id,
+							    block->zip_size(),
+							    &mtr);
+
+	if (space->purpose != FIL_TYPE_TABLESPACE) {
+		mtr.set_log_mode(MTR_LOG_NO_REDO);
 	}
 
 #ifdef UNIV_IBUF_DEBUG
@@ -849,33 +785,24 @@ ibuf_set_free_bits_func(
 		ulint	old_val;
 
 		old_val = ibuf_bitmap_page_get_bits(
-			bitmap_page, block->page.id,
+			bitmap_page, id,
 			IBUF_BITMAP_FREE, &mtr);
-# if 0
-		if (old_val != max_val) {
-			fprintf(stderr,
-				"Ibuf: page %lu old val %lu max val %lu\n",
-				page_get_page_no(page),
-				old_val, max_val);
-		}
-# endif
-
 		ut_a(old_val <= max_val);
 	}
-# if 0
-	fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n",
-		page_get_page_no(page), val,
-		ibuf_index_page_calc_free(block));
-# endif
 
 	ut_a(val <= ibuf_index_page_calc_free(block));
 #endif /* UNIV_IBUF_DEBUG */
 
-	ibuf_bitmap_page_set_bits(
-		bitmap_page, block->page.id, block->physical_size(),
-		IBUF_BITMAP_FREE, val, &mtr);
+	/* The bitmap page is looked up with BUF_GET_POSSIBLY_FREED, so
+	the lookup may return NULL. Skip the update in that case, like
+	ibuf_set_free_bits_low() and ibuf_update_free_bits_zip() do. */
+	if (bitmap_page) {
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, id, block->physical_size(),
+			val, &mtr);
+	}
 
-	mtr_commit(&mtr);
+	mtr.commit();
 }
 
 /************************************************************************//**
@@ -919,8 +841,8 @@ ibuf_update_free_bits_low(
 	ulint	before;
 	ulint	after;
 
-	ut_a(!buf_block_get_page_zip(block));
-	ut_ad(mtr->is_named_space(block->page.id.space()));
+	ut_a(!is_buf_block_get_page_zip(block));
+	ut_ad(mtr->is_named_space(block->page.id().space()));
 
 	before = ibuf_index_page_calc_free_bits(srv_page_size,
 						max_ins_size);
@@ -950,19 +872,10 @@ ibuf_update_free_bits_zip(
 	buf_block_t*	block,	/*!< in/out: index page */
 	mtr_t*		mtr)	/*!< in/out: mtr */
 {
-	page_t*	bitmap_page;
-	ulint	after;
-
-	ut_a(block);
-	buf_frame_t* frame = buf_block_get_frame(block);
-	ut_a(frame);
-	ut_a(page_is_leaf(frame));
-	ut_a(block->zip_size());
+	ut_ad(page_is_leaf(block->frame));
+	ut_ad(block->zip_size());
 
-	bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
-					       block->zip_size(), mtr);
-
-	after = ibuf_index_page_calc_free_zip(block);
+	ulint after = ibuf_index_page_calc_free_zip(block);
 
 	if (after == 0) {
 		/* We move the page to the front of the buffer pool LRU list:
@@ -973,9 +886,13 @@ ibuf_update_free_bits_zip(
 		buf_page_make_young(&block->page);
 	}
 
-	ibuf_bitmap_page_set_bits(
-		bitmap_page, block->page.id, block->physical_size(),
-		IBUF_BITMAP_FREE, after, mtr);
+	if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+		block->page.id(), block->zip_size(), mtr)) {
+
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, block->page.id(),
+			block->physical_size(), after, mtr);
+	}
 }
 
 /**********************************************************************//**
@@ -994,8 +911,8 @@ ibuf_update_free_bits_for_two_pages_low(
 {
 	ulint	state;
 
-	ut_ad(mtr->is_named_space(block1->page.id.space()));
-	ut_ad(block1->page.id.space() == block2->page.id.space());
+	ut_ad(mtr->is_named_space(block1->page.id().space()));
+	ut_ad(block1->page.id().space() == block2->page.id().space());
 
 	/* As we have to x-latch two random bitmap pages, we have to acquire
 	the bitmap mutex to prevent a deadlock with a similar operation
@@ -1020,8 +937,7 @@ ibuf_update_free_bits_for_two_pages_low(
 @return TRUE if a fixed address ibuf i/o page */
 inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size)
 {
-	return((page_id.space() == IBUF_SPACE_ID
-		&& page_id.page_no() == IBUF_TREE_ROOT_PAGE_NO)
+	return(page_id == page_id_t(IBUF_SPACE_ID, IBUF_TREE_ROOT_PAGE_NO)
 	       || ibuf_bitmap_page(page_id, zip_size));
 }
 
@@ -1050,7 +966,6 @@ ibuf_page_low(
 {
 	ibool	ret;
 	mtr_t	local_mtr;
-	page_t*	bitmap_page;
 
 	ut_ad(!recv_no_ibuf_operations);
 	ut_ad(x_latch || mtr == NULL);
@@ -1085,10 +1000,8 @@ ibuf_page_low(
 			zip_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH,
 			file, line, &local_mtr, &err);
 
-		bitmap_page = buf_block_get_frame(block);
-
 		ret = ibuf_bitmap_page_get_bits_low(
-			bitmap_page, page_id, zip_size,
+			block->frame, page_id, zip_size,
 			MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF);
 
 		mtr_commit(&local_mtr);
@@ -1101,10 +1014,10 @@ ibuf_page_low(
 		mtr_start(mtr);
 	}
 
-	bitmap_page = ibuf_bitmap_get_map_page_func(page_id, zip_size,
-						    file, line, mtr);
-
-	ret = ibuf_bitmap_page_get_bits(bitmap_page, page_id, zip_size,
+	ret = ibuf_bitmap_page_get_bits(ibuf_bitmap_get_map_page_func(
+						page_id, zip_size, file, line,
+						mtr)->frame,
+					page_id, zip_size,
 					IBUF_BITMAP_IBUF, mtr);
 
 	if (mtr == &local_mtr) {
@@ -1124,7 +1037,7 @@ ibuf_page_low(
 Returns the page number field of an ibuf record.
 @return page number */
 static
-ulint
+uint32_t
 ibuf_rec_get_page_no_func(
 /*======================*/
 #ifdef UNIV_DEBUG
@@ -1135,9 +1048,8 @@ ibuf_rec_get_page_no_func(
 	const byte*	field;
 	ulint		len;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(rec_get_n_fields_old(rec) > 2);
 
@@ -1163,7 +1075,7 @@ Returns the space id field of an ibuf record. For < 4.1.x format records
 returns 0.
 @return space id */
 static
-ulint
+uint32_t
 ibuf_rec_get_space_func(
 /*====================*/
 #ifdef UNIV_DEBUG
@@ -1174,8 +1086,8 @@ ibuf_rec_get_space_func(
 	const byte*	field;
 	ulint		len;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(rec_get_n_fields_old(rec) > 2);
 
@@ -1224,8 +1136,8 @@ ibuf_rec_get_info_func(
 	ulint		info_len_local;
 	ulint		counter_local;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 	fields = rec_get_n_fields_old(rec);
 	ut_a(fields > IBUF_REC_FIELD_USER);
@@ -1298,8 +1210,8 @@ ibuf_rec_get_op_type_func(
 {
 	ulint		len;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(rec_get_n_fields_old(rec) > 2);
 
@@ -1488,8 +1400,8 @@ ibuf_build_entry_from_ibuf_rec_func(
 	ulint		comp;
 	dict_index_t*	index;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 
 	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
@@ -1528,8 +1440,8 @@ ibuf_build_entry_from_ibuf_rec_func(
 		ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
 	}
 
-	index->n_core_null_bytes
-		= UT_BITS_IN_BYTES(unsigned(index->n_nullable));
+	index->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
 
 	/* Prevent an ut_ad() failure in page_zip_write_rec() by
 	adding system columns to the dummy table pointed to by the
@@ -1613,8 +1525,8 @@ ibuf_rec_get_volume_func(
 	ibuf_op_t	op;
 	ulint		info_len;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
 
@@ -1895,7 +1807,7 @@ static inline bool ibuf_data_enough_free_for_insert()
 	inserts buffered for pages that we read to the buffer pool, without
 	any risk of running out of free space in the insert buffer. */
 
-	return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height);
+	return(ibuf.free_list_len >= (ibuf.size / 2) + 3 * ibuf.height);
 }
 
 /*********************************************************************//**
@@ -1909,26 +1821,19 @@ ibuf_data_too_much_free(void)
 {
 	ut_ad(mutex_own(&ibuf_mutex));
 
-	return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height);
+	return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height);
 }
 
-/*********************************************************************//**
-Allocates a new page from the ibuf file segment and adds it to the free
-list.
-@return TRUE on success, FALSE if no space left */
-static
-ibool
-ibuf_add_free_page(void)
-/*====================*/
+/** Allocate a change buffer page.
+@retval true on success
+@retval false if no space left */
+static bool ibuf_add_free_page()
 {
 	mtr_t		mtr;
 	page_t*		header_page;
 	buf_block_t*	block;
-	page_t*		page;
-	page_t*		root;
-	page_t*		bitmap_page;
 
-	mtr_start(&mtr);
+	mtr.start();
 	/* Acquire the fsp latch before the ibuf header, obeying the latching
 	order */
 	mtr_x_lock_space(fil_system.sys_space, &mtr);
@@ -1949,44 +1854,43 @@ ibuf_add_free_page(void)
 		&mtr);
 
 	if (block == NULL) {
-		mtr_commit(&mtr);
-
-		return(FALSE);
+		mtr.commit();
+		return false;
 	}
 
 	ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
 	ibuf_enter(&mtr);
 	mutex_enter(&ibuf_mutex);
-	root = ibuf_tree_root_get(&mtr);
 
 	buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
-	page = buf_block_get_frame(block);
 
-	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST,
-			 MLOG_2BYTES, &mtr);
+	mtr.write<2>(*block, block->frame + FIL_PAGE_TYPE,
+		     FIL_PAGE_IBUF_FREE_LIST);
 
 	/* Add the page to the free list and update the ibuf size data */
 
-	flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-		      page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+	flst_add_last(ibuf_tree_root_get(&mtr),
+		      PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		      block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
 
-	ibuf->seg_size++;
-	ibuf->free_list_len++;
+	ibuf.seg_size++;
+	ibuf.free_list_len++;
 
 	/* Set the bit indicating that this page is now an ibuf tree page
 	(level 2 page) */
 
-	const page_id_t		page_id(IBUF_SPACE_ID, block->page.id.page_no());
-	bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+	const page_id_t page_id(block->page.id());
+	buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
 
 	mutex_exit(&ibuf_mutex);
 
-	ibuf_bitmap_page_set_bits(bitmap_page, page_id, srv_page_size,
-				  IBUF_BITMAP_IBUF, TRUE, &mtr);
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
+						    srv_page_size, true,
+						    &mtr);
 
 	ibuf_mtr_commit(&mtr);
 
-	return(TRUE);
+	return true;
 }
 
 /*********************************************************************//**
@@ -1999,10 +1903,6 @@ ibuf_remove_free_page(void)
 	mtr_t	mtr;
 	mtr_t	mtr2;
 	page_t*	header_page;
-	ulint	page_no;
-	page_t*	page;
-	page_t*	root;
-	page_t*	bitmap_page;
 
 	log_free_check();
 
@@ -2030,12 +1930,12 @@ ibuf_remove_free_page(void)
 
 	ibuf_mtr_start(&mtr2);
 
-	root = ibuf_tree_root_get(&mtr2);
+	buf_block_t* root = ibuf_tree_root_get(&mtr2);
 
 	mutex_exit(&ibuf_mutex);
 
-	page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-				&mtr2).page;
+	uint32_t page_no = flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
+					 + root->frame).page;
 
 	/* NOTE that we must release the latch on the ibuf tree root
 	because in fseg_free_page we access level 1 pages, and the root
@@ -2052,53 +1952,43 @@ ibuf_remove_free_page(void)
 
 	compile_time_assert(IBUF_SPACE_ID == 0);
 	fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
-		       fil_system.sys_space, page_no, true, &mtr);
+		       fil_system.sys_space, page_no, &mtr);
 
 	const page_id_t	page_id(IBUF_SPACE_ID, page_no);
 
-	ut_d(buf_page_reset_file_page_was_freed(page_id));
-
 	ibuf_enter(&mtr);
 
 	mutex_enter(&ibuf_mutex);
 
 	root = ibuf_tree_root_get(&mtr);
 
-	ut_ad(page_no == flst_get_last(root + PAGE_HEADER
-				       + PAGE_BTR_IBUF_FREE_LIST, &mtr).page);
+	ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
+				       + root->frame).page);
 
-	{
-		buf_block_t*	block;
-
-		block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
-
-		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
-
-		page = buf_block_get_frame(block);
-	}
+	buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
+	buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
 
 	/* Remove the page from the free list and update the ibuf size data */
 
-	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
-		    page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+	flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		    block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
 
 	mutex_exit(&ibuf_pessimistic_insert_mutex);
 
-	ibuf->seg_size--;
-	ibuf->free_list_len--;
+	ibuf.seg_size--;
+	ibuf.free_list_len--;
 
 	/* Set the bit indicating that this page is no more an ibuf tree page
 	(level 2 page) */
 
-	bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+	buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
 
 	mutex_exit(&ibuf_mutex);
 
-	ibuf_bitmap_page_set_bits(
-		bitmap_page, page_id, srv_page_size,
-		IBUF_BITMAP_IBUF, FALSE, &mtr);
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
+		bitmap_page, page_id, srv_page_size, false, &mtr);
 
-	ut_d(buf_page_set_file_page_was_freed(page_id));
+	buf_page_free(fil_system.sys_space, page_no, &mtr, __FILE__, __LINE__);
 
 	ibuf_mtr_commit(&mtr);
 }
@@ -2111,10 +2001,6 @@ void
 ibuf_free_excess_pages(void)
 /*========================*/
 {
-	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
-		return;
-	}
-
 	/* Free at most a few pages at a time, so that we do not delay the
 	requested service too much */
 
@@ -2158,27 +2044,27 @@ ibuf_get_merge_page_nos_func(
 #ifdef UNIV_DEBUG
 	mtr_t*		mtr,	/*!< in: mini-transaction holding rec */
 #endif /* UNIV_DEBUG */
-	ulint*		space_ids,/*!< in/out: space id's of the pages */
-	ulint*		page_nos,/*!< in/out: buffer for at least
+	uint32_t*	space_ids,/*!< in/out: space id's of the pages */
+	uint32_t*	page_nos,/*!< in/out: buffer for at least
 				IBUF_MAX_N_PAGES_MERGED many page numbers;
 				the page numbers are in an ascending order */
 	ulint*		n_stored)/*!< out: number of page numbers stored to
 				page_nos in this function */
 {
-	ulint	prev_page_no;
-	ulint	prev_space_id;
-	ulint	first_page_no;
-	ulint	first_space_id;
-	ulint	rec_page_no;
-	ulint	rec_space_id;
+	uint32_t prev_page_no;
+	uint32_t prev_space_id;
+	uint32_t first_page_no;
+	uint32_t first_space_id;
+	uint32_t rec_page_no;
+	uint32_t rec_space_id;
 	ulint	sum_volumes;
 	ulint	volume_for_page;
 	ulint	rec_volume;
 	ulint	limit;
 	ulint	n_pages;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 
 	*n_stored = 0;
@@ -2363,25 +2249,23 @@ ulint
 ibuf_get_merge_pages(
 /*=================*/
 	btr_pcur_t*	pcur,	/*!< in/out: cursor */
-	ulint		space,	/*!< in: space for which to merge */
+	uint32_t	space,	/*!< in: space for which to merge */
 	ulint		limit,	/*!< in: max page numbers to read */
-	ulint*		pages,	/*!< out: pages read */
-	ulint*		spaces,	/*!< out: spaces read */
+	uint32_t*	pages,	/*!< out: pages read */
+	uint32_t*	spaces,	/*!< out: spaces read */
 	ulint*		n_pages,/*!< out: number of pages read */
 	mtr_t*		mtr)	/*!< in: mini transaction */
 {
 	const rec_t*	rec;
 	ulint		volume = 0;
 
-	ut_a(space != ULINT_UNDEFINED);
-
 	*n_pages = 0;
 
 	while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0
 	       && ibuf_rec_get_space(mtr, rec) == space
 	       && *n_pages < limit) {
 
-		ulint	page_no = ibuf_rec_get_page_no(mtr, rec);
+		uint32_t page_no = ibuf_rec_get_page_no(mtr, rec);
 
 		if (*n_pages == 0 || pages[*n_pages - 1] != page_no) {
 			spaces[*n_pages] = space;
@@ -2397,6 +2281,135 @@ ibuf_get_merge_pages(
 	return(volume);
 }
 
+/**
+Delete a change buffer record.
+@param[in]	page_id		page identifier
+@param[in,out]	pcur		persistent cursor positioned on the record
+@param[in]	search_tuple	search key for (space,page_no)
+@param[in,out]	mtr		mini-transaction
+@return whether mtr was committed (due to pessimistic operation) */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
+		     const dtuple_t* search_tuple, mtr_t* mtr);
+
+/** Merge the change buffer to some pages: request each page (space_ids[i], page_nos[i]), i < n_stored, via buf_page_get_gen(). */
+static void ibuf_read_merge_pages(const uint32_t* space_ids,
+				  const uint32_t* page_nos, ulint n_stored)
+{
+#ifndef DBUG_OFF
+	mem_heap_t* heap = mem_heap_create(512);
+	ulint dops[IBUF_OP_COUNT];
+	memset(dops, 0, sizeof(dops));
+#endif
+
+	for (ulint i = 0; i < n_stored; i++) {
+		const ulint space_id = space_ids[i];
+		fil_space_t* s = fil_space_t::get(space_id);
+		if (!s) {
+tablespace_deleted:
+			/* The tablespace was not found: remove all
+			entries for it and skip its remaining page_nos[] */
+			ibuf_delete_for_discarded_space(space_id);
+			while (i + 1 < n_stored
+			       && space_ids[i + 1] == space_id) {
+				i++;
+			}
+			continue;
+		}
+
+		const ulint zip_size = s->zip_size(), size = s->size;
+		s->release();
+		mtr_t mtr;
+
+		if (UNIV_LIKELY(page_nos[i] < size)) {
+			mtr.start();
+			dberr_t err;
+			buf_page_get_gen(page_id_t(space_id, page_nos[i]),
+					 zip_size, RW_X_LATCH, nullptr,
+					 BUF_GET_POSSIBLY_FREED,
+					 __FILE__, __LINE__, &mtr, &err, true);
+			mtr.commit();
+			if (err == DB_TABLESPACE_DELETED) {
+				goto tablespace_deleted;
+			}
+		}
+#ifndef DBUG_OFF
+		DBUG_EXECUTE_IF("ibuf_merge_corruption", goto work_around;);
+		continue;
+
+		/* The following code works around a hang when the
+		change buffer is corrupted, likely due to the race
+		condition in crash recovery that was fixed in
+		MDEV-24449. But, it also introduces corruption by
+		itself in the following scenario:
+
+		(1) We merged buffered changes in buf_page_get_gen()
+		(2) We committed the mini-transaction
+		(3) Redo log and the page with the merged changes are written
+		(4) A write completion callback thread evicts the page.
+		(5) Other threads buffer changes for that page.
+		(6) We will wrongly discard those newly buffered changes below.
+
+		This code will be available in debug builds, so that
+		users may try to fix a shutdown hang that occurs due
+		to a corrupted change buffer. */
+
+work_around:
+		/* Prevent an infinite loop, by removing entries from
+		the change buffer also in the case the bitmap bits were
+		wrongly clear even though buffered changes exist. */
+		const dtuple_t* tuple = ibuf_search_tuple_build(
+			space_id, page_nos[i], heap);
+loop:
+		btr_pcur_t pcur;
+		ibuf_mtr_start(&mtr);
+		btr_pcur_open(ibuf.index, tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+			      &pcur, &mtr);
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+			goto done;
+		}
+
+		for (;;) {
+			ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+			const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur);
+			if (ibuf_rec_get_space(&mtr, ibuf_rec) != space_id
+			    || ibuf_rec_get_page_no(&mtr, ibuf_rec)
+			    != page_nos[i]) {
+				break;
+			}
+
+			dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+			/* Delete the record from ibuf */
+			if (ibuf_delete_rec(page_id_t(space_id, page_nos[i]),
+					    &pcur, tuple, &mtr)) {
+				/* Deletion was pessimistic and mtr
+				was committed: we start from the
+				beginning again */
+				ut_ad(mtr.has_committed());
+				goto loop;
+			}
+
+			if (btr_pcur_is_after_last_on_page(&pcur)) {
+				ibuf_mtr_commit(&mtr);
+				btr_pcur_close(&pcur);
+				goto loop;
+			}
+		}
+done:
+		ibuf_mtr_commit(&mtr);
+		btr_pcur_close(&pcur);
+		mem_heap_empty(heap);
+#endif
+	}
+
+#ifndef DBUG_OFF
+	ibuf_add_ops(ibuf.n_discarded_ops, dops);
+	mem_heap_free(heap);
+#endif
+}
+
 /*********************************************************************//**
 Contracts insert buffer trees by reading pages to the buffer pool.
 @return a lower limit for the combined size in bytes of entries which
@@ -2406,16 +2419,13 @@ static
 ulint
 ibuf_merge_pages(
 /*=============*/
-	ulint*	n_pages,	/*!< out: number of pages to which merged */
-	bool	sync)		/*!< in: true if the caller wants to wait for
-				the issued read with the highest tablespace
-				address to complete */
+	ulint*	n_pages)	/*!< out: number of pages to which merged */
 {
 	mtr_t		mtr;
 	btr_pcur_t	pcur;
 	ulint		sum_sizes;
-	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
-	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	page_nos[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	space_ids[IBUF_MAX_N_PAGES_MERGED];
 
 	*n_pages = 0;
 
@@ -2425,22 +2435,20 @@ ibuf_merge_pages(
 	position within the leaf */
 	bool available;
 
-	available = btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF,
+	available = btr_pcur_open_at_rnd_pos(ibuf.index, BTR_SEARCH_LEAF,
 					     &pcur, &mtr);
 	/* No one should make this index unavailable when server is running */
 	ut_a(available);
 
-	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
 
 	if (page_is_empty(btr_pcur_get_page(&pcur))) {
 		/* If a B-tree page is empty, it must be the root page
 		and the whole B-tree must be empty. InnoDB does not
 		allow empty B-tree pages other than the root. */
-		ut_ad(ibuf->empty);
-		ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
-		      == IBUF_SPACE_ID);
-		ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
-		      == FSP_IBUF_TREE_ROOT_PAGE_NO);
+		ut_ad(ibuf.empty);
+		ut_ad(btr_pcur_get_block(&pcur)->page.id()
+		      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
 
 		ibuf_mtr_commit(&mtr);
 		btr_pcur_close(&pcur);
@@ -2452,15 +2460,10 @@ ibuf_merge_pages(
 					    btr_pcur_get_rec(&pcur), &mtr,
 					    space_ids,
 					    page_nos, n_pages);
-#if 0 /* defined UNIV_IBUF_DEBUG */
-	fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
-		sync, *n_pages, sum_sizes);
-#endif
 	ibuf_mtr_commit(&mtr);
 	btr_pcur_close(&pcur);
 
-	buf_read_ibuf_merge_pages(
-		sync, space_ids, page_nos, *n_pages);
+	ibuf_read_merge_pages(space_ids, page_nos, *n_pages);
 
 	return(sum_sizes + 1);
 }
@@ -2480,38 +2483,35 @@ ibuf_merge_space(
 	dtuple_t*	tuple = ibuf_search_tuple_build(space, 0, heap);
 	ulint		n_pages = 0;
 
-	ut_ad(space < SRV_LOG_SPACE_FIRST_ID);
+	ut_ad(space < SRV_SPACE_ID_UPPER_BOUND);
 
 	ibuf_mtr_start(&mtr);
 
 	/* Position the cursor on the first matching record. */
 
 	btr_pcur_open(
-		ibuf->index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
+		ibuf.index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
 		&mtr);
 
 	mem_heap_free(heap);
 
-	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
 
 	ulint		sum_sizes = 0;
-	ulint		pages[IBUF_MAX_N_PAGES_MERGED];
-	ulint		spaces[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	pages[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	spaces[IBUF_MAX_N_PAGES_MERGED];
 
 	if (page_is_empty(btr_pcur_get_page(&pcur))) {
 		/* If a B-tree page is empty, it must be the root page
 		and the whole B-tree must be empty. InnoDB does not
 		allow empty B-tree pages other than the root. */
-		ut_ad(ibuf->empty);
-		ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
-		      == IBUF_SPACE_ID);
-		ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
-		      == FSP_IBUF_TREE_ROOT_PAGE_NO);
-
+		ut_ad(ibuf.empty);
+		ut_ad(btr_pcur_get_block(&pcur)->page.id()
+		      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
 	} else {
 
 		sum_sizes = ibuf_get_merge_pages(
-			&pcur, space, IBUF_MAX_N_PAGES_MERGED,
+			&pcur, uint32_t(space), IBUF_MAX_N_PAGES_MERGED,
 			&pages[0], &spaces[0], &n_pages,
 			&mtr);
 		ib::info() << "Size of pages merged " << sum_sizes;
@@ -2530,8 +2530,7 @@ ibuf_merge_space(
 		}
 #endif /* UNIV_DEBUG */
 
-		buf_read_ibuf_merge_pages(
-			true, spaces, pages, n_pages);
+		ibuf_read_merge_pages(spaces, pages, n_pages);
 	}
 
 	return(n_pages);
@@ -2544,108 +2543,66 @@ the issued reads to complete
 @return a lower limit for the combined size in bytes of entries which
 will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
-static MY_ATTRIBUTE((warn_unused_result))
-ulint
-ibuf_merge(
-	ulint*		n_pages,
-	bool		sync)
+MY_ATTRIBUTE((warn_unused_result))
+static ulint ibuf_merge(ulint* n_pages)
 {
 	*n_pages = 0;
 
-	/* We perform a dirty read of ibuf->empty, without latching
+	/* We perform a dirty read of ibuf.empty, without latching
 	the insert buffer root page. We trust this dirty read except
 	when a slow shutdown is being executed. During a slow
 	shutdown, the insert buffer merge must be completed. */
 
-	if (ibuf->empty && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+	if (ibuf.empty && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
 		return(0);
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
 	} else if (ibuf_debug) {
 		return(0);
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 	} else {
-		return(ibuf_merge_pages(n_pages, sync));
+		return ibuf_merge_pages(n_pages);
 	}
 }
 
 /** Contract the change buffer by reading pages to the buffer pool.
-@param[in]	sync	whether the caller waits for
-the issued reads to complete
 @return a lower limit for the combined size in bytes of entries which
 will be merged from ibuf trees to the pages read, 0 if ibuf is empty */
-static
-ulint
-ibuf_contract(
-	bool	sync)
+static ulint ibuf_contract()
 {
-	ulint	n_pages;
-
-	return(ibuf_merge_pages(&n_pages, sync));
+	ulint n_pages;
+	return ibuf_merge_pages(&n_pages);
 }
 
 /** Contract the change buffer by reading pages to the buffer pool.
-@param[in]	full		If true, do a full contraction based
-on PCT_IO(100). If false, the size of contract batch is determined
-based on the current size of the change buffer.
 @return a lower limit for the combined size in bytes of entries which
 will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
-ulint
-ibuf_merge_in_background(
-	bool	full)
+ulint ibuf_merge_all()
 {
-	ulint	sum_bytes	= 0;
-	ulint	sum_pages	= 0;
-	ulint	n_pag2;
-	ulint	n_pages;
-
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-	if (srv_ibuf_disable_background_merge) {
-		return(0);
-	}
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
-	if (full) {
-		/* Caller has requested a full batch */
-		n_pages = PCT_IO(100);
-	} else {
-		/* By default we do a batch of 5% of the io_capacity */
-		n_pages = PCT_IO(5);
-
-		mutex_enter(&ibuf_mutex);
-
-		/* If the ibuf->size is more than half the max_size
-		then we make more agreesive contraction.
-		+1 is to avoid division by zero. */
-		if (ibuf->size > ibuf->max_size / 2) {
-			ulint diff = ibuf->size - ibuf->max_size / 2;
-			n_pages += PCT_IO((diff * 100)
-					   / (ibuf->max_size + 1));
-		}
-
-		mutex_exit(&ibuf_mutex);
-	}
-
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
 	if (ibuf_debug) {
 		return(0);
 	}
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
-	while (sum_pages < n_pages) {
-		ulint	n_bytes;
+	ulint	sum_bytes	= 0;
+	ulint	n_pages = srv_io_capacity;
 
-		n_bytes = ibuf_merge(&n_pag2, false);
+	/* Merge in batches until at least srv_io_capacity pages have been
+	requested or ibuf_merge() reports that nothing is left to merge. */
+	for (ulint sum_pages = 0; sum_pages < n_pages; ) {
+		ulint n_pag2;
+		ulint n_bytes = ibuf_merge(&n_pag2);
 
 		if (n_bytes == 0) {
-			return(sum_bytes);
+			break;
 		}
 
 		sum_bytes += n_bytes;
 		sum_pages += n_pag2;
 	}
 
-	return(sum_bytes);
+	return sum_bytes;
 }
 
 /*********************************************************************//**
@@ -2657,34 +2611,23 @@ ibuf_contract_after_insert(
 	ulint	entry_size)	/*!< in: size of a record which was inserted
 				into an ibuf tree */
 {
-	ibool	sync;
-	ulint	sum_sizes;
-	ulint	size;
-	ulint	max_size;
-
-	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
-	reduce ibuf_mutex contention. ibuf->max_size remains constant
-	after ibuf_init_at_db_start(), but ibuf->size should be
-	protected by ibuf_mutex. Given that ibuf->size fits in a
+	/* Perform dirty reads of ibuf.size and ibuf.max_size, to
+	reduce ibuf_mutex contention. ibuf.max_size remains constant
+	after ibuf_init_at_db_start(), but ibuf.size should be
+	protected by ibuf_mutex. Given that ibuf.size fits in a
 	machine word, this should be OK; at worst we are doing some
 	excessive ibuf_contract() or occasionally skipping a
 	ibuf_contract(). */
-	size = ibuf->size;
-	max_size = ibuf->max_size;
-
-	if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+	if (ibuf.size < ibuf.max_size) {
 		return;
 	}
 
-	sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
-
 	/* Contract at least entry_size many bytes */
-	sum_sizes = 0;
-	size = 1;
+	ulint sum_sizes = 0;
+	ulint size;
 
 	do {
-
-		size = ibuf_contract(sync);
+		size = ibuf_contract();
 		sum_sizes += size;
 	} while (size > 0 && sum_sizes < entry_size);
 }
@@ -2744,8 +2687,8 @@ ibuf_get_volume_buffered_count_func(
 	const byte*	types;
 	ulint		n_fields;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(ibuf_inside(mtr));
 
 	n_fields = rec_get_n_fields_old(rec);
@@ -2872,9 +2815,7 @@ ibuf_get_volume_buffered(
 	ulint		volume;
 	const rec_t*	rec;
 	const page_t*	page;
-	ulint		prev_page_no;
 	const page_t*	prev_page;
-	ulint		next_page_no;
 	const page_t*	next_page;
 	/* bitmap of buffered recs */
 	ulint		hash_bitmap[128 / sizeof(ulint)];
@@ -2893,12 +2834,14 @@ ibuf_get_volume_buffered(
 
 	rec = btr_pcur_get_rec(pcur);
 	page = page_align(rec);
-	ut_ad(page_validate(page, ibuf->index));
+	ut_ad(page_validate(page, ibuf.index));
 
 	if (page_rec_is_supremum(rec)) {
 		rec = page_rec_get_prev_const(rec);
 	}
 
+	uint32_t prev_page_no;
+
 	for (; !page_rec_is_infimum(rec);
 	     rec = page_rec_get_prev_const(rec)) {
 		ut_ad(page_align(rec) == page);
@@ -2933,11 +2876,14 @@ ibuf_get_volume_buffered(
 		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
 
 		prev_page = buf_block_get_frame(block);
-		ut_ad(page_validate(prev_page, ibuf->index));
+		ut_ad(page_validate(prev_page, ibuf.index));
 	}
 
 #ifdef UNIV_BTR_DEBUG
-	ut_a(!memcmp(prev_page + FIL_PAGE_NEXT, page + FIL_PAGE_OFFSET, 4));
+	static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
+	static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+	ut_a(!memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT,
+				page + FIL_PAGE_OFFSET, 4));
 #endif /* UNIV_BTR_DEBUG */
 
 	rec = page_get_supremum_rec(prev_page);
@@ -2988,7 +2934,7 @@ count_later:
 
 	/* Look at the next page */
 
-	next_page_no = btr_page_get_next(page);
+	uint32_t next_page_no = btr_page_get_next(page);
 
 	if (next_page_no == FIL_NULL) {
 
@@ -3005,11 +2951,14 @@ count_later:
 		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
 
 		next_page = buf_block_get_frame(block);
-		ut_ad(page_validate(next_page, ibuf->index));
+		ut_ad(page_validate(next_page, ibuf.index));
 	}
 
 #ifdef UNIV_BTR_DEBUG
-	ut_a(!memcmp(next_page + FIL_PAGE_PREV, page + FIL_PAGE_OFFSET, 4));
+	static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
+	static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+	ut_a(!memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
+				page + FIL_PAGE_OFFSET, 4));
 #endif /* UNIV_BTR_DEBUG */
 
 	rec = page_get_infimum_rec(next_page);
@@ -3051,14 +3000,14 @@ ibuf_update_max_tablespace_id(void)
 	btr_pcur_t	pcur;
 	mtr_t		mtr;
 
-	ut_a(!dict_table_is_comp(ibuf->index->table));
+	ut_a(!dict_table_is_comp(ibuf.index->table));
 
 	ibuf_mtr_start(&mtr);
 
 	btr_pcur_open_at_index_side(
-		false, ibuf->index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+		false, ibuf.index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
 
-	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
 
 	btr_pcur_move_to_prev(&pcur, &mtr);
 
@@ -3113,8 +3062,8 @@ ibuf_get_entry_counter_low_func(
 	ulint		len;
 
 	ut_ad(ibuf_inside(mtr));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(rec_get_n_fields_old(rec) > 2);
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
@@ -3188,8 +3137,8 @@ ibuf_get_entry_counter_func(
 					in the node pointer */
 {
 	ut_ad(ibuf_inside(mtr));
-	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
-	ut_ad(page_validate(page_align(rec), ibuf->index));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_validate(page_align(rec), ibuf.index));
 
 	if (page_rec_is_supremum(rec)) {
 		/* This is just for safety. The record should be a
@@ -3268,14 +3217,13 @@ ibuf_insert_low(
 	ulint		buffered;
 	lint		min_n_recs;
 	rec_t*		ins_rec;
-	ibool		old_bit_value;
-	page_t*		bitmap_page;
+	buf_block_t*	bitmap_page;
 	buf_block_t*	block;
 	page_t*		root;
 	dberr_t		err;
 	ibool		do_merge;
-	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
-	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	space_ids[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	page_nos[IBUF_MAX_N_PAGES_MERGED];
 	ulint		n_stored;
 	mtr_t		mtr;
 	mtr_t		bitmap_mtr;
@@ -3289,16 +3237,16 @@ ibuf_insert_low(
 
 	do_merge = FALSE;
 
-	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
-	reduce ibuf_mutex contention. Given that ibuf->max_size and
-	ibuf->size fit in a machine word, this should be OK; at worst
+	/* Perform dirty reads of ibuf.size and ibuf.max_size, to
+	reduce ibuf_mutex contention. Given that ibuf.max_size and
+	ibuf.size fit in a machine word, this should be OK; at worst
 	we are doing some excessive ibuf_contract() or occasionally
 	skipping an ibuf_contract(). */
-	if (ibuf->max_size == 0) {
+	if (ibuf.max_size == 0) {
 		return(DB_STRONG_FAIL);
 	}
 
-	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+	if (ibuf.size >= ibuf.max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
 		/* Insert buffer is now too big, contract it but do not try
 		to insert */
 
@@ -3306,7 +3254,7 @@ ibuf_insert_low(
 #ifdef UNIV_IBUF_DEBUG
 		fputs("Ibuf too big\n", stderr);
 #endif
-		ibuf_contract(true);
+		ibuf_contract();
 
 		return(DB_STRONG_FAIL);
 	}
@@ -3352,8 +3300,8 @@ ibuf_insert_low(
 
 	ibuf_mtr_start(&mtr);
 
-	btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
-	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+	btr_pcur_open(ibuf.index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
 
 	/* Find out the volume of already buffered inserts for the same index
 	page */
@@ -3368,7 +3316,7 @@ ibuf_insert_low(
 	const ulint physical_size = zip_size ? zip_size : srv_page_size;
 
 	if (op == IBUF_OP_DELETE
-	    && (min_n_recs < 2 || buf_pool_watch_occurred(page_id))) {
+	    && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) {
 		/* The page could become empty after the record is
 		deleted, or the page has been read in to the buffer
 		pool.  Refuse to buffer the operation. */
@@ -3400,30 +3348,34 @@ fail_exit:
 	buffer pool, but we do not have to care about it, since we are
 	holding a latch on the insert buffer leaf page that contains
 	buffered changes for (space, page_no).  If the page enters the
-	buffer pool, buf_page_io_complete() for (space, page_no) will
+	buffer pool, buf_page_read_complete() for (space, page_no) will
 	have to acquire a latch on the same insert buffer leaf page,
 	which it cannot do until we have buffered the IBUF_OP_DELETE
 	and done mtr_commit(&mtr) to release the latch. */
 
 	ibuf_mtr_start(&bitmap_mtr);
-	index->set_modified(bitmap_mtr);
 
 	bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr);
 
 	/* We check if the index page is suitable for buffered entries */
 
-	if (buf_page_peek(page_id)
-	    || lock_rec_expl_exist_on_page(page_id.space(),
-					   page_id.page_no())) {
-
+	if (buf_pool.page_hash_contains(page_id)) {
+commit_exit:
 		ibuf_mtr_commit(&bitmap_mtr);
 		goto fail_exit;
+	} else {
+		lock_mutex_enter();
+		const auto lock_exists = lock_sys.get_first(page_id);
+		lock_mutex_exit();
+		if (lock_exists) {
+			goto commit_exit;
+		}
 	}
 
 	if (op == IBUF_OP_INSERT) {
 		ulint	bits = ibuf_bitmap_page_get_bits(
-			bitmap_page, page_id, physical_size, IBUF_BITMAP_FREE,
-			&bitmap_mtr);
+			bitmap_page->frame, page_id, physical_size,
+			IBUF_BITMAP_FREE, &bitmap_mtr);
 
 		if (buffered + entry_size + page_dir_calc_reserved_space(1)
 		    > ibuf_index_page_calc_free_from_bits(physical_size,
@@ -3455,8 +3407,7 @@ fail_exit:
 		dfield_t*	field;
 
 		if (counter == ULINT_UNDEFINED) {
-			ibuf_mtr_commit(&bitmap_mtr);
-			goto fail_exit;
+			goto commit_exit;
 		}
 
 		field = dtuple_get_nth_field(
@@ -3468,17 +3419,9 @@ fail_exit:
 
 	/* Set the bitmap bit denoting that the insert buffer contains
 	buffered entries for this index page, if the bit is not set yet */
-
-	old_bit_value = ibuf_bitmap_page_get_bits(
-		bitmap_page, page_id, physical_size,
-		IBUF_BITMAP_BUFFERED, &bitmap_mtr);
-
-	if (!old_bit_value) {
-		ibuf_bitmap_page_set_bits(bitmap_page, page_id, physical_size,
-					  IBUF_BITMAP_BUFFERED, TRUE,
-					  &bitmap_mtr);
-	}
-
+	index->set_modified(bitmap_mtr);
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+		bitmap_page, page_id, physical_size, true, &bitmap_mtr);
 	ibuf_mtr_commit(&bitmap_mtr);
 
 	cursor = btr_pcur_get_btr_cur(&pcur);
@@ -3490,17 +3433,17 @@ fail_exit:
 			ibuf_entry, &ins_rec,
 			&dummy_big_rec, 0, thr, &mtr);
 		block = btr_cur_get_block(cursor);
-		ut_ad(block->page.id.space() == IBUF_SPACE_ID);
+		ut_ad(block->page.id().space() == IBUF_SPACE_ID);
 
-		/* If this is the root page, update ibuf->empty. */
-		if (block->page.id.page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) {
+		/* If this is the root page, update ibuf.empty. */
+		if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) {
 			const page_t*	root = buf_block_get_frame(block);
 
 			ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
 			ut_ad(page_get_page_no(root)
 			      == FSP_IBUF_TREE_ROOT_PAGE_NO);
 
-			ibuf->empty = page_is_empty(root);
+			ibuf.empty = page_is_empty(root);
 		}
 	} else {
 		ut_ad(BTR_LATCH_MODE_WITHOUT_INTENTION(mode)
@@ -3511,7 +3454,7 @@ fail_exit:
 		which would cause the sx-latching of the root after that to
 		break the latching order. */
 
-		root = ibuf_tree_root_get(&mtr);
+		root = ibuf_tree_root_get(&mtr)->frame;
 
 		err = btr_cur_optimistic_insert(
 			BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
@@ -3530,10 +3473,10 @@ fail_exit:
 		mutex_exit(&ibuf_pessimistic_insert_mutex);
 		ibuf_size_update(root);
 		mutex_exit(&ibuf_mutex);
-		ibuf->empty = page_is_empty(root);
+		ibuf.empty = page_is_empty(root);
 
 		block = btr_cur_get_block(cursor);
-		ut_ad(block->page.id.space() == IBUF_SPACE_ID);
+		ut_ad(block->page.id().space() == IBUF_SPACE_ID);
 	}
 
 	if (offsets_heap) {
@@ -3561,8 +3504,7 @@ func_exit:
 #ifdef UNIV_IBUF_DEBUG
 		ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
 #endif
-		buf_read_ibuf_merge_pages(false, space_ids,
-					  page_nos, n_stored);
+		ibuf_read_merge_pages(space_ids, page_nos, n_stored);
 	}
 
 	return(err);
@@ -3663,22 +3605,16 @@ check_watch:
 	would always trigger the buffer pool watch during purge and
 	thus prevent the buffering of delete operations.  We assume
 	that the issuer of IBUF_OP_DELETE has called
-	buf_pool_watch_set(space, page_no). */
-
-	{
-		buf_pool_t*	buf_pool = buf_pool_get(page_id);
-		buf_page_t*	bpage
-			= buf_page_get_also_watch(buf_pool, page_id);
-
-		if (bpage != NULL) {
-			/* A buffer pool watch has been set or the
-			page has been read into the buffer pool.
-			Do not buffer the request.  If a purge operation
-			is being buffered, have this request executed
-			directly on the page in the buffer pool after the
-			buffered entries for this page have been merged. */
-			DBUG_RETURN(false);
-		}
+	buf_pool_t::watch_set(). */
+
+	if (buf_pool.page_hash_contains<true>(page_id)) {
+		/* A buffer pool watch has been set or the
+		page has been read into the buffer pool.
+		Do not buffer the request.  If a purge operation
+		is being buffered, have this request executed
+		directly on the page in the buffer pool after the
+		buffered entries for this page have been merged. */
+		DBUG_RETURN(false);
 	}
 
 skip_watch:
@@ -3724,9 +3660,6 @@ ibuf_insert_to_index_page_low(
 	page_cur_t*	page_cur)/*!< in/out: cursor positioned on the record
 				after which to insert the buffered entry */
 {
-	const page_t*	page;
-	const page_t*	bitmap_page;
-	ulint		old_bits;
 	rec_t*		rec;
 	DBUG_ENTER("ibuf_insert_to_index_page_low");
 
@@ -3740,7 +3673,7 @@ ibuf_insert_to_index_page_low(
 	been attempted by page_cur_tuple_insert(). Besides, per
 	ibuf_index_page_calc_free_zip() the page should not have been
 	recompressed or reorganized. */
-	ut_ad(!buf_block_get_page_zip(block));
+	ut_ad(!is_buf_block_get_page_zip(block));
 
 	/* If the record did not fit, reorganize */
 
@@ -3754,11 +3687,10 @@ ibuf_insert_to_index_page_low(
 		DBUG_RETURN(rec);
 	}
 
-	page = buf_block_get_frame(block);
-
 	ib::error() << "Insert buffer insert fails; page free "
-		<< page_get_max_insert_size(page, 1) << ", dtuple size "
-		<< rec_get_converted_size(index, entry, 0);
+		    << page_get_max_insert_size(block->frame, 1)
+		    << ", dtuple size "
+		    << rec_get_converted_size(index, entry, 0);
 
 	fputs("InnoDB: Cannot insert index record ", stderr);
 	dtuple_print(stderr, entry);
@@ -3766,14 +3698,15 @@ ibuf_insert_to_index_page_low(
 	      "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
 	      "InnoDB: that table.\n", stderr);
 
-	bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
-					       block->zip_size(), mtr);
-	old_bits = ibuf_bitmap_page_get_bits(
-		bitmap_page, block->page.id, block->zip_size(),
-		IBUF_BITMAP_FREE, mtr);
+	if (buf_block_t *bitmap_page =  ibuf_bitmap_get_map_page(
+			block->page.id(), block->zip_size(), mtr)) {
 
-	ib::error() << "page " << block->page.id << ", size "
-		<< block->physical_size() << ", bitmap bits " << old_bits;
+		ib::error() << "page " << block->page.id() << ", size "
+			    << block->physical_size() << ", bitmap bits "
+			    << ibuf_bitmap_page_get_bits(bitmap_page->frame,
+					block->page.id(), block->zip_size(),
+					IBUF_BITMAP_FREE, mtr);
+	}
 
 	ib::error() << BUG_REPORT_MSG;
 
@@ -3804,8 +3737,8 @@ ibuf_insert_to_index_page(
 	DBUG_ENTER("ibuf_insert_to_index_page");
 
 	DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF,
-			    block->page.id.space(),
-			    block->page.id.page_no()));
+			    block->page.id().space(),
+			    block->page.id().page_no()));
 
 	ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index
 	ut_ad(ibuf_inside(mtr));
@@ -3817,7 +3750,7 @@ ibuf_insert_to_index_page(
 	ut_ad(!block->index);
 	assert_block_ahi_empty(block);
 #endif /* BTR_CUR_HASH_ADAPT */
-	ut_ad(mtr->is_named_space(block->page.id.space()));
+	ut_ad(mtr->is_named_space(block->page.id().space()));
 
 	if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
 			  != (ibool)!!page_is_comp(page))) {
@@ -3862,7 +3795,6 @@ dump:
 
 	if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) {
 		upd_t*		update;
-		page_zip_des_t*	page_zip;
 
 		rec = page_cur_get_rec(&page_cur);
 
@@ -3875,20 +3807,18 @@ dump:
 		update = row_upd_build_sec_rec_difference_binary(
 			rec, index, offsets, entry, heap);
 
-		page_zip = buf_block_get_page_zip(block);
-
 		if (update->n_fields == 0) {
 			/* The records only differ in the delete-mark.
 			Clear the delete-mark, like we did before
 			Bug #56680 was fixed. */
-			btr_cur_set_deleted_flag_for_ibuf(
-				rec, page_zip, FALSE, mtr);
+			btr_rec_set_deleted<false>(block, rec, mtr);
 			goto updated_in_place;
 		}
 
 		/* Copy the info bits. Clear the delete-mark. */
 		update->info_bits = rec_get_info_bits(rec, page_is_comp(page));
-		update->info_bits &= ~REC_INFO_DELETED_FLAG;
+		update->info_bits &= byte(~REC_INFO_DELETED_FLAG);
+		page_zip_des_t* page_zip = buf_block_get_page_zip(block);
 
 		/* We cannot invoke btr_cur_optimistic_update() here,
 		because we do not have a btr_cur_t or que_thr_t,
@@ -3901,15 +3831,8 @@ dump:
 			/* This is the easy case. Do something similar
 			to btr_cur_update_in_place(). */
 			rec = page_cur_get_rec(&page_cur);
-			row_upd_rec_in_place(rec, index, offsets,
-					     update, page_zip);
-
-			/* Log the update in place operation. During recovery
-			MLOG_COMP_REC_UPDATE_IN_PLACE/MLOG_REC_UPDATE_IN_PLACE
-			expects trx_id, roll_ptr for secondary indexes. So we
-			just write dummy trx_id(0), roll_ptr(0) */
-			btr_cur_update_in_place_log(BTR_KEEP_SYS_FLAG, rec,
-						    index, update, 0, 0, mtr);
+			btr_cur_upd_rec_in_place(rec, index, offsets,
+						 update, block, mtr);
 
 			DBUG_EXECUTE_IF(
 				"crash_after_log_ibuf_upd_inplace",
@@ -3986,11 +3909,7 @@ ibuf_set_del_mark(
 	low_match = page_cur_search(block, index, entry, &page_cur);
 
 	if (low_match == dtuple_get_n_fields(entry)) {
-		rec_t*		rec;
-		page_zip_des_t*	page_zip;
-
-		rec = page_cur_get_rec(&page_cur);
-		page_zip = page_cur_get_page_zip(&page_cur);
+		rec_t* rec = page_cur_get_rec(&page_cur);
 
 		/* Delete mark the old index record. According to a
 		comment in row_upd_sec_index_entry(), it can already
@@ -4001,8 +3920,7 @@ ibuf_set_del_mark(
 		if (UNIV_LIKELY
 		    (!rec_get_deleted_flag(
 			    rec, dict_table_is_comp(index->table)))) {
-			btr_cur_set_deleted_flag_for_ibuf(rec, page_zip,
-							  TRUE, mtr);
+			btr_rec_set_deleted<true>(block, rec, mtr);
 		}
 	} else {
 		const page_t*		page
@@ -4017,7 +3935,7 @@ ibuf_set_del_mark(
 		      "InnoDB: record ", stderr);
 		rec_print(stderr, page_cur_get_rec(&page_cur), index);
 
-		ib::error() << "page " << block->page.id << " ("
+		ib::error() << "page " << block->page.id() << " ("
 			<< page_get_n_recs(page) << " records, index id "
 			<< btr_page_get_index_id(page) << ").";
 
@@ -4081,8 +3999,8 @@ ibuf_delete(
 				" (%u records, index id %llu)\n"
 				"InnoDB: Submit a detailed bug report"
 				" to https://jira.mariadb.org/\n",
-				block->page.id.space(),
-				block->page.id.page_no(),
+				block->page.id().space(),
+				block->page.id().page_no(),
 				(unsigned) page_get_n_recs(page),
 				(ulonglong) btr_page_get_index_id(page));
 
@@ -4121,14 +4039,12 @@ ibuf_delete(
 
 /*********************************************************************//**
 Restores insert buffer tree cursor position
-@return TRUE if the position was restored; FALSE if not */
+@return whether the position was restored */
 static MY_ATTRIBUTE((nonnull))
-ibool
+bool
 ibuf_restore_pos(
 /*=============*/
-	ulint		space,	/*!< in: space id */
-	ulint		page_no,/*!< in: index page number where the record
-				should belong */
+	const page_id_t	page_id,/*!< in: page identifier */
 	const dtuple_t*	search_tuple,
 				/*!< in: search tuple for entries of page_no */
 	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
@@ -4139,16 +4055,15 @@ ibuf_restore_pos(
 	ut_ad(mode == BTR_MODIFY_LEAF
 	      || BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE);
 
-	if (btr_pcur_restore_position(mode, pcur, mtr) ==
-	    btr_pcur_t::SAME_ALL) {
-
-		return(TRUE);
+	if (UNIV_LIKELY(btr_pcur_restore_position(mode, pcur, mtr) ==
+	      btr_pcur_t::SAME_ALL)) {
+		return true;
 	}
 
-	if (fil_space_t* s = fil_space_acquire_silent(space)) {
+	if (fil_space_t* s = fil_space_t::get(page_id.space())) {
 		ib::error() << "ibuf cursor restoration fails!"
 			" ibuf record inserted to page "
-			<< space << ":" << page_no
+			<< page_id
 			<< " in file " << s->chain.start->name;
 		s->release();
 
@@ -4163,26 +4078,19 @@ ibuf_restore_pos(
 	}
 
 	ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
-	return(FALSE);
+	return false;
 }
 
-/*********************************************************************//**
-Deletes from ibuf the record on which pcur is positioned. If we have to
-resort to a pessimistic delete, this function commits mtr and closes
-the cursor.
-@return TRUE if mtr was committed and pcur closed in this operation */
-static MY_ATTRIBUTE((warn_unused_result))
-ibool
-ibuf_delete_rec(
-/*============*/
-	ulint		space,	/*!< in: space id */
-	ulint		page_no,/*!< in: index page number that the record
-				should belong to */
-	btr_pcur_t*	pcur,	/*!< in: pcur positioned on the record to
-				delete, having latch mode BTR_MODIFY_LEAF */
-	const dtuple_t*	search_tuple,
-				/*!< in: search tuple for entries of page_no */
-	mtr_t*		mtr)	/*!< in: mtr */
+/**
+Delete a change buffer record.
+@param[in]	page_id		page identifier
+@param[in,out]	pcur		persistent cursor positioned on the record
+@param[in]	search_tuple	search key for (space,page_no)
+@param[in,out]	mtr		mini-transaction
+@return whether mtr was committed (due to pessimistic operation) */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
+		     const dtuple_t* search_tuple, mtr_t* mtr)
 {
 	ibool		success;
 	page_t*		root;
@@ -4190,14 +4098,14 @@ ibuf_delete_rec(
 
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
-	ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no);
-	ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space);
+	ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur))
+	      == page_id.page_no());
+	ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur))
+	      == page_id.space());
 
 	success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
 					    0, mtr);
 
-	const page_id_t	page_id(space, page_no);
-
 	if (success) {
 		if (page_is_empty(btr_pcur_get_page(pcur))) {
 			/* If a B-tree page is empty, it must be the root page
@@ -4209,25 +4117,21 @@ ibuf_delete_rec(
 			ut_ad(page_get_page_no(root)
 			      == FSP_IBUF_TREE_ROOT_PAGE_NO);
 
-			/* ibuf->empty is protected by the root page latch.
+			/* ibuf.empty is protected by the root page latch.
 			Before the deletion, it had to be FALSE. */
-			ut_ad(!ibuf->empty);
-			ibuf->empty = true;
+			ut_ad(!ibuf.empty);
+			ibuf.empty = true;
 		}
 
 		return(FALSE);
 	}
 
-	ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
-	ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no);
-	ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space);
-
 	/* We have to resort to a pessimistic delete from ibuf.
 	Delete-mark the record so that it will not be applied again,
 	in case the server crashes before the pessimistic delete is
 	made persistent. */
-	btr_cur_set_deleted_flag_for_ibuf(
-		btr_pcur_get_rec(pcur), NULL, TRUE, mtr);
+	btr_rec_set_deleted<true>(btr_pcur_get_block(pcur),
+				  btr_pcur_get_rec(pcur), mtr);
 
 	btr_pcur_store_position(pcur, mtr);
 	ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
@@ -4235,7 +4139,7 @@ ibuf_delete_rec(
 	ibuf_mtr_start(mtr);
 	mutex_enter(&ibuf_mutex);
 
-	if (!ibuf_restore_pos(space, page_no, search_tuple,
+	if (!ibuf_restore_pos(page_id, search_tuple,
 			      BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
 			      pcur, mtr)) {
 
@@ -4244,7 +4148,7 @@ ibuf_delete_rec(
 		goto func_exit;
 	}
 
-	root = ibuf_tree_root_get(mtr);
+	root = ibuf_tree_root_get(mtr)->frame;
 
 	btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0,
 				   false, mtr);
@@ -4253,7 +4157,7 @@ ibuf_delete_rec(
 	ibuf_size_update(root);
 	mutex_exit(&ibuf_mutex);
 
-	ibuf->empty = page_is_empty(root);
+	ibuf.empty = page_is_empty(root);
 	ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
 
 func_exit:
@@ -4263,69 +4167,56 @@ func_exit:
 	return(TRUE);
 }
 
-/**
-Delete any buffered entries for a page.
-This prevents an infinite loop on slow shutdown
-in the case where the change buffer bitmap claims that no buffered
-changes exist, while entries exist in the change buffer tree.
-@param page_id  page number for which there should be no unbuffered changes */
-ATTRIBUTE_COLD void ibuf_delete_recs(const page_id_t page_id)
+/** Check whether buffered changes exist for a page.
+@param[in]	id		page identifier
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return whether buffered changes exist */
+bool ibuf_page_exists(const page_id_t id, ulint zip_size)
 {
-	ulint dops[IBUF_OP_COUNT];
-	mtr_t mtr;
-	btr_pcur_t pcur;
-	mem_heap_t* heap = mem_heap_create(512);
-	const dtuple_t* tuple = ibuf_search_tuple_build(
-		page_id.space(), page_id.page_no(), heap);
-	memset(dops, 0, sizeof(dops));
+	ut_ad(!fsp_is_system_temporary(id.space()));
 
-loop:
-	ibuf_mtr_start(&mtr);
-	btr_pcur_open(ibuf->index, tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
-		      &pcur, &mtr);
+	const ulint physical_size = zip_size ? zip_size : srv_page_size;
 
-	if (!btr_pcur_is_on_user_rec(&pcur)) {
-		ut_ad(btr_pcur_is_after_last_in_tree(&pcur));
-		goto func_exit;
+	if (ibuf_fixed_addr_page(id, physical_size)
+	    || fsp_descr_page(id, physical_size)) {
+		return false;
 	}
 
-	for (;;) {
-		ut_ad(btr_pcur_is_on_user_rec(&pcur));
-
-		const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur);
-
-		if (ibuf_rec_get_space(&mtr, ibuf_rec)
-		    != page_id.space()
-		    || ibuf_rec_get_page_no(&mtr, ibuf_rec)
-		    != page_id.page_no()) {
-			break;
-		}
-
-		dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
-
-		/* Delete the record from ibuf */
-		if (ibuf_delete_rec(page_id.space(), page_id.page_no(),
-				    &pcur, tuple, &mtr)) {
-			/* Deletion was pessimistic and mtr was committed:
-			we start from the beginning again */
-			ut_ad(mtr.has_committed());
-			goto loop;
-		}
+	mtr_t mtr;
+	bool bitmap_bits = false;
 
-		if (btr_pcur_is_after_last_on_page(&pcur)) {
-			ibuf_mtr_commit(&mtr);
-			btr_pcur_close(&pcur);
-			goto loop;
-		}
+	ibuf_mtr_start(&mtr);
+	if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+		    id, zip_size, &mtr)) {
+		bitmap_bits = ibuf_bitmap_page_get_bits(
+			bitmap_page->frame, id, zip_size,
+			IBUF_BITMAP_BUFFERED, &mtr) != 0;
 	}
-
-func_exit:
 	ibuf_mtr_commit(&mtr);
-	btr_pcur_close(&pcur);
-
-	ibuf_add_ops(ibuf->n_discarded_ops, dops);
+	return bitmap_bits;
+}
 
-	mem_heap_free(heap);
+/** Reset the bits in the bitmap page for the given block and page id.
+@param b        X-latched secondary index page (nullptr to discard changes)
+@param page_id  page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param mtr      mini-transaction */
+static void ibuf_reset_bitmap(buf_block_t *b, page_id_t page_id,
+                              ulint zip_size, mtr_t *mtr)
+{
+ buf_block_t *bitmap= ibuf_bitmap_get_map_page(page_id, zip_size, mtr);
+ if (!bitmap)
+   return;
+
+ const ulint physical_size = zip_size ? zip_size : srv_page_size;
+ /* FIXME: update the bitmap byte only once! */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(bitmap, page_id,
+                                                 physical_size, false, mtr);
+
+ if (b)
+   ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(bitmap, page_id, physical_size,
+                                               ibuf_index_page_calc_free(b),
+                                               mtr);
 }
 
 /** When an index page is read from a disk to the buffer pool, this function
@@ -4340,11 +4231,14 @@ subsequently was dropped.
 void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
                                    ulint zip_size)
 {
+	if (trx_sys_hdr_page(page_id)) {
+		return;
+	}
+
 	btr_pcur_t	pcur;
 #ifdef UNIV_IBUF_DEBUG
 	ulint		volume			= 0;
 #endif /* UNIV_IBUF_DEBUG */
-	page_zip_des_t*	page_zip		= NULL;
 	bool		corruption_noticed	= false;
 	mtr_t		mtr;
 
@@ -4352,15 +4246,11 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
 	ulint		mops[IBUF_OP_COUNT];
 	ulint		dops[IBUF_OP_COUNT];
 
-	ut_ad(block == NULL || page_id == block->page.id);
-	ut_ad(block == NULL || buf_block_get_io_fix(block) == BUF_IO_READ
-	      || recv_recovery_is_on());
-
-	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE
-	    || trx_sys_hdr_page(page_id)
-	    || fsp_is_system_temporary(page_id.space())) {
-		return;
-	}
+	ut_ad(!block || page_id == block->page.id());
+	ut_ad(!block || block->page.state() == BUF_BLOCK_FILE_PAGE);
+	ut_ad(!block || block->page.status == buf_page_t::NORMAL);
+	ut_ad(!trx_sys_hdr_page(page_id));
+	ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
 
 	const ulint physical_size = zip_size ? zip_size : srv_page_size;
 
@@ -4369,7 +4259,7 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
 		return;
 	}
 
-	fil_space_t* space = fil_space_acquire_silent(page_id.space());
+	fil_space_t* space = fil_space_t::get(page_id.space());
 
 	if (UNIV_UNLIKELY(!space)) {
 		block = NULL;
@@ -4378,32 +4268,31 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
 
 		ibuf_mtr_start(&mtr);
 
-		page_t* bitmap_page = ibuf_bitmap_get_map_page(
+		buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
 			page_id, zip_size, &mtr);
 
-		if (bitmap_page &&
-		    fil_page_get_type(bitmap_page) != FIL_PAGE_TYPE_ALLOCATED) {
+		if (bitmap_page
+		    && fil_page_get_type(bitmap_page->frame)
+		    != FIL_PAGE_TYPE_ALLOCATED) {
 			bitmap_bits = ibuf_bitmap_page_get_bits(
-				bitmap_page, page_id, zip_size,
+				bitmap_page->frame, page_id, zip_size,
 				IBUF_BITMAP_BUFFERED, &mtr);
 		}
 
 		ibuf_mtr_commit(&mtr);
 
+		if (bitmap_bits && fseg_page_is_free(
+				space, page_id.page_no())) {
+			ibuf_mtr_start(&mtr);
+			mtr.set_named_space(space);
+			ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
+			ibuf_mtr_commit(&mtr);
+			bitmap_bits = 0;
+		}
+
 		if (!bitmap_bits) {
 			/* No changes are buffered for this page. */
 			space->release();
-			if (UNIV_UNLIKELY(srv_shutdown_state)
-			    && !srv_fast_shutdown
-			    && (!block
-				|| btr_page_get_index_id(block->frame)
-				!= DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
-				/* Prevent an infinite loop on slow
-				shutdown, in case the bitmap bits are
-				wrongly clear even though buffered
-				changes exist. */
-				ibuf_delete_recs(page_id);
-			}
 			return;
 		}
 	}
@@ -4420,7 +4309,6 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
 		the debug checks. */
 
 		rw_lock_x_lock_move_ownership(&(block->lock));
-		page_zip = buf_block_get_page_zip(block);
 
 		if (!fil_page_index_page_check(block->frame)
 		    || !page_is_leaf(block->frame)) {
@@ -4449,16 +4337,15 @@ loop:
 	/* Position pcur in the insert buffer at the first entry for this
 	index page */
 	btr_pcur_open_on_user_rec(
-		ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+		ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
 		&pcur, &mtr);
 
 	if (block) {
-		ibool success = buf_page_get_known_nowait(
-			RW_X_LATCH, block,
-			BUF_KEEP_OLD, __FILE__, __LINE__, &mtr);
-
-		ut_a(success);
+		ut_ad(rw_lock_own(&block->lock, RW_LOCK_X));
+		buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+		rw_lock_x_lock(&block->lock);
 
+		mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
 		/* This is a user page (secondary index leaf page),
 		but we pretend that it is a change buffer page in
 		order to obey the latching order. This should be OK,
@@ -4489,8 +4376,7 @@ loop:
 		    || ibuf_rec_get_space(&mtr, rec) != page_id.space()) {
 
 			if (block != NULL) {
-				page_header_reset_last_insert(
-					block->frame, page_zip, &mtr);
+				page_header_reset_last_insert(block, &mtr);
 			}
 
 			goto reset_bit;
@@ -4512,10 +4398,11 @@ loop:
 			ibuf_op_t	op = ibuf_rec_get_op_type(&mtr, rec);
 
 			max_trx_id = page_get_max_trx_id(page_align(rec));
-			page_update_max_trx_id(block, page_zip, max_trx_id,
-					       &mtr);
+			page_update_max_trx_id(block,
+					       buf_block_get_page_zip(block),
+					       max_trx_id, &mtr);
 
-			ut_ad(page_validate(page_align(rec), ibuf->index));
+			ut_ad(page_validate(page_align(rec), ibuf.index));
 
 			entry = ibuf_build_entry_from_ibuf_rec(
 				&mtr, rec, heap, &dummy_index);
@@ -4526,7 +4413,6 @@ loop:
 			ut_ad(page_validate(block->frame, dummy_index));
 
 			switch (op) {
-				ibool	success;
 			case IBUF_OP_INSERT:
 #ifdef UNIV_IBUF_DEBUG
 				volume += rec_get_converted_size(
@@ -4564,10 +4450,9 @@ loop:
 				the server crashes between the following
 				mtr_commit() and the subsequent mtr_commit()
 				of deleting the change buffer record. */
-
-				btr_cur_set_deleted_flag_for_ibuf(
-					btr_pcur_get_rec(&pcur), NULL,
-					TRUE, &mtr);
+				btr_rec_set_deleted<true>(
+					btr_pcur_get_block(&pcur),
+					btr_pcur_get_rec(&pcur), &mtr);
 
 				btr_pcur_store_position(&pcur, &mtr);
 				ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);
@@ -4575,11 +4460,11 @@ loop:
 				ibuf_mtr_start(&mtr);
 				mtr.set_named_space(space);
 
-				success = buf_page_get_known_nowait(
-					RW_X_LATCH, block,
-					BUF_KEEP_OLD,
-					__FILE__, __LINE__, &mtr);
-				ut_a(success);
+				ut_ad(rw_lock_own(&block->lock, RW_LOCK_X));
+				buf_block_buf_fix_inc(block,
+						      __FILE__, __LINE__);
+				rw_lock_x_lock(&block->lock);
+				mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
 
 				/* This is a user page (secondary
 				index leaf page), but it should be OK
@@ -4588,9 +4473,7 @@ loop:
 				buf_block_dbg_add_level(
 					block, SYNC_IBUF_TREE_NODE);
 
-				if (!ibuf_restore_pos(page_id.space(),
-						      page_id.page_no(),
-						      search_tuple,
+				if (!ibuf_restore_pos(page_id, search_tuple,
 						      BTR_MODIFY_LEAF,
 						      &pcur, &mtr)) {
 
@@ -4613,8 +4496,7 @@ loop:
 		}
 
 		/* Delete the record from ibuf */
-		if (ibuf_delete_rec(page_id.space(), page_id.page_no(),
-				    &pcur, search_tuple, &mtr)) {
+		if (ibuf_delete_rec(page_id, &pcur, search_tuple, &mtr)) {
 			/* Deletion was pessimistic and mtr was committed:
 			we start from the beginning again */
 
@@ -4630,28 +4512,7 @@ loop:
 
 reset_bit:
 	if (space) {
-		page_t*	bitmap_page;
-
-		bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size,
-						       &mtr);
-
-		ibuf_bitmap_page_set_bits(
-			bitmap_page, page_id, physical_size,
-			IBUF_BITMAP_BUFFERED, FALSE, &mtr);
-
-		if (block != NULL) {
-			ulint old_bits = ibuf_bitmap_page_get_bits(
-				bitmap_page, page_id, zip_size,
-				IBUF_BITMAP_FREE, &mtr);
-
-			ulint new_bits = ibuf_index_page_calc_free(block);
-
-			if (old_bits != new_bits) {
-				ibuf_bitmap_page_set_bits(
-					bitmap_page, page_id, physical_size,
-					IBUF_BITMAP_FREE, new_bits, &mtr);
-			}
-		}
+		ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
 	}
 
 	ibuf_mtr_commit(&mtr);
@@ -4663,9 +4524,9 @@ reset_bit:
 	btr_pcur_close(&pcur);
 	mem_heap_free(heap);
 
-	ibuf->n_merges++;
-	ibuf_add_ops(ibuf->n_merged_ops, mops);
-	ibuf_add_ops(ibuf->n_discarded_ops, dops);
+	ibuf.n_merges++;
+	ibuf_add_ops(ibuf.n_merged_ops, mops);
+	ibuf_add_ops(ibuf.n_discarded_ops, dops);
 }
 
 /** Delete all change buffer entries for a tablespace,
@@ -4677,7 +4538,6 @@ void ibuf_delete_for_discarded_space(ulint space)
 	btr_pcur_t	pcur;
 	dtuple_t*	search_tuple;
 	const rec_t*	ibuf_rec;
-	ulint		page_no;
 	mtr_t		mtr;
 
 	/* Counts for discarded operations. */
@@ -4697,7 +4557,7 @@ loop:
 	/* Position pcur in the insert buffer at the first entry for the
 	space */
 	btr_pcur_open_on_user_rec(
-		ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+		ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
 		&pcur, &mtr);
 
 	if (!btr_pcur_is_on_user_rec(&pcur)) {
@@ -4716,13 +4576,13 @@ loop:
 			goto leave_loop;
 		}
 
-		page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
+		uint32_t page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
 
 		dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
 
 		/* Delete the record from ibuf */
-		if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
-				    &mtr)) {
+		if (ibuf_delete_rec(page_id_t(space, page_no),
+				    &pcur, search_tuple, &mtr)) {
 			/* Deletion was pessimistic and mtr was committed:
 			we start from the beginning again */
 
@@ -4742,7 +4602,7 @@ leave_loop:
 	ibuf_mtr_commit(&mtr);
 	btr_pcur_close(&pcur);
 
-	ibuf_add_ops(ibuf->n_discarded_ops, dops);
+	ibuf_add_ops(ibuf.n_discarded_ops, dops);
 
 	mem_heap_free(heap);
 }
@@ -4754,18 +4614,15 @@ bool
 ibuf_is_empty(void)
 /*===============*/
 {
-	bool		is_empty;
-	const page_t*	root;
 	mtr_t		mtr;
 
 	ibuf_mtr_start(&mtr);
 
-	mutex_enter(&ibuf_mutex);
-	root = ibuf_tree_root_get(&mtr);
-	mutex_exit(&ibuf_mutex);
-
-	is_empty = page_is_empty(root);
-	ut_a(is_empty == ibuf->empty);
+	ut_d(mutex_enter(&ibuf_mutex));
+	const buf_block_t* root = ibuf_tree_root_get(&mtr);
+	bool is_empty = page_is_empty(root->frame);
+	ut_a(is_empty == ibuf.empty);
+	ut_d(mutex_exit(&ibuf_mutex));
 	ibuf_mtr_commit(&mtr);
 
 	return(is_empty);
@@ -4783,16 +4640,16 @@ ibuf_print(
 	fprintf(file,
 		"Ibuf: size " ULINTPF ", free list len " ULINTPF ","
 		" seg size " ULINTPF ", " ULINTPF " merges\n",
-		ibuf->size,
-		ibuf->free_list_len,
-		ibuf->seg_size,
-		ulint{ibuf->n_merges});
+		ibuf.size,
+		ibuf.free_list_len,
+		ibuf.seg_size,
+		ulint{ibuf.n_merges});
 
 	fputs("merged operations:\n ", file);
-	ibuf_print_ops(ibuf->n_merged_ops, file);
+	ibuf_print_ops(ibuf.n_merged_ops, file);
 
 	fputs("discarded operations:\n ", file);
-	ibuf_print_ops(ibuf->n_discarded_ops, file);
+	ibuf_print_ops(ibuf.n_discarded_ops, file);
 
 	mutex_exit(&ibuf_mutex);
 }
@@ -4803,37 +4660,20 @@ ibuf_print(
 @return DB_SUCCESS or error code */
 dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
 {
-	ulint	page_no;
 	ut_ad(trx->mysql_thd);
 	ut_ad(space->purpose == FIL_TYPE_IMPORT);
 
-	const ulint zip_size = space->zip_size();
-	const ulint physical_size = space->physical_size();
-	/* fil_space_t::size and fil_space_t::free_limit would still be 0
-	at this point. So, we will have to read page 0. */
-	ut_ad(!space->free_limit);
-	ut_ad(!space->size);
+	const unsigned zip_size = space->zip_size();
+	const unsigned physical_size = space->physical_size();
 
-	mtr_t	mtr;
-	ulint	size;
-	mtr.start();
-	if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0),
-					   zip_size,
-					   RW_S_LATCH, &mtr)) {
-		size = std::min(
-			mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
-					 + sp->frame),
-			mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
-					 + sp->frame));
-	} else {
-		size = 0;
-	}
-	mtr.commit();
+	uint32_t size= std::min(space->free_limit, space->size);
 
 	if (size == 0) {
 		return(DB_TABLE_NOT_FOUND);
 	}
 
+	mtr_t mtr;
+
 	mutex_enter(&ibuf_mutex);
 
 	/* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat
@@ -4842,10 +4682,7 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
 	below page_no is measured in number of pages since the beginning of
 	the space, as usual. */
 
-	for (page_no = 0; page_no < size; page_no += physical_size) {
-		page_t*	bitmap_page;
-		ulint	i;
-
+	for (uint32_t page_no = 0; page_no < size; page_no += physical_size) {
 		if (trx_is_interrupted(trx)) {
 			mutex_exit(&ibuf_mutex);
 			return(DB_INTERRUPTED);
@@ -4857,23 +4694,21 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
 
 		ibuf_enter(&mtr);
 
-		bitmap_page = ibuf_bitmap_get_map_page(
+		buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
 			page_id_t(space->id, page_no), zip_size, &mtr);
-
 		if (!bitmap_page) {
 			mutex_exit(&ibuf_mutex);
-			ibuf_exit(&mtr);
-			mtr_commit(&mtr);
+			mtr.commit();
 			return DB_CORRUPTION;
 		}
 
-		if (buf_is_zeroes(span<const byte>(bitmap_page,
+		if (buf_is_zeroes(span<const byte>(bitmap_page->frame,
 						   physical_size))) {
 			/* This means we got all-zero page instead of
 			ibuf bitmap page. The subsequent page should be
 			all-zero pages. */
 #ifdef UNIV_DEBUG
-			for (ulint curr_page = page_no + 1;
+			for (uint32_t curr_page = page_no + 1;
 			     curr_page < physical_size; curr_page++) {
 
 				buf_block_t* block = buf_page_get(
@@ -4890,12 +4725,13 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
 			continue;
 		}
 
-		for (i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size; i++) {
-			const ulint	offset = page_no + i;
+		for (uint32_t i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size;
+		     i++) {
+			const uint32_t offset = page_no + i;
 			const page_id_t	cur_page_id(space->id, offset);
 
 			if (ibuf_bitmap_page_get_bits(
-				    bitmap_page, cur_page_id, zip_size,
+				    bitmap_page->frame, cur_page_id, zip_size,
 				    IBUF_BITMAP_IBUF, &mtr)) {
 
 				mutex_exit(&ibuf_mutex);
@@ -4905,7 +4741,7 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
 				ib_errf(trx->mysql_thd,
 					IB_LOG_LEVEL_ERROR,
 					 ER_INNODB_INDEX_CORRUPT,
-					 "File %s page " ULINTPF
+					 "File %s page %u"
 					 " is wrongly flagged to belong to the"
 					 " insert buffer",
 					space->chain.start->name, offset);
@@ -4913,24 +4749,22 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
 			}
 
 			if (ibuf_bitmap_page_get_bits(
-				    bitmap_page, cur_page_id, zip_size,
+				    bitmap_page->frame, cur_page_id, zip_size,
 				    IBUF_BITMAP_BUFFERED, &mtr)) {
 
 				ib_errf(trx->mysql_thd,
 					IB_LOG_LEVEL_WARN,
 					ER_INNODB_INDEX_CORRUPT,
 					"Buffered changes"
-					" for file %s page " ULINTPF
-					" are lost",
+					" for file %s page %u are lost",
 					space->chain.start->name, offset);
 
 				/* Tolerate this error, so that
 				slightly corrupted tables can be
 				imported and dumped.  Clear the bit. */
-				ibuf_bitmap_page_set_bits(
+				ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
 					bitmap_page, cur_page_id,
-					physical_size,
-					IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+					physical_size, false, &mtr);
 			}
 		}
 
@@ -4950,7 +4784,6 @@ ibuf_set_bitmap_for_bulk_load(
 	buf_block_t*	block,
 	bool		reset)
 {
-	page_t*	bitmap_page;
 	mtr_t	mtr;
 	ulint	free_val;
 
@@ -4958,20 +4791,22 @@ ibuf_set_bitmap_for_bulk_load(
 
 	free_val = ibuf_index_page_calc_free(block);
 
-	mtr_start(&mtr);
-	fil_space_t* space = mtr.set_named_space_id(block->page.id.space());
+	mtr.start();
+	fil_space_t* space = mtr.set_named_space_id(block->page.id().space());
 
-	bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
-                                               space->zip_size(), &mtr);
+	buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id(),
+							    space->zip_size(),
+							    &mtr);
 
 	free_val = reset ? 0 : ibuf_index_page_calc_free(block);
-	ibuf_bitmap_page_set_bits(
-		bitmap_page, block->page.id, block->physical_size(),
-		IBUF_BITMAP_FREE, free_val, &mtr);
+	/* FIXME: update the bitmap byte only once! */
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+		bitmap_page, block->page.id(), block->physical_size(),
+		free_val, &mtr);
 
-	ibuf_bitmap_page_set_bits(
-		bitmap_page, block->page.id, block->physical_size(),
-		IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+		bitmap_page, block->page.id(), block->physical_size(),
+		false, &mtr);
 
-	mtr_commit(&mtr);
+	mtr.commit();
 }
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index ed64a92688a..c0dcc6f39d3 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -218,37 +218,55 @@ btr_height_get(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 	MY_ATTRIBUTE((warn_unused_result));
 
-/** Gets a buffer page and declares its latching order level.
-@param[in]	page_id	page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+/** Get an index page and declare its latching order level.
+@param[in]	index	index tree
+@param[in]	page	page number
 @param[in]	mode	latch mode
+@param[in]	merge	whether change buffer merge should be attempted
 @param[in]	file	file name
 @param[in]	line	line where called
-@param[in]	index	index tree, may be NULL if it is not an insert buffer
-tree
 @param[in,out]	mtr	mini-transaction
 @return block */
-UNIV_INLINE
-buf_block_t*
-btr_block_get_func(
-	const page_id_t		page_id,
-	ulint			zip_size,
-	ulint			mode,
-	const char*		file,
-	unsigned		line,
-	dict_index_t*		index,
-	mtr_t*			mtr);
+inline buf_block_t* btr_block_get_func(const dict_index_t& index,
+				       uint32_t page, ulint mode, bool merge,
+				       const char* file, unsigned line,
+				       mtr_t* mtr)
+{
+	dberr_t err;
+
+	if (buf_block_t* block = buf_page_get_gen(
+		    page_id_t(index.table->space->id, page),
+		    index.table->space->zip_size(), mode, NULL, BUF_GET,
+		    file, line, mtr, &err, merge && !index.is_clust())) {
+		ut_ad(err == DB_SUCCESS);
+		if (mode != RW_NO_LATCH) {
+			buf_block_dbg_add_level(block, index.is_ibuf()
+						? SYNC_IBUF_TREE_NODE
+						: SYNC_TREE_NODE);
+		}
+		return block;
+	} else {
+		ut_ad(err != DB_SUCCESS);
+
+		if (err == DB_DECRYPTION_FAILED) {
+			if (index.table) {
+				index.table->file_unreadable = true;
+			}
+		}
+
+		return NULL;
+	}
+}
 
 /** Gets a buffer page and declares its latching order level.
-@param page_id tablespace/page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param index index tree
+@param page page number
 @param mode latch mode
-@param index index tree, may be NULL if not the insert buffer tree
+@param merge whether change buffer merge should be attempted
 @param mtr mini-transaction handle
 @return the block descriptor */
-# define btr_block_get(page_id, zip_size, mode, index, mtr)	\
-	btr_block_get_func(page_id, zip_size, mode,		\
-		__FILE__, __LINE__, (dict_index_t*)index, mtr)
+# define btr_block_get(index, page, mode, merge, mtr)		\
+	btr_block_get_func(index, page, mode, merge, __FILE__, __LINE__, mtr)
 /**************************************************************//**
 Gets the index id field of a page.
 @return index id */
@@ -258,23 +276,16 @@ btr_page_get_index_id(
 /*==================*/
 	const page_t*	page)	/*!< in: index page */
 	MY_ATTRIBUTE((warn_unused_result));
-/********************************************************//**
-Gets the node level field in an index page.
-@param[in]	page	index page
-@return level, leaf level == 0 */
-UNIV_INLINE
-ulint
-btr_page_get_level(const page_t* page)
+/** Read the B-tree or R-tree PAGE_LEVEL.
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+inline uint16_t btr_page_get_level(const page_t *page)
 {
-	ulint	level;
-
-	ut_ad(page);
-
-	level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
-
-	ut_ad(level <= BTR_MAX_NODE_LEVEL);
-
-	return(level);
+  uint16_t level= mach_read_from_2(my_assume_aligned<2>
+                                   (PAGE_HEADER + PAGE_LEVEL + page));
+  ut_ad(level <= BTR_MAX_NODE_LEVEL);
+  return level;
 } MY_ATTRIBUTE((warn_unused_result))
 
 /** Read FIL_PAGE_NEXT.
@@ -282,7 +293,7 @@ btr_page_get_level(const page_t* page)
 @return previous page number */
 inline uint32_t btr_page_get_next(const page_t* page)
 {
-  return mach_read_from_4(page + FIL_PAGE_NEXT);
+  return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
 }
 
 /** Read FIL_PAGE_PREV.
@@ -290,7 +301,7 @@ inline uint32_t btr_page_get_next(const page_t* page)
 @return previous page number */
 inline uint32_t btr_page_get_prev(const page_t* page)
 {
-  return mach_read_from_4(page + FIL_PAGE_PREV);
+  return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
 }
 
 /**************************************************************//**
@@ -312,7 +323,7 @@ the child page number. In other words offsets must have been retrieved
 with rec_get_offsets(n_fields=ULINT_UNDEFINED).
 @return child node address */
 UNIV_INLINE
-ulint
+uint32_t
 btr_node_ptr_get_child_page_no(
 /*===========================*/
 	const rec_t*	rec,	/*!< in: node pointer record */
@@ -323,11 +334,11 @@ btr_node_ptr_get_child_page_no(
 @param[in]	type			type of the index
 @param[in,out]	space			tablespace where created
 @param[in]	index_id		index id
-@param[in]	index			index
+@param[in]	index			index, or NULL to create a system table
 @param[in,out]	mtr			mini-transaction
 @return	page number of the created root
 @retval	FIL_NULL	if did not succeed */
-ulint
+uint32_t
 btr_create(
 	ulint			type,
 	fil_space_t*		space,
@@ -385,6 +396,13 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
 @param[in,out]	mtr	mini-transaction */
 void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
 
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in]      index   clustered index with instant ALTER TABLE
+@param[in]      all     whether to reset FIL_PAGE_TYPE as well
+@param[in,out]  mtr     mini-transaction */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
 /*************************************************************//**
 Makes tree one level higher by splitting the root, and inserts
 the tuple. It is assumed that mtr contains an x-latch on the tree.
@@ -419,31 +437,6 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
 @retval true if the operation was successful
 @retval false if it is a compressed page, and recompression failed */
 bool
-btr_page_reorganize_low(
-/*====================*/
-	bool		recovery,/*!< in: true if called in recovery:
-				locks should not be updated, i.e.,
-				there cannot exist locks on the
-				page, and a hash index should not be
-				dropped: it cannot exist */
-	ulint		z_level,/*!< in: compression level to be used
-				if dealing with compressed page */
-	page_cur_t*	cursor,	/*!< in/out: page cursor */
-	dict_index_t*	index,	/*!< in: the index tree of the page */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-	MY_ATTRIBUTE((warn_unused_result));
-/*************************************************************//**
-Reorganizes an index page.
-
-IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index. This has to
-be done either within the same mini-transaction, or by invoking
-ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
-IBUF_BITMAP_FREE is unaffected by reorganization.
-
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool
 btr_page_reorganize(
 /*================*/
 	page_cur_t*	cursor,	/*!< in/out: page cursor */
@@ -506,8 +499,28 @@ btr_insert_on_non_leaf_level_func(
 #define btr_insert_on_non_leaf_level(f,i,l,t,m)			\
 	btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m)
 
-/** Sets a record as the predefined minimum record. */
-void btr_set_min_rec_mark(rec_t* rec, mtr_t* mtr) MY_ATTRIBUTE((nonnull));
+/** Set a child page pointer record as the predefined minimum record.
+@tparam has_prev  whether the page is supposed to have a left sibling
+@param[in,out]  rec     leftmost record on a leftmost non-leaf page
+@param[in,out]  block   buffer pool block
+@param[in,out]  mtr     mini-transaction */
+template<bool has_prev= false>
+inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
+                                 mtr_t *mtr)
+{
+  ut_ad(block.frame == page_align(rec));
+  ut_ad(!page_is_leaf(block.frame));
+  ut_ad(has_prev == page_has_prev(block.frame));
+
+  rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
+
+  if (block.page.zip.data)
+    /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
+    page. We are not modifying the compressed page frame at all. */
+    *rec|= REC_INFO_MIN_REC_FLAG;
+  else
+    mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
+}
 
 /** Seek to the parent page of a B-tree page.
 @param[in,out]	index	b-tree
@@ -560,32 +573,6 @@ btr_discard_page(
 	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
 				the root page */
 	mtr_t*		mtr);	/*!< in: mtr */
-/****************************************************************//**
-Parses the redo log record for setting an index record as the predefined
-minimum record.
-@return end of log record or NULL */
-byte*
-btr_parse_set_min_rec_mark(
-/*=======================*/
-	byte*	ptr,	/*!< in: buffer */
-	byte*	end_ptr,/*!< in: buffer end */
-	ulint	comp,	/*!< in: nonzero=compact page format */
-	page_t*	page,	/*!< in: page or NULL */
-	mtr_t*	mtr)	/*!< in: mtr or NULL */
-	MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
-/***********************************************************//**
-Parses a redo log record of reorganizing a page.
-@return end of log record or NULL */
-byte*
-btr_parse_page_reorganize(
-/*======================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	bool		compressed,/*!< in: true if compressed page */
-	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
-	mtr_t*		mtr)	/*!< in: mtr or NULL */
-	MY_ATTRIBUTE((warn_unused_result));
 /**************************************************************//**
 Gets the number of pages in a B-tree.
 @return number of pages, or ULINT_UNDEFINED if the index is unavailable */
@@ -615,15 +602,12 @@ btr_get_size_and_reserved(
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents!
-@retval NULL if no page could be allocated
-@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block (not allocated or initialized) otherwise */
+@retval NULL if no page could be allocated */
 buf_block_t*
 btr_page_alloc(
 /*===========*/
 	dict_index_t*	index,		/*!< in: index tree */
-	ulint		hint_page_no,	/*!< in: hint of a good page */
+	uint32_t	hint_page_no,	/*!< in: hint of a good page */
 	byte		file_direction,	/*!< in: direction where a possible
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
@@ -676,7 +660,7 @@ buf_block_t*
 btr_root_block_get(
 /*===============*/
 	const dict_index_t*	index,	/*!< in: index tree */
-	ulint			mode,	/*!< in: either RW_S_LATCH
+	rw_lock_type_t		mode,	/*!< in: either RW_S_LATCH
 					or RW_X_LATCH */
 	mtr_t*			mtr);	/*!< in: mtr */
 
@@ -691,15 +675,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
 
 @retval true if the operation was successful
 @retval false if it is a compressed page, and recompression failed */
-UNIV_INTERN
-bool
-btr_page_reorganize_block(
-/*======================*/
-	bool		recovery,/*!< in: true if called in recovery:
-				locks should not be updated, i.e.,
-				there cannot exist locks on the
-				page, and a hash index should not be
-				dropped: it cannot exist */
+bool btr_page_reorganize_block(
 	ulint		z_level,/*!< in: compression level to be used
 				if dealing with compressed page */
 	buf_block_t*	block,	/*!< in/out: B-tree page */
@@ -749,29 +725,12 @@ btr_validate_index(
 	MY_ATTRIBUTE((warn_unused_result));
 
 /** Remove a page from the level list of pages.
-@param[in]	space		space where removed
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out]	page		page to remove
+@param[in]	block		page to remove
 @param[in]	index		index tree
 @param[in,out]	mtr		mini-transaction */
-dberr_t
-btr_level_list_remove_func(
-	ulint			space,
-	ulint			zip_size,
-	page_t*			page,
-	dict_index_t*		index,
-	mtr_t*			mtr)
-	MY_ATTRIBUTE((warn_unused_result));
-
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space	in: space where removed
-@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
-@param page	in/out: page to remove
-@param index	in: index tree
-@param mtr	in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
-	btr_level_list_remove_func(space,zip_size,page,index,mtr)
+dberr_t btr_level_list_remove(const buf_block_t& block,
+                              const dict_index_t& index, mtr_t* mtr)
+  MY_ATTRIBUTE((warn_unused_result));
 
 /*************************************************************//**
 If page is the only on its level, this function moves its records to the
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
index d3827b7dc6f..89826e8f214 100644
--- a/storage/innobase/include/btr0btr.inl
+++ b/storage/innobase/include/btr0btr.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2019, MariaDB Corporation.
+Copyright (c) 2015, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -29,73 +29,6 @@ Created 6/2/1994 Heikki Tuuri
 #include "mtr0log.h"
 #include "page0zip.h"
 
-/** Gets a buffer page and declares its latching order level.
-@param[in]	page_id	page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	mode	latch mode
-@param[in]	file	file name
-@param[in]	line	line where called
-@param[in]	index	index tree, may be NULL if it is not an insert buffer
-tree
-@param[in,out]	mtr	mini-transaction
-@return block */
-UNIV_INLINE
-buf_block_t*
-btr_block_get_func(
-	const page_id_t		page_id,
-	ulint			zip_size,
-	ulint			mode,
-	const char*		file,
-	unsigned		line,
-	dict_index_t*		index,
-	mtr_t*			mtr)
-{
-	buf_block_t*	block;
-	dberr_t		err=DB_SUCCESS;
-
-	block = buf_page_get_gen(
-		page_id, zip_size, mode, NULL, BUF_GET, file, line, mtr, &err);
-
-	if (err == DB_DECRYPTION_FAILED) {
-		if (index && index->table) {
-			index->table->file_unreadable = true;
-		}
-	}
-
-	if (block) {
-		if (mode != RW_NO_LATCH) {
-
-			buf_block_dbg_add_level(
-				block, index != NULL && dict_index_is_ibuf(index)
-				? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
-		}
-	}
-
-	return(block);
-}
-
-/**************************************************************//**
-Sets the index id field of a page. */
-UNIV_INLINE
-void
-btr_page_set_index_id(
-/*==================*/
-	page_t*		page,	/*!< in: page to be created */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	index_id_t	id,	/*!< in: index id */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	if (page_zip) {
-		mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id);
-		page_zip_write_header(page_zip,
-				      page + (PAGE_HEADER + PAGE_INDEX_ID),
-				      8, mtr);
-	} else {
-		mlog_write_ull(page + (PAGE_HEADER + PAGE_INDEX_ID), id, mtr);
-	}
-}
-
 /**************************************************************//**
 Gets the index id field of a page.
 @return index id */
@@ -108,77 +41,45 @@ btr_page_get_index_id(
 	return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
 }
 
-/********************************************************//**
-Sets the node level field in an index page. */
-UNIV_INLINE
-void
-btr_page_set_level(
-/*===============*/
-	page_t*		page,	/*!< in: index page */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	ulint		level,	/*!< in: level, leaf level == 0 */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+/** Set PAGE_LEVEL.
+@param[in,out]  block  buffer block
+@param[in]      level  page level
+@param[in,out]  mtr    mini-transaction */
+inline
+void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
 {
-	ut_ad(page != NULL);
-	ut_ad(mtr != NULL);
-	ut_ad(level <= BTR_MAX_NODE_LEVEL);
-
-	if (page_zip) {
-		mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level);
-		page_zip_write_header(page_zip,
-				      page + (PAGE_HEADER + PAGE_LEVEL),
-				      2, mtr);
-	} else {
-		mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level,
-				 MLOG_2BYTES, mtr);
-	}
+  ut_ad(level <= BTR_MAX_NODE_LEVEL);
+  constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
+  byte *b= my_assume_aligned<2>(&block->frame[field]);
+  if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
+      UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
 }
 
-/********************************************************//**
-Sets the next index page field. */
-UNIV_INLINE
-void
-btr_page_set_next(
-/*==============*/
-	page_t*		page,	/*!< in: index page */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	ulint		next,	/*!< in: next page number */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+/** Set FIL_PAGE_NEXT.
+@param[in,out]  block  buffer block
+@param[in]      next   number of successor page
+@param[in,out]  mtr    mini-transaction */
+inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
 {
-	ut_ad(page != NULL);
-	ut_ad(mtr != NULL);
-
-	if (page_zip) {
-		mach_write_to_4(page + FIL_PAGE_NEXT, next);
-		page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
-	} else {
-		mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr);
-	}
+  constexpr uint16_t field= FIL_PAGE_NEXT;
+  byte *b= my_assume_aligned<4>(&block->frame[field]);
+  if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
+      UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
 }
 
-/********************************************************//**
-Sets the previous index page field. */
-UNIV_INLINE
-void
-btr_page_set_prev(
-/*==============*/
-	page_t*		page,	/*!< in: index page */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	ulint		prev,	/*!< in: previous page number */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+/** Set FIL_PAGE_PREV.
+@param[in,out]  block  buffer block
+@param[in]      prev   number of predecessor page
+@param[in,out]  mtr    mini-transaction */
+inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
 {
-	ut_ad(page != NULL);
-	ut_ad(mtr != NULL);
-
-	if (page_zip) {
-		mach_write_to_4(page + FIL_PAGE_PREV, prev);
-		page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
-	} else {
-		mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
-	}
+  constexpr uint16_t field= FIL_PAGE_PREV;
+  byte *b= my_assume_aligned<4>(&block->frame[field]);
+  if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
+      UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
 }
 
 /**************************************************************//**
@@ -189,7 +90,7 @@ the child page number. In other words offsets must have been retrieved
 with rec_get_offsets(n_fields=ULINT_UNDEFINED).
 @return child node address */
 UNIV_INLINE
-ulint
+uint32_t
 btr_node_ptr_get_child_page_no(
 /*===========================*/
 	const rec_t*	rec,	/*!< in: node pointer record */
@@ -197,7 +98,6 @@ btr_node_ptr_get_child_page_no(
 {
 	const byte*	field;
 	ulint		len;
-	ulint		page_no;
 
 	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
 
@@ -207,7 +107,7 @@ btr_node_ptr_get_child_page_no(
 
 	ut_ad(len == 4);
 
-	page_no = mach_read_from_4(field);
+	uint32_t page_no = mach_read_from_4(field);
 	ut_ad(page_no > 1);
 
 	return(page_no);
@@ -228,9 +128,9 @@ btr_leaf_page_release(
 	      || latch_mode == BTR_MODIFY_LEAF
 	      || latch_mode == BTR_NO_LATCHES);
 
-	ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY));
+	ut_ad(!mtr->memo_contains_flagged(block, MTR_MEMO_MODIFY));
 
-	ulint mode;
+	mtr_memo_type_t mode;
 	switch (latch_mode) {
 		case BTR_SEARCH_LEAF:
 			mode = MTR_MEMO_PAGE_S_FIX;
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
index b8428186383..943836f8759 100644
--- a/storage/innobase/include/btr0bulk.h
+++ b/storage/innobase/include/btr0bulk.h
@@ -35,8 +35,6 @@ Created 03/11/2014 Shaohua Wang
 
 /** Innodb B-tree index fill factor for bulk load. */
 extern	uint	innobase_fill_factor;
-/** whether to reduce redo logging during ALTER TABLE */
-extern	my_bool	innodb_log_optimize_ddl;
 
 /*
 The proper function call sequence of PageBulk is as below:
@@ -55,14 +53,12 @@ public:
 	@param[in]	index		B-tree index
 	@param[in]	page_no		page number
 	@param[in]	level		page level
-	@param[in]	trx_id		transaction id
-	@param[in]	observer	flush observer */
+	@param[in]	trx_id		transaction id */
 	PageBulk(
 		dict_index_t*	index,
 		trx_id_t	trx_id,
-		ulint		page_no,
-		ulint		level,
-		FlushObserver*	observer)
+		uint32_t	page_no,
+		ulint		level)
 		:
 		m_heap(NULL),
 		m_index(index),
@@ -83,7 +79,6 @@ public:
 		m_total_data(0),
 #endif /* UNIV_DEBUG */
 		m_modify_clock(0),
-		m_flush_observer(observer),
 		m_err(DB_SUCCESS)
 	{
 		ut_ad(!dict_index_is_spatial(m_index));
@@ -104,11 +99,24 @@ public:
 	/** Insert a record in the page.
 	@param[in]	rec		record
 	@param[in]	offsets		record offsets */
-	void insert(const rec_t* rec, rec_offs* offsets);
+	inline void insert(const rec_t* rec, rec_offs* offsets);
+private:
+	/** Page format */
+	enum format { REDUNDANT, DYNAMIC, COMPRESSED };
+	/** Mark end of insertion to the page. Scan all records to set page
+	dirs, and set page header members.
+	@tparam format  the page format */
+	template<format> inline void finishPage();
+	/** Insert a record in the page.
+	@tparam format  the page format
+	@param[in,out]	rec		record
+	@param[in]	offsets		record offsets */
+	template<format> inline void insertPage(rec_t* rec, rec_offs* offsets);
 
+public:
 	/** Mark end of insertion to the page. Scan all records to set page
 	dirs, and set page header members. */
-	void finish();
+	inline void finish();
 
   /** @return whether finish() actually needs to do something */
   inline bool needs_finish() const;
@@ -171,10 +179,7 @@ public:
 	inline bool isSpaceAvailable(ulint	rec_size);
 
 	/** Get page no */
-	ulint	getPageNo()
-	{
-		return(m_page_no);
-	}
+	uint32_t getPageNo() const { return m_page_no; }
 
 	/** Get page level */
 	ulint	getLevel()
@@ -205,6 +210,8 @@ public:
 		return(m_err);
 	}
 
+	void set_modified() { m_mtr.set_modified(*m_block); }
+
 	/* Memory heap for internal allocation */
 	mem_heap_t*	m_heap;
 
@@ -231,7 +238,7 @@ private:
 	rec_t*		m_cur_rec;
 
 	/** The page no */
-	ulint		m_page_no;
+	uint32_t	m_page_no;
 
 	/** The page level in B-tree */
 	ulint		m_level;
@@ -263,9 +270,6 @@ private:
 	when the block is re-pinned */
 	ib_uint64_t     m_modify_clock;
 
-	/** Flush observer, or NULL if redo logging is enabled */
-	FlushObserver*	m_flush_observer;
-
 	/** Operation result DB_SUCCESS or error code */
 	dberr_t		m_err;
 };
@@ -278,31 +282,15 @@ class BtrBulk
 public:
 	/** Constructor
 	@param[in]	index		B-tree index
-	@param[in]	trx		transaction
-	@param[in]	observer	flush observer */
+	@param[in]	trx		transaction */
 	BtrBulk(
 		dict_index_t*	index,
-		const trx_t*	trx,
-		FlushObserver*	observer)
+		const trx_t*	trx)
 		:
 		m_index(index),
-		m_trx(trx),
-		m_flush_observer(observer)
+		m_trx(trx)
 	{
 		ut_ad(!dict_index_is_spatial(index));
-#ifdef UNIV_DEBUG
-		if (m_flush_observer)
-			m_index->table->space->redo_skipped_count++;
-#endif /* UNIV_DEBUG */
-	}
-
-	/** Destructor */
-	~BtrBulk()
-	{
-#ifdef UNIV_DEBUG
-		if (m_flush_observer)
-			m_index->table->space->redo_skipped_count--;
-#endif /* UNIV_DEBUG */
 	}
 
 	/** Insert a tuple
@@ -376,9 +364,6 @@ private:
 	/** Root page level */
 	ulint			m_root_level;
 
-	/** Flush observer, or NULL if redo logging is enabled */
-	FlushObserver*const	m_flush_observer;
-
 	/** Page cursor vector for all level */
 	page_bulk_vector	m_page_bulks;
 };
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 0eb06a3bc99..3f9cad12004 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -69,7 +69,6 @@ struct btr_latch_leaves_t {
 
 #include "que0types.h"
 #include "row0types.h"
-#include "ha0ha.h"
 
 #ifdef UNIV_DEBUG
 /*********************************************************//**
@@ -356,6 +355,22 @@ btr_cur_update_alloc_zip_func(
 # define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
 	btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr)
 #endif /* UNIV_DEBUG */
+
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+@param[in,out]  rec     index record
+@param[in]      index   the index of the record
+@param[in]      offsets rec_get_offsets(rec, index)
+@param[in]      update  update vector
+@param[in,out]  block   index page
+@param[in,out]  mtr     mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+                              const rec_offs *offsets, const upd_t *update,
+                              buf_block_t *block, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
 /*************************************************************//**
 Updates a record when the update causes no size changes in its fields.
 @return locking or undo log related error code, or
@@ -380,19 +395,6 @@ btr_cur_update_in_place(
 				mtr_commit(mtr) before latching any
 				further pages */
 	MY_ATTRIBUTE((warn_unused_result, nonnull));
-/***********************************************************//**
-Writes a redo log record of updating a record in-place. */
-void
-btr_cur_update_in_place_log(
-/*========================*/
-	ulint		flags,		/*!< in: flags */
-	const rec_t*	rec,		/*!< in: record */
-	dict_index_t*	index,		/*!< in: index of the record */
-	const upd_t*	update,		/*!< in: update vector */
-	trx_id_t	trx_id,		/*!< in: transaction id */
-	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
-	mtr_t*		mtr)		/*!< in: mtr */
-	MY_ATTRIBUTE((nonnull));
 /*************************************************************//**
 Tries to update a record on a page in an index tree. It is assumed that mtr
 holds an x-latch on the page. The operation does not succeed if there is too
@@ -475,18 +477,6 @@ btr_cur_del_mark_set_clust_rec(
 	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
-/***********************************************************//**
-Sets a secondary index record delete mark to TRUE or FALSE.
-@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
-dberr_t
-btr_cur_del_mark_set_sec_rec(
-/*=========================*/
-	ulint		flags,	/*!< in: locking flag */
-	btr_cur_t*	cursor,	/*!< in: cursor */
-	ibool		val,	/*!< in: value to set */
-	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-	MY_ATTRIBUTE((nonnull, warn_unused_result));
 /*************************************************************//**
 Tries to compress a page of the tree if it seems useful. It is assumed
 that mtr holds an x-latch on the tree and on the cursor page. To avoid
@@ -575,44 +565,30 @@ btr_cur_parse_update_in_place(
 	page_t*		page,	/*!< in/out: page or NULL */
 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
 	dict_index_t*	index);	/*!< in: index corresponding to page */
-/****************************************************************//**
-Parses the redo log record for delete marking or unmarking of a clustered
-index record.
-@return end of log record or NULL */
-byte*
-btr_cur_parse_del_mark_set_clust_rec(
-/*=================================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	page_t*		page,	/*!< in/out: page or NULL */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	dict_index_t*	index);	/*!< in: index corresponding to page */
-/****************************************************************//**
-Parses the redo log record for delete marking or unmarking of a secondary
-index record.
-@return end of log record or NULL */
-byte*
-btr_cur_parse_del_mark_set_sec_rec(
-/*===============================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	page_t*		page,	/*!< in/out: page or NULL */
-	page_zip_des_t*	page_zip);/*!< in/out: compressed page, or NULL */
+/** Arguments to btr_estimate_n_rows_in_range */
+struct btr_pos_t
+{
+  btr_pos_t(dtuple_t *arg_tuple,
+            page_cur_mode_t arg_mode,
+            page_id_t arg_page_id)
+  :tuple(arg_tuple), mode(arg_mode), page_id(arg_page_id)
+  {}
+
+  dtuple_t*       tuple;       /* Range start or end. May be NULL */
+  page_cur_mode_t mode;        /* search mode for range */
+  page_id_t       page_id;     /* Out: Page where we found the tuple */
+};
 
 /** Estimates the number of rows in a given index range.
 @param[in]	index	index
-@param[in]	tuple1	range start, may also be empty tuple
-@param[in]	mode1	search mode for range start
-@param[in]	tuple2	range end, may also be empty tuple
-@param[in]	mode2	search mode for range end
+@param[in,out]	range_start	range start (tuple and mode in, page_id out)
+@param[in,out]	range_end	range end (tuple and mode in, page_id out)
 @return estimated number of rows */
 ha_rows
 btr_estimate_n_rows_in_range(
 	dict_index_t*	index,
-	const dtuple_t*	tuple1,
-	page_cur_mode_t	mode1,
-	const dtuple_t*	tuple2,
-	page_cur_mode_t	mode2);
+        btr_pos_t*      range_start,
+        btr_pos_t*      range_end);
 
 
 /** Statistics for one field of an index. */
@@ -662,8 +638,7 @@ to free the field. */
 void
 btr_cur_disown_inherited_fields(
 /*============================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
-				part will be updated, or NULL */
+	buf_block_t*	block,	/*!< in/out: index page */
 	rec_t*		rec,	/*!< in/out: record in a clustered index */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
@@ -738,12 +713,12 @@ btr_free_externally_stored_field(
 					page_zip_write_blob_ptr(), or NULL */
 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
 					or NULL */
-	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
-					to rec, or NULL if rec == NULL */
+	buf_block_t*	block,		/*!< in/out: page of field_ref */
 	ulint		i,		/*!< in: field number of field_ref;
 					ignored if rec == NULL */
 	bool		rollback,	/*!< in: performing rollback? */
-	mtr_t*		local_mtr);	/*!< in: mtr containing the latch */
+	mtr_t*		local_mtr)	/*!< in: mtr containing the latch */
+	MY_ATTRIBUTE((nonnull(1,2,5,8)));
 
 /** Copies the prefix of an externally stored field of a record.
 The clustered index record must be protected by a lock or a page latch.
@@ -800,33 +775,8 @@ btr_rec_copy_externally_stored_field(
 	ulint*			len,
 	mem_heap_t*		heap);
 
-/***********************************************************//**
-Sets a secondary index record's delete mark to the given value. This
-function is only used by the insert buffer merge mechanism. */
-void
-btr_cur_set_deleted_flag_for_ibuf(
-/*==============================*/
-	rec_t*		rec,		/*!< in/out: record */
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
-					corresponding to rec, or NULL
-					when the tablespace is uncompressed */
-	ibool		val,		/*!< in: value to set */
-	mtr_t*		mtr);		/*!< in/out: mini-transaction */
-
-/******************************************************//**
-The following function is used to set the deleted bit of a record. */
-UNIV_INLINE
-void
-btr_rec_set_deleted_flag(
-/*=====================*/
-	rec_t*		rec,	/*!< in/out: physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page (or NULL) */
-	ulint		flag);	/*!< in: nonzero if delete marked */
-
 /** Latches the leaf page or pages requested.
 @param[in]	block		leaf page where the search converged
-@param[in]	page_id		page id of the leaf
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	latch_mode	BTR_SEARCH_LEAF, ...
 @param[in]	cursor		cursor
 @param[in]	mtr		mini-transaction
@@ -834,8 +784,6 @@ btr_rec_set_deleted_flag(
 btr_latch_leaves_t
 btr_cur_latch_leaves(
 	buf_block_t*		block,
-	const page_id_t		page_id,
-	ulint			zip_size,
 	ulint			latch_mode,
 	btr_cur_t*		cursor,
 	mtr_t*			mtr);
@@ -866,7 +814,7 @@ struct btr_path_t {
 	ulint	n_recs;
 
 	/** Number of the page containing the record. */
-	ulint	page_no;
+	uint32_t page_no;
 
 	/** Level of the page. If later we fetch the page under page_no
 	and it is no different level then we know that the tree has been
@@ -987,15 +935,14 @@ struct btr_cur_t {
 	}
 };
 
-/******************************************************//**
-The following function is used to set the deleted bit of a record. */
-UNIV_INLINE
-void
-btr_rec_set_deleted_flag(
-/*=====================*/
-	rec_t*		rec,	/*!< in/out: physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page (or NULL) */
-	ulint		flag);	/*!< in: nonzero if delete marked */
+/** Modify the delete-mark flag of a record.
+@tparam         flag    the value of the delete-mark flag
+@param[in,out]  block   buffer block
+@param[in,out]  rec     record on a physical index page
+@param[in,out]  mtr     mini-transaction  */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
 
 /** If pessimistic delete fails because of lack of file space, there
 is still a good change of success a little later.  Try this many
diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl
index be6ac28129e..8a45b714936 100644
--- a/storage/innobase/include/btr0cur.inl
+++ b/storage/innobase/include/btr0cur.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -129,25 +129,25 @@ btr_cur_compress_recommendation(
 {
 	const page_t*	page;
 
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
-			       MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
 
 	page = btr_cur_get_page(cursor);
 
 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U,
 				      return(FALSE));
 
-	if (page_get_data_size(page)
-	    < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)
-	    || !page_has_siblings(page)) {
+	if (!page_has_siblings(page)
+	    || page_get_data_size(page)
+	    < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
 
 		/* The page fillfactor has dropped below a predefined
 		minimum value OR the level in the B-tree contains just
 		one page: we recommend compression if this is not the
 		root page. */
 
-		return(dict_index_get_page(cursor->index)
-		       != page_get_page_no(page));
+		return cursor->index->page
+			!= btr_cur_get_block(cursor)->page.id().page_no();
 	}
 
 	return(FALSE);
@@ -167,22 +167,22 @@ btr_cur_can_delete_without_compress(
 {
 	page_t*		page;
 
-	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
-				MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
 
 	page = btr_cur_get_page(cursor);
 
-	if (page_get_data_size(page) - rec_size
-	    < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)
-	    || !page_has_siblings(page) || page_get_n_recs(page) < 2) {
+	if (!page_has_siblings(page) || page_get_n_recs(page) < 2
+	    || page_get_data_size(page) - rec_size
+	    < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
 
 		/* The page fillfactor will drop below a predefined
 		minimum value, OR the level in the B-tree contains just
 		one page, OR the page will become empty: we recommend
 		compression if this is not the root page. */
 
-		return(dict_index_get_page(cursor->index)
-		       == page_get_page_no(page));
+		return cursor->index->page
+			== btr_cur_get_block(cursor)->page.id().page_no();
 	}
 
 	return(TRUE);
@@ -209,21 +209,3 @@ btr_blob_op_is_update(
 	ut_ad(0);
 	return(FALSE);
 }
-
-/******************************************************//**
-The following function is used to set the deleted bit of a record. */
-UNIV_INLINE
-void
-btr_rec_set_deleted_flag(
-/*=====================*/
-	rec_t*		rec,	/*!< in/out: physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page (or NULL) */
-	ulint		flag)	/*!< in: nonzero if delete marked */
-{
-	if (page_rec_is_comp(rec)) {
-		rec_set_deleted_flag_new(rec, page_zip, flag);
-	} else {
-		ut_ad(!page_zip);
-		rec_set_deleted_flag_old(rec, flag);
-	}
-}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
index 2b6b8f77c73..a9212db0e04 100644
--- a/storage/innobase/include/btr0defragment.h
+++ b/storage/innobase/include/btr0defragment.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
-Copyright (C) 2014, 2017, MariaDB Corporation.
+Copyright (C) 2014, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -30,21 +30,6 @@ extern Atomic_counter<ulint> btr_defragment_compression_failures;
 extern Atomic_counter<ulint> btr_defragment_failures;
 extern Atomic_counter<ulint> btr_defragment_count;
 
-/** Item in the work queue for btr_degrament_thread. */
-struct btr_defragment_item_t
-{
-	btr_pcur_t*	pcur;		/* persistent cursor where
-					btr_defragment_n_pages should start */
-	os_event_t	event;		/* if not null, signal after work
-					is done */
-	bool		removed;	/* Mark an item as removed */
-	ulonglong	last_processed;	/* timestamp of last time this index
-					is processed by defragment thread */
-
-	btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
-	~btr_defragment_item_t();
-};
-
 /******************************************************************//**
 Initialize defragmentation. */
 void
@@ -84,12 +69,7 @@ void
 btr_defragment_save_defrag_stats_if_needed(
 	dict_index_t*	index);	/*!< in: index */
 
-/** Merge consecutive b-tree pages into fewer pages to defragment indexes */
-extern "C" UNIV_INTERN
-os_thread_ret_t
-DECLARE_THREAD(btr_defragment_thread)(void*);
-
-/** Whether btr_defragment_thread is active */
-extern bool btr_defragment_thread_active;
-
+/* Stop defragmentation.*/
+void btr_defragment_end();
+extern bool btr_defragment_active;
 #endif
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index bbb9831ae93..e50a97a6b1d 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -516,7 +516,8 @@ struct btr_pcur_t{
 	ulint		buf_size;
 
 	btr_pcur_t() :
-		btr_cur(), latch_mode(0), old_stored(false), old_rec(NULL),
+		btr_cur(), latch_mode(RW_NO_LATCH),
+		old_stored(false), old_rec(NULL),
 		old_n_fields(0), rel_pos(btr_pcur_pos_t(0)),
 		block_when_stored(),
 		modify_clock(0), pos_state(BTR_PCUR_NOT_POSITIONED),
diff --git a/storage/innobase/include/btr0scrub.h b/storage/innobase/include/btr0scrub.h
deleted file mode 100644
index 0f17467fb70..00000000000
--- a/storage/innobase/include/btr0scrub.h
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright 2014 Google
-
-#ifndef btr0scrub_h
-#define btr0scrub_h
-
-#include "dict0dict.h"
-
-/**
- * enum describing page allocation status
- */
-enum btr_scrub_page_allocation_status_t {
-	BTR_SCRUB_PAGE_FREE,
-	BTR_SCRUB_PAGE_ALLOCATED,
-	BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN
-};
-
-/**
-* constants returned by btr_page_needs_scrubbing & btr_scrub_recheck_page
-*/
-#define BTR_SCRUB_PAGE                         1 /* page should be scrubbed */
-#define BTR_SCRUB_SKIP_PAGE                    2 /* no scrub & no action */
-#define BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE    3 /* no scrub & close table */
-#define BTR_SCRUB_SKIP_PAGE_AND_COMPLETE_SPACE 4 /* no scrub & complete space */
-#define BTR_SCRUB_TURNED_OFF                   5 /* we detected that scrubbing
-						 was disabled by global
-						 variable */
-
-/**************************************************************//**
-struct for keeping scrub statistics. */
-struct btr_scrub_stat_t {
-	/* page reorganizations */
-	ulint page_reorganizations;
-	/* page splits */
-	ulint page_splits;
-	/* scrub failures */
-	ulint page_split_failures_underflow;
-	ulint page_split_failures_out_of_filespace;
-	ulint page_split_failures_missing_index;
-	ulint page_split_failures_unknown;
-};
-
-/**************************************************************//**
-struct for thread local scrub state. */
-struct btr_scrub_t {
-
-	/* current space */
-	ulint space;
-
-	/* is scrubbing enabled for this space */
-	bool scrubbing;
-
-	/* is current space compressed */
-	bool compressed;
-
-	dict_table_t* current_table;
-	dict_index_t* current_index;
-	/* savepoint for X_LATCH of block */
-	ulint savepoint;
-
-	/* statistic counters */
-	btr_scrub_stat_t scrub_stat;
-};
-
-/*********************************************************************
-Init scrub global variables */
-UNIV_INTERN
-void
-btr_scrub_init();
-
-/*********************************************************************
-Cleanup scrub globals */
-UNIV_INTERN
-void
-btr_scrub_cleanup();
-
-/***********************************************************************
-Return crypt statistics */
-UNIV_INTERN
-void
-btr_scrub_total_stat(
-/*==================*/
-	btr_scrub_stat_t *stat); /*!< out: stats to update */
-
-/**************************************************************//**
-Check if a page needs scrubbing
-* @return BTR_SCRUB_PAGE if page should be scrubbed
-* else btr_scrub_skip_page should be called
-* with this return value (and without any latches held)
-*/
-UNIV_INTERN
-int
-btr_page_needs_scrubbing(
-/*=====================*/
-	btr_scrub_t*	scrub_data, /*!< in: scrub data  */
-	buf_block_t*	block,	    /*!< in: block to check, latched */
-	btr_scrub_page_allocation_status_t allocated); /*!< in: is block
-						       allocated, free or
-						       unknown */
-
-/****************************************************************
-Recheck if a page needs scrubbing, and if it does load appropriate
-table and index
-* @return BTR_SCRUB_PAGE if page should be scrubbed
-* else btr_scrub_skip_page should be called
-* with this return value (and without any latches held)
-*/
-UNIV_INTERN
-int
-btr_scrub_recheck_page(
-/*====================*/
-	btr_scrub_t* scrub_data,  /*!< inut: scrub data */
-	buf_block_t* block,       /*!< in: block */
-	btr_scrub_page_allocation_status_t allocated, /*!< in: is block
-						      allocated or free */
-	mtr_t* mtr);              /*!< in: mtr */
-
-/****************************************************************
-Perform actual scrubbing of page */
-UNIV_INTERN
-int
-btr_scrub_page(
-/*============*/
-	btr_scrub_t* scrub_data,  /*!< in/out: scrub data */
-	buf_block_t* block,       /*!< in: block */
-	btr_scrub_page_allocation_status_t allocated, /*!< in: is block
-						      allocated or free */
-	mtr_t* mtr);              /*!< in: mtr */
-
-/****************************************************************
-Perform cleanup needed for a page not needing scrubbing */
-UNIV_INTERN
-void
-btr_scrub_skip_page(
-/*============*/
-	btr_scrub_t* scrub_data,  /*!< in/out: scrub data */
-	int needs_scrubbing);     /*!< in:  return value from
-				  btr_page_needs_scrubbing or
-				  btr_scrub_recheck_page which encodes what kind
-				  of cleanup is needed */
-
-/****************************************************************
-Start iterating a space
-* @return true if scrubbing is turned on */
-bool btr_scrub_start_space(const fil_space_t &space, btr_scrub_t *scrub_data);
-
-/** Complete iterating a space.
-@param[in,out]	scrub_data	 scrub data */
-UNIV_INTERN
-void
-btr_scrub_complete_space(btr_scrub_t* scrub_data);
-
-#endif
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
index 8ed0a13f0b5..db4c54008c2 100644
--- a/storage/innobase/include/btr0sea.h
+++ b/storage/innobase/include/btr0sea.h
@@ -30,19 +30,16 @@ Created 2/17/1996 Heikki Tuuri
 #include "dict0dict.h"
 #ifdef BTR_CUR_HASH_ADAPT
 #include "ha0ha.h"
+#include "sync0sync.h"
 
-/** Creates and initializes the adaptive search system at a database start.
-@param[in]	hash_size	hash table size. */
-void btr_search_sys_create(ulint hash_size);
-
-/** Frees the adaptive search system at a database shutdown. */
-void btr_search_sys_free();
+#define btr_search_sys_create() btr_search_sys.create()
+#define btr_search_sys_free() btr_search_sys.free()
 
 /** Disable the adaptive hash search system and empty the index. */
 void btr_search_disable();
 
 /** Enable the adaptive hash search system.
-@param resize whether buf_pool_resize() is the caller */
+@param resize whether buf_pool_t::resize() is the caller */
 void btr_search_enable(bool resize= false);
 
 /*********************************************************************//**
@@ -98,7 +95,7 @@ btr_search_move_or_delete_hash_entries(
 @param[in,out]	block	block containing index page, s- or x-latched, or an
 			index page for which we know that
 			block->buf_fix_count == 0 or it is an index page which
-			has already been removed from the buf_pool->page_hash
+			has already been removed from the buf_pool.page_hash
 			i.e.: it is in state BUF_BLOCK_REMOVE_HASH */
 void btr_search_drop_page_hash_index(buf_block_t* block);
 
@@ -162,19 +159,8 @@ static inline bool btr_search_own_any();
 /** Unlock all search latches from shared mode. */
 static inline void btr_search_s_unlock_all();
 
-/** Get the latch based on index attributes.
-A latch is selected from an array of latches using pair of index-id, space-id.
-@param[in]	index	index handler
-@return latch */
-static inline rw_lock_t* btr_get_search_latch(const dict_index_t* index);
-
-/** Get the hash-table based on index attributes.
-A table is selected from an array of tables using pair of index-id, space-id.
-@param[in]	index	index handler
-@return hash table */
-static inline hash_table_t* btr_get_search_table(const dict_index_t* index);
 #else /* BTR_CUR_HASH_ADAPT */
-# define btr_search_sys_create(size)
+# define btr_search_sys_create()
 # define btr_search_sys_free()
 # define btr_search_drop_page_hash_index(block)
 # define btr_search_s_lock_all(index)
@@ -230,9 +216,9 @@ struct btr_search_t{
 				btr_search_info_create(). */
 
 	/*---------------------- @{ */
-	ulint	n_fields;	/*!< recommended prefix length for hash search:
+	uint16_t n_fields;	/*!< recommended prefix length for hash search:
 				number of full fields */
-	ulint	n_bytes;	/*!< recommended prefix: number of bytes in
+	uint16_t n_bytes;	/*!< recommended prefix: number of bytes in
 				an incomplete field
 				@see BTR_PAGE_MAX_REC_SIZE */
 	bool	left_side;	/*!< true or false, depending on whether
@@ -257,31 +243,124 @@ struct btr_search_t{
 };
 
 #ifdef BTR_CUR_HASH_ADAPT
+/** The hash index system */
+struct btr_search_sys_t
+{
+  /** Partition of the hash table */
+  struct partition
+  {
+    /** latches protecting hash_table */
+    rw_lock_t latch;
+    /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
+    hash_table_t table;
+    /** memory heap for table */
+    mem_heap_t *heap;
+    /** pad the members up to a multiple of CPU_LEVEL1_DCACHE_LINESIZE */
+    char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof(rw_lock_t) -
+              sizeof(hash_table_t) - sizeof(mem_heap_t*)) &
+             (CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+    /** Zero-fill the partition and create its latch */
+    void init()
+    {
+      memset((void*) this, 0, sizeof *this);
+      rw_lock_create(btr_search_latch_key, &latch, SYNC_SEARCH_SYS);
+    }
+    /** Create the hash table and the memory heap for it */
+    void alloc(ulint hash_size)
+    {
+      table.create(hash_size);
+      heap= mem_heap_create_typed(std::min<ulong>(4096,
+                                                  MEM_MAX_ALLOC_IN_BUF / 2
+                                                  - MEM_BLOCK_HEADER_SIZE
+                                                  - MEM_SPACE_NEEDED(0)),
+                                  MEM_HEAP_FOR_BTR_SEARCH);
+    }
+    /** Free the memory heap and the hash table array; the latch remains */
+    void clear()
+    {
+      mem_heap_free(heap);
+      heap= nullptr;
+      ut_free(table.array);
+    }
+    /** Destroy the latch; free the heap and array if still allocated */
+    void free()
+    {
+      rw_lock_free(&latch);
+      if (heap)
+        clear();
+    }
+  };
+
+  /** Partitions of the adaptive hash index */
+  partition *parts;
+
+  /** Get an adaptive hash index partition */
+  partition *get_part(index_id_t id, ulint space_id) const
+  {
+    return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts;
+  }
+
+  /** Get an adaptive hash index partition */
+  partition *get_part(const dict_index_t &index) const
+  {
+    ut_ad(!index.table->space ||
+          index.table->space->id == index.table->space_id);
+    return get_part(ulint(index.id), index.table->space_id);
+  }
+
+  /** Get the search latch for the adaptive hash index partition */
+  rw_lock_t *get_latch(const dict_index_t &index) const
+  { return &get_part(index)->latch; }
+
+  /** Create and initialize at startup */
+  void create()
+  {
+    parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts,
+                                             mem_key_ahi));
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].init();
+    if (btr_search_enabled)
+      btr_search_enable();
+  }
+  /** Allocate all partitions, dividing hash_size evenly among them */
+  void alloc(ulint hash_size)
+  {
+    hash_size/= btr_ahi_parts;
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].alloc(hash_size);
+  }
+
+  /** Clear when disabling the adaptive hash index */
+  void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); }
+
+  /** Free at shutdown */
+  void free()
+  {
+    if (parts)
+    {
+      for (ulong i= 0; i < btr_ahi_parts; ++i)
+        parts[i].free();
+      ut_free(parts);
+      parts= nullptr;
+    }
+  }
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t btr_search_sys;
+
 /** @return number of leaf pages pointed to by the adaptive hash index */
 inline ulint dict_index_t::n_ahi_pages() const
 {
   if (!btr_search_enabled)
     return 0;
-  rw_lock_t *latch = btr_get_search_latch(this);
+  rw_lock_t *latch = &btr_search_sys.get_part(*this)->latch;
   rw_lock_s_lock(latch);
   ulint ref_count= search_info->ref_count;
   rw_lock_s_unlock(latch);
   return ref_count;
 }
 
-/** The hash index system */
-struct btr_search_sys_t{
-	hash_table_t**	hash_tables;	/*!< the adaptive hash tables,
-					mapping dtuple_fold values
-					to rec_t pointers on index pages */
-};
-
-/** Latches protecting access to adaptive hash index. */
-extern rw_lock_t**		btr_search_latches;
-
-/** The adaptive hash index */
-extern btr_search_sys_t*	btr_search_sys;
-
 #ifdef UNIV_SEARCH_PERF_STAT
 /** Number of successful adaptive hash index lookups */
 extern ulint	btr_search_n_succ;
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
index 9db0084ce59..40eb5d86ead 100644
--- a/storage/innobase/include/btr0sea.inl
+++ b/storage/innobase/include/btr0sea.inl
@@ -88,7 +88,7 @@ btr_search_info_update(
 static inline void btr_search_x_lock_all()
 {
 	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		rw_lock_x_lock(btr_search_latches[i]);
+		rw_lock_x_lock(&btr_search_sys.parts[i].latch);
 	}
 }
 
@@ -96,7 +96,7 @@ static inline void btr_search_x_lock_all()
 static inline void btr_search_x_unlock_all()
 {
 	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		rw_lock_x_unlock(btr_search_latches[i]);
+		rw_lock_x_unlock(&btr_search_sys.parts[i].latch);
 	}
 }
 
@@ -104,7 +104,7 @@ static inline void btr_search_x_unlock_all()
 static inline void btr_search_s_lock_all()
 {
 	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		rw_lock_s_lock(btr_search_latches[i]);
+		rw_lock_s_lock(&btr_search_sys.parts[i].latch);
 	}
 }
 
@@ -112,7 +112,7 @@ static inline void btr_search_s_lock_all()
 static inline void btr_search_s_unlock_all()
 {
 	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		rw_lock_s_unlock(btr_search_latches[i]);
+		rw_lock_s_unlock(&btr_search_sys.parts[i].latch);
 	}
 }
 
@@ -124,7 +124,7 @@ static inline void btr_search_s_unlock_all()
 static inline bool btr_search_own_all(ulint mode)
 {
 	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		if (!rw_lock_own(btr_search_latches[i], mode)) {
+		if (!rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
 			return(false);
 		}
 	}
@@ -138,7 +138,7 @@ static inline bool btr_search_own_all(ulint mode)
 static inline bool btr_search_own_any(ulint mode)
 {
 	for (ulint i = 0; i < btr_ahi_parts; ++i) {
-		if (rw_lock_own(btr_search_latches[i], mode)) {
+		if (rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
 			return(true);
 		}
 	}
@@ -149,7 +149,7 @@ static inline bool btr_search_own_any(ulint mode)
 static inline bool btr_search_own_any()
 {
 	for (ulint i = btr_ahi_parts; i--; ) {
-		if (rw_lock_own_flagged(btr_search_latches[i],
+		if (rw_lock_own_flagged(&btr_search_sys.parts[i].latch,
 					RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)) {
 			return true;
 		}
@@ -157,34 +157,4 @@ static inline bool btr_search_own_any()
 	return false;
 }
 #endif /* UNIV_DEBUG */
-
-/** Get the adaptive hash search index latch for a b-tree.
-@param[in]	index	b-tree index
-@return latch */
-static inline rw_lock_t* btr_get_search_latch(const dict_index_t* index)
-{
-	ut_ad(index != NULL);
-	ut_ad(!index->table->space
-	      || index->table->space->id == index->table->space_id);
-
-	ulint	ifold = ut_fold_ulint_pair(ulint(index->id),
-					   index->table->space_id);
-
-	return(btr_search_latches[ifold % btr_ahi_parts]);
-}
-
-/** Get the hash-table based on index attributes.
-A table is selected from an array of tables using pair of index-id, space-id.
-@param[in]	index	index handler
-@return hash table */
-static inline hash_table_t* btr_get_search_table(const dict_index_t* index)
-{
-	ut_ad(index != NULL);
-	ut_ad(index->table->space->id == index->table->space_id);
-
-	ulint	ifold = ut_fold_ulint_pair(ulint(index->id),
-					   index->table->space_id);
-
-	return(btr_search_sys->hash_tables[ifold % btr_ahi_parts]);
-}
 #endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
index 2d681175b25..ee48e7ce6d2 100644
--- a/storage/innobase/include/buf0block_hint.h
+++ b/storage/innobase/include/buf0block_hint.h
@@ -28,19 +28,18 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 namespace buf {
 class Block_hint {
- public:
-  Block_hint():m_block(NULL),m_page_id(0,0) {}
+public:
   /** Stores the pointer to the block, which is currently buffer-fixed.
   @param  block   a pointer to a buffer-fixed block to be stored */
   inline void store(buf_block_t *block)
   {
-    ut_ad(block->page.buf_fix_count);
+    ut_ad(block->page.buf_fix_count());
     m_block= block;
-    m_page_id= block->page.id;
+    m_page_id= block->page.id();
   }
 
   /** Clears currently stored pointer. */
-  inline void clear() { m_block= NULL; }
+  inline void clear() { m_block= nullptr; }
 
   /** Invoke f on m_block(which may be null)
   @param  f   The function to be executed. It will be passed the pointer.
@@ -65,9 +64,9 @@ class Block_hint {
 
  private:
   /** The block pointer stored by store(). */
-  buf_block_t *m_block;
+  buf_block_t *m_block= nullptr;
   /** If m_block is non-null, the m_block->page.id at time it was stored. */
-  page_id_t m_page_id;
+  page_id_t m_page_id{0, 0};
 
   /** A helper function which checks if m_block is not a dangling pointer and
   still points to block with page with m_page_id and if so, buffer-fixes it,
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
index 63ec8cd0001..bb9994203d6 100644
--- a/storage/innobase/include/buf0buddy.h
+++ b/storage/innobase/include/buf0buddy.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -28,60 +29,63 @@ Created December 2006 by Marko Makela
 
 #include "buf0types.h"
 
-/**********************************************************************//**
-Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any
-block->mutex.  The buf_pool->mutex may be released and reacquired.
-This function should only be used for allocating compressed page frames.
+/** Map a buddy block size to its slot number.
+@param[in]	size	block size in bytes
+@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+inline
+ulint
+buf_buddy_get_slot(ulint size)
+{
+	ulint	i;
+	ulint	s;
+	/* size: a power of 2 in [UNIV_ZIP_SIZE_MIN, srv_page_size] */
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size <= srv_page_size);
+	/* count the doublings of BUF_BUDDY_LOW needed to reach size */
+	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+	}
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	return i;
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i      index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc));
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param size   compressed page size in bytes
+@param lru    assigned to true if buf_pool.mutex was temporarily released
 @return allocated block, never NULL */
-UNIV_INLINE
-byte*
-buf_buddy_alloc(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
-					the page resides */
-	ulint		size,		/*!< in: compressed page size
-					(between UNIV_ZIP_SIZE_MIN and
-					srv_page_size) */
-	bool*		lru)		/*!< in: pointer to a variable
-					that will be assigned true if
-				       	storage was allocated from the
-				       	LRU list and buf_pool->mutex was
-				       	temporarily released */
-	MY_ATTRIBUTE((malloc, nonnull));
-
-/**********************************************************************//**
-Deallocate a block. */
-UNIV_INLINE
-void
-buf_buddy_free(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
-					the block resides */
-	void*		buf,		/*!< in: block to be freed, must not
-					be pointed to by the buffer pool */
-	ulint		size)		/*!< in: block size,
-					up to srv_page_size */
-	MY_ATTRIBUTE((nonnull));
-
-/** Reallocate a block.
-@param[in]	buf_pool	buffer pool instance
+inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr)
+{
+  return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru);
+}
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	i	index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i);
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	size	block size in bytes */
+inline void buf_buddy_free(void* buf, ulint size)
+{
+	buf_buddy_free_low(buf, buf_buddy_get_slot(size));
+}
+
+/** Try to reallocate a block.
 @param[in]	buf		block to be reallocated, must be pointed
 to by the buffer pool
 @param[in]	size		block size, up to srv_page_size
 @retval false	if failed because of no free blocks. */
-bool
-buf_buddy_realloc(
-	buf_pool_t*	buf_pool,
-	void*		buf,
-	ulint		size);
-
-/** Combine all pairs of free buddies.
-@param[in]	buf_pool	buffer pool instance */
-void
-buf_buddy_condense_free(
-	buf_pool_t*	buf_pool);
-
-#include "buf0buddy.inl"
+bool buf_buddy_realloc(void* buf, ulint size);
 
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free();
 #endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buddy.inl b/storage/innobase/include/buf0buddy.inl
deleted file mode 100644
index 39ab46d80dd..00000000000
--- a/storage/innobase/include/buf0buddy.inl
+++ /dev/null
@@ -1,129 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/buf0buddy.ic
-Binary buddy allocator for compressed pages
-
-Created December 2006 by Marko Makela
-*******************************************************/
-
-#include "buf0buf.h"
-#include "buf0buddy.h"
-
-/**********************************************************************//**
-Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
-The buf_pool_mutex may be released and reacquired.
-@return allocated block, never NULL */
-void*
-buf_buddy_alloc_low(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	ulint		i,		/*!< in: index of buf_pool->zip_free[],
-					or BUF_BUDDY_SIZES */
-	bool*		lru)		/*!< in: pointer to a variable that
-					will be assigned true if storage was
-					allocated from the LRU list and
-					buf_pool->mutex was temporarily
-					released */
-	MY_ATTRIBUTE((malloc, nonnull));
-
-/**********************************************************************//**
-Deallocate a block. */
-void
-buf_buddy_free_low(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: block to be freed, must not be
-					pointed to by the buffer pool */
-	ulint		i)		/*!< in: index of buf_pool->zip_free[],
-					or BUF_BUDDY_SIZES */
-	MY_ATTRIBUTE((nonnull));
-
-/**********************************************************************//**
-Get the index of buf_pool->zip_free[] for a given block size.
-@return index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */
-UNIV_INLINE
-ulint
-buf_buddy_get_slot(
-/*===============*/
-	ulint	size)	/*!< in: block size */
-{
-	ulint	i;
-	ulint	s;
-
-	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
-
-	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
-	}
-
-	ut_ad(i <= BUF_BUDDY_SIZES);
-	return(i);
-}
-
-/**********************************************************************//**
-Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any
-block->mutex.  The buf_pool->mutex may be released and reacquired.
-This function should only be used for allocating compressed page frames.
-@return allocated block, never NULL */
-UNIV_INLINE
-byte*
-buf_buddy_alloc(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
-					the page resides */
-	ulint		size,		/*!< in: compressed page size
-					(between UNIV_ZIP_SIZE_MIN and
-					srv_page_size) */
-	bool*		lru)		/*!< in: pointer to a variable
-					that will be assigned true if
-				       	storage was allocated from the
-				       	LRU list and buf_pool->mutex was
-				       	temporarily released */
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
-	ut_ad(size <= srv_page_size);
-
-	return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size),
-					   lru));
-}
-
-/**********************************************************************//**
-Deallocate a block. */
-UNIV_INLINE
-void
-buf_buddy_free(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
-					the block resides */
-	void*		buf,		/*!< in: block to be freed, must not
-					be pointed to by the buffer pool */
-	ulint		size)		/*!< in: block size,
-					up to srv_page_size */
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
-	ut_ad(size <= srv_page_size);
-
-	buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size));
-}
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index e61e3e4c283..fec8dcb4e2a 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2021, Oracle and/or its affiliates.
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2013, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
@@ -34,12 +34,11 @@ Created 11/5/1995 Heikki Tuuri
 #include "mtr0types.h"
 #include "buf0types.h"
 #include "span.h"
+#include "assume_aligned.h"
 #ifndef UNIV_INNOCHECKSUM
 #include "hash0hash.h"
 #include "ut0byte.h"
 #include "page0types.h"
-#include "ut0rbt.h"
-#include "os0proc.h"
 #include "log0log.h"
 #include "srv0srv.h"
 #include <ostream>
@@ -68,31 +67,6 @@ struct fil_addr_t;
 					if the file page has been freed. */
 #define BUF_EVICT_IF_IN_POOL	20	/*!< evict a clean block if found */
 /* @} */
-/** @name Modes for buf_page_get_known_nowait */
-/* @{ */
-#ifdef BTR_CUR_HASH_ADAPT
-# define BUF_MAKE_YOUNG	51		/*!< Move the block to the
-					start of the LRU list if there
-					is a danger that the block
-					would drift out of the buffer
-					pool*/
-#endif /* BTR_CUR_HASH_ADAPT */
-#define BUF_KEEP_OLD	52		/*!< Preserve the current LRU
-					position of the block. */
-/* @} */
-
-#define MAX_BUFFER_POOLS_BITS	6	/*!< Number of bits to representing
-					a buffer pool ID */
-
-#define MAX_BUFFER_POOLS 	(1 << MAX_BUFFER_POOLS_BITS)
-					/*!< The maximum number of buffer
-					pools that can be defined */
-
-#define BUF_POOL_WATCH_SIZE		(srv_n_purge_threads + 1)
-					/*!< Maximum number of concurrent
-					buffer pool watches */
-#define MAX_PAGE_HASH_LOCKS	1024	/*!< The maximum number of
-					page_hash locks */
 
 /** If LRU list of a buf_pool is less than this size then LRU eviction
 should not happen. This is because when we do LRU flushing we also put
@@ -100,76 +74,55 @@ the blocks on free list. If LRU list is very small then we can end up
 in thrashing. */
 #define BUF_LRU_MIN_LEN		256
 
-extern	buf_pool_t*	buf_pool_ptr;	/*!< The buffer pools
-					of the database */
-
-extern	volatile bool	buf_pool_withdrawing; /*!< true when withdrawing buffer
-					pool pages might cause page relocation */
-
 # ifdef UNIV_DEBUG
 extern my_bool	buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing
 					buffer pool is not allowed. */
 # endif /* UNIV_DEBUG */
 
-/** @brief States of a control block
-@see buf_page_t
-
-The enumeration values must be 0..7. */
-enum buf_page_state {
-	BUF_BLOCK_POOL_WATCH,		/*!< a sentinel for the buffer pool
-					watch, element of buf_pool->watch[] */
-	BUF_BLOCK_ZIP_PAGE,		/*!< contains a clean
-					compressed page */
-	BUF_BLOCK_ZIP_DIRTY,		/*!< contains a compressed
-					page that is in the
-					buf_pool->flush_list */
-
-	BUF_BLOCK_NOT_USED,		/*!< is in the free list;
-					must be after the BUF_BLOCK_ZIP_
-					constants for compressed-only pages
-					@see buf_block_state_valid() */
-	BUF_BLOCK_READY_FOR_USE,	/*!< when buf_LRU_get_free_block
-					returns a block, it is in this state */
-	BUF_BLOCK_FILE_PAGE,		/*!< contains a buffered file page */
-	BUF_BLOCK_MEMORY,		/*!< contains some main memory
-					object */
-	BUF_BLOCK_REMOVE_HASH		/*!< hash index should be removed
-					before putting to the free list */
+/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */
+enum buf_page_state
+{
+  /** available in buf_pool.free or buf_pool.watch */
+  BUF_BLOCK_NOT_USED,
+  /** allocated for something other than a file page */
+  BUF_BLOCK_MEMORY,
+  /** a previously allocated file page, in transit to BUF_BLOCK_NOT_USED */
+  BUF_BLOCK_REMOVE_HASH,
+  /** a buf_block_t that is also in buf_pool.LRU */
+  BUF_BLOCK_FILE_PAGE,
+  /** the buf_page_t of a ROW_FORMAT=COMPRESSED page
+  whose uncompressed page frame has been evicted */
+  BUF_BLOCK_ZIP_PAGE
 };
 
-
 /** This structure defines information we will fetch from each buffer pool. It
 will be used to print table IO stats */
-struct buf_pool_info_t{
+struct buf_pool_info_t
+{
 	/* General buffer pool info */
-	ulint	pool_unique_id;		/*!< Buffer Pool ID */
 	ulint	pool_size;		/*!< Buffer Pool size in pages */
-	ulint	lru_len;		/*!< Length of buf_pool->LRU */
-	ulint	old_lru_len;		/*!< buf_pool->LRU_old_len */
-	ulint	free_list_len;		/*!< Length of buf_pool->free list */
-	ulint	flush_list_len;		/*!< Length of buf_pool->flush_list */
-	ulint	n_pend_unzip;		/*!< buf_pool->n_pend_unzip, pages
+	ulint	lru_len;		/*!< Length of buf_pool.LRU */
+	ulint	old_lru_len;		/*!< buf_pool.LRU_old_len */
+	ulint	free_list_len;		/*!< Length of buf_pool.free list */
+	ulint	flush_list_len;		/*!< Length of buf_pool.flush_list */
+	ulint	n_pend_unzip;		/*!< buf_pool.n_pend_unzip, pages
 					pending decompress */
-	ulint	n_pend_reads;		/*!< buf_pool->n_pend_reads, pages
+	ulint	n_pend_reads;		/*!< buf_pool.n_pend_reads, pages
 					pending read */
 	ulint	n_pending_flush_lru;	/*!< Pages pending flush in LRU */
-	ulint	n_pending_flush_single_page;/*!< Pages pending to be
-					flushed as part of single page
-					flushes issued by various user
-					threads */
 	ulint	n_pending_flush_list;	/*!< Pages pending flush in FLUSH
 					LIST */
 	ulint	n_pages_made_young;	/*!< number of pages made young */
 	ulint	n_pages_not_made_young;	/*!< number of pages not made young */
-	ulint	n_pages_read;		/*!< buf_pool->n_pages_read */
-	ulint	n_pages_created;	/*!< buf_pool->n_pages_created */
-	ulint	n_pages_written;	/*!< buf_pool->n_pages_written */
-	ulint	n_page_gets;		/*!< buf_pool->n_page_gets */
-	ulint	n_ra_pages_read_rnd;	/*!< buf_pool->n_ra_pages_read_rnd,
+	ulint	n_pages_read;		/*!< buf_pool.n_pages_read */
+	ulint	n_pages_created;	/*!< buf_pool.n_pages_created */
+	ulint	n_pages_written;	/*!< buf_pool.n_pages_written */
+	ulint	n_page_gets;		/*!< buf_pool.n_page_gets */
+	ulint	n_ra_pages_read_rnd;	/*!< buf_pool.n_ra_pages_read_rnd,
 					number of pages readahead */
-	ulint	n_ra_pages_read;	/*!< buf_pool->n_ra_pages_read, number
+	ulint	n_ra_pages_read;	/*!< buf_pool.n_ra_pages_read, number
 					of pages readahead */
-	ulint	n_ra_pages_evicted;	/*!< buf_pool->n_ra_pages_evicted,
+	ulint	n_ra_pages_evicted;	/*!< buf_pool.n_ra_pages_evicted,
 					number of readahead pages evicted
 					without access */
 	ulint	n_page_get_delta;	/*!< num of buffer pool page gets since
@@ -199,7 +152,7 @@ struct buf_pool_info_t{
 					without access, in pages per second */
 
 	/* Stats about LRU eviction */
-	ulint	unzip_lru_len;		/*!< length of buf_pool->unzip_LRU
+	ulint	unzip_lru_len;		/*!< length of buf_pool.unzip_LRU
 					list */
 	/* Counters for LRU policy */
 	ulint	io_sum;			/*!< buf_LRU_stat_sum.io */
@@ -210,13 +163,6 @@ struct buf_pool_info_t{
 					pages decompressed in current
 					interval */
 };
-
-/** The occupied bytes of lists in all buffer pools */
-struct buf_pools_list_size_t {
-	ulint	LRU_bytes;		/*!< LRU size in bytes */
-	ulint	unzip_LRU_bytes;	/*!< unzip_LRU size in bytes */
-	ulint	flush_list_bytes;	/*!< flush_list size in bytes */
-};
 #endif /* !UNIV_INNOCHECKSUM */
 
 /** Print the given page_id_t object.
@@ -229,62 +175,6 @@ operator<<(
 	const page_id_t		page_id);
 
 #ifndef UNIV_INNOCHECKSUM
-/********************************************************************//**
-Acquire mutex on all buffer pool instances */
-UNIV_INLINE
-void
-buf_pool_mutex_enter_all(void);
-/*===========================*/
-
-/********************************************************************//**
-Release mutex on all buffer pool instances */
-UNIV_INLINE
-void
-buf_pool_mutex_exit_all(void);
-/*==========================*/
-
-/********************************************************************//**
-Creates the buffer pool.
-@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
-dberr_t
-buf_pool_init(
-/*=========*/
-	ulint	size,		/*!< in: Size of the total pool in bytes */
-	ulint	n_instances);	/*!< in: Number of instances */
-/********************************************************************//**
-Frees the buffer pool at shutdown.  This must not be invoked before
-freeing all mutexes. */
-void
-buf_pool_free(
-/*==========*/
-	ulint	n_instances);	/*!< in: numbere of instances to free */
-
-/** Determines if a block is intended to be withdrawn.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	block		pointer to control block
-@retval true	if will be withdrawn */
-bool
-buf_block_will_withdrawn(
-	buf_pool_t*		buf_pool,
-	const buf_block_t*	block);
-
-/** Determines if a frame is intended to be withdrawn.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	ptr		pointer to a frame
-@retval true	if will be withdrawn */
-bool
-buf_frame_will_withdrawn(
-	buf_pool_t*	buf_pool,
-	const byte*	ptr);
-
-/** This is the thread for resizing buffer pool. It waits for an event and
-when waked up either performs a resizing and sleeps again.
-@return	this function does not return, calls os_thread_exit()
-*/
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_resize_thread)(void*);
-
 /*********************************************************************//**
 Gets the current size of buffer buf_pool in bytes.
 @return size in bytes */
@@ -292,20 +182,6 @@ UNIV_INLINE
 ulint
 buf_pool_get_curr_size(void);
 /*========================*/
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in frames.
-@return size in pages */
-UNIV_INLINE
-ulint
-buf_pool_get_n_pages(void);
-/*=======================*/
-/********************************************************************//**
-Gets the smallest oldest_modification lsn for any page in the pool. Returns
-zero if all modified pages have been flushed to disk.
-@return oldest modification in pool, zero if none */
-lsn_t
-buf_pool_get_oldest_modification(void);
-/*==================================*/
 
 /********************************************************************//**
 Allocates a buf_page_t descriptor. This function must succeed. In case
@@ -324,15 +200,9 @@ buf_page_free_descriptor(
 	buf_page_t*	bpage)	/*!< in: bpage descriptor to free. */
 	MY_ATTRIBUTE((nonnull));
 
-/********************************************************************//**
-Allocates a buffer block.
+/** Allocate a buffer block.
 @return own: the allocated block, in state BUF_BLOCK_MEMORY */
-buf_block_t*
-buf_block_alloc(
-/*============*/
-	buf_pool_t*	buf_pool);	/*!< in: buffer pool instance,
-					or NULL for round-robin selection
-					of the buffer pool */
+inline buf_block_t *buf_block_alloc();
 /********************************************************************//**
 Frees a buffer block which does not contain a file page. */
 UNIV_INLINE
@@ -341,22 +211,13 @@ buf_block_free(
 /*===========*/
 	buf_block_t*	block);	/*!< in, own: block to be freed */
 
-/*********************************************************************//**
-Copies contents of a buffer frame to a given buffer.
-@return buf */
-UNIV_INLINE
-byte*
-buf_frame_copy(
-/*===========*/
-	byte*			buf,	/*!< in: buffer to copy to */
-	const buf_frame_t*	frame);	/*!< in: buffer frame */
-
 /**************************************************************//**
 NOTE! The following macros should be used instead of buf_page_get_gen,
 to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
 in LA! */
 #define buf_page_get(ID, SIZE, LA, MTR)					\
-	buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR, NULL)
+	buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR)
+
 /**************************************************************//**
 Use these macros to bufferfix a page with no latching. Remember not to
 read the contents of the page unless you know it is safe. Do not modify
@@ -365,7 +226,7 @@ error-prone programming not to set a latch, and it should be used
 with care. */
 #define buf_page_get_with_no_latch(ID, SIZE, MTR)	\
 	buf_page_get_gen(ID, SIZE, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, \
-			 __FILE__, __LINE__, MTR, NULL)
+			 __FILE__, __LINE__, MTR)
 /********************************************************************//**
 This is the general function used to get optimistic access to a database
 page.
@@ -379,19 +240,6 @@ buf_page_optimistic_get(
 	const char*	file,	/*!< in: file name */
 	unsigned	line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mini-transaction */
-/********************************************************************//**
-This is used to get access to a known database page, when no waiting can be
-done.
-@return TRUE if success */
-ibool
-buf_page_get_known_nowait(
-/*======================*/
-	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
-	buf_block_t*	block,	/*!< in: the known page */
-	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
-	const char*	file,	/*!< in: file name */
-	unsigned	line,	/*!< in: line where called */
-	mtr_t*		mtr);	/*!< in: mini-transaction */
 
 /** Given a tablespace id and page number tries to get that page. If the
 page is not in the buffer pool it is not loaded and NULL is returned.
@@ -429,18 +277,19 @@ the same set of mutexes or latches.
 @return pointer to the block */
 buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);
 
-/** This is the general function used to get access to a database page.
-It does page initialization and applies the buffered redo logs.
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in]	guess		guessed block or NULL
-@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in]	file		file name
-@param[in]	line		line where called
-@param[in]	mtr		mini-transaction
-@param[out]	err		DB_SUCCESS or error code
+@param[in]	file			file name
+@param[in]	line			line where called
+@param[in]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the pages from file.
 @return pointer to the block or NULL */
 buf_block_t*
 buf_page_get_gen(
@@ -452,19 +301,24 @@ buf_page_get_gen(
 	const char*		file,
 	unsigned		line,
 	mtr_t*			mtr,
-	dberr_t*		err);
+	dberr_t*		err = NULL,
+	bool			allow_ibuf_merge = false);
 
-/** Low level function used to get access to a database page.
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in]	guess		guessed block or NULL
-@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
+/** This is the low level function used to get access to a database page.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in]	file		file name
-@param[in]	line		line where called
-@param[in]	mtr		mini-transaction
-@param[out]	err		DB_SUCCESS or error code
+@param[in]	file			file name
+@param[in]	line			line where called
+@param[in]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge to happen
+while reading the page from file. If set,
+this function makes sure that change buffer changes are merged while
+reading the page from file.
 @return pointer to the block or NULL */
 buf_block_t*
 buf_page_get_low(
@@ -476,21 +330,22 @@ buf_page_get_low(
 	const char*		file,
 	unsigned		line,
 	mtr_t*			mtr,
-	dberr_t*		err);
+	dberr_t*		err,
+	bool			allow_ibuf_merge);
 
 /** Initialize a page in the buffer pool. The page is usually not read
 from a file even if it cannot be found in the buffer buf_pool. This is one
 of the functions which perform to a block a state transition NOT_USED =>
 FILE_PAGE (the other is buf_page_get_gen).
-@param[in]	page_id		page id
+@param[in,out]	space		space object
+@param[in]	offset		page number (offset) within the tablespace
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in,out]	mtr		mini-transaction
+@param[in,out]	free_block	pre-allocated buffer block
 @return pointer to the block, page bufferfixed */
 buf_block_t*
-buf_page_create(
-	const page_id_t		page_id,
-	ulint			zip_size,
-	mtr_t*			mtr);
+buf_page_create(fil_space_t *space, uint32_t offset,
+                ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
 
 /********************************************************************//**
 Releases a compressed-only page acquired with buf_page_get_zip(). */
@@ -508,41 +363,18 @@ buf_page_release_latch(
 	buf_block_t*	block,		/*!< in: buffer block */
 	ulint		rw_latch);	/*!< in: RW_S_LATCH, RW_X_LATCH,
 					RW_NO_LATCH */
-/********************************************************************//**
-Moves a page to the start of the buffer pool LRU list. This high-level
-function can be used to prevent an important page from slipping out of
-the buffer pool. */
-void
-buf_page_make_young(
-/*================*/
-	buf_page_t*	bpage);	/*!< in: buffer block of a file page */
-
-/** Returns TRUE if the page can be found in the buffer pool hash table.
-NOTE that it is possible that the page is not yet read from disk,
-though.
-@param[in]	page_id	page id
-@return TRUE if found in the page hash table */
-inline bool buf_page_peek(const page_id_t page_id);
-
-#ifdef UNIV_DEBUG
-
-/** Sets file_page_was_freed TRUE if the page is found in the buffer pool.
-This function should be called when we free a file page and want the
-debug version to check that it is not accessed any more unless
-reallocated.
-@param[in]	page_id	page id
-@return control block if found in page hash table, otherwise NULL */
-buf_page_t* buf_page_set_file_page_was_freed(const page_id_t page_id);
-
-/** Sets file_page_was_freed FALSE if the page is found in the buffer pool.
-This function should be called when we free a file page and want the
-debug version to check that it is not accessed any more unless
-reallocated.
-@param[in]	page_id	page id
-@return control block if found in page hash table, otherwise NULL */
-buf_page_t* buf_page_reset_file_page_was_freed(const page_id_t page_id);
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage);
+/** Mark the page status as FREED for the given tablespace and
+page number. If the page is not in the buffer pool, it is ignored.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction
+@param[in]	file	file name
+@param[in]	line	line where called */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
+                   const char *file, unsigned line);
 
-#endif /* UNIV_DEBUG */
 /********************************************************************//**
 Reads the freed_page_clock of a buffer block.
 @return freed_page_clock */
@@ -562,41 +394,33 @@ buf_block_get_freed_page_clock(
 	const buf_block_t*	block)	/*!< in: block */
 	MY_ATTRIBUTE((warn_unused_result));
 
-/********************************************************************//**
-Tells if a block is still close enough to the MRU end of the LRU list
+/** Determine if a block is still close enough to the MRU end of the LRU list
 meaning that it is not in danger of getting evicted and also implying
 that it has been accessed recently.
 Note that this is for heuristics only and does not reserve buffer pool
 mutex.
-@return TRUE if block is close to MRU end of LRU */
-UNIV_INLINE
-ibool
-buf_page_peek_if_young(
-/*===================*/
-	const buf_page_t*	bpage);	/*!< in: block */
-/********************************************************************//**
-Recommends a move of a block to the start of the LRU list if there is danger
-of dropping from the buffer pool. NOTE: does not reserve the buffer pool
-mutex.
-@return TRUE if should be made younger */
-UNIV_INLINE
-ibool
-buf_page_peek_if_too_old(
-/*=====================*/
-	const buf_page_t*	bpage);	/*!< in: block to make younger */
-/********************************************************************//**
-Gets the youngest modification log sequence number for a frame.
-Returns zero if not file page or no modification occurred yet.
-@return newest modification to page */
-UNIV_INLINE
-lsn_t
-buf_page_get_newest_modification(
-/*=============================*/
-	const buf_page_t*	bpage);	/*!< in: block containing the
-					page frame */
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage);
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
+
+/** Make a page young (move it to the LRU list start) when it is too old.
+@param[in,out]	bpage		buffer pool page */
+inline void buf_page_make_young_if_needed(buf_page_t *bpage)
+{
+	const bool too_old = buf_page_peek_if_too_old(bpage);
+	if (UNIV_UNLIKELY(too_old))
+		buf_page_make_young(bpage);
+}
+
 /********************************************************************//**
 Increments the modify clock of a frame by 1. The caller must (1) own the
-buf_pool->mutex and block bufferfix count has to be zero, (2) or own an x-lock
+buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock
 on the block. */
 UNIV_INLINE
 void
@@ -722,10 +546,11 @@ stored in 26th position.
 @return key version of the page. */
 inline uint32_t buf_page_get_key_version(const byte* read_buf, ulint fsp_flags)
 {
-	return fil_space_t::full_crc32(fsp_flags)
-		? mach_read_from_4(read_buf + FIL_PAGE_FCRC32_KEY_VERSION)
-		: mach_read_from_4(read_buf
-				   + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+  static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility");
+  return fil_space_t::full_crc32(fsp_flags) /* format selects the offset */
+    ? mach_read_from_4(my_assume_aligned<4>(read_buf)) /* field offset is 0 */
+    : mach_read_from_4(my_assume_aligned<2>
+		       (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
 }
 
 /** Read the compression info from the page. In full crc32 format,
@@ -736,10 +561,10 @@ stored in page type.
 @return true if page is compressed. */
 inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags)
 {
-	ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE);
-	return fil_space_t::full_crc32(fsp_flags)
-		? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)
-		: page_type == FIL_PAGE_PAGE_COMPRESSED;
+  uint16_t page_type= fil_page_get_type(read_buf);
+  return fil_space_t::full_crc32(fsp_flags)
+    ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) /* marker bit set */
+    : page_type == FIL_PAGE_PAGE_COMPRESSED; /* dedicated page type */
 }
 
 /** Get the compressed or uncompressed size of a full_crc32 page.
@@ -749,7 +574,7 @@ inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags)
 @return the payload size in the file page */
 inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr)
 {
-	uint t = mach_read_from_2(buf + FIL_PAGE_TYPE);
+	uint t = fil_page_get_type(buf);
 	uint page_size = uint(srv_page_size);
 
 	if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) {
@@ -772,64 +597,6 @@ inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr)
 }
 
 #ifndef UNIV_INNOCHECKSUM
-/**********************************************************************//**
-Gets the space id, page offset, and byte offset within page of a
-pointer pointing to a buffer frame containing a file page. */
-UNIV_INLINE
-void
-buf_ptr_get_fsp_addr(
-/*=================*/
-	const void*	ptr,	/*!< in: pointer to a buffer frame */
-	ulint*		space,	/*!< out: space id */
-	fil_addr_t*	addr);	/*!< out: page offset and byte offset */
-/**********************************************************************//**
-Gets the hash value of a block. This can be used in searches in the
-lock hash table.
-@return lock hash value */
-UNIV_INLINE
-unsigned
-buf_block_get_lock_hash_val(
-/*========================*/
-	const buf_block_t*	block)	/*!< in: block */
-	MY_ATTRIBUTE((warn_unused_result));
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Finds a block in the buffer pool that points to a
-given compressed page.
-@return buffer block pointing to the compressed page, or NULL */
-buf_block_t*
-buf_pool_contains_zip(
-/*==================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	const void*	data);		/*!< in: pointer to compressed page */
-#endif /* UNIV_DEBUG */
-
-/***********************************************************************
-FIXME_FTS: Gets the frame the pointer is pointing to. */
-UNIV_INLINE
-buf_frame_t*
-buf_frame_align(
-/*============*/
-                        /* out: pointer to frame */
-        byte*   ptr);   /* in: pointer to a frame */
-
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/*********************************************************************//**
-Validates the buffer pool data structure.
-@return TRUE */
-ibool
-buf_validate(void);
-/*==============*/
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/*********************************************************************//**
-Prints info of the buffer pool data structure. */
-void
-buf_print(void);
-/*============*/
-#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
-
 /** Dump a page to stderr.
 @param[in]	read_buf	database page
 @param[in]	zip_size	compressed page size, or 0 */
@@ -845,62 +612,25 @@ buf_zip_decompress(
 	ibool		check);	/*!< in: TRUE=verify the page checksum */
 
 #ifdef UNIV_DEBUG
-/*********************************************************************//**
-Returns the number of latched pages in the buffer pool.
-@return number of latched pages */
-ulint
-buf_get_latched_pages_number(void);
-/*==============================*/
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number();
 #endif /* UNIV_DEBUG */
 /*********************************************************************//**
-Returns the number of pending buf pool read ios.
-@return number of pending read I/O operations */
-ulint
-buf_get_n_pending_read_ios(void);
-/*============================*/
-/*********************************************************************//**
 Prints info of the buffer i/o. */
 void
 buf_print_io(
 /*=========*/
 	FILE*	file);	/*!< in: file where to print */
-/*******************************************************************//**
-Collect buffer pool stats information for a buffer pool. Also
-record aggregated stats if there are more than one buffer pool
-in the server */
-void
-buf_stats_get_pool_info(
-/*====================*/
-	buf_pool_t*		buf_pool,	/*!< in: buffer pool */
-	ulint			pool_id,	/*!< in: buffer pool ID */
-	buf_pool_info_t*	all_pool_info);	/*!< in/out: buffer pool info
-						to fill */
-/** Return the ratio in percents of modified pages in the buffer pool /
-database pages in the buffer pool.
-@return modified page percentage ratio */
-double
-buf_get_modified_ratio_pct(void);
+/** Collect buffer pool metadata.
+@param[out]	pool_info	buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info);
+
 /** Refresh the statistics used to print per-second averages. */
-void
-buf_refresh_io_stats_all(void);
-/** Assert that all file pages in the buffer are in a replaceable state.
-@return TRUE */
-ibool
-buf_all_freed(void);
-/*********************************************************************//**
-Checks that there currently are no pending i/o-operations for the buffer
-pool.
-@return number of pending i/o operations */
-ulint
-buf_pool_check_no_pending_io(void);
-/*==============================*/
-/*********************************************************************//**
-Invalidates the file pages in the buffer pool when an archive recovery is
-completed. All the file pages buffered must be in a replaceable state when
-this function is called: not latched and not modified. */
-void
-buf_pool_invalidate(void);
-/*=====================*/
+void buf_refresh_io_stats();
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate();
 
 /*========================================================================
 --------------------------- LOWER LEVEL ROUTINES -------------------------
@@ -921,216 +651,6 @@ buf_block_dbg_add_level(
 #else /* UNIV_DEBUG */
 # define buf_block_dbg_add_level(block, level) /* nothing */
 #endif /* UNIV_DEBUG */
-/*********************************************************************//**
-Gets the state of a block.
-@return state */
-UNIV_INLINE
-enum buf_page_state
-buf_page_get_state(
-/*===============*/
-	const buf_page_t*	bpage);	/*!< in: pointer to the control
-					block */
-/*********************************************************************//**
-Gets the state name for state of a block
-@return	name or "CORRUPTED" */
-UNIV_INLINE
-const char*
-buf_get_state_name(
-/*===============*/
-	const buf_block_t*	block);	/*!< in: pointer to the control
-					block */
-/*********************************************************************//**
-Gets the state of a block.
-@return state */
-UNIV_INLINE
-enum buf_page_state
-buf_block_get_state(
-/*================*/
-	const buf_block_t*	block)	/*!< in: pointer to the control block */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Sets the state of a block. */
-UNIV_INLINE
-void
-buf_page_set_state(
-/*===============*/
-	buf_page_t*		bpage,	/*!< in/out: pointer to control block */
-	enum buf_page_state	state);	/*!< in: state */
-/*********************************************************************//**
-Sets the state of a block. */
-UNIV_INLINE
-void
-buf_block_set_state(
-/*================*/
-	buf_block_t*		block,	/*!< in/out: pointer to control block */
-	enum buf_page_state	state);	/*!< in: state */
-/*********************************************************************//**
-Determines if a block is mapped to a tablespace.
-@return TRUE if mapped */
-UNIV_INLINE
-ibool
-buf_page_in_file(
-/*=============*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-	MY_ATTRIBUTE((warn_unused_result));
-
-/*********************************************************************//**
-Determines if a block should be on unzip_LRU list.
-@return TRUE if block belongs to unzip_LRU */
-UNIV_INLINE
-ibool
-buf_page_belongs_to_unzip_LRU(
-/*==========================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-	MY_ATTRIBUTE((warn_unused_result));
-
-/*********************************************************************//**
-Gets the mutex of a block.
-@return pointer to mutex protecting bpage */
-UNIV_INLINE
-BPageMutex*
-buf_page_get_mutex(
-/*===============*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-	MY_ATTRIBUTE((warn_unused_result));
-
-/*********************************************************************//**
-Get the flush type of a page.
-@return flush type */
-UNIV_INLINE
-buf_flush_t
-buf_page_get_flush_type(
-/*====================*/
-	const buf_page_t*	bpage)	/*!< in: buffer page */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Set the flush type of a page. */
-UNIV_INLINE
-void
-buf_page_set_flush_type(
-/*====================*/
-	buf_page_t*	bpage,		/*!< in: buffer page */
-	buf_flush_t	flush_type);	/*!< in: flush type */
-
-/** Map a block to a file page.
-@param[in,out]	block	pointer to control block
-@param[in]	page_id	page id */
-UNIV_INLINE
-void
-buf_block_set_file_page(
-	buf_block_t*		block,
-	const page_id_t		page_id);
-
-/*********************************************************************//**
-Gets the io_fix state of a block.
-@return io_fix state */
-UNIV_INLINE
-enum buf_io_fix
-buf_page_get_io_fix(
-/*================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Gets the io_fix state of a block.
-@return io_fix state */
-UNIV_INLINE
-enum buf_io_fix
-buf_block_get_io_fix(
-/*================*/
-	const buf_block_t*	block)	/*!< in: pointer to the control block */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Sets the io_fix state of a block. */
-UNIV_INLINE
-void
-buf_page_set_io_fix(
-/*================*/
-	buf_page_t*	bpage,	/*!< in/out: control block */
-	enum buf_io_fix	io_fix);/*!< in: io_fix state */
-/*********************************************************************//**
-Sets the io_fix state of a block. */
-UNIV_INLINE
-void
-buf_block_set_io_fix(
-/*=================*/
-	buf_block_t*	block,	/*!< in/out: control block */
-	enum buf_io_fix	io_fix);/*!< in: io_fix state */
-/*********************************************************************//**
-Makes a block sticky. A sticky block implies that even after we release
-the buf_pool->mutex and the block->mutex:
-* it cannot be removed from the flush_list
-* the block descriptor cannot be relocated
-* it cannot be removed from the LRU list
-Note that:
-* the block can still change its position in the LRU list
-* the next and previous pointers can change. */
-UNIV_INLINE
-void
-buf_page_set_sticky(
-/*================*/
-	buf_page_t*	bpage);	/*!< in/out: control block */
-/*********************************************************************//**
-Removes stickiness of a block. */
-UNIV_INLINE
-void
-buf_page_unset_sticky(
-/*==================*/
-	buf_page_t*	bpage);	/*!< in/out: control block */
-/********************************************************************//**
-Determine if a buffer block can be relocated in memory.  The block
-can be dirty, but it must not be I/O-fixed or bufferfixed. */
-UNIV_INLINE
-ibool
-buf_page_can_relocate(
-/*==================*/
-	const buf_page_t*	bpage)	/*!< control block being relocated */
-	MY_ATTRIBUTE((warn_unused_result));
-
-/*********************************************************************//**
-Determine if a block has been flagged old.
-@return TRUE if old */
-UNIV_INLINE
-ibool
-buf_page_is_old(
-/*============*/
-	const buf_page_t*	bpage)	/*!< in: control block */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Flag a block old. */
-UNIV_INLINE
-void
-buf_page_set_old(
-/*=============*/
-	buf_page_t*	bpage,	/*!< in/out: control block */
-	bool		old);	/*!< in: old */
-/*********************************************************************//**
-Determine the time of first access of a block in the buffer pool.
-@return ut_time_ms() at the time of first access, 0 if not accessed */
-UNIV_INLINE
-unsigned
-buf_page_is_accessed(
-/*=================*/
-	const buf_page_t*	bpage)	/*!< in: control block */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Flag a block accessed. */
-UNIV_INLINE
-void
-buf_page_set_accessed(
-/*==================*/
-	buf_page_t*	bpage)		/*!< in/out: control block */
-	MY_ATTRIBUTE((nonnull));
-/*********************************************************************//**
-Gets the buf_block_t handle of a buffered file block if an uncompressed
-page frame exists, or NULL. Note: even though bpage is not declared a
-const we don't update its value.
-@return control block, or NULL */
-UNIV_INLINE
-buf_block_t*
-buf_page_get_block(
-/*===============*/
-	buf_page_t*	bpage)	/*!< in: control block, or NULL */
-	MY_ATTRIBUTE((warn_unused_result));
 
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
@@ -1150,251 +670,25 @@ buf_block_get_frame(
 Gets the compressed page descriptor corresponding to an uncompressed page
 if applicable. */
 #define buf_block_get_page_zip(block) \
-	((block)->page.zip.data ? &(block)->page.zip : NULL)
-
-#ifdef BTR_CUR_HASH_ADAPT
-/** Get a buffer block from an adaptive hash index pointer.
-This function does not return if the block is not identified.
-@param[in]	ptr	pointer to within a page frame
-@return pointer to block, never NULL */
-buf_block_t*
-buf_block_from_ahi(const byte* ptr);
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/********************************************************************//**
-Find out if a pointer belongs to a buf_block_t. It can be a pointer to
-the buf_block_t itself or a member of it
-@return TRUE if ptr belongs to a buf_block_t struct */
-ibool
-buf_pointer_is_block_field(
-/*=======================*/
-	const void*		ptr);	/*!< in: pointer not
-					dereferenced */
-/** Find out if a pointer corresponds to a buf_block_t::mutex.
-@param m in: mutex candidate
-@return TRUE if m is a buf_block_t::mutex */
-#define buf_pool_is_block_mutex(m)			\
-	buf_pointer_is_block_field((const void*)(m))
-/** Find out if a pointer corresponds to a buf_block_t::lock.
-@param l in: rw-lock candidate
-@return TRUE if l is a buf_block_t::lock */
-#define buf_pool_is_block_lock(l)			\
-	buf_pointer_is_block_field((const void*)(l))
-
-/** Initialize a page for read to the buffer buf_pool. If the page is
-(1) already in buf_pool, or
-(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
-(3) if the space is deleted or being deleted,
-then this function does nothing.
-Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
-on the buffer frame. The io-handler must take care that the flag is cleared
-and the lock released later.
-@param[out]	err			DB_SUCCESS or DB_TABLESPACE_DELETED
-@param[in]	mode			BUF_READ_IBUF_PAGES_ONLY, ...
-@param[in]	page_id			page id
-@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	unzip			whether the uncompressed page is
-					requested (for ROW_FORMAT=COMPRESSED)
-@return pointer to the block
-@retval	NULL	in case of an error */
-buf_page_t*
-buf_page_init_for_read(
-	dberr_t*		err,
-	ulint			mode,
-	const page_id_t		page_id,
-	ulint			zip_size,
-	bool			unzip);
-
-/** Complete a read or write request of a file page to or from the buffer pool.
-@param[in,out]	bpage	page to complete
-@param[in]	dblwr	whether the doublewrite buffer was used (on write)
-@param[in]	evict	whether or not to evict the page from LRU list
+	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#define is_buf_block_get_page_zip(block) \
+        UNIV_LIKELY_NULL((block)->page.zip.data)
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage   buffer page whose read or write was completed
+@param io_type BUF_IO_READ or BUF_IO_WRITE */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type);
+
+/** Complete a read request of a file page to buf_pool.
+@param bpage    recently read page
+@param node     data file
 @return whether the operation succeeded
-@retval	DB_SUCCESS		always when writing, or if a read page was OK
-@retval	DB_PAGE_CORRUPTED	if the checksum fails on a page read
-@retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
-				after decryption normal page checksum does
-				not match */
-UNIV_INTERN
-dberr_t
-buf_page_io_complete(buf_page_t* bpage, bool dblwr = false, bool evict = false)
-	MY_ATTRIBUTE((nonnull));
-
-/********************************************************************//**
-Calculates the index of a buffer pool to the buf_pool[] array.
-@return the position of the buffer pool in buf_pool[] */
-UNIV_INLINE
-unsigned
-buf_pool_index(
-/*===========*/
-	const buf_pool_t*	buf_pool)	/*!< in: buffer pool */
-	MY_ATTRIBUTE((warn_unused_result));
-/******************************************************************//**
-Returns the buffer pool instance given a page instance
-@return buf_pool */
-UNIV_INLINE
-buf_pool_t*
-buf_pool_from_bpage(
-/*================*/
-	const buf_page_t*	bpage); /*!< in: buffer pool page */
-/******************************************************************//**
-Returns the buffer pool instance given a block instance
-@return buf_pool */
-UNIV_INLINE
-buf_pool_t*
-buf_pool_from_block(
-/*================*/
-	const buf_block_t*	block); /*!< in: block */
-
-/** Returns the buffer pool instance given a page id.
-@param[in]	page_id	page id
-@return buffer pool */
-inline buf_pool_t* buf_pool_get(const page_id_t page_id);
-
-/******************************************************************//**
-Returns the buffer pool instance given its array index
-@return buffer pool */
-UNIV_INLINE
-buf_pool_t*
-buf_pool_from_array(
-/*================*/
-	ulint	index);		/*!< in: array index to get
-				buffer pool instance from */
-
-/** Returns the control block of a file page, NULL if not found.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	page_id		page id
-@return block, NULL if not found */
-UNIV_INLINE
-buf_page_t*
-buf_page_hash_get_low(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id);
-
-/** Returns the control block of a file page, NULL if not found.
-If the block is found and lock is not NULL then the appropriate
-page_hash lock is acquired in the specified lock mode. Otherwise,
-mode value is ignored. It is up to the caller to release the
-lock. If the block is found and the lock is NULL then the page_hash
-lock is released by this function.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	page_id		page id
-@param[in,out]	lock		lock of the page hash acquired if bpage is
-found, NULL otherwise. If NULL is passed then the hash_lock is released by
-this function.
-@param[in]	lock_mode	RW_LOCK_X or RW_LOCK_S. Ignored if
-lock == NULL
-@param[in]	watch		if true, return watch sentinel also.
-@return pointer to the bpage or NULL; if NULL, lock is also NULL or
-a watch sentinel. */
-UNIV_INLINE
-buf_page_t*
-buf_page_hash_get_locked(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id,
-	rw_lock_t**		lock,
-	ulint			lock_mode,
-	bool			watch = false);
-
-/** Returns the control block of a file page, NULL if not found.
-If the block is found and lock is not NULL then the appropriate
-page_hash lock is acquired in the specified lock mode. Otherwise,
-mode value is ignored. It is up to the caller to release the
-lock. If the block is found and the lock is NULL then the page_hash
-lock is released by this function.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	page_id		page id
-@param[in,out]	lock		lock of the page hash acquired if bpage is
-found, NULL otherwise. If NULL is passed then the hash_lock is released by
-this function.
-@param[in]	lock_mode	RW_LOCK_X or RW_LOCK_S. Ignored if
-lock == NULL
-@return pointer to the block or NULL; if NULL, lock is also NULL. */
-UNIV_INLINE
-buf_block_t*
-buf_block_hash_get_locked(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id,
-	rw_lock_t**		lock,
-	ulint			lock_mode);
-
-/* There are four different ways we can try to get a bpage or block
-from the page hash:
-1) Caller already holds the appropriate page hash lock: in the case call
-buf_page_hash_get_low() function.
-2) Caller wants to hold page hash lock in x-mode
-3) Caller wants to hold page hash lock in s-mode
-4) Caller doesn't want to hold page hash lock */
-#define buf_page_hash_get_s_locked(b, page_id, l)		\
-	buf_page_hash_get_locked(b, page_id, l, RW_LOCK_S)
-#define buf_page_hash_get_x_locked(b, page_id, l)		\
-	buf_page_hash_get_locked(b, page_id, l, RW_LOCK_X)
-#define buf_page_hash_get(b, page_id)				\
-	buf_page_hash_get_locked(b, page_id, NULL, 0)
-#define buf_page_get_also_watch(b, page_id)			\
-	buf_page_hash_get_locked(b, page_id, NULL, 0, true)
-
-#define buf_block_hash_get_s_locked(b, page_id, l)		\
-	buf_block_hash_get_locked(b, page_id, l, RW_LOCK_S)
-#define buf_block_hash_get_x_locked(b, page_id, l)		\
-	buf_block_hash_get_locked(b, page_id, l, RW_LOCK_X)
-#define buf_block_hash_get(b, page_id)				\
-	buf_block_hash_get_locked(b, page_id, NULL, 0)
-
-/********************************************************************//**
-Determine if a block is a sentinel for a buffer pool watch.
-@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
-ibool
-buf_pool_watch_is_sentinel(
-/*=======================*/
-	const buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	const buf_page_t*	bpage)		/*!< in: block */
-	MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/** Stop watching if the page has been read in.
-buf_pool_watch_set(space,offset) must have returned NULL before.
-@param[in]	page_id	page id */
-void buf_pool_watch_unset(const page_id_t page_id);
-
-/** Check if the page has been read in.
-This may only be called after buf_pool_watch_set(space,offset)
-has returned NULL and before invoking buf_pool_watch_unset(space,offset).
-@param[in]	page_id	page id
-@return FALSE if the given page was not read in, TRUE if it was */
-bool buf_pool_watch_occurred(const page_id_t page_id)
-MY_ATTRIBUTE((warn_unused_result));
-
-/********************************************************************//**
-Get total buffer pool statistics. */
-void
-buf_get_total_list_len(
-/*===================*/
-	ulint*		LRU_len,	/*!< out: length of all LRU lists */
-	ulint*		free_len,	/*!< out: length of all free lists */
-	ulint*		flush_list_len);/*!< out: length of all flush lists */
-/********************************************************************//**
-Get total list size in bytes from all buffer pools. */
-void
-buf_get_total_list_size_in_bytes(
-/*=============================*/
-	buf_pools_list_size_t*	buf_pools_list_size);	/*!< out: list sizes
-							in all buffer pools */
-/********************************************************************//**
-Get total buffer pool statistics. */
-void
-buf_get_total_stat(
-/*===============*/
-	buf_pool_stat_t*tot_stat);	/*!< out: buffer pool stats */
-/*********************************************************************//**
-Get the nth chunk's buffer block in the specified buffer pool.
-@return the nth chunk's buffer block. */
-UNIV_INLINE
-buf_block_t*
-buf_get_nth_chunk_block(
-/*====================*/
-	const buf_pool_t* buf_pool,	/*!< in: buffer pool instance */
-	ulint		n,		/*!< in: nth chunk in the buffer pool */
-	ulint*		chunk_size);	/*!< in: chunk size */
+@retval DB_SUCCESS              if the page was read successfully
+@retval DB_PAGE_CORRUPTED       if the checksum fails on a page read
+@retval DB_DECRYPTION_FAILED    if the page cannot be decrypted */
+dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node);
 
 /** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
 if needed.
@@ -1413,215 +707,136 @@ bool buf_page_verify_crypt_checksum(
 	const byte*	page,
 	ulint		fsp_flags);
 
-/** Calculate the checksum of a page from compressed table and update the
-page.
-@param[in,out]	page	page to update
-@param[in]	size	compressed page size
-@param[in]	lsn	LSN to stamp on the page */
-void
-buf_flush_update_zip_checksum(
-	buf_frame_t*	page,
-	ulint		size,
-	lsn_t		lsn);
-
-/** Encryption and page_compression hook that is called just before
-a page is written to disk.
-@param[in,out]	space		tablespace
-@param[in,out]	bpage		buffer page
-@param[in]	src_frame	physical page frame that is being encrypted
-@return	page frame to be written to file
-(may be src_frame or an encrypted/compressed copy of it) */
-UNIV_INTERN
-byte*
-buf_page_encrypt(
-	fil_space_t*	space,
-	buf_page_t*	bpage,
-	byte*		src_frame);
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out]	page		page to update
+@param[in]	size		compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);
 
 /** @brief The temporary memory structure.
 
 NOTE! The definition appears here only for other modules of this
 directory (buf) to see it. Do not use from outside! */
 
-class buf_tmp_buffer_t {
-	/** whether this slot is reserved */
-	std::atomic<bool> reserved;
+class buf_tmp_buffer_t
+{
+  /** whether this slot is reserved */
+  std::atomic<bool> reserved;
 public:
-	byte*           crypt_buf;	/*!< for encryption the data needs to be
-					copied to a separate buffer before it's
-					encrypted&written. this as a page can be
-					read while it's being flushed */
-	byte*		comp_buf;	/*!< for compression we need
-					temporal buffer because page
-					can be read while it's being flushed */
-	byte*		out_buf;	/*!< resulting buffer after
-					encryption/compression. This is a
-					pointer and not allocated. */
-
-	/** Release the slot */
-	void release()
-	{
-		reserved.store(false, std::memory_order_relaxed);
-	}
-
-	/** Acquire the slot
-	@return whether the slot was acquired */
-	bool acquire()
-	{
-		return !reserved.exchange(true, std::memory_order_relaxed);
-	}
+  /** For encryption, the data needs to be copied to a separate buffer
+  before it is encrypted and written. The buffer block itself can be
+  replaced while a write of crypt_buf to file is in progress. */
+  byte *crypt_buf;
+  /** buffer for fil_page_compress(), for flushing page_compressed pages */
+  byte *comp_buf;
+  /** pointer to resulting buffer after encryption or compression;
+  not separately allocated memory */
+  byte *out_buf;
+
+  /** Release the slot */
+  void release() { reserved.store(false, std::memory_order_relaxed); }
+
+  /** Acquire the slot
+  @return whether the slot was acquired */
+  bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);}
+
+  /** Allocate a buffer for encryption, decryption or decompression. */
+  void allocate()
+  {
+    if (!crypt_buf)
+      crypt_buf= static_cast<byte*>
+      (aligned_malloc(srv_page_size, srv_page_size));
+  }
 };
 
 /** The common buffer control block structure
 for compressed and uncompressed frames */
 
-/** Number of bits used for buffer page states. */
-#define BUF_PAGE_STATE_BITS	3
+class buf_pool_t;
 
-class buf_page_t {
-public:
-	/** @name General fields
-	None of these bit-fields must be modified without holding
-	buf_page_get_mutex() [buf_block_t::mutex or
-	buf_pool->zip_mutex], since they can be stored in the same
-	machine word.  Some of these fields are additionally protected
-	by buf_pool->mutex. */
-	/* @{ */
+class buf_page_t
+{
+  friend buf_pool_t;
+  friend buf_block_t;
+  /** @name General fields */
+  /* @{ */
+
+public: // FIXME: fix fil_iterate()
+  /** Page id. Protected by buf_pool.hash_lock_get(id) when
+  the page is in buf_pool.page_hash. */
+  page_id_t id_;
+private:
+  /** Number of times this block is currently buffer-fixed. */
+  Atomic_counter<uint32_t> buf_fix_count_;
+
+  /** log sequence number of the START of the log entry written of the
+  oldest modification to this block which has not yet been written
+  to the data file;
+
+  0 if no modifications are pending;
+  1 if no modifications are pending, but the block is in buf_pool.flush_list;
+  2 if modifications are pending, but the block is not in buf_pool.flush_list
+  (because id().space() is the temporary tablespace). */
+  Atomic_relaxed<lsn_t> oldest_modification_;
+
+  /** type of pending I/O operation; protected by buf_pool.mutex
+  if in_LRU_list */
+  Atomic_relaxed<buf_io_fix> io_fix_;
+  /** Block state. @see in_file().
+  State transitions between in_file() states and to
+  BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id)
+  when the block is in buf_pool.page_hash.
+  Other transitions when in_LRU_list are protected by buf_pool.mutex. */
+  buf_page_state state_;
 
-	/** Page id. Protected by buf_pool mutex. */
-	page_id_t	id;
-	buf_page_t*	hash;		/*!< node used in chaining to
-					buf_pool->page_hash or
-					buf_pool->zip_hash */
-
-	/** Count of how manyfold this block is currently bufferfixed. */
-	Atomic_counter<uint32_t>	buf_fix_count;
-
-	/** type of pending I/O operation; also protected by
-	buf_pool->mutex for writes only */
-	buf_io_fix	io_fix;
-
-	/** Block state. @see buf_page_in_file */
-	buf_page_state	state;
-
-	unsigned	flush_type:2;	/*!< if this block is currently being
-					flushed to disk, this tells the
-					flush_type.
-					@see buf_flush_t */
-	unsigned	buf_pool_index:6;/*!< index number of the buffer pool
-					that this block belongs to */
-# if MAX_BUFFER_POOLS > 64
-#  error "MAX_BUFFER_POOLS > 64; redefine buf_pool_index:6"
-# endif
-	/* @} */
+public:
+  /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */
+  buf_page_t *hash;
+  /* @} */
 	page_zip_des_t	zip;		/*!< compressed page; zip.data
 					(but not the data it points to) is
-					also protected by buf_pool->mutex;
+					also protected by buf_pool.mutex;
 					state == BUF_BLOCK_ZIP_PAGE and
 					zip.data == NULL means an active
-					buf_pool->watch */
-
-	ulint           real_size;	/*!< Real size of the page
-					Normal pages == srv_page_size
-					page compressed pages, payload
-					size alligned to sector boundary.
-					*/
+					buf_pool.watch */
 
 	buf_tmp_buffer_t* slot;		/*!< Slot for temporary memory
 					used for encryption/compression
 					or NULL */
 #ifdef UNIV_DEBUG
-	ibool		in_page_hash;	/*!< TRUE if in buf_pool->page_hash */
-	ibool		in_zip_hash;	/*!< TRUE if in buf_pool->zip_hash */
+  /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
+  bool in_zip_hash;
+  /** whether this->LRU is in buf_pool.LRU (in_file() holds);
+  protected by buf_pool.mutex */
+  bool in_LRU_list;
+  /** whether this is in buf_pool.page_hash (in_file() holds);
+  protected by buf_pool.mutex */
+  bool in_page_hash;
+  /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED);
+  protected by buf_pool.flush_list_mutex */
+  bool in_free_list;
 #endif /* UNIV_DEBUG */
+  /** list member in one of the lists of buf_pool; protected by
+  buf_pool.mutex or buf_pool.flush_list_mutex
 
-	/** @name Page flushing fields
-	All these are protected by buf_pool->mutex. */
-	/* @{ */
+  state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw
 
-	UT_LIST_NODE_T(buf_page_t) list;
-					/*!< based on state, this is a
-					list node, protected either by
-					buf_pool->mutex or by
-					buf_pool->flush_list_mutex,
-					in one of the following lists in
-					buf_pool:
-
-					- BUF_BLOCK_NOT_USED:	free, withdraw
-					- BUF_BLOCK_FILE_PAGE:	flush_list
-					- BUF_BLOCK_ZIP_DIRTY:	flush_list
-					- BUF_BLOCK_ZIP_PAGE:	zip_clean
-
-					If bpage is part of flush_list
-					then the node pointers are
-					covered by buf_pool->flush_list_mutex.
-					Otherwise these pointers are
-					protected by buf_pool->mutex.
-
-					The contents of the list node
-					is undefined if !in_flush_list
-					&& state == BUF_BLOCK_FILE_PAGE,
-					or if state is one of
-					BUF_BLOCK_MEMORY,
-					BUF_BLOCK_REMOVE_HASH or
-					BUF_BLOCK_READY_IN_USE. */
+  in_file() && oldest_modification():
+  buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
 
-#ifdef UNIV_DEBUG
-	ibool		in_flush_list;	/*!< TRUE if in buf_pool->flush_list;
-					when buf_pool->flush_list_mutex is
-					free, the following should hold:
-					in_flush_list
-					== (state == BUF_BLOCK_FILE_PAGE
-					    || state == BUF_BLOCK_ZIP_DIRTY)
-					Writes to this field must be
-					covered by both block->mutex
-					and buf_pool->flush_list_mutex. Hence
-					reads can happen while holding
-					any one of the two mutexes */
-	ibool		in_free_list;	/*!< TRUE if in buf_pool->free; when
-					buf_pool->mutex is free, the following
-					should hold: in_free_list
-					== (state == BUF_BLOCK_NOT_USED) */
-#endif /* UNIV_DEBUG */
+  The contents are undefined if in_file() && !oldest_modification(),
+  or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */
+  UT_LIST_NODE_T(buf_page_t) list;
 
-	FlushObserver*	flush_observer;	/*!< flush observer */
-
-	lsn_t		newest_modification;
-					/*!< log sequence number of
-					the youngest modification to
-					this block, zero if not
-					modified. Protected by block
-					mutex */
-	lsn_t		oldest_modification;
-					/*!< log sequence number of
-					the START of the log entry
-					written of the oldest
-					modification to this block
-					which has not yet been flushed
-					on disk; zero if all
-					modifications are on disk.
-					Writes to this field must be
-					covered by both block->mutex
-					and buf_pool->flush_list_mutex. Hence
-					reads can happen while holding
-					any one of the two mutexes */
-	/* @} */
-	/** @name LRU replacement algorithm fields
-	These fields are protected by buf_pool->mutex only (not
-	buf_pool->zip_mutex or buf_block_t::mutex). */
+	/** @name LRU replacement algorithm fields.
+	Protected by buf_pool.mutex. */
 	/* @{ */
 
 	UT_LIST_NODE_T(buf_page_t) LRU;
 					/*!< node of the LRU list */
-#ifdef UNIV_DEBUG
-	ibool		in_LRU_list;	/*!< TRUE if the page is in
-					the LRU list; used in
-					debugging */
-#endif /* UNIV_DEBUG */
 	unsigned	old:1;		/*!< TRUE if the block is in the old
-					blocks in buf_pool->LRU_old */
+					blocks in buf_pool.LRU_old */
 	unsigned	freed_page_clock:31;/*!< the value of
-					buf_pool->freed_page_clock
+					buf_pool.freed_page_clock
 					when this block was the last
 					time put to the head of the
 					LRU list; a thread is allowed
@@ -1629,22 +844,147 @@ public:
 					purposes without holding any
 					mutex or latch */
 	/* @} */
-	unsigned	access_time;	/*!< time of first access, or
+	Atomic_counter<unsigned> access_time;	/*!< time of first access, or
 					0 if the block was never accessed
-					in the buffer pool. Protected by
-					block mutex */
-# ifdef UNIV_DEBUG
-	ibool		file_page_was_freed;
-					/*!< this is set to TRUE when
-					fsp frees a page in buffer pool;
-					protected by buf_pool->zip_mutex
-					or buf_block_t::mutex. */
-# endif /* UNIV_DEBUG */
+					in the buffer pool.
+
+					For state==BUF_BLOCK_MEMORY
+					blocks, this field can be repurposed
+					for something else.
+
+					When this field counts log records
+					and bytes allocated for recv_sys.pages,
+					the field is protected by
+					recv_sys_t::mutex. */
+  /** Change buffer entries for the page exist.
+  Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */
+  bool ibuf_exist;
+
+  /** Block initialization status. Can be modified while holding io_fix()
+  or buf_block_t::lock X-latch */
+  enum {
+    /** the page was read normally and should be flushed normally */
+    NORMAL = 0,
+    /** the page was (re)initialized, and the doublewrite buffer can be
+    skipped on the next flush */
+    INIT_ON_FLUSH,
+    /** the page was freed and needs to be flushed.
+    For page_compressed, page flush will punch a hole to free space.
+    Else if innodb_immediate_scrub_data_uncompressed, the page will
+    be overwritten with zeroes. */
+    FREED
+  } status;
+
+  buf_page_t() : id_(0)
+  {
+    static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility");
+    memset((void*) this, 0, sizeof *this);
+  }
 
-  void fix() { buf_fix_count++; }
+  /** Initialize some fields */
+  void init()
+  {
+    io_fix_= BUF_IO_NONE;
+    buf_fix_count_= 0;
+    old= 0;
+    freed_page_clock= 0;
+    access_time= 0;
+    oldest_modification_= 0;
+    slot= nullptr;
+    ibuf_exist= false;
+    status= NORMAL;
+    ut_d(in_zip_hash= false);
+    ut_d(in_free_list= false);
+    ut_d(in_LRU_list= false);
+    ut_d(in_page_hash= false);
+    HASH_INVALIDATE(this, hash);
+  }
+
+  /** Initialize some more fields */
+  void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0)
+  {
+    init();
+    state_= state;
+    id_= id;
+    buf_fix_count_= buf_fix_count;
+  }
+
+  /** Initialize some more fields */
+  void init(page_id_t id, uint32_t buf_fix_count= 0)
+  {
+    init();
+    id_= id;
+    buf_fix_count_= buf_fix_count;
+  }
+
+public:
+  const page_id_t &id() const { return id_; }
+  buf_page_state state() const { return state_; }
+  uint32_t buf_fix_count() const { return buf_fix_count_; }
+  buf_io_fix io_fix() const { return io_fix_; }
+  void io_unfix()
+  {
+    ut_d(const auto old_io_fix= io_fix());
+    ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN);
+    io_fix_= BUF_IO_NONE;
+  }
+
+  /** @return if this belongs to buf_pool.unzip_LRU */
+  bool belongs_to_unzip_LRU() const
+  {
+    return zip.data && state() != BUF_BLOCK_ZIP_PAGE;
+  }
+
+  inline void add_buf_fix_count(uint32_t count);
+  inline void set_buf_fix_count(uint32_t count);
+  inline void set_state(buf_page_state state);
+  inline void set_io_fix(buf_io_fix io_fix);
+  inline void set_corrupt_id();
+
+  /** @return the log sequence number of the oldest pending modification
+  @retval 0 if the block is being removed from (or not in) buf_pool.flush_list
+  @retval 1 if the block is in buf_pool.flush_list but not modified
+  @retval 2 if the block belongs to the temporary tablespace and
+  has unwritten changes */
+  lsn_t oldest_modification() const { return oldest_modification_; }
+  /** @return the log sequence number of the oldest pending modification,
+  @retval 0 if the block is definitely not in buf_pool.flush_list
+  @retval 1 if the block is in buf_pool.flush_list but not modified
+  @retval 2 if the block belongs to the temporary tablespace and
+  has unwritten changes */
+  lsn_t oldest_modification_acquire() const
+  { return oldest_modification_.load(std::memory_order_acquire); }
+  /** Set oldest_modification when adding to buf_pool.flush_list */
+  inline void set_oldest_modification(lsn_t lsn);
+  /** Clear oldest_modification after removing from buf_pool.flush_list */
+  inline void clear_oldest_modification();
+  /** Note that a block is no longer dirty, while not removing
+  it from buf_pool.flush_list */
+  inline void clear_oldest_modification(bool temporary);
+
+  /** Notify that a page in a temporary tablespace has been modified. */
+  void set_temp_modified()
+  {
+    ut_ad(fsp_is_system_temporary(id().space()));
+    ut_ad(state() == BUF_BLOCK_FILE_PAGE);
+    ut_ad(!oldest_modification());
+    oldest_modification_= 2;
+  }
+
+  /** Prepare to release a file page to buf_pool.free. */
+  void free_file_page()
+  {
+    ut_ad(state() == BUF_BLOCK_REMOVE_HASH);
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    set_corrupt_id();
+    ut_d(set_state(BUF_BLOCK_MEMORY));
+  }
+
+  void fix() { buf_fix_count_++; }
   uint32_t unfix()
   {
-    uint32_t count= buf_fix_count--;
+    uint32_t count= buf_fix_count_--;
     ut_ad(count != 0);
     return count - 1;
   }
@@ -1661,6 +1001,56 @@ public:
   {
     return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
   }
+
+  /** @return the byte offset of the page within a file */
+  os_offset_t physical_offset() const
+  {
+    os_offset_t o= id().page_no();
+    return zip.ssize
+      ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
+      : o << srv_page_size_shift;
+  }
+
+  /** @return whether the block is mapped to a data file */
+  bool in_file() const
+  {
+    switch (state_) {
+    case BUF_BLOCK_ZIP_PAGE:
+    case BUF_BLOCK_FILE_PAGE:
+      return true;
+    case BUF_BLOCK_NOT_USED:
+    case BUF_BLOCK_MEMORY:
+    case BUF_BLOCK_REMOVE_HASH:
+      return false;
+    }
+
+    ut_error;
+    return false;
+  }
+
+  /** @return whether the block is modified and ready for flushing */
+  inline bool ready_for_flush() const;
+  /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */
+  bool ready_for_replace() const
+  { return !oldest_modification() && can_relocate(); }
+  /** @return whether the block can be relocated in memory.
+  The block can be dirty, but it must not be I/O-fixed or buffer-fixed. */
+  inline bool can_relocate() const;
+  /** @return whether the block has been flagged old in buf_pool.LRU */
+  inline bool is_old() const;
+  /** Set whether a block is old in buf_pool.LRU */
+  inline void set_old(bool old);
+  /** Flag a page accessed in buf_pool
+  @return whether this is not the first access */
+  bool set_accessed()
+  {
+    if (is_accessed()) return true;
+    access_time= static_cast<uint32_t>(ut_time_ms());
+    return false;
+  }
+  /** @return ut_time_ms() at the time of first access of a block in buf_pool
+  @retval 0 if not accessed */
+  unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
 };
 
 /** The buffer control block structure */
@@ -1672,32 +1062,29 @@ struct buf_block_t{
 
 	buf_page_t	page;		/*!< page information; this must
 					be the first field, so that
-					buf_pool->page_hash can point
+					buf_pool.page_hash can point
 					to buf_page_t or buf_block_t */
 	byte*		frame;		/*!< pointer to buffer frame which
 					is of size srv_page_size, and
 					aligned to an address divisible by
 					srv_page_size */
-	BPageLock	lock;		/*!< read-write lock of the buffer
+	rw_lock_t	lock;		/*!< read-write lock of the buffer
 					frame */
+#ifdef UNIV_DEBUG
+  /** whether page.list is in buf_pool.withdraw
+  (state() == BUF_BLOCK_NOT_USED) and the buffer pool is being shrunk;
+  protected by buf_pool.mutex */
+  bool in_withdraw_list;
+  /** whether unzip_LRU is in buf_pool.unzip_LRU
+  (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr);
+  protected by buf_pool.mutex */
+  bool in_unzip_LRU_list;
+#endif
 	UT_LIST_NODE_T(buf_block_t) unzip_LRU;
 					/*!< node of the decompressed LRU list;
 					a block is in the unzip_LRU list
-					if page.state == BUF_BLOCK_FILE_PAGE
+					if page.state() == BUF_BLOCK_FILE_PAGE
 					and page.zip.data != NULL */
-#ifdef UNIV_DEBUG
-	ibool		in_unzip_LRU_list;/*!< TRUE if the page is in the
-					decompressed LRU list;
-					used in debugging */
-	ibool		in_withdraw_list;
-#endif /* UNIV_DEBUG */
-	uint32_t	lock_hash_val;	/*!< hashed value of the page address
-					in the record lock hash table;
-					protected by buf_block_t::lock
-					(or buf_block_t::mutex, buf_pool->mutex
-				        in buf_page_get_gen(),
-					buf_page_init_for_read()
-					and buf_page_create()) */
 	/* @} */
 	/** @name Optimistic search field */
 	/* @{ */
@@ -1719,13 +1106,13 @@ struct buf_block_t{
 	NOTE that these fields are NOT protected by any semaphore! */
 	/* @{ */
 
-	ulint		n_hash_helps;	/*!< counter which controls building
-					of a new hash index for the page */
-	volatile ulint	n_bytes;	/*!< recommended prefix length for hash
+	volatile uint16_t n_bytes;	/*!< recommended prefix length for hash
 					search: number of bytes in
 					an incomplete last field */
-	volatile ulint	n_fields;	/*!< recommended prefix length for hash
+	volatile uint16_t n_fields;	/*!< recommended prefix length for hash
 					search: number of full fields */
+	uint16_t	n_hash_helps;	/*!< counter which controls building
+					of a new hash index for the page */
 	volatile bool	left_side;	/*!< true or false, depending on
 					whether the leftmost record of several
 					records with the same prefix should be
@@ -1744,11 +1131,11 @@ struct buf_block_t{
 	An exception to this is when we init or create a page
 	in the buffer pool in buf0buf.cc.
 
-	Another exception for buf_pool_clear_hash_index() is that
+	Another exception for buf_pool_t::clear_hash_index() is that
 	assigning block->index = NULL (and block->n_pointers = 0)
 	is allowed whenever btr_search_own_all(RW_LOCK_X).
 
-	Another exception is that ha_insert_for_fold_func() may
+	Another exception is that ha_insert_for_fold() may
 	decrement n_pointers without holding the appropriate latch
 	in btr_search_latches[]. Thus, n_pointers must be
 	protected by atomic memory access.
@@ -1760,7 +1147,7 @@ struct buf_block_t{
 	and holding some latch prevents the state from changing to that.
 
 	Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
-	is prone to race conditions while buf_pool_clear_hash_index() is
+	is prone to race conditions while buf_pool_t::clear_hash_index() is
 	executing (the adaptive hash index is being disabled). Such use
 	is explicitly commented. */
 
@@ -1815,15 +1202,15 @@ struct buf_block_t{
 					debug utilities in sync0rw */
 	/* @} */
 # endif
-	BPageMutex	mutex;		/*!< mutex protecting this block:
-					state (also protected by the buffer
-					pool mutex), io_fix, buf_fix_count,
-					and accessed; we introduce this new
-					mutex in InnoDB-5.1 to relieve
-					contention on the buffer pool mutex */
-
   void fix() { page.fix(); }
-  uint32_t unfix() { return page.unfix(); }
+  uint32_t unfix()
+  {
+    ut_ad(page.buf_fix_count() || page.io_fix() != BUF_IO_NONE ||
+          page.state() == BUF_BLOCK_ZIP_PAGE ||
+          !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S |
+                               RW_LOCK_FLAG_SX));
+    return page.unfix();
+  }
 
   /** @return the physical size, in bytes */
   ulint physical_size() const { return page.physical_size(); }
@@ -1831,18 +1218,16 @@ struct buf_block_t{
   /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
   @retval 0 if not compressed */
   ulint zip_size() const { return page.zip_size(); }
-};
-
-/** Check if a buf_block_t object is in a valid state
-@param block buffer block
-@return TRUE if valid */
-#define buf_block_state_valid(block)				\
-(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED		\
- && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH))
 
+  /** Initialize the block.
+  @param page_id  page identifier
+  @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+  @param fix      initial buf_fix_count() */
+  void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0);
+};
 
 /**********************************************************************//**
-Compute the hash fold value for blocks in buf_pool->zip_hash. */
+Compute the hash fold value for blocks in buf_pool.zip_hash. */
 /* @{ */
 #define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
 #define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
@@ -1853,127 +1238,100 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. */
 inside the buffer pool. A hazard pointer is a buf_page_t pointer
 which we intend to iterate over next and we want it remain valid
 even after we release the buffer pool mutex. */
-class HazardPointer {
-
+class HazardPointer
+{
 public:
-	/** Constructor
-	@param buf_pool buffer pool instance
-	@param mutex	mutex that is protecting the hp. */
-	HazardPointer(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
-		:
-		m_buf_pool(buf_pool)
-#ifdef UNIV_DEBUG
-		, m_mutex(mutex)
-#endif /* UNIV_DEBUG */
-		, m_hp() {}
+  virtual ~HazardPointer() {}
 
-	/** Destructor */
-	virtual ~HazardPointer() {}
+  /** @return current value */
+  buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }
 
-	/** Get current value */
-	buf_page_t* get() const
-	{
-		ut_ad(mutex_own(m_mutex));
-		return(m_hp);
-	}
+  /** Set current value
+  @param bpage buffer block to be set as hp */
+  void set(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(m_mutex);
+    ut_ad(!bpage || bpage->in_file());
+    m_hp= bpage;
+  }
 
-	/** Set current value
-	@param bpage	buffer block to be set as hp */
-	void set(buf_page_t* bpage);
+  /** Checks if a bpage is the hp
+  @param bpage  buffer block to be compared
+  @return true if it is hp */
+  bool is_hp(const buf_page_t *bpage) const
+  { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }
 
-	/** Checks if a bpage is the hp
-	@param bpage	buffer block to be compared
-	@return true if it is hp */
-	bool is_hp(const buf_page_t* bpage);
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list. */
+  virtual void adjust(const buf_page_t*) = 0;
 
-	/** Adjust the value of hp. This happens when some
-	other thread working on the same list attempts to
-	remove the hp from the list. Must be implemented
-	by the derived classes.
-	@param bpage	buffer block to be compared */
-	virtual void adjust(const buf_page_t*) = 0;
+#ifdef UNIV_DEBUG
+  /** mutex that protects access to the m_hp. */
+  const mysql_mutex_t *m_mutex= nullptr;
+#endif /* UNIV_DEBUG */
 
 protected:
-	/** Disable copying */
-	HazardPointer(const HazardPointer&);
-	HazardPointer& operator=(const HazardPointer&);
+  /** hazard pointer */
+  buf_page_t *m_hp= nullptr;
+};
 
-	/** Buffer pool instance */
-	const buf_pool_t*	m_buf_pool;
+/** Class implementing buf_pool.flush_list hazard pointer */
+class FlushHp : public HazardPointer
+{
+public:
+  ~FlushHp() override {}
 
-#ifdef UNIV_DEBUG
-	/** mutex that protects access to the m_hp. */
-	const ib_mutex_t*	m_mutex;
-#endif /* UNIV_DEBUG */
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list.
+  @param bpage  buffer block to be compared */
+  void adjust(const buf_page_t *bpage) override
+  {
+    ut_ad(bpage != NULL);
 
-	/** hazard pointer. */
-	buf_page_t*		m_hp;
-};
+    /* We only support reverse traversal for now. */
+    if (is_hp(bpage))
+      m_hp= UT_LIST_GET_PREV(list, m_hp);
 
-/** Class implementing buf_pool->flush_list hazard pointer */
-class FlushHp: public HazardPointer {
+    ut_ad(!m_hp || m_hp->oldest_modification());
+  }
+};
 
+/** Class implementing buf_pool.LRU hazard pointer */
+class LRUHp : public HazardPointer {
 public:
-	/** Constructor
-	@param buf_pool buffer pool instance
-	@param mutex	mutex that is protecting the hp. */
-	FlushHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
-		:
-		HazardPointer(buf_pool, mutex) {}
-
-	/** Destructor */
-	~FlushHp() override {}
-
-	/** Adjust the value of hp. This happens when some
-	other thread working on the same list attempts to
-	remove the hp from the list.
-	@param bpage	buffer block to be compared */
-	void adjust(const buf_page_t* bpage) override;
-};
+  ~LRUHp() override {}
 
-/** Class implementing buf_pool->LRU hazard pointer */
-class LRUHp: public HazardPointer {
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list.
+  @param bpage  buffer block to be compared */
+  void adjust(const buf_page_t *bpage) override
+  {
+    ut_ad(bpage);
+    /* We only support reverse traversal for now. */
+    if (is_hp(bpage))
+      m_hp= UT_LIST_GET_PREV(LRU, m_hp);
 
-public:
-	/** Constructor
-	@param buf_pool buffer pool instance
-	@param mutex	mutex that is protecting the hp. */
-	LRUHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
-		:
-		HazardPointer(buf_pool, mutex) {}
-
-	/** Destructor */
-	~LRUHp() override {}
-
-	/** Adjust the value of hp. This happens when some
-	other thread working on the same list attempts to
-	remove the hp from the list.
-	@param bpage	buffer block to be compared */
-	void adjust(const buf_page_t* bpage) override;
+    ut_ad(!m_hp || m_hp->in_LRU_list);
+  }
 };
 
 /** Special purpose iterators to be used when scanning the LRU list.
 The idea is that when one thread finishes the scan it leaves the
 itr in that position and the other thread can start scan from
 there */
-class LRUItr: public LRUHp {
-
+class LRUItr : public LRUHp {
 public:
-	/** Constructor
-	@param buf_pool buffer pool instance
-	@param mutex	mutex that is protecting the hp. */
-	LRUItr(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
-		:
-		LRUHp(buf_pool, mutex) {}
-
-	/** Destructor */
-	~LRUItr() override {}
-
-	/** Selects from where to start a scan. If we have scanned
-	too deep into the LRU list it resets the value to the tail
-	of the LRU list.
-	@return buf_page_t from where to start scan. */
-	buf_page_t* start();
+  LRUItr() : LRUHp() {}
+  ~LRUItr() override {}
+
+  /** Select from where to start a scan. If we have scanned
+  too deep into the LRU list it resets the value to the tail
+  of the LRU list.
+  @return buf_page_t from where to start scan. */
+  inline buf_page_t *start();
 };
 
 /** Struct that is embedded in the free zip blocks */
@@ -2016,11 +1374,13 @@ struct buf_pool_stat_t{
 				pages that are evicted without
 				being accessed */
 	ulint	n_pages_made_young; /*!< number of pages made young, in
-				calls to buf_LRU_make_block_young() */
+				buf_page_make_young() */
 	ulint	n_pages_not_made_young; /*!< number of pages not made
 				young because the first access
 				was not long enough ago, in
 				buf_page_peek_if_too_old() */
+	/** number of waits for eviction; writes protected by buf_pool.mutex */
+	ulint	LRU_waits;
 	ulint	LRU_bytes;	/*!< LRU size in bytes */
 	ulint	flush_list_bytes;/*!< flush_list size in bytes */
 };
@@ -2035,31 +1395,422 @@ struct buf_buddy_stat_t {
 	ib_uint64_t	relocated_usec;
 };
 
-/** @brief The buffer pool structure.
+/** The buffer pool */
+class buf_pool_t
+{
+  /** A chunk of buffers */
+  struct chunk_t
+  {
+    /** number of elements in blocks[] */
+    size_t size;
+    /** memory allocated for the page frames */
+    unsigned char *mem;
+    /** descriptor of mem */
+    ut_new_pfx_t mem_pfx;
+    /** array of buffer control blocks */
+    buf_block_t *blocks;
+
+    /** Map of first page frame address to chunks[] */
+    using map= std::map<const void*, chunk_t*, std::less<const void*>,
+                        ut_allocator<std::pair<const void* const,chunk_t*>>>;
+    /** Chunk map that may be under construction by buf_resize_thread() */
+    static map *map_reg;
+    /** Current chunk map for lookup only */
+    static map *map_ref;
+
+    /** @return the memory size in bytes. */
+    size_t mem_size() const { return mem_pfx.m_size; }
+
+    /** Register the chunk */
+    void reg() { map_reg->emplace(map::value_type(blocks->frame, this)); }
+
+    /** Allocate a chunk of buffer frames.
+    @param bytes    requested size
+    @return whether the allocation succeeded */
+    inline bool create(size_t bytes);
 
-NOTE! The definition appears here only for other modules of this
-directory (buf) to see it. Do not use from outside! */
+#ifdef UNIV_DEBUG
+    /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+    @param data  pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+    @return the block
+    @retval nullptr  if not found */
+    const buf_block_t *contains_zip(const void *data) const
+    {
+      const buf_block_t *block= blocks;
+      for (auto i= size; i--; block++)
+        if (block->page.zip.data == data)
+          return block;
+      return nullptr;
+    }
+
+    /** Check that all blocks are in a replaceable state.
+    @return address of a non-free block
+    @retval nullptr if all freed */
+    inline const buf_block_t *not_freed() const;
+#endif /* UNIV_DEBUG */
+  };
+
+  /** Withdraw blocks from the buffer pool until meeting withdraw_target.
+  @return whether retry is needed */
+  inline bool withdraw_blocks();
+
+  /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to
+  the buf_block_t itself or a member of it.
+  @param ptr    a pointer that will not be dereferenced
+  @return whether the ptr belongs to a buf_block_t struct */
+  bool is_block_field(const void *ptr) const
+  {
+    const chunk_t *chunk= chunks;
+    const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new);
+
+    /* TODO: protect chunks with a mutex (the older pointer will
+    currently remain during resize()) */
+    for (; chunk < echunk; chunk++)
+      if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
+          ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size))
+        return true;
+    return false;
+  }
+
+  /** Try to reallocate a control block.
+  @param block  control block to reallocate
+  @return whether the reallocation succeeded */
+  inline bool realloc(buf_block_t *block);
+
+public:
+  bool is_initialised() const { return chunks != nullptr; }
+
+  /** Create the buffer pool.
+  @return whether the creation failed */
+  bool create();
 
-struct buf_pool_t{
+  /** Clean up after successful create() */
+  void close();
+
+  /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+  inline void resize();
+
+  /** @return whether resize() is in progress */
+  bool resize_in_progress() const
+  {
+    return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
+  }
+
+  /** @return the current size in blocks */
+  size_t get_n_pages() const
+  {
+    ut_ad(is_initialised());
+    size_t size= 0;
+    for (auto j= n_chunks; j--; )
+      size+= chunks[j].size;
+    return size;
+  }
+
+  /** Determine whether a frame is intended to be withdrawn during resize().
+  @param ptr    pointer within a buf_block_t::frame
+  @return whether the frame will be withdrawn */
+  bool will_be_withdrawn(const byte *ptr) const
+  {
+    ut_ad(curr_size < old_size);
+#ifdef SAFE_MUTEX
+    if (resizing.load(std::memory_order_relaxed))
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+    for (const chunk_t *chunk= chunks + n_chunks_new,
+         * const echunk= chunks + n_chunks;
+         chunk != echunk; chunk++)
+      if (ptr >= chunk->blocks->frame &&
+          ptr < (chunk->blocks + chunk->size - 1)->frame + srv_page_size)
+        return true;
+    return false;
+  }
+
+  /** Determine whether a block is intended to be withdrawn during resize().
+  @param bpage  buffer pool block
+  @return whether the block will be withdrawn */
+  bool will_be_withdrawn(const buf_page_t &bpage) const
+  {
+    ut_ad(curr_size < old_size);
+#ifdef SAFE_MUTEX
+    if (resizing.load(std::memory_order_relaxed))
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+    for (const chunk_t *chunk= chunks + n_chunks_new,
+         * const echunk= chunks + n_chunks;
+         chunk != echunk; chunk++)
+      if (&bpage >= &chunk->blocks->page &&
+          &bpage < &chunk->blocks[chunk->size].page)
+        return true;
+    return false;
+  }
+
+  /** Release and evict a corrupted page.
+  @param bpage    page that was being read */
+  ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage);
+
+  /** Release a memory block to the buffer pool. */
+  ATTRIBUTE_COLD void free_block(buf_block_t *block);
+
+#ifdef UNIV_DEBUG
+  /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+  @param data  pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+  @return the block
+  @retval nullptr  if not found */
+  const buf_block_t *contains_zip(const void *data) const
+  {
+    mysql_mutex_assert_owner(&mutex);
+    for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks;
+         chunk != end; chunk++)
+      if (const buf_block_t *block= chunk->contains_zip(data))
+        return block;
+    return nullptr;
+  }
+
+  /** Assert that all buffer pool pages are in a replaceable state */
+  void assert_all_freed();
+#endif /* UNIV_DEBUG */
+
+#ifdef BTR_CUR_HASH_ADAPT
+  /** Clear the adaptive hash index on all pages in the buffer pool. */
+  inline void clear_hash_index();
+
+  /** Get a buffer block from an adaptive hash index pointer.
+  This function does not return if the block is not identified.
+  @param ptr  pointer to within a page frame
+  @return pointer to block, never NULL */
+  inline buf_block_t *block_from_ahi(const byte *ptr) const;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  bool is_block_lock(const rw_lock_t *l) const
+  { return is_block_field(static_cast<const void*>(l)); }
+
+  /**
+  @return the smallest oldest_modification lsn for any page
+  @retval empty_lsn if all modified persistent pages have been flushed */
+  lsn_t get_oldest_modification(lsn_t empty_lsn)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list))
+    {
+      ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+      lsn_t lsn= bpage->oldest_modification();
+      if (lsn != 1)
+      {
+        ut_ad(lsn > 2);
+        return lsn;
+      }
+      delete_from_flush_list(bpage);
+    }
+    return empty_lsn;
+  }
+
+  /** Determine if a buffer block was created by chunk_t::create().
+  @param block  block descriptor (not dereferenced)
+  @return whether block has been created by chunk_t::create() */
+  bool is_uncompressed(const buf_block_t *block) const
+  {
+    return is_block_field(reinterpret_cast<const void*>(block));
+  }
+
+  /** Get the page_hash latch for a page */
+  page_hash_latch *hash_lock_get(const page_id_t id) const
+  {
+    return page_hash.lock_get(id.fold());
+  }
+
+  /** Look up a block descriptor.
+  @param id    page identifier
+  @param fold  id.fold()
+  @return block descriptor, possibly in watch[]
+  @retval nullptr  if not found */
+  buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
+  {
+    ut_ad(id.fold() == fold);
+#ifdef SAFE_MUTEX
+    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+                page_hash.lock_get(fold)->is_locked());
+#endif /* SAFE_MUTEX */
+    buf_page_t *bpage;
+    /* Look for the page in the hash table */
+    HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
+                ut_ad(bpage->in_page_hash), id == bpage->id());
+    return bpage;
+  }
+private:
+  /** Look up a block descriptor.
+  @tparam exclusive  whether the latch is to be acquired exclusively
+  @tparam watch      whether to allow watch_is_sentinel()
+  @param page_id     page identifier
+  @param fold        page_id.fold()
+  @param hash_lock   pointer to the acquired latch (to be released by caller)
+  @return pointer to the block
+  @retval nullptr  if not found; !hash_lock || !*hash_lock will also hold */
+  template<bool exclusive,bool watch>
+  buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
+                                   page_hash_latch **hash_lock)
+  {
+    ut_ad(hash_lock || !exclusive);
+    page_hash_latch *latch= page_hash.lock<exclusive>(fold);
+    buf_page_t *bpage= page_hash_get_low(page_id, fold);
+    if (!bpage || watch_is_sentinel(*bpage))
+    {
+      latch->release<exclusive>();
+      if (hash_lock)
+        *hash_lock= nullptr;
+      return watch ? bpage : nullptr;
+    }
+
+    ut_ad(bpage->in_file());
+    ut_ad(page_id == bpage->id());
+
+    if (hash_lock)
+      *hash_lock= latch; /* to be released by the caller */
+    else
+      latch->release<exclusive>();
+    return bpage;
+  }
+public:
+  /** Look up a block descriptor.
+  @tparam exclusive  whether the latch is to be acquired exclusively
+  @param page_id     page identifier
+  @param fold        page_id.fold()
+  @param hash_lock   pointer to the acquired latch (to be released by caller)
+  @return pointer to the block
+  @retval nullptr  if not found; !hash_lock || !*hash_lock will also hold */
+  template<bool exclusive>
+  buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
+                                   page_hash_latch **hash_lock)
+  { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
+
+  /** @return whether the buffer pool contains a page
+  @tparam watch      whether to allow watch_is_sentinel()
+  @param page_id     page identifier */
+  template<bool watch= false>
+  bool page_hash_contains(const page_id_t page_id)
+  {
+    return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr);
+  }
+
+  /** Determine if a block is a sentinel for a buffer pool watch.
+  @param bpage page descriptor
+  @return whether bpage is a sentinel for a buffer pool watch */
+  bool watch_is_sentinel(const buf_page_t &bpage)
+  {
+#ifdef SAFE_MUTEX
+    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+                hash_lock_get(bpage.id())->is_locked());
+#endif /* SAFE_MUTEX */
+    ut_ad(bpage.in_file());
+
+    if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
+    {
+      ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data);
+      return false;
+    }
+
+    ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE);
+    ut_ad(!bpage.in_zip_hash);
+    ut_ad(!bpage.zip.data);
+    return true;
+  }
+
+  /** Check if a watched page has been read.
+  This may only be called after !watch_set() and before invoking watch_unset().
+  @param id   page identifier
+  @return whether the page was read to the buffer pool */
+  bool watch_occurred(const page_id_t id)
+  {
+    const ulint fold= id.fold();
+    page_hash_latch *hash_lock= page_hash.lock<false>(fold);
+    /* The page must exist because watch_set() increments buf_fix_count. */
+    buf_page_t *bpage= page_hash_get_low(id, fold);
+    const bool is_sentinel= watch_is_sentinel(*bpage);
+    hash_lock->read_unlock();
+    return !is_sentinel;
+  }
+
+  /** Register a watch for a page identifier. The caller must hold an
+  exclusive page hash latch. The *hash_lock may be released,
+  relocated, and reacquired.
+  @param id         page identifier
+  @param hash_lock  exclusively held page_hash latch
+  @return a buffer pool block corresponding to id
+  @retval nullptr   if the block was not present, and a watch was installed */
+  inline buf_page_t *watch_set(const page_id_t id,
+                               page_hash_latch **hash_lock);
+
+  /** Stop watching whether a page has been read in.
+  watch_set(id) must have returned nullptr before.
+  @param id   page identifier */
+  void watch_unset(const page_id_t id);
+
+  /** Remove the sentinel block for the watch before replacing it with a
+  real block. watch_unset() or watch_occurred() will notice
+  that the block has been replaced with the real block.
+  @param watch   sentinel */
+  inline void watch_remove(buf_page_t *watch);
+
+  /** @return whether less than 1/4 of the buffer pool is available */
+  bool running_out() const
+  {
+    return !recv_recovery_is_on() &&
+      UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+                    std::min(curr_size, old_size) / 4);
+  }
+
+#ifdef UNIV_DEBUG
+  /** Validate the buffer pool. */
+  void validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+  /** Write information of the buf_pool to the error log. */
+  void print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+  /** Remove a block from the LRU list.
+  @return the predecessor in the LRU list */
+  buf_page_t *LRU_remove(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    ut_ad(bpage->in_LRU_list);
+    ut_ad(bpage->in_page_hash);
+    ut_ad(!bpage->in_zip_hash);
+    ut_ad(bpage->in_file());
+    lru_hp.adjust(bpage);
+    lru_scan_itr.adjust(bpage);
+    ut_d(bpage->in_LRU_list= false);
+    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+    UT_LIST_REMOVE(LRU, bpage);
+    return prev;
+  }
+
+  /** Number of pages to read ahead */
+  static constexpr uint32_t READ_AHEAD_PAGES= 64;
+
+  /** Buffer pool mutex */
+  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+  /** Number of pending LRU flush; protected by mutex. */
+  ulint n_flush_LRU_;
+  /** broadcast when n_flush_LRU reaches 0; protected by mutex */
+  pthread_cond_t done_flush_LRU;
+  /** Number of pending flush_list flush; protected by mutex */
+  ulint n_flush_list_;
+  /** broadcast when n_flush_list reaches 0; protected by mutex */
+  pthread_cond_t done_flush_list;
+
+  TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
+  TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
 
 	/** @name General fields */
 	/* @{ */
-	BufPoolMutex	mutex;		/*!< Buffer pool mutex of this
-					instance */
-	BufPoolZipMutex	zip_mutex;	/*!< Zip mutex of this buffer
-					pool instance, protects compressed
-					only pages (of type buf_page_t, not
-					buf_block_t */
-	ulint		instance_no;	/*!< Array index of this buffer
-					pool instance */
 	ulint		curr_pool_size;	/*!< Current pool size in bytes */
 	ulint		LRU_old_ratio;  /*!< Reserve this much of the buffer
 					pool for "old" blocks */
 #ifdef UNIV_DEBUG
 	ulint		buddy_n_frames; /*!< Number of frames allocated from
 					the buffer pool to the buddy system */
-#endif
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	ulint		mutex_exit_forbidden; /*!< Forbid release mutex */
 #endif
 	ut_allocator<unsigned char>	allocator;	/*!< Allocator used for
@@ -2067,31 +1818,88 @@ struct buf_pool_t{
 					member. */
 	volatile ulint	n_chunks;	/*!< number of buffer pool chunks */
 	volatile ulint	n_chunks_new;	/*!< new number of buffer pool chunks */
-	buf_chunk_t*	chunks;		/*!< buffer pool chunks */
-	buf_chunk_t*	chunks_old;	/*!< old buffer pool chunks to be freed
+	chunk_t*	chunks;		/*!< buffer pool chunks */
+	chunk_t*	chunks_old;	/*!< old buffer pool chunks to be freed
 					after resizing buffer pool */
-	ulint		curr_size;	/*!< current pool size in pages */
-	ulint		old_size;	/*!< previous pool size in pages */
-	ulint		read_ahead_area;/*!< size in pages of the area which
-					the read-ahead algorithms read if
-					invoked */
-	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
-					buf_block_t file pages,
-					buf_page_in_file() == TRUE,
-					indexed by (space_id, offset).
-					page_hash is protected by an
-					array of mutexes.
-					Changes in page_hash are protected
-					by buf_pool->mutex and the relevant
-					page_hash mutex. Lookups can happen
-					while holding the buf_pool->mutex or
-					the relevant page_hash mutex. */
-	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
-					whose frames are allocated to the
-					zip buddy system,
-					indexed by block->frame */
-	ulint		n_pend_reads;	/*!< number of pending read
-					operations */
+	/** current pool size in pages */
+	Atomic_counter<ulint> curr_size;
+	/** previous pool size in pages */
+	Atomic_counter<ulint> old_size;
+	/** read-ahead request size in pages */
+	Atomic_counter<uint32_t> read_ahead_area;
+
+  /** Hash table with singly-linked overflow lists. @see hash_table_t */
+  struct page_hash_table
+  {
+    /** Number of array[] elements per page_hash_latch.
+    Must be one less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= CPU_LEVEL1_DCACHE_LINESIZE /
+      sizeof(void*) - 1;
+
+    /** number of payload elements in array[] */
+    Atomic_relaxed<ulint> n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+    hash_cell_t *array;
+
+    /** Create the hash table.
+    @param n  the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Free the hash table. */
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
+  private:
+    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells)
+    {
+      return pad(hash(fold, n_cells));
+    }
+    /** Get a page_hash latch. */
+    page_hash_latch *lock_get(ulint fold, ulint n) const
+    {
+      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+                    "must be one less than a power of 2");
+      return reinterpret_cast<page_hash_latch*>
+        (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
+    }
+  public:
+    /** Get a page_hash latch. */
+    page_hash_latch *lock_get(ulint fold) const
+    { return lock_get(fold, n_cells); }
+
+    /** Acquire an array latch.
+    @tparam exclusive  whether the latch is to be acquired exclusively
+    @param fold    hash bucket key */
+    template<bool exclusive> page_hash_latch *lock(ulint fold)
+    {
+      page_hash_latch *latch= lock_get(fold, n_cells);
+      latch->acquire<exclusive>();
+      return latch;
+    }
+
+    /** Exclusively acquire all latches */
+    inline void write_lock_all();
+
+    /** Release all latches */
+    inline void write_unlock_all();
+  };
+
+  /** Hash table of file pages (buf_page_t::in_file() holds),
+  indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
+  page_hash_table page_hash;
+
+  /** map of block->frame to buf_block_t blocks that belong
+  to buf_buddy_alloc(); protected by buf_pool.mutex */
+  hash_table_t zip_hash;
+	/** number of pending read operations */
+	Atomic_counter<ulint> n_pend_reads;
 	Atomic_counter<ulint>
 			n_pend_unzip;	/*!< number of pending decompressions */
 
@@ -2106,51 +1914,51 @@ struct buf_pool_t{
 
 	/* @} */
 
-	/** @name Page flushing algorithm fields */
+  /** @name Page flushing algorithm fields */
+  /* @{ */
+
+  /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
+  and buf_page_t::list pointers when !oldest_modification() */
+  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+  /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
+  FlushHp flush_hp;
+  /** modified blocks (a subset of LRU) */
+  UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+private:
+  /** whether the page cleaner needs wakeup from indefinite sleep */
+  bool page_cleaner_is_idle;
+  /** track server activity count for signaling idle flushing */
+  ulint last_activity_count;
+public:
+  /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
+  pthread_cond_t do_flush_list;
 
-	/* @{ */
+  /** @return whether the page cleaner must sleep due to being idle */
+  bool page_cleaner_idle() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_is_idle;
+  }
+  /** Wake up the page cleaner if needed */
+  void page_cleaner_wakeup();
+
+  /** Register whether an explicit wakeup of the page cleaner is needed */
+  void page_cleaner_set_idle(bool deep_sleep)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    page_cleaner_is_idle= deep_sleep;
+  }
+
+  /** Update server last activity count */
+  void update_last_activity_count(ulint activity_count)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    last_activity_count= activity_count;
+  }
+
+  // n_flush_LRU() + n_flush_list()
+  // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list
 
-	FlushListMutex	flush_list_mutex;/*!< mutex protecting the
-					flush list access. This mutex
-					protects flush_list, flush_rbt
-					and bpage::list pointers when
-					the bpage is on flush_list. It
-					also protects writes to
-					bpage::oldest_modification and
-					flush_list_hp */
-	FlushHp			flush_hp;/*!< "hazard pointer"
-					used during scan of flush_list
-					while doing flush list batch.
-					Protected by flush_list_mutex */
-	UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
-					/*!< base node of the modified block
-					list */
-	ibool		init_flush[BUF_FLUSH_N_TYPES];
-					/*!< this is TRUE when a flush of the
-					given type is being initialized */
-	ulint		n_flush[BUF_FLUSH_N_TYPES];
-					/*!< this is the number of pending
-					writes in the given flush type */
-	os_event_t	no_flush[BUF_FLUSH_N_TYPES];
-					/*!< this is in the set state
-					when there is no flush batch
-					of the given type running;
-					os_event_set() and os_event_reset()
-					are protected by buf_pool_t::mutex */
-	ib_rbt_t*	flush_rbt;	/*!< a red-black tree is used
-					exclusively during recovery to
-					speed up insertions in the
-					flush_list. This tree contains
-					blocks in order of
-					oldest_modification LSN and is
-					kept in sync with the
-					flush_list.
-					Each member of the tree MUST
-					also be on the flush_list.
-					This tree is relevant only in
-					recovery and is set to NULL
-					once the recovery is over.
-					Protected by flush_list_mutex */
 	unsigned	freed_page_clock;/*!< a sequence number used
 					to count the number of buffer
 					blocks removed from the end of
@@ -2160,16 +1968,16 @@ struct buf_pool_t{
 					to read this for heuristic
 					purposes without holding any
 					mutex or latch */
-	ibool		try_LRU_scan;	/*!< Set to FALSE when an LRU
+	bool		try_LRU_scan;	/*!< Cleared when an LRU
 					scan for free block fails. This
 					flag is used to avoid repeated
 					scans of LRU list when we know
 					that there is no free block
 					available in the scan depth for
-					eviction. Set to TRUE whenever
+					eviction. Set whenever
 					we flush a batch from the
 					buffer pool. Protected by the
-					buf_pool->mutex */
+					buf_pool.mutex */
 	/* @} */
 
 	/** @name LRU replacement algorithm fields */
@@ -2178,6 +1986,8 @@ struct buf_pool_t{
 	UT_LIST_BASE_NODE_T(buf_page_t) free;
 					/*!< base node of the free
 					block list */
+  /** signaled each time when the free list grows; protected by mutex */
+  pthread_cond_t done_free;
 
 	UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
 					/*!< base node of the withdraw
@@ -2189,17 +1999,13 @@ struct buf_pool_t{
 					block list, when withdrawing */
 
 	/** "hazard pointer" used during scan of LRU while doing
-	LRU list batch.  Protected by buf_pool::mutex */
+	LRU list batch.  Protected by buf_pool_t::mutex. */
 	LRUHp		lru_hp;
 
 	/** Iterator used to scan the LRU list when searching for
-	replacable victim. Protected by buf_pool::mutex. */
+	replaceable victim. Protected by buf_pool_t::mutex. */
 	LRUItr		lru_scan_itr;
 
-	/** Iterator used to scan the LRU list when searching for
-	single page flushing victim.  Protected by buf_pool::mutex. */
-	LRUItr		single_scan_itr;
-
 	UT_LIST_BASE_NODE_T(buf_page_t) LRU;
 					/*!< base node of the LRU list */
 
@@ -2223,200 +2029,322 @@ struct buf_pool_t{
 					unzip_LRU list */
 
 	/* @} */
-	/** @name Buddy allocator fields
-	The buddy allocator is used for allocating compressed page
-	frames and buf_page_t descriptors of blocks that exist
-	in the buffer pool only in compressed form. */
-	/* @{ */
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
-					/*!< unmodified compressed pages */
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-	UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
-					/*!< buddy free lists */
+  /** free ROW_FORMAT=COMPRESSED page frames */
+  UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
 #if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
 # error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
 #endif
-	/* @} */
 
-	buf_page_t*			watch;
-					/*!< Sentinel records for buffer
-					pool watches. Protected by
-					buf_pool->mutex. */
-
-	/** Temporary memory for page_compressed and encrypted I/O */
-	struct io_buf_t {
-		/** number of elements in slots[] */
-		const ulint n_slots;
-		/** array of slots */
-		buf_tmp_buffer_t* const slots;
-
-		io_buf_t() = delete;
-
-		/** Constructor */
-		explicit io_buf_t(ulint n_slots) :
-			n_slots(n_slots),
-			slots(static_cast<buf_tmp_buffer_t*>(
-				      ut_malloc_nokey(n_slots
-						      * sizeof *slots)))
-		{
-			memset((void*) slots, 0, n_slots * sizeof *slots);
-		}
-
-		~io_buf_t();
+  /** Sentinels to detect if pages are read into the buffer pool while
+  a delete-buffering operation is pending. Protected by mutex. */
+  buf_page_t watch[innodb_purge_threads_MAX + 1];
+  /** Reserve a buffer. */
+  buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
 
-		/** Reserve a buffer */
-		buf_tmp_buffer_t* reserve()
-		{
-			for (buf_tmp_buffer_t* s = slots, *e = slots + n_slots;
-			     s != e; s++) {
-				if (s->acquire()) return s;
-			}
-			return NULL;
-		}
-	} io_buf;
+  /** @return whether any I/O is pending */
+  bool any_io_pending()
+  {
+    if (n_pend_reads)
+      return true;
+    mysql_mutex_lock(&mutex);
+    const bool any_pending{n_flush_LRU_ || n_flush_list_};
+    mysql_mutex_unlock(&mutex);
+    return any_pending;
+  }
+  /** @return total amount of pending I/O */
+  ulint io_pending() const
+  {
+    return n_pend_reads + n_flush_LRU() + n_flush_list();
+  }
 
-	/** Determine if a pointer belongs to a buf_block_t.
-	It can be a pointer to the buf_block_t itself or a member of it.
-	@param ptr	a pointer that will not be dereferenced
-	@return whether the ptr belongs to a buf_block_t struct */
-	inline bool is_block_field(const void *ptr) const;
+private:
+  /** Remove a block from the flush list. */
+  inline void delete_from_flush_list_low(buf_page_t *bpage);
+  /** Remove a block from flush_list.
+  @param bpage   buffer pool page
+  @param clear   whether to invoke buf_page_t::clear_oldest_modification() */
+  void delete_from_flush_list(buf_page_t *bpage, bool clear);
+public:
+  /** Remove a block from flush_list.
+  @param bpage   buffer pool page */
+  void delete_from_flush_list(buf_page_t *bpage)
+  { delete_from_flush_list(bpage, true); }
+
+  /** Insert a modified block into the flush list.
+  @param block    modified block
+  @param lsn      start LSN of the mini-transaction that modified the block */
+  void insert_into_flush_list(buf_block_t *block, lsn_t lsn);
+
+  /** Free a page whose underlying file page has been freed. */
+  inline void release_freed_page(buf_page_t *bpage);
+
+private:
+  /** Temporary memory for page_compressed and encrypted I/O */
+  struct io_buf_t
+  {
+    /** number of elements in slots[] */
+    ulint n_slots;
+    /** array of slots */
+    buf_tmp_buffer_t *slots;
+
+    void create(ulint n_slots)
+    {
+      this->n_slots= n_slots;
+      slots= static_cast<buf_tmp_buffer_t*>
+        (ut_malloc_nokey(n_slots * sizeof *slots));
+      memset((void*) slots, 0, n_slots * sizeof *slots);
+    }
+
+    void close()
+    {
+      for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+      {
+        aligned_free(s->crypt_buf);
+        aligned_free(s->comp_buf);
+      }
+      ut_free(slots);
+      slots= nullptr;
+      n_slots= 0;
+    }
+
+    /** Reserve a buffer */
+    buf_tmp_buffer_t *reserve()
+    {
+      for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+        if (s->acquire())
+          return s;
+      return nullptr;
+    }
+  } io_buf;
+
+  /** whether resize() is in the critical path */
+  std::atomic<bool> resizing;
 };
 
-/** Print the given buf_pool_t object.
-@param[in,out]	out		the output stream
-@param[in]	buf_pool	the buf_pool_t object to be printed
-@return the output stream */
-std::ostream&
-operator<<(
-        std::ostream&		out,
-        const buf_pool_t&	buf_pool);
-
-/** @name Accessors for buf_pool->mutex.
-Use these instead of accessing buf_pool->mutex directly. */
-/* @{ */
+/** The InnoDB buffer pool */
+extern buf_pool_t buf_pool;
 
-/** Test if a buffer pool mutex is owned. */
-#define buf_pool_mutex_own(b) mutex_own(&b->mutex)
-/** Acquire a buffer pool mutex. */
-#define buf_pool_mutex_enter(b) do {		\
-	ut_ad(!(b)->zip_mutex.is_owned());	\
-	mutex_enter(&(b)->mutex);		\
-} while (0)
+inline void page_hash_latch::read_lock()
+{
+  mysql_mutex_assert_not_owner(&buf_pool.mutex);
+  if (!read_trylock())
+    read_lock_wait();
+}
 
-/** Test if flush list mutex is owned. */
-#define buf_flush_list_mutex_own(b) mutex_own(&(b)->flush_list_mutex)
+inline void page_hash_latch::write_lock()
+{
+  if (!write_trylock())
+    write_lock_wait();
+}
 
-/** Acquire the flush list mutex. */
-#define buf_flush_list_mutex_enter(b) do {	\
-	mutex_enter(&(b)->flush_list_mutex);	\
-} while (0)
-/** Release the flush list mutex. */
-# define buf_flush_list_mutex_exit(b) do {	\
-	mutex_exit(&(b)->flush_list_mutex);	\
-} while (0)
+inline void buf_page_t::add_buf_fix_count(uint32_t count)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  buf_fix_count_+= count;
+}
 
+inline void buf_page_t::set_buf_fix_count(uint32_t count)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  buf_fix_count_= count;
+}
 
-/** Test if block->mutex is owned. */
-#define buf_page_mutex_own(b)	(b)->mutex.is_owned()
+inline void buf_page_t::set_state(buf_page_state state)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex); /* caller must hold the mutex */
+#ifdef UNIV_DEBUG
+  switch (state) {
+  case BUF_BLOCK_REMOVE_HASH:
+    /* buf_pool_t::corrupted_evict() invokes set_corrupt_id()
+    before buf_LRU_free_one_page(), so we cannot assert that
+    we are holding the hash_lock. */
+    break;
+  case BUF_BLOCK_MEMORY:
+    if (!in_file()) break;
+    /* fall through */
+  case BUF_BLOCK_FILE_PAGE:
+    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
+    break;
+  case BUF_BLOCK_NOT_USED:
+    if (!in_file()) break;
+    /* fall through */
+  case BUF_BLOCK_ZIP_PAGE: /* buf_pool.watch[] elements need no latch */
+    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
+          (this >= &buf_pool.watch[0] &&
+           this < &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
+    break;
+  }
+#endif
+  state_= state;
+}
 
-/** Acquire the block->mutex. */
-#define buf_page_mutex_enter(b) do {			\
-	mutex_enter(&(b)->mutex);			\
-} while (0)
+inline void buf_page_t::set_io_fix(buf_io_fix io_fix)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  io_fix_= io_fix;
+}
 
-/** Release the trx->mutex. */
-#define buf_page_mutex_exit(b) do {			\
-	(b)->mutex.exit();				\
-} while (0)
+inline void buf_page_t::set_corrupt_id()
+{
+#ifdef UNIV_DEBUG
+  switch (oldest_modification()) {
+  case 0:
+    break;
+  case 2:
+    ut_ad(fsp_is_system_temporary(id().space()));
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    break;
+  default:
+    ut_ad("block is dirty" == 0);
+  }
+  switch (state()) {
+  case BUF_BLOCK_REMOVE_HASH:
+    break;
+  case BUF_BLOCK_ZIP_PAGE:
+  case BUF_BLOCK_FILE_PAGE:
+    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
+    break;
+  case BUF_BLOCK_NOT_USED:
+  case BUF_BLOCK_MEMORY:
+    ut_ad("invalid state" == 0);
+  }
+#endif
+  id_= page_id_t(~0ULL);
+}
 
+/** Set oldest_modification when adding to buf_pool.flush_list */
+inline void buf_page_t::set_oldest_modification(lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+  ut_ad(oldest_modification() <= 1);
+  ut_ad(lsn > 2);
+  oldest_modification_= lsn;
+}
 
-/** Get appropriate page_hash_lock. */
-UNIV_INLINE
-rw_lock_t*
-buf_page_hash_lock_get(const buf_pool_t* buf_pool, const page_id_t& page_id)
+/** Clear oldest_modification after removing from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification()
 {
-	return hash_get_lock(buf_pool->page_hash, page_id.fold());
+  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+  ut_d(const auto state= state_);
+  ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE ||
+        state == BUF_BLOCK_REMOVE_HASH);
+  ut_ad(oldest_modification());
+  ut_ad(!list.prev);
+  ut_ad(!list.next);
+  /* We must use release memory order to guarantee that callers of
+  oldest_modification_acquire() will observe the block as
+  being detached from buf_pool.flush_list, after reading the value 0. */
+  oldest_modification_.store(0, std::memory_order_release);
 }
 
-/** If not appropriate page_hash_lock, relock until appropriate. */
-# define buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id)\
-	hash_lock_s_confirm(hash_lock, (buf_pool)->page_hash, (page_id).fold())
+/** Note that a block is no longer dirty, while not removing
+it from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification(bool temporary)
+{
+  ut_ad(temporary == fsp_is_system_temporary(id().space()));
+  if (temporary)
+  {
+    ut_ad(oldest_modification() == 2);
+    oldest_modification_= 0;
+  }
+  else
+  {
+    /* The store uses release memory order, as in the overload that
+    detaches the block. NOTE(review): here 1 (not 0) is stored and the
+    block is not removed from buf_pool.flush_list; confirm the pairing. */
+    ut_ad(oldest_modification() > 2);
+    oldest_modification_.store(1, std::memory_order_release);
+  }
+}
 
-# define buf_page_hash_lock_x_confirm(hash_lock, buf_pool, page_id)\
-	hash_lock_x_confirm(hash_lock, (buf_pool)->page_hash, (page_id).fold())
+/** @return whether the block is modified and ready for flushing */
+inline bool buf_page_t::ready_for_flush() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_LRU_list);
+  ut_a(in_file());
+  ut_ad(fsp_is_system_temporary(id().space())
+        ? oldest_modification() == 2
+        : oldest_modification() > 2);
+  return io_fix_ == BUF_IO_NONE;
+}
 
-#ifdef UNIV_DEBUG
-/** Test if page_hash lock is held in s-mode. */
-# define buf_page_hash_lock_held_s(buf_pool, bpage)	\
-	rw_lock_own(buf_page_hash_lock_get((buf_pool), (bpage)->id), RW_LOCK_S)
+/** @return whether the block can be relocated in memory.
+The block can be dirty, but it must not be I/O-fixed or buffer-fixed. */
+inline bool buf_page_t::can_relocate() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_file());
+  ut_ad(in_LRU_list);
+  return io_fix_ == BUF_IO_NONE && !buf_fix_count_;
+}
 
-/** Test if page_hash lock is held in x-mode. */
-# define buf_page_hash_lock_held_x(buf_pool, bpage)	\
-	rw_lock_own(buf_page_hash_lock_get((buf_pool), (bpage)->id), RW_LOCK_X)
+/** @return whether the block has been flagged old in buf_pool.LRU */
+inline bool buf_page_t::is_old() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_file());
+  ut_ad(in_LRU_list);
+  return old;
+}
 
-/** Test if page_hash lock is held in x or s-mode. */
-# define buf_page_hash_lock_held_s_or_x(buf_pool, bpage)\
-	(buf_page_hash_lock_held_s((buf_pool), (bpage))	\
-	 || buf_page_hash_lock_held_x((buf_pool), (bpage)))
+/** Set whether a block is old in buf_pool.LRU */
+inline void buf_page_t::set_old(bool old)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_LRU_list);
 
-# define buf_block_hash_lock_held_s(buf_pool, block)	\
-	buf_page_hash_lock_held_s((buf_pool), &(block)->page)
+#ifdef UNIV_LRU_DEBUG
+  ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
+  /* If a block is flagged "old", the LRU_old list must exist. */
+  ut_a(!old || buf_pool.LRU_old);
 
-# define buf_block_hash_lock_held_x(buf_pool, block)	\
-	buf_page_hash_lock_held_x((buf_pool), &(block)->page)
+  if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
+  {
+    const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
+    const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this);
+    if (prev->old == next->old)
+      ut_a(prev->old == old);
+    else
+    {
+      ut_a(!prev->old);
+      ut_a(buf_pool.LRU_old == (old ? this : next));
+    }
+  }
+#endif /* UNIV_LRU_DEBUG */
 
-# define buf_block_hash_lock_held_s_or_x(buf_pool, block)	\
-	buf_page_hash_lock_held_s_or_x((buf_pool), &(block)->page)
-#else /* UNIV_DEBUG */
-# define buf_page_hash_lock_held_s(b, p)	(TRUE)
-# define buf_page_hash_lock_held_x(b, p)	(TRUE)
-# define buf_page_hash_lock_held_s_or_x(b, p)	(TRUE)
-# define buf_block_hash_lock_held_s(b, p)	(TRUE)
-# define buf_block_hash_lock_held_x(b, p)	(TRUE)
-# define buf_block_hash_lock_held_s_or_x(b, p)	(TRUE)
-#endif /* UNIV_DEBUG */
+  this->old= old;
+}
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+#ifdef UNIV_DEBUG
 /** Forbid the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_forbid(b) do {	\
-	ut_ad(buf_pool_mutex_own(b));		\
-	b->mutex_exit_forbidden++;		\
+# define buf_pool_mutex_exit_forbid() do {		\
+	mysql_mutex_assert_owner(&buf_pool.mutex);	\
+	buf_pool.mutex_exit_forbidden++;		\
 } while (0)
 /** Allow the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_allow(b) do {	\
-	ut_ad(buf_pool_mutex_own(b));		\
-	ut_a(b->mutex_exit_forbidden);	\
-	b->mutex_exit_forbidden--;		\
-} while (0)
-/** Release the buffer pool mutex. */
-# define buf_pool_mutex_exit(b) do {		\
-	ut_a(!b->mutex_exit_forbidden);		\
-	mutex_exit(&b->mutex);			\
+# define buf_pool_mutex_exit_allow() do {		\
+	mysql_mutex_assert_owner(&buf_pool.mutex);	\
+	ut_ad(buf_pool.mutex_exit_forbidden--);		\
 } while (0)
 #else
 /** Forbid the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_forbid(b) ((void) 0)
+# define buf_pool_mutex_exit_forbid() ((void) 0)
 /** Allow the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_allow(b) ((void) 0)
-/** Release the buffer pool mutex. */
-# define buf_pool_mutex_exit(b) mutex_exit(&b->mutex)
+# define buf_pool_mutex_exit_allow() ((void) 0)
 #endif
-/* @} */
 
 /**********************************************************************
 Let us list the consistency conditions for different control block states.
 
 NOT_USED:	is in free list, not in LRU list, not in flush list, nor
 		page hash table
-READY_FOR_USE:	is not in free list, LRU list, or flush list, nor page
-		hash table
 MEMORY:		is not in free list, LRU list, or flush list, nor page
 		hash table
 FILE_PAGE:	space and offset are defined, is in page hash table
 		if io_fix == BUF_IO_WRITE,
-			pool: no_flush[flush_type] is in reset state,
-			pool: n_flush[flush_type] > 0
+			buf_pool.n_flush_LRU() || buf_pool.n_flush_list()
 
 		(1) if buf_fix_count == 0, then
 			is in LRU list, not in free list
@@ -2438,9 +2366,8 @@ FILE_PAGE:	space and offset are defined, is in page hash table
 
 State transitions:
 
-NOT_USED => READY_FOR_USE
-READY_FOR_USE => MEMORY
-READY_FOR_USE => FILE_PAGE
+NOT_USED => MEMORY
+MEMORY => FILE_PAGE
 MEMORY => NOT_USED
 FILE_PAGE => NOT_USED	NOTE: This transition is allowed if and only if
 				(1) buf_fix_count == 0,
@@ -2448,7 +2375,21 @@ FILE_PAGE => NOT_USED	NOTE: This transition is allowed if and only if
 				(3) io_fix == 0.
 */
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Select from where to start a scan. If we have scanned
+too deep into the LRU list it resets the value to the tail
+of the LRU list.
+@return buf_page_t from where to start scan. */
+inline buf_page_t *LRUItr::start()
+{
+  mysql_mutex_assert_owner(m_mutex);
+
+  if (!m_hp || m_hp->old)
+    m_hp= UT_LIST_GET_LAST(buf_pool.LRU);
+
+  return m_hp;
+}
+
+#ifdef UNIV_DEBUG
 /** Functor to validate the LRU list. */
 struct	CheckInLRUList {
 	void	operator()(const buf_page_t* elem) const
@@ -2456,9 +2397,9 @@ struct	CheckInLRUList {
 		ut_a(elem->in_LRU_list);
 	}
 
-	static void validate(const buf_pool_t* buf_pool)
+	static void validate()
 	{
-		ut_list_validate(buf_pool->LRU, CheckInLRUList());
+		ut_list_validate(buf_pool.LRU, CheckInLRUList());
 	}
 };
 
@@ -2469,9 +2410,9 @@ struct	CheckInFreeList {
 		ut_a(elem->in_free_list);
 	}
 
-	static void validate(const buf_pool_t* buf_pool)
+	static void validate()
 	{
-		ut_list_validate(buf_pool->free, CheckInFreeList());
+		ut_list_validate(buf_pool.free, CheckInFreeList());
 	}
 };
 
@@ -2482,13 +2423,13 @@ struct	CheckUnzipLRUAndLRUList {
                 ut_a(elem->in_unzip_LRU_list);
 	}
 
-	static void validate(const buf_pool_t* buf_pool)
+	static void validate()
 	{
-		ut_list_validate(buf_pool->unzip_LRU,
+		ut_list_validate(buf_pool.unzip_LRU,
 				 CheckUnzipLRUAndLRUList());
 	}
 };
-#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */
+#endif /* UNIV_DEBUG */
 
 #include "buf0buf.inl"
 
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
index ac50588c217..364f04d3f69 100644
--- a/storage/innobase/include/buf0buf.inl
+++ b/storage/innobase/include/buf0buf.inl
@@ -37,42 +37,6 @@ Created 11/5/1995 Heikki Tuuri
 #include "buf0rea.h"
 #include "fsp0types.h"
 
-/** A chunk of buffers. The buffer pool is allocated in chunks. */
-struct buf_chunk_t{
-	ulint		size;		/*!< size of frames[] and blocks[] */
-	unsigned char*	mem;		/*!< pointer to the memory area which
-					was allocated for the frames */
-	ut_new_pfx_t	mem_pfx;	/*!< Auxiliary structure, describing
-					"mem". It is filled by the allocator's
-					alloc method and later passed to the
-					deallocate method. */
-	buf_block_t*	blocks;		/*!< array of buffer control blocks */
-
-	/** Get the size of 'mem' in bytes. */
-	size_t	mem_size() const {
-		return(mem_pfx.m_size);
-	}
-};
-
-bool buf_pool_t::is_block_field(const void *ptr) const
-{
-  const buf_chunk_t* chunk= chunks;
-  const buf_chunk_t *const echunk= chunk + ut_min(n_chunks,
-                                                  n_chunks_new);
-  /* TODO: protect chunks with a mutex (the older pointer will
-  currently remain during resize()) */
-  while (chunk < echunk)
-  {
-    if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
-        ptr < reinterpret_cast<const void*>(
-		chunk->blocks + chunk->size))
-      return true;
-    chunk++;
-  }
-
-  return false;
-}
-
 /*********************************************************************//**
 Gets the current size of buffer buf_pool in bytes.
 @return size in bytes */
@@ -85,67 +49,6 @@ buf_pool_get_curr_size(void)
 }
 
 /********************************************************************//**
-Calculates the index of a buffer pool to the buf_pool[] array.
-@return the position of the buffer pool in buf_pool[] */
-UNIV_INLINE
-unsigned
-buf_pool_index(
-/*===========*/
-	const buf_pool_t*	buf_pool)	/*!< in: buffer pool */
-{
-	unsigned	i = unsigned(buf_pool - buf_pool_ptr);
-	ut_ad(i < MAX_BUFFER_POOLS);
-	ut_ad(i < srv_buf_pool_instances);
-	return(i);
-}
-
-/******************************************************************//**
-Returns the buffer pool instance given a page instance
-@return buf_pool */
-UNIV_INLINE
-buf_pool_t*
-buf_pool_from_bpage(
-/*================*/
-	const buf_page_t*	bpage) /*!< in: buffer pool page */
-{
-	ut_ad(bpage->buf_pool_index < srv_buf_pool_instances);
-	return(&buf_pool_ptr[bpage->buf_pool_index]);
-}
-
-/******************************************************************//**
-Returns the buffer pool instance given a block instance
-@return buf_pool */
-UNIV_INLINE
-buf_pool_t*
-buf_pool_from_block(
-/*================*/
-	const buf_block_t*	block) /*!< in: block */
-{
-	return(buf_pool_from_bpage(&block->page));
-}
-
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in pages.
-@return size in pages*/
-UNIV_INLINE
-ulint
-buf_pool_get_n_pages(void)
-/*======================*/
-{
-  if (!buf_pool_ptr)
-    return buf_pool_get_curr_size() >> srv_page_size_shift;
-
-  ulint chunk_size= 0;
-  for (uint i= 0; i < srv_buf_pool_instances; i++)
-  {
-    buf_pool_t* buf_pool = buf_pool_from_array(i);
-    for (uint j= 0; j < buf_pool->n_chunks; j++)
-      chunk_size+= buf_pool->chunks[j].size;
-  }
-  return chunk_size;
-}
-
-/********************************************************************//**
 Reads the freed_page_clock of a buffer block.
 @return freed_page_clock */
 UNIV_INLINE
@@ -154,7 +57,7 @@ buf_page_get_freed_page_clock(
 /*==========================*/
 	const buf_page_t*	bpage)	/*!< in: block */
 {
-	/* This is sometimes read without holding buf_pool->mutex. */
+	/* This is sometimes read without holding buf_pool.mutex. */
 	return(bpage->freed_page_clock);
 }
 
@@ -170,49 +73,35 @@ buf_block_get_freed_page_clock(
 	return(buf_page_get_freed_page_clock(&block->page));
 }
 
-/********************************************************************//**
-Tells if a block is still close enough to the MRU end of the LRU list
+/** Determine if a block is still close enough to the MRU end of the LRU list
 meaning that it is not in danger of getting evicted and also implying
 that it has been accessed recently.
-Note that this is for heuristics only and does not reserve buffer pool
-mutex.
-@return TRUE if block is close to MRU end of LRU */
-UNIV_INLINE
-ibool
-buf_page_peek_if_young(
-/*===================*/
-	const buf_page_t*	bpage)	/*!< in: block */
+The page must be either buffer-fixed, or its page hash must be locked.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage)
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
 	/* FIXME: bpage->freed_page_clock is 31 bits */
-	return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
+	return((buf_pool.freed_page_clock & ((1UL << 31) - 1))
 	       < (bpage->freed_page_clock
-		  + (buf_pool->curr_size
-		     * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio)
+		  + (buf_pool.curr_size
+		     * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio)
 		     / (BUF_LRU_OLD_RATIO_DIV * 4))));
 }
 
-/********************************************************************//**
-Recommends a move of a block to the start of the LRU list if there is danger
-of dropping from the buffer pool. NOTE: does not reserve the buffer pool
-mutex.
-@return TRUE if should be made younger */
-UNIV_INLINE
-ibool
-buf_page_peek_if_too_old(
-/*=====================*/
-	const buf_page_t*	bpage)	/*!< in: block to make younger */
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
 {
-	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
-
-	if (buf_pool->freed_page_clock == 0) {
+	if (buf_pool.freed_page_clock == 0) {
 		/* If eviction has not started yet, do not update the
 		statistics or move blocks in the LRU list.  This is
 		either the warm-up phase or an in-memory workload. */
 		return(FALSE);
 	} else if (buf_LRU_old_threshold_ms && bpage->old) {
-		unsigned	access_time = buf_page_is_accessed(bpage);
+		uint32_t access_time = bpage->is_accessed();
 
 		/* It is possible that the below comparison returns an
 		unexpected result. 2^32 milliseconds pass in about 50 days,
@@ -220,524 +109,17 @@ buf_page_peek_if_too_old(
 		is e.g. 50 days + 15 ms, then the below will behave as if
 		it is 15 ms. This is known and fixing it would require to
 		increase buf_page_t::access_time from 32 to 64 bits. */
-		if (access_time > 0
+		if (access_time
 		    && ((ib_uint32_t) (ut_time_ms() - access_time))
 		    >= buf_LRU_old_threshold_ms) {
 			return(TRUE);
 		}
 
-		buf_pool->stat.n_pages_not_made_young++;
-		return(FALSE);
+		buf_pool.stat.n_pages_not_made_young++;
+		return false;
 	} else {
-		return(!buf_page_peek_if_young(bpage));
-	}
-}
-
-/*********************************************************************//**
-Gets the state of a block.
-@return state */
-UNIV_INLINE
-enum buf_page_state
-buf_page_get_state(
-/*===============*/
-	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
-{
-	enum buf_page_state	state	= bpage->state;
-
-#ifdef UNIV_DEBUG
-	switch (state) {
-	case BUF_BLOCK_POOL_WATCH:
-	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_FILE_PAGE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		break;
-	default:
-		ut_error;
-	}
-#endif /* UNIV_DEBUG */
-
-	return(state);
-}
-/*********************************************************************//**
-Gets the state of a block.
-@return state */
-UNIV_INLINE
-enum buf_page_state
-buf_block_get_state(
-/*================*/
-	const buf_block_t*	block)	/*!< in: pointer to the control block */
-{
-	return(buf_page_get_state(&block->page));
-}
-
-/*********************************************************************//**
-Gets the state name for state of a block
-@return	name or "CORRUPTED" */
-UNIV_INLINE
-const char*
-buf_get_state_name(
-/*===============*/
-	const buf_block_t*	block)	/*!< in: pointer to the control
-					block */
-{
-	enum buf_page_state	state = buf_page_get_state(&block->page);
-
-	switch (state) {
-	case BUF_BLOCK_POOL_WATCH:
-		return (const char *) "BUF_BLOCK_POOL_WATCH";
-	case BUF_BLOCK_ZIP_PAGE:
-		return (const char *) "BUF_BLOCK_ZIP_PAGE";
-	case BUF_BLOCK_ZIP_DIRTY:
-		return (const char *) "BUF_BLOCK_ZIP_DIRTY";
-	case BUF_BLOCK_NOT_USED:
-		return (const char *) "BUF_BLOCK_NOT_USED";
-	case BUF_BLOCK_READY_FOR_USE:
-		return (const char *) "BUF_BLOCK_NOT_USED";
-	case BUF_BLOCK_FILE_PAGE:
-		return (const char *) "BUF_BLOCK_FILE_PAGE";
-	case BUF_BLOCK_MEMORY:
-		return (const char *) "BUF_BLOCK_MEMORY";
-	case BUF_BLOCK_REMOVE_HASH:
-		return (const char *) "BUF_BLOCK_REMOVE_HASH";
-	default:
-		return (const char *) "CORRUPTED";
-	}
-}
-
-/*********************************************************************//**
-Sets the state of a block. */
-UNIV_INLINE
-void
-buf_page_set_state(
-/*===============*/
-	buf_page_t*		bpage,	/*!< in/out: pointer to control block */
-	enum buf_page_state	state)	/*!< in: state */
-{
-#ifdef UNIV_DEBUG
-	enum buf_page_state	old_state	= buf_page_get_state(bpage);
-
-	switch (old_state) {
-	case BUF_BLOCK_POOL_WATCH:
-		ut_error;
-		break;
-	case BUF_BLOCK_ZIP_PAGE:
-		ut_a(state == BUF_BLOCK_ZIP_DIRTY);
-		break;
-	case BUF_BLOCK_ZIP_DIRTY:
-		ut_a(state == BUF_BLOCK_ZIP_PAGE);
-		break;
-	case BUF_BLOCK_NOT_USED:
-		ut_a(state == BUF_BLOCK_READY_FOR_USE);
-		break;
-	case BUF_BLOCK_READY_FOR_USE:
-		ut_a(state == BUF_BLOCK_MEMORY
-		     || state == BUF_BLOCK_FILE_PAGE
-		     || state == BUF_BLOCK_NOT_USED);
-		break;
-	case BUF_BLOCK_MEMORY:
-		ut_a(state == BUF_BLOCK_NOT_USED);
-		break;
-	case BUF_BLOCK_FILE_PAGE:
-		if (!(state == BUF_BLOCK_NOT_USED
-	              || state == BUF_BLOCK_REMOVE_HASH
-		      || state == BUF_BLOCK_FILE_PAGE)) {
-			const char *old_state_name = buf_get_state_name((buf_block_t*)bpage);
-			bpage->state = state;
-
-			fprintf(stderr,
-				"InnoDB: Error: block old state %d (%s) "
-				" new state %d (%s) not correct\n",
-				old_state,
-				old_state_name,
-				state,
-				buf_get_state_name((buf_block_t*)bpage));
-			ut_a(state == BUF_BLOCK_NOT_USED
-				|| state == BUF_BLOCK_REMOVE_HASH
-				|| state == BUF_BLOCK_FILE_PAGE);
-		}
-
-		break;
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_a(state == BUF_BLOCK_MEMORY);
-		break;
-	}
-#endif /* UNIV_DEBUG */
-	bpage->state = state;
-}
-
-/*********************************************************************//**
-Sets the state of a block. */
-UNIV_INLINE
-void
-buf_block_set_state(
-/*================*/
-	buf_block_t*		block,	/*!< in/out: pointer to control block */
-	enum buf_page_state	state)	/*!< in: state */
-{
-	buf_page_set_state(&block->page, state);
-}
-
-/*********************************************************************//**
-Determines if a block is mapped to a tablespace.
-@return TRUE if mapped */
-UNIV_INLINE
-ibool
-buf_page_in_file(
-/*=============*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-{
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_POOL_WATCH:
-		ut_error;
-		break;
-	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
-	case BUF_BLOCK_FILE_PAGE:
-		return(TRUE);
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		break;
-	}
-
-	return(FALSE);
-}
-
-/*********************************************************************//**
-Determines if a block should be on unzip_LRU list.
-@return TRUE if block belongs to unzip_LRU */
-UNIV_INLINE
-ibool
-buf_page_belongs_to_unzip_LRU(
-/*==========================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-{
-	ut_ad(buf_page_in_file(bpage));
-
-	return(bpage->zip.data
-	       && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
-}
-
-/*********************************************************************//**
-Gets the mutex of a block.
-@return pointer to mutex protecting bpage */
-UNIV_INLINE
-BPageMutex*
-buf_page_get_mutex(
-/*===============*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-{
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_POOL_WATCH:
-		ut_error;
-		return(NULL);
-	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
-		return(&buf_pool->zip_mutex);
-	default:
-		return(&((buf_block_t*) bpage)->mutex);
-	}
-}
-
-/*********************************************************************//**
-Get the flush type of a page.
-@return flush type */
-UNIV_INLINE
-buf_flush_t
-buf_page_get_flush_type(
-/*====================*/
-	const buf_page_t*	bpage)	/*!< in: buffer page */
-{
-	buf_flush_t	flush_type = (buf_flush_t) bpage->flush_type;
-
-#ifdef UNIV_DEBUG
-	switch (flush_type) {
-	case BUF_FLUSH_LRU:
-	case BUF_FLUSH_LIST:
-	case BUF_FLUSH_SINGLE_PAGE:
-		return(flush_type);
-	case BUF_FLUSH_N_TYPES:
-		ut_error;
-	}
-	ut_error;
-#endif /* UNIV_DEBUG */
-	return(flush_type);
-}
-/*********************************************************************//**
-Set the flush type of a page. */
-UNIV_INLINE
-void
-buf_page_set_flush_type(
-/*====================*/
-	buf_page_t*	bpage,		/*!< in: buffer page */
-	buf_flush_t	flush_type)	/*!< in: flush type */
-{
-	bpage->flush_type = flush_type;
-	ut_ad(buf_page_get_flush_type(bpage) == flush_type);
-}
-
-/** Map a block to a file page.
-@param[in,out]	block	pointer to control block
-@param[in]	page_id	page id */
-UNIV_INLINE
-void
-buf_block_set_file_page(
-	buf_block_t*		block,
-	const page_id_t		page_id)
-{
-	buf_block_set_state(block, BUF_BLOCK_FILE_PAGE);
-	block->page.id = page_id;
-}
-
-/*********************************************************************//**
-Gets the io_fix state of a block.
-@return io_fix state */
-UNIV_INLINE
-enum buf_io_fix
-buf_page_get_io_fix(
-/*================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
-{
-	ut_ad(bpage != NULL);
-
-	enum buf_io_fix	io_fix	= bpage->io_fix;
-
-#ifdef UNIV_DEBUG
-	switch (io_fix) {
-	case BUF_IO_NONE:
-	case BUF_IO_READ:
-	case BUF_IO_WRITE:
-	case BUF_IO_PIN:
-		return(io_fix);
+		return !buf_page_peek_if_young(bpage);
 	}
-	ut_error;
-#endif /* UNIV_DEBUG */
-	return(io_fix);
-}
-
-/*********************************************************************//**
-Gets the io_fix state of a block.
-@return io_fix state */
-UNIV_INLINE
-enum buf_io_fix
-buf_block_get_io_fix(
-/*=================*/
-	const buf_block_t*	block)	/*!< in: pointer to the control block */
-{
-	return(buf_page_get_io_fix(&block->page));
-}
-
-/*********************************************************************//**
-Sets the io_fix state of a block. */
-UNIV_INLINE
-void
-buf_page_set_io_fix(
-/*================*/
-	buf_page_t*	bpage,	/*!< in/out: control block */
-	enum buf_io_fix	io_fix)	/*!< in: io_fix state */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-
-	bpage->io_fix = io_fix;
-	ut_ad(buf_page_get_io_fix(bpage) == io_fix);
-}
-
-/*********************************************************************//**
-Sets the io_fix state of a block. */
-UNIV_INLINE
-void
-buf_block_set_io_fix(
-/*=================*/
-	buf_block_t*	block,	/*!< in/out: control block */
-	enum buf_io_fix	io_fix)	/*!< in: io_fix state */
-{
-	buf_page_set_io_fix(&block->page, io_fix);
-}
-
-/*********************************************************************//**
-Makes a block sticky. A sticky block implies that even after we release
-the buf_pool->mutex and the block->mutex:
-* it cannot be removed from the flush_list
-* the block descriptor cannot be relocated
-* it cannot be removed from the LRU list
-Note that:
-* the block can still change its position in the LRU list
-* the next and previous pointers can change. */
-UNIV_INLINE
-void
-buf_page_set_sticky(
-/*================*/
-	buf_page_t*	bpage)	/*!< in/out: control block */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
-
-	bpage->io_fix = BUF_IO_PIN;
-}
-
-/*********************************************************************//**
-Removes stickiness of a block. */
-UNIV_INLINE
-void
-buf_page_unset_sticky(
-/*==================*/
-	buf_page_t*	bpage)	/*!< in/out: control block */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
-
-	bpage->io_fix = BUF_IO_NONE;
-}
-
-/********************************************************************//**
-Determine if a buffer block can be relocated in memory.  The block
-can be dirty, but it must not be I/O-fixed or bufferfixed. */
-UNIV_INLINE
-ibool
-buf_page_can_relocate(
-/*==================*/
-	const buf_page_t*	bpage)	/*!< control block being relocated */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(buf_page_in_file(bpage));
-	ut_ad(bpage->in_LRU_list);
-
-	return(buf_page_get_io_fix(bpage) == BUF_IO_NONE
-	       && bpage->buf_fix_count == 0);
-}
-
-/*********************************************************************//**
-Determine if a block has been flagged old.
-@return TRUE if old */
-UNIV_INLINE
-ibool
-buf_page_is_old(
-/*============*/
-	const buf_page_t*	bpage)	/*!< in: control block */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
-	ut_ad(buf_page_in_file(bpage));
-
-	return(bpage->old);
-}
-
-/*********************************************************************//**
-Flag a block old. */
-UNIV_INLINE
-void
-buf_page_set_old(
-/*=============*/
-	buf_page_t*	bpage,	/*!< in/out: control block */
-	bool		old)	/*!< in: old */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-#endif /* UNIV_DEBUG */
-	ut_a(buf_page_in_file(bpage));
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(bpage->in_LRU_list);
-
-#ifdef UNIV_LRU_DEBUG
-	ut_a((buf_pool->LRU_old_len == 0) == (buf_pool->LRU_old == NULL));
-	/* If a block is flagged "old", the LRU_old list must exist. */
-	ut_a(!old || buf_pool->LRU_old);
-
-	if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) {
-		const buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
-		const buf_page_t*	next = UT_LIST_GET_NEXT(LRU, bpage);
-		if (prev->old == next->old) {
-			ut_a(prev->old == old);
-		} else {
-			ut_a(!prev->old);
-			ut_a(buf_pool->LRU_old == (old ? bpage : next));
-		}
-	}
-#endif /* UNIV_LRU_DEBUG */
-
-	bpage->old = old;
-}
-
-/*********************************************************************//**
-Determine the time of first access of a block in the buffer pool.
-@return ut_time_ms() at the time of first access, 0 if not accessed */
-UNIV_INLINE
-unsigned
-buf_page_is_accessed(
-/*=================*/
-	const buf_page_t*	bpage)	/*!< in: control block */
-{
-	ut_ad(buf_page_in_file(bpage));
-
-	return(bpage->access_time);
-}
-
-/*********************************************************************//**
-Flag a block accessed. */
-UNIV_INLINE
-void
-buf_page_set_accessed(
-/*==================*/
-	buf_page_t*	bpage)		/*!< in/out: control block */
-{
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-#endif /* UNIV_DEBUG */
-
-	ut_a(buf_page_in_file(bpage));
-
-	if (bpage->access_time == 0) {
-		/* Make this the time of the first access. */
-		bpage->access_time = static_cast<uint>(ut_time_ms());
-	}
-}
-
-/*********************************************************************//**
-Gets the buf_block_t handle of a buffered file block if an uncompressed
-page frame exists, or NULL.
-@return control block, or NULL */
-UNIV_INLINE
-buf_block_t*
-buf_page_get_block(
-/*===============*/
-	buf_page_t*	bpage)	/*!< in: control block, or NULL */
-{
-	if (bpage != NULL) {
-		ut_ad(buf_page_in_file(bpage));
-
-		if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
-			return((buf_block_t*) bpage);
-		}
-	}
-
-	return(NULL);
 }
 
 #ifdef UNIV_DEBUG
@@ -754,17 +136,14 @@ buf_block_get_frame(
 		return NULL;
 	}
 
-	switch (buf_block_get_state(block)) {
-	case BUF_BLOCK_POOL_WATCH:
+	switch (block->page.state()) {
 	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
 		ut_error;
 		break;
 	case BUF_BLOCK_FILE_PAGE:
-		ut_a(block->page.buf_fix_count > 0);
+		ut_a(block->page.buf_fix_count());
 		/* fall through */
-	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
 	case BUF_BLOCK_REMOVE_HASH:
 		goto ok;
@@ -775,61 +154,6 @@ ok:
 }
 #endif /* UNIV_DEBUG */
 
-/***********************************************************************
-FIXME_FTS Gets the frame the pointer is pointing to. */
-UNIV_INLINE
-buf_frame_t*
-buf_frame_align(
-/*============*/
-                        /* out: pointer to frame */
-        byte*   ptr)    /* in: pointer to a frame */
-{
-        buf_frame_t*    frame;
-
-        ut_ad(ptr);
-
-        frame = (buf_frame_t*) ut_align_down(ptr, srv_page_size);
-
-        return(frame);
-}
-
-/**********************************************************************//**
-Gets the space id, page offset, and byte offset within page of a
-pointer pointing to a buffer frame containing a file page. */
-UNIV_INLINE
-void
-buf_ptr_get_fsp_addr(
-/*=================*/
-	const void*	ptr,	/*!< in: pointer to a buffer frame */
-	ulint*		space,	/*!< out: space id */
-	fil_addr_t*	addr)	/*!< out: page offset and byte offset */
-{
-	const page_t*	page = (const page_t*) ut_align_down(ptr,
-							     srv_page_size);
-
-	*space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-	addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET);
-	addr->boffset = ut_align_offset(ptr, srv_page_size);
-}
-
-/**********************************************************************//**
-Gets the hash value of the page the pointer is pointing to. This can be used
-in searches in the lock hash table.
-@return lock hash value */
-UNIV_INLINE
-unsigned
-buf_block_get_lock_hash_val(
-/*========================*/
-	const buf_block_t*	block)	/*!< in: block */
-{
-	ut_ad(block);
-	ut_ad(buf_page_in_file(&block->page));
-	ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_X)
-	      || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_S));
-
-	return(block->lock_hash_val);
-}
-
 /********************************************************************//**
 Allocates a buf_page_t descriptor. This function must succeed. In case
 of failure we assert in this function.
@@ -859,6 +183,13 @@ buf_page_free_descriptor(
 	ut_free(bpage);
 }
 
+/** Allocate a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc()
+{
+  return buf_LRU_get_free_block(false);
+}
+
 /********************************************************************//**
 Frees a buffer block which does not contain a file page. */
 UNIV_INLINE
@@ -867,63 +198,9 @@ buf_block_free(
 /*===========*/
 	buf_block_t*	block)	/*!< in, own: block to be freed */
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*) block);
-
-	buf_pool_mutex_enter(buf_pool);
-
-	buf_page_mutex_enter(block);
-
-	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
-
+	mysql_mutex_lock(&buf_pool.mutex);
 	buf_LRU_block_free_non_file_page(block);
-
-	buf_page_mutex_exit(block);
-
-	buf_pool_mutex_exit(buf_pool);
-}
-
-/*********************************************************************//**
-Copies contents of a buffer frame to a given buffer.
-@return buf */
-UNIV_INLINE
-byte*
-buf_frame_copy(
-/*===========*/
-	byte*			buf,	/*!< in: buffer to copy to */
-	const buf_frame_t*	frame)	/*!< in: buffer frame */
-{
-	ut_ad(buf && frame);
-
-	ut_memcpy(buf, frame, srv_page_size);
-
-	return(buf);
-}
-
-/********************************************************************//**
-Gets the youngest modification log sequence number for a frame.
-Returns zero if not file page or no modification occurred yet.
-@return newest modification to page */
-UNIV_INLINE
-lsn_t
-buf_page_get_newest_modification(
-/*=============================*/
-	const buf_page_t*	bpage)	/*!< in: block containing the
-					page frame */
-{
-	lsn_t		lsn;
-	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
-
-	mutex_enter(block_mutex);
-
-	if (buf_page_in_file(bpage)) {
-		lsn = bpage->newest_modification;
-	} else {
-		lsn = 0;
-	}
-
-	mutex_exit(block_mutex);
-
-	return(lsn);
+	mysql_mutex_unlock(&buf_pool.mutex);
 }
 
 /********************************************************************//**
@@ -936,17 +213,20 @@ buf_block_modify_clock_inc(
 /*=======================*/
 	buf_block_t*	block)	/*!< in: block */
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*) block);
-
+#ifdef SAFE_MUTEX
 	/* No latch is acquired for the shared temporary tablespace. */
-	if (!fsp_is_system_temporary(block->page.id.space())) {
-		ut_ad((buf_pool_mutex_own(buf_pool)
-		       && (block->page.buf_fix_count == 0))
-		      || rw_lock_own_flagged(&block->lock,
-					     RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
-	}
-#endif /* UNIV_DEBUG */
+	ut_ad(fsp_is_system_temporary(block->page.id().space())
+	      || (mysql_mutex_is_owner(&buf_pool.mutex)
+		  && !block->page.buf_fix_count())
+	      || rw_lock_own_flagged(&block->lock,
+				     RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+#else /* SAFE_MUTEX */
+	/* No latch is acquired for the shared temporary tablespace. */
+	ut_ad(fsp_is_system_temporary(block->page.id().space())
+	      || !block->page.buf_fix_count()
+	      || rw_lock_own_flagged(&block->lock,
+				     RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+#endif /* SAFE_MUTEX */
 	assert_block_ahi_valid(block);
 
 	block->modify_clock++;
@@ -964,7 +244,7 @@ buf_block_get_modify_clock(
 {
 #ifdef UNIV_DEBUG
 	/* No latch is acquired for the shared temporary tablespace. */
-	if (!fsp_is_system_temporary(block->page.id.space())) {
+	if (!fsp_is_system_temporary(block->page.id().space())) {
 		ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S)
 		      || rw_lock_own(&(block->lock), RW_LOCK_X)
 		      || rw_lock_own(&(block->lock), RW_LOCK_SX));
@@ -990,7 +270,7 @@ buf_block_buf_fix_inc_func(
 	/* No debug latch is acquired if block belongs to system temporary.
 	Debug latch is not of much help if access to block is single
 	threaded. */
-	if (!fsp_is_system_temporary(block->page.id.space())) {
+	if (!fsp_is_system_temporary(block->page.id().space())) {
 		ibool   ret;
 		ret = rw_lock_s_lock_nowait(block->debug_latch, file, line);
 		ut_a(ret);
@@ -1012,7 +292,7 @@ buf_block_buf_fix_dec(
 	/* No debug latch is acquired if block belongs to system temporary.
 	Debug latch is not of much help if access to block is single
 	threaded. */
-	if (!fsp_is_system_temporary(block->page.id.space())) {
+	if (!fsp_is_system_temporary(block->page.id().space())) {
 		rw_lock_s_unlock(block->debug_latch);
 	}
 #endif /* UNIV_DEBUG */
@@ -1020,224 +300,6 @@ buf_block_buf_fix_dec(
 	block->unfix();
 }
 
-/** Returns the buffer pool instance given a page id.
-@param[in]	page_id	page id
-@return buffer pool */
-inline buf_pool_t* buf_pool_get(const page_id_t page_id)
-{
-        /* 2log of BUF_READ_AHEAD_AREA (64) */
-        ulint		ignored_page_no = page_id.page_no() >> 6;
-
-        page_id_t	id(page_id.space(), ignored_page_no);
-
-        ulint		i = id.fold() % srv_buf_pool_instances;
-
-        return(&buf_pool_ptr[i]);
-}
-
-/******************************************************************//**
-Returns the buffer pool instance given its array index
-@return buffer pool */
-UNIV_INLINE
-buf_pool_t*
-buf_pool_from_array(
-/*================*/
-	ulint	index)		/*!< in: array index to get
-				buffer pool instance from */
-{
-	ut_ad(index < MAX_BUFFER_POOLS);
-	ut_ad(index < srv_buf_pool_instances);
-	return(&buf_pool_ptr[index]);
-}
-
-/** Returns the control block of a file page, NULL if not found.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	page_id		page id
-@return block, NULL if not found */
-UNIV_INLINE
-buf_page_t*
-buf_page_hash_get_low(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id)
-{
-	buf_page_t*	bpage;
-
-#ifdef UNIV_DEBUG
-	rw_lock_t*	hash_lock;
-
-	hash_lock = hash_get_lock(buf_pool->page_hash, page_id.fold());
-	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)
-	      || rw_lock_own(hash_lock, RW_LOCK_S));
-#endif /* UNIV_DEBUG */
-
-	/* Look for the page in the hash table */
-
-	HASH_SEARCH(hash, buf_pool->page_hash, page_id.fold(), buf_page_t*,
-		    bpage,
-		    ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
-			  && buf_page_in_file(bpage)),
-		    page_id == bpage->id);
-	if (bpage) {
-		ut_a(buf_page_in_file(bpage));
-		ut_ad(bpage->in_page_hash);
-		ut_ad(!bpage->in_zip_hash);
-		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
-	}
-
-	return(bpage);
-}
-
-/** Returns the control block of a file page, NULL if not found.
-If the block is found and lock is not NULL then the appropriate
-page_hash lock is acquired in the specified lock mode. Otherwise,
-mode value is ignored. It is up to the caller to release the
-lock. If the block is found and the lock is NULL then the page_hash
-lock is released by this function.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	page_id		page id
-@param[in,out]	lock		lock of the page hash acquired if bpage is
-found, NULL otherwise. If NULL is passed then the hash_lock is released by
-this function.
-@param[in]	lock_mode	RW_LOCK_X or RW_LOCK_S. Ignored if
-lock == NULL
-@param[in]	watch		if true, return watch sentinel also.
-@return pointer to the bpage or NULL; if NULL, lock is also NULL or
-a watch sentinel. */
-UNIV_INLINE
-buf_page_t*
-buf_page_hash_get_locked(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id,
-	rw_lock_t**		lock,
-	ulint			lock_mode,
-	bool			watch)
-{
-	buf_page_t*	bpage = NULL;
-	rw_lock_t*	hash_lock;
-	ulint		mode = RW_LOCK_S;
-
-	if (lock != NULL) {
-		*lock = NULL;
-		ut_ad(lock_mode == RW_LOCK_X
-		      || lock_mode == RW_LOCK_S);
-		mode = lock_mode;
-	}
-
-	hash_lock = hash_get_lock(buf_pool->page_hash, page_id.fold());
-
-	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X)
-	      && !rw_lock_own(hash_lock, RW_LOCK_S));
-
-	if (mode == RW_LOCK_S) {
-		rw_lock_s_lock(hash_lock);
-
-		/* If not own buf_pool_mutex, page_hash can be changed. */
-		hash_lock = hash_lock_s_confirm(
-			hash_lock, buf_pool->page_hash, page_id.fold());
-	} else {
-		rw_lock_x_lock(hash_lock);
-		/* If not own buf_pool_mutex, page_hash can be changed. */
-		hash_lock = hash_lock_x_confirm(
-			hash_lock, buf_pool->page_hash, page_id.fold());
-	}
-
-	bpage = buf_page_hash_get_low(buf_pool, page_id);
-
-	if (!bpage || buf_pool_watch_is_sentinel(buf_pool, bpage)) {
-		if (!watch) {
-			bpage = NULL;
-		}
-		goto unlock_and_exit;
-	}
-
-	ut_ad(buf_page_in_file(bpage));
-	ut_ad(page_id == bpage->id);
-
-	if (lock == NULL) {
-		/* The caller wants us to release the page_hash lock */
-		goto unlock_and_exit;
-	} else {
-		/* To be released by the caller */
-		*lock = hash_lock;
-		goto exit;
-	}
-
-unlock_and_exit:
-	if (mode == RW_LOCK_S) {
-		rw_lock_s_unlock(hash_lock);
-	} else {
-		rw_lock_x_unlock(hash_lock);
-	}
-exit:
-	return(bpage);
-}
-
-/** Returns the control block of a file page, NULL if not found.
-If the block is found and lock is not NULL then the appropriate
-page_hash lock is acquired in the specified lock mode. Otherwise,
-mode value is ignored. It is up to the caller to release the
-lock. If the block is found and the lock is NULL then the page_hash
-lock is released by this function.
-@param[in]	buf_pool	buffer pool instance
-@param[in]	page_id		page id
-@param[in,out]	lock		lock of the page hash acquired if bpage is
-found, NULL otherwise. If NULL is passed then the hash_lock is released by
-this function.
-@param[in]	lock_mode	RW_LOCK_X or RW_LOCK_S. Ignored if
-lock == NULL
-@return pointer to the block or NULL; if NULL, lock is also NULL. */
-UNIV_INLINE
-buf_block_t*
-buf_block_hash_get_locked(
-	buf_pool_t*		buf_pool,
-	const page_id_t		page_id,
-	rw_lock_t**		lock,
-	ulint			lock_mode)
-{
-	buf_page_t*	bpage = buf_page_hash_get_locked(buf_pool,
-							 page_id,
-							 lock,
-							 lock_mode);
-	buf_block_t*	block = buf_page_get_block(bpage);
-
-	if (block != NULL) {
-
-		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-		ut_ad(!lock || rw_lock_own(*lock, lock_mode));
-
-		return(block);
-	} else if (bpage) {
-		/* It is not a block. Just a bpage */
-		ut_ad(buf_page_in_file(bpage));
-
-		if (lock) {
-			if (lock_mode == RW_LOCK_S) {
-				rw_lock_s_unlock(*lock);
-			} else {
-				rw_lock_x_unlock(*lock);
-			}
-		}
-		*lock = NULL;
-		return(NULL);
-	}
-
-	ut_ad(!bpage);
-	ut_ad(lock == NULL ||*lock == NULL);
-	return(NULL);
-}
-
-/** Returns TRUE if the page can be found in the buffer pool hash table.
-NOTE that it is possible that the page is not yet read from disk,
-though.
-@param[in]	page_id	page id
-@return true if found in the page hash table */
-inline bool buf_page_peek(const page_id_t page_id)
-{
-	buf_pool_t*	buf_pool = buf_pool_get(page_id);
-
-	return(buf_page_hash_get(buf_pool, page_id) != NULL);
-}
-
 /********************************************************************//**
 Releases a compressed-only page acquired with buf_page_get_zip(). */
 UNIV_INLINE
@@ -1247,9 +309,9 @@ buf_page_release_zip(
 	buf_page_t*	bpage)		/*!< in: buffer block */
 {
 	ut_ad(bpage);
-	ut_a(bpage->buf_fix_count > 0);
+	ut_a(bpage->buf_fix_count());
 
-	switch (buf_page_get_state(bpage)) {
+	switch (bpage->state()) {
 	case BUF_BLOCK_FILE_PAGE:
 #ifdef UNIV_DEBUG
 	{
@@ -1257,20 +319,17 @@ buf_page_release_zip(
 		temporary. Debug latch is not of much help if access to block
 		is single threaded. */
 		buf_block_t*	block = reinterpret_cast<buf_block_t*>(bpage);
-		if (!fsp_is_system_temporary(block->page.id.space())) {
+		if (!fsp_is_system_temporary(block->page.id().space())) {
 			rw_lock_s_unlock(block->debug_latch);
 		}
 	}
 #endif /* UNIV_DEBUG */
 		/* Fall through */
 	case BUF_BLOCK_ZIP_PAGE:
-	case BUF_BLOCK_ZIP_DIRTY:
 		reinterpret_cast<buf_block_t*>(bpage)->unfix();
 		return;
 
-	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
 	case BUF_BLOCK_REMOVE_HASH:
 		break;
@@ -1293,7 +352,7 @@ buf_page_release_latch(
 	/* No debug latch is acquired if block belongs to system
 	temporary. Debug latch is not of much help if access to block
 	is single threaded. */
-	if (!fsp_is_system_temporary(block->page.id.space())) {
+	if (!fsp_is_system_temporary(block->page.id().space())) {
 		rw_lock_s_unlock(block->debug_latch);
 	}
 #endif /* UNIV_DEBUG */
@@ -1322,55 +381,7 @@ buf_block_dbg_add_level(
 {
 	sync_check_lock(&block->lock, level);
 }
-
 #endif /* UNIV_DEBUG */
-/********************************************************************//**
-Acquire mutex on all buffer pool instances. */
-UNIV_INLINE
-void
-buf_pool_mutex_enter_all(void)
-/*==========================*/
-{
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-	}
-}
-
-/********************************************************************//**
-Release mutex on all buffer pool instances. */
-UNIV_INLINE
-void
-buf_pool_mutex_exit_all(void)
-/*=========================*/
-{
-	ulint   i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		buf_pool_mutex_exit(buf_pool);
-	}
-}
-/*********************************************************************//**
-Get the nth chunk's buffer block in the specified buffer pool.
-@return the nth chunk's buffer block. */
-UNIV_INLINE
-buf_block_t*
-buf_get_nth_chunk_block(
-/*====================*/
-	const buf_pool_t* buf_pool,	/*!< in: buffer pool instance */
-	ulint		n,		/*!< in: nth chunk in the buffer pool */
-	ulint*		chunk_size)	/*!< in: chunk size */
-{
-	const buf_chunk_t*	chunk;
-
-	chunk = buf_pool->chunks + n;
-	*chunk_size = chunk->size;
-	return(chunk->blocks);
-}
 
 /********************************************************************//**
 Get buf frame. */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index c34c1077d97..fb9df55504c 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -24,138 +24,147 @@ Doublewrite buffer module
 Created 2011/12/19 Inaam Rana
 *******************************************************/
 
-#ifndef buf0dblwr_h
-#define buf0dblwr_h
+#pragma once
 
-#include "ut0byte.h"
-#include "log0log.h"
+#include "os0file.h"
 #include "buf0types.h"
-#include "log0recv.h"
-
-/** Doublewrite system */
-extern buf_dblwr_t*	buf_dblwr;
-/** Set to TRUE when the doublewrite buffer is being created */
-extern ibool		buf_dblwr_being_created;
-
-/** Create the doublewrite buffer if the doublewrite buffer header
-is not present in the TRX_SYS page.
-@return	whether the operation succeeded
-@retval	true	if the doublewrite buffer exists or was created
-@retval	false	if the creation failed (too small first data file) */
-MY_ATTRIBUTE((warn_unused_result))
-bool
-buf_dblwr_create();
-
-/**
-At database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function loads the pages from double write buffer into memory.
-@param[in]	file		File handle
-@param[in]	path		Path name of file
-@return DB_SUCCESS or error code */
-dberr_t
-buf_dblwr_init_or_load_pages(
-	pfs_os_file_t	file,
-	const char*	path);
-
-/** Process and remove the double write buffer pages for all tablespaces. */
-void
-buf_dblwr_process();
-
-/****************************************************************//**
-frees doublewrite buffer. */
-void
-buf_dblwr_free();
-
-/********************************************************************//**
-Updates the doublewrite buffer when an IO request is completed. */
-void
-buf_dblwr_update(
-/*=============*/
-	const buf_page_t*	bpage,	/*!< in: buffer block descriptor */
-	buf_flush_t		flush_type);/*!< in: flush type */
-/****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-ibool
-buf_dblwr_page_inside(
-/*==================*/
-	ulint	page_no);	/*!< in: page number */
-/********************************************************************//**
-Posts a buffer page for writing. If the doublewrite memory buffer is
-full, calls buf_dblwr_flush_buffered_writes and waits for for free
-space to appear. */
-void
-buf_dblwr_add_to_batch(
-/*====================*/
-	buf_page_t*	bpage);	/*!< in: buffer block to write */
-
-/********************************************************************//**
-Flush a batch of writes to the datafiles that have already been
-written to the dblwr buffer on disk. */
-void
-buf_dblwr_sync_datafiles();
-
-/********************************************************************//**
-Flushes possible buffered writes from the doublewrite memory buffer to disk,
-and also wakes up the aio thread if simulated aio is used. It is very
-important to call this function after a batch of writes has been posted,
-and also when we may have to wait for a page latch! Otherwise a deadlock
-of threads can occur. */
-void
-buf_dblwr_flush_buffered_writes();
-
-/********************************************************************//**
-Writes a page to the doublewrite buffer on disk, sync it, then write
-the page to the datafile and sync the datafile. This function is used
-for single page flushes. If all the buffers allocated for single page
-flushes in the doublewrite buffer are in use we wait here for one to
-become free. We are guaranteed that a slot will become free because any
-thread that is using a slot must also release the slot before leaving
-this function. */
-void
-buf_dblwr_write_single_page(
-/*========================*/
-	buf_page_t*	bpage,	/*!< in: buffer block to write */
-	bool		sync);	/*!< in: true if sync IO requested */
 
 /** Doublewrite control struct */
-struct buf_dblwr_t{
-	ib_mutex_t	mutex;	/*!< mutex protecting the first_free
-				field and write_buf */
-	ulint		block1;	/*!< the page number of the first
-				doublewrite block (64 pages) */
-	ulint		block2;	/*!< page number of the second block */
-	ulint		first_free;/*!< first free position in write_buf
-				measured in units of srv_page_size */
-	ulint		b_reserved;/*!< number of slots currently reserved
-				for batch flush. */
-	os_event_t	b_event;/*!< event where threads wait for a
-				batch flush to end;
-				os_event_set() and os_event_reset()
-				are protected by buf_dblwr_t::mutex */
-	ulint		s_reserved;/*!< number of slots currently
-				reserved for single page flushes. */
-	os_event_t	s_event;/*!< event where threads wait for a
-				single page flush slot. Protected by mutex. */
-	bool*		in_use;	/*!< flag used to indicate if a slot is
-				in use. Only used for single page
-				flushes. */
-	bool		batch_running;/*!< set to TRUE if currently a batch
-				is being written from the doublewrite
-				buffer. */
-	byte*		write_buf;/*!< write buffer used in writing to the
-				doublewrite buffer, aligned to an
-				address divisible by srv_page_size
-				(which is required by Windows aio) */
-	byte*		write_buf_unaligned;/*!< pointer to write_buf,
-				but unaligned */
-	buf_page_t**	buf_block_arr;/*!< array to store pointers to
-				the buffer blocks which have been
-				cached to write_buf */
+class buf_dblwr_t // state and operations of the doublewrite buffer
+{
+  struct element // one queued page write (cf. add_to_batch() parameters)
+  {
+    /** asynchronous write request */
+    IORequest request;
+    /** payload size in bytes */
+    size_t size;
+  };
+
+  struct slot // bookkeeping for one write batch; see slots[] below
+  {
+    /** first free position in write_buf measured in units of
+     * srv_page_size */
+    ulint first_free;
+    /** number of slots reserved for the current write batch */
+    ulint reserved;
+    /** the doublewrite buffer, aligned to srv_page_size */
+    byte* write_buf;
+    /** buffer blocks to be written via write_buf */
+    element* buf_block_arr;
+  };
+
+  /** first page of the first doublewrite block (block_size() pages) */
+  page_id_t block1= page_id_t(0, 0); // (0, 0) = not initialised; see is_initialised()
+  /** first page of the second doublewrite block (block_size() pages) */
+  page_id_t block2= page_id_t(0, 0);
+
+  /** mutex protecting the data members below */
+  mysql_mutex_t mutex;
+  /** condition variable for !batch_running */
+  pthread_cond_t cond;
+  /** whether a batch is being written from the doublewrite buffer */
+  bool batch_running;
+  /** number of expected flush_buffered_writes_completed() calls */
+  unsigned flushing_buffered_writes;
+  /** pages submitted to flush_buffered_writes() */
+  ulint pages_submitted;
+  /** number of flush_buffered_writes_completed() calls */
+  ulint writes_completed;
+  /** number of pages written by flush_buffered_writes_completed() */
+  ulint pages_written;
+
+  slot slots[2]; // the two write batch slots
+  slot *active_slot= &slots[0]; // initially slots[0]; presumably alternates per batch -- TODO confirm
+
+  /** Initialize the doublewrite buffer data structure.
+  @param header   doublewrite page header in the TRX_SYS page */
+  inline void init(const byte *header);
+
+  /** Flush possible buffered writes to persistent storage (private overload). */
+  bool flush_buffered_writes(const ulint size); // NOTE(review): size and return value undocumented here
+
+public:
+  /** Create or restore the doublewrite buffer in the TRX_SYS page.
+  @return whether the operation succeeded */
+  bool create();
+  /** Free the doublewrite buffer. */
+  void close();
+
+  /** Acquire the mutex; submitted(), batches(), written() assert ownership */
+  void lock() { mysql_mutex_lock(&mutex); }
+  /** @return the number of submitted page writes */
+  ulint submitted() const
+  { mysql_mutex_assert_owner(&mutex); return pages_submitted; }
+  /** @return the number of completed batches */
+  ulint batches() const
+  { mysql_mutex_assert_owner(&mutex); return writes_completed; }
+  /** @return the number of final pages written */
+  ulint written() const
+  { mysql_mutex_assert_owner(&mutex); return pages_written; }
+  /** Release the mutex */
+  void unlock() { mysql_mutex_unlock(&mutex); }
+
+  /** Initialize the doublewrite buffer memory structure on recovery.
+  If we are upgrading from a version before MySQL 4.1, then this
+  function performs the necessary update operations to support
+  innodb_file_per_table. If we are in a crash recovery, this function
+  loads the pages from double write buffer into memory.
+  @param file File handle
+  @param path Path name of file
+  @return DB_SUCCESS or error code */
+  dberr_t init_or_load_pages(pfs_os_file_t file, const char *path);
+
+  /** Process and remove the double write buffer pages for all tablespaces. */
+  void recover();
+
+  /** Update the doublewrite buffer on data page write completion. */
+  void write_completed();
+  /** Flush possible buffered writes to persistent storage.
+  It is very important to call this function after a batch of writes has been
+  posted, and also when we may have to wait for a page latch!
+  Otherwise a deadlock of threads can occur. */
+  void flush_buffered_writes();
+  /** Update the doublewrite buffer on write batch completion
+  @param request  the completed batch write request */
+  void flush_buffered_writes_completed(const IORequest &request);
+
+  /** @return size of one doublewrite block in pages (FSP_EXTENT_SIZE) */
+  uint32_t block_size() const { return FSP_EXTENT_SIZE; }
+
+  /** Schedule a page write. If the doublewrite memory buffer is full,
+  flush_buffered_writes() will be invoked to make space.
+  @param request    asynchronous write request
+  @param size       payload size in bytes */
+  void add_to_batch(const IORequest &request, size_t size);
+
+  /** @return whether the doublewrite buffer is initialized (block1 is set) */
+  bool is_initialised() const
+  { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+
+  /** @return whether a page identifier is part of the doublewrite buffer */
+  bool is_inside(const page_id_t id) const
+  {
+    if (!is_initialised())
+      return false;
+    ut_ad(block1 < block2); // with block1 < block2, id < block1 rules out both blocks
+    if (id < block1)
+      return false;
+    const uint32_t size= block_size();
+    return id < block1 + size || (id >= block2 && id < block2 + size);
+  }
+
+  /** Wait for flush_buffered_writes() to be fully completed */
+  void wait_flush_buffered_writes()
+  {
+    if (is_initialised()) // no wait at all when not initialised
+    {
+      mysql_mutex_lock(&mutex);
+      while (batch_running) // predicate re-checked after each wakeup
+        my_cond_wait(&cond, &mutex.m_mutex);
+      mysql_mutex_unlock(&mutex);
+    }
+  }
 };
 
-#endif
+/** The doublewrite buffer */
+extern buf_dblwr_t buf_dblwr;
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
index 8a7ef95ef9c..485869007be 100644
--- a/storage/innobase/include/buf0dump.h
+++ b/storage/innobase/include/buf0dump.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -26,44 +27,18 @@ Created April 08, 2011 Vasil Dimov
 #ifndef buf0dump_h
 #define buf0dump_h
 
-#include "univ.i"
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start();
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start();
 
-/*****************************************************************//**
-Wakes up the buffer pool dump/load thread and instructs it to start
-a dump. This function is called by MySQL code via buffer_pool_dump_now()
-and it should return immediately because the whole MySQL is frozen during
-its execution. */
-void
-buf_dump_start();
-/*============*/
+/** Abort a currently running buffer pool load. */
+void buf_load_abort();
 
-/*****************************************************************//**
-Wakes up the buffer pool dump/load thread and instructs it to start
-a load. This function is called by MySQL code via buffer_pool_load_now()
-and it should return immediately because the whole MySQL is frozen during
-its execution. */
-void
-buf_load_start();
-/*============*/
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set. */
+void buf_load_at_startup();
 
-/*****************************************************************//**
-Aborts a currently running buffer pool load. This function is called by
-MySQL code via buffer_pool_load_abort() and it should return immediately
-because the whole MySQL is frozen during its execution. */
-void
-buf_load_abort();
-/*============*/
-
-/*****************************************************************//**
-This is the main thread for buffer pool dump/load. It waits for an
-event and when waked up either performs a dump or load and sleeps
-again.
-@return this function does not return, it calls os_thread_exit() */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_dump_thread)(
-/*============================*/
-	void*	arg);				/*!< in: a dummy parameter
-						required by os_thread_create */
+/** Wait for currently running loads/dumps to finish. */
+void buf_load_dump_end();
 
 #endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index aa9eb78b8ef..10ead048fa6 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2018, MariaDB Corporation.
+Copyright (c) 2014, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -31,6 +31,12 @@ Created 11/5/1995 Heikki Tuuri
 #include "log0log.h"
 #include "buf0types.h"
 
+/** Number of pages flushed. Protected by buf_pool.mutex. */
+extern ulint buf_flush_page_count;
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_flush_page_count. */
+extern ulint buf_lru_flush_page_count;
+
 /** Flag indicating if the page_cleaner is in active state. */
 extern bool buf_page_cleaner_is_active;
 
@@ -41,38 +47,27 @@ extern my_bool		innodb_page_cleaner_disabled_debug;
 
 #endif /* UNIV_DEBUG */
 
-/** Event to synchronise with the flushing. */
-extern os_event_t	buf_flush_event;
-
-class ut_stage_alter_t;
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id    tablespace identifier */
+void buf_flush_remove_pages(ulint id);
 
-/** Handled page counters for a single flush */
-struct flush_counters_t {
-	ulint	flushed;	/*!< number of dirty pages flushed */
-	ulint	evicted;	/*!< number of clean pages evicted */
-	ulint	unzip_LRU_evicted;/*!< number of uncompressed page images
-				evicted */
-};
-
-/********************************************************************//**
-Remove a block from the flush list of modified blocks. */
-void
-buf_flush_remove(
-/*=============*/
-	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
 /*******************************************************************//**
 Relocates a buffer control block on the flush_list.
 Note that it is assumed that the contents of bpage has already been
 copied to dpage. */
+ATTRIBUTE_COLD
 void
 buf_flush_relocate_on_flush_list(
 /*=============================*/
 	buf_page_t*	bpage,	/*!< in/out: control block being moved */
 	buf_page_t*	dpage);	/*!< in/out: destination block */
-/** Update the flush system data structures when a write is completed.
-@param[in,out]	bpage	flushed page
-@param[in]	dblwr	whether the doublewrite buffer was used */
-void buf_flush_write_complete(buf_page_t* bpage, bool dblwr);
+
+/** Complete write of a file page from buf_pool.
+@param request write request */
+void buf_page_write_complete(const IORequest &request);
 
 /** Assign the full crc32 checksum for non-compressed page.
 @param[in,out]	page	page to be updated */
@@ -82,97 +77,37 @@ void buf_flush_assign_full_crc32_checksum(byte* page);
 @param[in]	block			buffer block; NULL if bypassing the buffer pool
 @param[in,out]	page			page frame
 @param[in,out]	page_zip_		compressed page, or NULL if uncompressed
-@param[in]	newest_lsn		newest modification LSN to the page
 @param[in]	use_full_checksum	whether tablespace uses full checksum */
 void
 buf_flush_init_for_writing(
 	const buf_block_t*	block,
 	byte*			page,
 	void*			page_zip_,
-	lsn_t			newest_lsn,
 	bool			use_full_checksum);
 
-# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: buf_pool->mutex and block->mutex must be held upon entering this
-function, and they will be released by this function after flushing.
-This is loosely based on buf_flush_batch() and buf_flush_page().
-@return TRUE if the page was flushed and the mutexes released */
-ibool
-buf_flush_page_try(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_block_t*	block)		/*!< in/out: buffer control block */
-	MY_ATTRIBUTE((warn_unused_result));
-# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-/** Do flushing batch of a given type.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@param[in,out]	buf_pool	buffer pool instance
-@param[in]	type		flush type
-@param[in]	min_n		wished minimum mumber of blocks flushed
-(it is not guaranteed that the actual number is that big, though)
-@param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored
-@param[out]	n		the number of pages which were processed is
-passed back to caller. Ignored if NULL
-@retval true	if a batch was queued successfully.
-@retval false	if another batch of same type was already running. */
-bool
-buf_flush_do_batch(
-	buf_pool_t*		buf_pool,
-	buf_flush_t		type,
-	ulint			min_n,
-	lsn_t			lsn_limit,
-	flush_counters_t*	n);
-
-/** This utility flushes dirty blocks from the end of the flush list of all
-buffer pool instances.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@param[in]	min_n		wished minimum mumber of blocks flushed (it is
-not guaranteed that the actual number is that big, though)
-@param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
-oldest_modification is smaller than this should be flushed (if their number
-does not exceed min_n), otherwise ignored
-@param[out]	n_processed	the number of pages which were processed is
-passed back to caller. Ignored if NULL.
-@return true if a batch was queued successfully for each buffer pool
-instance. false if another batch of same type was already running in
-at least one of the buffer pool instance */
-bool
-buf_flush_lists(
-	ulint			min_n,
-	lsn_t			lsn_limit,
-	ulint*			n_processed);
-
-/******************************************************************//**
-This function picks up a single page from the tail of the LRU
-list, flushes it (if it is dirty), removes it from page_hash and LRU
-list and puts it on the free list. It is called from user threads when
-they are unable to find a replaceable page at the tail of the LRU
-list i.e.: when the background LRU flushing in the page_cleaner thread
-is not fast enough to keep pace with the workload.
-@return true if success. */
-bool
-buf_flush_single_page_from_LRU(
-/*===========================*/
-	buf_pool_t*	buf_pool);	/*!< in/out: buffer pool instance */
-/******************************************************************//**
-Waits until a flush batch of the given type ends */
-void
-buf_flush_wait_batch_end(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_flush_t	type);		/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
-/**
-Waits until a flush batch of the given lsn ends
-@param[in]	new_oldest	target oldest_modified_lsn to wait for */
-
-void
-buf_flush_wait_flushed(
-	lsn_t		new_oldest);
+/** Try to flush dirty pages that belong to a given tablespace.
+@param space       tablespace
+@param n_flushed   number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Write out dirty blocks from buf_pool.LRU.
+@param max_n    wished maximum number of blocks flushed
+@return the number of processed pages
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n);
+
+/** Wait until a flush batch ends.
+@param lru    true=buf_pool.LRU; false=buf_pool.flush_list */
+void buf_flush_wait_batch_end(bool lru);
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn   buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious  true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
 
 /********************************************************************//**
 This function should be called at a mini-transaction commit, if a page was
@@ -185,217 +120,30 @@ buf_flush_note_modification(
 	buf_block_t*	block,		/*!< in: block which is modified */
 	lsn_t		start_lsn,	/*!< in: start lsn of the first mtr in a
 					set of mtr's */
-	lsn_t		end_lsn,	/*!< in: end lsn of the last mtr in the
+	lsn_t		end_lsn);	/*!< in: end lsn of the last mtr in the
 					set of mtr's */
-	FlushObserver*	observer);	/*!< in: flush observer */
-/********************************************************************//**
-Returns TRUE if the file page block is immediately suitable for replacement,
-i.e., transition FILE_PAGE => NOT_USED allowed.
-@return TRUE if can replace immediately */
-ibool
-buf_flush_ready_for_replace(
-/*========================*/
-	buf_page_t*	bpage);	/*!< in: buffer control block, must be
-				buf_page_in_file(bpage) and in the LRU list */
-
-#ifdef UNIV_DEBUG
-/** Disables page cleaner threads (coordinator and workers).
-It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0).
-@param[in]	save		immediate result from check function */
-void buf_flush_page_cleaner_disabled_debug_update(THD*,
-						  st_mysql_sys_var*, void*,
-						  const void* save);
-#endif /* UNIV_DEBUG */
-
-/******************************************************************//**
-page_cleaner thread tasked with flushing dirty pages from the buffer
-pools. As of now we'll have only one coordinator of this thread.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(
-/*===============================================*/
-	void*	arg);		/*!< in: a dummy parameter required by
-				os_thread_create */
-
-/** Adjust thread count for page cleaner workers.
-@param[in]	new_cnt		Number of threads to be used */
-void
-buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt);
 
-/******************************************************************//**
-Worker thread of page_cleaner.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(buf_flush_page_cleaner_worker)(
-/*==========================================*/
-	void*	arg);		/*!< in: a dummy parameter required by
-				os_thread_create */
 /** Initialize page_cleaner. */
-void
-buf_flush_page_cleaner_init(void);
-
-/** Wait for any possible LRU flushes that are in progress to end. */
-void
-buf_flush_wait_LRU_batch_end(void);
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-ibool
-buf_flush_validate(
-/*===============*/
-	buf_pool_t*	buf_pool);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+/** Wait for pending flushes to complete. */
+void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
 
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void);
-/*==========================*/
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool();
 
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void);
-/*==========================*/
-
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: in simulated aio we must call
-os_aio_simulated_wake_handler_threads after we have posted a batch of
-writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
-held upon entering this function, and they will be released by this
-function.
-@return TRUE if page was flushed */
-ibool
-buf_flush_page(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in: buffer control block */
-	buf_flush_t	flush_type,	/*!< in: type of flush */
-	bool		sync);		/*!< in: true if sync IO request */
-/********************************************************************//**
-Returns true if the block is modified and ready for flushing.
-@return true if can flush immediately */
-bool
-buf_flush_ready_for_flush(
-/*======================*/
-	buf_page_t*	bpage,	/*!< in: buffer control block, must be
-				buf_page_in_file(bpage) */
-	buf_flush_t	flush_type)/*!< in: type of flush */
-	MY_ATTRIBUTE((warn_unused_result));
-
-/******************************************************************//**
-Check if there are any dirty pages that belong to a space id in the flush
-list in a particular buffer pool.
-@return number of dirty pages present in a single buffer pool */
-ulint
-buf_pool_get_dirty_pages_count(
-/*===========================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
-	ulint		id,		/*!< in: space id to check */
-	FlushObserver*	observer);	/*!< in: flush observer to check */
-
-/*******************************************************************//**
-Synchronously flush dirty blocks from the end of the flush list of all buffer
-pool instances.
-NOTE: The calling thread is not allowed to own any latches on pages! */
-void
-buf_flush_sync_all_buf_pools(void);
-/*==============================*/
-
-/** Request IO burst and wake page_cleaner up.
-@param[in]	lsn_limit	upper limit of LSN to be flushed */
-void
-buf_flush_request_force(
-	lsn_t	lsn_limit);
-
-/** We use FlushObserver to track flushing of non-redo logged pages in bulk
-create index(BtrBulk.cc).Since we disable redo logging during a index build,
-we need to make sure that all dirty pages modifed by the index build are
-flushed to disk before any redo logged operations go to the index. */
-
-class FlushObserver {
-public:
-	/** Constructor
-	@param[in,out]	space		tablespace
-	@param[in]	trx		trx instance
-	@param[in]	stage		performance schema accounting object,
-	used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
-	for accounting. */
-	FlushObserver(fil_space_t* space, trx_t* trx, ut_stage_alter_t* stage);
-
-	/** Deconstructor */
-	~FlushObserver();
-
-	/** Check pages have been flushed and removed from the flush list
-	in a buffer pool instance.
-	@param[in]	instance_no	buffer pool instance no
-	@return true if the pages were removed from the flush list */
-	bool is_complete(ulint	instance_no)
-	{
-		return(m_flushed->at(instance_no) == m_removed->at(instance_no)
-		       || m_interrupted);
-	}
-
-	/** @return whether to flush only some pages of the tablespace */
-	bool is_partial_flush() const { return m_stage != NULL; }
-
-	/** @return whether the operation was interrupted */
-	bool is_interrupted() const { return m_interrupted; }
-
-	/** Interrupt observer not to wait. */
-	void interrupted()
-	{
-		m_interrupted = true;
-	}
-
-	/** Check whether the operation has been interrupted */
-	void check_interrupted();
-
-	/** Flush dirty pages. */
-	void flush();
-	/** Notify observer of flushing a page
-	@param[in]	buf_pool	buffer pool instance
-	@param[in]	bpage		buffer page to flush */
-	void notify_flush(
-		buf_pool_t*	buf_pool,
-		buf_page_t*	bpage);
-
-	/** Notify observer of removing a page from flush list
-	@param[in]	buf_pool	buffer pool instance
-	@param[in]	bpage		buffer page flushed */
-	void notify_remove(
-		buf_pool_t*	buf_pool,
-		buf_page_t*	bpage);
-private:
-	/** Tablespace */
-	fil_space_t*		m_space;
-
-	/** Trx instance */
-	const trx_t* const	m_trx;
-
-	/** Performance schema accounting object, used by ALTER TABLE.
-	If not NULL, then stage->begin_phase_flush() will be called initially,
-	specifying the number of pages to be attempted to be flushed and
-	subsequently, stage->inc() will be called for each page we attempt to
-	flush. */
-	ut_stage_alter_t*	m_stage;
-
-	/* Flush request sent */
-	std::vector<ulint>*	m_flushed;
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+void buf_flush_validate();
+#endif /* UNIV_DEBUG */
 
-	/* Flush request finished */
-	std::vector<ulint>*	m_removed;
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn);
 
-	/* True if the operation was interrupted. */
-	bool			m_interrupted;
-};
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync();
 
 #include "buf0flu.inl"
 
diff --git a/storage/innobase/include/buf0flu.inl b/storage/innobase/include/buf0flu.inl
index 02f3d8ced57..b8a9b6d1f5d 100644
--- a/storage/innobase/include/buf0flu.inl
+++ b/storage/innobase/include/buf0flu.inl
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -23,19 +24,9 @@ The database buffer pool flush algorithm
 Created 11/5/1995 Heikki Tuuri
 *******************************************************/
 
+#include "assume_aligned.h"
 #include "buf0buf.h"
-#include "mtr0mtr.h"
 #include "srv0srv.h"
-#include "fsp0types.h"
-
-/********************************************************************//**
-Inserts a modified block into the flush list. */
-void
-buf_flush_insert_into_flush_list(
-/*=============================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	buf_block_t*	block,		/*!< in/out: block which is modified */
-	lsn_t		lsn);		/*!< in: oldest modification */
 
 /********************************************************************//**
 This function should be called at a mini-transaction commit, if a page was
@@ -48,33 +39,28 @@ buf_flush_note_modification(
 	buf_block_t*	block,		/*!< in: block which is modified */
 	lsn_t		start_lsn,	/*!< in: start lsn of the mtr that
 					modified this block */
-	lsn_t		end_lsn,	/*!< in: end lsn of the mtr that
+	lsn_t		end_lsn)	/*!< in: end lsn of the mtr that
 					modified this block */
-	FlushObserver*	observer)	/*!< in: flush observer */
 {
-	mutex_enter(&block->mutex);
-	ut_ad(!srv_read_only_mode
-	      || fsp_is_system_temporary(block->page.id.space()));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(block->page.buf_fix_count > 0);
-	ut_ad(block->page.newest_modification <= end_lsn);
-	block->page.newest_modification = end_lsn;
-
-	/* Don't allow to set flush observer from non-null to null,
-	or from one observer to another. */
-	ut_ad(block->page.flush_observer == NULL
-	      || block->page.flush_observer == observer);
-	block->page.flush_observer = observer;
+	ut_ad(!srv_read_only_mode);
+	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.buf_fix_count());
+	ut_ad(mach_read_from_8(block->frame + FIL_PAGE_LSN) <= end_lsn);
+	mach_write_to_8(block->frame + FIL_PAGE_LSN, end_lsn);
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+		memcpy_aligned<8>(FIL_PAGE_LSN + block->page.zip.data,
+				  FIL_PAGE_LSN + block->frame, 8);
+	}
 
-	if (block->page.oldest_modification == 0) {
-		buf_pool_t*	buf_pool = buf_pool_from_block(block);
+	const lsn_t oldest_modification = block->page.oldest_modification();
 
-		buf_flush_insert_into_flush_list(buf_pool, block, start_lsn);
+	if (oldest_modification > 1) {
+		ut_ad(oldest_modification <= start_lsn);
+	} else if (fsp_is_system_temporary(block->page.id().space())) {
+		block->page.set_temp_modified();
 	} else {
-		ut_ad(block->page.oldest_modification <= start_lsn);
+		buf_pool.insert_into_flush_list(block, start_lsn);
 	}
 
-	mutex_exit(&block->mutex);
-
 	srv_stats.buf_pool_write_requests.inc();
 }
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index e7707ffd6dc..540c14a49c9 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -34,14 +34,8 @@ Created 11/5/1995 Heikki Tuuri
 struct trx_t;
 struct fil_space_t;
 
-/******************************************************************//**
-Returns TRUE if less than 25 % of the buffer pool is available. This can be
-used in heuristics to prevent huge transactions eating up the whole buffer
-pool for their locks.
-@return TRUE if less than 25 % of buffer pool left */
-ibool
-buf_LRU_buf_pool_running_out(void);
-/*==============================*/
+/** Flush this many pages in buf_LRU_get_free_block() */
+extern size_t innodb_lru_flush_size;
 
 /*#######################################################################
 These are low-level functions
@@ -50,103 +44,59 @@ These are low-level functions
 /** Minimum LRU list length for which the LRU_old pointer is defined */
 #define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
 
-/** Empty the flush list for all pages belonging to a tablespace.
-@param[in]	id		tablespace identifier
-@param[in,out]	observer	flush observer,
-				or NULL if nothing is to be written
-@param[in]	first		first page to be flushed or evicted */
-void buf_LRU_flush_or_remove_pages(ulint id, FlushObserver* observer,
-				   ulint first = 0);
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage      block to be freed
+@param zip        whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+  MY_ATTRIBUTE((nonnull));
+
+/** Try to free a replaceable block.
+@param limit  maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/********************************************************************//**
-Insert a compressed block into buf_pool->zip_clean in the LRU order. */
-void
-buf_LRU_insert_zip_clean(
-/*=====================*/
-	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+/** @return a buffer block from the buf_pool.free list
+@retval	NULL	if the free list is empty */
+buf_block_t* buf_LRU_get_free_only();
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
 
-/******************************************************************//**
-Try to free a block.  If bpage is a descriptor of a compressed-only
-page, the descriptor object will be freed as well.
-
-NOTE: If this function returns true, it will temporarily
-release buf_pool->mutex.  Furthermore, the page frame will no longer be
-accessible via bpage.
-
-The caller must hold buf_pool->mutex and must not hold any
-buf_page_get_mutex() when calling this function.
-@return true if freed, false otherwise. */
-bool
-buf_LRU_free_page(
-/*==============*/
-	buf_page_t*	bpage,	/*!< in: block to be freed */
-	bool		zip)	/*!< in: true if should remove also the
-				compressed page of an uncompressed page */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************************//**
-Try to free a replaceable block.
-@return true if found and freed */
-bool
-buf_LRU_scan_and_free_block(
-/*========================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	bool		scan_all)	/*!< in: scan whole LRU list
-					if true, otherwise scan only
-					'old' blocks. */
-	MY_ATTRIBUTE((nonnull,warn_unused_result));
-/******************************************************************//**
-Returns a free block from the buf_pool.  The block is taken off the
-free list.  If it is empty, returns NULL.
-@return a free control block, or NULL if the buf_block->free list is empty */
-buf_block_t*
-buf_LRU_get_free_only(
-/*==================*/
-	buf_pool_t*	buf_pool);	/*!< buffer pool instance */
-/******************************************************************//**
-Returns a free block from the buf_pool. The block is taken off the
-free list. If free list is empty, blocks are moved from the end of the
-LRU list to the free list.
 This function is called from a user thread when it needs a clean
 block to read in a page. Note that we only ever get a block from
 the free list. Even when we flush a page or find a page in LRU scan
 we put it to free list to be used.
 * iteration 0:
-  * get a block from free list, success:done
-  * if buf_pool->try_LRU_scan is set
-    * scan LRU up to srv_LRU_scan_depth to find a clean block
-    * the above will put the block on free list
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
     * success:retry the free list
-  * flush one dirty page from tail of LRU to disk
-    * the above will put the block on free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on the buf_pool.free list
     * success: retry the free list
-* iteration 1:
-  * same as iteration 0 except:
-    * scan whole LRU list
-    * scan LRU list even if buf_pool->try_LRU_scan is not set
-* iteration > 1:
-  * same as iteration 1 but sleep 10ms
-@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
-buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
-	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
-	MY_ATTRIBUTE((nonnull,warn_unused_result));
-/******************************************************************//**
-Determines if the unzip_LRU list should be used for evicting a victim
-instead of the general LRU list.
-@return TRUE if should use unzip_LRU */
-ibool
-buf_LRU_evict_from_unzip_LRU(
-/*=========================*/
-	buf_pool_t*	buf_pool);
-/******************************************************************//**
-Puts a block back to the free list. */
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex  whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+	MY_ATTRIBUTE((malloc,warn_unused_result));
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU();
+
+/** Puts a block back to the free list.
+@param[in]	block	block; not containing a file page */
 void
-buf_LRU_block_free_non_file_page(
-/*=============================*/
-	buf_block_t*	block);	/*!< in: block, must not contain a file page */
+buf_LRU_block_free_non_file_page(buf_block_t* block);
 /******************************************************************//**
 Adds a block to the LRU list. Please make sure that the page_size is
 already set when invoking the function, so that we can get correct
@@ -155,7 +105,7 @@ void
 buf_LRU_add_block(
 /*==============*/
 	buf_page_t*	bpage,	/*!< in: control block */
-	ibool		old);	/*!< in: TRUE if should be put to the old
+	bool		old);	/*!< in: true if should be put to the old
 				blocks in the LRU list, else put to the
 				start; if the LRU list is very short, added to
 				the start regardless of this parameter */
@@ -167,72 +117,48 @@ buf_unzip_LRU_add_block(
 	buf_block_t*	block,	/*!< in: control block */
 	ibool		old);	/*!< in: TRUE if should be put to the end
 				of the list, else put to the start */
-/******************************************************************//**
-Moves a block to the start of the LRU list. */
-void
-buf_LRU_make_block_young(
-/*=====================*/
-	buf_page_t*	bpage);	/*!< in: control block */
-/**********************************************************************//**
-Updates buf_pool->LRU_old_ratio.
+
+/** Update buf_pool.LRU_old_ratio.
+@param[in]	old_pct		Reserve this percentage of
+				the buffer pool for "old" blocks
+@param[in]	adjust		true=adjust the LRU list;
+				false=just assign buf_pool.LRU_old_ratio
+				during the initialization of InnoDB
 @return updated old_pct */
-uint
-buf_LRU_old_ratio_update(
-/*=====================*/
-	uint	old_pct,/*!< in: Reserve this percentage of
-			the buffer pool for "old" blocks. */
-	bool	adjust);/*!< in: true=adjust the LRU list;
-			false=just assign buf_pool->LRU_old_ratio
-			during the initialization of InnoDB */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust);
 /********************************************************************//**
 Update the historical stats that we are collecting for LRU eviction
 policy at the end of each interval. */
 void
-buf_LRU_stat_update(void);
-/*=====================*/
+buf_LRU_stat_update();
 
 /** Remove one page from LRU list and put it to free list.
-@param[in,out]	bpage		block, must contain a file page and be in
-				a freeable state; there may or may not be a
-				hash index to the page
-@param[in]	old_page_id	page number before bpage->id was invalidated */
-void buf_LRU_free_one_page(buf_page_t* bpage, page_id_t old_page_id)
-	MY_ATTRIBUTE((nonnull));
-
-/******************************************************************//**
-Adjust LRU hazard pointers if needed. */
-void
-buf_LRU_adjust_hp(
-/*==============*/
-	buf_pool_t*		buf_pool,/*!< in: buffer pool instance */
-	const buf_page_t*	bpage);	/*!< in: control block */
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/**********************************************************************//**
-Validates the LRU list.
-@return TRUE */
-ibool
-buf_LRU_validate(void);
-/*==================*/
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/**********************************************************************//**
-Prints the LRU list. */
-void
-buf_LRU_print(void);
-/*===============*/
-#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+@param bpage     file page to be freed
+@param id        page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here) */
+void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
+                           page_hash_latch *hash_lock)
+  MY_ATTRIBUTE((nonnull));
+
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
 
 /** @name Heuristics for detecting index scan @{ */
-/** The denominator of buf_pool->LRU_old_ratio. */
+/** The denominator of buf_pool.LRU_old_ratio. */
 #define BUF_LRU_OLD_RATIO_DIV	1024
-/** Maximum value of buf_pool->LRU_old_ratio.
+/** Maximum value of buf_pool.LRU_old_ratio.
 @see buf_LRU_old_adjust_len
-@see buf_pool->LRU_old_ratio_update */
+@see buf_pool.LRU_old_ratio_update */
 #define BUF_LRU_OLD_RATIO_MAX	BUF_LRU_OLD_RATIO_DIV
-/** Minimum value of buf_pool->LRU_old_ratio.
+/** Minimum value of buf_pool.LRU_old_ratio.
 @see buf_LRU_old_adjust_len
-@see buf_pool->LRU_old_ratio_update
+@see buf_pool.LRU_old_ratio_update
 The minimum must exceed
 (BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
 #define BUF_LRU_OLD_RATIO_MIN	51
@@ -253,7 +179,7 @@ extern uint	buf_LRU_old_threshold_ms;
 
 These statistics are not 'of' LRU but 'for' LRU.  We keep count of I/O
 and page_zip_decompress() operations.  Based on the statistics we decide
-if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */
+if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */
 struct buf_LRU_stat_t
 {
 	ulint	io;	/**< Counter of buffer pool I/O operations. */
@@ -265,7 +191,7 @@ Cleared by buf_LRU_stat_update(). */
 extern buf_LRU_stat_t	buf_LRU_stat_cur;
 
 /** Running sum of past values of buf_LRU_stat_cur.
-Updated by buf_LRU_stat_update().  Protected by buf_pool->mutex. */
+Updated by buf_LRU_stat_update().  Protected by buf_pool.mutex. */
 extern buf_LRU_stat_t	buf_LRU_stat_sum;
 
 /********************************************************************//**
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index ff0ba474bb3..8d6b28194dc 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2019, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -46,11 +46,12 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
 released by the i/o-handler thread.
+@param[in,out]	space		tablespace
 @param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	sync		true if synchronous aio is desired */
-void
-buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync);
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0 */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+                              ulint zip_size)
+  MY_ATTRIBUTE((nonnull));
 
 /** Applies a random read-ahead in buf_pool if there are at least a threshold
 value of accessed pages from the random read-ahead area. Does not read any
@@ -100,44 +101,12 @@ which could result in a deadlock if the OS does not support asynchronous io.
 ulint
 buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
 
-/********************************************************************//**
-Issues read requests for pages which the ibuf module wants to read in, in
-order to contract the insert buffer tree. Technically, this function is like
-a read-ahead function. */
-void
-buf_read_ibuf_merge_pages(
-/*======================*/
-	bool		sync,		/*!< in: true if the caller
-					wants this function to wait
-					for the highest address page
-					to get read in, before this
-					function returns */
-	const ulint*	space_ids,	/*!< in: array of space ids */
-	const ulint*	page_nos,	/*!< in: array of page numbers
-					to read, with the highest page
-					number the last in the
-					array */
-	ulint		n_stored);	/*!< in: number of elements
-					in the arrays */
-
 /** Issues read requests for pages which recovery wants to read in.
-@param[in]	sync		true if the caller wants this function to wait
-for the highest address page to get read in, before this function returns
 @param[in]	space_id	tablespace id
 @param[in]	page_nos	array of page numbers to read, with the
 highest page number the last in the array
-@param[in]	n_stored	number of page numbers in the array */
-
-void
-buf_read_recv_pages(
-	bool		sync,
-	ulint		space_id,
-	const ulint*	page_nos,
-	ulint		n_stored);
-
-/** The size in pages of the area which the read-ahead algorithms read if
-invoked */
-#define	BUF_READ_AHEAD_AREA(b)		((b)->read_ahead_area)
+@param[in]	n		number of page numbers in the array */
+void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n);
 
 /** @name Modes used in read-ahead @{ */
 /** read only pages belonging to the insert buffer tree */
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 5532a524782..5dd581097f9 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,39 +27,20 @@ Created 11/17/1995 Heikki Tuuri
 #ifndef buf0types_h
 #define buf0types_h
 
-#include "os0event.h"
-#include "ut0ut.h"
+#include "univ.i"
 
 /** Buffer page (uncompressed or compressed) */
 class buf_page_t;
 /** Buffer block for which an uncompressed page exists */
 struct buf_block_t;
-/** Buffer pool chunk comprising buf_block_t */
-struct buf_chunk_t;
-/** Buffer pool comprising buf_chunk_t */
-struct buf_pool_t;
 /** Buffer pool statistics struct */
 struct buf_pool_stat_t;
 /** Buffer pool buddy statistics struct */
 struct buf_buddy_stat_t;
-/** Doublewrite memory struct */
-struct buf_dblwr_t;
-/** Flush observer for bulk create index */
-class FlushObserver;
 
 /** A buffer frame. @see page_t */
 typedef	byte	buf_frame_t;
 
-/** Flags for flush types */
-enum buf_flush_t {
-	BUF_FLUSH_LRU = 0,		/*!< flush via the LRU list */
-	BUF_FLUSH_LIST,			/*!< flush via the flush list
-					of dirty blocks */
-	BUF_FLUSH_SINGLE_PAGE,		/*!< flush via the LRU list
-					but only a single page */
-	BUF_FLUSH_N_TYPES		/*!< index of last element + 1  */
-};
-
 /** Flags for io_fix types */
 enum buf_io_fix {
 	BUF_IO_NONE = 0,		/**< no pending I/O */
@@ -135,98 +116,110 @@ this must be equal to srv_page_size */
 /* @} */
 
 /** Page identifier. */
-class page_id_t {
+class page_id_t
+{
 public:
-
-	/** Constructor from (space, page_no).
-	@param[in]	space	tablespace id
-	@param[in]	page_no	page number */
-	page_id_t(ulint space, ulint page_no)
-		: m_space(uint32_t(space)), m_page_no(uint32(page_no))
-	{
-		ut_ad(space <= 0xFFFFFFFFU);
-		ut_ad(page_no <= 0xFFFFFFFFU);
-	}
-
-	bool operator==(const page_id_t& rhs) const
-	{
-		return m_space == rhs.m_space && m_page_no == rhs.m_page_no;
-	}
-	bool operator!=(const page_id_t& rhs) const { return !(*this == rhs); }
-
-	bool operator<(const page_id_t& rhs) const
-	{
-		if (m_space == rhs.m_space) {
-			return m_page_no < rhs.m_page_no;
-		}
-
-		return m_space < rhs.m_space;
-	}
-
-	/** Retrieve the tablespace id.
-	@return tablespace id */
-	uint32_t space() const { return m_space; }
-
-	/** Retrieve the page number.
-	@return page number */
-	uint32_t page_no() const { return m_page_no; }
-
-	/** Retrieve the fold value.
-	@return fold value */
-	ulint fold() const { return (m_space << 20) + m_space + m_page_no; }
-
-	/** Reset the page number only.
-	@param[in]	page_no	page number */
-	void set_page_no(ulint page_no)
-	{
-		m_page_no = uint32_t(page_no);
-
-		ut_ad(page_no <= 0xFFFFFFFFU);
-	}
-
-	/** Set the FIL_NULL for the space and page_no */
-	void set_corrupt_id()
-	{
-		m_space = m_page_no = ULINT32_UNDEFINED;
-	}
-
+  /** Constructor from (space, page_no).
+  @param[in]	space	tablespace id
+  @param[in]	page_no	page number */
+  page_id_t(ulint space, uint32_t page_no) : m_id(uint64_t{space} << 32 | page_no)
+  {
+    ut_ad(space <= 0xFFFFFFFFU);
+  }
+
+  page_id_t(uint64_t id) : m_id(id) {}
+  bool operator==(const page_id_t& rhs) const { return m_id == rhs.m_id; }
+  bool operator!=(const page_id_t& rhs) const { return m_id != rhs.m_id; }
+  bool operator<(const page_id_t& rhs) const { return m_id < rhs.m_id; }
+  bool operator>(const page_id_t& rhs) const { return m_id > rhs.m_id; }
+  bool operator<=(const page_id_t& rhs) const { return m_id <= rhs.m_id; }
+  bool operator>=(const page_id_t& rhs) const { return m_id >= rhs.m_id; }
+  page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; }
+  page_id_t &operator++()
+  {
+    ut_ad(page_no() < 0xFFFFFFFFU);
+    m_id++;
+    return *this;
+  }
+  page_id_t operator-(uint32_t i) const
+  {
+    ut_ad(page_no() >= i);
+    return page_id_t(m_id - i);
+  }
+  page_id_t operator+(uint32_t i) const
+  {
+    ut_ad(page_no() < ~i);
+    return page_id_t(m_id + i);
+  }
+
+  /** Retrieve the tablespace id.
+  @return tablespace id */
+  uint32_t space() const { return static_cast<uint32_t>(m_id >> 32); }
+
+  /** Retrieve the page number.
+  @return page number */
+  uint32_t page_no() const { return static_cast<uint32_t>(m_id); }
+
+  /** Retrieve the fold value.
+  @return fold value */
+  ulint fold() const { return (space() << 20) + space() + page_no(); }
+
+  /** Reset the page number only.
+  @param[in]	page_no	page number */
+  void set_page_no(uint32_t page_no)
+  {
+    m_id= (m_id & ~uint64_t{0} << 32) | page_no;
+  }
+
+  ulonglong raw() { return m_id; }
 private:
-
-	/** Tablespace id. */
-	uint32_t	m_space;
-
-	/** Page number. */
-	uint32_t	m_page_no;
-
-	/** Declare the overloaded global operator<< as a friend of this
-	class. Refer to the global declaration for further details.  Print
-	the given page_id_t object.
-	@param[in,out]	out	the output stream
-	@param[in]	page_id	the page_id_t object to be printed
-	@return the output stream */
-        friend
-        std::ostream&
-        operator<<(
-                std::ostream&           out,
-                const page_id_t        page_id);
+  /** The page identifier */
+  uint64_t m_id;
 };
 
-/** A field reference full of zero, for use in assertions and checks,
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
 and dummy default values of instantly dropped columns.
-Initially, BLOB field references are set to zero, in
+Initially, BLOB field references are set to NUL bytes, in
 dtuple_convert_big_rec(). */
-extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX];
+extern const byte *field_ref_zero;
 
 #ifndef UNIV_INNOCHECKSUM
 
 #include "ut0mutex.h"
 #include "sync0rw.h"
+#include "rw_lock.h"
+
+class page_hash_latch : public rw_lock
+{
+public:
+  /** Wait for a shared lock */
+  void read_lock_wait();
+  /** Wait for an exclusive lock */
+  void write_lock_wait();
+
+  /** Acquire a shared lock */
+  inline void read_lock();
+  /** Acquire an exclusive lock */
+  inline void write_lock();
+
+  /** Acquire a lock */
+  template<bool exclusive> void acquire()
+  {
+    if (exclusive)
+      write_lock();
+    else
+      read_lock();
+  }
+  /** Release a lock */
+  template<bool exclusive> void release()
+  {
+    if (exclusive)
+      write_unlock();
+    else
+      read_unlock();
+  }
+};
 
-typedef ib_bpmutex_t BPageMutex;
-typedef ib_mutex_t BufPoolMutex;
-typedef ib_mutex_t FlushListMutex;
-typedef BPageMutex BufPoolZipMutex;
-typedef rw_lock_t BPageLock;
 #endif /* !UNIV_INNOCHECKSUM */
 
 #endif /* buf0types.h */
diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl
index 39ade7b1e09..2d1bf5a2d50 100644
--- a/storage/innobase/include/data0data.inl
+++ b/storage/innobase/include/data0data.inl
@@ -62,7 +62,6 @@ spatial_status_t
 dfield_get_spatial_status(
 	const dfield_t*	field)
 {
-	ut_ad(field);
 	ut_ad(dfield_is_ext(field));
 
 	return(static_cast<spatial_status_t>(field->spatial_status));
@@ -77,10 +76,8 @@ dfield_set_spatial_status(
 	dfield_t*		field,
 	spatial_status_t	spatial_status)
 {
-	ut_ad(field);
-	ut_ad(dfield_is_ext(field));
-
-	field->spatial_status = spatial_status;
+	field->spatial_status = spatial_status & 3;
+	ut_ad(dfield_get_spatial_status(field) == spatial_status);
 }
 
 /*********************************************************************//**
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
index 3523a487d74..9528443e7a8 100644
--- a/storage/innobase/include/data0type.h
+++ b/storage/innobase/include/data0type.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -181,7 +181,8 @@ be less than 256 */
 				BLOB columns.
 */
 #define DATA_GIS_MBR	2048U	/* Used as GIS MBR column */
-#define DATA_MBR_LEN	SPDIMS * 2 * sizeof(double) /* GIS MBR length*/
+/** the size of a GIS minimum bounding rectangle */
+constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
 
 #define	DATA_LONG_TRUE_VARCHAR 4096U	/* this is ORed to the precise data
 				type when the column is true VARCHAR where
@@ -330,9 +331,9 @@ dtype_get_mblen(
 /*============*/
 	ulint	mtype,		/*!< in: main type */
 	ulint	prtype,		/*!< in: precise type (and collation) */
-	ulint*	mbminlen,	/*!< out: minimum length of a
+	unsigned* mbminlen,	/*!< out: minimum length of a
 				multi-byte character */
-	ulint*	mbmaxlen);	/*!< out: maximum length of a
+	unsigned* mbmaxlen);	/*!< out: maximum length of a
 				multi-byte character */
 /**
 Get the charset-collation code for string types.
@@ -399,7 +400,7 @@ dtype_get_mbmaxlen(
 Returns the size of a fixed size data type, 0 if not a fixed size type.
 @return fixed size, or 0 */
 UNIV_INLINE
-ulint
+unsigned
 dtype_get_fixed_size_low(
 /*=====================*/
 	ulint	mtype,		/*!< in: main type */
@@ -415,7 +416,7 @@ dtype_get_fixed_size_low(
 Returns the minimum size of a data type.
 @return minimum size */
 UNIV_INLINE
-ulint
+unsigned
 dtype_get_min_size_low(
 /*===================*/
 	ulint	mtype,		/*!< in: main type */
diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl
index 037a71a9345..b81b68e69e9 100644
--- a/storage/innobase/include/data0type.inl
+++ b/storage/innobase/include/data0type.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -72,9 +72,9 @@ dtype_get_mblen(
 /*============*/
 	ulint	mtype,		/*!< in: main type */
 	ulint	prtype,		/*!< in: precise type (and collation) */
-	ulint*	mbminlen,	/*!< out: minimum length of a
+	unsigned*mbminlen,	/*!< out: minimum length of a
 				multi-byte character */
-	ulint*	mbmaxlen)	/*!< out: maximum length of a
+	unsigned*mbmaxlen)	/*!< out: maximum length of a
 				multi-byte character */
 {
 	if (dtype_is_string_type(mtype)) {
@@ -96,12 +96,11 @@ dtype_set_mblen(
 /*============*/
 	dtype_t*	type)	/*!< in/out: type */
 {
-	ulint	mbminlen;
-	ulint	mbmaxlen;
+	unsigned mbminlen, mbmaxlen;
 
 	dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen);
-	type->mbminlen = mbminlen;
-	type->mbmaxlen = mbmaxlen;
+	type->mbminlen = mbminlen & 7;
+	type->mbmaxlen = mbmaxlen & 7;
 
 	ut_ad(dtype_validate(type));
 }
@@ -120,9 +119,9 @@ dtype_set(
 	ut_ad(type);
 	ut_ad(mtype <= DATA_MTYPE_MAX);
 
-	type->mtype = unsigned(mtype);
-	type->prtype = unsigned(prtype);
-	type->len = unsigned(len);
+	type->mtype = static_cast<byte>(mtype);
+	type->prtype = static_cast<unsigned>(prtype);
+	type->len = static_cast<uint16_t>(len);
 
 	dtype_set_mblen(type);
 }
@@ -429,7 +428,7 @@ dtype_sql_name(
 Returns the size of a fixed size data type, 0 if not a fixed size type.
 @return fixed size, or 0 */
 UNIV_INLINE
-ulint
+unsigned
 dtype_get_fixed_size_low(
 /*=====================*/
 	ulint	mtype,		/*!< in: main type */
@@ -465,15 +464,15 @@ dtype_get_fixed_size_low(
 	case DATA_INT:
 	case DATA_FLOAT:
 	case DATA_DOUBLE:
-		return(len);
+		return static_cast<unsigned>(len);
 	case DATA_MYSQL:
 		if (prtype & DATA_BINARY_TYPE) {
-			return(len);
+			return static_cast<unsigned>(len);
 		} else if (!comp) {
-			return(len);
+			return static_cast<unsigned>(len);
 		} else {
 #ifdef UNIV_DEBUG
-			ulint	i_mbminlen, i_mbmaxlen;
+			unsigned i_mbminlen, i_mbmaxlen;
 
 			innobase_get_cset_width(
 				dtype_get_charset_coll(prtype),
@@ -483,7 +482,7 @@ dtype_get_fixed_size_low(
 			ut_ad(i_mbmaxlen == mbmaxlen);
 #endif /* UNIV_DEBUG */
 			if (mbminlen == mbmaxlen) {
-				return(len);
+				return static_cast<unsigned>(len);
 			}
 		}
 		/* Treat as variable-length. */
@@ -506,7 +505,7 @@ dtype_get_fixed_size_low(
 Returns the minimum size of a data type.
 @return minimum size */
 UNIV_INLINE
-ulint
+unsigned
 dtype_get_min_size_low(
 /*===================*/
 	ulint	mtype,		/*!< in: main type */
@@ -539,20 +538,21 @@ dtype_get_min_size_low(
 	case DATA_INT:
 	case DATA_FLOAT:
 	case DATA_DOUBLE:
-		return(len);
+		return static_cast<unsigned>(len);
 	case DATA_MYSQL:
 		if (prtype & DATA_BINARY_TYPE) {
-			return(len);
+			return static_cast<unsigned>(len);
 		} else {
 			if (mbminlen == mbmaxlen) {
-				return(len);
+				return static_cast<unsigned>(len);
 			}
 
 			/* this is a variable-length character set */
 			ut_a(mbminlen > 0);
 			ut_a(mbmaxlen > mbminlen);
 			ut_a(len % mbmaxlen == 0);
-			return(len * mbminlen / mbmaxlen);
+			return static_cast<unsigned>(
+				len * mbminlen / mbmaxlen);
 		}
 	case DATA_VARCHAR:
 	case DATA_BINARY:
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
index 4daa1d23c4f..186fd30f89f 100644
--- a/storage/innobase/include/dict0boot.h
+++ b/storage/innobase/include/dict0boot.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -33,15 +33,8 @@ Created 4/18/1996 Heikki Tuuri
 #include "buf0buf.h"
 #include "dict0dict.h"
 
-typedef	byte	dict_hdr_t;
-
-/**********************************************************************//**
-Gets a pointer to the dictionary header and x-latches its page.
-@return pointer to the dictionary header, page x-latched */
-dict_hdr_t*
-dict_hdr_get(
-/*=========*/
-	mtr_t*	mtr);	/*!< in: mtr */
+/** @return the DICT_HDR block, x-latched */
+buf_block_t *dict_hdr_get(mtr_t* mtr);
 /**********************************************************************//**
 Returns a new table, index, or space id. */
 void
@@ -67,14 +60,6 @@ row_id_t
 dict_sys_get_new_row_id(void);
 /*=========================*/
 /**********************************************************************//**
-Reads a row id from a record or other 6-byte stored form.
-@return row id */
-UNIV_INLINE
-row_id_t
-dict_sys_read_row_id(
-/*=================*/
-	const byte*	field);	/*!< in: record field */
-/**********************************************************************//**
 Writes a row id to a record or other 6-byte stored form. */
 UNIV_INLINE
 void
diff --git a/storage/innobase/include/dict0boot.inl b/storage/innobase/include/dict0boot.inl
index 7b0a2fd0b86..d920bddecee 100644
--- a/storage/innobase/include/dict0boot.inl
+++ b/storage/innobase/include/dict0boot.inl
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -50,19 +51,6 @@ dict_sys_get_new_row_id(void)
 }
 
 /**********************************************************************//**
-Reads a row id from a record or other 6-byte stored form.
-@return row id */
-UNIV_INLINE
-row_id_t
-dict_sys_read_row_id(
-/*=================*/
-	const byte*	field)	/*!< in: record field */
-{
-	compile_time_assert(DATA_ROW_ID_LEN == 6);
-	return(mach_read_from_6(field));
-}
-
-/**********************************************************************//**
 Writes a row id to a record or other 6-byte stored form. */
 UNIV_INLINE
 void
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
index e3c40c794e9..50f7f34a8e8 100644
--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -97,11 +97,10 @@ dict_create_index_tree(
 	const trx_t*	trx);	/*!< in: InnoDB transaction handle */
 
 /** Drop the index tree associated with a row in SYS_INDEXES table.
-@param[in,out]	rec	SYS_INDEXES record
 @param[in,out]	pcur	persistent cursor on rec
 @param[in,out]	trx	dictionary transaction
 @param[in,out]	mtr	mini-transaction */
-void dict_drop_index_tree(rec_t* rec, btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
+void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
 	MY_ATTRIBUTE((nonnull));
 
 /***************************************************************//**
@@ -273,7 +272,7 @@ struct ind_node_t{
 	/*----------------------*/
 	/* Local storage for this graph node */
 	ulint		state;		/*!< node execution state */
-	ulint		page_no;	/* root page number of the index */
+	uint32_t	page_no;	/* root page number of the index */
 	dict_table_t*	table;		/*!< table which owns the index */
 	dtuple_t*	ind_row;	/* index definition row built */
 	ulint		field_no;	/* next field definition to insert */
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
index ae017932b9f..3aea41b0bb8 100644
--- a/storage/innobase/include/dict0defrag_bg.h
+++ b/storage/innobase/include/dict0defrag_bg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2016, 2018, MariaDB Corporation.
+Copyright (c) 2016, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,8 +27,6 @@ Created 25/08/2016 Jan Lindström
 #ifndef dict0defrag_bg_h
 #define dict0defrag_bg_h
 
-#include "os0event.h"
-#include "os0thread.h"
 #include "dict0types.h"
 
 /** Indices whose defrag stats need to be saved to persistent storage.*/
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 2326f00599b..5682a10c889 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -33,21 +33,27 @@ Created 1/8/1996 Heikki Tuuri
 #include "fsp0fsp.h"
 #include <deque>
 
+class MDL_ticket;
 extern bool innodb_table_stats_not_found;
 extern bool innodb_index_stats_not_found;
 
 /** the first table or index ID for other than hard-coded system tables */
 constexpr uint8_t DICT_HDR_FIRST_ID= 10;
 
-/********************************************************************//**
-Get the database name length in a table name.
+
+/** Get the database name length in a table name.
+@param name   filename-safe encoded table name "dbname/tablename"
 @return database name length */
-ulint
-dict_get_db_name_len(
-/*=================*/
-	const char*	name)	/*!< in: table name in the form
-				dbname '/' tablename */
-	MY_ATTRIBUTE((nonnull, warn_unused_result));
+inline size_t dict_get_db_name_len(const char *name)
+{
+  /* table_name_t::dblen() would assert that '/' is contained */
+  if (const char* s= strchr(name, '/'))
+    return size_t(s - name);
+
+  return 0;
+}
+
+
 /*********************************************************************//**
 Open a table from its database and table name, this is currently used by
 foreign constraint parser to get the referenced table.
@@ -62,7 +68,8 @@ dict_get_referenced_table(
 	const char*	table_name,	/*!< in: table name */
 	ulint		table_name_len,	/*!< in: table name length */
 	dict_table_t**	table,		/*!< out: table object or NULL */
-	mem_heap_t*	heap);		/*!< in: heap memory */
+	mem_heap_t*	heap,		/*!< in: heap memory */
+	CHARSET_INFO*	from_cs);	/*!< in: table name charset */
 /*********************************************************************//**
 Frees a foreign key struct. */
 void
@@ -79,6 +86,21 @@ dict_table_get_highest_foreign_id(
 /*==============================*/
 	dict_table_t*	table);		/*!< in: table in the dictionary
 					memory cache */
+/** Check whether the dict_table_t is a partition.
+A partitioned table on the SQL level is composed of InnoDB tables,
+where each InnoDB table is a [sub]partition, including the secondary indexes
+that belong to the partition.
+@param[in]	table	Table to check.
+@return true if the dict_table_t is a partition else false. */
+UNIV_INLINE
+bool
+dict_table_is_partition(const dict_table_t* table)
+{
+	/* Check both P and p on all platforms in case it was moved to/from
+	WIN. */
+	return (strstr(table->name.m_name, "#p#")
+		|| strstr(table->name.m_name, "#P#"));
+}
 /********************************************************************//**
 Return the end of table name where we have removed dbname and '/'.
 @return table name */
@@ -102,33 +124,50 @@ enum dict_table_op_t {
 	DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
 };
 
-/**********************************************************************//**
-Returns a table object based on table id.
+/** Acquire a shared metadata lock (MDL) on the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out]  table           table object
+@param[in,out]  thd             background thread
+@param[out]     mdl             mdl ticket
+@param[in]      table_op        operation to perform when opening
+@return table object after locking MDL shared
+@retval NULL if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+                        THD *thd,
+                        MDL_ticket **mdl,
+                        dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
+
+/** Look up a table by numeric identifier.
+@param[in]      table_id        table identifier
+@param[in]      dict_locked     data dictionary locked
+@param[in]      table_op        operation to perform when opening
+@param[in,out]  thd             background thread, or NULL to not acquire MDL
+@param[out]     mdl             mdl ticket, or NULL
 @return table, NULL if does not exist */
 dict_table_t*
-dict_table_open_on_id(
-/*==================*/
-	table_id_t	table_id,	/*!< in: table id */
-	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
-	dict_table_op_t	table_op)	/*!< in: operation to perform */
-	MY_ATTRIBUTE((warn_unused_result));
-
-/**********************************************************************//**
-Returns a table object based on table id.
-@return	table, NULL if does not exist */
-dict_table_t* dict_table_open_on_index_id(index_id_t index_id)
-	__attribute__((warn_unused_result));
-/********************************************************************//**
-Decrements the count of open handles to a table. */
+dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+                      dict_table_op_t table_op, THD *thd= nullptr,
+                      MDL_ticket **mdl= nullptr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Decrements the count of open handles of a table.
+@param[in,out]	table		table
+@param[in]	dict_locked	data dictionary locked
+@param[in]	try_drop	try to drop any orphan indexes after
+				an aborted online index creation
+@param[in]	thd		thread to release MDL
+@param[in]	mdl		metadata lock or NULL if the thread is a
+				foreground one. */
 void
 dict_table_close(
-/*=============*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
-	ibool		try_drop)	/*!< in: TRUE=try to drop any orphan
-					indexes after an aborted online
-					index creation */
-	MY_ATTRIBUTE((nonnull));
+	dict_table_t*	table,
+	bool		dict_locked,
+	bool		try_drop,
+	THD*		thd = NULL,
+	MDL_ticket*	mdl = NULL);
+
 /*********************************************************************//**
 Closes the only open handle to a table and drops a table while assuring
 that dict_sys.mutex is held the whole time.  This assures that the table
@@ -144,7 +183,7 @@ dict_table_close_and_drop(
 Gets the minimum number of bytes per character.
 @return minimum multi-byte char size, in bytes */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_mbminlen(
 /*==================*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -153,7 +192,7 @@ dict_col_get_mbminlen(
 Gets the maximum number of bytes per character.
 @return maximum multi-byte char size, in bytes */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_mbmaxlen(
 /*==================*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -209,7 +248,7 @@ dict_col_type_assert_equal(
 Returns the minimum size of the column.
 @return minimum size */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_min_size(
 /*==================*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -227,7 +266,7 @@ dict_col_get_max_size(
 Returns the size of a fixed size column, 0 if not a fixed size column.
 @return fixed size, or 0 */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_fixed_size(
 /*====================*/
 	const dict_col_t*	col,	/*!< in: column */
@@ -238,7 +277,7 @@ Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
 For fixed length types it is the fixed length of the type, otherwise 0.
 @return SQL null storage size in ROW_FORMAT=REDUNDANT */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_sql_null_size(
 /*=======================*/
 	const dict_col_t*	col,	/*!< in: column */
@@ -248,7 +287,7 @@ dict_col_get_sql_null_size(
 Gets the column number.
 @return col->ind, table column position (starting from 0) */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_no(
 /*============*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -417,34 +456,6 @@ dict_foreign_replace_index(
 					to use table->col_names */
 	const dict_index_t*	index)	/*!< in: index to be replaced */
 	MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
-/** Scans a table create SQL string and adds to the data dictionary
-the foreign key constraints declared in the string. This function
-should be called after the indexes for a table have been created.
-Each foreign key constraint must be accompanied with indexes in
-bot participating tables. The indexes are allowed to contain more
-fields than mentioned in the constraint.
-
-@param[in]	trx		transaction
-@param[in]	sql_string	table create statement where
-				foreign keys are declared like:
-				FOREIGN KEY (a, b) REFERENCES table2(c, d),
-				table2 can be written also with the database
-				name before it: test.table2; the default
-				database id the database of parameter name
-@param[in]	sql_length	length of sql_string
-@param[in]	name		table full name in normalized form
-@param[in]	reject_fks	if TRUE, fail with error code
-				DB_CANNOT_ADD_CONSTRAINT if any
-				foreign keys are found.
-@return error code or DB_SUCCESS */
-dberr_t
-dict_create_foreign_constraints(
-	trx_t*			trx,
-	const char*		sql_string,
-	size_t			sql_length,
-	const char*		name,
-	ibool			reject_fks)
-	MY_ATTRIBUTE((warn_unused_result));
 /**********************************************************************//**
 Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
 @return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
@@ -679,7 +690,7 @@ dictionary cache.
 @return number of user-defined (e.g., not ROW_ID) non-virtual
 columns of a table */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_n_user_cols(
 /*=======================*/
 	const dict_table_t*	table)	/*!< in: table */
@@ -689,7 +700,7 @@ Gets the number of all non-virtual columns (also system) in a table
 in the dictionary cache.
 @return number of columns of a table */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_n_cols(
 /*==================*/
 	const dict_table_t*	table)	/*!< in: table */
@@ -699,7 +710,7 @@ dict_table_get_n_cols(
 @param[in]	table	the table to check
 @return number of virtual columns of a table */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_n_v_cols(
 	const dict_table_t*	table);
 
@@ -778,7 +789,7 @@ dict_col_t*
 dict_table_get_sys_col(
 /*===================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	unsigned		sys)	/*!< in: DATA_ROW_ID, ... */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
 #else /* UNIV_DEBUG */
 #define dict_table_get_nth_col(table, pos)	(&(table)->cols[pos])
@@ -803,18 +814,18 @@ dict_table_get_col_name(const dict_table_t* table, ulint col_nr)
 Gets the given system column number of a table.
 @return column number */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_sys_col_no(
 /*======================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	unsigned		sys)	/*!< in: DATA_ROW_ID, ... */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
 
 /********************************************************************//**
 Returns the minimum data size of an index record.
 @return minimum data size in bytes */
 UNIV_INLINE
-ulint
+unsigned
 dict_index_get_min_size(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: index */
@@ -972,7 +983,7 @@ Gets the number of fields in the internal representation of an index,
 including fields added by the dictionary system.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_fields(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: an internal
@@ -987,7 +998,7 @@ we do not take multiversioning into account: in the B-tree use the value
 returned by dict_index_get_n_unique_in_tree.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_unique(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: an internal representation
@@ -999,7 +1010,7 @@ which uniquely determine the position of an index entry in the index, if
 we also take multiversioning into account.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_unique_in_tree(
 /*============================*/
 	const dict_index_t*	index)	/*!< in: an internal representation
@@ -1017,7 +1028,7 @@ include page no field.
 @param[in]	index	index
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_unique_in_tree_nonleaf(
 	const dict_index_t*	index)
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
@@ -1028,7 +1039,7 @@ unique, but this function returns the number of fields the user defined
 in the index as ordering fields.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_ordering_defined_by_user(
 /*======================================*/
 	const dict_index_t*	index)	/*!< in: an internal representation
@@ -1116,7 +1127,7 @@ dict_index_get_nth_field_pos(
 /********************************************************************//**
 Looks for column n position in the clustered index.
 @return position in internal representation of the clustered index */
-ulint
+unsigned
 dict_table_get_nth_col_pos(
 /*=======================*/
 	const dict_table_t*	table,	/*!< in: table */
@@ -1163,7 +1174,7 @@ dict_index_get_if_in_cache_low(
 /*===========================*/
 	index_id_t	index_id)	/*!< in: index id */
 	MY_ATTRIBUTE((warn_unused_result));
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+#ifdef UNIV_DEBUG
 /**********************************************************************//**
 Returns an index object if it is found in the dictionary cache.
 @return index, NULL if not found */
@@ -1172,8 +1183,6 @@ dict_index_get_if_in_cache(
 /*=======================*/
 	index_id_t	index_id)	/*!< in: index id */
 	MY_ATTRIBUTE((warn_unused_result));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#ifdef UNIV_DEBUG
 /**********************************************************************//**
 Checks that a tuple has n_fields_cmp value in a sensible range, so that
 no comparison can occur with the page number field in a node pointer.
@@ -1240,7 +1249,7 @@ dict_index_build_data_tuple(
 Gets the page number of the root of the index tree.
 @return page number */
 UNIV_INLINE
-ulint
+uint32_t
 dict_index_get_page(
 /*================*/
 	const dict_index_t*	tree)	/*!< in: index */
@@ -1400,10 +1409,10 @@ public:
 					header and flushed to a file; in
 					recovery this must be derived from
 					the log records */
-	hash_table_t*	table_hash;	/*!< hash table of the tables, based
+	hash_table_t	table_hash;	/*!< hash table of the tables, based
 					on name */
 	/** hash table of persistent table IDs */
-	hash_table_t*	table_id_hash;
+	hash_table_t	table_id_hash;
 	dict_table_t*	sys_tables;	/*!< SYS_TABLES table */
 	dict_table_t*	sys_columns;	/*!< SYS_COLUMNS table */
 	dict_table_t*	sys_indexes;	/*!< SYS_INDEXES table */
@@ -1422,7 +1431,7 @@ private:
 	/** the sequence of temporary table IDs */
 	std::atomic<table_id_t> temp_table_id;
 	/** hash table of temporary table IDs */
-	hash_table_t*	temp_id_hash;
+	hash_table_t temp_id_hash;
 public:
 	/** @return a new temporary table ID */
 	table_id_t get_temporary_table_id() {
@@ -1439,7 +1448,7 @@ public:
 		ut_ad(mutex_own(&mutex));
 		dict_table_t* table;
 		ulint fold = ut_fold_ull(id);
-		HASH_SEARCH(id_hash, temp_id_hash, fold, dict_table_t*, table,
+		HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
 			    ut_ad(table->cached), table->id == id);
 		if (UNIV_LIKELY(table != NULL)) {
 			DBUG_ASSERT(table->is_temporary());
@@ -1458,7 +1467,8 @@ public:
 		ut_ad(mutex_own(&mutex));
 		dict_table_t* table;
 		ulint fold = ut_fold_ull(id);
-		HASH_SEARCH(id_hash, table_id_hash, fold, dict_table_t*, table,
+		HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*,
+			    table,
 			    ut_ad(table->cached), table->id == id);
 		DBUG_ASSERT(!table || !table->is_temporary());
 		return table;
@@ -1547,6 +1557,23 @@ public:
     mutex_exit(&mutex);
     rw_lock_x_unlock(&latch);
   }
+
+  /** Estimate the memory occupied by the data dictionary
+  table and index objects.
+  @return number of bytes occupied */
+  ulint rough_size() const
+  {
+    /* No mutex; this is a very crude approximation anyway */
+    ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
+    size *= sizeof(dict_table_t)
+      + sizeof(dict_index_t) * 2
+      + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+      + sizeof(dict_field_t) * 5 /* total number of key fields */
+      + 200; /* arbitrary, covering names and overhead */
+    size += (table_hash.n_cells + table_id_hash.n_cells
+	     + temp_id_hash.n_cells) * sizeof(hash_cell_t);
+    return size;
+  }
 };
 
 /** the data dictionary cache */
@@ -1556,17 +1583,6 @@ extern dict_sys_t	dict_sys;
 #define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__)
 #define dict_sys_unlock() dict_sys.unlock()
 
-/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
-extern dict_index_t*	dict_ind_redundant;
-
-/** Initialize dict_ind_redundant. */
-void
-dict_ind_init();
-
-/** Free dict_ind_redundant. */
-void
-dict_ind_free();
-
 /* Auxiliary structs for checking a table definition @{ */
 
 /* This struct is used to specify the name and type that a column must
@@ -1768,13 +1784,6 @@ dict_table_decode_n_col(
 	ulint*	n_col,
 	ulint*	n_v_col);
 
-/** Calculate the used memory occupied by the data dictionary
-table and index objects.
-@return number of bytes occupied. */
-UNIV_INTERN
-ulint
-dict_sys_get_size();
-
 /** Free the virtual column template
 @param[in,out]	vc_templ	virtual column template */
 UNIV_INLINE
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
index d00f4f8f6a3..eda639ba7c1 100644
--- a/storage/innobase/include/dict0dict.inl
+++ b/storage/innobase/include/dict0dict.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2019, MariaDB Corporation.
+Copyright (c) 2013, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -31,7 +31,7 @@ Created 1/8/1996 Heikki Tuuri
 Gets the minimum number of bytes per character.
 @return minimum multi-byte char size, in bytes */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_mbminlen(
 /*==================*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -42,7 +42,7 @@ dict_col_get_mbminlen(
 Gets the maximum number of bytes per character.
 @return maximum multi-byte char size, in bytes */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_mbmaxlen(
 /*==================*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -93,7 +93,7 @@ dict_col_type_assert_equal(
 Returns the minimum size of the column.
 @return minimum size */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_min_size(
 /*==================*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -116,7 +116,7 @@ dict_col_get_max_size(
 Returns the size of a fixed size column, 0 if not a fixed size column.
 @return fixed size, or 0 */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_fixed_size(
 /*====================*/
 	const dict_col_t*	col,	/*!< in: column */
@@ -130,7 +130,7 @@ Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
 For fixed length types it is the fixed length of the type, otherwise 0.
 @return SQL null storage size in ROW_FORMAT=REDUNDANT */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_sql_null_size(
 /*=======================*/
 	const dict_col_t*	col,	/*!< in: column */
@@ -143,7 +143,7 @@ dict_col_get_sql_null_size(
 Gets the column number.
 @return col->ind, table column position (starting from 0) */
 UNIV_INLINE
-ulint
+unsigned
 dict_col_get_no(
 /*============*/
 	const dict_col_t*	col)	/*!< in: column */
@@ -247,7 +247,7 @@ dictionary cache.
 @return number of user-defined (e.g., not ROW_ID) non-virtual
 columns of a table */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_n_user_cols(
 /*=======================*/
 	const dict_table_t*	table)	/*!< in: table */
@@ -264,7 +264,7 @@ Gets the number of all non-virtual columns (also system) in a table
 in the dictionary cache.
 @return number of non-virtual columns of a table */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_n_cols(
 /*==================*/
 	const dict_table_t*	table)	/*!< in: table */
@@ -277,7 +277,7 @@ dict_table_get_n_cols(
 @param[in]	table	the table to check
 @return number of virtual columns of a table */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_n_v_cols(
 	const dict_table_t*	table)
 {
@@ -296,7 +296,7 @@ dict_table_has_indexed_v_cols(
 	const dict_table_t*	table)
 {
 
-	for (ulint i = 0; i < table->n_v_cols; i++) {
+	for (unsigned i = 0; i < table->n_v_cols; i++) {
 		const dict_v_col_t*     col = dict_table_get_nth_v_col(table, i);
 		if (col->m_col.ord_part) {
 			return(true);
@@ -399,7 +399,7 @@ dict_col_t*
 dict_table_get_sys_col(
 /*===================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	unsigned		sys)	/*!< in: DATA_ROW_ID, ... */
 {
 	dict_col_t*	col;
 	col = dict_table_get_nth_col(table,
@@ -415,11 +415,11 @@ dict_table_get_sys_col(
 Gets the given system column number of a table.
 @return column number */
 UNIV_INLINE
-ulint
+unsigned
 dict_table_get_sys_col_no(
 /*======================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	unsigned		sys)	/*!< in: DATA_ROW_ID, ... */
 {
 	ut_ad(sys < DATA_N_SYS_COLS);
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
@@ -705,34 +705,12 @@ dict_tf_to_sys_tables_type(
 	return(type);
 }
 
-/*********************************************************************//**
-Returns true if the particular FTS index in the table is still syncing
-in the background, false otherwise.
-@param [in] table      Table containing FTS index
-@return True if sync of fts index is still going in the background  */
-UNIV_INLINE
-bool
-dict_fts_index_syncing(
-	dict_table_t*   table)
-{
-	 dict_index_t*   index;
-
-	for (index = dict_table_get_first_index(table);
-	    index != NULL;
-	    index = dict_table_get_next_index(index)) {
-		if (index->index_fts_syncing) {
-			 return(true);
-		}
-	}
-	return(false);
-}
-
 /********************************************************************//**
 Gets the number of fields in the internal representation of an index,
 including fields added by the dictionary system.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_fields(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: an internal
@@ -750,7 +728,7 @@ we do not take multiversioning into account: in the B-tree use the value
 returned by dict_index_get_n_unique_in_tree.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_unique(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: an internal representation
@@ -767,7 +745,7 @@ which uniquely determine the position of an index entry in the index, if
 we also take multiversioning into account.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_unique_in_tree(
 /*============================*/
 	const dict_index_t*	index)	/*!< in: an internal representation
@@ -792,7 +770,7 @@ include page no field.
 @param[in]	index	index
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_unique_in_tree_nonleaf(
 	const dict_index_t*	index)
 {
@@ -816,7 +794,7 @@ to make a clustered index unique, but this function returns the number of
 fields the user defined in the index as ordering fields.
 @return number of fields */
 UNIV_INLINE
-ulint
+uint16_t
 dict_index_get_n_ordering_defined_by_user(
 /*======================================*/
 	const dict_index_t*	index)	/*!< in: an internal representation
@@ -901,27 +879,25 @@ dict_index_get_nth_col_pos(
 Returns the minimum data size of an index record.
 @return minimum data size in bytes */
 UNIV_INLINE
-ulint
+unsigned
 dict_index_get_min_size(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: index */
 {
-	ulint	n	= dict_index_get_n_fields(index);
-	ulint	size	= 0;
+  unsigned n= dict_index_get_n_fields(index);
+  unsigned size= 0;
 
-	while (n--) {
-		size += dict_col_get_min_size(dict_index_get_nth_col(index,
-								     n));
-	}
+  while (n--)
+    size+= dict_col_get_min_size(dict_index_get_nth_col(index, n));
 
-	return(size);
+  return size;
 }
 
 /*********************************************************************//**
 Gets the page number of the root of the index tree.
 @return page number */
 UNIV_INLINE
-ulint
+uint32_t
 dict_index_get_page(
 /*================*/
 	const dict_index_t*	index)	/*!< in: index */
@@ -1016,7 +992,7 @@ dict_index_set_online_status(
 	}
 #endif /* UNIV_DEBUG */
 
-	index->online_status = status;
+	index->online_status = status & 3;
 	ut_ad(dict_index_get_online_status(index) == status);
 }
 
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index afc017fd9d1..f067571ca5b 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -268,7 +268,7 @@ dict_process_sys_tablespaces(
 /*=========================*/
 	mem_heap_t*	heap,		/*!< in/out: heap memory */
 	const rec_t*	rec,		/*!< in: current SYS_TABLESPACES rec */
-	ulint*		space,		/*!< out: pace id */
+	uint32_t*	space,		/*!< out: tablespace identifier */
 	const char**	name,		/*!< out: tablespace name */
 	ulint*		flags);		/*!< out: tablespace flags */
 /********************************************************************//**
@@ -280,7 +280,7 @@ dict_process_sys_datafiles(
 /*=======================*/
 	mem_heap_t*	heap,		/*!< in/out: heap memory */
 	const rec_t*	rec,		/*!< in: current SYS_DATAFILES rec */
-	ulint*		space,		/*!< out: pace id */
+	uint32_t*	space,		/*!< out: tablespace identifier */
 	const char**	path);		/*!< out: datafile path */
 
 /** Update the record for space_id in SYS_TABLESPACES to this filepath.
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 596800ee8d2..70b028d7c63 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -46,6 +46,7 @@ Created 1/8/1996 Heikki Tuuri
 #include "gis0type.h"
 #include "fil0fil.h"
 #include "fil0crypt.h"
+#include "mysql_com.h"
 #include <sql_const.h>
 #include <set>
 #include <algorithm>
@@ -279,7 +280,7 @@ index tables) of a FTS table are in HEX format. */
 	(table->flags2 & (flag))
 
 #define DICT_TF2_FLAG_UNSET(table, flag)	\
-	(table->flags2 &= ~(flag))
+	(table->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1))
 
 /** Tables could be chained together with Foreign key constraint. When
 first load the parent table, we would load all of its descedents.
@@ -305,17 +306,11 @@ before proceeds. */
 @param flags    table flags
 @param flags2   table flags2
 @return own: table object */
-dict_table_t*
-dict_mem_table_create(
-	const char*	name,
-	fil_space_t*	space,
-	ulint		n_cols,
-	ulint		n_v_cols,
-	ulint		flags,
-	ulint		flags2);
-
-/****************************************************************//**
-Free a table memory object. */
+dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
+                                    ulint n_cols, ulint n_v_cols, ulint flags,
+                                    ulint flags2);
+/****************************************************************/ /**
+ Free a table memory object. */
 void
 dict_mem_table_free(
 /*================*/
@@ -644,7 +639,7 @@ public:
     if (fixed)
     {
       mtype= DATA_FIXBINARY;
-      len= fixed;
+      len= static_cast<uint16_t>(fixed);
     }
     else
     {
@@ -982,6 +977,9 @@ const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
 /** Data structure for an index.  Most fields will be
 initialized to 0, NULL or FALSE in dict_mem_index_create(). */
 struct dict_index_t {
+  /** Maximum number of fields */
+  static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
+
 	index_id_t	id;	/*!< id of the index */
 	mem_heap_t*	heap;	/*!< memory heap */
 	id_name_t	name;	/*!< index name */
@@ -1074,10 +1072,6 @@ struct dict_index_t {
 	It should use heap from dict_index_t. It should be freed
 	while removing the index from table. */
 	dict_add_v_col_info* new_vcol_info;
-
-	bool            index_fts_syncing;/*!< Whether the fts index is
-					still syncing in the background;
-					FIXME: remove this and use MDL */
 	UT_LIST_NODE_T(dict_index_t)
 			indexes;/*!< list of indexes of the table */
 #ifdef BTR_CUR_ADAPT
@@ -1215,18 +1209,24 @@ public:
 	bool has_virtual() const { return type & DICT_VIRTUAL; }
 
 	/** @return the position of DB_TRX_ID */
-	unsigned db_trx_id() const {
+	uint16_t db_trx_id() const {
 		DBUG_ASSERT(is_primary());
 		DBUG_ASSERT(n_uniq);
 		DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
 		return n_uniq;
 	}
 	/** @return the position of DB_ROLL_PTR */
-	unsigned db_roll_ptr() const { return db_trx_id() + 1; }
+	uint16_t db_roll_ptr() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 1);
+	}
 
 	/** @return the offset of the metadata BLOB field,
 	or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
-	unsigned first_user_field() const { return db_trx_id() + 2; }
+	uint16_t first_user_field() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 2);
+	}
 
 	/** @return whether the index is corrupted */
 	inline bool is_corrupted() const;
@@ -1346,6 +1346,16 @@ public:
   void set_freed() { ut_ad(!freed()); page= 1; }
 #endif /* BTR_CUR_HASH_ADAPT */
 
+  /** @return whether it is forbidden to invoke clear_instant_add() */
+  bool must_avoid_clear_instant_add() const
+  {
+    if (is_instant())
+      for (auto i= this; (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+        if (i->to_be_dropped /* || i->online_log*/)
+          return true;
+    return false;
+  }
+
 	/** This ad-hoc class is used by record_size_info only.	*/
 	class record_size_info_t {
 	public:
@@ -1510,7 +1520,7 @@ struct dict_foreign_compare {
 		const dict_foreign_t*	lhs,
 		const dict_foreign_t*	rhs) const
 	{
-		return(ut_strcmp(lhs->id, rhs->id) < 0);
+		return strcmp(lhs->id, rhs->id) < 0;
 	}
 };
 
@@ -1753,7 +1763,7 @@ class field_map_element_t
 	/** Field metadata */
 	uint16_t data;
 
-	void clear_not_null() { data &= ~NOT_NULL; }
+	void clear_not_null() { data &= uint16_t(~NOT_NULL); }
 public:
 	bool is_dropped() const { return data & DROPPED; }
 	void set_dropped() { data |= DROPPED; }
@@ -1841,6 +1851,13 @@ struct dict_table_t {
 		return(UNIV_LIKELY(!file_unreadable));
 	}
 
+	/** @return whether the table is accessible */
+	bool is_accessible() const
+	{
+		return UNIV_LIKELY(is_readable() && !corrupted && space)
+			&& !space->is_stopping();
+	}
+
 	/** Check if a table name contains the string "/#sql"
 	which denotes temporary or intermediate tables in MariaDB. */
 	static bool is_temporary_name(const char* name)
@@ -1968,6 +1985,18 @@ struct dict_table_t {
 	/** For overflow fields returns potential max length stored inline */
 	inline size_t get_overflow_field_local_len() const;
 
+	/** Parse the table file name into table name and database name.
+	@tparam		dict_locked	whether dict_sys.mutex is being held
+	@param[in,out]	db_name		database name buffer
+	@param[in,out]	tbl_name	table name buffer
+	@param[out]	db_name_len	database name length
+	@param[out]	tbl_name_len	table name length
+	@return whether the table name is visible to SQL */
+	template<bool dict_locked= false>
+	bool parse_name(char (&db_name)[NAME_LEN + 1],
+			char (&tbl_name)[NAME_LEN + 1],
+			size_t *db_name_len, size_t *tbl_name_len) const;
+
 private:
 	/** Initialize instant->field_map.
 	@param[in]	table	table definition to copy from */
@@ -2238,22 +2267,6 @@ public:
 	/** The state of the background stats thread wrt this table.
 	See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
 	Writes are covered by dict_sys.mutex. Dirty reads are possible. */
-
-	#define BG_SCRUB_IN_PROGRESS	((byte)(1 << 2))
-				/*!< BG_SCRUB_IN_PROGRESS is set in
-				stats_bg_flag when the background
-				scrub code is working on this table. The DROP
-				TABLE code waits for this to be cleared
-				before proceeding. */
-
-	#define BG_STAT_SHOULD_QUIT		(1 << 1)
-
-	#define BG_IN_PROGRESS (BG_STAT_IN_PROGRESS | BG_SCRUB_IN_PROGRESS)
-
-
-	/** The state of the background stats thread wrt this table.
-	See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
-	Writes are covered by dict_sys.mutex. Dirty reads are possible. */
 	byte					stats_bg_flag;
 
 	bool		stats_error_printed;
@@ -2387,14 +2400,14 @@ inline bool dict_index_t::is_corrupted() const
 
 inline void dict_index_t::clear_instant_add()
 {
-	DBUG_ASSERT(is_primary());
-	DBUG_ASSERT(is_instant());
-	DBUG_ASSERT(!table->instant);
-	for (unsigned i = n_core_fields; i < n_fields; i++) {
-		fields[i].col->clear_instant();
-	}
-	n_core_fields = n_fields;
-	n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable));
+  DBUG_ASSERT(is_primary());
+  DBUG_ASSERT(is_instant());
+  DBUG_ASSERT(!table->instant);
+  for (unsigned i= n_core_fields; i < n_fields; i++)
+    fields[i].col->clear_instant();
+  n_core_fields= n_fields;
+  n_core_null_bytes= static_cast<byte>
+    (UT_BITS_IN_BYTES(static_cast<unsigned>(n_nullable)));
 }
 
 inline void dict_index_t::clear_instant_alter()
@@ -2435,8 +2448,9 @@ inline void dict_index_t::clear_instant_alter()
 	}
 
 	DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
-	n_core_fields = n_fields = n_def = end - fields;
-	n_core_null_bytes = UT_BITS_IN_BYTES(n_nullable);
+	n_core_fields = n_fields = n_def
+		= static_cast<unsigned>(end - fields) & MAX_N_FIELDS;
+	n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_nullable));
 	std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
 			      { return a.col->ind < b.col->ind; });
 	table->instant = NULL;
@@ -2444,7 +2458,10 @@ inline void dict_index_t::clear_instant_alter()
 		auto a = std::find_if(fields, end,
 				      [ai_col](const dict_field_t& f)
 				      { return f.col == ai_col; });
-		table->persistent_autoinc = (a == end) ? 0 : 1 + (a - fields);
+		table->persistent_autoinc = (a == end)
+			? 0
+			: (1 + static_cast<unsigned>(a - fields))
+			& MAX_N_FIELDS;
 	}
 }
 
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
index 090ec73278b..0a554a54dbd 100644
--- a/storage/innobase/include/dict0mem.inl
+++ b/storage/innobase/include/dict0mem.inl
@@ -54,13 +54,13 @@ dict_mem_fill_index_struct(
 		index->fields = NULL;
 	}
 
-	/* Assign a ulint to a 4-bit-mapped field.
-	Only the low-order 4 bits are assigned. */
-	index->type = unsigned(type);
+	index->type = type & ((1U << DICT_IT_BITS) - 1);
 	index->page = FIL_NULL;
 	index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
-	index->n_fields = (unsigned int) n_fields;
-	index->n_core_fields = (unsigned int) n_fields;
+	index->n_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	index->n_core_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
 	/* The '1 +' above prevents allocation
 	of an empty mem block */
 	index->nulls_equal = false;
diff --git a/storage/innobase/include/dict0priv.inl b/storage/innobase/include/dict0priv.inl
index ff645378175..2fcadc055e1 100644
--- a/storage/innobase/include/dict0priv.inl
+++ b/storage/innobase/include/dict0priv.inl
@@ -84,7 +84,7 @@ dict_table_check_if_in_cache_low(
 	/* Look for the table name in the hash table */
 	table_fold = ut_fold_string(table_name);
 
-	HASH_SEARCH(name_hash, dict_sys.table_hash, table_fold,
+	HASH_SEARCH(name_hash, &dict_sys.table_hash, table_fold,
 		    dict_table_t*, table, ut_ad(table->cached),
 		    !strcmp(table->name.m_name, table_name));
 	DBUG_RETURN(table);
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index dccb354f803..34c1bef26c5 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -140,10 +140,18 @@ dict_stats_update(
 					the stats or to fetch them from
 					the persistent storage */
 
-/*********************************************************************//**
-Removes the information for a particular index's stats from the persistent
+/** Remove the information for a particular index's stats from the persistent
 storage if it exists and if there is data stored for this index.
 This function creates its own trx and commits it.
+
+We must modify system tables in a separate transaction in order to
+adhere to the InnoDB design constraint that dict_sys.latch prevents
+lock waits on system tables. If we modified system and user tables in
+the same transaction, we should exclusively hold dict_sys.latch until
+the transaction is committed, and effectively block other transactions
+that will attempt to open any InnoDB tables. Because we have no
+guarantee that user transactions will be committed fast, we cannot
+afford to keep the system tables locked in a user transaction.
 @return DB_SUCCESS or error code */
 dberr_t
 dict_stats_drop_index(
diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl
index d4e23ecb0a4..4972efe8961 100644
--- a/storage/innobase/include/dict0stats.inl
+++ b/storage/innobase/include/dict0stats.inl
@@ -201,15 +201,15 @@ dict_stats_deinit(
 		MEM_UNDEFINED(
 			index->stat_n_diff_key_vals,
 			index->n_uniq
-			* sizeof(index->stat_n_diff_key_vals[0]));
+			* sizeof index->stat_n_diff_key_vals[0]);
 		MEM_UNDEFINED(
 			index->stat_n_sample_sizes,
 			index->n_uniq
-			* sizeof(index->stat_n_sample_sizes[0]));
+			* sizeof index->stat_n_sample_sizes[0]);
 		MEM_UNDEFINED(
 			index->stat_n_non_null_key_vals,
 			index->n_uniq
-			* sizeof(index->stat_n_non_null_key_vals[0]));
+			* sizeof index->stat_n_non_null_key_vals[0]);
 		MEM_UNDEFINED(
 			&index->stat_index_size,
 			sizeof(index->stat_index_size));
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
index 526139643d1..b210a2ec357 100644
--- a/storage/innobase/include/dict0stats_bg.h
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2018, MariaDB Corporation.
+Copyright (c) 2017, 2019, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -28,13 +28,8 @@ Created Apr 26, 2012 Vasil Dimov
 #define dict0stats_bg_h
 
 #include "dict0types.h"
-#include "os0event.h"
 #include "os0thread.h"
 
-/** Event to wake up dict_stats_thread on dict_stats_recalc_pool_add()
-or shutdown. Not protected by any mutex. */
-extern os_event_t	dict_stats_event;
-
 #ifdef HAVE_PSI_INTERFACE
 extern mysql_pfs_key_t	dict_stats_recalc_pool_mutex_key;
 #endif /* HAVE_PSI_INTERFACE */
@@ -99,17 +94,13 @@ dict_stats_wait_bg_to_stop_using_table(
 				unlocking/locking the data dict */
 /*****************************************************************//**
 Initialize global variables needed for the operation of dict_stats_thread().
-Must be called before dict_stats_thread() is started. */
-void
-dict_stats_thread_init();
-/*====================*/
+Must be called before dict_stats task is started. */
+void dict_stats_init();
 
 /*****************************************************************//**
 Free resources allocated by dict_stats_thread_init(), must be called
-after dict_stats_thread() has exited. */
-void
-dict_stats_thread_deinit();
-/*======================*/
+after dict_stats task has exited. */
+void dict_stats_deinit();
 
 #ifdef UNIV_DEBUG
 /** Disables dict stats thread. It's used by:
@@ -119,20 +110,13 @@ void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
 				      const void* save);
 #endif /* UNIV_DEBUG */
 
-/*****************************************************************//**
-This is the thread for background stats gathering. It pops tables, from
-the auto recalc list and proceeds them, eventually recalculating their
-statistics.
-@return this function does not return, it calls os_thread_exit() */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(dict_stats_thread)(
-/*==============================*/
-	void*	arg);	/*!< in: a dummy parameter
-			required by os_thread_create */
-
-/** Shut down the dict_stats_thread. */
-void
-dict_stats_shutdown();
+/** Start the dict stats timer. */
+void dict_stats_start();
+
+/** Shut down the dict_stats timer. */
+void dict_stats_shutdown();
+
+/** Reschedule dict stats timer to run now. */
+void dict_stats_schedule_now();
 
 #endif /* dict0stats_bg_h */
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
index bd883eb796c..cb8b998f0ea 100644
--- a/storage/innobase/include/dyn0buf.h
+++ b/storage/innobase/include/dyn0buf.h
@@ -400,6 +400,9 @@ public:
 		return(m_heap == NULL);
 	}
 
+	/** @return whether the buffer is empty */
+	bool empty() const { return !back()->m_used; }
+
 private:
 	// Disable copying
 	mtr_buf_t(const mtr_buf_t&);
@@ -490,20 +493,4 @@ private:
 	block_t			m_first_block;
 };
 
-/** mtr_buf_t copier */
-struct mtr_buf_copy_t {
-	/** The copied buffer */
-	mtr_buf_t	m_buf;
-
-	/** Append a block to the redo log buffer.
-	@return whether the appending should continue (always true here) */
-	bool operator()(const mtr_buf_t::block_t* block)
-	{
-		byte*	buf = m_buf.open(block->used());
-		memcpy(buf, block->begin(), block->used());
-		m_buf.close(buf + block->used());
-		return(true);
-	}
-};
-
 #endif /* dyn0buf_h */
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
index 06d837081a1..83d0b0d64c2 100644
--- a/storage/innobase/include/dyn0types.h
+++ b/storage/innobase/include/dyn0types.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +30,7 @@ Created 2013-03-16 Sunny Bains
 /** Value of dyn_block_t::magic_n */
 #define DYN_BLOCK_MAGIC_N	375767
 
-/** This is the initial 'payload' size of a dynamic array;
-this must be > MLOG_BUF_MARGIN + 30! */
+/** This is the initial 'payload' size of a dynamic array */
 #define	DYN_ARRAY_DATA_SIZE	512
 
 /** Flag for dyn_block_t::used that indicates a full block */
diff --git a/storage/innobase/include/eval0eval.inl b/storage/innobase/include/eval0eval.inl
index ae0887408b0..0ea4057fdad 100644
--- a/storage/innobase/include/eval0eval.inl
+++ b/storage/innobase/include/eval0eval.inl
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -230,7 +231,7 @@ eval_node_copy_and_alloc_val(
 
 	data = eval_node_ensure_val_buf(node, len);
 
-	ut_memcpy(data, str, len);
+	memcpy(data, str, len);
 }
 
 /*****************************************************************//**
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
index 4182b0a7c9d..62043003a6c 100644
--- a/storage/innobase/include/fil0crypt.h
+++ b/storage/innobase/include/fil0crypt.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
-Copyright (c) 2015, 2019, MariaDB Corporation.
+Copyright (c) 2015, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -85,19 +85,14 @@ struct fil_space_rotate_state_t
 {
 	time_t start_time;	/*!< time when rotation started */
 	ulint active_threads;	/*!< active threads in space */
-	ulint next_offset;	/*!< next "free" offset */
-	ulint max_offset;	/*!< max offset needing to be rotated */
+	uint32_t next_offset;	/*!< next "free" offset */
+	uint32_t max_offset;	/*!< max offset needing to be rotated */
 	uint  min_key_version_found; /*!< min key version found but not
 				     rotated */
 	lsn_t end_lsn;		/*!< max lsn created when rotating this
 				space */
 	bool starting;		/*!< initial write of IV */
 	bool flushing;		/*!< space is being flushed at end of rotate */
-	struct {
-		bool is_active; /*!< is scrubbing active in this space */
-		time_t last_scrub_completed; /*!< when was last scrub
-					     completed */
-	} scrubbing;
 };
 
 #ifndef UNIV_INNOCHECKSUM
@@ -115,7 +110,6 @@ struct fil_space_crypt_t : st_encryption_scheme
 		fil_encryption_t new_encryption)
 		: st_encryption_scheme(),
 		min_key_version(new_min_key_version),
-		page0_offset(0),
 		encryption(new_encryption),
 		key_found(0),
 		rotate_state()
@@ -184,14 +178,12 @@ struct fil_space_crypt_t : st_encryption_scheme
 	@param[in,out]	page	first page of the tablespace */
 	void fill_page0(ulint flags, byte* page);
 
-	/** Write crypt data to a page (0)
-	@param[in]	space	tablespace
-	@param[in,out]	page0	first page of the tablespace
+	/** Write encryption metadata to the first page.
+	@param[in,out]	block	first page of the tablespace
 	@param[in,out]	mtr	mini-transaction */
-	void write_page0(const fil_space_t* space, byte* page0, mtr_t* mtr);
+	void write_page0(buf_block_t* block, mtr_t* mtr);
 
 	uint min_key_version; // min key version for this space
-	ulint page0_offset;   // byte offset on page 0 for crypt data
 	fil_encryption_t encryption; // Encryption setup
 
 	ib_mutex_t mutex;   // mutex protecting following variables
@@ -229,18 +221,6 @@ struct fil_crypt_stat_t {
 	ulint estimated_iops;
 };
 
-/** Status info about scrubbing */
-struct fil_space_scrub_status_t {
-	ulint space;             /*!< tablespace id */
-	bool compressed;        /*!< is space compressed  */
-	time_t last_scrub_completed;  /*!< when was last scrub completed */
-	bool scrubbing;               /*!< is scrubbing ongoing */
-	time_t current_scrub_started; /*!< when started current scrubbing */
-	ulint current_scrub_active_threads; /*!< current scrub active threads */
-	ulint current_scrub_page_number; /*!< current scrub page no */
-	ulint current_scrub_max_page_number; /*!< current scrub max page no */
-};
-
 /*********************************************************************
 Init space crypt */
 UNIV_INTERN
@@ -294,25 +274,15 @@ void
 fil_space_destroy_crypt_data(
 	fil_space_crypt_t **crypt_data);
 
-/******************************************************************
-Parse a MLOG_FILE_WRITE_CRYPT_DATA log entry
-@param[in]	ptr		Log entry start
-@param[in]	end_ptr		Log entry end
-@param[out]	err		DB_SUCCESS or DB_DECRYPTION_FAILED
-@return position on log buffer */
-UNIV_INTERN
-byte*
-fil_parse_write_crypt_data(
-	byte*			ptr,
-	const byte*		end_ptr,
-	dberr_t*		err)
-	MY_ATTRIBUTE((warn_unused_result));
+/** Amend encryption information from redo log.
+@param[in]	space	tablespace
+@param[in]	data	encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data);
 
 /** Encrypt a buffer.
 @param[in,out]		crypt_data		Crypt data
 @param[in]		space			space_id
 @param[in]		offset			Page offset
-@param[in]		lsn			Log sequence number
 @param[in]		src_frame		Page to encrypt
 @param[in]		zip_size		ROW_FORMAT=COMPRESSED page size, or 0
 @param[in,out]		dst_frame		Output buffer
@@ -324,7 +294,6 @@ fil_encrypt_buf(
 	fil_space_crypt_t*	crypt_data,
 	ulint			space,
 	ulint			offset,
-	lsn_t			lsn,
 	const byte*		src_frame,
 	ulint			zip_size,
 	byte*			dst_frame,
@@ -336,16 +305,12 @@ Encrypt a page.
 
 @param[in]		space		Tablespace
 @param[in]		offset		Page offset
-@param[in]		lsn		Log sequence number
 @param[in]		src_frame	Page to encrypt
 @param[in,out]		dst_frame	Output buffer
 @return encrypted buffer or NULL */
-UNIV_INTERN
-byte*
-fil_space_encrypt(
+byte* fil_space_encrypt(
 	const fil_space_t* space,
 	ulint		offset,
-	lsn_t		lsn,
 	byte*		src_frame,
 	byte*		dst_frame)
 	MY_ATTRIBUTE((warn_unused_result));
@@ -421,10 +386,7 @@ fil_crypt_set_rotation_iops(
 /*********************************************************************
 Adjust encrypt tables
 @param[in]	val		New setting for innodb-encrypt-tables */
-UNIV_INTERN
-void
-fil_crypt_set_encrypt_tables(
-	uint val);
+void fil_crypt_set_encrypt_tables(ulong val);
 
 /*********************************************************************
 Init threads for key rotation */
@@ -465,18 +427,6 @@ void
 fil_crypt_total_stat(
 	fil_crypt_stat_t *stat);
 
-/**
-Get scrub status for a space (used by information_schema)
-
-@param[in]	space		Tablespace
-@param[out]	status		Scrub status
-return 0 if data found */
-UNIV_INTERN
-void
-fil_space_get_scrub_status(
-	const fil_space_t*		space,
-	fil_space_scrub_status_t*	status);
-
 #include "fil0crypt.inl"
 #endif /* !UNIV_INNOCHECKSUM */
 
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 0c0af083a00..9d5bbcadc65 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -28,22 +28,71 @@ Created 10/25/1995 Heikki Tuuri
 #define fil0fil_h
 
 #include "fsp0types.h"
+#include "mach0data.h"
+#include "assume_aligned.h"
 
 #ifndef UNIV_INNOCHECKSUM
 
+#include "buf0dblwr.h"
+#include "hash0hash.h"
 #include "log0recv.h"
 #include "dict0types.h"
 #include "ilist.h"
-#ifdef UNIV_LINUX
-# include <set>
-#endif
+#include <set>
+#include <mutex>
 
 struct unflushed_spaces_tag_t;
 struct rotation_list_tag_t;
 
 // Forward declaration
 extern my_bool srv_use_doublewrite_buf;
-extern struct buf_dblwr_t* buf_dblwr;
+
+/** Possible values of innodb_flush_method */
+enum srv_flush_t
+{
+  /** fsync, the default */
+  SRV_FSYNC= 0,
+  /** open log files in O_DSYNC mode */
+  SRV_O_DSYNC,
+  /** do not call os_file_flush() when writing data files, but do flush
+  after writing to log files */
+  SRV_LITTLESYNC,
+  /** do not flush after writing */
+  SRV_NOSYNC,
+  /** invoke os_file_set_nocache() on data files. This implies using
+  non-buffered IO but still using fsync, the reason for which is that
+  some FS do not flush meta-data when unbuffered IO happens */
+  SRV_O_DIRECT,
+  /** do not use fsync() when using direct IO, i.e. it can be set to
+  avoid the fsync() call that we make when using SRV_O_DIRECT.
+  However, in this case user/DBA should be sure about the integrity of
+  the meta-data */
+  SRV_O_DIRECT_NO_FSYNC
+#ifdef _WIN32
+  /** Traditional Windows approach to open all files without caching,
+  and do FlushFileBuffers() */
+  ,SRV_ALL_O_DIRECT_FSYNC
+#endif
+};
+
+/** innodb_flush_method */
+extern ulong srv_file_flush_method;
+
+/** Undo tablespaces starts with space_id. */
+extern	ulint	srv_undo_space_id_start;
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern ulint	srv_undo_tablespaces_open;
+
+/** Check whether given space id is undo tablespace id
+@param[in]	space_id	space id to check
+@return true if it is undo tablespace else false. */
+inline bool srv_is_undo_tablespace(ulint space_id)
+{
+  return srv_undo_space_id_start > 0 &&
+    space_id >= srv_undo_space_id_start &&
+    space_id < srv_undo_space_id_start + srv_undo_tablespaces_open;
+}
+
 class page_id_t;
 
 /** Structure containing encryption specification */
@@ -57,38 +106,240 @@ enum fil_type_t {
 	FIL_TYPE_IMPORT,
 	/** persistent tablespace (for system, undo log or tables) */
 	FIL_TYPE_TABLESPACE,
-	/** redo log covering changes to files of FIL_TYPE_TABLESPACE */
-	FIL_TYPE_LOG
 };
 
-/** Check if fil_type is any of FIL_TYPE_TEMPORARY, FIL_TYPE_IMPORT
-or FIL_TYPE_TABLESPACE.
-@param[in]	type	variable of type fil_type_t
-@return true if any of FIL_TYPE_TEMPORARY, FIL_TYPE_IMPORT
-or FIL_TYPE_TABLESPACE */
-inline
-bool
-fil_type_is_data(
-	fil_type_t	type)
+struct fil_node_t;
+
+/** Structure to store first and last value of range */
+struct range_t
 {
-	return(type == FIL_TYPE_TEMPORARY
-	       || type == FIL_TYPE_IMPORT
-	       || type == FIL_TYPE_TABLESPACE);
-}
+  uint32_t first;
+  uint32_t last;
+};
 
-struct fil_node_t;
+/** Sort the range based on first value of the range */
+struct range_compare
+{
+  bool operator() (const range_t lhs, const range_t rhs) const
+  {
+    return lhs.first < rhs.first;
+  }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Set of inclusive [first,last] ranges of 32-bit integers, ordered by first */
+class range_set
+{
+private:
+  range_set_t ranges;
+  /** @return the range containing value, or ranges.end() if there is none */
+  range_set_t::iterator find(uint32_t value) const
+  {
+    /* lower_bound() yields the first range starting at or after value;
+    a range starting before value is looked for in its predecessor. */
+    auto r_offset= ranges.lower_bound({value, value});
+    const auto r_end= ranges.end();
+    if (r_offset != r_end && r_offset->first == value)
+      return r_offset;
+    if (r_offset == ranges.begin())
+      return r_end;
+    --r_offset;
+    return r_offset->last >= value ? r_offset : r_end;
+  }
+public:
+  /** Merge the current range with previous range.
+  @param[in] range      range to be merged
+  @param[in] prev_range range to be merged with next */
+  void merge_range(range_set_t::iterator range,
+		   range_set_t::iterator prev_range)
+  {
+    if (range->first != prev_range->last + 1)
+      return;
+
+    /* Merge the current range with previous range */
+    range_t new_range {prev_range->first, range->last};
+    ranges.erase(prev_range);
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
+
+  /** Split the range and add two more ranges
+  @param[in] range	range to be split
+  @param[in] value	Value to be removed from range */
+  void split_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t split1{range->first, value - 1};
+    range_t split2{value + 1, range->last};
+
+    /* Remove the existing element */
+    ranges.erase(range);
+
+    /* Insert the two elements */
+    ranges.emplace(split1);
+    ranges.emplace(split2);
+  }
+
+  /** Remove the value from the given range
+  @param[in,out] range  range to be changed
+  @param[in]	 value	value to be removed */
+  void remove_within_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t new_range{range->first, range->last};
+    if (value == range->first)
+    {
+      if (range->first == range->last)
+      {
+        ranges.erase(range);
+        return;
+      }
+      else
+        new_range.first++;
+    }
+    else if (value == range->last)
+      new_range.last--;
+    else if (range->first < value && range->last > value)
+      return split_range(range, value);
+
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
+
+  /** Remove the value from the ranges.
+  @param[in]	value	Value to be removed. */
+  void remove_value(uint32_t value)
+  {
+    if (empty())
+      return;
+    range_t new_range {value, value};
+    range_set_t::iterator range= ranges.lower_bound(new_range);
+    if (range == ranges.end())
+      return remove_within_range(std::prev(range), value);
+
+    if (range->first > value && range != ranges.begin())
+      /* Iterate the previous ranges to delete */
+      return remove_within_range(std::prev(range), value);
+    return remove_within_range(range, value);
+  }
+  /** Add the value within the existing range
+  @param[in]	range	range to be modified
+  @param[in]	value	value to be added */
+  range_set_t::iterator add_within_range(range_set_t::iterator range,
+                                         uint32_t value)
+  {
+    if (range->first <= value && range->last >= value)
+      return range;
+
+    range_t new_range{range->first, range->last};
+    if (range->last + 1 == value)
+      new_range.last++;
+    else if (range->first - 1 == value)
+      new_range.first--;
+    else return ranges.end();
+    ranges.erase(range);
+    return ranges.emplace(new_range).first;
+  }
+  /** Add a range; only new_range.first is merged into a neighbouring range.
+  @param[in]	new_range	range to be added (add_value() passes first == last) */
+  void add_range(range_t new_range)
+  {
+    auto r_offset= ranges.lower_bound(new_range);
+    auto r_begin= ranges.begin();
+    auto r_end= ranges.end();
+    if (!ranges.size())
+    {
+new_range:
+      ranges.emplace(new_range);
+      return;
+    }
+
+    if (r_offset == r_end)
+    {
+      /* last range */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset == r_begin)
+    {
+      /* First range */
+      if (add_within_range(r_offset, new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset->first - 1 == new_range.first)
+    {
+      /* Change starting of the existing range */
+      auto r_value= add_within_range(r_offset, new_range.first);
+      if (r_value != ranges.begin())
+        merge_range(r_value, std::prev(r_value));
+    }
+    else
+    {
+      /* Try the previous range; insert a separate range if it cannot grow */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+  }
+
+  /** Add the value in the ranges
+  @param[in] value  value to be added */
+  void add_value(uint32_t value)
+  {
+    range_t new_range{value, value};
+    add_range(new_range);
+  }
+  /** Remove value if some range contains it. @return whether it was found */
+  bool remove_if_exists(uint32_t value)
+  {
+    auto r_offset= find(value);
+    if (r_offset != ranges.end())
+    {
+      remove_within_range(r_offset, value);
+      return true;
+    }
+    return false;
+  }
+  /** @return whether some range contains value */
+  bool contains(uint32_t value) const
+  {
+    return find(value) != ranges.end();
+  }
 
+  ulint size() const { return ranges.size(); }
+  void clear() { ranges.clear(); }
+  bool empty() const { return ranges.empty(); }
+  typename range_set_t::iterator begin() { return ranges.begin(); }
+  typename range_set_t::iterator end() { return ranges.end(); }
+};
 #endif
 
 /** Tablespace or log data space */
 #ifndef UNIV_INNOCHECKSUM
-struct fil_space_t : ilist_node<unflushed_spaces_tag_t>,
-                     ilist_node<rotation_list_tag_t>
+struct fil_io_t
+{
+  /** error code */
+  dberr_t err;
+  /** file; node->space->release() must follow IORequestRead call */
+  fil_node_t *node;
+};
+
+/** Tablespace encryption mode */
+enum fil_encryption_t
+{
+  /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
+  FIL_ENCRYPTION_DEFAULT,
+  /** Encrypted */
+  FIL_ENCRYPTION_ON,
+  /** Not encrypted */
+  FIL_ENCRYPTION_OFF
+};
+
+struct fil_space_t final :
+  ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t>
 #else
-struct fil_space_t
+struct fil_space_t final
 #endif
 {
 #ifndef UNIV_INNOCHECKSUM
+  friend fil_node_t;
 	ulint		id;	/*!< space id */
 	hash_node_t	hash;	/*!< hash chain node */
 	char*		name;	/*!< Tablespace name */
@@ -99,67 +350,50 @@ struct fil_space_t
 				Protected by log_sys.mutex.
 				If and only if this is nonzero, the
 				tablespace will be in named_spaces. */
-	/** Log sequence number of the latest MLOG_INDEX_LOAD record
-	that was found while parsing the redo log */
-	lsn_t		enable_lsn;
 	/** whether undo tablespace truncation is in progress */
 	bool		is_being_truncated;
-#ifdef UNIV_DEBUG
-	/** reference count for operations who want to skip redo log in the
-	file space in order to make modify_check() pass. */
-	Atomic_counter<ulint> redo_skipped_count;
-#endif
 	fil_type_t	purpose;/*!< purpose */
 	UT_LIST_BASE_NODE_T(fil_node_t) chain;
 				/*!< base node for the file chain */
-	ulint		size;	/*!< tablespace file size in pages;
+	uint32_t	size;	/*!< tablespace file size in pages;
 				0 if not known yet */
-	ulint		size_in_header;
+	uint32_t	size_in_header;
 				/* FSP_SIZE in the tablespace header;
 				0 if not known yet */
-	ulint		free_len;
+	uint32_t	free_len;
 				/*!< length of the FSP_FREE list */
-	ulint		free_limit;
+	uint32_t	free_limit;
 				/*!< contents of FSP_FREE_LIMIT */
-	ulint		recv_size;
+	uint32_t	recv_size;
 				/*!< recovered tablespace size in pages;
 				0 if no size change was read from the redo log,
 				or if the size change was implemented */
-  /** the committed size of the tablespace in pages */
-  Atomic_relaxed<ulint> committed_size;
-	ulint		n_reserved_extents;
+	uint32_t	n_reserved_extents;
 				/*!< number of reserved free extents for
 				ongoing operations like B-tree page split */
-	ulint		n_pending_flushes; /*!< this is positive when flushing
-				the tablespace to disk; dropping of the
-				tablespace is forbidden if this is positive */
 private:
-  /** Number of pending buffer pool operations accessing the
-  tablespace without holding a table lock or dict_operation_lock
-  S-latch that would prevent the table (and tablespace) from being
-  dropped. An example is change buffer merge.
-
-  The tablespace cannot be dropped while this is nonzero, or while
-  fil_node_t::n_pending is nonzero.
-
-  The most significant bit contains the STOP_NEW_OPS flag. */
-  Atomic_relaxed<size_t> n_pending_ops;
-
-  /** Flag in n_pending_ops that indicates that the tablespace is being
+  /** the committed size of the tablespace in pages */
+  Atomic_relaxed<uint32_t> committed_size;
+  /** Number of pending operations on the file.
+  The tablespace cannot be freed while (n_pending & PENDING) != 0. */
+  std::atomic<uint32_t> n_pending;
+  /** Flag in n_pending that indicates that the tablespace is being
   deleted, and no further operations should be performed */
-  static const size_t STOP_NEW_OPS= ~(~size_t(0) >> 1);
+  static constexpr uint32_t STOPPING= 1U << 31;
+  /** Flag in n_pending that indicates that the tablespace is a candidate
+  for being closed, and fil_node_t::is_open() can only be trusted after
+  acquiring fil_system.mutex and resetting the flag */
+  static constexpr uint32_t CLOSING= 1U << 30;
+  /** Flag in n_pending that indicates that the tablespace needs fsync().
+  This must be the least significant flag bit; @see release_flush() */
+  static constexpr uint32_t NEEDS_FSYNC= 1U << 29;
+  /** The reference count */
+  static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
 public:
-	/** Number of pending block read or write operations
-	(when a write is imminent or a read has recently completed).
-	The tablespace object cannot be freed while this is nonzero,
-	but it can be detached from fil_system.
-	Note that fil_node_t::n_pending tracks actual pending I/O requests.
-	Protected by fil_system.mutex and std::atomic. */
-	std::atomic<ulint>		n_pending_ios;
 	rw_lock_t	latch;	/*!< latch protecting the file space storage
 				allocation */
 	UT_LIST_NODE_T(fil_space_t) named_spaces;
-				/*!< list of spaces for which MLOG_FILE_NAME
+				/*!< list of spaces for which FILE_MODIFY
 				records have been issued */
 	UT_LIST_NODE_T(fil_space_t) space_list;
 				/*!< list of all spaces */
@@ -180,24 +414,25 @@ public:
 	punch hole */
 	bool		punch_hole;
 
+	/** mutex to protect freed ranges */
+	std::mutex	freed_range_mutex;
+
+	/** Variables to store freed ranges. This can be used to write
+	zeroes/punch the hole in files. Protected by freed_range_mutex */
+	range_set	freed_ranges;
+
+	/** Stores last page freed lsn. Protected by freed_range_mutex */
+	lsn_t		last_freed_lsn;
+
 	ulint		magic_n;/*!< FIL_SPACE_MAGIC_N */
 
-  /** Clamp a page number for batched I/O, such as read-ahead.
-  @param offset   page number limit
-  @return offset clamped to the tablespace size */
-  ulint max_page_number_for_io(ulint offset) const
+  /** @return whether doublewrite buffering is needed */
+  bool use_doublewrite() const
   {
-    const ulint limit= committed_size;
-    return limit > offset ? offset : limit;
+    return !atomic_write_supported && srv_use_doublewrite_buf &&
+      buf_dblwr.is_initialised();
   }
 
-	/** @return whether doublewrite buffering is needed */
-	bool use_doublewrite() const
-	{
-		return !atomic_write_supported
-			&& srv_use_doublewrite_buf && buf_dblwr;
-	}
-
 	/** Append a file to the chain of files of a space.
 	@param[in]	name		file name of a file that is not open
 	@param[in]	handle		file handle, or OS_FILE_CLOSED
@@ -205,11 +440,11 @@ public:
 	@param[in]	is_raw		whether this is a raw device
 	@param[in]	atomic_write	true if atomic write could be enabled
 	@param[in]	max_pages	maximum number of pages in file,
-	or ULINT_MAX for unlimited
+	or UINT32_MAX for unlimited
 	@return file object */
 	fil_node_t* add(const char* name, pfs_os_file_t handle,
-			ulint size, bool is_raw, bool atomic_write,
-			ulint max_pages = ULINT_MAX);
+			uint32_t size, bool is_raw, bool atomic_write,
+			uint32_t max_pages = UINT32_MAX);
 #ifdef UNIV_DEBUG
 	/** Assert that the mini-transaction is compatible with
 	updating an allocation bitmap page.
@@ -221,7 +456,7 @@ public:
 	@param[in]	n_free_now	current number of free extents
 	@param[in]	n_to_reserve	number of extents to reserve
 	@return	whether the reservation succeeded */
-	bool reserve_free_extents(ulint n_free_now, ulint n_to_reserve)
+	bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
 	{
 		ut_ad(rw_lock_own(&latch, RW_LOCK_X));
 		if (n_reserved_extents + n_to_reserve > n_free_now) {
@@ -234,7 +469,7 @@ public:
 
 	/** Release the reserved free extents.
 	@param[in]	n_reserved	number of reserved extents */
-	void release_free_extents(ulint n_reserved)
+	void release_free_extents(uint32_t n_reserved)
 	{
 		if (!n_reserved) return;
 		ut_ad(rw_lock_own(&latch, RW_LOCK_X));
@@ -252,59 +487,141 @@ public:
 	dberr_t rename(const char* name, const char* path, bool log,
 		       bool replace = false);
 
-	/** Note that the tablespace has been imported.
-	Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
-	written while the space ID is being updated in each page. */
-	inline void set_imported();
-
-	/** @return whether the storage device is rotational (HDD, not SSD) */
-	inline bool is_rotational() const;
+  /** Note that the tablespace has been imported.
+  Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
+  written while the space ID is being updated in each page. */
+  inline void set_imported();
 
-	/** Open each file. Only invoked on fil_system.temp_space.
-	@return whether all files were opened */
-	bool open();
-	/** Close each file. Only invoked on fil_system.temp_space. */
-	void close();
+  /** @return whether the storage device is rotational (HDD, not SSD) */
+  inline bool is_rotational() const;
 
-  /** @return whether the tablespace is about to be dropped */
-  bool is_stopping() const { return n_pending_ops & STOP_NEW_OPS; }
-
-  /** @return number of references being held */
-  size_t referenced() const { return n_pending_ops & ~STOP_NEW_OPS; }
+  /** Open each file. Never invoked on .ibd files.
+  @param create_new_db    whether to skip the call to fil_node_t::read_page0()
+  @return whether all files were opened */
+  bool open(bool create_new_db);
+  /** Close each file. Only invoked on fil_system.temp_space. */
+  void close();
 
   /** Note that operations on the tablespace must stop or can resume */
-  void set_stopping(bool stopping)
+  inline void set_stopping(bool stopping);
+
+private:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Try to acquire a tablespace reference.
+  @return the old reference count (if STOPPING is set, it was not acquired) */
+  uint32_t acquire_low()
   {
-    ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS);
-    ut_ad(!(n & STOP_NEW_OPS) == stopping);
+    uint32_t n= 0;
+    while (!n_pending.compare_exchange_strong(n, n + 1,
+                                              std::memory_order_acquire,
+                                              std::memory_order_relaxed) &&
+           !(n & STOPPING));
+    return n;
   }
+public:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference.
+  @return whether a tablespace reference was successfully acquired */
+  inline bool acquire_if_not_stopped();
 
   MY_ATTRIBUTE((warn_unused_result))
-  /** @return whether a tablespace reference was successfully acquired */
+  /** Acquire a tablespace reference for I/O.
+  @return whether the file is usable */
   bool acquire()
   {
-    size_t n= 0;
-    while (!n_pending_ops.compare_exchange_strong(n, n + 1,
-                                                  std::memory_order_acquire,
-                                                  std::memory_order_relaxed))
-      if (UNIV_UNLIKELY(n & STOP_NEW_OPS))
-        return false;
-    return true;
+    uint32_t n= acquire_low();
+    if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+      return true;
+    return UNIV_LIKELY(!(n & STOPPING)) && prepare();
   }
+
+  /** Acquire another tablespace reference for I/O. */
+  inline void reacquire();
+
   /** Release a tablespace reference.
   @return whether this was the last reference */
   bool release()
   {
-    auto n= n_pending_ops.fetch_sub(1);
-    ut_ad(n & ~STOP_NEW_OPS);
-    return (n & ~STOP_NEW_OPS) == 1;
+    uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
+    ut_ad(n & PENDING);
+    return (n & PENDING) == 1;
+  }
+
+  /** Clear the NEEDS_FSYNC flag */
+  void clear_flush()
+  { n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release); }
+
+private:
+  /** @return pending operations (and flags) */
+  uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
+public:
+  /** @return whether close() of the file handle has been requested */
+  bool is_closing() const { return pending() & CLOSING; }
+  /** @return whether the tablespace is going to be dropped */
+  bool is_stopping() const { return pending() & STOPPING; }
+  /** @return whether close() was requested and no operations are pending */
+  bool is_ready_to_close() const
+  { return (pending() & (PENDING | CLOSING)) == CLOSING; }
+  /** @return whether fsync() or similar is needed */
+  bool needs_flush() const { return pending() & NEEDS_FSYNC; }
+  /** @return whether fsync() or similar is needed, and the tablespace is
+  not being dropped  */
+  bool needs_flush_not_stopping() const
+  { return (pending() & (NEEDS_FSYNC | STOPPING)) == NEEDS_FSYNC; }
+
+  uint32_t referenced() const { return pending() & PENDING; }
+private:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Prepare to close the file handle.
+  @return number of pending operations, possibly with NEEDS_FSYNC flag */
+  uint32_t set_closing()
+  {
+    return n_pending.fetch_or(CLOSING, std::memory_order_acquire) &
+      (PENDING | NEEDS_FSYNC);
+  }
+
+public:
+  /** Try to close a file to adhere to the innodb_open_files limit.
+  @param print_info   whether to diagnose why a file cannot be closed
+  @return whether a file was closed */
+  static bool try_to_close(bool print_info);
+
+  /** Close all tablespace files at shutdown */
+  static void close_all();
+
+  /** @return last_freed_lsn */
+  lsn_t get_last_freed_lsn() { return last_freed_lsn; }
+  /** Update last_freed_lsn */
+  void update_last_freed_lsn(lsn_t lsn)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    last_freed_lsn= lsn;
+  }
+
+  /** Note that the file will need fsync().
+  @return whether this needs to be added to fil_system.unflushed_spaces */
+  bool set_needs_flush()
+  {
+    uint32_t n= 1;
+    while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+                                              std::memory_order_acquire,
+                                              std::memory_order_relaxed))
+    {
+      ut_ad(n & PENDING);
+      if (n & (NEEDS_FSYNC | STOPPING))
+        return false;
+    }
+
+    return true;
+  }
+
+  /** Clear all freed ranges for undo tablespace when InnoDB
+  encounters TRIM redo log record */
+  void clear_freed_ranges()
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    freed_ranges.clear();
   }
-  /** Acquire a tablespace reference for I/O. */
-  void acquire_for_io() { n_pending_ios++; }
-  /** Release a tablespace reference for I/O. */
-  void release_for_io() { ut_d(auto n=) n_pending_ios--; ut_ad(n); }
-  /** @return whether I/O is pending */
-  bool pending_io() const { return n_pending_ios; }
 #endif /* !UNIV_INNOCHECKSUM */
 	/** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
 	check fsp0types.h to more info about flags. */
@@ -393,11 +710,11 @@ public:
   @param[in]	flags	tablespace flags */
   static bool is_compressed(ulint flags)
   {
-    return is_full_crc32_compressed(flags)
-        || FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
+    return is_full_crc32_compressed(flags) ||
+      FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
   }
-	/** @return whether the compression enabled for the tablespace. */
-	bool is_compressed() const { return is_compressed(flags); }
+  /** @return whether the compression enabled for the tablespace. */
+  bool is_compressed() const { return is_compressed(flags); }
 
 	/** Get the compression algorithm for full crc32 format.
 	@param[in]	flags	tablespace flags
@@ -580,6 +897,110 @@ public:
 		return(ssize == 0 || !is_ibd
 		       || srv_page_size != UNIV_PAGE_SIZE_ORIG);
 	}
+
+#ifndef UNIV_INNOCHECKSUM
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Create a tablespace in fil_system.
+  @param name       tablespace name
+  @param id         tablespace identifier
+  @param flags      tablespace flags
+  @param purpose    tablespace purpose
+  @param crypt_data encryption information
+  @param mode       encryption mode
+  @return pointer to created tablespace, to be filled in with add()
+  @retval nullptr on failure (such as when the same tablespace exists) */
+  static fil_space_t *create(const char *name, ulint id, ulint flags,
+                             fil_type_t purpose, fil_space_crypt_t *crypt_data,
+                             fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT);
+
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference.
+  @param id      tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get(ulint id);
+
+  /** Add/remove the free page in the freed ranges list.
+  @param[in] offset     page number to be added or removed
+  @param[in] add        whether to add (true) or remove (false) the page */
+  void free_page(uint32_t offset, bool add=true)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    if (add)
+      return freed_ranges.add_value(offset);
+
+    if (freed_ranges.empty())
+      return;
+
+    return freed_ranges.remove_value(offset);
+  }
+
+  /** Replace the freed ranges with the given set of ranges */
+  void add_free_ranges(range_set ranges)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    freed_ranges= std::move(ranges);
+  }
+
+  /** Add a range of freed pages */
+  void add_free_range(const range_t range)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    freed_ranges.add_range(range);
+  }
+
+  /** Set the tablespace size in pages */
+  void set_sizes(uint32_t s)
+  {
+    ut_ad(id ? !size : (size >= s));
+    size= s; committed_size= s;
+  }
+
+  /** Update committed_size in mtr_t::commit() */
+  void set_committed_size()
+  {
+    ut_ad(rw_lock_own(&latch, RW_LOCK_X));
+    committed_size= size;
+  }
+
+  /** @return the last persisted page number */
+  uint32_t last_page_number() const { return committed_size - 1; }
+
+  /** @return the size in pages (0 if unreadable) */
+  inline uint32_t get_size();
+
+  /** Read or write data.
+  @param type     I/O context
+  @param offset   offset in bytes
+  @param len      number of bytes
+  @param buf      the data to be read or written
+  @param bpage    buffer block (for type.is_async() completion callback)
+  @return status and file descriptor */
+  fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
+              void *buf, buf_page_t *bpage= nullptr);
+  /** Flush pending writes from the file system cache to the file. */
+  template<bool have_reference> inline void flush();
+  /** Flush pending writes from the file system cache to the file. */
+  void flush_low();
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
+
+  /** Determine the next tablespace for encryption key rotation.
+  @param space    current tablespace (nullptr to start from the beginning)
+  @param recheck  whether the removal condition needs to be rechecked after
+                  encryption parameters were changed
+  @param encrypt  expected state of innodb_encrypt_tables
+  @return the next tablespace
+  @retval nullptr upon reaching the end of the iteration */
+  static inline fil_space_t *next(fil_space_t *space, bool recheck,
+                                  bool encrypt);
+
+private:
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool prepare(bool have_mutex= false);
+#endif /*!UNIV_INNOCHECKSUM */
 };
 
 #ifndef UNIV_INNOCHECKSUM
@@ -587,7 +1008,8 @@ public:
 #define	FIL_SPACE_MAGIC_N	89472
 
 /** File node of a tablespace or the log data space */
-struct fil_node_t {
+struct fil_node_t final
+{
 	/** tablespace containing this file */
 	fil_space_t*	space;
 	/** file name; protected by fil_system.mutex and log_sys.mutex. */
@@ -601,24 +1023,16 @@ struct fil_node_t {
 	/** size of the file in database pages (0 if not known yet);
 	the possible last incomplete megabyte may be ignored
 	if space->id == 0 */
-	ulint		size;
+	uint32_t	size;
 	/** initial size of the file in database pages;
 	FIL_IBD_FILE_INITIAL_SIZE by default */
-	ulint		init_size;
+	uint32_t	init_size;
 	/** maximum size of the file in database pages (0 if unlimited) */
-	ulint		max_size;
-	/** count of pending i/o's; is_open must be true if nonzero */
-	ulint		n_pending;
-	/** count of pending flushes; is_open must be true if nonzero */
-	ulint		n_pending_flushes;
+	uint32_t	max_size;
 	/** whether the file is currently being extended */
-	bool		being_extended;
-	/** whether this file had writes after lasy fsync() */
-	bool		needs_flush;
+	Atomic_relaxed<bool> being_extended;
 	/** link to other files in this tablespace */
 	UT_LIST_NODE_T(fil_node_t) chain;
-	/** link to the fil_system.LRU list (keeping track of open files) */
-	UT_LIST_NODE_T(fil_node_t) LRU;
 
 	/** whether this file could use atomic write (data file) */
 	bool		atomic_write;
@@ -636,9 +1050,8 @@ struct fil_node_t {
 	}
 
 	/** Read the first page of a data file.
-	@param[in]	first	whether this is the very first read
 	@return	whether the page was found valid */
-	bool read_page0(bool first);
+	bool read_page0();
 
 	/** Determine some file metadata when creating or reading the file.
 	@param	file	the file that is being created, or OS_FILE_CLOSED */
@@ -648,8 +1061,21 @@ struct fil_node_t {
 #endif
 			   );
 
-	/** Close the file handle. */
-	void close();
+  /** Close the file handle. */
+  void close();
+  /** Same as close() but returns file handle instead of closing it. */
+  pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result));
+  /** Prepare to free a file from fil_system.
+  @param detach_handle whether to detach instead of closing a handle
+  @return detached handle or OS_FILE_CLOSED */
+  inline pfs_os_file_t close_to_free(bool detach_handle= false);
+
+  /** Update the data structures on write completion */
+  inline void complete_write();
+
+private:
+  /** Does stuff common for close() and detach() */
+  void prepare_to_close_or_detach();
 };
 
 /** Value of fil_node_t::magic_n */
@@ -657,20 +1083,18 @@ struct fil_node_t {
 
 inline void fil_space_t::set_imported()
 {
-	ut_ad(purpose == FIL_TYPE_IMPORT);
-	purpose = FIL_TYPE_TABLESPACE;
-	UT_LIST_GET_FIRST(chain)->find_metadata();
+  ut_ad(purpose == FIL_TYPE_IMPORT);
+  purpose= FIL_TYPE_TABLESPACE;
+  UT_LIST_GET_FIRST(chain)->find_metadata();
 }
 
 inline bool fil_space_t::is_rotational() const
 {
-	for (const fil_node_t* node = UT_LIST_GET_FIRST(chain);
-	     node != NULL; node = UT_LIST_GET_NEXT(chain, node)) {
-		if (!node->on_ssd) {
-			return true;
-		}
-	}
-	return false;
+  for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+       node= UT_LIST_GET_NEXT(chain, node))
+    if (!node->on_ssd)
+      return true;
+  return false;
 }
 
 /** Common InnoDB file extensions */
@@ -689,13 +1113,6 @@ extern const char* dot_ext[];
 but in the MySQL Embedded Server Library and mysqlbackup it is not the default
 directory, and we must set the base file path explicitly */
 extern const char*	fil_path_to_mysql_datadir;
-
-/* Space address data type; this is intended to be used when
-addresses accurate to a byte are stored in file pages. If the page part
-of the address is FIL_NULL, the address is considered undefined. */
-
-typedef	byte	fil_faddr_t;	/*!< 'type' definition in C: an address
-				stored in a file page is a string of bytes */
 #else
 # include "univ.i"
 #endif /* !UNIV_INNOCHECKSUM */
@@ -713,8 +1130,10 @@ typedef	byte	fil_faddr_t;	/*!< 'type' definition in C: an address
 
 /** File space address */
 struct fil_addr_t {
-	ulint	page;		/*!< page number within a space */
-	ulint	boffset;	/*!< byte offset within the page */
+  /** page number within a tablespace */
+  uint32_t page;
+  /** byte offset within the page */
+  uint16_t boffset;
 };
 
 /** The byte offsets on a file page for various variables @{ */
@@ -814,50 +1233,63 @@ not using full_crc32 */
 
 /** File page types (values of FIL_PAGE_TYPE) @{ */
 /** page_compressed, encrypted=YES (not used for full_crc32) */
-#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401;
 /** page_compressed (not used for full_crc32) */
-#define FIL_PAGE_PAGE_COMPRESSED 34354  /*!< page compressed page */
-#define FIL_PAGE_INDEX		17855	/*!< B-tree node */
-#define FIL_PAGE_RTREE		17854	/*!< R-tree node (SPATIAL INDEX) */
-#define FIL_PAGE_UNDO_LOG	2	/*!< Undo log page */
-#define FIL_PAGE_INODE		3	/*!< Index node */
-#define FIL_PAGE_IBUF_FREE_LIST	4	/*!< Insert buffer free list */
-/* File page types introduced in MySQL/InnoDB 5.1.7 */
-#define FIL_PAGE_TYPE_ALLOCATED	0	/*!< Freshly allocated page */
-#define FIL_PAGE_IBUF_BITMAP	5	/*!< Insert buffer bitmap */
-#define FIL_PAGE_TYPE_SYS	6	/*!< System page */
-#define FIL_PAGE_TYPE_TRX_SYS	7	/*!< Transaction system data */
-#define FIL_PAGE_TYPE_FSP_HDR	8	/*!< File space header */
-#define FIL_PAGE_TYPE_XDES	9	/*!< Extent descriptor page */
-#define FIL_PAGE_TYPE_BLOB	10	/*!< Uncompressed BLOB page */
-#define FIL_PAGE_TYPE_ZBLOB	11	/*!< First compressed BLOB page */
-#define FIL_PAGE_TYPE_ZBLOB2	12	/*!< Subsequent compressed BLOB page */
-#define FIL_PAGE_TYPE_UNKNOWN	13	/*!< In old tablespaces, garbage
-					in FIL_PAGE_TYPE is replaced with this
-					value when flushing pages. */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354;
+/** B-tree index page */
+constexpr uint16_t FIL_PAGE_INDEX= 17855;
+/** R-tree index page (SPATIAL INDEX) */
+constexpr uint16_t FIL_PAGE_RTREE= 17854;
+/** Undo log page */
+constexpr uint16_t FIL_PAGE_UNDO_LOG= 2;
+/** Index node (of file-in-file metadata) */
+constexpr uint16_t FIL_PAGE_INODE= 3;
+/** Insert buffer free list */
+constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4;
+/** Freshly allocated page */
+constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0;
+/** Change buffer bitmap (pages n*innodb_page_size+1) */
+constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5;
+/** System page */
+constexpr uint16_t FIL_PAGE_TYPE_SYS= 6;
+/** Transaction system data */
+constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7;
+/** Tablespace header (page 0) */
+constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8;
+/** Extent descriptor page (pages n*innodb_page_size, except 0) */
+constexpr uint16_t FIL_PAGE_TYPE_XDES= 9;
+/** Uncompressed BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10;
+/** First ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 11;
+/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12;
+/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this
+value when flushing pages. */
+constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13;
 
 /* File page types introduced in MySQL 5.7, not supported in MariaDB */
-//#define FIL_PAGE_COMPRESSED	14
-//#define FIL_PAGE_ENCRYPTED	15
-//#define FIL_PAGE_COMPRESSED_AND_ENCRYPTED 16
-//#define FIL_PAGE_ENCRYPTED_RTREE 17
+//constexpr uint16_t FIL_PAGE_COMPRESSED = 14;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15;
+//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED_RTREE = 17;
 /** Clustered index root page after instant ADD COLUMN */
-#define FIL_PAGE_TYPE_INSTANT	18
+constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18;
 
 /** Used by i_s.cc to index into the text description.
 Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */
-#define FIL_PAGE_TYPE_LAST	FIL_PAGE_TYPE_UNKNOWN
-					/*!< Last page type */
-/** Set in FIL_PAGE_TYPE if for full_crc32 pages in page_compressed format.
+constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN;
+
+/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format.
 If the flag is set, then the following holds for the remaining bits
 of FIL_PAGE_TYPE:
 Bits 0..7 will contain the compressed page size in bytes.
 Bits 8..14 are reserved and must be 0. */
-#define FIL_PAGE_COMPRESS_FCRC32_MARKER	15
+constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15;
 /* @} */
 
 /** @return whether the page type is B-tree or R-tree index */
-inline bool fil_page_type_is_index(ulint page_type)
+inline bool fil_page_type_is_index(uint16_t page_type)
 {
 	switch (page_type) {
 	case FIL_PAGE_TYPE_INSTANT:
@@ -873,33 +1305,25 @@ index */
 #define fil_page_index_page_check(page)                         \
         fil_page_type_is_index(fil_page_get_type(page))
 
-/** Enum values for encryption table option */
-enum fil_encryption_t {
-	/** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
-	FIL_ENCRYPTION_DEFAULT,
-	/** Encrypted */
-	FIL_ENCRYPTION_ON,
-	/** Not encrypted */
-	FIL_ENCRYPTION_OFF
-};
+/** Get the file page type.
+@param[in]	page	file page
+@return page type */
+inline uint16_t fil_page_get_type(const byte *page)
+{
+  return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE));
+}
 
 #ifndef UNIV_INNOCHECKSUM
 
-/** The number of fsyncs done to the log */
-extern ulint	fil_n_log_flushes;
-
-/** Number of pending redo log flushes */
-extern ulint	fil_n_pending_log_flushes;
 /** Number of pending tablespace flushes */
-extern ulint	fil_n_pending_tablespace_flushes;
+extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
 
 /** Look up a tablespace.
 The caller should hold an InnoDB table lock or a MDL that prevents
 the tablespace from being dropped during the operation,
 or the caller should be in single-threaded crash recovery mode
 (no user connections that could drop tablespaces).
-If this is not the case, fil_space_acquire() and fil_space_t::release()
-should be used instead.
+Normally, fil_space_t::get() should be used instead.
 @param[in]	id	tablespace ID
 @return tablespace, or NULL if not found */
 fil_space_t*
@@ -908,8 +1332,7 @@ fil_space_get(
 	MY_ATTRIBUTE((warn_unused_result));
 
 /** The tablespace memory cache; also the totality of logs (the log
-data space) is stored here; below we talk about tablespaces, but also
-the ib_logfiles form a 'space' and it is handled here */
+data space) is stored here; below we talk about tablespaces */
 struct fil_system_t {
   /**
     Constructor.
@@ -919,7 +1342,6 @@ struct fil_system_t {
   */
   fil_system_t(): m_initialised(false)
   {
-    UT_LIST_INIT(LRU, &fil_node_t::LRU);
     UT_LIST_INIT(space_list, &fil_space_t::space_list);
     UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces);
   }
@@ -955,41 +1377,38 @@ public:
   }
 #endif
 public:
+  /** Detach a tablespace from the cache and close the files.
+  @param space tablespace
+  @param detach_handle whether to detach or close handles
+  @return detached handles or empty vector */
+  std::vector<pfs_os_file_t> detach(fil_space_t *space,
+                                    bool detach_handle= false);
+
 	ib_mutex_t	mutex;		/*!< The mutex protecting the cache */
 	fil_space_t*	sys_space;	/*!< The innodb_system tablespace */
 	fil_space_t*	temp_space;	/*!< The innodb_temporary tablespace */
-	hash_table_t*	spaces;		/*!< The hash table of spaces in the
-					system; they are hashed on the space
-					id */
-	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
-					/*!< base node for the LRU list of the
-					most recently used open files with no
-					pending i/o's; if we start an i/o on
-					the file, we first remove it from this
-					list, and return it to the start of
-					the list when the i/o ends;
-					log files and the system tablespace are
-					not put to this list: they are opened
-					after the startup, and kept open until
-					shutdown */
-	sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
-					/*!< list of those
-					tablespaces whose files contain
-					unflushed writes; those spaces have
-					at least one file node where
-					needs_flush == true */
-	ulint		n_open;		/*!< number of files currently open */
+  /** Map of fil_space_t::id to fil_space_t* */
+  hash_table_t spaces;
+  /** tablespaces for which fil_space_t::needs_flush() holds */
+  sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
+  /** number of currently open files; protected by mutex */
+  ulint n_open;
+  /** last time we noted n_open exceeding the limit; protected by mutex */
+  time_t n_open_exceeded_time;
 	ulint		max_assigned_id;/*!< maximum space id in the existing
 					tables, or assigned during the time
 					mysqld has been up; at an InnoDB
 					startup we scan the data dictionary
 					and set here the maximum of the
 					space id's of the tables there */
+  /** nonzero if fil_node_open_file_low() should avoid moving the tablespace
+  to the end of space_list, for FIFO policy of try_to_close() */
+  ulint freeze_space_list;
 	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
 					/*!< list of all file spaces */
 	UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
 					/*!< list of all file spaces
-					for which a MLOG_FILE_NAME
+					for which a FILE_MODIFY
 					record has been written since
 					the latest redo log checkpoint.
 					Protected only by log_sys.mutex. */
@@ -998,51 +1417,77 @@ public:
 	ilist<fil_space_t, rotation_list_tag_t> default_encrypt_tables;
 
 	bool		space_id_reuse_warned;
-					/*!< whether fil_space_create()
+					/*!< whether fil_space_t::create()
 					has issued a warning about
 					potential space_id reuse */
 
-	/** Trigger a call to fil_node_t::read_page0()
-	@param[in]	id	tablespace identifier
-	@return	tablespace
-	@retval	NULL	if the tablespace does not exist or cannot be read */
-	fil_space_t* read_page0(ulint id);
-
   /** Return the next tablespace from default_encrypt_tables list.
-  @param space   previous tablespace (NULL to start from the start)
+  @param space   previous tablespace (nullptr to start from the start)
   @param recheck whether the removal condition needs to be rechecked after
   the encryption parameters were changed
   @param encrypt expected state of innodb_encrypt_tables
   @return the next tablespace to process (n_pending_ops incremented)
-  @retval NULL if this was the last */
-  inline fil_space_t* default_encrypt_next(
-    fil_space_t *space, bool recheck, bool encrypt);
+  @retval fil_system.temp_space if there is no work to do
+  @retval nullptr upon reaching the end of the iteration */
+  inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck,
+                                           bool encrypt);
+
+  /** Extend all open data files to the recovered size */
+  ATTRIBUTE_COLD void extend_to_recv_size();
 };
 
 /** The tablespace memory cache. */
 extern fil_system_t	fil_system;
 
-#include "fil0crypt.h"
+inline void fil_space_t::reacquire()
+{
+  ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
+  ut_d(if (mutex_own(&fil_system.mutex)) return);
+  ut_ad(n & PENDING);
+  ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+}
 
-/** Create a space memory object and put it to the fil_system hash table.
-Error messages are issued to the server log.
-@param[in]	name		tablespace name
-@param[in]	id		tablespace identifier
-@param[in]	flags		tablespace flags
-@param[in]	purpose		tablespace purpose
-@param[in,out]	crypt_data	encryption information
-@param[in]	mode		encryption mode
-@return pointer to created tablespace, to be filled in with fil_space_t::add()
-@retval NULL on failure (such as when the same tablespace exists) */
-fil_space_t*
-fil_space_create(
-	const char*		name,
-	ulint			id,
-	ulint			flags,
-	fil_type_t		purpose,
-	fil_space_crypt_t*	crypt_data,
-	fil_encryption_t	mode = FIL_ENCRYPTION_DEFAULT)
-	MY_ATTRIBUTE((warn_unused_result));
+/** Note that operations on the tablespace must stop or can resume */
+inline void fil_space_t::set_stopping(bool stopping)
+{
+  ut_ad(mutex_own(&fil_system.mutex));
+  ut_d(auto n=) n_pending.fetch_xor(STOPPING, std::memory_order_relaxed);
+  ut_ad(!(n & STOPPING) == stopping);
+}
+
+/** Flush pending writes from the file system cache to the file. */
+template<bool have_reference> inline void fil_space_t::flush()
+{
+  ut_ad(!mutex_own(&fil_system.mutex));
+  ut_ad(!have_reference || (pending() & PENDING));
+  ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
+  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+  {
+    ut_ad(!is_in_unflushed_spaces);
+    ut_ad(!needs_flush());
+  }
+  else if (have_reference)
+    flush_low();
+  else if (!(acquire_low() & STOPPING))
+  {
+    flush_low();
+    release();
+  }
+}
+
+/** @return the size in pages (0 if unreadable) */
+inline uint32_t fil_space_t::get_size()
+{
+  if (!size)
+  {
+    mutex_enter(&fil_system.mutex);
+    read_page0();
+    mutex_exit(&fil_system.mutex);
+  }
+  return size;
+}
+
+#include "fil0crypt.h"
 
 /*******************************************************************//**
 Assigns a new space id for a new single-table tablespace. This works simply by
@@ -1069,49 +1514,10 @@ fil_space_free(
 @param	id	tablespace ID
 @param	size	recovered size in pages
 @param	flags	tablespace flags */
-UNIV_INTERN
-void fil_space_set_recv_size_and_flags(ulint id, ulint size, uint32_t flags);
+void fil_space_set_recv_size_and_flags(ulint id, uint32_t size,
+                                       uint32_t flags);
 
 /*******************************************************************//**
-Returns the size of the space in pages. The tablespace must be cached in the
-memory cache.
-@return space size, 0 if space not found */
-ulint
-fil_space_get_size(
-/*===============*/
-	ulint	id);	/*!< in: space id */
-/*******************************************************************//**
-Returns the flags of the space. The tablespace must be cached
-in the memory cache.
-@return flags, ULINT_UNDEFINED if space not found */
-ulint
-fil_space_get_flags(
-/*================*/
-	ulint	id);	/*!< in: space id */
-
-/*******************************************************************//**
-Opens all log files and system tablespace data files. They stay open until the
-database server shutdown. This should be called at a server startup after the
-space objects for the log and the system tablespace have been created. The
-purpose of this operation is to make sure we never run out of file descriptors
-if we need to read from the insert buffer or to write to the log. */
-void
-fil_open_log_and_system_tablespace_files(void);
-/*==========================================*/
-/*******************************************************************//**
-Closes all open files. There must not be any pending i/o's or not flushed
-modifications in the files. */
-void
-fil_close_all_files(void);
-/*=====================*/
-/*******************************************************************//**
-Closes the redo log files. There must not be any pending i/o's or not
-flushed modifications in the files. */
-void
-fil_close_log_files(
-/*================*/
-	bool	free);	/*!< in: whether to free the memory object */
-/*******************************************************************//**
 Sets the max tablespace id counter if the given number is bigger than the
 previous value. */
 void
@@ -1128,82 +1534,14 @@ fil_write_flushed_lsn(
 	lsn_t	lsn)
 MY_ATTRIBUTE((warn_unused_result));
 
-/** Acquire a tablespace when it could be dropped concurrently.
-Used by background threads that do not necessarily hold proper locks
-for concurrency control.
-@param[in]	id	tablespace ID
-@param[in]	silent	whether to silently ignore missing tablespaces
-@return	the tablespace
-@retval	NULL if missing or being deleted */
-fil_space_t* fil_space_acquire_low(ulint id, bool silent)
-	MY_ATTRIBUTE((warn_unused_result));
-
-/** Acquire a tablespace when it could be dropped concurrently.
-Used by background threads that do not necessarily hold proper locks
-for concurrency control.
-@param[in]	id	tablespace ID
-@return	the tablespace
-@retval	NULL if missing or being deleted or truncated */
-inline
-fil_space_t*
-fil_space_acquire(ulint id)
-{
-	return (fil_space_acquire_low(id, false));
-}
-
-/** Acquire a tablespace that may not exist.
-Used by background threads that do not necessarily hold proper locks
-for concurrency control.
-@param[in]	id	tablespace ID
-@return	the tablespace
-@retval	NULL if missing or being deleted */
-inline
-fil_space_t*
-fil_space_acquire_silent(ulint id)
-{
-	return (fil_space_acquire_low(id, true));
-}
-
-/** Acquire a tablespace for reading or writing a block,
-when it could be dropped concurrently.
-@param[in]	id	tablespace ID
-@return	the tablespace
-@retval	NULL if missing */
-fil_space_t*
-fil_space_acquire_for_io(ulint id);
-
-/** Replay a file rename operation if possible.
-@param[in]	space_id	tablespace identifier
-@param[in]	first_page_no	first page number in the file
-@param[in]	name		old file name
-@param[in]	new_name	new file name
-@return	whether the operation was successfully applied
-(the name did not exist, or new_name did not exist and
-name was successfully renamed to new_name)  */
-bool
-fil_op_replay_rename(
-	ulint		space_id,
-	ulint		first_page_no,
-	const char*	name,
-	const char*	new_name)
-	MY_ATTRIBUTE((warn_unused_result));
-
-/** Determine whether a table can be accessed in operations that are
-not (necessarily) protected by meta-data locks.
-(Rollback would generally be protected, but rollback of
-FOREIGN KEY CASCADE/SET NULL is not protected by meta-data locks
-but only by InnoDB table locks, which may be broken by
-lock_remove_all_on_table().)
-@param[in]	table	persistent table
-checked @return whether the table is accessible */
-bool fil_table_accessible(const dict_table_t* table)
-	MY_ATTRIBUTE((warn_unused_result, nonnull));
-
 /** Delete a tablespace and associated .ibd file.
 @param[in]	id		tablespace identifier
 @param[in]	if_exists	whether to ignore missing tablespace
+@param[out]	detached_handles	return detached handles here
 @return	DB_SUCCESS or error */
-dberr_t fil_delete_tablespace(ulint id, bool if_exists= false);
+dberr_t
+fil_delete_tablespace(ulint id, bool if_exists= false,
+                      std::vector<pfs_os_file_t> *detached_handles= nullptr);
 
 /** Prepare to truncate an undo tablespace.
 @param[in]	space_id	undo tablespace id
@@ -1211,24 +1549,15 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists= false);
 @retval	NULL if the tablespace does not exist */
 fil_space_t* fil_truncate_prepare(ulint space_id);
 
-/** Write log about an undo tablespace truncate operation. */
-void fil_truncate_log(fil_space_t* space, ulint size, mtr_t* mtr)
-	MY_ATTRIBUTE((nonnull));
-
-/*******************************************************************//**
-Closes a single-table tablespace. The tablespace must be cached in the
-memory cache. Free all pages used by the tablespace.
-@return DB_SUCCESS or error */
-dberr_t
-fil_close_tablespace(
-/*=================*/
-	trx_t*	trx,	/*!< in/out: Transaction covering the close */
-	ulint	id);	/*!< in: space id */
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(ulint id);
 
 /*******************************************************************//**
 Allocates and builds a file name from a path, a table or tablespace name
 and a suffix. The string must be freed by caller with ut_free().
-@param[in] path NULL or the direcory path or the full path and filename.
+@param[in] path NULL or the directory path or the full path and filename.
 @param[in] name NULL if path is full, or Table/Tablespace name
 @param[in] suffix NULL or the file extention to use.
 @return own: file name */
@@ -1257,7 +1586,7 @@ fil_ibd_create(
 	const char*	name,
 	const char*	path,
 	ulint		flags,
-	ulint		size,
+	uint32_t	size,
 	fil_encryption_t mode,
 	uint32_t	key_id,
 	dberr_t*	err)
@@ -1340,7 +1669,7 @@ fil_ibd_load(
 memory cache. Note that if we have not done a crash recovery at the database
 startup, there may be many tablespaces which are not yet in the memory cache.
 @param[in]	id		Tablespace ID
-@param[in]	name		Tablespace name used in fil_space_create().
+@param[in]	name		Tablespace name used in fil_space_t::create().
 @param[in]	table_flags	table flags
 @return the tablespace
 @retval	NULL	if no matching tablespace exists in the memory cache */
@@ -1354,98 +1683,15 @@ fil_space_for_table_exists_in_mem(
 @param[in,out]	space	tablespace
 @param[in]	size	desired size in pages
 @return whether the tablespace is at least as big as requested */
-bool
-fil_space_extend(
-	fil_space_t*	space,
-	ulint		size);
-
-/** Reads or writes data. This operation could be asynchronous (aio).
-
-@param[in]	type		IO context
-@param[in]	sync		true if synchronous aio is desired
-@param[in]	page_id		page id
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	byte_offset	remainder of offset in bytes; in aio this
-				must be divisible by the OS block size
-@param[in]	len		how many bytes to read or write; this must
-				not cross a file boundary; in aio this must
-				be a block size multiple
-@param[in,out]	buf		buffer where to store read data or from where
-				to write; in aio this must be appropriately
-				aligned
-@param[in]	message		message for aio handler if non-sync aio
-				used, else ignored
-@param[in]	ignore_missing_space true=ignore missing space during read
-@return DB_SUCCESS, or DB_TABLESPACE_DELETED
-if we are trying to do i/o on a tablespace which does not exist */
-dberr_t
-fil_io(
-	const IORequest&	type,
-	bool			sync,
-	const page_id_t		page_id,
-	ulint			zip_size,
-	ulint			byte_offset,
-	ulint			len,
-	void*			buf,
-	void*			message,
-	bool			ignore_missing_space = false);
-
-/**********************************************************************//**
-Waits for an aio operation to complete. This function is used to write the
-handler for completed requests. The aio array of pending requests is divided
-into segments (see os0file.cc for more info). The thread specifies which
-segment it wants to wait for. */
-void
-fil_aio_wait(
-/*=========*/
-	ulint	segment);	/*!< in: the number of the segment in the aio
-				array to wait for */
-/**********************************************************************//**
-Flushes to disk possible writes cached by the OS. If the space does not exist
-or is being dropped, does not do anything. */
-void
-fil_flush(
-/*======*/
-	ulint	space_id);	/*!< in: file space id (this can be a group of
-				log files or a tablespace of the database) */
-/** Flush a tablespace.
-@param[in,out]	space	tablespace to flush */
-void
-fil_flush(fil_space_t* space);
+bool fil_space_extend(fil_space_t *space, uint32_t size);
 
 /** Flush to disk the writes in file spaces of the given type
-possibly cached by the OS.
-@param[in]	purpose	FIL_TYPE_TABLESPACE or FIL_TYPE_LOG */
-void
-fil_flush_file_spaces(
-	fil_type_t	purpose);
+possibly cached by the OS. */
+void fil_flush_file_spaces();
 /******************************************************************//**
 Checks the consistency of the tablespace cache.
 @return true if ok */
-bool
-fil_validate(void);
-/*==============*/
-/********************************************************************//**
-Returns true if file address is undefined.
-@return true if undefined */
-bool
-fil_addr_is_null(
-/*=============*/
-	fil_addr_t	addr);	/*!< in: address */
-/********************************************************************//**
-Get the predecessor of a file page.
-@return FIL_PAGE_PREV */
-ulint
-fil_page_get_prev(
-/*==============*/
-	const byte*	page);	/*!< in: file page */
-/********************************************************************//**
-Get the successor of a file page.
-@return FIL_PAGE_NEXT */
-ulint
-fil_page_get_next(
-/*==============*/
-	const byte*	page);	/*!< in: file page */
+bool fil_validate();
 /*********************************************************************//**
 Sets the file page type. */
 void
@@ -1482,20 +1728,6 @@ char*
 fil_path_to_space_name(
 	const char*	filename);
 
-/** Generate redo log for swapping two .ibd files
-@param[in]	old_table	old table
-@param[in]	new_table	new table
-@param[in]	tmp_name	temporary table name
-@param[in,out]	mtr		mini-transaction
-@return innodb error code */
-dberr_t
-fil_mtr_rename_log(
-	const dict_table_t*	old_table,
-	const dict_table_t*	new_table,
-	const char*		tmp_name,
-	mtr_t*			mtr)
-	MY_ATTRIBUTE((nonnull, warn_unused_result));
-
 /** Acquire the fil_system mutex. */
 #define fil_system_enter()	mutex_enter(&fil_system.mutex)
 /** Release the fil_system mutex. */
@@ -1515,66 +1747,40 @@ void
 fil_names_dirty(
 	fil_space_t*	space);
 
-/** Write MLOG_FILE_NAME records when a non-predefined persistent
+/** Write FILE_MODIFY records when a non-predefined persistent
 tablespace was modified for the first time since the latest
 fil_names_clear().
-@param[in,out]	space	tablespace
-@param[in,out]	mtr	mini-transaction */
-void
-fil_names_dirty_and_write(
-	fil_space_t*	space,
-	mtr_t*		mtr);
+@param[in,out]	space	tablespace */
+void fil_names_dirty_and_write(fil_space_t* space);
 
-/** Write MLOG_FILE_NAME records if a persistent tablespace was modified
+/** Write FILE_MODIFY records if a persistent tablespace was modified
 for the first time since the latest fil_names_clear().
 @param[in,out]	space	tablespace
 @param[in,out]	mtr	mini-transaction
-@return whether any MLOG_FILE_NAME record was written */
-inline MY_ATTRIBUTE((warn_unused_result))
-bool
-fil_names_write_if_was_clean(
-	fil_space_t*	space,
-	mtr_t*		mtr)
+@return whether any FILE_MODIFY record was written */
+inline bool fil_names_write_if_was_clean(fil_space_t* space)
 {
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 
 	if (space == NULL) {
 		return(false);
 	}
 
 	const bool	was_clean = space->max_lsn == 0;
-	ut_ad(space->max_lsn <= log_sys.lsn);
-	space->max_lsn = log_sys.lsn;
+	ut_ad(space->max_lsn <= log_sys.get_lsn());
+	space->max_lsn = log_sys.get_lsn();
 
 	if (was_clean) {
-		fil_names_dirty_and_write(space, mtr);
+		fil_names_dirty_and_write(space);
 	}
 
 	return(was_clean);
 }
 
-/** During crash recovery, open a tablespace if it had not been opened
-yet, to get valid size and flags.
-@param[in,out]	space	tablespace */
-inline void fil_space_open_if_needed(fil_space_t* space)
-{
-	ut_d(extern volatile bool recv_recovery_on);
-	ut_ad(recv_recovery_on);
-
-	if (space->size == 0) {
-		/* Initially, size and flags will be set to 0,
-		until the files are opened for the first time.
-		fil_space_get_size() will open the file
-		and adjust the size and flags. */
-		ut_d(ulint size	=) fil_space_get_size(space->id);
-		ut_ad(size == space->size);
-	}
-}
-
 /** On a log checkpoint, reset fil_names_dirty_and_write() flags
-and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed.
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
 @param[in]	lsn		checkpoint LSN
-@param[in]	do_write	whether to always write MLOG_CHECKPOINT
+@param[in]	do_write	whether to always write FILE_CHECKPOINT
 @return whether anything was written to the redo log
 @retval false	if no flags were set and nothing written
 @retval true	if anything was written to the redo log */
diff --git a/storage/innobase/include/fil0fil.inl b/storage/innobase/include/fil0fil.inl
index 24e4157d1f3..fd5f5bc1db9 100644
--- a/storage/innobase/include/fil0fil.inl
+++ b/storage/innobase/include/fil0fil.inl
@@ -86,7 +86,7 @@ fil_page_type_validate(
 	fil_space_t*	space,
 	const byte*	page)
 {
-	ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
+	const uint16_t page_type = fil_page_get_type(page);
 
 	if ((page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)
 	    && space->full_crc32()
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
index c513a94c799..a22867ad56a 100644
--- a/storage/innobase/include/fil0pagecompress.h
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -20,7 +20,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #define fil0pagecompress_h
 
 #include "fsp0fsp.h"
-#include "fsp0pagecompress.h"
 
 /******************************************************************//**
 @file include/fil0pagecompress.h
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
index 15485769429..7db85e87ed0 100644
--- a/storage/innobase/include/fsp0file.h
+++ b/storage/innobase/include/fsp0file.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -61,7 +61,6 @@ public:
 		m_flags(),
 		m_exists(),
 		m_is_valid(),
-		m_first_page_buf(),
 		m_first_page(),
 		m_last_os_error(),
 		m_file_info()
@@ -69,7 +68,7 @@ public:
 		/* No op */
 	}
 
-	Datafile(const char* name, ulint flags, ulint size, ulint order)
+	Datafile(const char* name, ulint flags, uint32_t size, ulint order)
 		:
 		m_name(mem_strdup(name)),
 		m_filepath(),
@@ -83,7 +82,6 @@ public:
 		m_flags(flags),
 		m_exists(),
 		m_is_valid(),
-		m_first_page_buf(),
 		m_first_page(),
 		m_last_os_error(),
 		m_file_info()
@@ -103,7 +101,6 @@ public:
 		m_flags(file.m_flags),
 		m_exists(file.m_exists),
 		m_is_valid(file.m_is_valid),
-		m_first_page_buf(),
 		m_first_page(),
 		m_last_os_error(),
 		m_file_info()
@@ -162,7 +159,6 @@ public:
 
 		/* Do not make a copy of the first page,
 		it should be reread if needed */
-		m_first_page_buf = NULL;
 		m_first_page = NULL;
 
 		return(*this);
@@ -272,6 +268,14 @@ public:
 		return(m_handle);
 	}
 
+	/** @return detached file handle */
+	pfs_os_file_t detach()
+	{
+		pfs_os_file_t detached = m_handle;
+		m_handle = OS_FILE_CLOSED;
+		return detached;
+	}
+
 	/** Get Datafile::m_order.
 	@return m_order */
 	ulint	order()	const
@@ -435,7 +439,7 @@ private:
 
 	/** size in megabytes or pages; converted from megabytes to
 	pages in SysTablespace::normalize_size() */
-	ulint			m_size;
+	uint32_t		m_size;
 
 	/** ordinal position of this datafile in the tablespace */
 	ulint			m_order;
@@ -459,10 +463,7 @@ private:
 	/* true if the tablespace is valid */
 	bool			m_is_valid;
 
-	/** Buffer to hold first page */
-	byte*			m_first_page_buf;
-
-	/** Pointer to the first page held in the buffer above */
+	/** Aligned buffer to hold first page */
 	byte*			m_first_page;
 
 protected:
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index 16d3274e481..7245db39273 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -27,6 +27,7 @@ Created 12/18/1995 Heikki Tuuri
 #ifndef fsp0fsp_h
 #define fsp0fsp_h
 
+#include "assume_aligned.h"
 #include "fsp0types.h"
 #include "fut0lst.h"
 #include "ut0byte.h"
@@ -101,7 +102,6 @@ see the table in fsp0types.h @{ */
 #define FSP_HEADER_OFFSET	FIL_PAGE_DATA
 
 /* The data structures in files are defined just as byte strings in C */
-typedef	byte	fsp_header_t;
 typedef	byte	xdes_t;
 
 /*			SPACE HEADER
@@ -207,7 +207,7 @@ typedef	byte	fseg_inode_t;
 	(16 + 3 * FLST_BASE_NODE_SIZE			\
 	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
 
-#define FSEG_MAGIC_N_VALUE	97937874
+static constexpr uint32_t FSEG_MAGIC_N_VALUE= 97937874;
 
 #define	FSEG_FILLFACTOR		8	/* If this value is x, then if
 					the number of unused but reserved
@@ -288,36 +288,37 @@ the extent are free and which contain old tuple version to clean. */
 /** Offset of the descriptor array on a descriptor page */
 #define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
 
+/**
+Determine if a page is marked free.
+@param[in]	descr	extent descriptor
+@param[in]	offset	page offset within extent
+@return whether the page is free */
+inline bool xdes_is_free(const xdes_t *descr, ulint offset)
+{
+  ut_ad(offset < FSP_EXTENT_SIZE);
+  ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset;
+  return ut_bit_get_nth(descr[XDES_BITMAP + (index >> 3)], index & 7);
+}
+
 #ifndef UNIV_INNOCHECKSUM
 /* @} */
 
-/**********************************************************************//**
-Reads the space id from the first page of a tablespace.
-@return space id, ULINT UNDEFINED if error */
-ulint
-fsp_header_get_space_id(
-/*====================*/
-	const page_t*	page);	/*!< in: first page of a tablespace */
-
 /** Read a tablespace header field.
 @param[in]	page	first page of a tablespace
 @param[in]	field	the header field
 @return the contents of the header field */
-inline
-ulint
-fsp_header_get_field(const page_t* page, ulint field)
+inline uint32_t fsp_header_get_field(const page_t* page, ulint field)
 {
-	return(mach_read_from_4(FSP_HEADER_OFFSET + field + page));
+  return mach_read_from_4(FSP_HEADER_OFFSET + field +
+			  my_assume_aligned<UNIV_ZIP_SIZE_MIN>(page));
 }
 
 /** Read the flags from the tablespace header page.
 @param[in]	page	first page of a tablespace
 @return the contents of FSP_SPACE_FLAGS */
-inline
-ulint
-fsp_header_get_flags(const page_t* page)
+inline uint32_t fsp_header_get_flags(const page_t *page)
 {
-	return(fsp_header_get_field(page, FSP_SPACE_FLAGS));
+  return fsp_header_get_field(page, FSP_SPACE_FLAGS);
 }
 
 /** Get the byte offset of encryption information in page 0.
@@ -356,7 +357,7 @@ fsp_header_init_fields(
 @param[in,out]	space	tablespace
 @param[in]	size	current size in blocks
 @param[in,out]	mtr	mini-transaction */
-void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr)
+void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
 	MY_ATTRIBUTE((nonnull));
 
 /** Create a new segment.
@@ -372,16 +373,17 @@ buf_block_t*
 fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
             bool has_done_reservation= false, buf_block_t *block= NULL);
 
-/**********************************************************************//**
-Calculates the number of pages reserved by a segment, and how many pages are
-currently used.
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in]      block   buffer block containing the file segment header
+@param[in]      header  file segment header
+@param[out]     used    number of pages that are used (not more than reserved)
+@param[in,out]  mtr     mini-transaction
 @return number of reserved pages */
-ulint
-fseg_n_reserved_pages(
-/*==================*/
-	fseg_header_t*	header,	/*!< in: segment header */
-	ulint*		used,	/*!< out: number of pages used (<= reserved) */
-	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+                            const fseg_header_t *header, ulint *used,
+                            mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize
@@ -397,36 +399,31 @@ file space fragmentation.
 @return X-latched block, or NULL if no page could be allocated */
 #define fseg_alloc_free_page(seg_header, hint, direction, mtr)		\
 	fseg_alloc_free_page_general(seg_header, hint, direction,	\
-				     FALSE, mtr, mtr)
+				     false, mtr, mtr)
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
 fragmentation.
-@retval NULL if no page could be allocated
-@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
-(init_mtr == mtr, or the page was not previously freed in mtr)
-@retval block (not allocated or initialized) otherwise */
+@retval NULL if no page could be allocated */
 buf_block_t*
 fseg_alloc_free_page_general(
 /*=========================*/
 	fseg_header_t*	seg_header,/*!< in/out: segment header */
-	ulint		hint,	/*!< in: hint of which page would be
+	uint32_t	hint,	/*!< in: hint of which page would be
 				desirable */
 	byte		direction,/*!< in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
 				FSP_UP, FSP_NO_DIR */
-	ibool		has_done_reservation, /*!< in: TRUE if the caller has
+	bool		has_done_reservation, /*!< in: true if the caller has
 				already done the reservation for the page
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
-				in which the page should be initialized.
-				If init_mtr!=mtr, but the page is already
-				latched in mtr, do not initialize the page. */
+				in which the page should be initialized. */
 	MY_ATTRIBUTE((warn_unused_result, nonnull));
 
 /** Reserves free pages from a tablespace. All mini-transactions which may
@@ -469,25 +466,23 @@ free pages available.
 @return true if we were able to make the reservation */
 bool
 fsp_reserve_free_extents(
-	ulint*		n_reserved,
+	uint32_t*	n_reserved,
 	fil_space_t*	space,
-	ulint		n_ext,
+	uint32_t	n_ext,
 	fsp_reserve_t	alloc_type,
 	mtr_t*		mtr,
-	ulint		n_pages = 2);
+	uint32_t	n_pages = 2);
 
 /** Free a page in a file segment.
 @param[in,out]	seg_header	file segment header
 @param[in,out]	space		tablespace
 @param[in]	offset		page number
-@param[in]	log		whether to write MLOG_INIT_FREE_PAGE record
 @param[in,out]	mtr		mini-transaction */
 void
 fseg_free_page(
 	fseg_header_t*	seg_header,
 	fil_space_t*	space,
-	ulint		offset,
-	bool		log,
+	uint32_t	offset,
 	mtr_t*		mtr);
 /** Determine whether a page is free.
 @param[in,out]	space	tablespace
@@ -531,14 +526,6 @@ Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
 ATTRIBUTE_COLD
 void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr);
 
-/** Get the file page type.
-@param[in]	page	file page
-@return page type */
-inline uint16_t fil_page_get_type(const byte* page)
-{
-	return mach_read_from_2(page + FIL_PAGE_TYPE);
-}
-
 /** Check (and if needed, reset) the page type.
 Data files created before MySQL 5.1.48 may contain
 garbage in the FIL_PAGE_TYPE field.
@@ -570,7 +557,7 @@ inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
 
 /** Initialize a file page whose prior contents should be ignored.
 @param[in,out]	block	buffer pool block */
-void fsp_apply_init_file_page(buf_block_t* block);
+void fsp_apply_init_file_page(buf_block_t *block);
 
 /** Initialize a file page.
 @param[in]	space	tablespace
@@ -583,9 +570,9 @@ inline void fsp_init_file_page(
 	buf_block_t* block, mtr_t* mtr)
 {
 	ut_d(space->modify_check(*mtr));
-	ut_ad(space->id == block->page.id.space());
+	ut_ad(space->id == block->page.id().space());
 	fsp_apply_init_file_page(block);
-	mlog_write_initial_log_record(block->frame, MLOG_INIT_FILE_PAGE2, mtr);
+	mtr->init(block);
 }
 
 #ifndef UNIV_DEBUG
@@ -732,18 +719,6 @@ fsp_flags_match(ulint expected, ulint actual)
 	return(actual == expected);
 }
 
-/**********************************************************************//**
-Gets a descriptor bit of a page.
-@return TRUE if free */
-UNIV_INLINE
-ibool
-xdes_get_bit(
-/*=========*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
-	ulint		offset);/*!< in: page offset within extent:
-				0 ... FSP_EXTENT_SIZE - 1 */
-
 /** Determine the descriptor index within a descriptor page.
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	offset		page offset
@@ -759,7 +734,7 @@ inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset)
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	offset		page offset
 @return descriptor page offset */
-inline ulint xdes_calc_descriptor_page(ulint zip_size, ulint offset)
+inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset)
 {
 	compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET
 			    + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)
@@ -777,12 +752,10 @@ inline ulint xdes_calc_descriptor_page(ulint zip_size, ulint offset)
 	ut_ad(!zip_size
 	      || zip_size > XDES_ARR_OFFSET
 	      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
-	return ut_2pow_round<ulint>(offset,
-				    zip_size ? zip_size : srv_page_size);
+	return ut_2pow_round(offset,
+			     uint32_t(zip_size ? zip_size : srv_page_size));
 }
 
 #endif /* UNIV_INNOCHECKSUM */
 
-#include "fsp0fsp.inl"
-
 #endif
diff --git a/storage/innobase/include/fsp0fsp.inl b/storage/innobase/include/fsp0fsp.inl
deleted file mode 100644
index 31b9d8c5dbe..00000000000
--- a/storage/innobase/include/fsp0fsp.inl
+++ /dev/null
@@ -1,48 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2019, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/fsp0fsp.ic
-File space management
-
-Created 12/18/1995 Heikki Tuuri
-*******************************************************/
-
-/**********************************************************************//**
-Gets a descriptor bit of a page.
-@return TRUE if free */
-UNIV_INLINE
-ibool
-xdes_get_bit(
-/*=========*/
-	const xdes_t*	descr,	/*!< in: descriptor */
-	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
-	ulint		offset)	/*!< in: page offset within extent:
-				0 ... FSP_EXTENT_SIZE - 1 */
-{
-	ut_ad(offset < FSP_EXTENT_SIZE);
-	ut_ad(bit == XDES_FREE_BIT || bit == XDES_CLEAN_BIT);
-
-	ulint	index = bit + XDES_BITS_PER_PAGE * offset;
-
-	ulint	bit_index = index % 8;
-	ulint	byte_index = index / 8;
-
-	return ut_bit_get_nth(descr[XDES_BITMAP + byte_index], bit_index);
-}
diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h
deleted file mode 100644
index 34a132582c6..00000000000
--- a/storage/innobase/include/fsp0pagecompress.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*****************************************************************************
-
-Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/******************************************************************//**
-@file include/fsp0pagecompress.h
-Helper functions for extracting/storing page compression and
-atomic writes information to file space.
-
-Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
-***********************************************************************/
-
-#ifndef fsp0pagecompress_h
-#define fsp0pagecompress_h
-
-/**********************************************************************//**
-Reads the page compression level from the first page of a tablespace.
-@return	page compression level, or 0 if uncompressed */
-UNIV_INTERN
-ulint
-fsp_header_get_compression_level(
-/*=============================*/
-	const page_t*	page);	/*!< in: first page of a tablespace */
-
-/********************************************************************//**
-Extract the page compression level from tablespace flags.
-A tablespace has only one physical page compression level
-whether that page is compressed or not.
-@return	page compression level of the file-per-table tablespace,
-or zero if the table is not compressed.  */
-UNIV_INLINE
-ulint
-fsp_flags_get_page_compression_level(
-/*=================================*/
-	ulint	flags);	/*!< in: tablespace flags */
-
-#include "fsp0pagecompress.inl"
-
-#endif
diff --git a/storage/innobase/include/fsp0pagecompress.inl b/storage/innobase/include/fsp0pagecompress.inl
deleted file mode 100644
index 590a609c309..00000000000
--- a/storage/innobase/include/fsp0pagecompress.inl
+++ /dev/null
@@ -1,63 +0,0 @@
-/*****************************************************************************
-
-Copyright (C) 2013, 2018, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/******************************************************************//**
-@file include/fsp0pagecompress.ic
-Implementation for helper functions for extracting/storing page
-compression and atomic writes information to file space.
-
-Created 11/12/2013 Jan Lindström jan.lindstrom@mariadb.com
-
-***********************************************************************/
-
-/********************************************************************//**
-Determine the tablespace is page compression level from dict_table_t::flags.
-@return	page compression level or 0 if not compressed*/
-UNIV_INLINE
-ulint
-fsp_flags_get_page_compression_level(
-/*=================================*/
-	ulint	flags)	/*!< in: tablespace flags */
-{
-	return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags));
-}
-
-
-/*******************************************************************//**
-Find out wheather the page is page compressed
-@return	true if page is page compressed, false if not */
-UNIV_INLINE
-bool
-fil_page_is_compressed(
-/*===================*/
-	const byte*	buf)	/*!< in: page */
-{
-	return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED);
-}
-
-/*******************************************************************//**
-Find out wheather the page is page compressed
-@return	true if page is page compressed, false if not */
-UNIV_INLINE
-bool
-fil_page_is_compressed_encrypted(
-/*=============================*/
-	const byte*	buf)	/*!< in: page */
-{
-	return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
-}
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
index 632c65e14cc..c00c8d689bf 100644
--- a/storage/innobase/include/fsp0space.h
+++ b/storage/innobase/include/fsp0space.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -163,9 +163,9 @@ public:
 	void shutdown();
 
 	/** @return the sum of the file sizes of each Datafile */
-	ulint get_sum_of_sizes() const
+	uint32_t get_sum_of_sizes() const
 	{
-		ulint	sum = 0;
+		uint32_t sum = 0;
 
 		for (const_iterator it = begin(); it != end(); ++it) {
 			sum += it->m_size;
diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h
index bcb8dd5e5e9..2e0a395f71c 100644
--- a/storage/innobase/include/fsp0sysspace.h
+++ b/storage/innobase/include/fsp0sysspace.h
@@ -30,7 +30,7 @@ Created 2013-7-26 by Kevin Lewis
 
 /** If the last data file is auto-extended, we add this many pages to it
 at a time. We have to make this public because it is a config variable. */
-extern ulong sys_tablespace_auto_extend_increment;
+extern uint sys_tablespace_auto_extend_increment;
 
 /** Data structure that contains the information about shared tablespaces.
 Currently this can be the system tablespace or a temporary table tablespace */
@@ -120,7 +120,7 @@ public:
 
 	/** Set the last file size.
 	@param[in]	size	the size to set */
-	void set_last_file_size(ulint size)
+	void set_last_file_size(uint32_t size)
 	{
 		ut_ad(!m_files.empty());
 		m_files.back().m_size = size;
@@ -128,7 +128,7 @@ public:
 
 	/** Get the size of the last data file in the tablespace
 	@return the size of the last data file in the array */
-	ulint last_file_size() const
+	uint32_t last_file_size() const
 	{
 		ut_ad(!m_files.empty());
 		return(m_files.back().m_size);
@@ -136,7 +136,7 @@ public:
 
 	/**
 	@return the autoextend increment in pages. */
-	ulint get_autoextend_increment() const
+	uint32_t get_autoextend_increment() const
 	{
 		return sys_tablespace_auto_extend_increment
 			<< (20 - srv_page_size_shift);
@@ -144,7 +144,7 @@ public:
 
 	/**
 	@return next increment size */
-	ulint get_increment() const;
+	uint32_t get_increment() const;
 
 	/** Open or create the data files
 	@param[in]  is_temp		whether this is a temporary tablespace
@@ -240,8 +240,7 @@ private:
 	/** if true, then we auto-extend the last data file */
 	bool		m_auto_extend_last_file;
 
-	/** if != 0, this tells the max size auto-extending may increase the
-	last data file size */
+	/** maximum size of the last data file (0=unlimited) */
 	ulint		m_last_file_size_max;
 
 	/** If the following is true we do not allow
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
index 69c5346a4f9..f8e4c06baae 100644
--- a/storage/innobase/include/fsp0types.h
+++ b/storage/innobase/include/fsp0types.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2019, MariaDB Corporation.
+Copyright (c) 2014, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -24,12 +24,12 @@ File space management types
 Created May 26, 2009 Vasil Dimov
 *******************************************************/
 
-#ifndef fsp0types_h
-#define fsp0types_h
+#pragma once
+#include <cstddef>
 
 /** The fil_space_t::id of the redo log. All persistent tablespaces
 have a smaller fil_space_t::id. */
-#define SRV_LOG_SPACE_FIRST_ID		0xFFFFFFF0U
+static constexpr size_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0;
 /** The fil_space_t::id of the innodb_temporary tablespace. */
 #define SRV_TMP_SPACE_ID		0xFFFFFFFEU
 
@@ -400,4 +400,6 @@ in full crc32 format. */
 
 /* @} */
 
-#endif /* fsp0types_h */
+struct fil_node_t;
+struct fil_space_t;
+class buf_page_t;
diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h
index 5799776c32d..15bf30bc5d5 100644
--- a/storage/innobase/include/fts0ast.h
+++ b/storage/innobase/include/fts0ast.h
@@ -29,14 +29,6 @@ Created 2007/03/16/03 Sunny Bains
 
 #include "mem0mem.h"
 
-#ifdef UNIV_PFS_MEMORY
-
-#define malloc(A)	ut_malloc_nokey(A)
-#define free(A)		ut_free(A)
-#define realloc(P, A)	ut_realloc(P, A)
-
-#endif /* UNIV_PFS_MEMORY */
-
 /* The type of AST Node */
 enum fts_ast_type_t {
 	FTS_AST_OPER,				/*!< Operator */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
index d3cfa5b23df..082f61c2f63 100644
--- a/storage/innobase/include/fts0fts.h
+++ b/storage/innobase/include/fts0fts.h
@@ -24,8 +24,7 @@ Full text search header file
 Created 2011/09/02 Sunny Bains
 ***********************************************************************/
 
-#ifndef fts0fts_h
-#define fts0fts_h
+#pragma once
 
 #include "data0type.h"
 #include "data0types.h"
@@ -337,6 +336,10 @@ public:
 	protected by fts_optimize_wq mutex */
 	bool		in_queue;
 
+	/** Whether the sync message exists in fts_optimize_wq;
+	protected by fts_optimize_wq mutex */
+	bool		sync_message;
+
 	/** Heap for fts_t allocation. */
 	mem_heap_t*	fts_heap;
 };
@@ -963,6 +966,8 @@ dberr_t
 fts_update_sync_doc_id(const dict_table_t *table,
 		       doc_id_t  doc_id,
 		       trx_t *trx)
-MY_ATTRIBUTE((nonnull(1)));
+	MY_ATTRIBUTE((nonnull(1)));
 
-#endif /*!< fts0fts.h */
+/** Sync the table during commit phase
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table);
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
index 5517f05f1b6..660f7459249 100644
--- a/storage/innobase/include/fts0priv.h
+++ b/storage/innobase/include/fts0priv.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -292,22 +292,8 @@ fts_trx_table_id_cmp(
 	const void*	p1,		/*!< in: id1 */
 	const void*	p2)		/*!< in: id2 */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
-/******************************************************************//**
-Commit a transaction.
-@return DB_SUCCESS if all OK */
-dberr_t
-fts_sql_commit(
-/*===========*/
-	trx_t*		trx)		/*!< in: transaction */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************************//**
-Rollback a transaction.
-@return DB_SUCCESS if all OK */
-dberr_t
-fts_sql_rollback(
-/*=============*/
-	trx_t*		trx)		/*!< in: transaction */
-	MY_ATTRIBUTE((nonnull));
+#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
+#define fts_sql_rollback(trx) (trx)->rollback()
 /******************************************************************//**
 Parse an SQL string. %s is replaced with the table's id. Don't acquire
 the dict mutex
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
index ddfb5bbabfd..1cddaf5bf67 100644
--- a/storage/innobase/include/fts0tokenize.h
+++ b/storage/innobase/include/fts0tokenize.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -91,7 +92,7 @@ fts_get_word(
 	while (doc < end) {
 		for (; doc < end;
 		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
-			mbl = cs->cset->ctype(cs, &ctype, doc, end);
+			mbl = cs->ctype(&ctype, doc, end);
 
 			if (true_word_char(ctype, *doc)) {
 				break;
@@ -153,7 +154,7 @@ fts_get_word(
 		for (word->pos = doc;
 		     doc < end;
 		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
-			mbl = cs->cset->ctype(cs, &ctype, doc, end);
+			mbl = cs->ctype(&ctype, doc, end);
 
 			if (true_word_char(ctype, *doc)) {
 				mwc = 0;
diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl
index ed61726ff80..facc1e5c40b 100644
--- a/storage/innobase/include/fts0types.inl
+++ b/storage/innobase/include/fts0types.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2018, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -184,12 +184,12 @@ fts_select_index_by_hash(
 	char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
 				    reinterpret_cast<const char*>(str + len));
 	*/
-	size_t char_len = size_t(cs->cset->charlen(cs, str, str + len));
+	size_t char_len = size_t(cs->charlen(str, str + len));
 
 	ut_ad(char_len <= len);
 
 	/* Get collation hash code */
-	cs->coll->hash_sort(cs, str, char_len, &nr1, &nr2);
+	my_ci_hash_sort(cs, str, char_len, &nr1, &nr2);
 
 	return(nr1 % FTS_NUM_AUX_INDEX);
 }
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index 0782781b8df..1ade24cd069 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -68,134 +68,95 @@ typedef	byte	flst_node_t;
 @param[in,out]	block	file page
 @param[in]	ofs	byte offset of the list base node
 @param[in,out]	mtr	mini-transaction */
-inline void flst_init(buf_block_t* block, uint16_t ofs, mtr_t* mtr)
+inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
 {
-	ut_ad(0 == mach_read_from_2(FLST_LEN + ofs + block->frame));
-	ut_ad(0 == mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs
-				    + block->frame));
-	ut_ad(0 == mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs
-				    + block->frame));
-	compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
-	mlog_memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff, mtr);
-	mlog_memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff, mtr);
+  ut_ad(!mach_read_from_2(FLST_LEN + ofs + block->frame));
+  ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + block->frame));
+  ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + block->frame));
+  compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
+  mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+  mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
 }
 
-/** Write a null file address.
-@param[in,out]	faddr	file address to be zeroed otu
-@param[in,out]	mtr	mini-transaction */
-inline void flst_zero_addr(fil_faddr_t* faddr, mtr_t* mtr)
+/** Initialize a list base node.
+@param[in]      block   file page
+@param[in,out]  base    base node
+@param[in,out]  mtr     mini-transaction */
+void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** Append a file list node to a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the node to be added
+@param[in,out]  mtr     mini-transaction */
+void flst_add_last(buf_block_t *base, uint16_t boffset,
+                   buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+/** Prepend a file list node to a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the node to be added
+@param[in,out]  mtr     mini-transaction */
+void flst_add_first(buf_block_t *base, uint16_t boffset,
+                    buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+/** Remove a file list node.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  cur     block to be removed
+@param[in]      coffset byte offset of the current record to be removed
+@param[in,out]  mtr     mini-transaction */
+void flst_remove(buf_block_t *base, uint16_t boffset,
+                 buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** @return the length of a list */
+inline uint32_t flst_get_len(const flst_base_node_t *base)
 {
-	if (mach_read_from_4(faddr + FIL_ADDR_PAGE) != FIL_NULL) {
-		mlog_memset(faddr + FIL_ADDR_PAGE, 4, 0xff, mtr);
-	}
-	if (mach_read_from_2(faddr + FIL_ADDR_BYTE)) {
-		mlog_write_ulint(faddr + FIL_ADDR_BYTE, 0, MLOG_2BYTES, mtr);
-	}
+  return mach_read_from_4(base + FLST_LEN);
 }
 
-/********************************************************************//**
-Initializes a list base node. */
-UNIV_INLINE
-void
-flst_init(
-/*======*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Adds a node as the last node in a list. */
-void
-flst_add_last(
-/*==========*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node,	/*!< in: node to add */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Adds a node as the first node in a list. */
-void
-flst_add_first(
-/*===========*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node,	/*!< in: node to add */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Removes a node. */
-void
-flst_remove(
-/*========*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	flst_node_t*		node2,	/*!< in: node to remove */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/** Get the length of a list.
-@param[in]	base	base node
-@return length */
-UNIV_INLINE
-uint32_t
-flst_get_len(
-	const flst_base_node_t*	base);
-/********************************************************************//**
-Gets list first node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_first(
-/*===========*/
-	const flst_base_node_t*	base,	/*!< in: pointer to base node */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Gets list last node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_last(
-/*==========*/
-	const flst_base_node_t*	base,	/*!< in: pointer to base node */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Gets list next node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_next_addr(
-/*===============*/
-	const flst_node_t*	node,	/*!< in: pointer to node */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Gets list prev node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_prev_addr(
-/*===============*/
-	const flst_node_t*	node,	/*!< in: pointer to node */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Writes a file address. */
-UNIV_INLINE
-void
-flst_write_addr(
-/*============*/
-	fil_faddr_t*	faddr,	/*!< in: pointer to file faddress */
-	fil_addr_t	addr,	/*!< in: file address */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Reads a file address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_read_addr(
-/*===========*/
-	const fil_faddr_t*	faddr,	/*!< in: pointer to file faddress */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
-/********************************************************************//**
-Validates a file-based list.
-@return TRUE if ok */
-ibool
-flst_validate(
-/*==========*/
-	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
-	mtr_t*			mtr1);	/*!< in: mtr */
-
-#include "fut0lst.inl"
+/** @return a file address */
+inline fil_addr_t flst_read_addr(const byte *faddr)
+{
+  fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE),
+		     mach_read_from_2(faddr + FIL_ADDR_BYTE) };
+  ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+  ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+  return addr;
+}
+
+/** @return list first node address */
+inline fil_addr_t flst_get_first(const flst_base_node_t *base)
+{
+  return flst_read_addr(base + FLST_FIRST);
+}
+
+/** @return list last node address */
+inline fil_addr_t flst_get_last(const flst_base_node_t *base)
+{
+  return flst_read_addr(base + FLST_LAST);
+}
+
+/** @return list next node address */
+inline fil_addr_t flst_get_next_addr(const flst_node_t* node)
+{
+  return flst_read_addr(node + FLST_NEXT);
+}
+
+/** @return list prev node address */
+inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
+{
+  return flst_read_addr(node + FLST_PREV);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate a file-based list. */
+void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
+#endif
 
 #endif /* !UNIV_INNOCHECKSUM */
 
diff --git a/storage/innobase/include/fut0lst.inl b/storage/innobase/include/fut0lst.inl
deleted file mode 100644
index ec4181b2c93..00000000000
--- a/storage/innobase/include/fut0lst.inl
+++ /dev/null
@@ -1,153 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/******************************************************************//**
-@file include/fut0lst.ic
-File-based list utilities
-
-Created 11/28/1995 Heikki Tuuri
-***********************************************************************/
-
-#include "buf0buf.h"
-
-/********************************************************************//**
-Writes a file address. */
-UNIV_INLINE
-void
-flst_write_addr(
-/*============*/
-	fil_faddr_t*	faddr,	/*!< in: pointer to file faddress */
-	fil_addr_t	addr,	/*!< in: file address */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
-{
-	ut_ad(faddr && mtr);
-	ut_ad(mtr_memo_contains_page_flagged(mtr, faddr,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-	ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
-	ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
-
-	mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr);
-	mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset,
-			 MLOG_2BYTES, mtr);
-}
-
-/********************************************************************//**
-Reads a file address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_read_addr(
-/*===========*/
-	const fil_faddr_t*	faddr,	/*!< in: pointer to file faddress */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	fil_addr_t	addr;
-
-	ut_ad(faddr && mtr);
-
-	addr.page = mach_read_from_4(faddr + FIL_ADDR_PAGE);
-	addr.boffset = mach_read_from_2(faddr + FIL_ADDR_BYTE);
-	ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
-	ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
-	return(addr);
-}
-
-/********************************************************************//**
-Initializes a list base node. */
-UNIV_INLINE
-void
-flst_init(
-/*======*/
-	flst_base_node_t*	base,	/*!< in: pointer to base node */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	ut_ad(mtr_memo_contains_page_flagged(mtr, base,
-					     MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_SX_FIX));
-
-	if (mach_read_from_4(base + FLST_LEN)) {
-		mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr);
-	}
-	flst_zero_addr(base + FLST_FIRST, mtr);
-	flst_zero_addr(base + FLST_LAST, mtr);
-}
-
-/** Get the length of a list.
-@param[in]	base	base node
-@return length */
-UNIV_INLINE
-uint32_t
-flst_get_len(
-	const flst_base_node_t*	base)
-{
-	return(mach_read_from_4(base + FLST_LEN));
-}
-
-/********************************************************************//**
-Gets list first node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_first(
-/*===========*/
-	const flst_base_node_t*	base,	/*!< in: pointer to base node */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	return(flst_read_addr(base + FLST_FIRST, mtr));
-}
-
-/********************************************************************//**
-Gets list last node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_last(
-/*==========*/
-	const flst_base_node_t*	base,	/*!< in: pointer to base node */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	return(flst_read_addr(base + FLST_LAST, mtr));
-}
-
-/********************************************************************//**
-Gets list next node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_next_addr(
-/*===============*/
-	const flst_node_t*	node,	/*!< in: pointer to node */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	return(flst_read_addr(node + FLST_NEXT, mtr));
-}
-
-/********************************************************************//**
-Gets list prev node address.
-@return file address */
-UNIV_INLINE
-fil_addr_t
-flst_get_prev_addr(
-/*===============*/
-	const flst_node_t*	node,	/*!< in: pointer to node */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	return(flst_read_addr(node + FLST_PREV, mtr));
-}
diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h
index dea6d63f4e0..3fd01a3ae54 100644
--- a/storage/innobase/include/gis0geo.h
+++ b/storage/innobase/include/gis0geo.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -106,45 +106,17 @@ split_rtree_node(
 	int			n_dim,		/*!< in: dimensions. */
 	uchar*			first_rec);	/*!< in: the first rec. */
 
-/*************************************************************//**
-Compares two keys a and b depending on nextflag
-nextflag can contain these flags:
+/** Compare two minimum bounding rectangles.
+@param mode   comparison operator
    MBR_INTERSECT(a,b)  a overlaps b
    MBR_CONTAIN(a,b)    a contains b
    MBR_DISJOINT(a,b)   a disjoint b
    MBR_WITHIN(a,b)     a within   b
    MBR_EQUAL(a,b)      All coordinates of MBRs are equal
    MBR_DATA(a,b)       Data reference is the same
-Returns 0 on success.  */
-int
-rtree_key_cmp(
-/*==========*/
-	page_cur_mode_t	mode,	/*!< in: compare method. */
-	const uchar*	b,	/*!< in: first key. */
-	int		b_len,	/*!< in: first key len. */
-	const uchar*	a,	/*!< in: second key. */
-	int		a_len);	/*!< in: second key len. */
-
-/*************************************************************//**
-Calculates MBR_AREA(a+b) - MBR_AREA(a)
-Note: when 'a' and 'b' objects are far from each other,
-the area increase can be really big, so this function
-can return 'inf' as a result.  */
-double
-rtree_area_increase(
-	const uchar*	a,		/*!< in: first mbr. */
-	const uchar*	b,		/*!< in: second mbr. */
-	int		a_len,		/*!< in: mbr length. */
-	double*		ab_area);	/*!< out: increased area. */
-
-/** Calculates overlapping area
-@param[in]	a	mbr a
-@param[in]	b	mbr b
-@param[in]	mbr_len	mbr length
-@return overlapping area */
-double
-rtree_area_overlapping(
-	const uchar*	a,
-	const uchar*	b,
-	int		mbr_len);
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a);
 #endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
index b4646cd357b..1a27422a0d8 100644
--- a/storage/innobase/include/gis0rtree.h
+++ b/storage/innobase/include/gis0rtree.h
@@ -151,7 +151,6 @@ rtr_rec_cal_increase(
 				dtuple in some of the common fields, or which
 				has an equal number or more fields than
 				dtuple */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
 	double*		area);	/*!< out: increased area */
 
 /****************************************************************//**
@@ -170,10 +169,10 @@ void
 rtr_non_leaf_stack_push(
 /*====================*/
 	rtr_node_path_t*	path,		/*!< in/out: search path */
-	ulint			pageno,		/*!< in: pageno to insert */
+	uint32_t		pageno,		/*!< in: pageno to insert */
 	node_seq_t		seq_no,		/*!< in: Node sequence num */
 	ulint			level,		/*!< in: index level */
-	ulint			child_no,	/*!< in: child page no */
+	uint32_t		child_no,	/*!< in: child page no */
 	btr_pcur_t*		cursor,		/*!< in: position cursor */
 	double			mbr_inc);	/*!< in: MBR needs to be
 						enlarged */
diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl
index c829f0de255..1b53caa306b 100644
--- a/storage/innobase/include/gis0rtree.inl
+++ b/storage/innobase/include/gis0rtree.inl
@@ -97,10 +97,10 @@ void
 rtr_non_leaf_stack_push(
 /*====================*/
 	rtr_node_path_t*	path,		/*!< in/out: search path */
-	ulint			pageno,		/*!< in: pageno to insert */
+	uint32_t		pageno,		/*!< in: pageno to insert */
 	node_seq_t		seq_no,		/*!< in: Node sequence num */
 	ulint			level,		/*!< in: index page level */
-	ulint			child_no,	/*!< in: child page no */
+	uint32_t		child_no,	/*!< in: child page no */
 	btr_pcur_t*		cursor,		/*!< in: position cursor */
 	double			mbr_inc)	/*!< in: MBR needs to be
 						enlarged */
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
index a1e0a878cb2..55944bfcce3 100644
--- a/storage/innobase/include/gis0type.h
+++ b/storage/innobase/include/gis0type.h
@@ -37,15 +37,15 @@ Created 2013/03/27 Jimmy Yang
 #include <vector>
 #include <forward_list>
 
-/* Node Sequence Number. Only updated when page splits */
-typedef ib_uint32_t     node_seq_t;
+/** Node Sequence Number. Only updated when page splits */
+typedef uint32_t     node_seq_t;
 
 /* RTree internal non-leaf Nodes to be searched, from root to leaf */
-typedef	struct node_visit {
-	ulint		page_no;	/*!< the page number */
+struct node_visit_t {
+	uint32_t	page_no;	/*!< the page number */
 	node_seq_t	seq_no;		/*!< the SSN (split sequence number */
 	ulint		level;		/*!< the page's index level */
-	ulint		child_no;	/*!< child page num if for parent
+	uint32_t	child_no;	/*!< child page num if for parent
 					recording */
 	btr_pcur_t*	cursor;		/*!< cursor structure if we positioned
 					FIXME: there is no need to use whole
@@ -53,7 +53,7 @@ typedef	struct node_visit {
 					members */
 	double		mbr_inc;	/*!< whether this node needs to be
 					enlarged for insertion */
-} node_visit_t;
+};
 
 typedef std::vector<node_visit_t, ut_allocator<node_visit_t> >	rtr_node_path_t;
 
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
index b4a28e44e04..5aaa559b885 100644
--- a/storage/innobase/include/ha0ha.h
+++ b/storage/innobase/include/ha0ha.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -19,7 +19,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 /**************************************************//**
 @file include/ha0ha.h
-The hash table with external chains
+The hash table interface for the adaptive hash index
 
 Created 8/18/1994 Heikki Tuuri
 *******************************************************/
@@ -43,160 +43,6 @@ ha_search_and_get_data(
 /*===================*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold);	/*!< in: folded value of the searched data */
-/*********************************************************//**
-Looks for an element when we know the pointer to the data and updates
-the pointer to data if found.
-@return TRUE if found */
-ibool
-ha_search_and_update_if_found_func(
-/*===============================*/
-	hash_table_t*	table,	/*!< in/out: hash table */
-	ulint		fold,	/*!< in: folded value of the searched data */
-	const rec_t*	data,	/*!< in: pointer to the data */
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	buf_block_t*	new_block,/*!< in: block containing new_data */
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	const rec_t*	new_data);/*!< in: new pointer to the data */
-
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-/** Looks for an element when we know the pointer to the data and
-updates the pointer to data if found.
-@param table in/out: hash table
-@param fold in: folded value of the searched data
-@param data in: pointer to the data
-@param new_block in: block containing new_data
-@param new_data in: new pointer to the data */
-# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
-	ha_search_and_update_if_found_func(table,fold,data,new_block,new_data)
-#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-/** Looks for an element when we know the pointer to the data and
-updates the pointer to data if found.
-@param table in/out: hash table
-@param fold in: folded value of the searched data
-@param data in: pointer to the data
-@param new_block ignored: block containing new_data
-@param new_data in: new pointer to the data */
-# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
-	ha_search_and_update_if_found_func(table,fold,data,new_data)
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-#endif /* BTR_CUR_HASH_ADAPT */
-
-/*************************************************************//**
-Creates a hash table with at least n array cells.  The actual number
-of cells is chosen to be a prime number slightly bigger than n.
-@return own: created table */
-hash_table_t*
-ib_create(
-/*======*/
-	ulint		n,	/*!< in: number of array cells */
-	latch_id_t	id,	/*!< in: latch ID */
-	ulint		n_mutexes,/*!< in: number of mutexes to protect the
-				hash table: must be a power of 2, or 0 */
-	ulint		type);	/*!< in: type of datastructure for which
-				the memory heap is going to be used e.g.:
-				MEM_HEAP_FOR_BTR_SEARCH or
-				MEM_HEAP_FOR_PAGE_HASH */
-
-/** Recreate a hash table with at least n array cells. The actual number
-of cells is chosen to be a prime number slightly bigger than n.
-The new cells are all cleared. The heaps are recreated.
-The sync objects are reused.
-@param[in,out]	table	hash table to be resuzed (to be freed later)
-@param[in]	n	number of array cells
-@return	resized new table */
-hash_table_t*
-ib_recreate(
-	hash_table_t*	table,
-	ulint		n);
-
-/*************************************************************//**
-Empties a hash table and frees the memory heaps. */
-void
-ha_clear(
-/*=====*/
-	hash_table_t*	table);	/*!< in, own: hash table */
-
-#ifdef BTR_CUR_HASH_ADAPT
-/*************************************************************//**
-Inserts an entry into a hash table. If an entry with the same fold number
-is found, its node is updated to point to the new data, and no new node
-is inserted.
-@return TRUE if succeed, FALSE if no more memory could be allocated */
-ibool
-ha_insert_for_fold_func(
-/*====================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold,	/*!< in: folded value of data; if a node with
-				the same fold value already exists, it is
-				updated to point to the same data, and no new
-				node is created! */
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	buf_block_t*	block,	/*!< in: buffer block containing the data */
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	const rec_t*	data);	/*!< in: data, must not be NULL */
-
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-/**
-Inserts an entry into a hash table. If an entry with the same fold number
-is found, its node is updated to point to the new data, and no new node
-is inserted.
-@return TRUE if succeed, FALSE if no more memory could be allocated
-@param t in: hash table
-@param f in: folded value of data
-@param b in: buffer block containing the data
-@param d in: data, must not be NULL */
-# define ha_insert_for_fold(t,f,b,d) 	do {		\
-	ha_insert_for_fold_func(t,f,b,d);		\
-	MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);	\
-} while(0)
-#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-/**
-Inserts an entry into a hash table. If an entry with the same fold number
-is found, its node is updated to point to the new data, and no new node
-is inserted.
-@return TRUE if succeed, FALSE if no more memory could be allocated
-@param t in: hash table
-@param f in: folded value of data
-@param b ignored: buffer block containing the data
-@param d in: data, must not be NULL */
-# define ha_insert_for_fold(t,f,b,d)	do {		\
-	ha_insert_for_fold_func(t,f,d);			\
-	MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);	\
-} while (0)
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-
-/*********************************************************//**
-Looks for an element when we know the pointer to the data and deletes
-it from the hash table if found.
-@return TRUE if found */
-UNIV_INLINE
-ibool
-ha_search_and_delete_if_found(
-/*==========================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold,	/*!< in: folded value of the searched data */
-	const rec_t*	data);	/*!< in: pointer to the data */
-
-/*****************************************************************//**
-Removes from the chain determined by fold all nodes whose data pointer
-points to the page given. */
-void
-ha_remove_all_nodes_to_page(
-/*========================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold,	/*!< in: fold value */
-	const page_t*	page);	/*!< in: buffer page */
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-/*************************************************************//**
-Validates a given range of the cells in hash table.
-@return TRUE if ok */
-ibool
-ha_validate(
-/*========*/
-	hash_table_t*	table,		/*!< in: hash table */
-	ulint		start_index,	/*!< in: start index */
-	ulint		end_index);	/*!< in: end index */
-#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
 
 /** The hash table external chain node */
 struct ha_node_t {
@@ -207,35 +53,8 @@ struct ha_node_t {
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 	const rec_t*	data;	/*!< pointer to the data */
 };
-#endif /* BTR_CUR_HASH_ADAPT */
-
-#if defined UNIV_DEBUG && defined BTR_CUR_HASH_ADAPT
-/********************************************************************//**
-Assert that the synchronization object in a hash operation involving
-possible change in the hash table is held.
-Note that in case of mutexes we assert that mutex is owned while in case
-of rw-locks we assert that it is held in exclusive mode. */
-UNIV_INLINE
-void
-hash_assert_can_modify(
-/*===================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold);	/*!< in: fold value */
-/********************************************************************//**
-Assert that the synchronization object in a hash search operation is held.
-Note that in case of mutexes we assert that mutex is owned while in case
-of rw-locks we assert that it is held either in x-mode or s-mode. */
-UNIV_INLINE
-void
-hash_assert_can_search(
-/*===================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold);	/*!< in: fold value */
-#else /* UNIV_DEBUG */
-#define hash_assert_can_modify(t, f)
-#define hash_assert_can_search(t, f)
-#endif /* UNIV_DEBUG */
 
 #include "ha0ha.inl"
+#endif /* BTR_CUR_HASH_ADAPT */
 
 #endif
diff --git a/storage/innobase/include/ha0ha.inl b/storage/innobase/include/ha0ha.inl
index 0612ef1bb25..0b256257214 100644
--- a/storage/innobase/include/ha0ha.inl
+++ b/storage/innobase/include/ha0ha.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -19,14 +19,12 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 /********************************************************************//**
 @file include/ha0ha.ic
-The hash table with external chains
+The hash table interface for the adaptive hash index
 
 Created 8/18/1994 Heikki Tuuri
 *************************************************************************/
 
 #ifdef BTR_CUR_HASH_ADAPT
-#include "ut0rnd.h"
-#include "mem0mem.h"
 #include "btr0types.h"
 
 /******************************************************************//**
@@ -95,57 +93,9 @@ ha_chain_get_first(
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold)	/*!< in: fold value determining the chain */
 {
-	return((ha_node_t*)
-	       hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
+  return static_cast<ha_node_t*>(table->array[table->calc_hash(fold)].node);
 }
 
-#ifdef UNIV_DEBUG
-/********************************************************************//**
-Assert that the synchronization object in a hash operation involving
-possible change in the hash table is held.
-Note that in case of mutexes we assert that mutex is owned while in case
-of rw-locks we assert that it is held in exclusive mode. */
-UNIV_INLINE
-void
-hash_assert_can_modify(
-/*===================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold value */
-{
-	if (table->type == HASH_TABLE_SYNC_MUTEX) {
-		ut_ad(mutex_own(hash_get_mutex(table, fold)));
-	} else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
-# ifdef UNIV_DEBUG
-		rw_lock_t* lock = hash_get_lock(table, fold);
-		ut_ad(rw_lock_own(lock, RW_LOCK_X));
-# endif
-	} else {
-		ut_ad(table->type == HASH_TABLE_SYNC_NONE);
-	}
-}
-
-/********************************************************************//**
-Assert that the synchronization object in a hash search operation is held.
-Note that in case of mutexes we assert that mutex is owned while in case
-of rw-locks we assert that it is held either in x-mode or s-mode. */
-UNIV_INLINE
-void
-hash_assert_can_search(
-/*===================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold value */
-{
-	if (table->type == HASH_TABLE_SYNC_MUTEX) {
-		ut_ad(mutex_own(hash_get_mutex(table, fold)));
-	} else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
-		ut_ad(rw_lock_own_flagged(hash_get_lock(table, fold),
-					  RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-	} else {
-		ut_ad(table->type == HASH_TABLE_SYNC_NONE);
-	}
-}
-#endif /* UNIV_DEBUG */
-
 /*************************************************************//**
 Looks for an element in a hash table.
 @return pointer to the data of the first hash table node in chain
@@ -157,7 +107,6 @@ ha_search_and_get_data(
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold)	/*!< in: folded value of the searched data */
 {
-	hash_assert_can_search(table, fold);
 	ut_ad(btr_search_enabled);
 
 	for (const ha_node_t* node = ha_chain_get_first(table, fold);
@@ -186,8 +135,6 @@ ha_search_with_data(
 {
 	ha_node_t*	node;
 
-	hash_assert_can_search(table, fold);
-
 	ut_ad(btr_search_enabled);
 
 	node = ha_chain_get_first(table, fold);
@@ -204,39 +151,4 @@ ha_search_with_data(
 	return(NULL);
 }
 
-/***********************************************************//**
-Deletes a hash node. */
-void
-ha_delete_hash_node(
-/*================*/
-	hash_table_t*	table,		/*!< in: hash table */
-	ha_node_t*	del_node);	/*!< in: node to be deleted */
-
-/*********************************************************//**
-Looks for an element when we know the pointer to the data, and deletes
-it from the hash table, if found.
-@return TRUE if found */
-UNIV_INLINE
-ibool
-ha_search_and_delete_if_found(
-/*==========================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold,	/*!< in: folded value of the searched data */
-	const rec_t*	data)	/*!< in: pointer to the data */
-{
-	ha_node_t*	node;
-
-	hash_assert_can_modify(table, fold);
-	ut_ad(btr_search_enabled);
-
-	node = ha_search_with_data(table, fold, data);
-
-	if (node) {
-		ha_delete_hash_node(table, node);
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
 #endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/ha0storage.inl b/storage/innobase/include/ha0storage.inl
index 8cc487faf47..df9679cf997 100644
--- a/storage/innobase/include/ha0storage.inl
+++ b/storage/innobase/include/ha0storage.inl
@@ -32,7 +32,7 @@ Created September 24, 2007 Vasil Dimov
 struct ha_storage_t {
 	mem_heap_t*	heap;	/*!< memory heap from which memory is
 				allocated */
-	hash_table_t*	hash;	/*!< hash table used to avoid
+	hash_table_t	hash;	/*!< hash table used to avoid
 				duplicates */
 };
 
@@ -77,7 +77,7 @@ ha_storage_create(
 						 sizeof(ha_storage_t));
 
 	storage->heap = heap;
-	storage->hash = hash_create(initial_hash_cells);
+	storage->hash.create(initial_hash_cells);
 
 	return(storage);
 }
@@ -97,7 +97,7 @@ ha_storage_empty(
 	temp_storage.heap = (*storage)->heap;
 	temp_storage.hash = (*storage)->hash;
 
-	hash_table_clear(temp_storage.hash);
+	temp_storage.hash.clear();
 	mem_heap_empty(temp_storage.heap);
 
 	*storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
@@ -117,9 +117,7 @@ ha_storage_free(
 /*============*/
 	ha_storage_t*	storage)	/*!< in, own: hash storage */
 {
-	/* order is important because the pointer storage->hash is
-	within the heap */
-	hash_table_free(storage->hash);
+	storage->hash.free();
 	mem_heap_free(storage->heap);
 }
 
@@ -138,7 +136,7 @@ ha_storage_get_size(
 
 	/* this assumes hash->heap and hash->heaps are NULL */
 	ret += sizeof(hash_table_t);
-	ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash);
+	ret += sizeof(hash_cell_t) * storage->hash.n_cells;
 
 	return(ret);
 }
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index c5971bc75aa..c1c41a8f77c 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +36,7 @@ simple headers.
 
 /* Forward declarations */
 class THD;
+class Field;
 
 // JAN: TODO missing features:
 #undef MYSQL_FT_INIT_EXT
@@ -109,10 +110,7 @@ innobase_convert_name(
 
 /******************************************************************//**
 Returns true if the thread is the replication thread on the slave
-server. Used in srv_conc_enter_innodb() to determine if the thread
-should be allowed to enter InnoDB - the replication thread is treated
-differently than other threads. Also used in
-srv_conc_force_exit_innodb().
+server.
 @return true if thd is the replication thread */
 ibool
 thd_is_replication_slave_thread(
@@ -147,16 +145,6 @@ innobase_mysql_print_thd(
 	uint	max_query_len);	/*!< in: max query length to print, or 0 to
 				   use the default max length */
 
-/*****************************************************************//**
-Log code calls this whenever log has been written and/or flushed up
-to a new position. We use this to notify upper layer of a new commit
-checkpoint when necessary.*/
-UNIV_INTERN
-void
-innobase_mysql_log_notify(
-/*======================*/
-	ib_uint64_t	flush_lsn);	/*!< in: LSN flushed to disk */
-
 /** Converts a MySQL type to an InnoDB type. Note that this function returns
 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
 VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
@@ -164,10 +152,8 @@ VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
 at least ENUM and SET, and unsigned integer types are 'unsigned types'
 @param[in]	f			MySQL Field
 @return DATA_BINARY, DATA_VARCHAR, ... */
-ulint
-get_innobase_type_from_mysql_type(
-	ulint*			unsigned_flag,
-	const void*		field);
+uint8_t
+get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field);
 
 /******************************************************************//**
 Get the variable length bounds of the given character set. */
@@ -175,8 +161,8 @@ void
 innobase_get_cset_width(
 /*====================*/
 	ulint	cset,		/*!< in: MySQL charset-collation code */
-	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
-	ulint*	mbmaxlen);	/*!< out: maximum length of a char (in bytes) */
+	unsigned*mbminlen,	/*!< out: minimum length of a char (in bytes) */
+	unsigned*mbmaxlen);	/*!< out: maximum length of a char (in bytes) */
 
 /******************************************************************//**
 Compares NUL-terminated UTF-8 strings case insensitively.
@@ -195,14 +181,6 @@ innobase_basename(
 	const char*	path_name);
 
 /******************************************************************//**
-Returns true if the thread is executing a SELECT statement.
-@return true if thd is executing SELECT */
-ibool
-thd_is_select(
-/*==========*/
-	const THD*	thd);	/*!< in: thread handle */
-
-/******************************************************************//**
 Converts an identifier to a table name. */
 void
 innobase_convert_from_table_id(
@@ -235,10 +213,9 @@ wsrep_innobase_kill_one_trx(
 	THD* bf_thd,
 	trx_t *victim_trx,
 	my_bool signal);
-
 ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
-                             unsigned char* str, unsigned int str_length,
-                             unsigned int buf_length);
+                             unsigned char* str, ulint str_length,
+                             ulint buf_length);
 #endif /* WITH_WSREP */
 
 extern "C" struct charset_info_st *thd_charset(THD *thd);
@@ -499,6 +476,17 @@ ib_push_warning(
 	const char	*format,/*!< in: warning message */
 	...);
 
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+UNIV_INTERN
+void
+ib_foreign_warn(
+	trx_t*		trx,	/*!< in: trx */
+	dberr_t		error,	/*!< in: error code to push as warning */
+	const char	*table_name,
+	const char	*format,/*!< in: warning message */
+	...);
+
 /*****************************************************************//**
 Normalizes a table name string. A normalized name consists of the
 database name catenated to '/' and table name. An example:
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index e2565c62169..981ff5a0814 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -24,75 +24,14 @@ The simple hash table utility
 Created 5/20/1997 Heikki Tuuri
 *******************************************************/
 
-#ifndef hash0hash_h
-#define hash0hash_h
-
-#include "mem0mem.h"
-#include "sync0rw.h"
+#pragma once
+#include "ut0rnd.h"
 
 struct hash_table_t;
-struct hash_cell_t;
-
-typedef void*	hash_node_t;
-
-/* Fix Bug #13859: symbol collision between imap/mysql */
-#define hash_create hash0_create
-
-/* Differnt types of hash_table based on the synchronization
-method used for it. */
-enum hash_table_sync_t {
-	HASH_TABLE_SYNC_NONE = 0,	/*!< Don't use any internal
-					synchronization objects for
-					this hash_table. */
-	HASH_TABLE_SYNC_MUTEX,		/*!< Use mutexes to control
-					access to this hash_table. */
-	HASH_TABLE_SYNC_RW_LOCK		/*!< Use rw_locks to control
-					access to this hash_table. */
+struct hash_cell_t{
+	void*	node;	/*!< hash chain node, NULL if none */
 };
-
-/*************************************************************//**
-Creates a hash table with >= n array cells. The actual number
-of cells is chosen to be a prime number slightly bigger than n.
-@return own: created table */
-hash_table_t*
-hash_create(
-/*========*/
-	ulint	n);	/*!< in: number of array cells */
-
-/*************************************************************//**
-Creates a sync object array array to protect a hash table.
-::sync_obj can be mutexes or rw_locks depening on the type of
-hash table. */
-void
-hash_create_sync_obj(
-/*=================*/
-	hash_table_t*		table,	/*!< in: hash table */
-	hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
-					or HASH_TABLE_SYNC_RW_LOCK */
-	latch_id_t		id,	/*!< in: mutex/rw_lock ID */
-	ulint			n_sync_obj);/*!< in: number of sync objects,
-					must be a power of 2 */
-
-/*************************************************************//**
-Frees a hash table. */
-void
-hash_table_free(
-/*============*/
-	hash_table_t*	table);	/*!< in, own: hash table */
-/**************************************************************//**
-Calculates the hash value from a folded value.
-@return hashed value */
-UNIV_INLINE
-ulint
-hash_calc_hash(
-/*===========*/
-	ulint		fold,	/*!< in: folded value */
-	hash_table_t*	table);	/*!< in: hash table */
-/********************************************************************//**
-Assert that the mutex for the table is held */
-#define HASH_ASSERT_OWN(TABLE, FOLD)				\
-	ut_ad((TABLE)->type != HASH_TABLE_SYNC_MUTEX		\
-	      || (mutex_own(hash_get_mutex((TABLE), FOLD))));
+typedef void*	hash_node_t;
 
 /*******************************************************************//**
 Inserts a struct to a hash table. */
@@ -102,11 +41,9 @@ do {\
 	hash_cell_t*	cell3333;\
 	TYPE*		struct3333;\
 \
-	HASH_ASSERT_OWN(TABLE, FOLD)\
-\
 	(DATA)->NAME = NULL;\
 \
-	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+	cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)];	\
 \
 	if (cell3333->node == NULL) {\
 		cell3333->node = DATA;\
@@ -130,11 +67,9 @@ do {							\
 	hash_cell_t*	cell3333;			\
 	TYPE*		struct3333;			\
 							\
-	HASH_ASSERT_OWN(TABLE, FOLD)			\
-							\
 	(DATA)->NAME = NULL;				\
 							\
-	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+	cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)];	\
 							\
 	if (cell3333->node == NULL) {			\
 		cell3333->node = DATA;			\
@@ -163,9 +98,7 @@ do {\
 	hash_cell_t*	cell3333;\
 	TYPE*		struct3333;\
 \
-	HASH_ASSERT_OWN(TABLE, FOLD)\
-\
-	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+	cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
 \
 	if (cell3333->node == DATA) {\
 		HASH_ASSERT_VALID(DATA->NAME);\
@@ -189,7 +122,7 @@ do {\
 		(DATA_NEW)->NAME = (DATA_OLD)->NAME;                          \
                                                                               \
 		hash_cell_t& cell3333                                         \
-			= TABLE->array[hash_calc_hash(FOLD, TABLE)];          \
+			= (TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
 		TYPE** struct3333 = (TYPE**)&cell3333.node;                   \
 		while (*struct3333 != DATA_OLD) {                             \
 			struct3333 = &((*struct3333)->NAME);                  \
@@ -199,8 +132,7 @@ do {\
 /*******************************************************************//**
 Gets the first struct in a hash chain, NULL if none. */
 
-#define HASH_GET_FIRST(TABLE, HASH_VAL)\
-	(hash_get_nth_cell(TABLE, HASH_VAL)->node)
+#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node
 
 /*******************************************************************//**
 Gets the next struct in a hash chain, NULL if none. */
@@ -211,10 +143,7 @@ Gets the next struct in a hash chain, NULL if none. */
 Looks for a struct in a hash table. */
 #define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
 {\
-\
-	HASH_ASSERT_OWN(TABLE, FOLD)\
-\
-	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \
 	HASH_ASSERT_VALID(DATA);\
 \
 	while ((DATA) != NULL) {\
@@ -254,92 +183,6 @@ do {									\
 	}								\
 } while (0)
 
-/************************************************************//**
-Gets the nth cell in a hash table.
-@return pointer to cell */
-UNIV_INLINE
-hash_cell_t*
-hash_get_nth_cell(
-/*==============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		n);	/*!< in: cell index */
-
-/*************************************************************//**
-Clears a hash table so that all the cells become empty. */
-UNIV_INLINE
-void
-hash_table_clear(
-/*=============*/
-	hash_table_t*	table);	/*!< in/out: hash table */
-
-/*************************************************************//**
-Returns the number of cells in a hash table.
-@return number of cells */
-UNIV_INLINE
-ulint
-hash_get_n_cells(
-/*=============*/
-	hash_table_t*	table);	/*!< in: table */
-/*******************************************************************//**
-Deletes a struct which is stored in the heap of the hash table, and compacts
-the heap. The fold value must be stored in the struct NODE in a field named
-'fold'. */
-
-#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\
-do {\
-	TYPE*		node111;\
-	TYPE*		top_node111;\
-	hash_cell_t*	cell111;\
-	ulint		fold111;\
-\
-	fold111 = (NODE)->fold;\
-\
-	HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
-\
-	top_node111 = (TYPE*) mem_heap_get_top(\
-				hash_get_heap(TABLE, fold111),\
-							sizeof(TYPE));\
-\
-	/* If the node to remove is not the top node in the heap, compact the\
-	heap of nodes by moving the top node in the place of NODE. */\
-\
-	if (NODE != top_node111) {\
-\
-		/* Copy the top node in place of NODE */\
-\
-		*(NODE) = *top_node111;\
-\
-		cell111 = hash_get_nth_cell(TABLE,\
-				hash_calc_hash(top_node111->fold, TABLE));\
-\
-		/* Look for the pointer to the top node, to update it */\
-\
-		if (cell111->node == top_node111) {\
-			/* The top node is the first in the chain */\
-\
-			cell111->node = NODE;\
-		} else {\
-			/* We have to look for the predecessor of the top\
-			node */\
-			node111 = static_cast<TYPE*>(cell111->node);\
-\
-			while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
-\
-				node111 = static_cast<TYPE*>(\
-					HASH_GET_NEXT(NAME, node111));\
-			}\
-\
-			/* Now we have the predecessor node */\
-\
-			node111->NAME = NODE;\
-		}\
-	}\
-\
-	/* Free the space occupied by the top node */\
-\
-	mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\
-} while (0)
-
 /****************************************************************//**
 Move all hash table entries from OLD_TABLE to NEW_TABLE. */
 
@@ -348,7 +191,7 @@ do {\
 	ulint		i2222;\
 	ulint		cell_count2222;\
 \
-	cell_count2222 = hash_get_n_cells(OLD_TABLE);\
+	cell_count2222 = (OLD_TABLE)->n_cells;	\
 \
 	for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
 		NODE_TYPE*	node2222 = static_cast<NODE_TYPE*>(\
@@ -367,159 +210,27 @@ do {\
 	}\
 } while (0)
 
-/************************************************************//**
-Gets the sync object index for a fold value in a hash table.
-@return index */
-UNIV_INLINE
-ulint
-hash_get_sync_obj_index(
-/*====================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold);	/*!< in: fold */
-/************************************************************//**
-Gets the nth heap in a hash table.
-@return mem heap */
-UNIV_INLINE
-mem_heap_t*
-hash_get_nth_heap(
-/*==============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		i);	/*!< in: index of the heap */
-/************************************************************//**
-Gets the heap for a fold value in a hash table.
-@return mem heap */
-UNIV_INLINE
-mem_heap_t*
-hash_get_heap(
-/*==========*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold);	/*!< in: fold */
-/************************************************************//**
-Gets the nth mutex in a hash table.
-@return mutex */
-UNIV_INLINE
-ib_mutex_t*
-hash_get_nth_mutex(
-/*===============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		i);	/*!< in: index of the mutex */
-/************************************************************//**
-Gets the nth rw_lock in a hash table.
-@return rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_get_nth_lock(
-/*==============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		i);	/*!< in: index of the rw_lock */
-/************************************************************//**
-Gets the mutex for a fold value in a hash table.
-@return mutex */
-UNIV_INLINE
-ib_mutex_t*
-hash_get_mutex(
-/*===========*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold);	/*!< in: fold */
-/************************************************************//**
-Gets the rw_lock for a fold value in a hash table.
-@return rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_get_lock(
-/*==========*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold);	/*!< in: fold */
-
-/** If not appropriate rw_lock for a fold value in a hash table,
-relock S-lock the another rw_lock until appropriate for a fold value.
-@param[in]	hash_lock	latched rw_lock to be confirmed
-@param[in]	table		hash table
-@param[in]	fold		fold value
-@return	latched rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_lock_s_confirm(
-	rw_lock_t*	hash_lock,
-	hash_table_t*	table,
-	ulint		fold);
-
-/** If not appropriate rw_lock for a fold value in a hash table,
-relock X-lock the another rw_lock until appropriate for a fold value.
-@param[in]	hash_lock	latched rw_lock to be confirmed
-@param[in]	table		hash table
-@param[in]	fold		fold value
-@return	latched rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_lock_x_confirm(
-	rw_lock_t*	hash_lock,
-	hash_table_t*	table,
-	ulint		fold);
-
-/************************************************************//**
-Reserves all the locks of a hash table, in an ascending order. */
-void
-hash_lock_x_all(
-/*============*/
-	hash_table_t*	table);	/*!< in: hash table */
-/************************************************************//**
-Releases all the locks of a hash table, in an ascending order. */
-void
-hash_unlock_x_all(
-/*==============*/
-	hash_table_t*	table);	/*!< in: hash table */
-/************************************************************//**
-Releases all but passed in lock of a hash table, */
-void
-hash_unlock_x_all_but(
-/*==================*/
-	hash_table_t*	table,		/*!< in: hash table */
-	rw_lock_t*	keep_lock);	/*!< in: lock to keep */
-
-struct hash_cell_t{
-	void*	node;	/*!< hash chain node, NULL if none */
-};
-
-/* The hash table structure */
-struct hash_table_t {
-	enum hash_table_sync_t	type;	/*<! type of hash_table. */
-#ifdef BTR_CUR_HASH_ADAPT
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	ibool			adaptive;/* TRUE if this is the hash
-					table of the adaptive hash
-					index */
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-#endif /* BTR_CUR_HASH_ADAPT */
-	ulint			n_cells;/* number of cells in the hash table */
-	hash_cell_t*		array;	/*!< pointer to cell array */
-
-	ulint			n_sync_obj;/* if sync_objs != NULL, then
-					the number of either the number
-					of mutexes or the number of
-					rw_locks depending on the type.
-					Must be a power of 2 */
-	union {
-		ib_mutex_t*	mutexes;/* NULL, or an array of mutexes
-					used to protect segments of the
-					hash table */
-		rw_lock_t*	rw_locks;/* NULL, or an array of rw_lcoks
-					used to protect segments of the
-					hash table */
-	} sync_obj;
-
-	mem_heap_t**		heaps;	/*!< if this is non-NULL, hash
-					chain nodes for external chaining
-					can be allocated from these memory
-					heaps; there are then n_mutexes
-					many of these heaps */
-	mem_heap_t*		heap;
-#ifdef UNIV_DEBUG
-	ulint			magic_n;
-# define HASH_TABLE_MAGIC_N	76561114
-#endif /* UNIV_DEBUG */
+/** Hash table with singly-linked overflow lists */
+struct hash_table_t
+{
+  /** number of elements in array (a prime number); set by create() */
+  ulint n_cells;
+  /** the hash array of n_cells cells; nullptr after free() */
+  hash_cell_t *array;
+
+  /** Create the hash table: pick n_cells and allocate the cell array.
+  @param n  the lower bound of n_cells */
+  void create(ulint n)
+  {
+    n_cells= ut_find_prime(n);
+    array= static_cast<hash_cell_t*>(ut_zalloc_nokey(n_cells * sizeof *array));
+  }
+
+  /** Clear the hash table: zero every cell; the array stays allocated. */
+  void clear() { memset(array, 0, n_cells * sizeof *array); }
+
+  /** Free the cell array and reset the pointer; n_cells is not changed. */
+  void free() { ut_free(array); array= nullptr; }
+  /** @return the hash value of a fold value, computed over n_cells */
+  ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); }
 };
-
-#include "hash0hash.inl"
-
-#endif
diff --git a/storage/innobase/include/hash0hash.inl b/storage/innobase/include/hash0hash.inl
deleted file mode 100644
index d6dd104572f..00000000000
--- a/storage/innobase/include/hash0hash.inl
+++ /dev/null
@@ -1,277 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/hash0hash.ic
-The simple hash table utility
-
-Created 5/20/1997 Heikki Tuuri
-*******************************************************/
-
-#include "ut0rnd.h"
-
-/************************************************************//**
-Gets the nth cell in a hash table.
-@return pointer to cell */
-UNIV_INLINE
-hash_cell_t*
-hash_get_nth_cell(
-/*==============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		n)	/*!< in: cell index */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(n < table->n_cells);
-
-	return(table->array + n);
-}
-
-/*************************************************************//**
-Clears a hash table so that all the cells become empty. */
-UNIV_INLINE
-void
-hash_table_clear(
-/*=============*/
-	hash_table_t*	table)	/*!< in/out: hash table */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	memset(table->array, 0x0,
-	       table->n_cells * sizeof(*table->array));
-}
-
-/*************************************************************//**
-Returns the number of cells in a hash table.
-@return number of cells */
-UNIV_INLINE
-ulint
-hash_get_n_cells(
-/*=============*/
-	hash_table_t*	table)	/*!< in: table */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	return(table->n_cells);
-}
-
-/**************************************************************//**
-Calculates the hash value from a folded value.
-@return hashed value */
-UNIV_INLINE
-ulint
-hash_calc_hash(
-/*===========*/
-	ulint		fold,	/*!< in: folded value */
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	return(ut_hash_ulint(fold, table->n_cells));
-}
-
-/************************************************************//**
-Gets the sync object index for a fold value in a hash table.
-@return index */
-UNIV_INLINE
-ulint
-hash_get_sync_obj_index(
-/*====================*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(table->type != HASH_TABLE_SYNC_NONE);
-	ut_ad(ut_is_2pow(table->n_sync_obj));
-	return(ut_2pow_remainder(hash_calc_hash(fold, table),
-				 table->n_sync_obj));
-}
-
-/************************************************************//**
-Gets the nth heap in a hash table.
-@return mem heap */
-UNIV_INLINE
-mem_heap_t*
-hash_get_nth_heap(
-/*==============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		i)	/*!< in: index of the heap */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(table->type != HASH_TABLE_SYNC_NONE);
-	ut_ad(i < table->n_sync_obj);
-
-	return(table->heaps[i]);
-}
-
-/************************************************************//**
-Gets the heap for a fold value in a hash table.
-@return mem heap */
-UNIV_INLINE
-mem_heap_t*
-hash_get_heap(
-/*==========*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	ulint	i;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-
-	if (table->heap) {
-		return(table->heap);
-	}
-
-	i = hash_get_sync_obj_index(table, fold);
-
-	return(hash_get_nth_heap(table, i));
-}
-
-/************************************************************//**
-Gets the nth mutex in a hash table.
-@return mutex */
-UNIV_INLINE
-ib_mutex_t*
-hash_get_nth_mutex(
-/*===============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		i)	/*!< in: index of the mutex */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
-	ut_ad(i < table->n_sync_obj);
-
-	return(table->sync_obj.mutexes + i);
-}
-
-/************************************************************//**
-Gets the mutex for a fold value in a hash table.
-@return mutex */
-UNIV_INLINE
-ib_mutex_t*
-hash_get_mutex(
-/*===========*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	ulint	i;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-
-	i = hash_get_sync_obj_index(table, fold);
-
-	return(hash_get_nth_mutex(table, i));
-}
-
-/************************************************************//**
-Gets the nth rw_lock in a hash table.
-@return rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_get_nth_lock(
-/*==============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		i)	/*!< in: index of the rw_lock */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
-	ut_ad(i < table->n_sync_obj);
-
-	return(table->sync_obj.rw_locks + i);
-}
-
-/************************************************************//**
-Gets the rw_lock for a fold value in a hash table.
-@return rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_get_lock(
-/*==========*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	ulint	i;
-
-	ut_ad(table);
-	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-
-	i = hash_get_sync_obj_index(table, fold);
-
-	return(hash_get_nth_lock(table, i));
-}
-
-/** If not appropriate rw_lock for a fold value in a hash table,
-relock S-lock the another rw_lock until appropriate for a fold value.
-@param[in]	hash_lock	latched rw_lock to be confirmed
-@param[in]	table		hash table
-@param[in]	fold		fold value
-@return	latched rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_lock_s_confirm(
-	rw_lock_t*	hash_lock,
-	hash_table_t*	table,
-	ulint		fold)
-{
-	ut_ad(rw_lock_own(hash_lock, RW_LOCK_S));
-
-	rw_lock_t*	hash_lock_tmp = hash_get_lock(table, fold);
-
-	while (hash_lock_tmp != hash_lock) {
-		rw_lock_s_unlock(hash_lock);
-		hash_lock = hash_lock_tmp;
-		rw_lock_s_lock(hash_lock);
-		hash_lock_tmp = hash_get_lock(table, fold);
-	}
-
-	return(hash_lock);
-}
-
-/** If not appropriate rw_lock for a fold value in a hash table,
-relock X-lock the another rw_lock until appropriate for a fold value.
-@param[in]	hash_lock	latched rw_lock to be confirmed
-@param[in]	table		hash table
-@param[in]	fold		fold value
-@return	latched rw_lock */
-UNIV_INLINE
-rw_lock_t*
-hash_lock_x_confirm(
-	rw_lock_t*	hash_lock,
-	hash_table_t*	table,
-	ulint		fold)
-{
-	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
-
-	rw_lock_t*	hash_lock_tmp = hash_get_lock(table, fold);
-
-	while (hash_lock_tmp != hash_lock) {
-		rw_lock_x_unlock(hash_lock);
-		hash_lock = hash_lock_tmp;
-		rw_lock_x_lock(hash_lock);
-		hash_lock_tmp = hash_get_lock(table, fold);
-	}
-
-	return(hash_lock);
-}
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h
index ce0e911dbb4..81ab756665d 100644
--- a/storage/innobase/include/ib0mutex.h
+++ b/storage/innobase/include/ib0mutex.h
@@ -145,7 +145,7 @@ private:
 };
 
 
-#ifdef HAVE_IB_LINUX_FUTEX
+#ifdef __linux__
 
 #include <linux/futex.h>
 #include <sys/syscall.h>
@@ -261,7 +261,7 @@ private:
 	std::atomic<int32>	m_lock_word;
 };
 
-#endif /* HAVE_IB_LINUX_FUTEX */
+#endif /* __linux__ */
 
 template <template <typename> class Policy>
 struct TTASMutex {
@@ -457,11 +457,7 @@ struct TTASEventMutex {
 
 				sync_cell_t*	cell;
 				sync_array_t *sync_arr = sync_array_get_and_reserve_cell(
-					this,
-					(m_policy.get_id() == LATCH_ID_BUF_BLOCK_MUTEX
-					 || m_policy.get_id() == LATCH_ID_BUF_POOL_ZIP)
-					? SYNC_BUF_BLOCK
-					: SYNC_MUTEX,
+					this, SYNC_MUTEX,
 					filename, line, &cell);
 
 				uint32_t oldval = MUTEX_STATE_LOCKED;
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
index 73f7054c9fb..ff80db5f92a 100644
--- a/storage/innobase/include/ibuf0ibuf.h
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2019, MariaDB Corporation.
+Copyright (c) 2016, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -62,7 +62,7 @@ enum ibuf_use_t {
 extern ulong		innodb_change_buffering;
 
 /** The insert buffer control structure */
-extern ibuf_t*		ibuf;
+extern ibuf_t		ibuf;
 
 /* The purpose of the insert buffer is to reduce random disk access.
 When we wish to insert a record into a non-unique secondary index and
@@ -317,13 +317,11 @@ ibuf_insert(
 	ulint			zip_size,
 	que_thr_t*		thr);
 
-/**
-Delete any buffered entries for a page.
-This prevents an infinite loop on slow shutdown
-in the case where the change buffer bitmap claims that no buffered
-changes exist, while entries exist in the change buffer tree.
-@param page_id  page number for which there should be no unbuffered changes */
-ATTRIBUTE_COLD void ibuf_delete_recs(const page_id_t page_id);
+/** Check whether buffered changes exist for a page.
+@param[in]	id		page identifier
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return whether buffered changes exist */
+bool ibuf_page_exists(const page_id_t id, ulint zip_size);
 
 /** When an index page is read from a disk to the buffer pool, this function
 applies any buffered operations to the page and deletes the entries from the
@@ -343,15 +341,10 @@ in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery.
 void ibuf_delete_for_discarded_space(ulint space);
 
 /** Contract the change buffer by reading pages to the buffer pool.
-@param[in]	full		If true, do a full contraction based
-on PCT_IO(100). If false, the size of contract batch is determined
-based on the current size of the change buffer.
 @return a lower limit for the combined size in bytes of entries which
 will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
-ulint
-ibuf_merge_in_background(
-	bool	full);
+ulint ibuf_merge_all();
 
 /** Contracts insert buffer trees by reading pages referring to space_id
 to the buffer pool.
@@ -361,9 +354,6 @@ ibuf_merge_space(
 /*=============*/
 	ulint	space);	/*!< in: space id */
 
-/** Apply MLOG_IBUF_BITMAP_INIT when crash-upgrading */
-ATTRIBUTE_COLD void ibuf_bitmap_init_apply(buf_block_t* block);
-
 /******************************************************************//**
 Looks if the insert buffer is empty.
 @return true if empty */
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
index db8c122c0f7..2c2620511c7 100644
--- a/storage/innobase/include/ibuf0ibuf.inl
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -44,6 +44,11 @@ ibuf_mtr_start(
 {
 	mtr_start(mtr);
 	mtr->enter_ibuf();
+
+	if (high_level_read_only || srv_read_only_mode) {
+		mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
+	}
+
 }
 /***************************************************************//**
 Commits an insert buffer mini-transaction. */
@@ -126,12 +131,11 @@ ibuf_should_try(
 						decide */
 {
 	return(innodb_change_buffering
-	       && ibuf->max_size != 0
+	       && ibuf.max_size != 0
 	       && !dict_index_is_clust(index)
 	       && !dict_index_is_spatial(index)
 	       && index->table->quiesce == QUIESCE_NONE
-	       && (ignore_sec_unique || !dict_index_is_unique(index))
-	       && srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE);
+	       && (ignore_sec_unique || !dict_index_is_unique(index)));
 }
 
 /******************************************************************//**
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 9c3f5d57f01..a107359ccf1 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -252,15 +252,6 @@ lock_rec_restore_from_page_infimum(
 					state; lock bits are reset on
 					the infimum */
 /*********************************************************************//**
-Determines if there are explicit record locks on a page.
-@return an explicit record lock on the page, or NULL if there are none */
-lock_t*
-lock_rec_expl_exist_on_page(
-/*========================*/
-	ulint	space,	/*!< in: space id */
-	ulint	page_no)/*!< in: page number */
-	MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
 Checks if locks of other transactions prevent an immediate insert of
 a record. If they do, first tests if the query thread should anyway
 be suspended for some reason; if not, then puts the transaction and
@@ -344,7 +335,7 @@ lock_sec_rec_read_check_and_lock(
 					records: LOCK_S or LOCK_X; the
 					latter is possible in
 					SELECT FOR UPDATE */
-	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr);	/*!< in: query thread */
 /*********************************************************************//**
@@ -372,7 +363,7 @@ lock_clust_rec_read_check_and_lock(
 					records: LOCK_S or LOCK_X; the
 					latter is possible in
 					SELECT FOR UPDATE */
-	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr);	/*!< in: query thread */
 /*********************************************************************//**
@@ -401,7 +392,7 @@ lock_clust_rec_read_check_and_lock_alt(
 					records: LOCK_S or LOCK_X; the
 					latter is possible in
 					SELECT FOR UPDATE */
-	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr)	/*!< in: query thread */
 	MY_ATTRIBUTE((warn_unused_result));
@@ -443,7 +434,7 @@ be granted immediately, the query thread is put to wait.
 dberr_t
 lock_table(
 /*=======*/
-	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+	unsigned	flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
 				does nothing */
 	dict_table_t*	table,	/*!< in/out: database table
 				in dictionary cache */
@@ -487,27 +478,9 @@ lock_rec_unlock(
 and release possible other transactions waiting because of these locks. */
 void lock_release(trx_t* trx);
 
-/*********************************************************************//**
-Calculates the fold value of a page file address: used in inserting or
-searching for a lock in the hash table.
-@return folded value */
-UNIV_INLINE
-ulint
-lock_rec_fold(
-/*==========*/
-	ulint	space,	/*!< in: space */
-	ulint	page_no)/*!< in: page number */
-	MY_ATTRIBUTE((const));
-/*********************************************************************//**
-Calculates the hash value of a page file address: used in inserting or
-searching for a lock in the hash table.
-@return hashed value */
-UNIV_INLINE
-unsigned
-lock_rec_hash(
-/*==========*/
-	ulint	space,	/*!< in: space */
-	ulint	page_no);/*!< in: page number */
+/** Release non-exclusive locks on XA PREPARE,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_prepare(trx_t *trx);
 
 /*************************************************************//**
 Get the lock hash table */
@@ -606,42 +579,6 @@ lock_get_type(
 	const lock_t*	lock);	/*!< in: lock */
 
 /*******************************************************************//**
-Gets the trx of the lock. Non-inline version for using outside of the
-lock module.
-@return	trx_t* */
-UNIV_INTERN
-trx_t*
-lock_get_trx(
-/*=========*/
-	const lock_t*	lock);	/*!< in: lock */
-
-/*******************************************************************//**
-Gets the id of the transaction owning a lock.
-@return transaction id */
-trx_id_t
-lock_get_trx_id(
-/*============*/
-	const lock_t*	lock);	/*!< in: lock */
-
-/*******************************************************************//**
-Gets the mode of a lock in a human readable string.
-The string should not be free()'d or modified.
-@return lock mode */
-const char*
-lock_get_mode_str(
-/*==============*/
-	const lock_t*	lock);	/*!< in: lock */
-
-/*******************************************************************//**
-Gets the type of a lock in a human readable string.
-The string should not be free()'d or modified.
-@return lock type */
-const char*
-lock_get_type_str(
-/*==============*/
-	const lock_t*	lock);	/*!< in: lock */
-
-/*******************************************************************//**
 Gets the id of the table on which the lock is.
 @return id of the table */
 table_id_t
@@ -674,21 +611,6 @@ lock_rec_get_index_name(
 	const lock_t*	lock);	/*!< in: lock */
 
 /*******************************************************************//**
-For a record lock, gets the tablespace number on which the lock is.
-@return tablespace number */
-ulint
-lock_rec_get_space_id(
-/*==================*/
-	const lock_t*	lock);	/*!< in: lock */
-
-/*******************************************************************//**
-For a record lock, gets the page number on which the lock is.
-@return page number */
-ulint
-lock_rec_get_page_no(
-/*=================*/
-	const lock_t*	lock);	/*!< in: lock */
-/*******************************************************************//**
 Check if there are any locks (table or rec) against table.
 @return TRUE if locks exist */
 bool
@@ -698,15 +620,8 @@ lock_table_has_locks(
 					held on records in this table or on the
 					table itself */
 
-/*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(lock_wait_timeout_thread)(
-/*=====================================*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
+/** A task which wakes up threads whose lock wait may have lasted too long */
+void lock_wait_timeout_task(void*);
 
 /********************************************************************//**
 Releases a user OS thread waiting for a lock to be released, if the
@@ -759,11 +674,6 @@ lock_trx_lock_list_init(
 /*====================*/
 	trx_lock_list_t*	lock_list);	/*!< List to initialise */
 
-/*******************************************************************//**
-Set the lock system timeout event. */
-void
-lock_set_timeout_event();
-/*====================*/
 /*********************************************************************//**
 Checks that a transaction id is sensible, i.e., not in the future.
 @return true if ok */
@@ -817,12 +727,12 @@ public:
 	MY_ALIGNED(CACHE_LINE_SIZE)
 	LockMutex	mutex;			/*!< Mutex protecting the
 						locks */
-	hash_table_t*	rec_hash;		/*!< hash table of the record
-						locks */
-	hash_table_t*	prdt_hash;		/*!< hash table of the predicate
-						lock */
-	hash_table_t*	prdt_page_hash;		/*!< hash table of the page
-						lock */
+  /** record locks */
+  hash_table_t rec_hash;
+  /** predicate locks for SPATIAL INDEX */
+  hash_table_t prdt_hash;
+  /** page locks for SPATIAL INDEX */
+  hash_table_t prdt_page_hash;
 
 	MY_ALIGNED(CACHE_LINE_SIZE)
 	LockMutex	wait_mutex;		/*!< Mutex protecting the
@@ -843,14 +753,8 @@ public:
 
 	ulint		n_lock_max_wait_time;	/*!< Max wait time */
 
-	os_event_t	timeout_event;		/*!< An event waited for by
-						lock_wait_timeout_thread.
-						Not protected by a mutex,
-						but the waits are timed.
-						Signaled on shutdown only. */
-
-	bool		timeout_thread_active;	/*!< True if the timeout thread
-						is running */
+	std::unique_ptr<tpool::timer>	timeout_timer; /*!< Thread pool timer task */
+	bool timeout_timer_active;
 
 
   /**
@@ -883,6 +787,46 @@ public:
 
   /** Closes the lock system at database shutdown. */
   void close();
+
+  /** @return the rec_hash hash value of a page id; caller must own mutex */
+  ulint hash(const page_id_t id) const
+  { ut_ad(mutex_own(&mutex)); return rec_hash.calc_hash(id.fold()); }
+
+  /** Get the first lock on a page.
+  @param lock_hash   rec_hash, prdt_hash or prdt_page_hash; NOTE(review): the
+  cell index is hash(id), i.e. based on rec_hash.n_cells -- confirm all match
+  @param id          page identifier
+  @return first lock for the page, or nullptr if none exists */
+  lock_t *get_first(const hash_table_t &lock_hash, const page_id_t id) const
+  {
+    ut_ad(&lock_hash == &rec_hash || &lock_hash == &prdt_hash ||
+          &lock_hash == &prdt_page_hash);
+    for (lock_t *lock= static_cast<lock_t*>
+         (HASH_GET_FIRST(&lock_hash, hash(id)));
+         lock; lock= static_cast<lock_t*>(HASH_GET_NEXT(hash, lock)))
+      if (lock->un_member.rec_lock.page_id == id)
+         return lock;
+    return nullptr;
+  }
+
+  /** Get the first record lock on a page.
+  @param id          page identifier
+  @return first lock for the page in rec_hash
+  @retval nullptr if none exists */
+  lock_t *get_first(const page_id_t id) const
+  { return get_first(rec_hash, id); }
+  /** Get the first predicate lock on a SPATIAL INDEX page.
+  @param id          page identifier
+  @return first lock for the page in prdt_hash
+  @retval nullptr if none exists */
+  lock_t *get_first_prdt(const page_id_t id) const
+  { return get_first(prdt_hash, id); }
+  /** Get the first page lock on a SPATIAL INDEX page.
+  @param id          page identifier
+  @return first lock for the page in prdt_page_hash
+  @retval nullptr if none exists */
+  lock_t *get_first_prdt_page(const page_id_t id) const
+  { return get_first(prdt_page_hash, id); }
 };
 
 /*********************************************************************//**
@@ -897,7 +841,7 @@ lock_rec_create(
 	lock_t*			c_lock,	/*!< conflicting lock */
 	que_thr_t*		thr,	/*!< thread owning trx */
 #endif
-	ulint			type_mode,/*!< in: lock mode and wait
+	unsigned		type_mode,/*!< in: lock mode and wait
 					flag, type is ignored and
 					replaced by LOCK_REC */
 	const buf_block_t*	block,	/*!< in: buffer block containing
@@ -922,8 +866,7 @@ lock_rec_discard(
 without checking for deadlocks or conflicts.
 @param[in]	type_mode	lock mode and wait flag; type will be replaced
 				with LOCK_REC
-@param[in]	space		tablespace id
-@param[in]	page_no		index page number
+@param[in]	page_id		index page identifier
 @param[in]	page		R-tree index page, or NULL
 @param[in]	heap_no		record heap number in the index page
 @param[in]	index		the index tree
@@ -936,9 +879,8 @@ lock_rec_create_low(
 	lock_t*		c_lock,	/*!< conflicting lock */
 	que_thr_t*	thr,	/*!< thread owning trx */
 #endif
-	ulint		type_mode,
-	ulint		space,
-	ulint		page_no,
+	unsigned	type_mode,
+	const page_id_t	page_id,
 	const page_t*	page,
 	ulint		heap_no,
 	dict_index_t*	index,
@@ -967,7 +909,7 @@ lock_rec_enqueue_waiting(
 #ifdef WITH_WSREP
 	lock_t*			c_lock,	/*!< conflicting lock */
 #endif
-	ulint			type_mode,
+	unsigned		type_mode,
 	const buf_block_t*	block,
 	ulint			heap_no,
 	dict_index_t*		index,
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
index abe5052627b..2d5b6ff37f1 100644
--- a/storage/innobase/include/lock0lock.inl
+++ b/storage/innobase/include/lock0lock.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2018, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -29,35 +29,6 @@ Created 5/7/1996 Heikki Tuuri
 #include "page0page.h"
 
 /*********************************************************************//**
-Calculates the fold value of a page file address: used in inserting or
-searching for a lock in the hash table.
-@return folded value */
-UNIV_INLINE
-ulint
-lock_rec_fold(
-/*==========*/
-	ulint	space,	/*!< in: space */
-	ulint	page_no)/*!< in: page number */
-{
-	return(ut_fold_ulint_pair(space, page_no));
-}
-
-/*********************************************************************//**
-Calculates the hash value of a page file address: used in inserting or
-searching for a lock in the hash table.
-@return hashed value */
-UNIV_INLINE
-unsigned
-lock_rec_hash(
-/*==========*/
-	ulint	space,	/*!< in: space */
-	ulint	page_no)/*!< in: page number */
-{
-	return(unsigned(hash_calc_hash(lock_rec_fold(space, page_no),
-				       lock_sys.rec_hash)));
-}
-
-/*********************************************************************//**
 Gets the heap_no of the smallest user record on a page.
 @return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
 UNIV_INLINE
@@ -90,11 +61,11 @@ lock_hash_get(
 	ulint	mode)	/*!< in: lock mode */
 {
 	if (mode & LOCK_PREDICATE) {
-		return(lock_sys.prdt_hash);
+		return &lock_sys.prdt_hash;
 	} else if (mode & LOCK_PRDT_PAGE) {
-		return(lock_sys.prdt_page_hash);
+		return &lock_sys.prdt_page_hash;
 	} else {
-		return(lock_sys.rec_hash);
+		return &lock_sys.rec_hash;
 	}
 }
 
@@ -110,7 +81,7 @@ lock_rec_create(
 	lock_t*			c_lock,	/*!< conflicting lock */
 	que_thr_t*		thr,	/*!< thread owning trx */
 #endif
-	ulint			type_mode,/*!< in: lock mode and wait
+	unsigned		type_mode,/*!< in: lock mode and wait
 					flag, type is ignored and
 					replaced by LOCK_REC */
 	const buf_block_t*	block,	/*!< in: buffer block containing
@@ -127,8 +98,6 @@ lock_rec_create(
 #ifdef WITH_WSREP
 		c_lock, thr,
 #endif
-		type_mode,
-		block->page.id.space(), block->page.id.page_no(),
-		block->frame, heap_no,
+		type_mode, block->page.id(), block->frame, heap_no,
 		index, trx, caller_owns_trx_mutex);
 }
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
index 378a5c2faca..43d68996691 100644
--- a/storage/innobase/include/lock0prdt.h
+++ b/storage/innobase/include/lock0prdt.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -48,7 +48,7 @@ lock_prdt_lock(
 				records: LOCK_S or LOCK_X; the
 				latter is possible in
 				SELECT FOR UPDATE */
-	ulint		type_mode,
+	unsigned	type_mode,
 				/*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
 	que_thr_t*	thr);	/*!< in: query thread
 				(can be NULL if BTR_NO_LOCKING_FLAG) */
@@ -58,9 +58,7 @@ Acquire a "Page" lock on a block
 @return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
 dberr_t
 lock_place_prdt_page_lock(
-/*======================*/
-	ulint		space,	/*!< in: space for the page to lock */
-	ulint		pageno,	/*!< in: page number */
+	const page_id_t	page_id,	/*!< in: page identifier */
 	dict_index_t*	index,	/*!< in: secondary index */
 	que_thr_t*	thr);	/*!< in: query thread */
 
@@ -90,7 +88,7 @@ bool
 lock_prdt_has_to_wait(
 /*==================*/
 	const trx_t*	trx,	/*!< in: trx of new lock */
-	ulint		type_mode,/*!< in: precise mode of the new lock
+	unsigned	type_mode,/*!< in: precise mode of the new lock
 				to set: LOCK_S or LOCK_X, possibly
 				ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
 				LOCK_INSERT_INTENTION */
@@ -108,8 +106,7 @@ lock_prdt_update_split(
 	buf_block_t*	new_block,	/*!< in/out: the new half page */
 	lock_prdt_t*	prdt,		/*!< in: MBR on the old page */
 	lock_prdt_t*	new_prdt,	/*!< in: MBR on the new page */
-	ulint		space,		/*!< in: space id */
-	ulint		page_no);	/*!< in: page number */
+	const page_id_t	page_id);	/*!< in: page identifier */
 
 /**************************************************************//**
 Ajust locks from an ancester page of Rtree on the appropriate level . */
@@ -120,8 +117,7 @@ lock_prdt_update_parent(
 	buf_block_t*	right_block,	/*!< in/out: the new half page */
 	lock_prdt_t*	left_prdt,	/*!< in: MBR on the old page */
 	lock_prdt_t*	right_prdt,	/*!< in: MBR on the new page */
-	ulint		space,		/*!< in: space id */
-	ulint		page_no);	/*!< in: page number */
+	const page_id_t	page_id);	/*!< in: parent page */
 
 /*********************************************************************//**
 Checks if locks of other transactions prevent an immediate insert of
@@ -158,7 +154,7 @@ bool
 lock_prdt_has_to_wait(
 /*==================*/
 	const trx_t*	trx,	/*!< in: trx of new lock */
-	ulint		type_mode,/*!< in: precise mode of the new lock
+	unsigned	type_mode,/*!< in: precise mode of the new lock
 				to set: LOCK_S or LOCK_X, possibly
 				ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
 				LOCK_INSERT_INTENTION */
@@ -190,17 +186,11 @@ lock_prdt_rec_move(
 	const buf_block_t*	donator);	/*!< in: buffer block containing
 						the donating record */
 
-/** Check whether there are R-tree Page lock on a buffer page
+/** Check whether there are R-tree Page lock on a page
 @param[in]	trx	trx to test the lock
-@param[in]	space	space id for the page
-@param[in]	page_no	page number
-@return true if there is none */
-bool
-lock_test_prdt_page_lock(
-/*=====================*/
-	const trx_t*	trx,
-	ulint		space,
-	ulint		page_no);
+@param[in]	page_id	page identifier
+@return	true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
 
 /** Removes predicate lock objects set on an index page which is discarded.
 @param[in]	block		page to be discarded
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
index b7dcbfa2b86..f39692903fa 100644
--- a/storage/innobase/include/lock0priv.h
+++ b/storage/innobase/include/lock0priv.h
@@ -521,7 +521,7 @@ inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
 	byte*	b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
 	byte	mask = byte(1U << (i & 7));
 	byte	bit = *b & mask;
-	*b &= ~mask;
+	*b &= byte(~mask);
 
 	if (bit != 0) {
 		ut_ad(lock->trx->lock.n_rec_locks > 0);
@@ -539,29 +539,6 @@ lock_t*
 lock_rec_get_next_on_page(
 /*======================*/
 	lock_t*		lock);		/*!< in: a record lock */
-/*********************************************************************//**
-Gets the first record lock on a page, where the page is identified by its
-file address.
-@return first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first_on_page_addr(
-/*============================*/
-	hash_table_t*   lock_hash,	/* Lock hash table */
-	ulint           space,		/*!< in: space */
-	ulint           page_no);	/*!< in: page number */
-
-/*********************************************************************//**
-Gets the first record lock on a page, where the page is identified by a
-pointer to it.
-@return first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first_on_page(
-/*=======================*/
-	hash_table_t*		lock_hash,	/*!< in: lock hash table */
-	const buf_block_t*	block);		/*!< in: buffer block */
-
 
 /*********************************************************************//**
 Gets the next explicit lock request on a record.
@@ -633,20 +610,6 @@ lock_get_wait(
 	const lock_t*	lock);	/*!< in: lock */
 
 /*********************************************************************//**
-Looks for a suitable type record lock struct by the same trx on the same page.
-This can be used to save space when a new record lock should be set on a page:
-no new struct is needed, if a suitable old is found.
-@return lock or NULL */
-UNIV_INLINE
-lock_t*
-lock_rec_find_similar_on_page(
-/*==========================*/
-	ulint		type_mode,	/*!< in: lock type_mode field */
-	ulint		heap_no,	/*!< in: heap number of the record */
-	lock_t*		lock,		/*!< in: lock_rec_get_first_on_page() */
-	const trx_t*	trx);		/*!< in: transaction */
-
-/*********************************************************************//**
 Checks if a transaction has the specified table lock, or stronger. This
 function should only be called by the thread that owns the transaction.
 @return lock or NULL */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
index 8bb145e41fc..e16949a4917 100644
--- a/storage/innobase/include/lock0priv.inl
+++ b/storage/innobase/include/lock0priv.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -98,8 +98,14 @@ lock_rec_set_nth_bit(
 	byte_index = i / 8;
 	bit_index = i % 8;
 
-	((byte*) &lock[1])[byte_index] |= 1 << bit_index;
-
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+	((byte*) &lock[1])[byte_index] |= static_cast<byte>(1 << bit_index);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 	++lock->trx->lock.n_rec_locks;
 }
 
@@ -116,68 +122,6 @@ lock_rec_get_next_on_page(
 }
 
 /*********************************************************************//**
-Gets the first record lock on a page, where the page is identified by its
-file address.
-@return	first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first_on_page_addr(
-/*============================*/
-	hash_table_t*	lock_hash,	/* Lock hash table */
-	ulint		space,		/*!< in: space */
-	ulint		page_no)	/*!< in: page number */
-{
-	ut_ad(lock_mutex_own());
-
-	for (lock_t* lock = static_cast<lock_t*>(
-			HASH_GET_FIRST(lock_hash,
-				       lock_rec_hash(space, page_no)));
-	     lock != NULL;
-	     lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
-
-		if (lock->un_member.rec_lock.space == space
-		    && lock->un_member.rec_lock.page_no == page_no) {
-
-			return(lock);
-		}
-	}
-
-	return(NULL);
-}
-
-/*********************************************************************//**
-Gets the first record lock on a page, where the page is identified by a
-pointer to it.
-@return	first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first_on_page(
-/*=======================*/
-	hash_table_t*		lock_hash,	/*!< in: lock hash table */
-	const buf_block_t*	block)		/*!< in: buffer block */
-{
-	ut_ad(lock_mutex_own());
-
-	ulint	space	= block->page.id.space();
-	ulint	page_no	= block->page.id.page_no();
-	ulint	hash = buf_block_get_lock_hash_val(block);
-
-	for (lock_t* lock = static_cast<lock_t*>(
-			HASH_GET_FIRST(lock_hash, hash));
-	     lock != NULL;
-	     lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
-
-		if (lock->un_member.rec_lock.space == space
-		    && lock->un_member.rec_lock.page_no == page_no) {
-
-			return(lock);
-		}
-	}
-
-	return(NULL);
-}
-
-/*********************************************************************//**
 Gets the next explicit lock request on a record.
 @return	next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
 UNIV_INLINE
@@ -221,16 +165,11 @@ lock_rec_get_first(
 	const buf_block_t*	block,	/*!< in: block containing the record */
 	ulint			heap_no)/*!< in: heap number of the record */
 {
-	ut_ad(lock_mutex_own());
-
-	for (lock_t* lock = lock_rec_get_first_on_page(hash, block); lock;
-	     lock = lock_rec_get_next_on_page(lock)) {
-		if (lock_rec_get_nth_bit(lock, heap_no)) {
-			return(lock);
-		}
-	}
-
-	return(NULL);
+  for (lock_t *lock= lock_sys.get_first(*hash, block->page.id());
+       lock; lock= lock_rec_get_next_on_page(lock))
+    if (lock_rec_get_nth_bit(lock, heap_no))
+      return lock;
+  return nullptr;
 }
 
 /*********************************************************************//**
@@ -267,23 +206,15 @@ lock_rec_get_next_on_page_const(
 /*============================*/
 	const lock_t*	lock)	/*!< in: a record lock */
 {
-	ut_ad(lock_mutex_own());
-	ut_ad(lock_get_type_low(lock) == LOCK_REC);
-
-	ulint	space = lock->un_member.rec_lock.space;
-	ulint	page_no = lock->un_member.rec_lock.page_no;
-
-	while ((lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock)))
-	       != NULL) {
+  ut_ad(lock_mutex_own());
+  ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-		if (lock->un_member.rec_lock.space == space
-		    && lock->un_member.rec_lock.page_no == page_no) {
-
-			return(lock);
-		}
-	}
+  const page_id_t page_id(lock->un_member.rec_lock.page_id);
 
-	return(NULL);
+  while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
+    if (lock->un_member.rec_lock.page_id == page_id)
+      break;
+  return lock;
 }
 
 /*********************************************************************//**
@@ -347,37 +278,6 @@ lock_get_wait(
 }
 
 /*********************************************************************//**
-Looks for a suitable type record lock struct by the same trx on the same page.
-This can be used to save space when a new record lock should be set on a page:
-no new struct is needed, if a suitable old is found.
-@return lock or NULL */
-UNIV_INLINE
-lock_t*
-lock_rec_find_similar_on_page(
-/*==========================*/
-	ulint           type_mode,      /*!< in: lock type_mode field */
-	ulint           heap_no,        /*!< in: heap number of the record */
-	lock_t*         lock,           /*!< in: lock_rec_get_first_on_page() */
-	const trx_t*    trx)            /*!< in: transaction */
-{
-	ut_ad(lock_mutex_own());
-
-	for (/* No op */;
-	     lock != NULL;
-	     lock = lock_rec_get_next_on_page(lock)) {
-
-		if (lock->trx == trx
-		    && lock->type_mode == type_mode
-		    && lock_rec_get_n_bits(lock) > heap_no) {
-
-			return(lock);
-		}
-	}
-
-	return(NULL);
-}
-
-/*********************************************************************//**
 Checks if a transaction has the specified table lock, or stronger. This
 function should only be called by the thread that owns the transaction.
 @return lock or NULL */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
index cb04afdf9db..23307375426 100644
--- a/storage/innobase/include/lock0types.h
+++ b/storage/innobase/include/lock0types.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2019, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -25,6 +25,7 @@ Created 5/7/1996 Heikki Tuuri
 *******************************************************/
 
 #include "dict0types.h"
+#include "buf0types.h"
 #include "ut0lst.h"
 
 #ifndef lock0types_h
@@ -89,8 +90,8 @@ struct lock_table_t {
 
 /** Record lock for a page */
 struct lock_rec_t {
-	ib_uint32_t	space;		/*!< space id */
-	ib_uint32_t	page_no;	/*!< page number */
+	/** page identifier */
+	page_id_t	page_id;
 	ib_uint32_t	n_bits;		/*!< number of bits in the lock
 					bitmap; NOTE: the lock bitmap is
 					placed immediately after the
@@ -105,12 +106,12 @@ struct lock_rec_t {
 /** Print the record lock into the given output stream
 @param[in,out]	out	the output stream
 @return the given output stream. */
-inline
-std::ostream& lock_rec_t::print(std::ostream& out) const
+inline std::ostream &lock_rec_t::print(std::ostream &out) const
 {
-	out << "[lock_rec_t: space=" << space << ", page_no=" << page_no
-		<< ", n_bits=" << n_bits << "]";
-	return(out);
+  out << "[lock_rec_t: space=" << page_id.space()
+      << ", page_no=" << page_id.page_no()
+      << ", n_bits=" << n_bits << "]";
+  return out;
 }
 
 inline
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
index 8d26ccb2ba3..980a79d8f9e 100644
--- a/storage/innobase/include/log0crypt.h
+++ b/storage/innobase/include/log0crypt.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
-Copyright (C) 2014, 2018, MariaDB Corporation.
+Copyright (C) 2014, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -55,23 +55,18 @@ log_crypt_write_checkpoint_buf(
 /** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
 @param[in]	buf	checkpoint buffer
 @return	whether the operation was successful */
-UNIV_INTERN
-bool
-log_crypt_101_read_checkpoint(const byte* buf);
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf);
 
 /** Decrypt a MariaDB 10.1 redo log block.
-@param[in,out]	buf	log block
+@param[in,out]	buf		log block
+@param[in]	start_lsn	server start LSN
 @return	whether the decryption was successful */
-UNIV_INTERN
-bool
-log_crypt_101_read_block(byte* buf);
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn);
 
 /** Read the checkpoint crypto (version, msg and iv) info.
 @param[in]	buf	checkpoint buffer
 @return	whether the operation was successful */
-UNIV_INTERN
-bool
-log_crypt_read_checkpoint_buf(const byte* buf);
+bool log_crypt_read_checkpoint_buf(const byte* buf);
 
 /** log_crypt() operation code */
 enum log_crypt_t {
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 477bb0a1d05..870f5da0925 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -34,27 +34,33 @@ Created 12/9/1995 Heikki Tuuri
 #ifndef log0log_h
 #define log0log_h
 
-#include "dyn0buf.h"
-#include "sync0rw.h"
 #include "log0types.h"
-#include "os0event.h"
 #include "os0file.h"
+#include "span.h"
+#include "my_atomic_wrapper.h"
+#include <vector>
+#include <string>
 
-#ifndef UINT32_MAX
-#define UINT32_MAX             (4294967295U)
-#endif
+using st_::span;
 
-/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */
-#define SRV_N_LOG_FILES_MAX 100
+static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile";
+static const char LOG_FILE_NAME[] = "ib_logfile0";
 
-/** Magic value to use instead of log checksums when they are disabled */
-#define LOG_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+/** Composes full path for a redo log file
+@param[in]	filename	name of the redo log file
+@return path with log file name*/
+std::string get_log_file_path(const char *filename= LOG_FILE_NAME);
 
-/* Margin for the free space in the smallest log group, before a new query
-step which modifies the database, is started */
+/** Returns paths for all existing log files */
+std::vector<std::string> get_existing_log_files_paths();
 
-#define LOG_CHECKPOINT_FREE_PER_THREAD	(4U << srv_page_size_shift)
-#define LOG_CHECKPOINT_EXTRA_FREE	(8U << srv_page_size_shift)
+/** Delete the redo log file whose name is "ib_logfile" followed by suffix.
+@param[in]	suffix	suffix of the file name, appended to LOG_FILE_NAME_PREFIX */
+static inline void delete_log_file(const char* suffix)
+{
+  auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix);
+  os_file_delete_if_exists(innodb_log_file_key, path.c_str(), nullptr);
+}
 
 /** Append a string to the log.
 @param[in]	str		string
@@ -81,78 +87,11 @@ log_free_check(void);
 @param[in]	len	requested minimum size in bytes */
 void log_buffer_extend(ulong len);
 
-/** Check margin not to overwrite transaction log from the last checkpoint.
-If would estimate the log write to exceed the log_group_capacity,
-waits for the checkpoint is done enough.
-@param[in]	len	length of the data to be written */
-
-void
-log_margin_checkpoint_age(
-	ulint	len);
-
-/** Open the log for log_write_low. The log must be closed with log_close.
-@param[in]	len	length of the data to be written
-@return start lsn of the log record */
-lsn_t
-log_reserve_and_open(
-	ulint	len);
-/************************************************************//**
-Writes to the log the string given. It is assumed that the caller holds the
-log mutex. */
-void
-log_write_low(
-/*==========*/
-	const byte*	str,		/*!< in: string */
-	ulint		str_len);	/*!< in: string length */
-/************************************************************//**
-Closes the log.
-@return lsn */
-lsn_t
-log_close(void);
-/*===========*/
-/************************************************************//**
-Gets the current lsn.
-@return current lsn */
-UNIV_INLINE
-lsn_t
-log_get_lsn(void);
-/*=============*/
-/************************************************************//**
-Gets the current lsn.
-@return	current lsn */
-UNIV_INLINE
-lsn_t
-log_get_lsn_nowait(void);
-/*=============*/
-/************************************************************//**
-Gets the last lsn that is fully flushed to disk.
-@return	last flushed lsn */
-UNIV_INLINE
-ib_uint64_t
-log_get_flush_lsn(void);
-/*=============*/
-/****************************************************************
-Gets the log group capacity. It is OK to read the value without
-holding log_sys.mutex because it is constant.
-@return log group capacity */
-UNIV_INLINE
-lsn_t
-log_get_capacity(void);
-/*==================*/
-/****************************************************************
-Get log_sys::max_modified_age_async. It is OK to read the value without
-holding log_sys::mutex because it is constant.
-@return max_modified_age_async */
-UNIV_INLINE
-lsn_t
-log_get_max_modified_age_async(void);
-/*================================*/
-
 /** Calculate the recommended highest values for lsn - last_checkpoint_lsn
-and lsn - buf_get_oldest_modification().
+and lsn - buf_pool.get_oldest_modification().
 @param[in]	file_size	requested innodb_log_file_size
 @retval true on success
-@retval false if the smallest log group is too small to
+@retval false if the smallest log is too small to
 accommodate the number of OS threads in the database server */
 bool
 log_set_capacity(ulonglong file_size)
@@ -168,68 +107,33 @@ be flushed to the file system
 @param[in]	rotate_key	whether to rotate the encryption key */
 void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false);
 
-/** write to the log file up to the last log entry.
-@param[in]	sync	whether we want the written log
-also to be flushed to disk. */
+/** Write to the log file up to the last log entry.
+@param sync  whether to wait for a durable write to complete */
 void log_buffer_flush_to_disk(bool sync= true);
 
 
 /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */
-#define log_write_and_flush_prepare() log_write_mutex_enter()
+ATTRIBUTE_COLD void log_write_and_flush_prepare();
 
-/** Durably write the log up to log_sys.lsn and release log_sys.mutex. */
+/** Durably write the log up to log_sys.lsn() and release log_sys.mutex. */
 ATTRIBUTE_COLD void log_write_and_flush();
 
-/****************************************************************//**
-This functions writes the log buffer to the log file and if 'flush'
-is set it forces a flush of the log file as well. This is meant to be
-called from background master thread only as it does not wait for
-the write (+ possible flush) to finish. */
-void
-log_buffer_sync_in_background(
-/*==========================*/
-	bool	flush);	/*<! in: flush the logs to disk */
-/** Make a checkpoint. Note that this function does not flush dirty
-blocks from the buffer pool: it only checks what is lsn of the oldest
-modification in the pool, and writes information about the lsn in
-log files. Use log_make_checkpoint() to flush also the pool.
-@param[in]	sync		whether to wait for the write to complete
-@return true if success, false if a checkpoint write was already running */
-bool log_checkpoint(bool sync);
-
 /** Make a checkpoint */
-void log_make_checkpoint();
+ATTRIBUTE_COLD void log_make_checkpoint();
+
+/** Make a checkpoint at the latest lsn on shutdown. */
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
+
+/** Write checkpoint info to the log header and release log_sys.mutex.
+@param[in]	end_lsn	start LSN of the FILE_CHECKPOINT mini-transaction */
+ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn);
 
-/****************************************************************//**
-Makes a checkpoint at the latest lsn and writes it to first page of each
-data file in the database, so that we know that the file spaces contain
-all modifications up to that lsn. This can only be called at database
-shutdown. This function also writes all log in log files to the log archive. */
-void
-logs_empty_and_mark_files_at_shutdown(void);
-/*=======================================*/
-/** Read a log group header page to log_sys.checkpoint_buf.
-@param[in]	header	0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */
-void log_header_read(ulint header);
-/** Write checkpoint info to the log header and invoke log_mutex_exit().
-@param[in]	sync	whether to wait for the write to complete
-@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
-void
-log_write_checkpoint_info(bool sync, lsn_t end_lsn);
-
-/** Set extra data to be written to the redo log during checkpoint.
-@param[in]	buf	data to be appended on checkpoint, or NULL
-@return pointer to previous data to be appended on checkpoint */
-mtr_buf_t*
-log_append_on_checkpoint(
-	mtr_buf_t*	buf);
 /**
 Checks that there is enough free space in the log to start a new query step.
 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
 function may only be called if the calling thread owns no synchronization
 objects! */
-void
-log_check_margins(void);
+ATTRIBUTE_COLD void log_check_margins();
 
 /************************************************************//**
 Gets a log block flush bit.
@@ -263,14 +167,10 @@ log_block_set_data_len(
 /*===================*/
 	byte*	log_block,	/*!< in/out: log block */
 	ulint	len);		/*!< in: data length */
-
-/** Calculates the checksum for a log block using the CRC32 algorithm.
+/** Calculate the CRC-32C checksum of a log block.
 @param[in]	block	log block
 @return checksum */
-UNIV_INLINE
-ulint
-log_block_calc_checksum_crc32(
-	const byte*	block);
+inline ulint log_block_calc_checksum_crc32(const byte* block);
 
 /************************************************************//**
 Gets a log block checksum field value.
@@ -335,26 +235,12 @@ void
 log_print(
 /*======*/
 	FILE*	file);	/*!< in: file where to print */
-/******************************************************//**
-Peeks the current lsn.
-@return TRUE if success, FALSE if could not get the log system mutex */
-ibool
-log_peek_lsn(
-/*=========*/
-	lsn_t*	lsn);	/*!< out: if returns TRUE, current lsn is here */
 /**********************************************************************//**
 Refreshes the statistics used to print per-second averages. */
 void
 log_refresh_stats(void);
 /*===================*/
 
-/** Whether to require checksums on the redo log pages */
-extern my_bool	innodb_log_checksums;
-
-/* Values used as flags */
-#define LOG_FLUSH	7652559
-#define LOG_CHECKPOINT	78656949
-
 /* The counting of lsn's starts from this value: this must be non-zero */
 #define LOG_START_LSN		((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
 
@@ -391,7 +277,7 @@ extern my_bool	innodb_log_checksums;
 
 #define	LOG_BLOCK_KEY		4	/* encryption key version
 					before LOG_BLOCK_CHECKSUM;
-					in log_t::FORMAT_ENC_10_4 only */
+					after log_t::FORMAT_ENC_10_4 only */
 #define	LOG_BLOCK_CHECKSUM	4	/* 4 byte checksum of the log block
 					contents; in InnoDB versions
 					< 3.23.52 this did not contain the
@@ -429,7 +315,7 @@ because InnoDB never supported more than one copy of the redo log. */
 LOG_FILE_START_LSN started here, 4 bytes earlier than LOG_HEADER_START_LSN,
 which the LOG_FILE_START_LSN was renamed to.
 Subformat 1 is for the fully redo-logged TRUNCATE
-(no MLOG_TRUNCATE records or extra log checkpoints or log files) */
+(no MLOG_TRUNCATE records or extra log checkpoints or log file) */
 #define LOG_HEADER_SUBFORMAT	4
 /** LSN of the start of data in this log file (with format version 1;
 in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */
@@ -454,20 +340,84 @@ or the MySQL version that created the redo log file. */
 					header; we write alternately to the
 					checkpoint fields when we make new
 					checkpoints; this field is only defined
-					in the first log file of a log group */
+					in the first log file of a log */
 #define LOG_CHECKPOINT_2	(3 * OS_FILE_LOG_BLOCK_SIZE)
 					/* second checkpoint field in the log
 					header */
 #define LOG_FILE_HDR_SIZE	(4 * OS_FILE_LOG_BLOCK_SIZE)
 
-/* As long as fil_io() is used to handle log io, log group max size is limited
-by (maximum page number) * (minimum page size). Page number type is uint32_t.
-Remove this limitation if page number is no longer used for log file io. */
-static const ulonglong log_group_max_size =
-	((ulonglong(UINT32_MAX) + 1) * UNIV_PAGE_SIZE_MIN - 1);
+/** Abstract file interface: open, rename, close, read, write and flush */
+class file_io
+{
+public:
+  file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {}
+  virtual ~file_io() noexcept {};
+  virtual dberr_t open(const char *path, bool read_only) noexcept= 0;
+  virtual dberr_t rename(const char *old_path,
+                         const char *new_path) noexcept= 0;
+  virtual dberr_t close() noexcept= 0;
+  virtual dberr_t read(os_offset_t offset, span<byte> buf) noexcept= 0;
+  virtual dberr_t write(const char *path, os_offset_t offset,
+                        span<const byte> buf) noexcept= 0;
+  virtual dberr_t flush() noexcept= 0;
+
+  /** @return whether writes are durable, i.e. do not require calling flush() */
+  bool writes_are_durable() const noexcept { return m_durable_writes; }
+
+protected:
+  bool m_durable_writes; /*!< constructor argument; false unless given */
+};
+
+class file_os_io final: public file_io /* file_io over a pfs_os_file_t handle */
+{
+public:
+  file_os_io()= default;
+  file_os_io(const file_os_io &)= delete; /* copying is disabled ... */
+  file_os_io &operator=(const file_os_io &)= delete;
+  file_os_io(file_os_io &&rhs); /* ... while move operations are declared */
+  file_os_io &operator=(file_os_io &&rhs);
+  ~file_os_io() noexcept;
+
+  dberr_t open(const char *path, bool read_only) noexcept final;
+  bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; }
+  dberr_t rename(const char *old_path, const char *new_path) noexcept final;
+  dberr_t close() noexcept final;
+  dberr_t read(os_offset_t offset, span<byte> buf) noexcept final;
+  dberr_t write(const char *path, os_offset_t offset,
+                span<const byte> buf) noexcept final;
+  dberr_t flush() noexcept final;
 
-typedef ib_mutex_t	LogSysMutex;
-typedef ib_mutex_t	FlushOrderMutex;
+private:
+  pfs_os_file_t m_fd{OS_FILE_CLOSED}; /*!< handle; OS_FILE_CLOSED if not opened */
+};
+
+/** A file_io object together with the path of the file it operates on */
+class log_file_t
+{
+public:
+  log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {}
+
+  dberr_t open(bool read_only) noexcept;
+  bool is_opened() const noexcept;
+
+  const std::string &get_path() const noexcept { return m_path; }
+
+  dberr_t rename(std::string new_path) noexcept;
+  dberr_t close() noexcept;
+  dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+  bool writes_are_durable() const noexcept;
+  dberr_t write(os_offset_t offset, span<const byte> buf) noexcept;
+  dberr_t flush() noexcept;
+  void free() /* empties m_path and asks it to give up its storage */
+  {
+    m_path.clear();
+    m_path.shrink_to_fit();
+  }
+
+private:
+  std::unique_ptr<file_io> m_file; /*!< owned file_io object */
+  std::string m_path; /*!< path of the file; initialized by the constructor */
+};
 
 /** Redo log buffer */
 struct log_t{
@@ -488,48 +438,41 @@ struct log_t{
   static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31;
   /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
   static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.5 physical redo log format */
+  static constexpr uint32_t FORMAT_10_5 = 0x50485953;
+  /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED;
 
-	MY_ALIGNED(CACHE_LINE_SIZE)
-	lsn_t		lsn;		/*!< log sequence number */
-	ulong		buf_free;	/*!< first free offset within the log
-					buffer in use */
-
-	MY_ALIGNED(CACHE_LINE_SIZE)
-	LogSysMutex	mutex;		/*!< mutex protecting the log */
-	MY_ALIGNED(CACHE_LINE_SIZE)
-	LogSysMutex	write_mutex;	/*!< mutex protecting writing to log */
-	MY_ALIGNED(CACHE_LINE_SIZE)
-	FlushOrderMutex	log_flush_order_mutex;/*!< mutex to serialize access to
-					the flush list when we are putting
-					dirty blocks in the list. The idea
-					behind this mutex is to be able
-					to release log_sys.mutex during
-					mtr_commit and still ensure that
-					insertions in the flush_list happen
-					in the LSN order. */
-	/** log_buffer, append data here */
-	byte*		buf;
-	/** log_buffer, writing data to file from this buffer.
-	Before flushing write_buf is swapped with flush_buf */
-	byte*		flush_buf;
-	ulong		max_buf_free;	/*!< recommended maximum value of
-					buf_free for the buffer in use, after
-					which the buffer is flushed */
-	bool		check_flush_or_checkpoint;
-					/*!< this is set when there may
-					be need to flush the log buffer, or
-					preflush buffer pool pages, or make
-					a checkpoint; this MUST be TRUE when
-					lsn - last_checkpoint_lsn >
-					max_checkpoint_age; this flag is
-					peeked at by log_free_check(), which
-					does not reserve the log mutex */
-
-  /** Log files. Protected by mutex or write_mutex. */
-  struct files {
-    /** number of files */
-    ulint				n_files;
-    /** format of the redo log: e.g., FORMAT_10_4 */
+private:
+  /** The log sequence number of the last change of durable InnoDB files */
+  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<lsn_t> lsn;
+  /** the first guaranteed-durable log sequence number */
+  std::atomic<lsn_t> flushed_to_disk_lsn;
+  /** set when there may be need to flush the log buffer, or
+  preflush buffer pool pages, or initiate a log checkpoint.
+  This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+  std::atomic<bool> check_flush_or_checkpoint_;
+public:
+  /** mutex protecting the log */
+  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+  /** first free offset within the log buffer in use */
+  size_t buf_free;
+  /** recommended maximum size of buf, after which the buffer is flushed */
+  size_t max_buf_free;
+  /** mutex to serialize access to the flush list when we are putting
+  dirty blocks in the list. The idea behind this mutex is to be able
+  to release log_sys.mutex during mtr_commit and still ensure that
+  insertions in the flush_list happen in the LSN order. */
+  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
+  /** log_buffer, append data here */
+  byte *buf;
+  /** log_buffer, writing data to file from this buffer.
+  Before flushing write_buf is swapped with flush_buf */
+  byte *flush_buf;
+  /** Log file stuff. Protected by mutex. */
+  struct file {
+    /** format of the redo log: e.g., FORMAT_10_5 */
     uint32_t				format;
     /** redo log subformat: 0 with separately logged TRUNCATE,
     2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */
@@ -541,19 +484,47 @@ struct log_t{
     lsn_t				lsn;
     /** the byte offset of the above lsn */
     lsn_t				lsn_offset;
+    /** log file */
+    log_file_t				fd;
+
   public:
     /** used only in recovery: recovery scan succeeded up to this
     lsn in this log group */
     lsn_t				scanned_lsn;
 
+    /** opens log file which must be closed prior to this call */
+    void open_file(std::string path);
+    /** writes header */
+    void write_header_durable(lsn_t lsn);
+    /** renames the log file; forwards to log_file_t::rename() */
+    dberr_t rename(std::string path) { return fd.rename(path); }
+    /** reads buffer from log file
+    @param[in]	offset		offset in log file
+    @param[in]	buf		buffer where to read */
+    void read(os_offset_t offset, span<byte> buf);
+    /** @return whether writes are durable without calling flush() */
+    bool writes_are_durable() const noexcept;
+    /** writes buffer to log file
+    @param[in]	offset		offset in log file
+    @param[in]	buf		buffer from which to write */
+    void write(os_offset_t offset, span<byte> buf);
+    /** flushes OS page cache (excluding metadata!) for log file */
+    void flush();
+    /** closes log file */
+    void close_file();
+
     /** @return whether the redo log is encrypted */
     bool is_encrypted() const { return format & FORMAT_ENCRYPTED; }
+    /** @return whether the redo log is in the physical format */
+    bool is_physical() const
+    { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; }
     /** @return capacity in bytes */
-    lsn_t capacity() const{ return (file_size - LOG_FILE_HDR_SIZE) * n_files; }
+    lsn_t capacity() const{ return file_size - LOG_FILE_HDR_SIZE; }
     /** Calculate the offset of a log sequence number.
     @param[in]	lsn	log sequence number
     @return offset within the log */
     inline lsn_t calc_lsn_offset(lsn_t lsn) const;
+    inline lsn_t calc_lsn_offset_old(lsn_t lsn) const;
 
     /** Set the field values to correspond to a given lsn. */
     void set_fields(lsn_t lsn)
@@ -570,15 +541,11 @@ struct log_t{
     @return	whether no invalid blocks (e.g checksum mismatch) were found */
     bool read_log_seg(lsn_t* start_lsn, lsn_t end_lsn);
 
-    /** Initialize the redo log buffer.
-    @param[in]	n_files		number of files */
-    void create(ulint n_files);
+    /** Initialize the redo log buffer. */
+    void create();
 
     /** Close the redo log buffer. */
-    void close()
-    {
-      n_files = 0;
-    }
+    void close() { close_file(); }
     void set_lsn(lsn_t a_lsn);
     lsn_t get_lsn() const { return lsn; }
     void set_lsn_offset(lsn_t a_lsn);
@@ -587,7 +554,7 @@ struct log_t{
 
 	/** The fields involved in the log buffer flush @{ */
 
-	ulong		buf_next_to_write;/*!< first offset in the log buffer
+	size_t		buf_next_to_write;/*!< first offset in the log buffer
 					where the byte content may not exist
 					written to file, e.g., the start
 					offset of a log record catenated
@@ -597,16 +564,9 @@ struct log_t{
 	lsn_t		write_lsn;	/*!< last written lsn */
 	lsn_t		current_flush_lsn;/*!< end lsn for the current running
 					write + flush operation */
-	lsn_t		flushed_to_disk_lsn;
-					/*!< how far we have written the log
-					AND flushed to disk */
-	ulint		n_pending_flushes;/*!< number of currently
-					pending flushes; protected by
-					log_sys.mutex */
-	os_event_t	flush_event;	/*!< this event is in the reset state
-					when a flush is running;
-					os_event_set() and os_event_reset()
-					are protected by log_sys.mutex */
+	std::atomic<size_t> pending_flushes; /*!< system calls in progress */
+	std::atomic<size_t> flushes;	/*!< system calls counter */
+
 	ulint		n_log_ios;	/*!< number of log i/os initiated thus
 					far */
 	ulint		n_log_ios_old;	/*!< number of log i/o's at the
@@ -616,7 +576,7 @@ struct log_t{
 	/* @} */
 
 	/** Fields involved in checkpoints @{ */
-	lsn_t		log_group_capacity; /*!< capacity of the log group; if
+	lsn_t		log_capacity;	/*!< capacity of the log; if
 					the checkpoint age exceeds this, it is
 					a serious error because it is possible
 					we will then overwrite log and spoil
@@ -624,48 +584,24 @@ struct log_t{
 	lsn_t		max_modified_age_async;
 					/*!< when this recommended
 					value for lsn -
-					buf_pool_get_oldest_modification()
+					buf_pool.get_oldest_modification()
 					is exceeded, we start an
 					asynchronous preflush of pool pages */
-	lsn_t		max_modified_age_sync;
-					/*!< when this recommended
-					value for lsn -
-					buf_pool_get_oldest_modification()
-					is exceeded, we start a
-					synchronous preflush of pool pages */
-	lsn_t		max_checkpoint_age_async;
-					/*!< when this checkpoint age
-					is exceeded we start an
-					asynchronous writing of a new
-					checkpoint */
 	lsn_t		max_checkpoint_age;
 					/*!< this is the maximum allowed value
 					for lsn - last_checkpoint_lsn when a
 					new query step is started */
 	ib_uint64_t	next_checkpoint_no;
 					/*!< next checkpoint number */
-	lsn_t		last_checkpoint_lsn;
-					/*!< latest checkpoint lsn */
-	lsn_t		next_checkpoint_lsn;
-					/*!< next checkpoint lsn */
-	mtr_buf_t*	append_on_checkpoint;
-					/*!< extra redo log records to write
-					during a checkpoint, or NULL if none.
-					The pointer is protected by
-					log_sys.mutex, and the data must
-					remain constant as long as this
-					pointer is not NULL. */
-	ulint		n_pending_checkpoint_writes;
-					/*!< number of currently pending
-					checkpoint writes */
-	rw_lock_t	checkpoint_lock;/*!< this latch is x-locked when a
-					checkpoint write is running; a thread
-					should wait for this without owning
-					the log mutex */
-
-	/** buffer for checkpoint header */
-	MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
-	byte		checkpoint_buf[OS_FILE_LOG_BLOCK_SIZE];
+  /** latest completed checkpoint (protected by log_sys.mutex) */
+  Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+  /** next checkpoint LSN (protected by log_sys.mutex) */
+  lsn_t next_checkpoint_lsn;
+  /** whether a checkpoint is pending */
+  Atomic_relaxed<bool> checkpoint_pending;
+
+  /** buffer for checkpoint header */
+  byte *checkpoint_buf;
 	/* @} */
 
 private:
@@ -681,23 +617,43 @@ public:
 
   /** @return whether the redo log is encrypted */
   bool is_encrypted() const { return(log.is_encrypted()); }
+  /** @return whether the redo log is in the physical format */
+  bool is_physical() const { return log.is_physical(); }
 
   bool is_initialised() const { return m_initialised; }
 
-  /** Complete an asynchronous checkpoint write. */
-  void complete_checkpoint();
+  lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
+  { return lsn.load(order); }
+  void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); }
+
+  lsn_t get_flushed_lsn() const
+  { return flushed_to_disk_lsn.load(std::memory_order_acquire); }
+  void set_flushed_lsn(lsn_t lsn)
+  { flushed_to_disk_lsn.store(lsn, std::memory_order_release); }
+
+  bool check_flush_or_checkpoint() const
+  {
+    return UNIV_UNLIKELY
+      (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+  }
+  void set_check_flush_or_checkpoint(bool flag= true)
+  { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
+
+  bool has_encryption_key_rotation() const {
+    return log.format == FORMAT_ENC_10_4 || log.format == FORMAT_ENC_10_5;
+  }
 
   /** @return the log block header + trailer size */
   unsigned framing_size() const
   {
-    return log.format == FORMAT_ENC_10_4
+    return has_encryption_key_rotation()
       ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM
       : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM;
   }
   /** @return the log block payload size */
   unsigned payload_size() const
   {
-    return log.format == FORMAT_ENC_10_4
+    return has_encryption_key_rotation()
       ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM -
       LOG_BLOCK_KEY
       : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM;
@@ -705,11 +661,21 @@ public:
   /** @return the log block trailer offset */
   unsigned trailer_offset() const
   {
-    return log.format == FORMAT_ENC_10_4
+    return has_encryption_key_rotation()
       ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY
       : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM;
   }
 
+  size_t get_pending_flushes() const
+  {
+    return pending_flushes.load(std::memory_order_relaxed);
+  }
+
+  size_t get_flushes() const
+  {
+    return flushes.load(std::memory_order_relaxed);
+  }
+
   /** Initialise the redo log subsystem. */
   void create();
 
@@ -719,90 +685,50 @@ public:
 
 /** Redo log system */
 extern log_t	log_sys;
+#ifdef UNIV_DEBUG
+extern bool log_write_lock_own();
+#endif
 
 /** Calculate the offset of a log sequence number.
 @param[in]     lsn     log sequence number
 @return offset within the log */
-inline lsn_t log_t::files::calc_lsn_offset(lsn_t lsn) const
+inline lsn_t log_t::file::calc_lsn_offset(lsn_t lsn) const
 {
   ut_ad(this == &log_sys.log);
   /* The lsn parameters are updated while holding both the mutexes
   and it is ok to have either of them while reading */
-  ut_ad(log_sys.mutex.is_owned() || log_sys.write_mutex.is_owned());
-  const lsn_t group_size= capacity();
+#ifdef SAFE_MUTEX
+  ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own());
+#endif /* SAFE_MUTEX */
+  const lsn_t size = capacity();
   lsn_t l= lsn - this->lsn;
   if (longlong(l) < 0) {
-    l= lsn_t(-longlong(l)) % group_size;
-    l= group_size - l;
+	  l = lsn_t(-longlong(l)) % size;
+	  l = size - l;
   }
 
   l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size);
-  l%= group_size;
+  l %= size;
   return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE));
 }
 
-inline void log_t::files::set_lsn(lsn_t a_lsn) {
-      ut_ad(log_sys.mutex.is_owned() || log_sys.write_mutex.is_owned());
-      lsn = a_lsn;
+inline void log_t::file::set_lsn(lsn_t a_lsn)
+{
+#ifdef SAFE_MUTEX
+  ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own());
+#endif /* SAFE_MUTEX */
+  lsn= a_lsn;
 }
 
-inline void log_t::files::set_lsn_offset(lsn_t a_lsn) {
-      ut_ad(log_sys.mutex.is_owned() || log_sys.write_mutex.is_owned());
-      ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == (a_lsn % OS_FILE_LOG_BLOCK_SIZE));
-      lsn_offset = a_lsn;
+inline void log_t::file::set_lsn_offset(lsn_t a_lsn)
+{
+#ifdef SAFE_MUTEX
+  ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own());
+#endif /* SAFE_MUTEX */
+  ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == (a_lsn % OS_FILE_LOG_BLOCK_SIZE));
+  lsn_offset= a_lsn;
 }
 
-/** Test if flush order mutex is owned. */
-#define log_flush_order_mutex_own()			\
-	mutex_own(&log_sys.log_flush_order_mutex)
-
-/** Acquire the flush order mutex. */
-#define log_flush_order_mutex_enter() do {		\
-	mutex_enter(&log_sys.log_flush_order_mutex);	\
-} while (0)
-/** Release the flush order mutex. */
-# define log_flush_order_mutex_exit() do {		\
-	mutex_exit(&log_sys.log_flush_order_mutex);	\
-} while (0)
-
-/** Test if log sys mutex is owned. */
-#define log_mutex_own() mutex_own(&log_sys.mutex)
-
-/** Test if log sys write mutex is owned. */
-#define log_write_mutex_own() mutex_own(&log_sys.write_mutex)
-
-/** Acquire the log sys mutex. */
-#define log_mutex_enter() mutex_enter(&log_sys.mutex)
-
-/** Acquire the log sys write mutex. */
-#define log_write_mutex_enter() mutex_enter(&log_sys.write_mutex)
-
-/** Acquire all the log sys mutexes. */
-#define log_mutex_enter_all() do {		\
-	mutex_enter(&log_sys.write_mutex);	\
-	mutex_enter(&log_sys.mutex);		\
-} while (0)
-
-/** Release the log sys mutex. */
-#define log_mutex_exit() mutex_exit(&log_sys.mutex)
-
-/** Release the log sys write mutex.*/
-#define log_write_mutex_exit() mutex_exit(&log_sys.write_mutex)
-
-/** Release all the log sys mutexes. */
-#define log_mutex_exit_all() do {		\
-	mutex_exit(&log_sys.mutex);		\
-	mutex_exit(&log_sys.write_mutex);	\
-} while (0)
-
-/* log scrubbing speed, in bytes/sec */
-extern ulonglong innodb_scrub_log_speed;
-
-/** Event to wake up log_scrub_thread */
-extern os_event_t	log_scrub_event;
-/** Whether log_scrub_thread is active */
-extern bool		log_scrub_thread_active;
-
 #include "log0log.inl"
 
 #endif
diff --git a/storage/innobase/include/log0log.inl b/storage/innobase/include/log0log.inl
index 8dfd86d3078..d503e3ffec9 100644
--- a/storage/innobase/include/log0log.inl
+++ b/storage/innobase/include/log0log.inl
@@ -25,12 +25,9 @@ Created 12/9/1995 Heikki Tuuri
 *******************************************************/
 
 #include "mach0data.h"
-#include "srv0mon.h"
+#include "assume_aligned.h"
 #include "ut0crc32.h"
 
-#ifdef UNIV_LOG_LSN_DEBUG
-#include "mtr0types.h"
-#endif /* UNIV_LOG_LSN_DEBUG */
 extern ulong srv_log_buffer_size;
 
 /************************************************************//**
@@ -42,13 +39,10 @@ log_block_get_flush_bit(
 /*====================*/
 	const byte*	log_block)	/*!< in: log block */
 {
-	if (LOG_BLOCK_FLUSH_BIT_MASK
-	    & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) {
+  static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+  static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility");
 
-		return(TRUE);
-	}
-
-	return(FALSE);
+  return *log_block & 0x80;
 }
 
 /************************************************************//**
@@ -60,17 +54,13 @@ log_block_set_flush_bit(
 	byte*	log_block,	/*!< in/out: log block */
 	ibool	val)		/*!< in: value to set */
 {
-	ulint	field;
-
-	field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO);
-
-	if (val) {
-		field = field | LOG_BLOCK_FLUSH_BIT_MASK;
-	} else {
-		field = field & ~LOG_BLOCK_FLUSH_BIT_MASK;
-	}
+  static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+  static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility");
 
-	mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field);
+  if (val)
+    *log_block|= 0x80;
+  else
+    *log_block&= 0x7f;
 }
 
 /************************************************************//**
@@ -82,8 +72,9 @@ log_block_get_hdr_no(
 /*=================*/
 	const byte*	log_block)	/*!< in: log block */
 {
-	return(~LOG_BLOCK_FLUSH_BIT_MASK
-	       & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO));
+  static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+  return mach_read_from_4(my_assume_aligned<4>(log_block)) &
+    ~LOG_BLOCK_FLUSH_BIT_MASK;
 }
 
 /************************************************************//**
@@ -97,10 +88,11 @@ log_block_set_hdr_no(
 	ulint	n)		/*!< in: log block number: must be > 0 and
 				< LOG_BLOCK_FLUSH_BIT_MASK */
 {
-	ut_ad(n > 0);
-	ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+  static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+  ut_ad(n > 0);
+  ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
 
-	mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n);
+  mach_write_to_4(my_assume_aligned<4>(log_block), n);
 }
 
 /************************************************************//**
@@ -112,7 +104,8 @@ log_block_get_data_len(
 /*===================*/
 	const byte*	log_block)	/*!< in: log block */
 {
-	return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN));
+  return mach_read_from_2(my_assume_aligned<2>
+                          (log_block + LOG_BLOCK_HDR_DATA_LEN));
 }
 
 /************************************************************//**
@@ -124,7 +117,8 @@ log_block_set_data_len(
 	byte*	log_block,	/*!< in/out: log block */
 	ulint	len)		/*!< in: data length */
 {
-	mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len);
+  mach_write_to_2(my_assume_aligned<2>(log_block + LOG_BLOCK_HDR_DATA_LEN),
+                  len);
 }
 
 /************************************************************//**
@@ -137,7 +131,8 @@ log_block_get_first_rec_group(
 /*==========================*/
 	const byte*	log_block)	/*!< in: log block */
 {
-	return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP));
+  return mach_read_from_2(my_assume_aligned<2>
+                          (log_block + LOG_BLOCK_FIRST_REC_GROUP));
 }
 
 /************************************************************//**
@@ -149,7 +144,8 @@ log_block_set_first_rec_group(
 	byte*	log_block,	/*!< in/out: log block */
 	ulint	offset)		/*!< in: offset, 0 if none */
 {
-	mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset);
+  mach_write_to_2(my_assume_aligned<2>
+                  (log_block + LOG_BLOCK_FIRST_REC_GROUP), offset);
 }
 
 /************************************************************//**
@@ -161,7 +157,8 @@ log_block_get_checkpoint_no(
 /*========================*/
 	const byte*	log_block)	/*!< in: log block */
 {
-	return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO));
+  return mach_read_from_4(my_assume_aligned<4>
+                          (log_block + LOG_BLOCK_CHECKPOINT_NO));
 }
 
 /************************************************************//**
@@ -173,7 +170,8 @@ log_block_set_checkpoint_no(
 	byte*		log_block,	/*!< in/out: log block */
 	ib_uint64_t	no)		/*!< in: checkpoint no */
 {
-	mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no);
+  mach_write_to_4(my_assume_aligned<4>(log_block + LOG_BLOCK_CHECKPOINT_NO),
+                  static_cast<uint32_t>(no));
 }
 
 /************************************************************//**
@@ -190,42 +188,10 @@ log_block_convert_lsn_to_no(
 			0xFUL, 0x3FFFFFFFUL)) + 1);
 }
 
-/** Calculate the checksum for a log block using the pre-5.7.9 algorithm.
-@param[in]	block	log block
-@return		checksum */
-UNIV_INLINE
-ulint
-log_block_calc_checksum_format_0(
-	const byte*	block)
-{
-	ulint	sum;
-	ulint	sh;
-	ulint	i;
-
-	sum = 1;
-	sh = 0;
-
-	for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; i++) {
-		ulint	b = (ulint) block[i];
-		sum &= 0x7FFFFFFFUL;
-		sum += b;
-		sum += b << sh;
-		sh++;
-		if (sh > 24) {
-			sh = 0;
-		}
-	}
-
-	return(sum);
-}
-
-/** Calculate the checksum for a log block using the MySQL 5.7 algorithm.
+/** Calculate the CRC-32C checksum of a log block.
 @param[in]	block	log block
 @return checksum */
-UNIV_INLINE
-ulint
-log_block_calc_checksum_crc32(
-	const byte*	block)
+inline ulint log_block_calc_checksum_crc32(const byte* block)
 {
 	return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM);
 }
@@ -239,8 +205,9 @@ log_block_get_checksum(
 /*===================*/
 	const byte*	log_block)	/*!< in: log block */
 {
-	return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
-				- LOG_BLOCK_CHECKSUM));
+  return mach_read_from_4(my_assume_aligned<4>
+                          (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM +
+                           log_block));
 }
 
 /************************************************************//**
@@ -252,9 +219,9 @@ log_block_set_checksum(
 	byte*	log_block,	/*!< in/out: log block */
 	ulint	checksum)	/*!< in: checksum */
 {
-	mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
-			- LOG_BLOCK_CHECKSUM,
-			checksum);
+  mach_write_to_4(my_assume_aligned<4>
+                  (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM +
+                   log_block), checksum);
 }
 
 /************************************************************//**
@@ -288,35 +255,10 @@ log_reserve_and_write_fast(
 	ulint		len,
 	lsn_t*		start_lsn)
 {
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	ut_ad(len > 0);
 
-#ifdef UNIV_LOG_LSN_DEBUG
-	/* Append a MLOG_LSN record after mtr_commit(), except when
-	the last bytes could be a MLOG_CHECKPOINT marker. We have special
-	handling when the log consists of only a single MLOG_CHECKPOINT
-	record since the latest checkpoint, and appending the
-	MLOG_LSN would ruin that.
-
-	Note that a longer redo log record could happen to end in what
-	looks like MLOG_CHECKPOINT, and we could be omitting MLOG_LSN
-	without reason. This is OK, because writing the MLOG_LSN is
-	just a 'best effort', aimed at finding log corruption due to
-	bugs in the redo log writing logic. */
-	const ulint	lsn_len
-		= len >= SIZE_OF_MLOG_CHECKPOINT
-		&& MLOG_CHECKPOINT == static_cast<const char*>(str)[
-			len - SIZE_OF_MLOG_CHECKPOINT]
-		? 0
-		: 1
-		+ mach_get_compressed_size(log_sys.lsn >> 32)
-		+ mach_get_compressed_size(log_sys.lsn & 0xFFFFFFFFUL);
-#endif /* UNIV_LOG_LSN_DEBUG */
-
 	const ulint	data_len = len
-#ifdef UNIV_LOG_LSN_DEBUG
-		+ lsn_len
-#endif /* UNIV_LOG_LSN_DEBUG */
 		+ log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
 
 	if (data_len >= log_sys.trailer_offset()) {
@@ -327,26 +269,9 @@ log_reserve_and_write_fast(
 		return(0);
 	}
 
-	*start_lsn = log_sys.lsn;
+	lsn_t lsn = log_sys.get_lsn();
+	*start_lsn = lsn;
 
-#ifdef UNIV_LOG_LSN_DEBUG
-	if (lsn_len) {
-		/* Write the LSN pseudo-record. */
-		byte* b = &log_sys.buf[log_sys.buf_free];
-
-		*b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str);
-
-		/* Write the LSN in two parts,
-		as a pseudo page number and space id. */
-		b += mach_write_compressed(b, log_sys.lsn >> 32);
-		b += mach_write_compressed(b, log_sys.lsn & 0xFFFFFFFFUL);
-		ut_a(b - lsn_len == &log_sys.buf[log_sys.buf_free]);
-
-		::memcpy(b, str, len);
-
-		len += lsn_len;
-	} else
-#endif /* UNIV_LOG_LSN_DEBUG */
 	memcpy(log_sys.buf + log_sys.buf_free, str, len);
 
 	log_block_set_data_len(
@@ -355,97 +280,14 @@ log_reserve_and_write_fast(
                         OS_FILE_LOG_BLOCK_SIZE)),
                 data_len);
 
-	log_sys.buf_free += ulong(len);
-
-	ut_ad(log_sys.buf_free <= srv_log_buffer_size);
-
-	log_sys.lsn += len;
-
-	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
-		    log_sys.lsn - log_sys.last_checkpoint_lsn);
-
-	return(log_sys.lsn);
-}
-
-/************************************************************//**
-Gets the current lsn.
-@return current lsn */
-UNIV_INLINE
-lsn_t
-log_get_lsn(void)
-/*=============*/
-{
-	lsn_t	lsn;
-
-	log_mutex_enter();
+	log_sys.buf_free += len;
 
-	lsn = log_sys.lsn;
+	ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size});
 
-	log_mutex_exit();
+	lsn += len;
+	log_sys.set_lsn(lsn);
 
-	return(lsn);
-}
-
-/************************************************************//**
-Gets the last lsn that is fully flushed to disk.
-@return	last flushed lsn */
-UNIV_INLINE
-ib_uint64_t
-log_get_flush_lsn(void)
-{
-	ib_uint64_t	lsn;
-
-	log_mutex_enter();
-
-	lsn = log_sys.flushed_to_disk_lsn;
-
-	log_mutex_exit();
-
-	return(lsn);
-}
-
-/************************************************************//**
-Gets the current lsn with a trylock
-@return	current lsn or 0 if false*/
-UNIV_INLINE
-lsn_t
-log_get_lsn_nowait(void)
-/*====================*/
-{
-	lsn_t	lsn=0;
-
-	if (!mutex_enter_nowait(&(log_sys.mutex))) {
-
-		lsn = log_sys.lsn;
-
-		mutex_exit(&(log_sys.mutex));
-	}
-
-	return(lsn);
-}
-
-/****************************************************************
-Gets the log group capacity. It is OK to read the value without
-holding log_sys.mutex because it is constant.
-@return log group capacity */
-UNIV_INLINE
-lsn_t
-log_get_capacity(void)
-/*==================*/
-{
-	return(log_sys.log_group_capacity);
-}
-
-/****************************************************************
-Get log_sys::max_modified_age_async. It is OK to read the value without
-holding log_sys::mutex because it is constant.
-@return max_modified_age_async */
-UNIV_INLINE
-lsn_t
-log_get_max_modified_age_async(void)
-/*================================*/
-{
-	return(log_sys.max_modified_age_async);
+	return lsn;
 }
 
 /***********************************************************************//**
@@ -477,7 +319,7 @@ log_free_check(void)
 		      sync_allowed_latches(latches,
 					   latches + UT_ARR_SIZE(latches))));
 
-	if (log_sys.check_flush_or_checkpoint) {
+	if (log_sys.check_flush_or_checkpoint()) {
 
 		log_check_margins();
 	}
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index 716bc12b5d1..9159ba00859 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -24,22 +24,17 @@ Recovery
 Created 9/20/1997 Heikki Tuuri
 *******************************************************/
 
-#ifndef log0recv_h
-#define log0recv_h
+#pragma once
 
 #include "ut0byte.h"
 #include "buf0types.h"
-#include "hash0hash.h"
 #include "log0log.h"
 #include "mtr0types.h"
 
 #include <deque>
 
-/** Is recv_writer_thread active? */
-extern bool	recv_writer_thread_active;
-
 /** @return whether recovery is currently running. */
-#define recv_recovery_is_on() UNIV_UNLIKELY(recv_recovery_on)
+#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
 
 /** Find the latest checkpoint in the log header.
 @param[out]	max_field	LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
@@ -48,45 +43,21 @@ dberr_t
 recv_find_max_checkpoint(ulint* max_field)
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
 
-/** Reduces recv_sys.n_addrs for the corrupted page.
-This function should called when srv_force_recovery > 0.
-@param[in]	page_id page id of the corrupted page */
-void recv_recover_corrupt_page(page_id_t page_id);
-
 /** Apply any buffered redo log to a page that was just read from a data file.
+@param[in,out]	space	tablespace
 @param[in,out]	bpage	buffer pool page */
-ATTRIBUTE_COLD void recv_recover_page(buf_page_t* bpage);
+ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
+	MY_ATTRIBUTE((nonnull));
 
 /** Start recovering from a redo log checkpoint.
-@see recv_recovery_from_checkpoint_finish
 @param[in]	flush_lsn	FIL_PAGE_FILE_FLUSH_LSN
 of first system tablespace page
 @return error code or DB_SUCCESS */
 dberr_t
 recv_recovery_from_checkpoint_start(
 	lsn_t	flush_lsn);
-/** Complete recovery from a checkpoint. */
-void
-recv_recovery_from_checkpoint_finish(void);
-/********************************************************//**
-Initiates the rollback of active transactions. */
-void
-recv_recovery_rollback_active(void);
-/*===============================*/
-
-/********************************************************//**
-Reset the state of the recovery system variables. */
-void
-recv_sys_var_init(void);
-/*===================*/
-
-/** Apply the hash table of stored log records to persistent data pages.
-@param[in]	last_batch	whether the change buffer merge will be
-				performed as part of the operation */
-void
-recv_apply_hashed_log_recs(bool last_batch);
-
-/** Whether to store redo log records to the hash table */
+
+/** Whether to store redo log records in recv_sys.pages */
 enum store_t {
 	/** Do not store redo log records. */
 	STORE_NO,
@@ -105,64 +76,32 @@ recv_sys.parse_start_lsn is non-zero.
 @return true if more data added */
 bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn);
 
-/** Parse log records from a buffer and optionally store them to a
-hash table to wait merging to file pages.
-@param[in]	checkpoint_lsn		the LSN of the latest checkpoint
-@param[in]	store			whether to store page operations
-@param[in]	available_memory	memory to read the redo logs
-@param[in]	apply			whether to apply the records
-@return whether MLOG_CHECKPOINT record was seen the first time,
-or corruption was noticed */
-bool recv_parse_log_recs(
-	lsn_t		checkpoint_lsn,
-	store_t*	store,
-	ulint		available_memory,
-	bool		apply);
-
 /** Moves the parsing buffer data left to the buffer start */
 void recv_sys_justify_left_parsing_buf();
 
-/** Report optimized DDL operation (without redo log),
-corresponding to MLOG_INDEX_LOAD.
-@param[in]	space_id	tablespace identifier
-*/
-extern void (*log_optimized_ddl_op)(ulint space_id);
-
 /** Report an operation to create, delete, or rename a file during backup.
 @param[in]	space_id	tablespace identifier
-@param[in]	flags		tablespace flags (NULL if not create)
+@param[in]	create		whether the file is being created
 @param[in]	name		file name (not NUL-terminated)
 @param[in]	len		length of name, in bytes
 @param[in]	new_name	new file name (NULL if not rename)
 @param[in]	new_len		length of new_name, in bytes (0 if NULL) */
-extern void (*log_file_op)(ulint space_id, const byte* flags,
+extern void (*log_file_op)(ulint space_id, bool create,
 			   const byte* name, ulint len,
 			   const byte* new_name, ulint new_len);
 
-/** Block of log record data */
-struct recv_data_t{
-	recv_data_t*	next;	/*!< pointer to the next block or NULL */
-				/*!< the log record data is stored physically
-				immediately after this struct, max amount
-				RECV_DATA_BLOCK_SIZE bytes of it */
-};
-
-/** Stored log record struct */
-struct recv_t{
-	mlog_id_t	type;	/*!< log record type */
-	ulint		len;	/*!< log record body length in bytes */
-	recv_data_t*	data;	/*!< chain of blocks containing the log record
-				body */
-	lsn_t		start_lsn;/*!< start lsn of the log segment written by
-				the mtr which generated this log record: NOTE
-				that this is not necessarily the start lsn of
-				this log record */
-	lsn_t		end_lsn;/*!< end lsn of the log segment written by
-				the mtr which generated this log record: NOTE
-				that this is not necessarily the end lsn of
-				this log record */
-	UT_LIST_NODE_T(recv_t)
-			rec_list;/*!< list of log records for this page */
+/** Stored redo log record */
+struct log_rec_t
+{
+  log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); }
+  log_rec_t()= delete;
+  log_rec_t(const log_rec_t&)= delete;
+  log_rec_t &operator=(const log_rec_t&)= delete;
+
+  /** next record */
+  log_rec_t *next;
+  /** mtr_t::commit_lsn() of the mini-transaction */
+  const lsn_t lsn;
 };
 
 struct recv_dblwr_t
@@ -196,33 +135,93 @@ struct recv_dblwr_t
   list pages;
 };
 
+/** the recovery state and buffered records for a page */
+struct page_recv_t
+{
+  /** Recovery state; protected by recv_sys.mutex */
+  enum
+  {
+    /** not yet processed */
+    RECV_NOT_PROCESSED,
+    /** not processed; the page will be reinitialized */
+    RECV_WILL_NOT_READ,
+    /** page is being read */
+    RECV_BEING_READ,
+    /** log records are being applied on the page */
+    RECV_BEING_PROCESSED
+  } state= RECV_NOT_PROCESSED;
+  /** Latest written byte offset when applying the log records.
+  @see mtr_t::m_last_offset */
+  uint16_t last_offset= 1;
+  /** log records for a page */
+  class recs_t
+  {
+    /** The first log record */
+    log_rec_t *head= nullptr;
+    /** The last log record */
+    log_rec_t *tail= nullptr;
+    friend struct page_recv_t;
+  public:
+    /** Append a redo log snippet for the page
+    @param recs log snippet */
+    void append(log_rec_t* recs)
+    {
+      if (tail)
+        tail->next= recs;
+      else
+        head= recs;
+      tail= recs;
+    }
+
+    /** @return the last log snippet */
+    const log_rec_t* last() const { return tail; }
+    /** @return the last log snippet */
+    log_rec_t* last() { return tail; }
+
+    class iterator
+    {
+      log_rec_t *cur;
+    public:
+      iterator(log_rec_t* rec) : cur(rec) {}
+      log_rec_t* operator*() const { return cur; }
+      iterator &operator++() { cur= cur->next; return *this; }
+      bool operator!=(const iterator& i) const { return cur != i.cur; }
+    };
+    iterator begin() { return head; }
+    iterator end() { return NULL; }
+    bool empty() const { ut_ad(!head == !tail); return !head; }
+    /** Clear and free the records; @see recv_sys_t::alloc() */
+    inline void clear();
+  } log;
+
+  /** Trim old log records for a page.
+  @param start_lsn oldest log sequence number to preserve
+  @return whether all the log for the page was trimmed */
+  inline bool trim(lsn_t start_lsn);
+  /** Ignore any earlier redo log records for this page. */
+  inline void will_not_read();
+  /** @return whether the log records for the page are being processed */
+  bool is_being_processed() const { return state == RECV_BEING_PROCESSED; }
+};
+
 /** Recovery system data structure */
-struct recv_sys_t{
-	ib_mutex_t		mutex;	/*!< mutex protecting the fields apply_log_recs,
-				n_addrs, and the state field in each recv_addr
-				struct */
-	ib_mutex_t		writer_mutex;/*!< mutex coordinating
-				flushing between recv_writer_thread and
-				the recovery thread. */
-	os_event_t		flush_start;/*!< event to activate
-				page cleaner threads */
-	os_event_t		flush_end;/*!< event to signal that the page
-				cleaner has finished the request */
-	buf_flush_t		flush_type;/*!< type of the flush request.
-				BUF_FLUSH_LRU: flush end of LRU, keeping free blocks.
-				BUF_FLUSH_LIST: flush all of blocks. */
-	/** whether recv_recover_page(), invoked from buf_page_io_complete(),
-	should apply log records*/
-	bool		apply_log_recs;
-	/** whether recv_apply_hashed_log_recs() is running */
-	bool		apply_batch_on;
+struct recv_sys_t
+{
+  /** mutex protecting apply_log_recs and page_recv_t::state */
+  ib_mutex_t mutex;
+  /** whether we are applying redo log records during crash recovery */
+  bool recovery_on;
+  /** whether recv_recover_page(), invoked from buf_page_read_complete(),
+  should apply log records*/
+  bool apply_log_recs;
+  /** whether apply() is running */
+  bool apply_batch_on;
 	byte*		buf;	/*!< buffer for parsing log records */
-	size_t		buf_size;	/*!< size of buf */
 	ulint		len;	/*!< amount of data in buf */
 	lsn_t		parse_start_lsn;
 				/*!< this is the lsn from which we were able to
 				start parsing log records and adding them to
-				the hash table; zero if a suitable
+				pages; zero if a suitable
 				start point not found yet */
 	lsn_t		scanned_lsn;
 				/*!< the log data has been scanned up to this
@@ -245,81 +244,159 @@ struct recv_sys_t{
 				the file system contents is detected
 				during log scan or apply */
 	lsn_t		mlog_checkpoint_lsn;
-				/*!< the LSN of a MLOG_CHECKPOINT
+				/*!< the LSN of a FILE_CHECKPOINT
 				record, or 0 if none was parsed */
 	/** the time when progress was last reported */
 	time_t		progress_time;
-	mem_heap_t*	heap;	/*!< memory heap of log records and file
-				addresses*/
-	hash_table_t*	addr_hash;/*!< hash table of file addresses of pages */
-	ulint		n_addrs;/*!< number of not processed hashed file
-				addresses in the hash table */
-
-	/** Undo tablespaces for which truncate has been logged
-	(indexed by id - srv_undo_space_id_start) */
-	struct trunc {
-		/** log sequence number of MLOG_FILE_CREATE2, or 0 if none */
-		lsn_t		lsn;
-		/** truncated size of the tablespace, or 0 if not truncated */
-		unsigned	pages;
-	} truncated_undo_spaces[127];
-
-	recv_dblwr_t	dblwr;
-
-	/** Lastly added LSN to the hash table of log records. */
-	lsn_t		last_stored_lsn;
-
-	/** Initialize the redo log recovery subsystem. */
-	void create();
-
-	/** Free most recovery data structures. */
-	void debug_free();
-
-	/** Clean up after create() */
-	void close();
-
-	bool is_initialised() const { return buf_size != 0; }
-
-	/** Store a redo log record for applying.
-	@param type	record type
-	@param space	tablespace identifier
-	@param page_no	page number
-	@param body	record body
-	@param rec_end	end of record
-	@param lsn	start LSN of the mini-transaction
-	@param end_lsn	end LSN of the mini-transaction */
-	inline void add(mlog_id_t type, ulint space, ulint page_no,
-			byte* body, byte* rec_end, lsn_t lsn,
-			lsn_t end_lsn);
-
-	/** Empty a fully processed set of stored redo log records. */
-	inline void empty();
-
-	/** Determine whether redo log recovery progress should be reported.
-	@param[in]	time	the current time
-	@return	whether progress should be reported
-		(the last report was at least 15 seconds ago) */
-	bool report(time_t time)
-	{
-		if (time - progress_time < 15) {
-			return false;
-		}
-
-		progress_time = time;
-		return true;
-	}
+
+  using map = std::map<const page_id_t, page_recv_t,
+                       std::less<const page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
+  /** buffered records waiting to be applied to pages */
+  map pages;
+
+private:
+  /** Process a record that indicates that a tablespace size is being shrunk.
+  @param page_id first page that is not in the file
+  @param lsn     log sequence number of the shrink operation */
+  inline void trim(const page_id_t page_id, lsn_t lsn);
+
+  /** Undo tablespaces for which truncate has been logged
+  (indexed by page_id_t::space() - srv_undo_space_id_start) */
+  struct trunc
+  {
+    /** log sequence number of FILE_CREATE, or 0 if none */
+    lsn_t lsn;
+    /** truncated size of the tablespace, or 0 if not truncated */
+    unsigned pages;
+  } truncated_undo_spaces[127];
+
+public:
+  /** The contents of the doublewrite buffer */
+  recv_dblwr_t dblwr;
+
+  /** Last added LSN to pages. */
+  lsn_t last_stored_lsn= 0;
+
+  void read(os_offset_t offset, span<byte> buf);
+  inline size_t files_size();
+  void close_files() { files.clear(); files.shrink_to_fit(); }
+
+private:
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @param p        iterator pointing to page_id
+  @param mtr      mini-transaction
+  @param b        pre-allocated buffer pool block
+  @return the recovered block; nullptr if the page was not initialized */
+  inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p,
+                                  mtr_t &mtr, buf_block_t *b);
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records */
+  buf_block_t *recover_low(const page_id_t page_id);
+
+  /** All found log files (multiple ones are possible if we are upgrading
+  from before MariaDB Server 10.5.1) */
+  std::vector<log_file_t> files;
+
+  void open_log_files_if_needed();
+
+  /** Base node of the redo block list.
+  List elements are linked via buf_block_t::unzip_LRU. */
+  UT_LIST_BASE_NODE_T(buf_block_t) blocks;
+public:
+  /** Check whether the number of read redo log blocks exceeds the maximum.
+  Store last_stored_lsn if the recovery is not in the last phase.
+  @param[in,out] store    whether to store page operations
+  @return whether the memory is exhausted */
+  inline bool is_memory_exhausted(store_t *store);
+  /** Apply buffered log to persistent data pages.
+  @param last_batch     whether it is possible to write more redo log */
+  void apply(bool last_batch);
+
+#ifdef UNIV_DEBUG
+  /** whether all redo log in the current batch has been applied */
+  bool after_apply= false;
+#endif
+  /** Initialize the redo log recovery subsystem. */
+  void create();
+
+  /** Free most recovery data structures. */
+  void debug_free();
+
+  /** Clean up after create() */
+  void close();
+
+  bool is_initialised() const { return last_stored_lsn != 0; }
+
+  /** Register a redo log snippet for a page.
+  @param it       page iterator
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn      @see mtr_t::commit_lsn()
+  @param l        redo log snippet @see log_t::FORMAT_10_5
+  @param len      length of l, in bytes */
+  inline void add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+                  const byte *l, size_t len);
+
+  /** Parse and register one mini-transaction in log_t::FORMAT_10_5.
+  @param checkpoint_lsn  the log sequence number of the latest checkpoint
+  @param store           whether to store the records
+  @param apply           whether to apply file-level log records
+  @return whether FILE_CHECKPOINT record was seen the first time,
+  or corruption was noticed */
+  bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply);
+
+  /** Clear a fully processed set of stored redo log records. */
+  inline void clear();
+
+  /** Determine whether redo log recovery progress should be reported.
+  @param time  the current time
+  @return whether progress should be reported
+  (the last report was at least 15 seconds ago) */
+  bool report(time_t time)
+  {
+    if (time - progress_time < 15)
+      return false;
+
+    progress_time= time;
+    return true;
+  }
+
+  /** The alloc() memory alignment, in bytes */
+  static constexpr size_t ALIGNMENT= sizeof(size_t);
+
+  /** Allocate memory for log_rec_t
+  @param len  allocation size, in bytes
+  @return pointer to len bytes of memory (never NULL) */
+  inline void *alloc(size_t len);
+
+  /** Free a redo log snippet.
+  @param data buffer returned by alloc() */
+  inline void free(const void *data);
+
+  /** Remove records for a corrupted page.
+  This function should only be called when innodb_force_recovery is set.
+  @param page_id  corrupted page identifier */
+  ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records */
+  buf_block_t *recover(const page_id_t page_id)
+  {
+    return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
+  }
 };
 
 /** The recovery system */
 extern recv_sys_t	recv_sys;
 
-/** TRUE when applying redo log records during crash recovery; FALSE
-otherwise.  Note that this is FALSE while a background thread is
-rolling back incomplete transactions. */
-extern volatile bool	recv_recovery_on;
 /** If the following is TRUE, the buffer pool file pages must be invalidated
-after recovery and no ibuf operations are allowed; this becomes TRUE if
-the log record hash table becomes too full, and log records must be merged
+after recovery and no ibuf operations are allowed; this will be set if
+recv_sys.pages becomes too full, and log records must be merged
 to file pages already before the recovery is finished: in this case no
 ibuf operations are allowed, as they could modify the pages read in the
 buffer pool before the pages have been recovered to the up-to-date state.
@@ -347,23 +424,3 @@ times! */
 /** Size of block reads when the log groups are scanned forward to do a
 roll-forward */
 #define RECV_SCAN_SIZE		(4U << srv_page_size_shift)
-
-/** This is a low level function for the recovery system
-to create a page which has buffered intialized redo log records.
-@param[in]	page_id	page to be created using redo logs
-@return whether the page creation successfully */
-buf_block_t* recv_recovery_create_page_low(const page_id_t page_id);
-
-/** Recovery system creates a page which has buffered intialized
-redo log records.
-@param[in]	page_id	page to be created using redo logs
-@return block which contains page was initialized */
-inline buf_block_t* recv_recovery_create_page(const page_id_t page_id)
-{
-  if (UNIV_LIKELY(!recv_recovery_on))
-    return NULL;
-
-  return recv_recovery_create_page_low(page_id);
-}
-
-#endif
diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h
index 56faa7467cf..337fcd31793 100644
--- a/storage/innobase/include/log0types.h
+++ b/storage/innobase/include/log0types.h
@@ -41,10 +41,4 @@ typedef	ib_uint64_t		lsn_t;
 
 #define LSN_PF			UINT64PF
 
-/** The redo log manager */
-struct RedoLog;
-
-/** The recovery implementation */
-struct redo_recover_t;
-
 #endif /* log0types_h */
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
index d160ddbcb9e..79cbd7d18a1 100644
--- a/storage/innobase/include/mach0data.h
+++ b/storage/innobase/include/mach0data.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -251,25 +251,6 @@ mach_u64_read_much_compressed(
 /*==========================*/
 	const byte*	b)	/*!< in: pointer to memory from where to read */
 	MY_ATTRIBUTE((warn_unused_result));
-/** Read a 32-bit integer in a compressed form.
-@param[in,out]	ptr	pointer to memory where to read;
-advanced by the number of bytes consumed, or set NULL if out of space
-@param[in]	end_ptr	end of the buffer
-@return unsigned value */
-ib_uint32_t
-mach_parse_compressed(
-	const byte**	ptr,
-	const byte*	end_ptr);
-/** Read a 64-bit integer in a compressed form.
-@param[in,out]	ptr	pointer to memory where to read;
-advanced by the number of bytes consumed, or set NULL if out of space
-@param[in]	end_ptr	end of the buffer
-@return unsigned value */
-UNIV_INLINE
-ib_uint64_t
-mach_u64_parse_compressed(
-	const byte**	ptr,
-	const byte*	end_ptr);
 
 /*********************************************************//**
 Reads a double. It is stored in a little-endian format.
diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl
index 80bd925d70b..bfccf611991 100644
--- a/storage/innobase/include/mach0data.inl
+++ b/storage/innobase/include/mach0data.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -546,38 +546,6 @@ mach_read_next_much_compressed(
 	return(val);
 }
 
-/** Read a 64-bit integer in a compressed form.
-@param[in,out]	ptr	pointer to memory where to read;
-advanced by the number of bytes consumed, or set NULL if out of space
-@param[in]	end_ptr	end of the buffer
-@return unsigned value */
-UNIV_INLINE
-ib_uint64_t
-mach_u64_parse_compressed(
-	const byte**	ptr,
-	const byte*	end_ptr)
-{
-	ib_uint64_t	val = 0;
-
-	if (end_ptr < *ptr + 5) {
-		*ptr = NULL;
-		return(val);
-	}
-
-	val = mach_read_next_compressed(ptr);
-
-	if (end_ptr < *ptr + 4) {
-		*ptr = NULL;
-		return(val);
-	}
-
-	val <<= 32;
-	val |= mach_read_from_4(*ptr);
-	*ptr += 4;
-
-	return(val);
-}
-
 /*********************************************************//**
 Reads a double. It is stored in a little-endian format.
 @return double read */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
index 3f0db717be8..959147a61fc 100644
--- a/storage/innobase/include/mem0mem.h
+++ b/storage/innobase/include/mem0mem.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -58,8 +58,6 @@ buffer pool; the latter method is used for very big heaps */
 
 /** Different type of heaps in terms of which datastructure is using them */
 #define MEM_HEAP_FOR_BTR_SEARCH		(MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
-#define MEM_HEAP_FOR_PAGE_HASH		(MEM_HEAP_DYNAMIC)
-#define MEM_HEAP_FOR_RECV_SYS		(MEM_HEAP_BUFFER)
 #define MEM_HEAP_FOR_LOCK_HEAP		(MEM_HEAP_BUFFER)
 
 /** The following start size is used for the first block in the memory heap if
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index d8060d106d6..0d83d83b794 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -1,7 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,241 +16,658 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 *****************************************************************************/
 
-/**************************************************//**
+/**
 @file include/mtr0log.h
-Mini-transaction logging routines
-
-Created 12/7/1995 Heikki Tuuri
+Mini-transaction log record encoding and decoding
 *******************************************************/
 
-#ifndef mtr0log_h
-#define mtr0log_h
-
+#pragma once
 #include "mtr0mtr.h"
-#include "dyn0buf.h"
-
-// Forward declaration
-struct dict_index_t;
-
-/********************************************************//**
-Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log
-record to the mini-transaction log if mtr is not NULL. */
-void
-mlog_write_ulint(
-/*=============*/
-	byte*		ptr,	/*!< in: pointer where to write */
-	ulint		val,	/*!< in: value to write */
-	mlog_id_t	type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-
-/********************************************************//**
-Writes 8 bytes to a file page. Writes the corresponding log
-record to the mini-transaction log, only if mtr is not NULL */
-void
-mlog_write_ull(
-/*===========*/
-	byte*		ptr,	/*!< in: pointer where to write */
-	ib_uint64_t	val,	/*!< in: value to write */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-/********************************************************//**
-Writes a string to a file page buffered in the buffer pool. Writes the
-corresponding log record to the mini-transaction log. */
-void
-mlog_write_string(
-/*==============*/
-	byte*		ptr,	/*!< in: pointer where to write */
-	const byte*	str,	/*!< in: string to write */
-	ulint		len,	/*!< in: string length */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-/********************************************************//**
-Logs a write of a string to a file page buffered in the buffer pool.
-Writes the corresponding log record to the mini-transaction log. */
-void
-mlog_log_string(
-/*============*/
-	byte*	ptr,	/*!< in: pointer written to */
-	ulint	len,	/*!< in: string length */
-	mtr_t*	mtr);	/*!< in: mini-transaction handle */
 
-/** Initialize a string of bytes.
-@param[in,out]	b	buffer page
-@param[in]	ofs	byte offset from block->frame
-@param[in]	len	length of the data to write
-@param[in]	val	the data byte to write
-@param[in,out]	mtr	mini-transaction */
-void
-mlog_memset(buf_block_t* b, ulint ofs, ulint len, byte val, mtr_t* mtr);
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error return value of mlog_decode_varint() and mlog_decode_len() */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Determine how many bytes a variable-length encoded integer occupies.
+@param first  the leading byte of the encoded integer
+@return the length of the encoding in bytes, including the leading byte */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+  uint8_t n= 1;
+  while (first & 0x80) { n++; first= static_cast<uint8_t>(first << 1); }
+  return n;
+}
+
+/** Decode a variable-length encoded integer from a redo log record.
+@param log    pointer to the first byte of the encoded integer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR if the encoding is not valid */
+inline uint32_t mlog_decode_varint(const byte* log)
+{
+  const uint32_t first= *log;
+  if (first < MIN_2BYTE) /* 0xxxxxxx: the value itself */
+    return first;
+  else if (first < 0xc0) /* 10xxxxxx and 1 more byte */
+    return MIN_2BYTE + ((first & 0x3f) << 8 | log[1]);
+  else if (first < 0xe0) /* 110xxxxx and 2 more bytes */
+    return MIN_3BYTE + ((first & 0x1f) << 16 | uint32_t{log[1]} << 8 | log[2]);
+  else if (first < 0xf0) /* 1110xxxx and 3 more bytes */
+    return MIN_4BYTE + ((first & 0x0f) << 24 | uint32_t{log[1]} << 16 |
+                        uint32_t{log[2]} << 8 | log[3]);
+  else if (first == 0xf0) /* 11110000 and 4 more bytes */
+  {
+    const uint32_t rest= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+      uint32_t{log[3]} << 8 | log[4];
+    if (rest <= ~MIN_5BYTE) /* MIN_5BYTE + rest must not wrap around */
+      return MIN_5BYTE + rest;
+  }
+  return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in the variable-length format of mlog_decode_varint().
+@param log  where to write the encoded integer (1 to 5 bytes)
+@param i    the integer to encode; must be less than MLOG_DECODE_ERROR
+@return pointer to the byte after the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+  if (i < MIN_2BYTE)
+  { /* 1 byte: only the final store at the end of the function is needed */
+  }
+  else if (i < MIN_3BYTE)
+  {
+    i-= MIN_2BYTE;
+    static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+    *log++= 0x80 | static_cast<byte>(i >> 8);
+  }
+  else if (i < MIN_4BYTE)
+  {
+    i-= MIN_3BYTE;
+    static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+    *log++= 0xc0 | static_cast<byte>(i >> 16);
+    goto last2; /* store the remaining 2 bytes */
+  }
+  else if (i < MIN_5BYTE)
+  {
+    i-= MIN_4BYTE;
+    static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+    *log++= 0xe0 | static_cast<byte>(i >> 24);
+    goto last3; /* store the remaining 3 bytes */
+  }
+  else
+  {
+    ut_ad(i < MLOG_DECODE_ERROR);
+    i-= MIN_5BYTE;
+    *log++= 0xf0;
+    *log++= static_cast<byte>(i >> 24);
+last3:
+    *log++= static_cast<byte>(i >> 16);
+last2:
+    *log++= static_cast<byte>(i >> 8);
+  }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+  *log++= static_cast<byte>(i); /* least significant byte, for every length */
+  return log;
+}
+
+/** Determine the length of a log record.
+@param log  start of log record
+@param end  end of the log record buffer
+@return the length of the record, in bytes
+@retval 0                 at the end of a mini-transaction or of the buffer
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+  ut_ad(log < end);
+  uint32_t i= *log;
+  if (!i)
+    return 0; /* end of mini-transaction */
+  if (~i & 15)
+    return (i & 15) + 1; /* 1..15 bytes */
+  if (UNIV_UNLIKELY(++log == end))
+    return 0; /* end of buffer */
+  i= *log;
+  if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+    return 16 + i;
+  if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+  {
+    if (UNIV_UNLIKELY(log + 1 == end))
+      return 0; /* end of buffer */
+    return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+  }
+  if (i < 0xe0) /* 3 additional length bytes: 16,528..2,113,679 bytes */
+  {
+    if (UNIV_UNLIKELY(end - log <= 2)) /* log[1] and log[2] must be readable */
+      return 0; /* end of buffer */
+    return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+                             static_cast<uint32_t>(log[1]) << 8 | log[2]);
+  }
+  /* 2,113,679 bytes per log record ought to be enough for everyone */
+  return MLOG_DECODE_ERROR;
+}
+
+/** Write 1, 2, 4, or 8 bytes to a file page.
+@param[in]      block   file page
+@param[in,out]  ptr     pointer in file page
+@param[in]      val     value to write
+@tparam l       number of bytes to write
+@tparam w       write request type
+@tparam V       type of val
+@return false if nothing was written because the bytes were unchanged */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+  ut_ad(ut_align_down(ptr, srv_page_size) == block.frame);
+  static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+  byte buf[l]; /* val, as it will be stored in the page */
+
+  switch (l) {
+  case 1:
+    ut_ad(val == static_cast<byte>(val));
+    buf[0]= static_cast<byte>(val);
+    break;
+  case 2:
+    ut_ad(val == static_cast<uint16_t>(val));
+    mach_write_to_2(buf, static_cast<uint16_t>(val));
+    break;
+  case 4:
+    ut_ad(val == static_cast<uint32_t>(val));
+    mach_write_to_4(buf, static_cast<uint32_t>(val));
+    break;
+  case 8:
+    mach_write_to_8(buf, val);
+    break;
+  }
+  byte *p= static_cast<byte*>(ptr);
+  const byte *const end= p + l;
+  if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+  {
+    const byte *b= buf;
+    while (*p++ == *b++) /* skip the leading bytes that already match */
+    {
+      if (p == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return false;
+      }
+    }
+    p--; /* back to the first byte that differs */
+  }
+  ::memcpy(ptr, buf, l); /* update the page */
+  memcpy_low(block, static_cast<uint16_t>
+             (ut_align_offset(p, srv_page_size)), p, end - p); /* log p..end */
+  return true;
+}
+
+/** Log an initialization of a string of bytes, without writing to b.frame.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(len);
+  set_modified(b);
+  if (m_log_mode != MTR_LOG_ALL)
+    return; /* no log record is written in other modes */
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs);
+  l= mlog_encode_varint(l, len); /* the encoded length... */
+  *l++= val; /* ...followed by the fill byte */
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
 
 /** Initialize a string of bytes.
-@param[in,out]	byte	byte address
-@param[in]	len	length of the data to write
-@param[in]	val	the data byte to write
-@param[in,out]	mtr	mini-transaction */
-void mlog_memset(byte* b, ulint len, byte val, mtr_t* mtr);
-
-/********************************************************//**
-Writes initial part of a log record consisting of one-byte item
-type and four-byte space and page numbers. */
-void
-mlog_write_initial_log_record(
-/*==========================*/
-	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
-				frame holding the file page where
-				modification is made */
-	mlog_id_t	type,	/*!< in: log item type: MLOG_1BYTE, ... */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-/********************************************************//**
-Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
-UNIV_INLINE
-void
-mlog_catenate_ulint(
-/*================*/
-	mtr_buf_t*	dyn_buf,	/*!< in/out: buffer to write */
-	ulint		val,		/*!< in: value to write */
-	mlog_id_t	type);		/*!< in: type of value to write */
-/********************************************************//**
-Catenates 1 - 4 bytes to the mtr log. */
-UNIV_INLINE
-void
-mlog_catenate_ulint(
-/*================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	ulint		val,	/*!< in: value to write */
-	mlog_id_t	type);	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
-/********************************************************//**
-Catenates n bytes to the mtr log. */
-void
-mlog_catenate_string(
-/*=================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	const byte*	str,	/*!< in: string to write */
-	ulint		len);	/*!< in: string length */
-/********************************************************//**
-Catenates a compressed 64-bit integer to mlog. */
-UNIV_INLINE
-void
-mlog_catenate_ull_compressed(
-/*=========================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	ib_uint64_t	val);	/*!< in: value to write */
-/********************************************************//**
-Opens a buffer to mlog. It must be closed with mlog_close.
-@return buffer, NULL if log mode MTR_LOG_NONE */
-UNIV_INLINE
-byte*
-mlog_open(
-/*======*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	ulint		size);	/*!< in: buffer size in bytes; MUST be
-				smaller than DYN_ARRAY_DATA_SIZE! */
-/********************************************************//**
-Closes a buffer opened to mlog. */
-UNIV_INLINE
-void
-mlog_close(
-/*=======*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	byte*		ptr);	/*!< in: buffer space from ptr up was
-				not used */
-
-/** Writes a log record about an operation.
-@param[in]	type		redo log record type
-@param[in]	space_id	tablespace identifier
-@param[in]	page_no		page number
-@param[in,out]	log_ptr		current end of mini-transaction log
-@param[in,out]	mtr		mini-transaction
-@return	end of mini-transaction log */
-UNIV_INLINE
-byte*
-mlog_write_initial_log_record_low(
-	mlog_id_t	type,
-	ulint		space_id,
-	ulint		page_no,
-	byte*		log_ptr,
-	mtr_t*		mtr);
-
-/********************************************************//**
-Writes the initial part of a log record (3..11 bytes).
-If the implementation of this function is changed, all
-size parameters to mlog_open() should be adjusted accordingly!
-@return new value of log_ptr */
-UNIV_INLINE
-byte*
-mlog_write_initial_log_record_fast(
-/*===============================*/
-	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
-				frame holding the file page where
-				modification is made */
-	mlog_id_t	type,	/*!< in: log item type: MLOG_1BYTE, ... */
-	byte*		log_ptr,/*!< in: pointer to mtr log which has
-				been opened */
-	mtr_t*		mtr);	/*!< in: mtr */
-/********************************************************//**
-Parses an initial log record written by mlog_write_initial_log_record.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_initial_log_record(
-/*==========================*/
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	mlog_id_t*	type,	/*!< out: log record type: MLOG_1BYTE, ... */
-	ulint*		space,	/*!< out: space id */
-	ulint*		page_no);/*!< out: page number */
-/********************************************************//**
-Parses a log record written by mlog_write_ulint, mlog_write_ull, mlog_memset.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_nbytes(
-/*==============*/
-	mlog_id_t	type,	/*!< in: log record type: MLOG_1BYTE, ... */
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	byte*		page,	/*!< in: page where to apply the log record,
-				or NULL */
-	void*		page_zip);/*!< in/out: compressed page, or NULL */
-/********************************************************//**
-Parses a log record written by mlog_write_string.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_string(
-/*==============*/
-	byte*	ptr,	/*!< in: buffer */
-	byte*	end_ptr,/*!< in: buffer end */
-	byte*	page,	/*!< in: page where to apply the log record, or NULL */
-	void*	page_zip);/*!< in/out: compressed page, or NULL */
-
-/********************************************************//**
-Opens a buffer for mlog, writes the initial log record and,
-if needed, the field lengths of an index.  Reserves space
-for further log entries.  The log entry must be closed with
-mtr_close().
-@return buffer, NULL if log mode MTR_LOG_NONE */
-byte*
-mlog_open_and_write_index(
-/*======================*/
-	mtr_t*			mtr,	/*!< in: mtr */
-	const byte*		rec,	/*!< in: index record or page */
-	const dict_index_t*	index,	/*!< in: record descriptor */
-	mlog_id_t		type,	/*!< in: log item type */
-	ulint			size);	/*!< in: requested buffer size in bytes
-					(if 0, calls mlog_close() and
-					returns NULL) */
-
-/********************************************************//**
-Parses a log record written by mlog_open_and_write_index.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_index(
-/*=============*/
-	byte*		ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	ibool		comp,	/*!< in: TRUE=compact record format */
-	dict_index_t**	index);	/*!< out, own: dummy index */
-
-/** Insert, update, and maybe other functions may use this value to define an
-extra mlog buffer size for variable size data */
-#define MLOG_BUF_MARGIN	256
-
-#include "mtr0log.inl"
-
-#endif /* mtr0log_h */
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ::memset(ofs + b->frame, val, len); /* fill the bytes in the page frame */
+  memset(*b, ofs, len, val); /* then let the logging overload record it */
+}
+
+/** Log an initialization of a repeating string of bytes.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string that is repeated
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  set_modified(b);
+  if (m_log_mode != MTR_LOG_ALL)
+    return; /* no log record is written in other modes */
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs);
+  l= mlog_encode_varint(l, len); /* the encoded total length... */
+  ::memcpy(l, str, size); /* ...followed by a single copy of str */
+  l+= size;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a repeating string of bytes.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to repeat
+@param[in]      size    size of str, in bytes (must be nonzero) */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  size_t s= 0;
+  /* Write as many whole copies of str as fit into len bytes. */
+  for (; s + size <= len; s+= size)
+    ::memcpy(ofs + s + b->frame, str, size);
+  /* Write the leading part of str into the remaining len - s bytes. */
+  ::memcpy(ofs + s + b->frame, str, len - s);
+  memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+The logged bytes are read from b.frame, starting at offset.
+@param[in]      b       buffer page
+@param[in]      offset  byte offset from b->frame
+@param[in]      len     length of the data to write */
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+  ut_ad(len);
+  ut_ad(offset <= ulint(srv_page_size));
+  ut_ad(offset + len <= ulint(srv_page_size));
+  memcpy_low(b, uint16_t(offset), &b.frame[offset], len);
+}
+
+/** Log a write of a byte string to a page.
+@param block   page
+@param offset  byte offset within page
+@param data    data to be written
+@param len     length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
+                              const void *data, size_t len)
+{
+  ut_ad(len);
+  set_modified(block);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
+  {
+    byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true,
+                                offset);
+    ::memcpy(end, data, len);
+    m_log.close(end + len);
+  }
+  else
+  {
+    m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false,
+                                 offset));
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+@param[in]      b       buffer page
+@param[in]      d       destination offset within the page
+@param[in]      s       source offset within the page
+@param[in]      len     length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+  ut_ad(d >= 8);
+  ut_ad(s >= 8);
+  ut_ad(len);
+  ut_ad(s <= ulint(srv_page_size));
+  ut_ad(s + len <= ulint(srv_page_size));
+  ut_ad(s != d);
+  ut_ad(d <= ulint(srv_page_size));
+  ut_ad(d + len <= ulint(srv_page_size));
+
+  set_modified(b);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  /* The source offset is encoded relative to the destination offset,
+  with the sign in the least significant bit. */
+  if (s > d)
+    s= (s - d) << 1;
+  else
+    s= (d - s) << 1 | 1;
+  /* The source offset 0 is not possible. */
+  s-= 1 << 1;
+  size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d);
+  l= mlog_encode_varint(l, len);
+  l= mlog_encode_varint(l, s);
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(d + len);
+}
+
+/**
+Write a log record.
+@tparam type   redo log record type
+@param id     persistent page identifier
+@param bpage  buffer pool page, or nullptr
+@param len    number of additional bytes to write
+@param alloc  whether to allocate the additional bytes
+@param offset byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+                              size_t len, bool alloc, size_t offset)
+{
+  static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+                type <= FILE_CHECKPOINT, "invalid type");
+  ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+  ut_ad(!bpage || bpage->id() == id);
+  constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+  constexpr bool have_offset= type == WRITE || type == MEMSET ||
+    type == MEMMOVE;
+  static_assert(!have_offset || have_len, "consistency");
+  ut_ad(have_len || len == 0);
+  ut_ad(have_len || !alloc);
+  ut_ad(have_offset || offset == 0);
+  ut_ad(offset + len <= srv_page_size);
+  static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+
+  size_t max_len;
+  if (!have_len)
+    max_len= 1 + 5 + 5;
+  else if (!have_offset)
+    max_len= bpage && m_last == bpage
+      ? 1 + 3
+      : 1 + 3 + 5 + 5;
+  else if (bpage && m_last == bpage && m_last_offset <= offset)
+  {
+    /* Encode the offset relative from m_last_offset. */
+    offset-= m_last_offset;
+    max_len= 1 + 3 + 3;
+  }
+  else
+    max_len= 1 + 3 + 5 + 5 + 3;
+  byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+  byte *end= log_ptr + 1;
+  const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+  if (!same_page)
+  {
+    end= mlog_encode_varint(end, id.space());
+    end= mlog_encode_varint(end, id.page_no());
+    m_last= bpage;
+  }
+  if (have_offset)
+  {
+    byte* oend= mlog_encode_varint(end, offset);
+    if (oend + len > &log_ptr[16])
+    {
+      len+= oend - log_ptr - 15;
+      if (len >= MIN_3BYTE - 1)
+        len+= 2;
+      else if (len >= MIN_2BYTE)
+        len++;
+
+      *log_ptr= type | same_page;
+      end= mlog_encode_varint(log_ptr + 1, len);
+      if (!same_page)
+      {
+        end= mlog_encode_varint(end, id.space());
+        end= mlog_encode_varint(end, id.page_no());
+      }
+      end= mlog_encode_varint(end, offset);
+      return end;
+    }
+    else
+      end= oend;
+  }
+  else if (len >= 3 && end + len > &log_ptr[16])
+  {
+    len+= end - log_ptr - 15;
+    if (len >= MIN_3BYTE - 1)
+      len+= 2;
+    else if (len >= MIN_2BYTE)
+      len++;
+
+    end= log_ptr;
+    *end++= type | same_page;
+    end= mlog_encode_varint(end, len);
+
+    if (!same_page)
+    {
+      end= mlog_encode_varint(end, id.space());
+      end= mlog_encode_varint(end, id.page_no());
+    }
+    return end;
+  }
+
+  ut_ad(end + len >= &log_ptr[1] + !same_page);
+  ut_ad(end + len <= &log_ptr[16]);
+  ut_ad(end <= &log_ptr[max_len]);
+  *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+  ut_ad(*log_ptr & 15);
+  return end;
+}
+
+/** Write a byte string to a page.
+@param[in]      b       buffer page
+@param[in]      dest    destination within b.frame
+@param[in]      str     the data to write
+@param[in]      len     length of the data to write
+@tparam w       write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+                          ulint len)
+{
+  ut_ad(ut_align_down(dest, srv_page_size) == b.frame);
+  char *d= static_cast<char*>(dest);
+  const char *s= static_cast<const char*>(str);
+  if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+  {
+    ut_ad(len);
+    const char *const end= d + len;
+    while (*d++ == *s++)
+    {
+      if (d == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return;
+      }
+    }
+    s--;
+    d--;
+    len= static_cast<ulint>(end - d);
+  }
+  ::memcpy(d, s, len);
+  memcpy(b, ut_align_offset(d, srv_page_size), len);
+}
+
+/** Initialize an entire page.
+@param[in,out]        b       buffer page */
+inline void mtr_t::init(buf_block_t *b)
+{
+  const page_id_t id{b->page.id()};
+  ut_ad(is_named_space(id.space()));
+  ut_ad(!m_freed_pages == !m_freed_space);
+
+  if (UNIV_LIKELY_NULL(m_freed_space) &&
+      m_freed_space->id == id.space() &&
+      m_freed_pages->remove_if_exists(b->page.id().page_no()) &&
+      m_freed_pages->empty())
+  {
+    delete m_freed_pages;
+    m_freed_pages= nullptr;
+    m_freed_space= nullptr;
+  }
+
+  b->page.status= buf_page_t::INIT_ON_FLUSH;
+
+  if (m_log_mode != MTR_LOG_ALL)
+  {
+    ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
+    return;
+  }
+
+  m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page));
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Free a page.
+@param[in]	space	tablespace that contains the page to be freed
+@param[in]	offset	page number of the page to be freed */
+inline void mtr_t::free(fil_space_t &space, uint32_t offset)
+{
+  ut_ad(is_named_space(&space));
+  ut_ad(!m_freed_space || m_freed_space == &space);
+
+  if (m_log_mode == MTR_LOG_ALL)
+    m_log.close(log_write<FREE_PAGE>({space.id, offset}, nullptr));
+}
+
+/** Write an EXTENDED log record.
+@param block  buffer pool page
+@param type   extended record subtype; @see mrec_ext_t */
+inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
+{
+  set_modified(block);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
+  *l++= type;
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for partly initializing a B-tree or R-tree page.
+@param block    B-tree or R-tree page
+@param comp     false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+  static_assert(false == INIT_ROW_FORMAT_REDUNDANT, "encoding");
+  static_assert(true == INIT_ROW_FORMAT_DYNAMIC, "encoding");
+  log_write_extended(block, comp);
+}
+
+/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+@param block      B-tree or R-tree page
+@param prev_rec   byte offset of the predecessor of the record to delete,
+                  starting from PAGE_OLD_INFIMUM */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
+{
+  ut_ad(!block.zip_size());
+  ut_ad(prev_rec < block.physical_size());
+  set_modified(block);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+  ut_d(byte *end= l + len);
+  *l++= DELETE_ROW_FORMAT_REDUNDANT;
+  l= mlog_encode_varint(l, prev_rec);
+  ut_ad(end == l);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+@param block      B-tree or R-tree page; block.zip_size() must be 0
+@param prev_rec   byte offset of the predecessor of the record to delete,
+                  starting from PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES;
+                  must be less than MIN_3BYTE
+@param data_size  data payload size, in bytes */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
+                               size_t hdr_size, size_t data_size)
+{
+  ut_ad(!block.zip_size());
+  set_modified(block);
+  ut_ad(hdr_size < MIN_3BYTE);
+  ut_ad(prev_rec < block.physical_size());
+  ut_ad(data_size < block.physical_size());
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+  len+= hdr_size < MIN_2BYTE ? 1 : 2;
+  len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3;
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+  ut_d(byte *end= l + len);
+  *l++= DELETE_ROW_FORMAT_DYNAMIC;
+  l= mlog_encode_varint(l, prev_rec);
+  l= mlog_encode_varint(l, hdr_size);
+  l= mlog_encode_varint(l, data_size);
+  ut_ad(end == l);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for initializing an undo log page.
+@param block    undo page */
+inline void mtr_t::undo_create(const buf_block_t &block)
+{
+  log_write_extended(block, UNDO_INIT);
+}
+
+/** Write log for appending an undo log record.
+@param block    undo page
+@param data     record within the undo page
+@param len      length of the undo record, in bytes */
+inline void mtr_t::undo_append(const buf_block_t &block,
+                               const void *data, size_t len)
+{
+  ut_ad(len > 2);
+  set_modified(block);
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
+  if (UNIV_LIKELY(small))
+  {
+    *end++= UNDO_APPEND;
+    ::memcpy(end, data, len);
+    m_log.close(end + len);
+  }
+  else
+  {
+    m_log.close(end);
+    *m_log.push<byte*>(1)= UNDO_APPEND;
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Trim the end of a tablespace.
+@param id       first page identifier that will not be in the file */
+inline void mtr_t::trim_pages(const page_id_t id)
+{
+  if (m_log_mode != MTR_LOG_ALL)
+    return;
+  byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
+  *l++= TRIM_PAGES;
+  m_log.close(l);
+  set_trim_pages();
+}
diff --git a/storage/innobase/include/mtr0log.inl b/storage/innobase/include/mtr0log.inl
deleted file mode 100644
index 70bcaf43b9e..00000000000
--- a/storage/innobase/include/mtr0log.inl
+++ /dev/null
@@ -1,223 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/mtr0log.ic
-Mini-transaction logging routines
-
-Created 12/7/1995 Heikki Tuuri
-*******************************************************/
-
-#include "buf0dblwr.h"
-#include "fsp0types.h"
-#include "mach0data.h"
-#include "trx0types.h"
-
-/********************************************************//**
-Opens a buffer to mlog. It must be closed with mlog_close.
-@return buffer, NULL if log mode MTR_LOG_NONE or MTR_LOG_NO_REDO */
-UNIV_INLINE
-byte*
-mlog_open(
-/*======*/
-	mtr_t*	mtr,	/*!< in: mtr */
-	ulint	size)	/*!< in: buffer size in bytes; MUST be
-			smaller than mtr_t::buf_t::MAX_DATA_SIZE! */
-{
-	mtr->set_modified();
-
-	if (mtr_get_log_mode(mtr) == MTR_LOG_NONE
-	    || mtr_get_log_mode(mtr) == MTR_LOG_NO_REDO) {
-
-		return(NULL);
-	}
-
-	return(mtr->get_log()->open(size));
-}
-
-/********************************************************//**
-Closes a buffer opened to mlog. */
-UNIV_INLINE
-void
-mlog_close(
-/*=======*/
-	mtr_t*	mtr,	/*!< in: mtr */
-	byte*	ptr)	/*!< in: buffer space from ptr up was not used */
-{
-	ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE);
-	ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NO_REDO);
-
-	mtr->get_log()->close(ptr);
-}
-
-/********************************************************//**
-Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
-UNIV_INLINE
-void
-mlog_catenate_ulint(
-/*================*/
-	mtr_buf_t*	mtr_buf,	/*!< in/out: buffer to write */
-	ulint		val,		/*!< in: value to write */
-	mlog_id_t	type)		/*!< in: type of value to write */
-{
-	compile_time_assert(MLOG_1BYTE == 1);
-	compile_time_assert(MLOG_2BYTES == 2);
-	compile_time_assert(MLOG_4BYTES == 4);
-	compile_time_assert(MLOG_8BYTES == 8);
-
-	byte*	ptr = mtr_buf->push<byte*>(type);
-
-	switch (type) {
-	case MLOG_4BYTES:
-		mach_write_to_4(ptr, val);
-		break;
-	case MLOG_2BYTES:
-		mach_write_to_2(ptr, val);
-		break;
-	case MLOG_1BYTE:
-		mach_write_to_1(ptr, val);
-		break;
-	default:
-		ut_error;
-	}
-}
-
-/********************************************************//**
-Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
-UNIV_INLINE
-void
-mlog_catenate_ulint(
-/*================*/
-	mtr_t*		mtr,	/*!< in/out: mtr */
-	ulint		val,	/*!< in: value to write */
-	mlog_id_t	type)	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
-{
-	if (mtr_get_log_mode(mtr) == MTR_LOG_NONE
-	    || mtr_get_log_mode(mtr) == MTR_LOG_NO_REDO) {
-
-		return;
-	}
-
-	mlog_catenate_ulint(mtr->get_log(), val, type);
-}
-
-/********************************************************//**
-Catenates a compressed 64-bit integer to mlog. */
-UNIV_INLINE
-void
-mlog_catenate_ull_compressed(
-/*=========================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	ib_uint64_t	val)	/*!< in: value to write */
-{
-	byte*	log_ptr;
-
-	log_ptr = mlog_open(mtr, 15);
-
-	/* If no logging is requested, we may return now */
-	if (log_ptr == NULL) {
-
-		return;
-	}
-
-	log_ptr += mach_u64_write_compressed(log_ptr, val);
-
-	mlog_close(mtr, log_ptr);
-}
-
-/** Writes a log record about an operation.
-@param[in]	type		redo log record type
-@param[in]	space_id	tablespace identifier
-@param[in]	page_no		page number
-@param[in,out]	log_ptr		current end of mini-transaction log
-@param[in,out]	mtr		mini-transaction
-@return	end of mini-transaction log */
-UNIV_INLINE
-byte*
-mlog_write_initial_log_record_low(
-	mlog_id_t	type,
-	ulint		space_id,
-	ulint		page_no,
-	byte*		log_ptr,
-	mtr_t*		mtr)
-{
-	ut_ad(type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(type));
-	ut_ad(type == MLOG_FILE_NAME
-	      || type == MLOG_FILE_DELETE
-	      || type == MLOG_FILE_CREATE2
-	      || type == MLOG_FILE_RENAME2
-	      || type == MLOG_INDEX_LOAD
-	      || type == MLOG_FILE_WRITE_CRYPT_DATA
-	      || mtr->is_named_space(space_id));
-
-	mach_write_to_1(log_ptr, type);
-	log_ptr++;
-
-	log_ptr += mach_write_compressed(log_ptr, space_id);
-	log_ptr += mach_write_compressed(log_ptr, page_no);
-
-	mtr->added_rec();
-	return(log_ptr);
-}
-
-/********************************************************//**
-Writes the initial part of a log record (3..11 bytes).
-If the implementation of this function is changed, all
-size parameters to mlog_open() should be adjusted accordingly!
-@return new value of log_ptr */
-UNIV_INLINE
-byte*
-mlog_write_initial_log_record_fast(
-/*===============================*/
-	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
-				frame holding the file page where
-				modification is made */
-	mlog_id_t	type,	/*!< in: log item type: MLOG_1BYTE, ... */
-	byte*		log_ptr,/*!< in: pointer to mtr log which has
-				been opened */
-	mtr_t*		mtr)	/*!< in/out: mtr */
-{
-	const byte*	page;
-	ulint		space;
-	ulint		offset;
-
-	ut_ad(log_ptr);
-	ut_d(mtr->memo_modify_page(ptr));
-
-	page = (const byte*) ut_align_down(ptr, srv_page_size);
-	space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-	offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
-
-	/* check whether the page is in the doublewrite buffer;
-	the doublewrite buffer is located in pages
-	FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
-	system tablespace */
-
-	if (space == TRX_SYS_SPACE
-	    && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
-		ut_ad(buf_dblwr_being_created);
-		/* Do nothing: we only come to this branch in an
-		InnoDB database creation. We do not redo log
-		anything for the doublewrite buffer pages. */
-		return(log_ptr);
-	}
-
-	return(mlog_write_initial_log_record_low(type, space, offset,
-						 log_ptr, mtr));
-}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index f8a15aef7f9..0c7051ed31a 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -1,8 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -46,10 +45,6 @@ savepoint. */
 #define mtr_release_s_latch_at_savepoint(m, s, l)			\
 				(m)->release_s_latch_at_savepoint((s), (l))
 
-/** Get the logging mode of a mini-transaction.
-@return	logging mode: MTR_LOG_NONE, ... */
-#define mtr_get_log_mode(m)	(m)->get_log_mode()
-
 /** Change the logging mode of a mini-transaction.
 @return	old mode */
 #define mtr_set_log_mode(m, d)	(m)->set_log_mode((d))
@@ -59,18 +54,6 @@ savepoint. */
 #define mtr_memo_release(m, o, t)					\
 				(m)->memo_release((o), (t))
 
-#ifdef UNIV_DEBUG
-/** Check if memo contains the given item.
-@return	TRUE if contains */
-#define mtr_memo_contains(m, o, t)					\
-				(m)->memo_contains((m)->get_memo(), (o), (t))
-
-/** Check if memo contains the given page.
-@return	TRUE if contains */
-#define mtr_memo_contains_page(m, p, t)					\
-	(m)->memo_contains_page_flagged((p), (t))
-#endif /* UNIV_DEBUG */
-
 /** Print info of an mtr handle. */
 #define mtr_print(m)		(m)->print()
 
@@ -81,19 +64,13 @@ savepoint. */
 /** Push an object to an mtr memo stack. */
 #define mtr_memo_push(m, o, t)	(m)->memo_push(o, t)
 
-#define mtr_s_lock_space(s, m)	(m)->s_lock_space((s), __FILE__, __LINE__)
 #define mtr_x_lock_space(s, m)	(m)->x_lock_space((s), __FILE__, __LINE__)
+#define mtr_sx_lock_space(s, m) (m)->sx_lock_space((s), __FILE__, __LINE__)
 
 #define mtr_s_lock_index(i, m)	(m)->s_lock(&(i)->lock, __FILE__, __LINE__)
 #define mtr_x_lock_index(i, m)	(m)->x_lock(&(i)->lock, __FILE__, __LINE__)
 #define mtr_sx_lock_index(i, m)	(m)->sx_lock(&(i)->lock, __FILE__, __LINE__)
 
-#define mtr_memo_contains_flagged(m, p, l)				\
-				(m)->memo_contains_flagged((p), (l))
-
-#define mtr_memo_contains_page_flagged(m, p, l)				\
-				(m)->memo_contains_page_flagged((p), (l))
-
 #define mtr_release_block_at_savepoint(m, s, b)				\
 				(m)->release_block_at_savepoint((s), (b))
 
@@ -103,55 +80,46 @@ savepoint. */
 #define mtr_block_x_latch_at_savepoint(m, s, b)				\
 				(m)->x_latch_at_savepoint((s), (b))
 
-/** Check if a mini-transaction is dirtying a clean page.
-@param b	block being x-fixed
-@return true if the mtr is dirtying a clean page. */
-#define mtr_block_dirtied(b)	mtr_t::is_block_dirtied((b))
-
-/** Append records to the system-wide redo log buffer.
-@param[in]	log	redo log records */
-void
-mtr_write_log(
-	const mtr_buf_t*	log);
-
 /** Mini-transaction memo stack slot. */
 struct mtr_memo_slot_t {
 	/** pointer to the object */
 	void*		object;
 
-	/** type of the stored object (MTR_MEMO_S_LOCK, ...) */
-	ulint		type;
+	/** type of the stored object */
+	mtr_memo_type_t	type;
 };
 
 /** Mini-transaction handle and buffer */
 struct mtr_t {
-	mtr_t() : m_state(MTR_STATE_INIT) {}
-
-	/** Start a mini-transaction. */
-	void start();
-
-	/** Commit the mini-transaction. */
-	void commit();
-
-	/** Commit a mini-transaction that is shrinking a tablespace.
-	@param space   tablespace that is being shrunk */
-	ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
-
-	/** Commit a mini-transaction that did not modify any pages,
-	but generated some redo log on a higher level, such as
-	MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker.
-	The caller must invoke log_mutex_enter() and log_mutex_exit().
-	This is to be used at log_checkpoint().
-	@param[in]	checkpoint_lsn		the LSN of the log checkpoint
-	@param[in]	write_mlog_checkpoint	Write MLOG_CHECKPOINT marker
-						if it is enabled. */
-	void commit_checkpoint(
-		lsn_t	checkpoint_lsn,
-		bool	write_mlog_checkpoint);
-
-	/** Return current size of the buffer.
-	@return	savepoint */
-	ulint get_savepoint() const {ut_ad(is_active()); return m_memo.size();}
+  /** Start a mini-transaction. */
+  void start();
+
+  /** Commit the mini-transaction. */
+  void commit();
+
+  /** Release latches till savepoint. To simplify the code, only
+  MTR_MEMO_S_LOCK and MTR_MEMO_PAGE_S_FIX slot types are allowed to be
+  released; otherwise it would be necessary to add one more argument to the
+  function to point out what slot types are allowed for rollback, and this
+  would be overengineering as currently the function is used only in one place
+  in the code.
+  @param savepoint   savepoint, can be obtained with get_savepoint() */
+  void rollback_to_savepoint(ulint savepoint);
+
+  /** Commit a mini-transaction that is shrinking a tablespace.
+  @param space   tablespace that is being shrunk */
+  ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+
+  /** Commit a mini-transaction that did not modify any pages,
+  but generated some redo log on a higher level, such as
+  FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+  The caller must hold log_sys.mutex.
+  This is to be used at log_checkpoint().
+  @param checkpoint_lsn   the log sequence number of a checkpoint, or 0 */
+  void commit_files(lsn_t checkpoint_lsn= 0);
+
+  /** @return mini-transaction savepoint (current size of m_memo) */
+  ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); }
 
 	/** Release the (index tree) s-latch stored in an mtr memo after a
 	savepoint.
@@ -172,18 +140,30 @@ struct mtr_t {
 	/** X-latch a not yet latched block after a savepoint. */
 	inline void x_latch_at_savepoint(ulint savepoint, buf_block_t*	block);
 
-	/** Get the logging mode.
-	@return	logging mode */
-	inline mtr_log_t get_log_mode() const
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Change the logging mode.
-	@param mode	 logging mode
-	@return	old mode */
-	inline mtr_log_t set_log_mode(mtr_log_t mode);
+  /** @return the logging mode */
+  mtr_log_t get_log_mode() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    ut_ad(m_log_mode <= MTR_LOG_NO_REDO);
+    return static_cast<mtr_log_t>(m_log_mode);
+  }
+
+  /** Change the logging mode.
+  @param mode	 logging mode
+  @return	old mode */
+  mtr_log_t set_log_mode(mtr_log_t mode)
+  {
+    const mtr_log_t old_mode= get_log_mode();
+    m_log_mode= mode & 3;
+    return old_mode;
+  }
+
+  /** Check if we are holding a block latch in exclusive mode
+  @param block  buffer pool block to search for */
+  bool have_x_latch(const buf_block_t &block) const;
 
 	/** Copy the tablespaces associated with the mini-transaction
-	(needed for generating MLOG_FILE_NAME records)
+	(needed for generating FILE_MODIFY records)
 	@param[in]	mtr	mini-transaction that may modify
 	the same set of tablespaces as this one */
 	void set_spaces(const mtr_t& mtr)
@@ -196,13 +176,13 @@ struct mtr_t {
 	}
 
 	/** Set the tablespace associated with the mini-transaction
-	(needed for generating a MLOG_FILE_NAME record)
+	(needed for generating a FILE_MODIFY record)
 	@param[in]	space_id	user or system tablespace ID
 	@return	the tablespace */
 	fil_space_t* set_named_space_id(ulint space_id)
 	{
 		ut_ad(!m_user_space_id);
-		ut_d(m_user_space_id = space_id);
+		ut_d(m_user_space_id = static_cast<uint32_t>(space_id));
 		if (!space_id) {
 			return fil_system.sys_space;
 		} else {
@@ -215,12 +195,12 @@ struct mtr_t {
 	}
 
 	/** Set the tablespace associated with the mini-transaction
-	(needed for generating a MLOG_FILE_NAME record)
+	(needed for generating a FILE_MODIFY record)
 	@param[in]	space	user or system tablespace */
 	void set_named_space(fil_space_t* space)
 	{
 		ut_ad(!m_user_space_id);
-		ut_d(m_user_space_id = space->id);
+		ut_d(m_user_space_id = static_cast<uint32_t>(space->id));
 		if (space->id) {
 			m_user_space = space;
 		}
@@ -228,12 +208,12 @@ struct mtr_t {
 
 #ifdef UNIV_DEBUG
 	/** Check the tablespace associated with the mini-transaction
-	(needed for generating a MLOG_FILE_NAME record)
+	(needed for generating a FILE_MODIFY record)
 	@param[in]	space	tablespace
 	@return whether the mini-transaction is associated with the space */
 	bool is_named_space(ulint space) const;
 	/** Check the tablespace associated with the mini-transaction
-	(needed for generating a MLOG_FILE_NAME record)
+	(needed for generating a FILE_MODIFY record)
 	@param[in]	space	tablespace
 	@return whether the mini-transaction is associated with the space */
 	bool is_named_space(const fil_space_t* space) const;
@@ -279,18 +259,6 @@ struct mtr_t {
 		memo_push(lock, MTR_MEMO_SX_LOCK);
 	}
 
-	/** Acquire a tablespace S-latch.
-	@param[in]	space	tablespace
-	@param[in]	file	file name from where called
-	@param[in]	line	line number in file */
-	void s_lock_space(fil_space_t* space, const char* file, unsigned line)
-	{
-		ut_ad(space->purpose == FIL_TYPE_TEMPORARY
-		      || space->purpose == FIL_TYPE_IMPORT
-		      || space->purpose == FIL_TYPE_TABLESPACE);
-		s_lock(&space->latch, file, line);
-	}
-
 	/** Acquire a tablespace X-latch.
 	@param[in]	space	tablespace
 	@param[in]	file	file name from where called
@@ -304,9 +272,21 @@ struct mtr_t {
 		rw_lock_x_lock_inline(&space->latch, 0, file, line);
 	}
 
+ /** Acquire a tablespace SX-latch.
+ @param[in]	space	tablespace
+ @param[in]	file	file name from where called
+ @param[in]	line	line number in file */
+ void sx_lock_space(fil_space_t *space, const char *file, unsigned line)
+ {
+   ut_ad(space->purpose == FIL_TYPE_TEMPORARY
+         || space->purpose == FIL_TYPE_IMPORT
+	 || space->purpose == FIL_TYPE_TABLESPACE);
+   sx_lock(&space->latch, file, line);
+ }
+
 	/** Release an object in the memo stack.
 	@param object	object
-	@param type	object type: MTR_MEMO_S_LOCK, ...
+	@param type	object type
 	@return bool if lock released */
 	bool memo_release(const void* object, ulint type);
 	/** Release a page latch.
@@ -314,59 +294,54 @@ struct mtr_t {
 	@param[in]	type	object type: MTR_MEMO_PAGE_X_FIX, ... */
 	void release_page(const void* ptr, mtr_memo_type_t type);
 
-	/** Note that the mini-transaction has modified data. */
-	void set_modified() { m_modifications = true; }
-
-	/** Set the state to not-modified. This will not log the
-	changes.  This is only used during redo log apply, to avoid
-	logging the changes. */
-	void discard_modifications() { m_modifications = false; }
+private:
+  /** Note that the mini-transaction will modify data. */
+  void flag_modified() { m_modifications = true; }
+  /** Mark the given latched page as modified.
+  @param block   page that will be modified */
+  void modify(const buf_block_t& block);
+public:
+  /** Note that the mini-transaction will modify a block. */
+  void set_modified(const buf_block_t &block)
+  { flag_modified(); if (m_log_mode != MTR_LOG_NONE) modify(block); }
 
-	/** Get the LSN of commit().
-	@return the commit LSN
-	@retval 0 if the transaction only modified temporary tablespaces */
-	lsn_t commit_lsn() const
-	{
-		ut_ad(has_committed());
-		return(m_commit_lsn);
-	}
+  /** Set the state to not-modified. This will not log the changes.
+  This is only used during redo log apply, to avoid logging the changes. */
+  void discard_modifications() { m_modifications = false; }
 
-	/** Note that we are inside the change buffer code. */
-	void enter_ibuf() { m_inside_ibuf = true; }
+  /** Get the LSN of commit().
+  @return the commit LSN
+  @retval 0 if the transaction only modified temporary tablespaces */
+  lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; }
 
-	/** Note that we have exited from the change buffer code. */
-	void exit_ibuf() { m_inside_ibuf = false; }
+  /** Note that we are inside the change buffer code. */
+  void enter_ibuf() { m_inside_ibuf= true; }
 
-	/** @return true if we are inside the change buffer code */
-	bool is_inside_ibuf() const { return m_inside_ibuf; }
+  /** Note that we have exited from the change buffer code. */
+  void exit_ibuf() { m_inside_ibuf= false; }
 
-	/*
-	@return true if the mini-transaction is active */
-	bool is_active() const { return m_state == MTR_STATE_ACTIVE; }
+  /** @return true if we are inside the change buffer code */
+  bool is_inside_ibuf() const { return m_inside_ibuf; }
 
-	/** Get flush observer
-	@return flush observer */
-	FlushObserver* get_flush_observer() const { return m_flush_observer; }
+  /** Note that pages have been trimmed */
+  void set_trim_pages() { m_trim_pages= true; }
 
-	/** Set flush observer
-	@param[in]	observer	flush observer */
-	void set_flush_observer(FlushObserver*	observer)
-	{
-		ut_ad(observer == NULL || m_log_mode == MTR_LOG_NO_REDO);
-		m_flush_observer = observer;
-	}
+  /** @return whether pages have been trimmed (set_trim_pages() was called) */
+  bool is_trim_pages() const { return m_trim_pages; }
 
 #ifdef UNIV_DEBUG
-	/** Check if memo contains the given item.
-	@param memo	memo stack
-	@param object,	object to search
-	@param type	type of object
-	@return	true if contains */
-	static bool memo_contains(
-		const mtr_buf_t*	memo,
-		const void*		object,
-		ulint			type)
-		MY_ATTRIBUTE((warn_unused_result));
+  /** Check if we are holding an rw-latch in this mini-transaction
+  @param lock   latch to search for
+  @param type   held latch type
+  @return whether (lock,type) is contained */
+  bool memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
+    MY_ATTRIBUTE((warn_unused_result));
+  /** Check if we are holding exclusive tablespace latch
+  @param space  tablespace to search for
+  @return whether space.latch is being held */
+  bool memo_contains(const fil_space_t& space)
+    MY_ATTRIBUTE((warn_unused_result));
+
 
 	/** Check if memo contains the given item.
 	@param object		object to search
@@ -385,16 +360,9 @@ struct mtr_t {
 		const byte*	ptr,
 		ulint		flags) const;
 
-	/** Mark the given latched page as modified.
-	@param[in]	ptr	pointer to within buffer frame */
-	void memo_modify_page(const byte* ptr);
-
 	/** Print info of an mtr handle. */
 	void print() const;
 
-	/** @return true if the mini-transaction has committed */
-	bool has_committed() const { return m_state == MTR_STATE_COMMITTED; }
-
 	/** @return true if mini-transaction contains modifications. */
 	bool has_modifications() const { return m_modifications; }
 
@@ -408,9 +376,6 @@ struct mtr_t {
 	/** @return true if a record was added to the mini-transaction */
 	bool is_dirty() const { return m_made_dirty; }
 
-	/** Note that a record has been added to the log */
-	void added_rec() { ++m_n_log_recs; }
-
 	/** Get the buffered redo log of this mini-transaction.
 	@return	redo log */
 	const mtr_buf_t* get_log() const { return &m_log; }
@@ -430,60 +395,312 @@ struct mtr_t {
 	static inline bool is_block_dirtied(const buf_block_t* block)
 		MY_ATTRIBUTE((warn_unused_result));
 
-	/** Check if we are holding a block latch in exclusive mode
-	@param block  buffer pool block to search for */
-	bool have_x_latch(const buf_block_t& block) const;
-private:
-	/** Prepare to write the mini-transaction log to the redo log buffer.
-	@return number of bytes to write in finish_write() */
-	inline ulint prepare_write();
+  /** Write request types */
+  enum write_type
+  {
+    /** the page is guaranteed to always change */
+    NORMAL= 0,
+    /** optional: the page contents might not change */
+    MAYBE_NOP,
+    /** force a write, even if the page contents are not changing */
+    FORCED
+  };
+
+  /** Write 1, 2, 4, or 8 bytes to a file page.
+  @param[in]      block   file page
+  @param[in,out]  ptr     pointer in file page
+  @param[in]      val     value to write
+  @tparam l       number of bytes to write
+  @tparam w       write request type
+  @tparam V       type of val
+  @return whether any log was written */
+  template<unsigned l,write_type w= NORMAL,typename V>
+  inline bool write(const buf_block_t &block, void *ptr, V val)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Log a write of a byte string to a page.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b.frame
+  @param[in]      len     length of the data to write */
+  inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+
+  /** Write a byte string to a page.
+  @param[in,out]  b       buffer page
+  @param[in]      dest    destination within b.frame
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+                     ulint len);
+
+  /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      offset  byte offset from b.zip.data
+  @param[in]      len     length of the data to write */
+  inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
+
+  /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      dest    destination within b.zip.data
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
+                      ulint len);
+
+  /** Log an initialization of a string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b.frame
+  @param[in]      len     length of the data to write
+  @param[in]      val     the data byte to write */
+  inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
+
+  /** Initialize a string of bytes.
+  @param[in,out]        b       buffer page
+  @param[in]            ofs     byte offset from b->frame
+  @param[in]            len     length of the data to write
+  @param[in]            val     the data byte to write */
+  inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+  /** Log an initialization of a repeating string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b.frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Initialize a repeating string of bytes.
+  @param[in,out]  b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Log that a string of bytes was copied from the same page.
+  @param[in]      b       buffer page
+  @param[in]      d       destination offset within the page
+  @param[in]      s       source offset within the page
+  @param[in]      len     length of the data to copy */
+  inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
+
+  /** Initialize an entire page.
+  @param[in,out]        b       buffer page */
+  void init(buf_block_t *b);
+  /** Free a page.
+  @param[in]      space   tablespace that contains the page to be freed
+  @param[in]      offset  offset of the page to be freed */
+  inline void free(fil_space_t &space, uint32_t offset);
+  /** Write log for partly initializing a B-tree or R-tree page.
+  @param block    B-tree or R-tree page
+  @param comp     false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+  inline void page_create(const buf_block_t &block, bool comp);
+
+  /** Write log for inserting a B-tree or R-tree record in
+  ROW_FORMAT=REDUNDANT.
+  @param block      B-tree or R-tree page
+  @param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+  @param prev_rec   byte offset of the predecessor of the record to insert,
+                    starting from PAGE_OLD_INFIMUM
+  @param info_bits  info_bits of the record
+  @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+  @param hdr_c      number of common record header bytes with prev_rec
+  @param data_c     number of common data bytes with prev_rec
+  @param hdr        record header bytes to copy to the log
+  @param hdr_l      number of copied record header bytes
+  @param data       record payload bytes to copy to the log
+  @param data_l     number of copied record data bytes */
+  inline void page_insert(const buf_block_t &block, bool reuse,
+                          ulint prev_rec, byte info_bits,
+                          ulint n_fields_s, size_t hdr_c, size_t data_c,
+                          const byte *hdr, size_t hdr_l,
+                          const byte *data, size_t data_l);
+  /** Write log for inserting a B-tree or R-tree record in
+  ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+  @param block       B-tree or R-tree page
+  @param reuse       false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+  @param prev_rec    byte offset of the predecessor of the record to insert,
+                     starting from PAGE_NEW_INFIMUM
+  @param info_status rec_get_info_and_status_bits()
+  @param shift       unless !reuse: number of bytes the PAGE_FREE is moving
+  @param hdr_c       number of common record header bytes with prev_rec
+  @param data_c      number of common data bytes with prev_rec
+  @param hdr         record header bytes to copy to the log
+  @param hdr_l       number of copied record header bytes
+  @param data        record payload bytes to copy to the log
+  @param data_l      number of copied record data bytes */
+  inline void page_insert(const buf_block_t &block, bool reuse,
+                          ulint prev_rec, byte info_status,
+                          ssize_t shift, size_t hdr_c, size_t data_c,
+                          const byte *hdr, size_t hdr_l,
+                          const byte *data, size_t data_l);
+  /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+  @param block      B-tree or R-tree page
+  @param prev_rec   byte offset of the predecessor of the record to delete,
+                    starting from PAGE_OLD_INFIMUM */
+  inline void page_delete(const buf_block_t &block, ulint prev_rec);
+  /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+  @param block      B-tree or R-tree page
+  @param prev_rec   byte offset of the predecessor of the record to delete,
+                    starting from PAGE_NEW_INFIMUM
+  @param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+  @param data_size  data payload size, in bytes */
+  inline void page_delete(const buf_block_t &block, ulint prev_rec,
+                          size_t hdr_size, size_t data_size);
+
+  /** Write log for initializing an undo log page.
+  @param block    undo page */
+  inline void undo_create(const buf_block_t &block);
+  /** Write log for appending an undo log record.
+  @param block    undo page
+  @param data     record within the undo page
+  @param len      length of the undo record, in bytes */
+  inline void undo_append(const buf_block_t &block,
+                          const void *data, size_t len);
+  /** Trim the end of a tablespace.
+  @param id       first page identifier that will not be in the file */
+  inline void trim_pages(const page_id_t id);
+
+  /** Write a log record about a file operation.
+  @param type           file operation
+  @param space_id       tablespace identifier
+  @param path           file path
+  @param new_path       new file path for type=FILE_RENAME */
+  inline void log_file_op(mfile_type_t type, ulint space_id,
+                          const char *path,
+                          const char *new_path= nullptr);
+
+  /** Record a freed page number in m_freed_pages (allocated on first use) */
+  void add_freed_offset(fil_space_t *space, uint32_t page)
+  {
+    ut_ad(is_named_space(space));
+    if (m_freed_pages)
+      ut_ad(m_freed_space == space);
+    else
+    {
+      m_freed_pages= new range_set();
+      ut_ad(!m_freed_space);
+      m_freed_space= space;
+    }
+    m_freed_pages->add_value(page);
+  }
+
+  /** Determine the added buffer fix count of a block.
+  @param block block to be checked
+  @return number of buffer fixes added by this mtr */
+  uint32_t get_fix_count(const buf_block_t *block) const;
+
+  /** type of page flushing that is needed during commit() */
+  enum page_flush_ahead
+  {
+    /** no need to trigger page cleaner */
+    PAGE_FLUSH_NO= 0,
+    /** asynchronous flushing is needed */
+    PAGE_FLUSH_ASYNC,
+    /** furious flushing is needed */
+    PAGE_FLUSH_SYNC
+  };
 
-	/** Append the redo log records to the redo log buffer.
-	@param[in]	len	number of bytes to write
-	@return start_lsn */
-	inline lsn_t finish_write(ulint len);
+private:
+  /** Log a write of a byte string to a page.
+  @param block   buffer page
+  @param offset  byte offset within page
+  @param data    data to be written
+  @param len     length of the data, in bytes */
+  inline void memcpy_low(const buf_block_t &block, uint16_t offset,
+                         const void *data, size_t len);
+  /**
+  Write a log record.
+  @tparam type  redo log record type
+  @param id     persistent page identifier
+  @param bpage  buffer pool page, or nullptr
+  @param len    number of additional bytes to write
+  @param alloc  whether to allocate the additional bytes
+  @param offset byte offset, or 0 if the record type does not allow one
+  @return end of mini-transaction log, minus len */
+  template<byte type>
+  inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+                         size_t len= 0, bool alloc= false, size_t offset= 0);
+
+  /** Write an EXTENDED log record.
+  @param block  buffer pool page
+  @param type   extended record subtype; @see mrec_ext_t */
+  inline void log_write_extended(const buf_block_t &block, byte type);
+
+  /** Append the redo log records to the redo log buffer.
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> do_write();
+
+  /** Append the redo log records to the redo log buffer.
+  @param len   number of bytes to write
+  @return {start_lsn,flush_ahead} */
+  inline std::pair<lsn_t,page_flush_ahead> finish_write(ulint len);
+
+  /** Release the resources */
+  inline void release_resources();
 
-	/** Release the resources */
-	inline void release_resources();
+#ifdef UNIV_DEBUG
+public:
+  /** @return whether the mini-transaction is active */
+  bool is_active() const
+  { ut_ad(!m_commit || m_start); return m_start && !m_commit; }
+  /** @return whether the mini-transaction has been committed */
+  bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+private:
+  /** whether start() has been called */
+  bool m_start= false;
+  /** whether commit() has been called */
+  bool m_commit= false;
+#endif
 
-	/** memo stack for locks etc. */
-	mtr_buf_t	m_memo;
+  /** The page of the most recent m_log record written, or NULL */
+  const buf_page_t* m_last;
+  /** The current byte offset in m_last, or 0 */
+  uint16_t m_last_offset;
 
-	/** mini-transaction log */
-	mtr_buf_t	m_log;
+  /** specifies which operations should be logged; default MTR_LOG_ALL */
+  uint16_t m_log_mode:2;
 
-	/** true if mtr has made at least one buffer pool page dirty */
-	bool		m_made_dirty;
+  /** whether at least one buffer pool page was written to */
+  uint16_t m_modifications:1;
 
-	/** true if inside ibuf changes */
-	bool		m_inside_ibuf;
+  /** whether at least one previously clean buffer pool page was written to */
+  uint16_t m_made_dirty:1;
 
-	/** true if the mini-transaction modified buffer pool pages */
-	bool		m_modifications;
+  /** whether change buffer is latched; only needed in non-debug builds
+  to suppress some read-ahead operations, @see ibuf_inside() */
+  uint16_t m_inside_ibuf:1;
 
-	/** Count of how many page initial log records have been
-	written to the mtr log */
-	ib_uint32_t	m_n_log_recs;
+  /** whether pages have been trimmed */
+  uint16_t m_trim_pages:1;
 
-	/** specifies which operations should be logged; default
-	value MTR_LOG_ALL */
-	mtr_log_t	m_log_mode;
 #ifdef UNIV_DEBUG
-	/** Persistent user tablespace associated with the
-	mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
-	ulint		m_user_space_id;
+  /** Persistent user tablespace associated with the
+  mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
+  uint32_t m_user_space_id;
 #endif /* UNIV_DEBUG */
-	/** User tablespace that is being modified by the mini-transaction */
-	fil_space_t*	m_user_space;
 
-	/** State of the transaction */
-	mtr_state_t	m_state;
+  /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
+  mtr_buf_t m_memo;
+
+  /** mini-transaction log */
+  mtr_buf_t m_log;
+
+  /** user tablespace that is being modified by the mini-transaction */
+  fil_space_t* m_user_space;
 
-	/** Flush Observer */
-	FlushObserver*	m_flush_observer;
+  /** LSN at commit time */
+  lsn_t m_commit_lsn;
 
-	/** LSN at commit time */
-	lsn_t		m_commit_lsn;
+  /** tablespace where pages have been freed */
+  fil_space_t *m_freed_space= nullptr;
+  /** set of freed page ids */
+  range_set *m_freed_pages= nullptr;
 };
 
 #include "mtr0mtr.inl"
diff --git a/storage/innobase/include/mtr0mtr.inl b/storage/innobase/include/mtr0mtr.inl
index c35de2bcbf9..bc2986503f9 100644
--- a/storage/innobase/include/mtr0mtr.inl
+++ b/storage/innobase/include/mtr0mtr.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -28,16 +28,12 @@ Created 11/26/1995 Heikki Tuuri
 
 /** Check if a mini-transaction is dirtying a clean page.
 @return true if the mtr is dirtying a clean page. */
-bool
-mtr_t::is_block_dirtied(const buf_block_t* block)
+inline bool mtr_t::is_block_dirtied(const buf_block_t *block)
 {
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(block->page.buf_fix_count > 0);
-
-	/* It is OK to read oldest_modification because no
-	other thread can be performing a write of it and it
-	is only during write that the value is reset to 0. */
-	return(block->page.oldest_modification == 0);
+  ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+  ut_ad(block->page.buf_fix_count());
+  return block->page.oldest_modification() <= 1 &&
+    block->page.id().space() < SRV_TMP_SPACE_ID;
 }
 
 /**
@@ -56,8 +52,8 @@ mtr_t::memo_push(void* object, mtr_memo_type_t type)
 	grab log_sys.flush_order_mutex at mtr_t::commit() so that we
 	can insert the dirtied page into the flush list. */
 
-	if ((type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX)
-	    && !m_made_dirty) {
+	if (!m_made_dirty
+            && (type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX)) {
 
 		m_made_dirty = is_block_dirtied(
 			reinterpret_cast<const buf_block_t*>(object));
@@ -176,54 +172,3 @@ mtr_t::release_block_at_savepoint(
 
 	slot->object = NULL;
 }
-
-/**
-Gets the logging mode of a mini-transaction.
-@return	logging mode: MTR_LOG_NONE, ... */
-
-mtr_log_t
-mtr_t::get_log_mode() const
-{
-	ut_ad(m_log_mode >= MTR_LOG_ALL);
-	ut_ad(m_log_mode <= MTR_LOG_SHORT_INSERTS);
-
-	return m_log_mode;
-}
-
-/**
-Changes the logging mode of a mini-transaction.
-@return	old mode */
-
-mtr_log_t
-mtr_t::set_log_mode(mtr_log_t mode)
-{
-	ut_ad(mode >= MTR_LOG_ALL);
-	ut_ad(mode <= MTR_LOG_SHORT_INSERTS);
-
-	const mtr_log_t	old_mode = m_log_mode;
-
-	switch (old_mode) {
-	case MTR_LOG_NO_REDO:
-		/* Once this mode is set, it must not be changed. */
-		ut_ad(mode == MTR_LOG_NO_REDO || mode == MTR_LOG_NONE);
-		return(old_mode);
-	case MTR_LOG_NONE:
-		if (mode == old_mode || mode == MTR_LOG_SHORT_INSERTS) {
-			/* Keep MTR_LOG_NONE. */
-			return(old_mode);
-		}
-		/* fall through */
-	case MTR_LOG_SHORT_INSERTS:
-		ut_ad(mode == MTR_LOG_ALL);
-		/* fall through */
-	case MTR_LOG_ALL:
-		/* MTR_LOG_NO_REDO can only be set before generating
-		any redo log records. */
-		ut_ad(mode != MTR_LOG_NO_REDO || m_n_log_recs == 0);
-		m_log_mode = mode;
-		return(old_mode);
-	}
-
-	ut_ad(0);
-	return(old_mode);
-}
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
index 2d49cc4f5e2..9e59dc814d3 100644
--- a/storage/innobase/include/mtr0types.h
+++ b/storage/innobase/include/mtr0types.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -29,6 +29,8 @@ Created 11/26/1995 Heikki Tuuri
 
 #ifndef UNIV_INNOCHECKSUM
 #include "sync0rw.h"
+#else
+#include "univ.i"
 #endif /* UNIV_INNOCHECKSUM */
 
 struct mtr_t;
@@ -39,224 +41,276 @@ enum mtr_log_t {
 	MTR_LOG_ALL = 0,
 
 	/** Log no operations and dirty pages are not added to the flush list.
-	Set when applying log in crash recovery or when a modification of a
-	ROW_FORMAT=COMPRESSED page is attempted. */
+	Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */
 	MTR_LOG_NONE,
 
 	/** Don't generate REDO log but add dirty pages to flush list */
-	MTR_LOG_NO_REDO,
-
-	/** Inserts are logged in a shorter form */
-	MTR_LOG_SHORT_INSERTS
+	MTR_LOG_NO_REDO
 };
 
-/** @name Log item types
-The log items are declared 'byte' so that the compiler can warn if val
-and type parameters are switched in a call to mlog_write_ulint. NOTE!
-For 1 - 8 bytes, the flag value must give the length also! @{ */
-enum mlog_id_t {
-	/** if the mtr contains only one log record for one page,
-	i.e., write_initial_log_record has been called only once,
-	this flag is ORed to the type of that first log record */
-	MLOG_SINGLE_REC_FLAG = 128,
-
-	/** one byte is written */
-	MLOG_1BYTE = 1,
-
-	/** 2 bytes ... */
-	MLOG_2BYTES = 2,
-
-	/** 4 bytes ... */
-	MLOG_4BYTES = 4,
-
-	/** 8 bytes ... */
-	MLOG_8BYTES = 8,
-
-	/** Record insert */
-	MLOG_REC_INSERT = 9,
-
-	/** Mark clustered index record deleted */
-	MLOG_REC_CLUST_DELETE_MARK = 10,
-
-	/** Mark secondary index record deleted */
-	MLOG_REC_SEC_DELETE_MARK = 11,
-
-	/** update of a record, preserves record field sizes */
-	MLOG_REC_UPDATE_IN_PLACE = 13,
-
-	/*!< Delete a record from a page */
-	MLOG_REC_DELETE = 14,
-
-	/** Delete record list end on index page */
-	MLOG_LIST_END_DELETE = 15,
-
-	/** Delete record list start on index page */
-	MLOG_LIST_START_DELETE = 16,
-
-	/** Copy record list end to a new created index page */
-	MLOG_LIST_END_COPY_CREATED = 17,
-
-	/** Reorganize an index page in ROW_FORMAT=REDUNDANT */
-	MLOG_PAGE_REORGANIZE = 18,
-
-	/** Create an index page */
-	MLOG_PAGE_CREATE = 19,
-
-	/** insert an undo log record */
-	MLOG_UNDO_INSERT = 20,
-
-	/** erase an undo log page end (used in MariaDB 10.2) */
-	MLOG_UNDO_ERASE_END = 21,
-
-	/** initialize a page in an undo log */
-	MLOG_UNDO_INIT = 22,
-
-	/** reuse an insert undo log header (used in MariaDB 10.2) */
-	MLOG_UNDO_HDR_REUSE = 24,
-
-	/** create an undo log header */
-	MLOG_UNDO_HDR_CREATE = 25,
-
-	/** mark an index record as the predefined minimum record */
-	MLOG_REC_MIN_MARK = 26,
-
-	/** initialize an ibuf bitmap page (used in MariaDB 10.2 and 10.3) */
-	MLOG_IBUF_BITMAP_INIT = 27,
-
-#ifdef UNIV_LOG_LSN_DEBUG
-	/** Current LSN */
-	MLOG_LSN = 28,
-#endif /* UNIV_LOG_LSN_DEBUG */
-
-	/** write a string to a page */
-	MLOG_WRITE_STRING = 30,
-
-	/** If a single mtr writes several log records, this log
-	record ends the sequence of these records */
-	MLOG_MULTI_REC_END = 31,
-
-	/** dummy log record used to pad a log block full */
-	MLOG_DUMMY_RECORD = 32,
-
-	/** log record about an .ibd file creation */
-	//MLOG_FILE_CREATE = 33,
-
-	/** rename databasename/tablename (no .ibd file name suffix) */
-	//MLOG_FILE_RENAME = 34,
-
-	/** delete a tablespace file that starts with (space_id,page_no) */
-	MLOG_FILE_DELETE = 35,
-
-	/** mark a compact index record as the predefined minimum record */
-	MLOG_COMP_REC_MIN_MARK = 36,
-
-	/** create a compact index page */
-	MLOG_COMP_PAGE_CREATE = 37,
-
-	/** compact record insert */
-	MLOG_COMP_REC_INSERT = 38,
-
-	/** mark compact clustered index record deleted */
-	MLOG_COMP_REC_CLUST_DELETE_MARK = 39,
-
-	/** update of a compact record, preserves record field sizes */
-	MLOG_COMP_REC_UPDATE_IN_PLACE = 41,
-
-	/** delete a compact record from a page */
-	MLOG_COMP_REC_DELETE = 42,
-
-	/** delete compact record list end on index page */
-	MLOG_COMP_LIST_END_DELETE = 43,
-
-	/*** delete compact record list start on index page */
-	MLOG_COMP_LIST_START_DELETE = 44,
-
-	/** copy compact record list end to a new created index page */
-	MLOG_COMP_LIST_END_COPY_CREATED = 45,
-
-	/** reorganize an index page */
-	MLOG_COMP_PAGE_REORGANIZE = 46,
-
-	/** log record about creating an .ibd file, with format */
-	MLOG_FILE_CREATE2 = 47,
-
-	/** write the node pointer of a record on a compressed
-	non-leaf B-tree page */
-	MLOG_ZIP_WRITE_NODE_PTR = 48,
-
-	/** write the BLOB pointer of an externally stored column
-	on a compressed page */
-	MLOG_ZIP_WRITE_BLOB_PTR = 49,
-
-	/** write to compressed page header */
-	MLOG_ZIP_WRITE_HEADER = 50,
-
-	/** compress an index page */
-	MLOG_ZIP_PAGE_COMPRESS = 51,
-
-	/** compress an index page without logging it's image */
-	MLOG_ZIP_PAGE_COMPRESS_NO_DATA = 52,
-
-	/** reorganize a compressed page */
-	MLOG_ZIP_PAGE_REORGANIZE = 53,
-
-	/** rename a tablespace file that starts with (space_id,page_no) */
-	MLOG_FILE_RENAME2 = 54,
-
-	/** note the first use of a tablespace file since checkpoint */
-	MLOG_FILE_NAME = 55,
-
-	/** note that all buffered log was written since a checkpoint */
-	MLOG_CHECKPOINT = 56,
-
-	/** Create a R-Tree index page */
-	MLOG_PAGE_CREATE_RTREE = 57,
-
-	/** create a R-tree compact page */
-	MLOG_COMP_PAGE_CREATE_RTREE = 58,
-
-	/** initialize a file page */
-	MLOG_INIT_FILE_PAGE2 = 59,
-
-	/** Table is being truncated. (Was used in 10.2 and 10.3;
-	not supported for crash-upgrade to 10.4 or later.) */
-	MLOG_TRUNCATE = 60,
-
-	/** notify that an index tree is being loaded without writing
-	redo log about individual pages */
-	MLOG_INDEX_LOAD = 61,
-
-	/** write DB_TRX_ID,DB_ROLL_PTR to a clustered index leaf page
-	of a ROW_FORMAT=COMPRESSED table */
-	MLOG_ZIP_WRITE_TRX_ID = 62,
-
-	/** initialize a page with a string of identical bytes */
-	MLOG_MEMSET = 63,
-
-	/** Zero-fill a page that is not allocated. */
-	MLOG_INIT_FREE_PAGE = 64,
+/*
+A mini-transaction is a stream of records that is always terminated by
+a NUL byte. The first byte of a mini-transaction record is never NUL,
+but NUL bytes can occur within mini-transaction records. The first
+bytes of each record will explicitly encode the length of the record.
+NUL bytes also act as padding in log blocks, that is, there can be
+multiple successive NUL bytes between mini-transactions in a redo log
+block.
+
+The first byte of the record would contain a record type, flags, and a
+part of length. The optional second byte of the record will contain
+more length. (Not needed for short records.)
+
+Bit 7 of the first byte of a redo log record is the same_page flag.
+If same_page=1, the record is referring to the same page as the
+previous record. Records that do not refer to data pages but to file
+operations are identified by setting the same_page=1 in the very first
+record(s) of the mini-transaction. A mini-transaction record that
+carries same_page=0 must only be followed by page-oriented records.
+
+Bits 6..4 of the first byte of a redo log record identify the redo log
+type. The following record types refer to data pages:
+
+    FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE
+    INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2
+    EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t
+    WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_*
+    MEMSET (4): extends the 10.4 MLOG_MEMSET record
+    MEMMOVE (5): copy data within the page (avoids logging redundant data)
+    RESERVED (6): reserved for future use; a subtype code
+    (encoded immediately after the length) would be written
+    to reserve code space for further extensions
+    OPTION (7): optional record that may be ignored; a subtype code
+    (encoded immediately after the length) would distinguish actual
+    usage, such as:
+     * MDEV-18976 page checksum record
+     * binlog record
+     * SQL statement (at the start of statement)
+
+Bits 3..0 indicate the redo log record length, excluding the first
+byte, but including additional length bytes and any other bytes,
+such as the optional tablespace identifier and page number.
+Values 1..15 represent lengths of 1 to 15 bytes. The special value 0
+indicates that 1 to 3 length bytes will follow to encode the remaining
+length that exceeds 16 bytes.
+
+Additional length bytes if length>=16: 1 to 3 bytes
+0xxxxxxx                   for 0 to 127 (total: 16 to 143 bytes)
+10xxxxxx xxxxxxxx          for 128 to 16511 (total: 144 to 16527)
+110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679)
+111xxxxx                   reserved (corrupted record, and file!)
+
+If same_page=0, the tablespace identifier and page number will use
+similar 1-to-5-byte variable-length encoding:
+0xxxxxxx                                     for 0 to 127
+10xxxxxx xxxxxxxx                            for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx                   for 16,512 to 2,113,663
+1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx          for 2,113,664 to 270,549,119
+11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487
+11111xxx                                     reserved (corrupted record)
+Note: Some 5-byte values are reserved, because the tablespace identifier
+and page number can only be up to 4,294,967,295.
+
+If same_page=1 is set in a record that follows a same_page=0 record
+in a mini-transaction, the tablespace identifier and page number
+fields will be omitted.
+
+For FILE_ records (if same_page=1 for the first record
+of a mini-transaction), we will write a tablespace identifier and
+a page number (always 0) using the same 1-to-5-byte encoding.
+
+For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated
+as corrupted (or reserved for future extension).  The type code must
+be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier
+and page number). If the record length does not match the encoded
+lengths of the tablespace identifier and page number, the record will
+be treated as corrupted. This allows future expansion of the format.
+
+If there is a FREE_PAGE record in a mini-transaction, it must be the
+only record for that page in the mini-transaction. If there is an
+INIT_PAGE record for a page in a mini-transaction, it must be the
+first record for that page in the mini-transaction.
+
+An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page
+identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t
+
+For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset
+on the page, relative from the previous offset. If same_page=0, the
+"previous offset" is 0. If same_page=1, the "previous offset" is where
+the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE).
+0xxxxxxx                                     for 0 to 127
+10xxxxxx xxxxxxxx                            for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx                   for 16,512 to 2,113,663
+111xxxxx                                     reserved (corrupted record)
+If the sum of the "previous offset" and the current offset exceeds the
+page size, the record is treated as corrupted. Negative relative offsets
+cannot be written. Instead, a record with same_page=0 can be written.
+
+For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to
+3 bytes.  If the length+offset exceeds the page size, the record will
+be treated as corrupted.
+
+For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes,
+relative to the current offset. The offset 0 is not possible, and
+the sign bit is the least significant bit. That is,
++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and
+-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+The source offset must be within the page size, or else the record
+will be treated as corrupted.
+
+For MEMSET or WRITE, the byte(s) to be written will follow. For
+MEMSET, it usually is a single byte, but it could also be a multi-byte
+string, which would be copied over and over until the target length is
+reached. The length of the remaining bytes is implied by the length
+bytes at the start of the record.
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x12,0x34,0x56.
+
+A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+3, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by a NUL byte.
+*/
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+  /** Free a page. On recovery, it is unnecessary to read the page.
+  The next record for the page (if any) must be INIT_PAGE.
+  After this record has been written, the page may be
+  overwritten with zeros, or discarded or trimmed. */
+  FREE_PAGE= 0,
+  /** Zero-initialize a page. The current byte offset (for subsequent
+  records) will be reset to FIL_PAGE_TYPE. */
+  INIT_PAGE= 0x10,
+  /** Extended record; @see mrec_ext_t */
+  EXTENDED= 0x20,
+  /** Write a string of bytes. Followed by the byte offset (unsigned,
+  relative to the current byte offset, encoded in 1 to 3 bytes) and
+  the bytes to write (at least one). The current byte offset will be
+  set after the last byte written. */
+  WRITE= 0x30,
+  /** Like WRITE, but before the bytes to write, the data_length-1
+  (encoded in 1 to 3 bytes) will be encoded, and it must be more
+  than the length of the following data bytes to write.
+  The data byte(s) will be repeatedly copied to the output until
+  the data_length is reached. */
+  MEMSET= 0x40,
+  /** Like MEMSET, but instead of the bytes to write, a source byte
+  offset (signed, nonzero, relative to the target byte offset, encoded
+  in 1 to 3 bytes, with the sign bit in the least significant bit)
+  will be written.
+  That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+  and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+  The source offset and data_length must be within the page size, or
+  else the record will be treated as corrupted. The data will be
+  copied from the page as it was at the start of the
+  mini-transaction. */
+  MEMMOVE= 0x50,
+  /** Reserved for future use. */
+  RESERVED= 0x60,
+  /** Optional record that may be ignored in crash recovery.
+  A subtype code will be encoded immediately after the length.
+  Possible subtypes would include a MDEV-18976 page checksum record,
+  a binlog record, or an SQL statement. */
+  OPTION= 0x70
+};
 
-	/** biggest value (used in assertions) */
-	MLOG_BIGGEST_TYPE = MLOG_INIT_FREE_PAGE,
 
-	/** log record for writing/updating crypt data of
-	a tablespace */
-	MLOG_FILE_WRITE_CRYPT_DATA = 100,
+/** Supported EXTENDED record subtypes. */
+enum mrec_ext_t
+{
+  /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_REDUNDANT= 0,
+  /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_DYNAMIC= 1,
+  /** Initialize an undo log page.
+  This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_INIT= 2,
+  /** Append a record to an undo log page.
+  This is equivalent to the old MLOG_UNDO_INSERT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_APPEND= 3,
+  /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_REDUNDANT= 4,
+  /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_REDUNDANT= 5,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_DYNAMIC= 6,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_DYNAMIC= 7,
+  /** Delete a record on a ROW_FORMAT=REDUNDANT page.
+  We point to the predecessor of the record to be deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_REC_DELETE record. */
+  DELETE_ROW_FORMAT_REDUNDANT= 8,
+  /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page.
+  We point to the predecessor of the record to be deleted
+  and include the total size of the record being deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_COMP_REC_DELETE record. */
+  DELETE_ROW_FORMAT_DYNAMIC= 9,
+  /** Truncate a data file. */
+  TRIM_PAGES= 10
 };
 
-/* @} */
 
-#define EXTRA_CHECK_MLOG_NUMBER(x) \
-  ((x) == MLOG_FILE_WRITE_CRYPT_DATA)
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+  /** Create a file. Followed by tablespace ID and the file name. */
+  FILE_CREATE = 0x80,
+  /** Delete a file. Followed by tablespace ID and the file name.  */
+  FILE_DELETE = 0x90,
+  /** Rename a file. Followed by tablespace ID and the old file name,
+  NUL, and the new file name.  */
+  FILE_RENAME = 0xa0,
+  /** Modify a file. Followed by tablespace ID and the file name. */
+  FILE_MODIFY = 0xb0,
+  /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier,
+  8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */
+  FILE_CHECKPOINT = 0xf0
+};
 
-/** Size of a MLOG_CHECKPOINT record in bytes.
-The record consists of a MLOG_CHECKPOINT byte followed by
-mach_write_to_8(checkpoint_lsn). */
-#define SIZE_OF_MLOG_CHECKPOINT	9
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction. */
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1;
 
 #ifndef UNIV_INNOCHECKSUM
-/** Types for the mlock objects to store in the mtr memo; NOTE that the
-first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+/** Types for the mlock objects to store in the mtr_t::m_memo */
 enum mtr_memo_type_t {
 	MTR_MEMO_PAGE_S_FIX = RW_S_LATCH,
 
@@ -266,9 +320,10 @@ enum mtr_memo_type_t {
 
 	MTR_MEMO_BUF_FIX = RW_NO_LATCH,
 
-#ifdef UNIV_DEBUG
 	MTR_MEMO_MODIFY = 16,
-#endif /* UNIV_DEBUG */
+
+	MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY,
+	MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY,
 
 	MTR_MEMO_S_LOCK = RW_S_LATCH << 5,
 
@@ -281,10 +336,4 @@ enum mtr_memo_type_t {
 };
 #endif /* !UNIV_CHECKSUM */
 
-enum mtr_state_t {
-	MTR_STATE_INIT = 0,
-	MTR_STATE_ACTIVE,
-	MTR_STATE_COMMITTED
-};
-
 #endif /* mtr0types_h */
diff --git a/storage/innobase/include/os0api.h b/storage/innobase/include/os0api.h
deleted file mode 100644
index 3be7c0afaa4..00000000000
--- a/storage/innobase/include/os0api.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/***********************************************************************
-
-Copyright (c) 2017, 2019, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-***********************************************************************/
-
-/**************************************************//**
-@file os0api.h
-The interface to the helper functions.
-These functions are used on os0file.h where
-including full full header is not feasible and
-implemented on buf0buf.cc and fil0fil.cc.
-*******************************************************/
-
-#ifndef OS_API_H
-#define OS_API_H 1
-
-/** Page control block */
-class buf_page_t;
-
-/** File Node */
-struct fil_node_t;
-
-/**
-Should we punch hole to deallocate unused portion of the page.
-@param[in]	bpage		Page control block
-@return true if punch hole should be used, false if not */
-bool
-buf_page_should_punch_hole(
-	const buf_page_t* bpage)
-	MY_ATTRIBUTE((warn_unused_result));
-
-/**
-Calculate the length of trim (punch_hole) operation.
-@param[in]	bpage		Page control block
-@param[in]	write_length	Write length
-@return length of the trim or zero. */
-ulint
-buf_page_get_trim_length(
-	const buf_page_t*	bpage,
-	ulint			write_length)
-	MY_ATTRIBUTE((warn_unused_result));
-
-#endif /* OS_API_H */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index b301fcf5dfa..81a98ef7e01 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -37,7 +37,8 @@ Created 10/21/1995 Heikki Tuuri
 #define os0file_h
 
 #include "fsp0types.h"
-#include "os0api.h"
+#include "tpool.h"
+#include "my_counter.h"
 
 #ifndef _WIN32
 #include <dirent.h>
@@ -45,12 +46,6 @@ Created 10/21/1995 Heikki Tuuri
 #include <time.h>
 #endif /* !_WIN32 */
 
-#include "my_counter.h"
-
-/** File node of a tablespace or the log data space */
-struct fil_node_t;
-struct fil_space_t;
-
 extern bool	os_has_said_disk_full;
 
 /** File offset in bytes */
@@ -66,7 +61,7 @@ the OS actually supports it: Win 95 does not, NT does. */
 # define UNIV_NON_BUFFERED_IO
 
 /** File handle */
-typedef HANDLE os_file_t;
+typedef native_file_handle os_file_t;
 
 
 #else /* _WIN32 */
@@ -100,6 +95,14 @@ struct pfs_os_file_t
 	/** Assignment operator.
 	@param[in]	file	file handle to be assigned */
 	void operator=(os_file_t file) { m_file = file; }
+	bool operator==(os_file_t file) const { return m_file == file; }
+	bool operator!=(os_file_t file) const { return !(*this == file); }
+#ifndef DBUG_OFF
+	friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){
+		os << os_file_t(f);
+		return os;
+	}
+#endif
 };
 
 /** The next value should be smaller or equal to the smallest sector size used
@@ -171,259 +174,81 @@ static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125;
 static const ulint OS_FILE_ERROR_MAX = 200;
 /* @} */
 
-/** Types for AIO operations @{ */
-
-/** No transformations during read/write, write as is. */
-#define IORequestRead		IORequest(IORequest::READ)
-#define IORequestWrite		IORequest(IORequest::WRITE)
-#define IORequestLogRead	IORequest(IORequest::LOG | IORequest::READ)
-#define IORequestLogWrite	IORequest(IORequest::LOG | IORequest::WRITE)
-
-
-
 /**
-The IO Context that is passed down to the low level IO code */
-class IORequest {
+The I/O context that is passed down to the low-level I/O code */
+class IORequest
+{
 public:
-	/** Flags passed in the request, they can be ORred together. */
-	enum {
-		READ = 1,
-		WRITE = 2,
-
-		/** Double write buffer recovery. */
-		DBLWR_RECOVER = 4,
-
-		/** Enumarations below can be ORed to READ/WRITE above*/
-
-		/** Data file */
-		DATA_FILE = 8,
-
-		/** Log file request*/
-		LOG = 16,
-
-		/** Disable partial read warnings */
-		DISABLE_PARTIAL_IO_WARNINGS = 32,
-
-		/** Do not to wake i/o-handler threads, but the caller will do
-		the waking explicitly later, in this way the caller can post
-		several requests in a batch; NOTE that the batch must not be
-		so big that it exhausts the slots in AIO arrays! NOTE that
-		a simulated batch may introduce hidden chances of deadlocks,
-		because I/Os are not actually handled until all
-		have been posted: use with great caution! */
-		DO_NOT_WAKE = 64,
-
-		/** Ignore failed reads of non-existent pages */
-		IGNORE_MISSING = 128,
-
-		/** Use punch hole if available*/
-		PUNCH_HOLE = 256,
-	};
-
-	/** Default constructor */
-	IORequest()
-		:
-		m_bpage(NULL),
-		m_fil_node(NULL),
-		m_type(READ)
-	{
-		/* No op */
-	}
-
-	/**
-	@param[in]	type		Request type, can be a value that is
-					ORed from the above enum */
-	explicit IORequest(ulint type)
-		:
-		m_bpage(NULL),
-		m_fil_node(NULL),
-		m_type(static_cast<uint16_t>(type))
-	{
-		if (!is_punch_hole_supported()) {
-			clear_punch_hole();
-		}
-	}
-
-	/**
-	@param[in]	type		Request type, can be a value that is
-					ORed from the above enum
-	@param[in]	bpage		Page to be written */
-	IORequest(ulint type, buf_page_t* bpage)
-		:
-		m_bpage(bpage),
-		m_fil_node(NULL),
-		m_type(static_cast<uint16_t>(type))
-	{
-		if (bpage && buf_page_should_punch_hole(bpage)) {
-			set_punch_hole();
-		}
-
-		if (!is_punch_hole_supported()) {
-			clear_punch_hole();
-		}
-	}
-
-	/** Destructor */
-	~IORequest() { }
-
-	/** @return true if ignore missing flag is set */
-	static bool ignore_missing(ulint type)
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((type & IGNORE_MISSING) == IGNORE_MISSING);
-	}
-
-	/** @return true if it is a read request */
-	bool is_read() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & READ) == READ);
-	}
-
-	/** @return true if it is a write request */
-	bool is_write() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & WRITE) == WRITE);
-	}
-
-	/** @return true if it is a redo log write */
-	bool is_log() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & LOG) == LOG);
-	}
-
-	/** @return true if the simulated AIO thread should be woken up */
-	bool is_wake() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & DO_NOT_WAKE) == 0);
-	}
-
-	/** Clear the punch hole flag */
-	void clear_punch_hole()
-	{
-		m_type &= ~PUNCH_HOLE;
-	}
-
-	/** @return true if partial read warning disabled */
-	bool is_partial_io_warning_disabled() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & DISABLE_PARTIAL_IO_WARNINGS)
-		       == DISABLE_PARTIAL_IO_WARNINGS);
-	}
-
-	/** Disable partial read warnings */
-	void disable_partial_io_warnings()
-	{
-		m_type |= DISABLE_PARTIAL_IO_WARNINGS;
-	}
-
-	/** @return true if missing files should be ignored */
-	bool ignore_missing() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return(ignore_missing(m_type));
-	}
-
-	/** @return true if punch hole should be used */
-	bool punch_hole() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & PUNCH_HOLE) == PUNCH_HOLE);
-	}
-
-	/** @return true if the read should be validated */
-	bool validate() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return(is_read() ^ is_write());
-	}
-
-	/** Set the punch hole flag */
-	void set_punch_hole()
-	{
-		if (is_punch_hole_supported()) {
-			m_type |= PUNCH_HOLE;
-		}
-	}
-
-	/** Clear the do not wake flag */
-	void clear_do_not_wake()
-	{
-		m_type &= ~DO_NOT_WAKE;
-	}
-
-	/** Set the pointer to file node for IO
-	@param[in] node			File node */
-	inline void set_fil_node(fil_node_t* node);
-
-	bool operator==(const IORequest& rhs) const
-	{
-		return(m_type == rhs.m_type);
-	}
-
-	/** Note that the IO is for double write recovery. */
-	void dblwr_recover()
-	{
-		m_type |= DBLWR_RECOVER;
-	}
-
-	/** @return true if the request is from the dblwr recovery */
-	bool is_dblwr_recover() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return((m_type & DBLWR_RECOVER) == DBLWR_RECOVER);
-	}
-
-	/** @return true if punch hole is supported */
-	static bool is_punch_hole_supported()
-	{
-
-		/* In this debugging mode, we act as if punch hole is supported,
-		and then skip any calls to actually punch a hole here.
-		In this way, Transparent Page Compression is still being tested. */
-		DBUG_EXECUTE_IF("ignore_punch_hole",
-			return(true);
-		);
-
-#if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
-		return(true);
-#else
-		return(false);
-#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || _WIN32 */
-	}
-
-	ulint get_trim_length(ulint write_length) const
-	{
-		return (m_bpage ?
-			buf_page_get_trim_length(m_bpage, write_length)
-			: 0);
-	}
-
-	inline bool should_punch_hole() const;
-
-	/** Free storage space associated with a section of the file.
-	@param[in]	fh		Open file handle
-	@param[in]	off		Starting offset (SEEK_SET)
-	@param[in]	len		Size of the hole
-	@return DB_SUCCESS or error code */
-	dberr_t punch_hole(os_file_t fh, os_offset_t off, ulint len);
+  enum Type
+  {
+    /** Synchronous read */
+    READ_SYNC= 2,
+    /** Asynchronous read; some errors will be ignored */
+    READ_ASYNC= READ_SYNC | 1,
+    /** Possibly partial read; only used with
+    os_file_read_no_error_handling() */
+    READ_MAYBE_PARTIAL= READ_SYNC | 4,
+    /** Read for doublewrite buffer recovery */
+    DBLWR_RECOVER= READ_SYNC | 8,
+    /** Synchronous write */
+    WRITE_SYNC= 16,
+    /** Asynchronous write */
+    WRITE_ASYNC= WRITE_SYNC | 1,
+    /** A doublewrite batch */
+    DBLWR_BATCH= WRITE_ASYNC | 8,
+    /** Write data; evict the block on write completion */
+    WRITE_LRU= WRITE_ASYNC | 32,
+    /** Write data and punch hole for the rest */
+    PUNCH= WRITE_ASYNC | 64,
+    /** Write data and punch hole; evict the block on write completion */
+    PUNCH_LRU= PUNCH | WRITE_LRU,
+    /** Zero out a range of bytes in fil_space_t::io() */
+    PUNCH_RANGE= WRITE_SYNC | 128,
+  };
+
+  constexpr IORequest(buf_page_t *bpage, fil_node_t *node, Type type) :
+    bpage(bpage), node(node), type(type) {}
+
+  constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) :
+    bpage(bpage), type(type) {}
+
+  bool is_read() const { return (type & READ_SYNC) != 0; }
+  bool is_write() const { return (type & WRITE_SYNC) != 0; }
+  bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
+  bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+
+  /** If requested, free storage space associated with a section of the file.
+  @param off   byte offset from the start (SEEK_SET)
+  @param len   size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t maybe_punch_hole(os_offset_t off, ulint len)
+  {
+    return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
+      ? punch_hole(off, len)
+      : DB_SUCCESS;
+  }
 
 private:
-	/** Page to be written on write operation. */
-	buf_page_t*		m_bpage;
+  /** Free storage space associated with a section of the file.
+  @param off   byte offset from the start (SEEK_SET)
+  @param len   size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t punch_hole(os_offset_t off, ulint len) const;
+
+public:
+  /** Page to be written on write operation */
+  buf_page_t* const bpage= nullptr;
 
-	/** File node */
-	fil_node_t*		m_fil_node;
+  /** File node */
+  fil_node_t *const node= nullptr;
 
-	/** Request type bit flags */
-	uint16_t		m_type;
+  /** Request type bit flags */
+  const Type type;
 };
 
-/* @} */
+constexpr IORequest IORequestRead(IORequest::READ_SYNC);
+constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
+constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
 
 /** Sparse file size information. */
 struct os_file_size_t {
@@ -435,27 +260,7 @@ struct os_file_size_t {
 	os_offset_t	m_alloc_size;
 };
 
-/** Win NT does not allow more than 64 */
-static const ulint OS_AIO_N_PENDING_IOS_PER_THREAD = 32;
-
-/** Modes for aio operations @{ */
-/** Normal asynchronous i/o not for ibuf pages or ibuf bitmap pages */
-static const ulint OS_AIO_NORMAL = 21;
-
-/**  Asynchronous i/o for ibuf pages or ibuf bitmap pages */
-static const ulint OS_AIO_IBUF = 22;
-
-/** Asynchronous i/o for the log */
-static const ulint OS_AIO_LOG = 23;
-
-/** Asynchronous i/o where the calling thread will itself wait for
-the i/o to complete, doing also the job of the i/o-handler thread;
-can be used for any pages, ibuf or non-ibuf.  This is used to save
-CPU time, as we can do with fewer thread switches. Plain synchronous
-I/O is not as good, because it must serialize the file seek and read
-or write, causing a bottleneck for parallelism. */
-static const ulint OS_AIO_SYNC = 24;
-/* @} */
+constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
 
 extern Atomic_counter<ulint> os_n_file_reads;
 extern ulint	os_n_file_writes;
@@ -638,8 +443,7 @@ Closes a file handle. In case of error, error number can be retrieved with
 os_file_get_last_error.
 @param[in]	file		own: handle to a file
 @return true if success */
-bool
-os_file_close_func(os_file_t file);
+bool os_file_close_func(os_file_t file);
 
 #ifdef UNIV_PFS_IO
 
@@ -682,10 +486,12 @@ do {									\
 	register_pfs_file_open_begin(state, locker, key, op, name,	\
 					src_file, src_line)		\
 
-# define register_pfs_file_rename_end(locker, result)			\
+# define register_pfs_file_rename_end(locker, from, to, result)		\
 do {									\
-	if (locker != NULL) {				\
-		PSI_FILE_CALL(end_file_open_wait)(locker, result);	\
+	if (locker != NULL) {						\
+		 PSI_FILE_CALL(						\
+			end_file_rename_wait)(				\
+			locker, from, to, result);			\
 	}								\
 } while (0)
 
@@ -761,12 +567,6 @@ The wrapper functions have the prefix of "innodb_". */
 # define os_file_close(file)						\
 	pfs_os_file_close_func(file, __FILE__, __LINE__)
 
-# define os_aio(type, mode, name, file, buf, offset,		\
-	n, read_only, message1, message2)			\
-	pfs_os_aio_func(type, mode, name, file, buf, offset,	\
-		n, read_only, message1, message2,		\
-			__FILE__, __LINE__)
-
 # define os_file_read(type, file, buf, offset, n)			\
 	pfs_os_file_read_func(type, file, buf, offset, n, __FILE__, __LINE__)
 
@@ -946,44 +746,6 @@ pfs_os_file_read_no_error_handling_func(
 	const char*		src_file,
 	uint			src_line);
 
-/** NOTE! Please use the corresponding macro os_aio(), not directly this
-function!
-Performance schema wrapper function of os_aio() which requests
-an asynchronous I/O operation.
-@param[in,out]	type		IO request context
-@param[in]	mode		IO mode
-@param[in]	name		Name of the file or path as NUL terminated
-				string
-@param[in]	file		Open file handle
-@param[out]	buf		buffer where to read
-@param[in]	offset		file offset where to read
-@param[in]	n		number of bytes to read
-@param[in]	read_only	if true read only mode checks are enforced
-@param[in,out]	m1		Message for the AIO handler, (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@param[in,out]	m2		message for the AIO handler (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@param[in]	src_file	file name where func invoked
-@param[in]	src_line	line where the func invoked
-@return DB_SUCCESS if request was queued successfully, FALSE if fail */
-UNIV_INLINE
-dberr_t
-pfs_os_aio_func(
-	IORequest&	type,
-	ulint		mode,
-	const char*	name,
-	pfs_os_file_t	file,
-	void*		buf,
-	os_offset_t	offset,
-	ulint		n,
-	bool		read_only,
-	fil_node_t*	m1,
-	void*		m2,
-	const char*	src_file,
-	uint		src_line);
-
 /** NOTE! Please use the corresponding macro os_file_write(), not directly
 this function!
 This is the performance schema instrumented wrapper function for
@@ -1026,6 +788,7 @@ pfs_os_file_flush_func(
 	const char*	src_file,
 	uint		src_line);
 
+
 /** NOTE! Please use the corresponding macro os_file_rename(), not directly
 this function!
 This is the performance schema instrumented wrapper function for
@@ -1104,11 +867,6 @@ to original un-instrumented file I/O APIs */
 
 # define os_file_close(file)	os_file_close_func(file)
 
-# define os_aio(type, mode, name, file, buf, offset,			\
-	n, read_only, message1, message2)			\
-	os_aio_func(type, mode, name, file, buf, offset,		\
-		n, read_only, message1, message2)
-
 # define os_file_read(type, file, buf, offset, n)			\
 	os_file_read_func(type, file, buf, offset, n)
 
@@ -1336,111 +1094,30 @@ void
 unit_test_os_file_get_parent_dir();
 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
 
-/** Initializes the asynchronous io system. Creates one array each for ibuf
-and log i/o. Also creates one array each for read and write where each
-array is divided logically into n_read_segs and n_write_segs
-respectively. The caller must create an i/o handler thread for each
-segment in these arrays. This function also creates the sync array.
-No i/o handler thread needs to be created for that
-@param[in]	n_read_segs	number of reader threads
-@param[in]	n_write_segs	number of writer threads
-@param[in]	n_slots_sync	number of slots in the sync aio array */
-
-bool
-os_aio_init(
-	ulint		n_read_segs,
-	ulint		n_write_segs,
-	ulint		n_slots_sync);
-
 /**
-Frees the asynchronous io system. */
-void
-os_aio_free();
+Initializes the asynchronous io system. */
+int os_aio_init();
 
 /**
-NOTE! Use the corresponding macro os_aio(), not directly this function!
-Requests an asynchronous i/o operation.
-@param[in,out]	type		IO request context
-@param[in]	mode		IO mode
-@param[in]	name		Name of the file or path as NUL terminated
-				string
-@param[in]	file		Open file handle
-@param[out]	buf		buffer where to read
-@param[in]	offset		file offset where to read
-@param[in]	n		number of bytes to read
-@param[in]	read_only	if true read only mode checks are enforced
-@param[in,out]	m1		Message for the AIO handler, (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@param[in,out]	m2		message for the AIO handler (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@return DB_SUCCESS or error code */
-dberr_t
-os_aio_func(
-	IORequest&	type,
-	ulint		mode,
-	const char*	name,
-	pfs_os_file_t	file,
-	void*		buf,
-	os_offset_t	offset,
-	ulint		n,
-	bool		read_only,
-	fil_node_t*	m1,
-	void*		m2);
+Frees the asynchronous io system. */
+void os_aio_free();
 
-/** Wakes up all async i/o threads so that they know to exit themselves in
-shutdown. */
-void
-os_aio_wake_all_threads_at_shutdown();
+/** Request a read or write.
+@param type		I/O request
+@param buf		buffer
+@param offset		file offset
+@param n		number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
 
-/** Waits until there are no pending writes in os_aio_write_array. There can
-be other, synchronous, pending writes. */
-void
-os_aio_wait_until_no_pending_writes();
+/** Wait until there are no pending asynchronous writes. */
+void os_aio_wait_until_no_pending_writes();
 
-/** Wakes up simulated aio i/o-handler threads if they have something to do. */
-void
-os_aio_simulated_wake_handler_threads();
 
-#ifdef _WIN32
-/** This function can be called if one wants to post a batch of reads and
-prefers an i/o-handler thread to handle them all at once later. You must
-call os_aio_simulated_wake_handler_threads later to ensure the threads
-are not left sleeping! */
-void
-os_aio_simulated_put_read_threads_to_sleep();
-#else /* _WIN32 */
-# define os_aio_simulated_put_read_threads_to_sleep()
-#endif /* _WIN32 */
+/** Wait until there are no pending asynchronous reads. */
+void os_aio_wait_until_no_pending_reads();
 
-/** This is the generic AIO handler interface function.
-Waits for an aio operation to complete. This function is used to wait the
-for completed requests. The AIO array of pending requests is divided
-into segments. The thread specifies which segment or slot it wants to wait
-for. NOTE: this function will also take care of freeing the aio slot,
-therefore no other thread is allowed to do the freeing!
-@param[in]	segment		the number of the segment in the aio arrays to
-				wait for; segment 0 is the ibuf I/O thread,
-				segment 1 the log I/O thread, then follow the
-				non-ibuf read threads, and as the last are the
-				non-ibuf write threads; if this is
-				ULINT_UNDEFINED, then it means that sync AIO
-				is used, and this parameter is ignored
-@param[out]	m1		the messages passed with the AIO request;
-				note that also in the case where the AIO
-				operation failed, these output parameters
-				are valid and can be used to restart the
-				operation, for example
-@param[out]	m2		callback message
-@param[out]	type		OS_FILE_WRITE or ..._READ
-@return DB_SUCCESS or error code */
-dberr_t
-os_aio_handler(
-	ulint		segment,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	type);
 
 /** Prints info of the aio arrays.
 @param[in/out]	file		file where to print */
@@ -1456,14 +1133,6 @@ no pending io operations. */
 bool
 os_aio_all_slots_free();
 
-#ifdef UNIV_DEBUG
-
-/** Prints all pending IO
-@param[in]	file	file where to print */
-void
-os_aio_print_pending_io(FILE* file);
-
-#endif /* UNIV_DEBUG */
 
 /** This function returns information about the specified file
 @param[in]	path		pathname of the file
@@ -1479,15 +1148,6 @@ os_file_get_status(
 	bool		check_rw_perm,
 	bool		read_only);
 
-/** Creates a temporary file in the location specified by the parameter
-path. If the path is NULL then it will be created on --tmpdir location.
-This function is defined in ha_innodb.cc.
-@param[in]	path	location for creating temporary file
-@return temporary file descriptor, or < 0 on error */
-os_file_t
-innobase_mysql_tmpfile(
-	const char*	path);
-
 /** Set the file create umask
 @param[in]	umask		The umask to use for file creation. */
 void
diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl
index e01fcb41afb..e88f94b8ff3 100644
--- a/storage/innobase/include/os0file.inl
+++ b/storage/innobase/include/os0file.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2019, MariaDB Corporation.
+Copyright (c) 2013, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -201,63 +201,6 @@ pfs_os_file_close_func(
 	return(result);
 }
 
-/** NOTE! Please use the corresponding macro os_aio(), not directly this
-function!
-Performance schema wrapper function of os_aio() which requests
-an asynchronous i/o operation.
-@param[in,type]	type		IO request context
-@param[in]	mode		IO mode
-@param[in]	name		Name of the file or path as NUL terminated
-				string
-@param[in]	file		Open file handle
-@param[out]	buf		buffer where to read
-@param[in]	offset		file offset where to read
-@param[in]	n		number of bytes to read
-@param[in]	read_only	if true read only mode checks are enforced
-@param[in,out]	m1		Message for the AIO handler, (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@param[in,out]	m2		message for the AIO handler (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@param[in]	src_file	file name where func invoked
-@param[in]	src_line	line where the func invoked
-@return DB_SUCCESS if request was queued successfully, FALSE if fail */
-UNIV_INLINE
-dberr_t
-pfs_os_aio_func(
-	IORequest&	type,
-	ulint		mode,
-	const char*	name,
-	pfs_os_file_t	file,
-	void*		buf,
-	os_offset_t	offset,
-	ulint		n,
-	bool		read_only,
-	fil_node_t*	m1,
-	void*		m2,
-	const char*	src_file,
-	uint		src_line)
-{
-	PSI_file_locker_state	state;
-	struct PSI_file_locker*	locker = NULL;
-
-	ut_ad(type.validate());
-
-	/* Register the read or write I/O depending on "type" */
-	register_pfs_file_io_begin(
-		&state, locker, file, n,
-		type.is_write() ? PSI_FILE_WRITE : PSI_FILE_READ,
-		src_file, src_line);
-
-	dberr_t	result = os_aio_func(
-		type, mode, name, file, buf, offset, n, read_only, m1, m2);
-
-	register_pfs_file_io_end(locker, n);
-
-	return(result);
-}
-
 /** NOTE! Please use the corresponding macro os_file_read(), not directly
 this function!
 This is the performance schema instrumented wrapper function for
@@ -284,8 +227,6 @@ pfs_os_file_read_func(
 	PSI_file_locker_state	state;
 	struct PSI_file_locker*	locker = NULL;
 
-	ut_ad(type.validate());
-
 	register_pfs_file_io_begin(
 		&state, locker, file, n, PSI_FILE_READ, src_file, src_line);
 
@@ -433,13 +374,13 @@ pfs_os_file_rename_func(
 	PSI_file_locker_state	state;
 	struct PSI_file_locker*	locker = NULL;
 
-	register_pfs_file_open_begin(
+	register_pfs_file_rename_begin(
 		&state, locker, key, PSI_FILE_RENAME, newpath,
 		src_file, src_line);
 
 	bool	result = os_file_rename_func(oldpath, newpath);
 
-	register_pfs_file_rename_end(locker, 0);
+	register_pfs_file_rename_end(locker, oldpath, newpath, !result);
 
 	return(result);
 }
diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h
deleted file mode 100644
index d8952a56cc9..00000000000
--- a/storage/innobase/include/os0proc.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/os0proc.h
-The interface to the operating system
-process control primitives
-
-Created 9/30/1995 Heikki Tuuri
-*******************************************************/
-
-#ifndef os0proc_h
-#define os0proc_h
-
-#include "univ.i"
-
-#ifdef UNIV_LINUX
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#endif
-
-typedef void*			os_process_t;
-typedef unsigned long int	os_process_id_t;
-
-/** The total amount of memory currently allocated from the operating
-system with os_mem_alloc_large(). */
-extern Atomic_counter<ulint>	os_total_large_mem_allocated;
-
-/** Converts the current process id to a number.
-@return process id as a number */
-ulint
-os_proc_get_number(void);
-
-/** Allocates large pages memory.
-@param[in,out]	n	Number of bytes to allocate
-@return allocated memory */
-void*
-os_mem_alloc_large(
-	ulint*	n);
-
-/** Frees large pages memory.
-@param[in]	ptr	pointer returned by os_mem_alloc_large()
-@param[in]	size	size returned by os_mem_alloc_large() */
-void
-os_mem_free_large(
-	void	*ptr,
-	ulint	size);
-
-#endif
diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h
index 67ee3097274..ed989045f18 100644
--- a/storage/innobase/include/os0thread.h
+++ b/storage/innobase/include/os0thread.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -46,10 +46,6 @@ typedef LPTHREAD_START_ROUTINE	os_thread_func_t;
 
 /** Macro for specifying a Windows thread start function. */
 #define DECLARE_THREAD(func)	WINAPI func
-
-#define os_thread_create(f,a,i)	\
-	os_thread_create_func(f, a, i)
-
 #else
 
 typedef pthread_t		os_thread_t;
@@ -60,8 +56,6 @@ extern "C"  { typedef void*	(*os_thread_func_t)(void*); }
 
 /** Macro for specifying a POSIX thread start function. */
 #define DECLARE_THREAD(func)	func
-#define os_thread_create(f,a,i)	os_thread_create_func(f, a, i)
-
 #endif /* _WIN32 */
 
 /* Define a function pointer type to use in a typecast */
@@ -72,25 +66,16 @@ typedef void* (*os_posix_f_t) (void*);
 typedef unsigned int    mysql_pfs_key_t;
 #endif /* HAVE_PSI_INTERFACE */
 
-/** Number of threads active. */
-extern	Atomic_counter<ulint>	os_thread_count;
-
-/***************************************************************//**
-Compares two thread ids for equality.
-@return TRUE if equal */
-ibool
-os_thread_eq(
-/*=========*/
-	os_thread_id_t	a,	/*!< in: OS thread or thread id */
-	os_thread_id_t	b);	/*!< in: OS thread or thread id */
-/****************************************************************//**
-Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
-unique for the thread though!
-@return thread identifier as a number */
-ulint
-os_thread_pf(
-/*=========*/
-	os_thread_id_t	a);	/*!< in: OS thread identifier */
+#ifndef _WIN32
+#define os_thread_eq(a,b) pthread_equal(a, b)
+#define os_thread_yield() sched_yield()
+#define os_thread_get_curr_id() pthread_self()
+#else
+bool os_thread_eq(os_thread_id_t a, os_thread_id_t b);
+void os_thread_yield();
+os_thread_id_t os_thread_get_curr_id();
+#endif
+
 /****************************************************************//**
 Creates a new thread of execution. The execution starts from
 the function given.
@@ -98,41 +83,12 @@ NOTE: We count the number of threads in os_thread_exit(). A created
 thread should always use that to exit so thatthe thread count will be
 decremented.
 We do not return an error code because if there is one, we crash here. */
-os_thread_t
-os_thread_create_func(
-/*==================*/
-	os_thread_func_t	func,		/*!< in: pointer to function
-						from which to start */
-	void*			arg,		/*!< in: argument to start
-						function */
-	os_thread_id_t*		thread_id);	/*!< out: id of the created
-						thread, or NULL */
-
-/** Waits until the specified thread completes and joins it.
-Its return value is ignored.
-@param[in,out]	thread	thread to join */
-void
-os_thread_join(
-	os_thread_id_t	thread);
+os_thread_t os_thread_create(os_thread_func_t func, void *arg= nullptr);
 
-/** Exits the current thread.
-@param[in]	detach	if true, the thread will be detached right before
-exiting. If false, another thread is responsible for joining this thread */
-ATTRIBUTE_NORETURN ATTRIBUTE_COLD
-void os_thread_exit(bool detach = true);
+/** Detach and terminate the current thread. */
+ATTRIBUTE_NORETURN void os_thread_exit();
 
 /*****************************************************************//**
-Returns the thread identifier of current thread.
-@return current thread identifier */
-os_thread_id_t
-os_thread_get_curr_id(void);
-/*========================*/
-/*****************************************************************//**
-Advises the os to give up remainder of the thread's time slice. */
-void
-os_thread_yield(void);
-/*=================*/
-/*****************************************************************//**
 The thread sleeps at least the time given in microseconds. */
 void
 os_thread_sleep(
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
index 1de6952b7eb..6ce31dea0c1 100644
--- a/storage/innobase/include/page0cur.h
+++ b/storage/innobase/include/page0cur.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,13 +27,7 @@ Created 10/4/1994 Heikki Tuuri
 #ifndef page0cur_h
 #define page0cur_h
 
-#include "buf0types.h"
 #include "page0page.h"
-#include "rem0types.h"
-#include "rem0rec.h"
-#include "data0data.h"
-#include "mtr0mtr.h"
-#include "gis0type.h"
 
 #ifdef UNIV_DEBUG
 /*********************************************************//**
@@ -74,6 +68,7 @@ page_cur_get_rec(
 # define page_cur_get_page_zip(cur)	buf_block_get_page_zip((cur)->block)
 # define page_cur_get_rec(cur)		(cur)->rec
 #endif /* UNIV_DEBUG */
+# define is_page_cur_get_page_zip(cur)	is_buf_block_get_page_zip((cur)->block)
 /*********************************************************//**
 Sets the cursor object to point before the first user record
 on the page. */
@@ -155,29 +150,8 @@ page_cur_tuple_insert(
 	rec_offs**	offsets,/*!< out: offsets on *rec */
 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
-	MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result));
-/***********************************************************//**
-Inserts a record next to page cursor. Returns pointer to inserted record if
-succeed, i.e., enough space available, NULL otherwise. The cursor stays at
-the same logical position, but the physical position may change if it is
-pointing to a compressed page that was reorganized.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return pointer to record if succeed, NULL otherwise */
-UNIV_INLINE
-rec_t*
-page_cur_rec_insert(
-/*================*/
-	page_cur_t*	cursor,	/*!< in/out: a page cursor */
-	const rec_t*	rec,	/*!< in: record to insert */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
 /***********************************************************//**
 Inserts a record next to page cursor on an uncompressed page.
 Returns pointer to inserted record if succeed, i.e., enough
@@ -186,13 +160,12 @@ space available, NULL otherwise. The cursor stays at the same position.
 rec_t*
 page_cur_insert_rec_low(
 /*====================*/
-	rec_t*		current_rec,/*!< in: pointer to current record after
-				which the new record is inserted */
+	const page_cur_t*cur,	/*!< in: page cursor */
 	dict_index_t*	index,	/*!< in: record descriptor */
-	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	const rec_t*	rec,	/*!< in: record to insert after cur */
 	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
-	MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
 
 /***********************************************************//**
 Inserts a record next to page cursor on a compressed and uncompressed
@@ -213,23 +186,8 @@ page_cur_insert_rec_zip(
 	dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_t*	rec,	/*!< in: pointer to a physical record */
 	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
-	MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
-/*************************************************************//**
-Copies records from page to a newly created page, from a given record onward,
-including that record. Infimum and supremum records are not copied.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit(). */
-void
-page_copy_rec_list_end_to_created_page(
-/*===================================*/
-	page_t*		new_page,	/*!< in/out: index page to copy to */
-	rec_t*		rec,		/*!< in: first record to copy */
-	dict_index_t*	index,		/*!< in: record descriptor */
-	mtr_t*		mtr);		/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
 /***********************************************************//**
 Deletes a record at the page cursor. The cursor is moved to the
 next record after the deleted one. */
@@ -240,7 +198,58 @@ page_cur_delete_rec(
 	const dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_offs*		offsets,/*!< in: rec_get_offsets(
 					cursor->rec, index) */
-	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block      B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr    encoded fixed-size header bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+                                 ulint prev, ulint enc_hdr,
+                                 size_t hdr_c, size_t data_c,
+                                 const void *data, size_t data_len);
+
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift      if reuse: number of bytes by which PAGE_FREE is moving
+@param enc_hdr_l  number of copied record header bytes, plus record type bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+                               ulint prev, ulint shift, ulint enc_hdr_l,
+                               size_t hdr_c, size_t data_c,
+                               const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block    B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev     byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+                               size_t hdr_size, size_t data_size);
 
 /** Search the right position for a page cursor.
 @param[in] block buffer block
@@ -326,67 +335,6 @@ page_cur_open_on_rnd_user_rec(
 /*==========================*/
 	buf_block_t*	block,	/*!< in: page */
 	page_cur_t*	cursor);/*!< out: page cursor */
-/** Write a redo log record of inserting a record into an index page.
-@param[in]	insert_rec	inserted record
-@param[in]	rec_size	rec_get_size(insert_rec)
-@param[in]	cursor_rec	predecessor of insert_rec
-@param[in,out]	index		index tree
-@param[in,out]	mtr		mini-transaction */
-void
-page_cur_insert_rec_write_log(
-	const rec_t*	insert_rec,
-	ulint		rec_size,
-	const rec_t*	cursor_rec,
-	dict_index_t*	index,
-	mtr_t*		mtr)
-	MY_ATTRIBUTE((nonnull));
-/***********************************************************//**
-Parses a log record of a record insert on a page.
-@return end of log record or NULL */
-byte*
-page_cur_parse_insert_rec(
-/*======================*/
-	ibool		is_short,/*!< in: TRUE if short inserts */
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in: page or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr);	/*!< in: mtr or NULL */
-/**********************************************************//**
-Parses a log record of copying a record list end to a new created page.
-@return end of log record or NULL */
-byte*
-page_parse_copy_rec_list_to_created_page(
-/*=====================================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in: page or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr);	/*!< in: mtr or NULL */
-/***********************************************************//**
-Parses log record of a record delete on a page.
-@return pointer to record end or NULL */
-byte*
-page_cur_parse_delete_rec(
-/*======================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in: page or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr);	/*!< in: mtr or NULL */
-/*******************************************************//**
-Removes the record from a leaf page. This function does not log
-any changes. It is used by the IMPORT tablespace functions.
-@return true if success, i.e., the page did not become too empty */
-bool
-page_delete_rec(
-/*============*/
-	const dict_index_t*	index,	/*!< in: The index that the record
-					belongs to */
-	page_cur_t*		pcur,	/*!< in/out: page cursor on record
-					to delete */
-	page_zip_des_t*		page_zip,/*!< in: compressed page descriptor */
-	const rec_offs*		offsets);/*!< in: offsets for record */
 
 /** Index page cursor */
 
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
index e53f6d8f463..828be6840d2 100644
--- a/storage/innobase/include/page0cur.inl
+++ b/storage/innobase/include/page0cur.inl
@@ -257,7 +257,7 @@ page_cur_tuple_insert(
 	rec_offs**	offsets,/*!< out: offsets on *rec */
 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	rec_t*		rec;
 	ulint		size = rec_get_converted_size(index, tuple, n_ext);
@@ -278,45 +278,14 @@ page_cur_tuple_insert(
 				   ULINT_UNDEFINED, heap);
 	ut_ad(size == rec_offs_size(*offsets));
 
-	if (buf_block_get_page_zip(cursor->block)) {
+	if (is_buf_block_get_page_zip(cursor->block)) {
 		rec = page_cur_insert_rec_zip(
 			cursor, index, rec, *offsets, mtr);
 	} else {
-		rec = page_cur_insert_rec_low(cursor->rec,
+		rec = page_cur_insert_rec_low(cursor,
 					      index, rec, *offsets, mtr);
 	}
 
 	ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
 	return(rec);
 }
-
-/***********************************************************//**
-Inserts a record next to page cursor. Returns pointer to inserted record if
-succeed, i.e., enough space available, NULL otherwise. The cursor stays at
-the same logical position, but the physical position may change if it is
-pointing to a compressed page that was reorganized.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return pointer to record if succeed, NULL otherwise */
-UNIV_INLINE
-rec_t*
-page_cur_rec_insert(
-/*================*/
-	page_cur_t*	cursor,	/*!< in/out: a page cursor */
-	const rec_t*	rec,	/*!< in: record to insert */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
-{
-	if (buf_block_get_page_zip(cursor->block)) {
-		return(page_cur_insert_rec_zip(
-			       cursor, index, rec, offsets, mtr));
-	} else {
-		return(page_cur_insert_rec_low(cursor->rec,
-					       index, rec, offsets, mtr));
-	}
-}
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index e80a9a050fe..eb6bf56e8dd 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -31,6 +31,7 @@ Created 2/2/1994 Heikki Tuuri
 #include "fil0fil.h"
 #include "buf0buf.h"
 #include "rem0rec.h"
+#include "mach0data.h"
 #ifndef UNIV_INNOCHECKSUM
 #include "dict0dict.h"
 #include "data0data.h"
@@ -42,8 +43,6 @@ Created 2/2/1994 Heikki Tuuri
 Index page header starts at the first offset left free by the FIL-module */
 
 typedef	byte		page_header_t;
-#else
-# include "mach0data.h"
 #endif /* !UNIV_INNOCHECKSUM */
 
 #define	PAGE_HEADER	FSEG_PAGE_DATA	/* index page header starts at this
@@ -172,7 +171,6 @@ constexpr uint16_t PAGE_NO_DIRECTION= 5;
 */
 
 typedef	byte			page_dir_slot_t;
-typedef page_dir_slot_t		page_dir_t;
 
 /* Offset of the directory start down from the page end. We call the
 slot with the highest file address directory start, as it points to
@@ -180,7 +178,7 @@ the first record in the list of records. */
 #define	PAGE_DIR		FIL_PAGE_DATA_END
 
 /* We define a slot in the page directory as two bytes */
-#define	PAGE_DIR_SLOT_SIZE	2
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
 
 /* The offset of the physically lower end of the directory, counted from
 page end, when the page is empty */
@@ -199,22 +197,23 @@ extern my_bool srv_immediate_scrub_data_uncompressed;
 @param[in]	ptr	pointer within a page frame
 @return start of the page frame */
 MY_ATTRIBUTE((const))
-inline
-page_t*
-page_align(const void* ptr)
+inline page_t* page_align(void *ptr)
+{
+  return my_assume_aligned<UNIV_PAGE_SIZE_MIN>
+    (reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size)));
+}
+inline const page_t *page_align(const void *ptr)
 {
-	return(static_cast<page_t*>(ut_align_down(ptr, srv_page_size)));
+  return page_align(const_cast<void*>(ptr));
 }
 
 /** Gets the byte offset within a page frame.
 @param[in]	ptr	pointer within a page frame
 @return offset from the start of the page */
 MY_ATTRIBUTE((const))
-inline
-ulint
-page_offset(const void*	ptr)
+inline uint16_t page_offset(const void*	ptr)
 {
-	return(ut_align_offset(ptr, srv_page_size));
+  return static_cast<uint16_t>(ut_align_offset(ptr, srv_page_size));
 }
 
 /** Determine whether an index page is not in ROW_FORMAT=REDUNDANT.
@@ -395,13 +394,50 @@ inline
 bool
 page_rec_is_infimum(const rec_t* rec);
 
-/*************************************************************//**
-Returns the max trx id field value. */
-UNIV_INLINE
-trx_id_t
-page_get_max_trx_id(
-/*================*/
-	const page_t*	page);	/*!< in: page */
+/** Read PAGE_MAX_TRX_ID.
+@param[in]      page    index page
+@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline trx_id_t page_get_max_trx_id(const page_t *page)
+{
+  ut_ad(fil_page_index_page_check(page));
+  static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+  const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_MAX_TRX_ID);
+  return mach_read_from_8(p);
+}
+
+/**
+Set the number of owned records.
+@tparam compressed    whether to update any ROW_FORMAT=COMPRESSED page as well
+@param[in,out]  block   index page
+@param[in,out]  rec     record in block.frame
+@param[in]      n_owned number of records skipped in the sparse page directory
+@param[in]      comp    whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@param[in,out]  mtr     mini-transaction */
+template<bool compressed>
+inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
+                                 bool comp, mtr_t *mtr)
+{
+  ut_ad(block->frame == page_align(rec));
+  ut_ad(comp == (page_is_comp(block->frame) != 0));
+
+  if (page_zip_des_t *page_zip= compressed
+      ? buf_block_get_page_zip(block) : nullptr)
+  {
+    ut_ad(comp);
+    rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    if (rec_get_status(rec) != REC_STATUS_SUPREMUM)
+      page_zip_rec_set_owned(block, rec, n_owned, mtr);
+  }
+  else
+  {
+    rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) |
+                                   (n_owned << REC_N_OWNED_SHIFT));
+  }
+}
+
 /*************************************************************//**
 Sets the max trx id field value. */
 void
@@ -426,7 +462,6 @@ page_update_max_trx_id(
 
 /** Persist the AUTO_INCREMENT value on a clustered index root page.
 @param[in,out]	block	clustered index root page
-@param[in]	index	clustered index
 @param[in]	autoinc	next available AUTO_INCREMENT value
 @param[in,out]	mtr	mini-transaction
 @param[in]	reset	whether to reset the AUTO_INCREMENT
@@ -435,20 +470,11 @@ page_update_max_trx_id(
 void
 page_set_autoinc(
 	buf_block_t*		block,
-	const dict_index_t*	index MY_ATTRIBUTE((unused)),
 	ib_uint64_t		autoinc,
 	mtr_t*			mtr,
 	bool			reset)
 	MY_ATTRIBUTE((nonnull));
 
-/** Read the AUTO_INCREMENT value from a clustered index root page.
-@param[in]	page	clustered index root page
-@return	the persisted AUTO_INCREMENT value */
-MY_ATTRIBUTE((nonnull, warn_unused_result))
-UNIV_INLINE
-ib_uint64_t
-page_get_autoinc(const page_t* page);
-
 /*************************************************************//**
 Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
 @return SPLIT SEQUENCE NUMBER */
@@ -470,28 +496,16 @@ page_set_ssn_id(
 	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 
 #endif /* !UNIV_INNOCHECKSUM */
-/*************************************************************//**
-Reads the given header field. */
-UNIV_INLINE
-uint16_t
-page_header_get_field(
-/*==================*/
-	const page_t*	page,	/*!< in: page */
-	ulint		field);	/*!< in: PAGE_N_DIR_SLOTS, ... */
+/** Read a page header field. */
+inline uint16_t page_header_get_field(const page_t *page, ulint field)
+{
+  ut_ad(field <= PAGE_INDEX_ID);
+  ut_ad(!(field & 1));
+  return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page));
+}
 
 #ifndef UNIV_INNOCHECKSUM
 /*************************************************************//**
-Sets the given header field. */
-UNIV_INLINE
-void
-page_header_set_field(
-/*==================*/
-	page_t*		page,	/*!< in/out: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	ulint		field,	/*!< in: PAGE_N_DIR_SLOTS, ... */
-	ulint		val);	/*!< in: value */
-/*************************************************************//**
 Returns the offset stored in the given header field.
 @return offset from the start of the page, or 0 */
 UNIV_INLINE
@@ -507,29 +521,13 @@ Returns the pointer stored in the given header field, or NULL. */
 #define page_header_get_ptr(page, field)			\
 	(page_header_get_offs(page, field)			\
 	 ? page + page_header_get_offs(page, field) : NULL)
-/*************************************************************//**
-Sets the pointer stored in the given header field. */
-UNIV_INLINE
-void
-page_header_set_ptr(
-/*================*/
-	page_t*		page,	/*!< in/out: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	ulint		field,	/*!< in/out: PAGE_FREE, ... */
-	const byte*	ptr);	/*!< in: pointer or NULL*/
 
-/*************************************************************//**
-Resets the last insert info field in the page header. Writes to mlog
-about this operation. */
-UNIV_INLINE
-void
-page_header_reset_last_insert(
-/*==========================*/
-	page_t*		page,	/*!< in: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	mtr_t*		mtr);	/*!< in: mtr */
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
 #define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
 #define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
 
@@ -612,19 +610,6 @@ page_dir_get_n_heap(
 /*================*/
 	const page_t*	page);	/*!< in: index page */
 /*************************************************************//**
-Sets the number of records in the heap. */
-UNIV_INLINE
-void
-page_dir_set_n_heap(
-/*================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL.
-				Note that the size of the dense page directory
-				in the compressed page trailer is
-				n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
-	ulint		n_heap);/*!< in: number of records */
-/*************************************************************//**
 Gets the number of dir slots in directory.
 @return number of slots */
 UNIV_INLINE
@@ -632,31 +617,19 @@ uint16_t
 page_dir_get_n_slots(
 /*=================*/
 	const page_t*	page);	/*!< in: index page */
-/*************************************************************//**
-Sets the number of dir slots in directory. */
-UNIV_INLINE
-void
-page_dir_set_n_slots(
-/*=================*/
-	page_t*		page,	/*!< in/out: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	ulint		n_slots);/*!< in: number of slots */
-#ifdef UNIV_DEBUG
-/*************************************************************//**
-Gets pointer to nth directory slot.
-@return pointer to dir slot */
-UNIV_INLINE
-page_dir_slot_t*
-page_dir_get_nth_slot(
-/*==================*/
-	const page_t*	page,	/*!< in: index page */
-	ulint		n);	/*!< in: position */
-#else /* UNIV_DEBUG */
-# define page_dir_get_nth_slot(page, n)			\
-	((page) + (srv_page_size - PAGE_DIR		\
-		   - (n + 1) * PAGE_DIR_SLOT_SIZE))
-#endif /* UNIV_DEBUG */
+/** Gets the pointer to a directory slot.
+@param n  sparse directory slot number
+@return pointer to the sparse directory slot */
+inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n)
+{
+  ut_ad(page_dir_get_n_slots(page) > n);
+  static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+  return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2);
+}
+inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n)
+{
+  return page_dir_get_nth_slot(const_cast<page_t*>(page), n);
+}
 /**************************************************************//**
 Used to check the consistency of a record on a page.
 @return TRUE if succeed */
@@ -665,22 +638,17 @@ ibool
 page_rec_check(
 /*===========*/
 	const rec_t*	rec);	/*!< in: record */
-/***************************************************************//**
-Gets the record pointed to by a directory slot.
+/** Get the record pointed to by a directory slot.
+@param[in] slot   directory slot
 @return pointer to record */
-UNIV_INLINE
-const rec_t*
-page_dir_slot_get_rec(
-/*==================*/
-	const page_dir_slot_t*	slot);	/*!< in: directory slot */
-/***************************************************************//**
-This is used to set the record offset in a directory slot. */
-UNIV_INLINE
-void
-page_dir_slot_set_rec(
-/*==================*/
-	page_dir_slot_t* slot,	/*!< in: directory slot */
-	rec_t*		 rec);	/*!< in: record on the page */
+inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot)
+{ /* the slot holds a 2-byte value that is added to page_align(slot) */
+  return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot));
+}
+inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
+{ /* const overload: forwards to the non-const variant */
+  return page_dir_slot_get_rec(const_cast<page_dir_slot_t*>(slot));
+}
 /***************************************************************//**
 Gets the number of records owned by a directory slot.
 @return number of records */
@@ -689,15 +657,6 @@ ulint
 page_dir_slot_get_n_owned(
 /*======================*/
 	const page_dir_slot_t*	slot);	/*!< in: page directory slot */
-/***************************************************************//**
-This is used to set the owned records field of a directory slot. */
-UNIV_INLINE
-void
-page_dir_slot_set_n_owned(
-/*======================*/
-	page_dir_slot_t*slot,	/*!< in/out: directory slot */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		n);	/*!< in: number of records owned by the slot */
 /************************************************************//**
 Calculates the space reserved for directory slots of a given
 number of records. The exact value is a fraction number
@@ -754,6 +713,19 @@ inline bool page_has_next(const page_t* page)
 		!= FIL_NULL;
 }
 
+/** Fetch the persisted AUTO_INCREMENT value of a clustered index.
+@param[in]	page	root page of the clustered index
+@return	the AUTO_INCREMENT value stored in the page header */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline uint64_t page_get_autoinc(const page_t *page)
+{
+  ut_d(const uint16_t type= fil_page_get_type(page));
+  ut_ad(type == FIL_PAGE_TYPE_INSTANT || type == FIL_PAGE_INDEX);
+  ut_ad(!page_has_siblings(page));
+  return mach_read_from_8(my_assume_aligned<8>
+                          (page + PAGE_HEADER + PAGE_ROOT_AUTO_INC));
+}
+
 /************************************************************//**
 Gets the pointer to the next record on the page.
 @return pointer to next record */
@@ -790,16 +762,6 @@ page_rec_get_next_non_del_marked(
 /*=============================*/
 	const rec_t*	rec);	/*!< in: pointer to record */
 /************************************************************//**
-Sets the pointer to the next record on the page. */
-UNIV_INLINE
-void
-page_rec_set_next(
-/*==============*/
-	rec_t*		rec,	/*!< in: pointer to record,
-				must not be page supremum */
-	const rec_t*	next);	/*!< in: pointer to next record,
-				must not be page infimum */
-/************************************************************//**
 Gets the pointer to the previous record.
 @return pointer to previous record */
 UNIV_INLINE
@@ -878,15 +840,6 @@ page_rec_is_second_last(
 	const page_t*	page)	/*!< in: page */
 	MY_ATTRIBUTE((warn_unused_result));
 
-/***************************************************************//**
-Looks for the record which owns the given record.
-@return the owner record */
-UNIV_INLINE
-rec_t*
-page_rec_find_owner_rec(
-/*====================*/
-	rec_t*	rec);	/*!< in: the physical record */
-
 /************************************************************//**
 Returns the maximum combined size of records which can be inserted on top
 of record heap.
@@ -916,15 +869,6 @@ page_get_free_space_of_empty(
 /*=========================*/
 	ulint	comp)	/*!< in: nonzero=compact page format */
 		MY_ATTRIBUTE((const));
-/**********************************************************//**
-Returns the base extra size of a physical record.  This is the
-size of the fixed header, independent of the record size.
-@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
-UNIV_INLINE
-ulint
-page_rec_get_base_extra_size(
-/*=========================*/
-	const rec_t*	rec);	/*!< in: physical record */
 /************************************************************//**
 Returns the sum of the sizes of the records in the record list
 excluding the infimum and supremum records.
@@ -934,35 +878,6 @@ uint16_t
 page_get_data_size(
 /*===============*/
 	const page_t*	page);	/*!< in: index page */
-/************************************************************//**
-Allocates a block of memory from the head of the free list
-of an index page. */
-UNIV_INLINE
-void
-page_mem_alloc_free(
-/*================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
-				space available for inserting the record,
-				or NULL */
-	rec_t*		next_rec,/*!< in: pointer to the new head of the
-				free record list */
-	ulint		need);	/*!< in: number of bytes allocated */
-/************************************************************//**
-Puts a record to free list. */
-UNIV_INLINE
-void
-page_mem_free(
-/*==========*/
-	page_t*			page,	/*!< in/out: index page */
-	page_zip_des_t*		page_zip,/*!< in/out: compressed page,
-					 or NULL */
-	rec_t*			rec,	/*!< in: pointer to the (origin of)
-					record */
-	const dict_index_t*	index,	/*!< in: index of rec */
-	const rec_offs*		offsets);/*!< in: array returned by
-					 rec_get_offsets() */
-
 /** Read the PAGE_DIRECTION field from a byte.
 @param[in]	ptr	pointer to PAGE_DIRECTION_B
 @return	the value of the PAGE_DIRECTION field */
@@ -970,13 +885,6 @@ inline
 byte
 page_ptr_get_direction(const byte* ptr);
 
-/** Set the PAGE_DIRECTION field.
-@param[in]	ptr	pointer to PAGE_DIRECTION_B
-@param[in]	dir	the value of the PAGE_DIRECTION field */
-inline
-void
-page_ptr_set_direction(byte* ptr, byte dir);
-
 /** Read the PAGE_DIRECTION field.
 @param[in]	page	index page
 @return	the value of the PAGE_DIRECTION field */
@@ -994,21 +902,14 @@ inline
 uint16_t
 page_get_instant(const page_t* page);
 
+/** Create an uncompressed index page.
+@param[in,out]	block	buffer block
+@param[in,out]	mtr	mini-transaction
+@param[in]	comp	set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp);
 /**********************************************************//**
-Create an uncompressed B-tree index page.
-@return pointer to the page */
-page_t*
-page_create(
-/*========*/
-	buf_block_t*	block,		/*!< in: a buffer block where the
-					page is created */
-	mtr_t*		mtr,		/*!< in: mini-transaction handle */
-	ulint		comp,		/*!< in: nonzero=compact page format */
-	bool		is_rtree);	/*!< in: if creating R-tree page */
-/**********************************************************//**
-Create a compressed B-tree index page.
-@return pointer to the page */
-page_t*
+Create a compressed B-tree index page. */
+void
 page_create_zip(
 /*============*/
 	buf_block_t*		block,		/*!< in/out: a buffer frame
@@ -1153,52 +1054,10 @@ page_move_rec_list_start(
 	dict_index_t*	index,		/*!< in: record descriptor */
 	mtr_t*		mtr)		/*!< in: mtr */
 	MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
-/****************************************************************//**
-Splits a directory slot which owns too many records. */
-void
-page_dir_split_slot(
-/*================*/
-	page_t*		page,	/*!< in: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be written, or NULL */
-	ulint		slot_no)/*!< in: the directory slot */
-	MY_ATTRIBUTE((nonnull(1)));
-/*************************************************************//**
-Tries to balance the given directory slot with too few records
-with the upper neighbor, so that there are at least the minimum number
-of records owned by the slot; this may result in the merging of
-two slots. */
-void
-page_dir_balance_slot(
-/*==================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		slot_no)/*!< in: the directory slot */
-	MY_ATTRIBUTE((nonnull(1)));
-/**********************************************************//**
-Parses a log record of a record list end or start deletion.
-@return end of log record or NULL */
-byte*
-page_parse_delete_rec_list(
-/*=======================*/
-	mlog_id_t	type,	/*!< in: MLOG_LIST_END_DELETE,
-				MLOG_LIST_START_DELETE,
-				MLOG_COMP_LIST_END_DELETE or
-				MLOG_COMP_LIST_START_DELETE */
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in/out: buffer block or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr);	/*!< in: mtr or NULL */
-/** Parses a redo log record of creating a page.
-@param[in,out]	block	buffer block, or NULL
-@param[in]	comp	nonzero=compact page format
-@param[in]	is_rtree whether it is rtree page */
-void
-page_parse_create(
-	buf_block_t*	block,
-	ulint		comp,
-	bool		is_rtree);
+/** Create an index page.
+@param[in,out]	block	buffer block
+@param[in]	comp	whether to use the compact page format */
+void page_create_low(const buf_block_t* block, bool comp);
 
 /************************************************************//**
 Prints record contents including the data relevant only in
diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl
index 04e850ea301..6514886dd67 100644
--- a/storage/innobase/include/page0page.inl
+++ b/storage/innobase/include/page0page.inl
@@ -28,25 +28,11 @@ Created 2/2/1994 Heikki Tuuri
 #define page0page_ic
 
 #ifndef UNIV_INNOCHECKSUM
-#include "mach0data.h"
 #include "rem0cmp.h"
 #include "mtr0log.h"
 #include "page0zip.h"
 
 /*************************************************************//**
-Returns the max trx id field value. */
-UNIV_INLINE
-trx_id_t
-page_get_max_trx_id(
-/*================*/
-	const page_t*	page)	/*!< in: page */
-{
-	ut_ad(page);
-
-	return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID));
-}
-
-/*************************************************************//**
 Sets the max trx id field value if trx_id is bigger than the previous
 value. */
 UNIV_INLINE
@@ -60,14 +46,8 @@ page_update_max_trx_id(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ut_ad(block);
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	/* During crash recovery, this function may be called on
-	something else than a leaf page of a secondary index or the
-	insert buffer index tree (dict_index_is_sec_or_ibuf() returns
-	TRUE for the dummy indexes constructed during redo log
-	application).  In that case, PAGE_MAX_TRX_ID is unused,
-	and trx_id is usually zero. */
-	ut_ad(trx_id || recv_recovery_is_on());
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(trx_id);
 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
 
 	if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) {
@@ -76,18 +56,6 @@ page_update_max_trx_id(
 	}
 }
 
-/** Read the AUTO_INCREMENT value from a clustered index root page.
-@param[in]	page	clustered index root page
-@return	the persisted AUTO_INCREMENT value */
-UNIV_INLINE
-ib_uint64_t
-page_get_autoinc(const page_t* page)
-{
-	ut_ad(fil_page_index_page_check(page));
-	ut_ad(!page_has_siblings(page));
-	return(mach_read_from_8(PAGE_HEADER + PAGE_ROOT_AUTO_INC + page));
-}
-
 /*************************************************************//**
 Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
 @return	SPLIT SEQUENCE NUMBER */
@@ -115,67 +83,20 @@ page_set_ssn_id(
 	node_seq_t	ssn_id,	/*!< in: transaction id */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	page_t*	page = buf_block_get_frame(block);
-
-	ut_ad(!mtr || mtr_memo_contains_flagged(mtr, block,
-						MTR_MEMO_PAGE_SX_FIX
-						| MTR_MEMO_PAGE_X_FIX));
-
-	if (page_zip) {
-		mach_write_to_8(page + FIL_RTREE_SPLIT_SEQ_NUM, ssn_id);
-		page_zip_write_header(page_zip,
-				      page + FIL_RTREE_SPLIT_SEQ_NUM,
-				      8, mtr);
-	} else if (mtr) {
-		mlog_write_ull(page + FIL_RTREE_SPLIT_SEQ_NUM, ssn_id, mtr);
-	} else {
-		mach_write_to_8(page + FIL_RTREE_SPLIT_SEQ_NUM, ssn_id);
-	}
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!page_zip || page_zip == &block->page.zip);
+  constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
+  byte *b= my_assume_aligned<2>(&block->frame[field]);
+  if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
+      UNIV_LIKELY_NULL(page_zip))
+    memcpy_aligned<2>(&page_zip->data[field], b, 8);
 }
 
 #endif /* !UNIV_INNOCHECKSUM */
 
-/*************************************************************//**
-Reads the given header field. */
-UNIV_INLINE
-uint16_t
-page_header_get_field(
-/*==================*/
-	const page_t*	page,	/*!< in: page */
-	ulint		field)	/*!< in: PAGE_LEVEL, ... */
-{
-	ut_ad(page);
-	ut_ad(field <= PAGE_INDEX_ID);
-
-	return(mach_read_from_2(page + PAGE_HEADER + field));
-}
-
 #ifndef UNIV_INNOCHECKSUM
 /*************************************************************//**
-Sets the given header field. */
-UNIV_INLINE
-void
-page_header_set_field(
-/*==================*/
-	page_t*		page,	/*!< in/out: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	ulint		field,	/*!< in: PAGE_N_DIR_SLOTS, ... */
-	ulint		val)	/*!< in: value */
-{
-	ut_ad(page);
-	ut_ad(field <= PAGE_N_RECS);
-	ut_ad(field == PAGE_N_HEAP || val < srv_page_size);
-	ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < srv_page_size);
-
-	mach_write_to_2(page + PAGE_HEADER + field, val);
-	if (page_zip) {
-		page_zip_write_header(page_zip,
-				      page + PAGE_HEADER + field, 2, NULL);
-	}
-}
-
-/*************************************************************//**
 Returns the offset stored in the given header field.
 @return offset from the start of the page, or 0 */
 UNIV_INLINE
@@ -196,60 +117,18 @@ page_header_get_offs(
 	return(offs);
 }
 
-/*************************************************************//**
-Sets the pointer stored in the given header field. */
-UNIV_INLINE
-void
-page_header_set_ptr(
-/*================*/
-	page_t*		page,	/*!< in: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	ulint		field,	/*!< in: PAGE_FREE, ... */
-	const byte*	ptr)	/*!< in: pointer or NULL*/
-{
-	ulint	offs;
-
-	ut_ad(page);
-	ut_ad((field == PAGE_FREE)
-	      || (field == PAGE_LAST_INSERT)
-	      || (field == PAGE_HEAP_TOP));
-
-	if (ptr == NULL) {
-		offs = 0;
-	} else {
-		offs = ulint(ptr - page);
-	}
-
-	ut_ad((field != PAGE_HEAP_TOP) || offs);
-
-	page_header_set_field(page, page_zip, field, offs);
-}
 
-/*************************************************************//**
-Resets the last insert info field in the page header. Writes to mlog
-about this operation. */
-UNIV_INLINE
-void
-page_header_reset_last_insert(
-/*==========================*/
-	page_t*		page,	/*!< in/out: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	mtr_t*		mtr)	/*!< in: mtr */
+/** Reset the PAGE_LAST_INSERT field to 0 through the mini-transaction.
+If that write returns true and block->page.zip.data is set, zero it there too.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
 {
-	ut_ad(page != NULL);
-	ut_ad(mtr != NULL);
-
-	if (page_zip) {
-		mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0);
-		page_zip_write_header(page_zip,
-				      page + (PAGE_HEADER + PAGE_LAST_INSERT),
-				      2, mtr);
-	} else {
-		mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0,
-				 MLOG_2BYTES, mtr);
-	}
+  constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
+  byte *b= my_assume_aligned<2>(&block->frame[field]);
+  if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
+      UNIV_LIKELY_NULL(block->page.zip.data))
+    memset_aligned<2>(&block->page.zip.data[field], 0, 2);
 }
 
 /***************************************************************//**
@@ -426,8 +305,8 @@ page_get_page_no(
 /*=============*/
 	const page_t*	page)	/*!< in: page */
 {
-	ut_ad(page == page_align((page_t*) page));
-	return(mach_read_from_4(page + FIL_PAGE_OFFSET));
+  ut_ad(page == page_align((page_t*) page));
+  return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_OFFSET));
 }
 
 #ifndef UNIV_INNOCHECKSUM
@@ -440,8 +319,9 @@ page_get_space_id(
 /*==============*/
 	const page_t*	page)	/*!< in: page */
 {
-	ut_ad(page == page_align((page_t*) page));
-	return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+  ut_ad(page == page_align((page_t*) page));
+  return mach_read_from_4(my_assume_aligned<2>
+                          (page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
 }
 
 #endif /* !UNIV_INNOCHECKSUM */
@@ -471,19 +351,6 @@ page_dir_get_n_slots(
 {
 	return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
 }
-/*************************************************************//**
-Sets the number of dir slots in directory. */
-UNIV_INLINE
-void
-page_dir_set_n_slots(
-/*=================*/
-	page_t*		page,	/*!< in/out: page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	ulint		n_slots)/*!< in: number of slots */
-{
-	page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots);
-}
 
 /*************************************************************//**
 Gets the number of records in the heap.
@@ -497,48 +364,6 @@ page_dir_get_n_heap(
 	return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
 }
 
-/*************************************************************//**
-Sets the number of records in the heap. */
-UNIV_INLINE
-void
-page_dir_set_n_heap(
-/*================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL.
-				Note that the size of the dense page directory
-				in the compressed page trailer is
-				n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
-	ulint		n_heap)	/*!< in: number of records */
-{
-	ut_ad(n_heap < 0x8000);
-	ut_ad(!page_zip || uint16_t(n_heap)
-	      == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1);
-
-	page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap
-			      | (0x8000
-				 & page_header_get_field(page, PAGE_N_HEAP)));
-}
-
-#ifdef UNIV_DEBUG
-/*************************************************************//**
-Gets pointer to nth directory slot.
-@return pointer to dir slot */
-UNIV_INLINE
-page_dir_slot_t*
-page_dir_get_nth_slot(
-/*==================*/
-	const page_t*	page,	/*!< in: index page */
-	ulint		n)	/*!< in: position */
-{
-	ut_ad(page_dir_get_n_slots(page) > n);
-
-	return((page_dir_slot_t*)
-	       page + srv_page_size - PAGE_DIR
-	       - (n + 1) * PAGE_DIR_SLOT_SIZE);
-}
-#endif /* UNIV_DEBUG */
-
 /**************************************************************//**
 Used to check the consistency of a record on a page.
 @return TRUE if succeed */
@@ -559,32 +384,6 @@ page_rec_check(
 }
 
 /***************************************************************//**
-Gets the record pointed to by a directory slot.
-@return pointer to record */
-UNIV_INLINE
-const rec_t*
-page_dir_slot_get_rec(
-/*==================*/
-	const page_dir_slot_t*	slot)	/*!< in: directory slot */
-{
-	return(page_align(slot) + mach_read_from_2(slot));
-}
-
-/***************************************************************//**
-This is used to set the record offset in a directory slot. */
-UNIV_INLINE
-void
-page_dir_slot_set_rec(
-/*==================*/
-	page_dir_slot_t* slot,	/*!< in: directory slot */
-	rec_t*		 rec)	/*!< in: record on the page */
-{
-	ut_ad(page_rec_check(rec));
-
-	mach_write_to_2(slot, page_offset(rec));
-}
-
-/***************************************************************//**
 Gets the number of records owned by a directory slot.
 @return number of records */
 UNIV_INLINE
@@ -601,25 +400,6 @@ page_dir_slot_get_n_owned(
 	}
 }
 
-/***************************************************************//**
-This is used to set the owned records field of a directory slot. */
-UNIV_INLINE
-void
-page_dir_slot_set_n_owned(
-/*======================*/
-	page_dir_slot_t*slot,	/*!< in/out: directory slot */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		n)	/*!< in: number of records owned by the slot */
-{
-	rec_t*	rec	= (rec_t*) page_dir_slot_get_rec(slot);
-	if (page_rec_is_comp(slot)) {
-		rec_set_n_owned_new(rec, page_zip, n);
-	} else {
-		ut_ad(!page_zip);
-		rec_set_n_owned_old(rec, n);
-	}
-}
-
 /************************************************************//**
 Calculates the space reserved for directory slots of a given number of
 records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
@@ -725,35 +505,6 @@ page_rec_get_next_non_del_marked(
 }
 
 /************************************************************//**
-Sets the pointer to the next record on the page. */
-UNIV_INLINE
-void
-page_rec_set_next(
-/*==============*/
-	rec_t*		rec,	/*!< in: pointer to record,
-				must not be page supremum */
-	const rec_t*	next)	/*!< in: pointer to next record,
-				must not be page infimum */
-{
-	ulint	offs;
-
-	ut_ad(page_rec_check(rec));
-	ut_ad(!page_rec_is_supremum(rec));
-	ut_ad(rec != next);
-
-	ut_ad(!next || !page_rec_is_infimum(next));
-	ut_ad(!next || page_align(rec) == page_align(next));
-
-	offs = next != NULL ? page_offset(next) : 0;
-
-	if (page_rec_is_comp(rec)) {
-		rec_set_next_offs_new(rec, offs);
-	} else {
-		rec_set_next_offs_old(rec, offs);
-	}
-}
-
-/************************************************************//**
 Gets the pointer to the previous record.
 @return pointer to previous record */
 UNIV_INLINE
@@ -813,45 +564,6 @@ page_rec_get_prev(
 	return((rec_t*) page_rec_get_prev_const(rec));
 }
 
-/***************************************************************//**
-Looks for the record which owns the given record.
-@return the owner record */
-UNIV_INLINE
-rec_t*
-page_rec_find_owner_rec(
-/*====================*/
-	rec_t*	rec)	/*!< in: the physical record */
-{
-	ut_ad(page_rec_check(rec));
-
-	if (page_rec_is_comp(rec)) {
-		while (rec_get_n_owned_new(rec) == 0) {
-			rec = page_rec_get_next(rec);
-		}
-	} else {
-		while (rec_get_n_owned_old(rec) == 0) {
-			rec = page_rec_get_next(rec);
-		}
-	}
-
-	return(rec);
-}
-
-/**********************************************************//**
-Returns the base extra size of a physical record.  This is the
-size of the fixed header, independent of the record size.
-@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
-UNIV_INLINE
-ulint
-page_rec_get_base_extra_size(
-/*=========================*/
-	const rec_t*	rec)	/*!< in: physical record */
-{
-	compile_time_assert(REC_N_NEW_EXTRA_BYTES + 1
-			    == REC_N_OLD_EXTRA_BYTES);
-	return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec));
-}
-
 #endif /* UNIV_INNOCHECKSUM */
 
 /************************************************************//**
@@ -864,49 +576,16 @@ page_get_data_size(
 /*===============*/
 	const page_t*	page)	/*!< in: index page */
 {
-	uint16_t	ret = page_header_get_field(page, PAGE_HEAP_TOP)
+	unsigned ret = page_header_get_field(page, PAGE_HEAP_TOP)
 		- (page_is_comp(page)
 		   ? PAGE_NEW_SUPREMUM_END
 		   : PAGE_OLD_SUPREMUM_END)
 		- page_header_get_field(page, PAGE_GARBAGE);
 	ut_ad(ret < srv_page_size);
-	return(ret);
+	return static_cast<uint16_t>(ret);
 }
 
 #ifndef UNIV_INNOCHECKSUM
-/************************************************************//**
-Allocates a block of memory from the free list of an index page. */
-UNIV_INLINE
-void
-page_mem_alloc_free(
-/*================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
-				space available for inserting the record,
-				or NULL */
-	rec_t*		next_rec,/*!< in: pointer to the new head of the
-				free record list */
-	ulint		need)	/*!< in: number of bytes allocated */
-{
-	ulint		garbage;
-
-#ifdef UNIV_DEBUG
-	const rec_t*	old_rec	= page_header_get_ptr(page, PAGE_FREE);
-	ulint		next_offs;
-
-	ut_ad(old_rec);
-	next_offs = rec_get_next_offs(old_rec, page_is_comp(page));
-	ut_ad(next_rec == (next_offs ? page + next_offs : NULL));
-#endif
-
-	page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec);
-
-	garbage = page_header_get_field(page, PAGE_GARBAGE);
-	ut_ad(garbage >= need);
-
-	page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need);
-}
-
 /*************************************************************//**
 Calculates free space if a page is emptied.
 @return free space */
@@ -1002,48 +681,6 @@ page_get_max_insert_size_after_reorganize(
 	return(free_space - occupied);
 }
 
-/************************************************************//**
-Puts a record to free list. */
-UNIV_INLINE
-void
-page_mem_free(
-/*==========*/
-	page_t*			page,		/*!< in/out: index page */
-	page_zip_des_t*		page_zip,	/*!< in/out: compressed page,
-						or NULL */
-	rec_t*			rec,		/*!< in: pointer to the
-						(origin of) record */
-	const dict_index_t*	index,		/*!< in: index of rec */
-	const rec_offs*		offsets)	/*!< in: array returned by
-						rec_get_offsets() */
-{
-	rec_t*		free;
-	ulint		garbage;
-
-	ut_ad(rec_offs_validate(rec, index, offsets));
-	free = page_header_get_ptr(page, PAGE_FREE);
-
-	if (srv_immediate_scrub_data_uncompressed) {
-		/* scrub record */
-		memset(rec, 0, rec_offs_data_size(offsets));
-	}
-
-	page_rec_set_next(rec, free);
-	page_header_set_ptr(page, page_zip, PAGE_FREE, rec);
-
-	garbage = page_header_get_field(page, PAGE_GARBAGE);
-
-	page_header_set_field(page, page_zip, PAGE_GARBAGE,
-			      garbage + rec_offs_size(offsets));
-
-	if (page_zip) {
-		page_zip_dir_delete(page_zip, rec, index, offsets, free);
-	} else {
-		page_header_set_field(page, page_zip, PAGE_N_RECS,
-				      ulint(page_get_n_recs(page)) - 1);
-	}
-}
-
 /** Read the PAGE_DIRECTION field from a byte.
 @param[in]	ptr	pointer to PAGE_DIRECTION_B
 @return	the value of the PAGE_DIRECTION field */
@@ -1055,19 +692,6 @@ page_ptr_get_direction(const byte* ptr)
 	return *ptr & ((1U << 3) - 1);
 }
 
-/** Set the PAGE_DIRECTION field.
-@param[in]	ptr	pointer to PAGE_DIRECTION_B
-@param[in]	dir	the value of the PAGE_DIRECTION field */
-inline
-void
-page_ptr_set_direction(byte* ptr, byte dir)
-{
-	ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B);
-	ut_ad(dir >= PAGE_LEFT);
-	ut_ad(dir <= PAGE_NO_DIRECTION);
-	*ptr = (*ptr & ~((1U << 3) - 1)) | dir;
-}
-
 /** Read the PAGE_INSTANT field.
 @param[in]	page	index page
 @return the value of the PAGE_INSTANT field */
@@ -1089,7 +713,7 @@ page_get_instant(const page_t* page)
 		ut_ad(i <= PAGE_NO_DIRECTION);
 		break;
 	default:
-		ut_ad(!"invalid page type");
+		ut_ad("invalid page type" == 0);
 		break;
 	}
 #endif /* UNIV_DEBUG */
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
index 14ccc2eae36..6c5a681f3b5 100644
--- a/storage/innobase/include/page0types.h
+++ b/storage/innobase/include/page0types.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -39,6 +40,8 @@ typedef	byte		page_t;
 #ifndef UNIV_INNOCHECKSUM
 /** Index page cursor */
 struct page_cur_t;
+/** Buffer pool block */
+struct buf_block_t;
 
 /** Compressed index page */
 typedef byte		page_zip_t;
@@ -144,47 +147,15 @@ extern page_zip_stat_t			page_zip_stat[PAGE_ZIP_SSIZE_MAX];
 extern page_zip_stat_per_index_t	page_zip_stat_per_index;
 
 /**********************************************************************//**
-Write the "deleted" flag of a record on a compressed page.  The flag must
-already have been written on the uncompressed page. */
-void
-page_zip_rec_set_deleted(
-/*=====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	rec,	/*!< in: record on the uncompressed page */
-	ulint		flag)	/*!< in: the deleted flag (nonzero=TRUE) */
-	MY_ATTRIBUTE((nonnull));
-
-/**********************************************************************//**
 Write the "owned" flag of a record on a compressed page.  The n_owned field
 must already have been written on the uncompressed page. */
 void
 page_zip_rec_set_owned(
 /*===================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
 	const byte*	rec,	/*!< in: record on the uncompressed page */
-	ulint		flag)	/*!< in: the owned flag (nonzero=TRUE) */
-	MY_ATTRIBUTE((nonnull));
-
-/**********************************************************************//**
-Shift the dense page directory when a record is deleted. */
-void
-page_zip_dir_delete(
-/*================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	byte*		rec,	/*!< in: deleted record */
-	dict_index_t*	index,	/*!< in: index of rec */
-	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
-	const byte*	free)	/*!< in: previous start of the free list */
-	MY_ATTRIBUTE((nonnull(1,2,3,4)));
-
-/**********************************************************************//**
-Add a slot to the dense page directory. */
-void
-page_zip_dir_add_slot(
-/*==================*/
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
-	ulint		is_clustered)	/*!< in: nonzero for clustered index,
-					zero for others */
+	ulint		flag,	/*!< in: the owned flag (nonzero=TRUE) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 	MY_ATTRIBUTE((nonnull));
 #endif /* !UNIV_INNOCHECKSUM */
 #endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 4c2c88e6de0..5b98fdea004 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -59,10 +59,6 @@ extern uint	page_zip_level;
 /** 'deleted' flag */
 #define PAGE_ZIP_DIR_SLOT_DEL		0x8000U
 
-/* Whether or not to log compressed page images to avoid possible
-compression algorithm changes in zlib. */
-extern my_bool	page_zip_log_pages;
-
 /**********************************************************************//**
 Determine the size of a compressed page in bytes.
 @return size in bytes */
@@ -128,22 +124,16 @@ page_zip_set_alloc(
 	void*		stream,		/*!< in/out: zlib stream */
 	mem_heap_t*	heap);		/*!< in: memory heap to use */
 
-/**********************************************************************//**
-Compress a page.
-@return TRUE on success, FALSE on failure; page_zip will be left
-intact on failure. */
-ibool
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
 page_zip_compress(
-/*==============*/
-	page_zip_des_t*		page_zip,	/*!< in: size; out: data,
-						n_blobs, m_start, m_end,
-						m_nonempty */
-	const page_t*		page,		/*!< in: uncompressed page */
-	dict_index_t*		index,		/*!< in: index of the B-tree
-						node */
-	ulint			level,		/*!< in: commpression level */
-	mtr_t*			mtr);		/*!< in/out: mini-transaction,
-						or NULL */
+	buf_block_t*		block,	/*!< in/out: buffer block */
+	dict_index_t*		index,	/*!< in: index of the B-tree node */
+	ulint			level,	/*!< in: compression level */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
 
 /**********************************************************************//**
 Write the index information for the compressed page.
@@ -240,42 +230,18 @@ page_zip_available(
 					the heap */
 	MY_ATTRIBUTE((warn_unused_result));
 
-/**********************************************************************//**
-Write data to the uncompressed header portion of a page.  The data must
-already have been written to the uncompressed page. */
-UNIV_INLINE
-void
-page_zip_write_header(
-/*==================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	str,	/*!< in: address on the uncompressed page */
-	ulint		length,	/*!< in: length of the data */
-	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
-	MY_ATTRIBUTE((nonnull(1,2)));
-
-/**********************************************************************//**
-Write an entire record on the compressed page.  The data must already
-have been written to the uncompressed page. */
-void
-page_zip_write_rec(
-/*===============*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	rec,	/*!< in: record being written */
-	dict_index_t*	index,	/*!< in: the index the record belongs to */
-	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	ulint		create)	/*!< in: nonzero=insert, zero=update */
-	MY_ATTRIBUTE((nonnull));
-
-/***********************************************************//**
-Parses a log record of writing a BLOB pointer of a record.
-@return end of log record or NULL */
-byte*
-page_zip_parse_write_blob_ptr(
-/*==========================*/
-	byte*		ptr,	/*!< in: redo log buffer */
-	byte*		end_ptr,/*!< in: redo log buffer end */
-	page_t*		page,	/*!< in/out: uncompressed page */
-	page_zip_des_t*	page_zip);/*!< in/out: compressed page */
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in]	rec		record in the uncompressed page
+@param[in]	index		the index that the page belongs to
+@param[in]	offsets		rec_get_offsets(rec, index)
+@param[in]	create		nonzero=insert, zero=update
+@param[in,out]	mtr		mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+                        const dict_index_t *index, const rec_offs *offsets,
+                        ulint create, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
 
 /**********************************************************************//**
 Write a BLOB pointer of a record on the leaf page of a clustered index.
@@ -283,174 +249,101 @@ The information must already have been updated on the uncompressed page. */
 void
 page_zip_write_blob_ptr(
 /*====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
 	const byte*	rec,	/*!< in/out: record whose data is being
 				written */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	ulint		n,	/*!< in: column index */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle,
-				or NULL if no logging is needed */
-
-/***********************************************************//**
-Parses a log record of writing the node pointer of a record.
-@return end of log record or NULL */
-byte*
-page_zip_parse_write_node_ptr(
-/*==========================*/
-	byte*		ptr,	/*!< in: redo log buffer */
-	byte*		end_ptr,/*!< in: redo log buffer end */
-	page_t*		page,	/*!< in/out: uncompressed page */
-	page_zip_des_t*	page_zip);/*!< in/out: compressed page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
 
 /**********************************************************************//**
 Write the node pointer of a record on a non-leaf compressed page. */
 void
 page_zip_write_node_ptr(
 /*====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: compressed page */
 	byte*		rec,	/*!< in/out: record */
 	ulint		size,	/*!< in: data size of rec */
 	ulint		ptr,	/*!< in: node pointer */
-	mtr_t*		mtr);	/*!< in: mini-transaction, or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
 
 /** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
-@param[in,out]	page_zip	compressed page
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
 @param[in,out]	rec		record
 @param[in]	offsets		rec_get_offsets(rec, index)
 @param[in]	trx_id_field	field number of DB_TRX_ID (number of PK fields)
 @param[in]	trx_id		DB_TRX_ID value (transaction identifier)
 @param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
-@param[in,out]	mtr		mini-transaction, or NULL to skip logging */
+@param[in,out]	mtr		mini-transaction */
 void
 page_zip_write_trx_id_and_roll_ptr(
-	page_zip_des_t*	page_zip,
+	buf_block_t*	block,
 	byte*		rec,
 	const rec_offs*	offsets,
 	ulint		trx_id_col,
 	trx_id_t	trx_id,
 	roll_ptr_t	roll_ptr,
-	mtr_t*		mtr = NULL)
-	MY_ATTRIBUTE((nonnull(1,2,3)));
-
-/** Parse a MLOG_ZIP_WRITE_TRX_ID record.
-@param[in]	ptr		redo log buffer
-@param[in]	end_ptr		end of redo log buffer
-@param[in,out]	page		uncompressed page
-@param[in,out]	page_zip	compressed page
-@return end of log record
-@retval	NULL	if the log record is incomplete */
-byte*
-page_zip_parse_write_trx_id(
-	byte*		ptr,
-	byte*		end_ptr,
-	page_t*		page,
-	page_zip_des_t*	page_zip)
-	MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
-/**********************************************************************//**
-Write the "deleted" flag of a record on a compressed page.  The flag must
-already have been written on the uncompressed page. */
-void
-page_zip_rec_set_deleted(
-/*=====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	rec,	/*!< in: record on the uncompressed page */
-	ulint		flag)	/*!< in: the deleted flag (nonzero=TRUE) */
+	mtr_t*		mtr)
 	MY_ATTRIBUTE((nonnull));
 
-/**********************************************************************//**
-Write the "owned" flag of a record on a compressed page.  The n_owned field
-must already have been written on the uncompressed page. */
-void
-page_zip_rec_set_owned(
-/*===================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	rec,	/*!< in: record on the uncompressed page */
-	ulint		flag)	/*!< in: the owned flag (nonzero=TRUE) */
-	MY_ATTRIBUTE((nonnull));
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out]  block   buffer block
+@param[in,out]  rec     record on a physical index page
+@param[in]      flag    the value of the delete-mark flag
+@param[in,out]  mtr     mini-transaction  */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+                              mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
 
 /**********************************************************************//**
 Insert a record to the dense page directory. */
 void
 page_zip_dir_insert(
 /*================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	prev_rec,/*!< in: record after which to insert */
-	const byte*	free_rec,/*!< in: record from which rec was
-				allocated, or NULL */
-	byte*		rec);	/*!< in: record to insert */
-
-/**********************************************************************//**
-Shift the dense page directory and the array of BLOB pointers
-when a record is deleted. */
-void
-page_zip_dir_delete(
-/*================*/
-	page_zip_des_t*		page_zip,	/*!< in/out: compressed page */
-	byte*			rec,		/*!< in: deleted record */
-	const dict_index_t*	index,		/*!< in: index of rec */
-	const rec_offs*		offsets,	/*!< in: rec_get_offsets(rec) */
-	const byte*		free)		/*!< in: previous start of
-						the free list */
-	MY_ATTRIBUTE((nonnull(1,2,3,4)));
-
-/**********************************************************************//**
-Add a slot to the dense page directory. */
-void
-page_zip_dir_add_slot(
-/*==================*/
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
-	ulint		is_clustered)	/*!< in: nonzero for clustered index,
-					zero for others */
-	MY_ATTRIBUTE((nonnull));
-
-/***********************************************************//**
-Parses a log record of writing to the header of a page.
-@return end of log record or NULL */
-byte*
-page_zip_parse_write_header(
-/*========================*/
-	byte*		ptr,	/*!< in: redo log buffer */
-	byte*		end_ptr,/*!< in: redo log buffer end */
-	page_t*		page,	/*!< in/out: uncompressed page */
-	page_zip_des_t*	page_zip);/*!< in/out: compressed page */
-
-/**********************************************************************//**
-Write data to the uncompressed header portion of a page.  The data must
-already have been written to the uncompressed page.
-However, the data portion of the uncompressed page may differ from
-the compressed page when a record is being inserted in
-page_cur_insert_rec_low(). */
-UNIV_INLINE
-void
-page_zip_write_header(
-/*==================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	str,	/*!< in: address on the uncompressed page */
-	ulint		length,	/*!< in: length of the data */
-	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
-	MY_ATTRIBUTE((nonnull(1,2)));
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	uint16_t	free_rec,/*!< in: record from which rec was
+				allocated, or 0 */
+	byte*		rec,	/*!< in: record to insert */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(1,3,4)));
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out]  block   index page
+@param[in,out]  rec     record being deleted
+@param[in]      index   the index that the page belongs to
+@param[in]      offsets rec_get_offsets(rec, index)
+@param[in]	free	previous start of the free list
+@param[in,out]  mtr     mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+                         const dict_index_t *index, const rec_offs *offsets,
+                         const byte *free, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull(1,2,3,4,6)));
 
 /**********************************************************************//**
 Reorganize and compress a page.  This is a low-level operation for
 compressed pages, to be used when page_zip_compress() fails.
-On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+On success, redo log will be written.
 The function btr_page_reorganize() should be preferred whenever possible.
 IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
 non-clustered index, the caller must update the insert buffer free
 bits in the same mini-transaction in such a way that the modification
 will be redo-logged.
-@return TRUE on success, FALSE on failure; page_zip will be left
-intact on failure, but page will be overwritten. */
-ibool
+@retval true on success
+@retval false on failure; block->page.zip will be left intact */
+bool
 page_zip_reorganize(
-/*================*/
 	buf_block_t*	block,	/*!< in/out: page with compressed page;
 				on the compressed page, in: size;
 				out: data, n_blobs,
 				m_start, m_end, m_nonempty */
 	dict_index_t*	index,	/*!< in: index of the B-tree node */
-	mtr_t*		mtr)	/*!< in: mini-transaction */
+	ulint		z_level,/*!< in: compression level */
+	mtr_t*		mtr,	/*!< in: mini-transaction */
+	bool		restore = false)/*!< whether to restore on failure */
 	MY_ATTRIBUTE((nonnull));
 
 /**********************************************************************//**
@@ -460,25 +353,11 @@ related to the storage of records.  Also copy PAGE_MAX_TRX_ID.
 NOTE: The caller must update the lock table and the adaptive hash index. */
 void
 page_zip_copy_recs(
-/*===============*/
-	page_zip_des_t*		page_zip,	/*!< out: copy of src_zip
-						(n_blobs, m_start, m_end,
-						m_nonempty, data[0..size-1]) */
-	page_t*			page,		/*!< out: copy of src */
+	buf_block_t*		block,		/*!< in/out: buffer block */
 	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
 	const page_t*		src,		/*!< in: page */
 	dict_index_t*		index,		/*!< in: index of the B-tree */
 	mtr_t*			mtr);		/*!< in: mini-transaction */
-
-/** Parse and optionally apply MLOG_ZIP_PAGE_COMPRESS.
-@param[in]	ptr	log record
-@param[in]	end_ptr	end of log
-@param[in,out]	block	ROW_FORMAT=COMPRESSED block, or NULL for parsing only
-@return	end of log record
-@retval	NULL	if the log record is incomplete */
-byte* page_zip_parse_compress(const byte* ptr, const byte* end_ptr,
-			      buf_block_t* block);
-
 #endif /* !UNIV_INNOCHECKSUM */
 
 /** Calculate the compressed page checksum.
@@ -500,30 +379,6 @@ bool page_zip_verify_checksum(const byte *data, size_t size);
 
 #ifndef UNIV_INNOCHECKSUM
 /**********************************************************************//**
-Write a log record of compressing an index page without the data on the page. */
-UNIV_INLINE
-void
-page_zip_compress_write_log_no_data(
-/*================================*/
-	ulint		level,	/*!< in: compression level */
-	const page_t*	page,	/*!< in: page that is compressed */
-	dict_index_t*	index,	/*!< in: index */
-	mtr_t*		mtr);	/*!< in: mtr */
-/**********************************************************************//**
-Parses a log record of compressing an index page without the data.
-@return end of log record or NULL */
-UNIV_INLINE
-byte*
-page_zip_parse_compress_no_data(
-/*============================*/
-	byte*		ptr,		/*!< in: buffer */
-	byte*		end_ptr,	/*!< in: buffer end */
-	page_t*		page,		/*!< in: uncompressed page */
-	page_zip_des_t*	page_zip,	/*!< out: compressed page */
-	dict_index_t*	index)		/*!< in: index */
-	MY_ATTRIBUTE((nonnull(1,2)));
-
-/**********************************************************************//**
 Reset the counters used for filling
 INFORMATION_SCHEMA.innodb_cmp_per_index. */
 UNIV_INLINE
diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl
index 337debd30e9..b0622ba79c3 100644
--- a/storage/innobase/include/page0zip.inl
+++ b/storage/innobase/include/page0zip.inl
@@ -1,8 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -25,10 +24,7 @@ Compressed page interface
 Created June 2005 by Marko Makela
 *******************************************************/
 
-#include "page0zip.h"
-#include "mtr0log.h"
 #include "page0page.h"
-#include "srv0srv.h"
 
 /* The format of compressed pages is as follows.
 
@@ -136,7 +132,7 @@ page_zip_set_size(
 		for (ssize = 1; size > (512U << ssize); ssize++) {
 		}
 
-		page_zip->ssize = ssize;
+		page_zip->ssize = ssize & ((1U << PAGE_ZIP_SSIZE_BITS) - 1);
 	} else {
 		page_zip->ssize = 0;
 	}
@@ -320,101 +316,6 @@ page_zip_des_init(
 }
 
 /**********************************************************************//**
-Write a log record of writing to the uncompressed header portion of a page. */
-void
-page_zip_write_header_log(
-/*======================*/
-	const byte*	data,/*!< in: data on the uncompressed page */
-	ulint		length,	/*!< in: length of the data */
-	mtr_t*		mtr);	/*!< in: mini-transaction */
-
-/**********************************************************************//**
-Write data to the uncompressed header portion of a page.  The data must
-already have been written to the uncompressed page.
-However, the data portion of the uncompressed page may differ from
-the compressed page when a record is being inserted in
-page_cur_insert_rec_zip(). */
-UNIV_INLINE
-void
-page_zip_write_header(
-/*==================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	str,	/*!< in: address on the uncompressed page */
-	ulint		length,	/*!< in: length of the data */
-	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
-{
-	ulint	pos;
-
-	ut_ad(page_zip_simple_validate(page_zip));
-	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
-
-	pos = page_offset(str);
-
-	ut_ad(pos < PAGE_DATA);
-
-	memcpy(page_zip->data + pos, str, length);
-
-	/* The following would fail in page_cur_insert_rec_zip(). */
-	/* ut_ad(page_zip_validate(page_zip, str - pos)); */
-
-	if (mtr) {
-		page_zip_write_header_log(str, length, mtr);
-	}
-}
-
-/**********************************************************************//**
-Write a log record of compressing an index page without the data on the page. */
-UNIV_INLINE
-void
-page_zip_compress_write_log_no_data(
-/*================================*/
-	ulint		level,	/*!< in: compression level */
-	const page_t*	page,	/*!< in: page that is compressed */
-	dict_index_t*	index,	/*!< in: index */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	byte* log_ptr = mlog_open_and_write_index(
-		mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1);
-
-	if (log_ptr) {
-		mach_write_to_1(log_ptr, level);
-		mlog_close(mtr, log_ptr + 1);
-	}
-}
-
-/**********************************************************************//**
-Parses a log record of compressing an index page without the data.
-@return end of log record or NULL */
-UNIV_INLINE
-byte*
-page_zip_parse_compress_no_data(
-/*============================*/
-	byte*		ptr,		/*!< in: buffer */
-	byte*		end_ptr,	/*!< in: buffer end */
-	page_t*		page,		/*!< in: uncompressed page */
-	page_zip_des_t*	page_zip,	/*!< out: compressed page */
-	dict_index_t*	index)		/*!< in: index */
-{
-	ulint	level;
-	if (end_ptr == ptr) {
-		return(NULL);
-	}
-
-	level = mach_read_from_1(ptr);
-
-	/* If page compression fails then there must be something wrong
-	because a compress log record is logged only if the compression
-	was successful. Crash in this case. */
-
-	if (page
-	    && !page_zip_compress(page_zip, page, index, level, NULL)) {
-		ut_error;
-	}
-
-	return(ptr + 1);
-}
-
-/**********************************************************************//**
 Reset the counters used for filling
 INFORMATION_SCHEMA.innodb_cmp_per_index. */
 UNIV_INLINE
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
index 475dd523d38..4c588dca061 100644
--- a/storage/innobase/include/pars0pars.h
+++ b/storage/innobase/include/pars0pars.h
@@ -87,7 +87,7 @@ pars_sql(
 /*************************************************************//**
 Retrieves characters to the lexical analyzer.
 @return number of characters copied or 0 on EOF */
-size_t
+int
 pars_get_lex_chars(
 /*===============*/
 	char*	buf,		/*!< in/out: buffer where to copy */
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
index 594219bd5d9..962bd359f0b 100644
--- a/storage/innobase/include/que0que.h
+++ b/storage/innobase/include/que0que.h
@@ -105,22 +105,7 @@ ibool
 que_thr_stop(
 /*=========*/
 	que_thr_t*	thr);	/*!< in: query thread */
-/**********************************************************************//**
-Moves a thread from another state to the QUE_THR_RUNNING state. Increments
-the n_active_thrs counters of the query graph and transaction. */
-void
-que_thr_move_to_run_state_for_mysql(
-/*================================*/
-	que_thr_t*	thr,	/*!< in: an query thread */
-	trx_t*		trx);	/*!< in: transaction */
-/**********************************************************************//**
-A patch for MySQL used to 'stop' a dummy query thread used in MySQL
-select, when there is no error or lock wait. */
-void
-que_thr_stop_for_mysql_no_error(
-/*============================*/
-	que_thr_t*	thr,	/*!< in: query thread */
-	trx_t*		trx);	/*!< in: transaction */
+
 /**********************************************************************//**
 A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
 query thread is stopped and made inactive, except in the case where
@@ -324,16 +309,10 @@ trx_t::mutex with the exceptions named below */
 
 struct que_thr_t{
 	que_common_t	common;		/*!< type: QUE_NODE_THR */
-	ulint		magic_n;	/*!< magic number to catch memory
-					corruption */
 	que_node_t*	child;		/*!< graph child node */
 	que_t*		graph;		/*!< graph where this node belongs */
 	que_thr_state_t	state;		/*!< state of the query thread */
-	ibool		is_active;	/*!< TRUE if the thread has been set
-					to the run state in
-					que_thr_move_to_run_state, but not
-					deactivated in
-					que_thr_dec_reference_count */
+	bool		is_active;	/*!< whether the thread is active */
 	/*------------------------------*/
 	/* The following fields are private to the OS thread executing the
 	query thread, and are not protected by any mutex: */
@@ -365,23 +344,39 @@ struct que_thr_t{
 	row_prebuilt_t*	prebuilt;	/*!< prebuilt structure processed by
 					the query thread */
 
-	/** a slot of srv_sys.sys_threads, for DEBUG_SYNC in purge thread */
-	ut_d(srv_slot_t* thread_slot;)
-};
+#ifdef UNIV_DEBUG
+  /** Change the 'active' status */
+  inline void set_active(bool active);
+#endif
+  /** Transition to the QUE_THR_RUNNING state. */
+  inline void start_running()
+  {
+    ut_d(if (!is_active) set_active(true));
+    is_active= true;
+    state= QUE_THR_RUNNING;
+  }
 
-#define QUE_THR_MAGIC_N		8476583
-#define QUE_THR_MAGIC_FREED	123461526
+  /** Stop query execution when there is no error or lock wait. */
+  void stop_no_error()
+  {
+    ut_ad(is_active);
+    ut_d(set_active(false));
+    state= QUE_THR_COMPLETED;
+    is_active= false;
+  }
+};
 
 /* Query graph fork node: its fields are protected by the query thread mutex */
 struct que_fork_t{
 	que_common_t	common;		/*!< type: QUE_NODE_FORK */
 	que_t*		graph;		/*!< query graph of this node */
 	ulint		fork_type;	/*!< fork type */
-	ulint		n_active_thrs;	/*!< if this is the root of a graph, the
-					number query threads that have been
-					started in que_thr_move_to_run_state
-					but for which que_thr_dec_refer_count
-					has not yet been called */
+#ifdef UNIV_DEBUG
+  /** For the query graph root, updated in set_active() */
+  ulint n_active_thrs;
+  /** Change the 'active' status */
+  void set_active(bool active);
+#endif
 	trx_t*		trx;		/*!< transaction: this is set only in
 					the root node */
 	ulint		state;		/*!< state of the fork node */
@@ -407,6 +402,10 @@ struct que_fork_t{
 
 };
 
+#ifdef UNIV_DEBUG
+inline void que_thr_t::set_active(bool active) { graph->set_active(active); }
+#endif
+
 /* Query fork (or graph) types */
 #define QUE_FORK_SELECT_NON_SCROLL	1	/* forward-only cursor */
 #define QUE_FORK_SELECT_SCROLL		2	/* scrollable cursor */
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
index d9005095d3c..38f6e380a30 100644
--- a/storage/innobase/include/que0types.h
+++ b/storage/innobase/include/que0types.h
@@ -35,6 +35,7 @@ typedef void	que_node_t;
 /* Query graph root is a fork node */
 typedef	struct que_fork_t	que_t;
 
+struct row_prebuilt_t;
 struct que_thr_t;
 
 /* Query graph node types */
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
index 48575feda10..21143ab609d 100644
--- a/storage/innobase/include/read0types.h
+++ b/storage/innobase/include/read0types.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -32,56 +32,47 @@ Created 2/16/1997 Heikki Tuuri
 #include <algorithm>
 
 
-/** View is not visible to purge thread. */
-#define READ_VIEW_STATE_CLOSED 0
-
-/** View is being opened, purge thread must wait for state change. */
-#define READ_VIEW_STATE_SNAPSHOT 1
-
-/** View is visible to purge thread. */
-#define READ_VIEW_STATE_OPEN 2
-
-
 /**
   Read view lists the trx ids of those transactions for which a consistent read
   should not see the modifications to the database.
 */
-class ReadView
+class ReadViewBase
 {
   /**
-    View state.
-
-    It is not defined as enum as it has to be updated using atomic operations.
-    Possible values are READ_VIEW_STATE_CLOSED, READ_VIEW_STATE_SNAPSHOT and
-    READ_VIEW_STATE_OPEN.
-
-    Possible state transfers...
+    The read should not see any transaction with trx id >= this value.
+    In other words, this is the "high water mark".
+  */
+  trx_id_t m_low_limit_id;
 
-    Start view open:
-    READ_VIEW_STATE_CLOSED -> READ_VIEW_STATE_SNAPSHOT
+  /**
+    The read should see all trx ids which are strictly
+    smaller (<) than this value. In other words, this is the
+    "low water mark".
+  */
+  trx_id_t m_up_limit_id;
 
-    Complete view open:
-    READ_VIEW_STATE_SNAPSHOT -> READ_VIEW_STATE_OPEN
+  /** Set of RW transactions that was active when this snapshot was taken */
+  trx_ids_t m_ids;
 
-    Close view:
-    READ_VIEW_STATE_OPEN -> READ_VIEW_STATE_CLOSED
+  /**
+    The view does not need to see the undo logs for transactions whose
+    transaction number is strictly smaller (<) than this value: they can be
+    removed in purge if not needed by other views.
   */
-  std::atomic<uint32_t> m_state;
+  trx_id_t m_low_limit_no;
 
+protected:
+  bool empty() const { return m_ids.empty(); }
 
-  /** m_state getter for ReadView owner thread */
-  uint32_t state() const
-  {
-    return m_state.load(std::memory_order_relaxed);
-  }
-
+  /** @return the up limit id */
+  trx_id_t up_limit_id() const { return m_up_limit_id; }
 
 public:
-  ReadView(): m_state(READ_VIEW_STATE_CLOSED), m_low_limit_id(0) {}
+  ReadViewBase(): m_low_limit_id(0) {}
 
 
   /**
-    Copy state from another view.
+    Append state from another view.
 
     This method is used to find min(m_low_limit_no), min(m_low_limit_id) and
     all transaction ids below min(m_low_limit_id). These values effectively
@@ -89,7 +80,7 @@ public:
 
     @param other    view to copy from
   */
-  void copy(const ReadView &other)
+  void append(const ReadViewBase &other)
   {
     ut_ad(&other != this);
     if (m_low_limit_no > other.m_low_limit_no)
@@ -98,25 +89,24 @@ public:
       m_low_limit_id= other.m_low_limit_id;
 
     trx_ids_t::iterator dst= m_ids.begin();
-    for (trx_ids_t::const_iterator src= other.m_ids.begin();
-         src != other.m_ids.end(); src++)
+    for (const trx_id_t id : other.m_ids)
     {
-      if (*src >= m_low_limit_id)
+      if (id >= m_low_limit_id)
         break;
 loop:
       if (dst == m_ids.end())
       {
-        m_ids.push_back(*src);
+        m_ids.push_back(id);
         dst= m_ids.end();
         continue;
       }
-      if (*dst < *src)
+      if (*dst < id)
       {
         dst++;
         goto loop;
       }
-      else if (*dst > *src)
-        dst= m_ids.insert(dst, *src) + 1;
+      else if (*dst > id)
+        dst= m_ids.insert(dst, id) + 1;
     }
     m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id),
                 m_ids.end());
@@ -127,66 +117,122 @@ loop:
 
 
   /**
-    Opens a read view where exactly the transactions serialized before this
+    Creates a snapshot where exactly the transactions serialized before this
     point in time are seen in the view.
 
-    View becomes visible to purge thread.
-
     @param[in,out] trx transaction
   */
-  void open(trx_t *trx);
+  inline void snapshot(trx_t *trx);
 
 
   /**
-    Closes the view.
+    Check whether transaction id is valid.
+    @param[in] id transaction id to check
+    @param[in] name table name
+
+    @todo changes_visible() was an unfortunate choice for this check.
+    It should be moved towards the functions that load trx id like
+    trx_read_trx_id(). No need to issue a warning, error log message should
+    be enough. Although statement should ideally fail if it sees corrupt
+    data.
+  */
+  static void check_trx_id_sanity(trx_id_t id, const table_name_t &name);
 
-    View becomes not visible to purge thread.
 
-    This method is intended to be called by ReadView owner thread, thus
-    m_state cannot change.
+  /**
+    Check whether the changes by id are visible.
+    @param[in] id transaction id to check against the view
+    @param[in] name table name
+    @return whether the view sees the modifications of id.
   */
-  void close()
+  bool changes_visible(trx_id_t id, const table_name_t &name) const
+  MY_ATTRIBUTE((warn_unused_result))
   {
-    ut_ad(state() == READ_VIEW_STATE_CLOSED ||
-          state() == READ_VIEW_STATE_OPEN);
-    m_state.store(READ_VIEW_STATE_CLOSED, std::memory_order_relaxed);
+    if (id >= m_low_limit_id)
+    {
+      check_trx_id_sanity(id, name);
+      return false;
+    }
+    return id < m_up_limit_id ||
+           m_ids.empty() ||
+           !std::binary_search(m_ids.begin(), m_ids.end(), id);
   }
 
 
-  /** m_state getter for trx_sys::clone_oldest_view() trx_sys::size(). */
-  uint32_t get_state() const
-  {
-    return m_state.load(std::memory_order_acquire);
-  }
+  /**
+    @param id transaction to check
+    @return true if view sees transaction id
+  */
+  bool sees(trx_id_t id) const { return id < m_up_limit_id; }
 
+  /** @return the low limit no */
+  trx_id_t low_limit_no() const { return m_low_limit_no; }
 
+  /** @return the low limit id */
+  trx_id_t low_limit_id() const { return m_low_limit_id; }
+};
+
+
+/** A ReadView with extra members required for trx_t::read_view. */
+class ReadView: public ReadViewBase
+{
   /**
-    Returns true if view is open.
+    View state.
+
+    Implemented as atomic to allow mutex-free view close and re-use.
+    Non-owner thread is allowed to call is_open() alone without mutex
+    protection as well. E.g. trx_sys.view_count() does this.
 
-    This method is intended to be called by ReadView owner thread, thus
-    m_state cannot change.
+    If a non-owner thread intends to access other members as well, both
+    is_open() and the other member accesses must be protected by m_mutex.
+    E.g. copy_to().
   */
-  bool is_open() const
-  {
-    ut_ad(state() == READ_VIEW_STATE_OPEN ||
-          state() == READ_VIEW_STATE_CLOSED);
-    return state() == READ_VIEW_STATE_OPEN;
-  }
+  std::atomic<bool> m_open;
 
+  /** For synchronisation with purge coordinator. */
+  mutable ib_mutex_t m_mutex;
 
   /**
-    Creates a snapshot where exactly the transactions serialized before this
+    trx id of creating transaction.
+    Used exclusively by the read view owner thread.
+  */
+  trx_id_t m_creator_trx_id;
+
+public:
+  ReadView(): m_open(false) { mutex_create(LATCH_ID_READ_VIEW, &m_mutex); }
+  ~ReadView() { mutex_free(&m_mutex); }
+
+
+  /**
+    Opens a read view where exactly the transactions serialized before this
     point in time are seen in the view.
 
+    View becomes visible to purge thread. Intended to be called by the ReadView
+    owner thread.
+
     @param[in,out] trx transaction
   */
-  inline void snapshot(trx_t *trx);
+  void open(trx_t *trx);
+
+
+  /**
+    Closes the view.
+
+    View becomes not visible to purge thread. Intended to be called by the
+    ReadView owner thread.
+  */
+  void close() { m_open.store(false, std::memory_order_relaxed); }
+
+
+  /** Returns true if view is open. */
+  bool is_open() const { return m_open.load(std::memory_order_relaxed); }
 
 
   /**
     Sets the creator transaction id.
 
     This should be set only for views created by RW transactions.
+    Intended to be called by the ReadView owner thread.
   */
   void set_creator_trx_id(trx_id_t id)
   {
@@ -196,97 +242,52 @@ loop:
   }
 
 
-	/** Check whether transaction id is valid.
-	@param[in]	id		transaction id to check
-	@param[in]	name		table name */
-	static void check_trx_id_sanity(
-		trx_id_t		id,
-		const table_name_t&	name);
-
-	/** Check whether the changes by id are visible.
-	@param[in]	id	transaction id to check against the view
-	@param[in]	name	table name
-	@return whether the view sees the modifications of id. */
-	bool changes_visible(
-		trx_id_t		id,
-		const table_name_t&	name) const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		if (id < m_up_limit_id || id == m_creator_trx_id) {
-
-			return(true);
-		}
-
-		check_trx_id_sanity(id, name);
-
-		if (id >= m_low_limit_id) {
-
-			return(false);
-
-		} else if (m_ids.empty()) {
-
-			return(true);
-		}
-
-		return(!std::binary_search(m_ids.begin(), m_ids.end(), id));
-	}
-
-	/**
-	@param id		transaction to check
-	@return true if view sees transaction id */
-	bool sees(trx_id_t id) const
-	{
-		return(id < m_up_limit_id);
-	}
-
-	/**
-	Write the limits to the file.
-	@param file		file to write to */
-	void print_limits(FILE* file) const
-	{
-		fprintf(file,
-			"Trx read view will not see trx with"
-			" id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
-			m_low_limit_id, m_up_limit_id);
-	}
-
-	/**
-	@return the low limit no */
-	trx_id_t low_limit_no() const
-	{
-		return(m_low_limit_no);
-	}
-
-	/**
-	@return the low limit id */
-	trx_id_t low_limit_id() const
-	{
-		return(m_low_limit_id);
-	}
-
-
-private:
-	/** The read should not see any transaction with trx id >= this
-	value. In other words, this is the "high water mark". */
-	trx_id_t	m_low_limit_id;
-
-	/** The read should see all trx ids which are strictly
-	smaller (<) than this value.  In other words, this is the
-	low water mark". */
-	trx_id_t	m_up_limit_id;
-
-	/** trx id of creating transaction, set to TRX_ID_MAX for free
-	views. */
-	trx_id_t	m_creator_trx_id;
-
-	/** Set of RW transactions that was active when this snapshot
-	was taken */
-	trx_ids_t	m_ids;
-
-	/** The view does not need to see the undo logs for transactions
-	whose transaction number is strictly smaller (<) than this value:
-	they can be removed in purge if not needed by other views */
-	trx_id_t	m_low_limit_no;
-};
+  /**
+    Writes the limits to the file.
+    @param file file to write to
+  */
+  void print_limits(FILE *file) const
+  {
+    mutex_enter(&m_mutex);
+    if (is_open())
+      fprintf(file, "Trx read view will not see trx with"
+                    " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
+                    low_limit_id(), up_limit_id());
+    mutex_exit(&m_mutex);
+  }
+
+
+  /**
+    A wrapper around ReadViewBase::changes_visible().
+    Intended to be called by the ReadView owner thread.
+  */
+  bool changes_visible(trx_id_t id, const table_name_t &name) const
+  { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); }
+
 
+  /**
+    A wrapper around ReadViewBase::append().
+    Intended to be called by the purge coordinator task.
+  */
+  void append_to(ReadViewBase *to) const
+  {
+    mutex_enter(&m_mutex);
+    if (is_open())
+      to->append(*this);
+    mutex_exit(&m_mutex);
+  }
+
+
+  /**
+    Declare the object mostly inaccessible.
+    innodb_monitor_set_option also operates on freed transaction objects.
+  */
+  void mem_noaccess() const
+  {
+    MEM_NOACCESS(&m_open, sizeof m_open);
+    /* m_mutex is accessed by innodb_show_mutex_status()
+    and innodb_monitor_update() even after trx_t::free() */
+    MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id);
+  }
+};
 #endif
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
index 43319e4c4d0..6f2201971d1 100644
--- a/storage/innobase/include/rem0cmp.h
+++ b/storage/innobase/include/rem0cmp.h
@@ -77,35 +77,62 @@ cmp_dfield_dfield(
 	const dfield_t*	dfield1,/*!< in: data field; must have type field set */
 	const dfield_t*	dfield2);/*!< in: data field */
 
-
+#ifdef UNIV_DEBUG
 /** Compare a GIS data tuple to a physical record.
 @param[in] dtuple data tuple
 @param[in] rec R-tree record
-@param[in] offsets rec_get_offsets(rec)
 @param[in] mode compare mode
 @retval negative if dtuple is less than rec */
-int
-cmp_dtuple_rec_with_gis(
-/*====================*/
-	const dtuple_t*	dtuple,
-	const rec_t*	rec,
-	const rec_offs*	offsets,
-	page_cur_mode_t	mode)
-	MY_ATTRIBUTE((nonnull));
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+                            page_cur_mode_t mode)
+  MY_ATTRIBUTE((nonnull));
+#endif
 
-/** Compare a GIS data tuple to a physical record in rtree non-leaf node.
-We need to check the page number field, since we don't store pk field in
-rtree non-leaf node.
-@param[in] dtuple data tuple
-@param[in] rec R-tree record
-@param[in] offsets rec_get_offsets(rec)
-@param[in] mode compare mode
-@retval negative if dtuple is less than rec */
-int
-cmp_dtuple_rec_with_gis_internal(
-	const dtuple_t*	dtuple,
-	const rec_t*	rec,
-	const rec_offs*	offsets);
+/** Compare two minimum bounding rectangles.
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+inline int cmp_geometry_field(const void *a, const void *b)
+{
+  const byte *mbr1= static_cast<const byte*>(a);
+  const byte *mbr2= static_cast<const byte*>(b);
+
+  static_assert(SPDIMS == 2, "compatibility");
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  /* First compare the lower left corner (xmin, ymin) of the two MBRs */
+  double x1= mach_double_read(mbr1);
+  double x2= mach_double_read(mbr2);
+  if (x1 > x2)
+    return 1;
+  if (x2 > x1)
+    return -1;
+
+  double y1= mach_double_read(mbr1 + sizeof(double) * SPDIMS);
+  double y2= mach_double_read(mbr2 + sizeof(double) * SPDIMS);
+
+  if (y1 > y2)
+    return 1;
+  if (y2 > y1)
+    return -1;
+
+  /* The lower left corners are equal; now compare the upper right corner */
+  x1= mach_double_read(mbr1 + sizeof(double));
+  x2= mach_double_read(mbr2 + sizeof(double));
+
+  if (x1 > x2)
+    return 1;
+  if (x2 > x1)
+    return -1;
+
+  y1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double));
+  y2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double));
+
+  if (y1 > y2)
+    return 1;
+  if (y2 > y1)
+    return -1;
+
+  return 0;
+}
 
 /** Compare a data tuple to a physical record.
 @param[in] dtuple data tuple
diff --git a/storage/innobase/include/rem0cmp.inl b/storage/innobase/include/rem0cmp.inl
index 4230543615a..6e21382d187 100644
--- a/storage/innobase/include/rem0cmp.inl
+++ b/storage/innobase/include/rem0cmp.inl
@@ -92,8 +92,7 @@ cmp_dfield_dfield_like_prefix(
 	uint cs_num = (uint) dtype_get_charset_coll(type->prtype);
 
 	if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) {
-		return(cs->coll->strnncoll(
-			       cs,
+		return(cs->strnncoll(
 			       static_cast<const uchar*>(
 				       dfield_get_data(dfield1)),
 			       dfield_get_len(dfield1),
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
index 23956dcb068..c2ebad91ecd 100644
--- a/storage/innobase/include/rem0rec.h
+++ b/storage/innobase/include/rem0rec.h
@@ -63,21 +63,21 @@ The status is stored in the low-order bits. */
 
 #ifndef UNIV_INNOCHECKSUM
 /** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
-static const rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
+constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
 /** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
-static const rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
+constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
 
 /** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
 significant bit denotes that the tail of a field is stored off-page. */
-static const rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
+constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
 
-static const size_t RECORD_OFFSET= 2;
-static const size_t INDEX_OFFSET=
+constexpr size_t RECORD_OFFSET= 2;
+constexpr size_t INDEX_OFFSET=
     RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs);
 #endif /* UNIV_INNOCHECKSUM */
 
 /* Length of the rec_get_offsets() header */
-static const size_t REC_OFFS_HEADER_SIZE=
+constexpr size_t REC_OFFS_HEADER_SIZE=
 #ifdef UNIV_DEBUG
 #ifndef UNIV_INNOCHECKSUM
     sizeof(rec_t *) / sizeof(rec_offs) +
@@ -88,9 +88,9 @@ static const size_t REC_OFFS_HEADER_SIZE=
 
 /* Number of elements that should be initially allocated for the
 offsets[] array, first passed to rec_get_offsets() */
-static const size_t REC_OFFS_NORMAL_SIZE= 300;
-static const size_t REC_OFFS_SMALL_SIZE= 18;
-static const size_t REC_OFFS_SEC_INDEX_SIZE=
+constexpr size_t REC_OFFS_NORMAL_SIZE= 300;
+constexpr size_t REC_OFFS_SMALL_SIZE= 18;
+constexpr size_t REC_OFFS_SEC_INDEX_SIZE=
     /* PK max key parts */ 16 + /* sec idx max key parts */ 16 +
     /* child page number for non-leaf pages */ 1;
 
@@ -117,30 +117,30 @@ enum field_type_t
 };
 
 /** without 2 upper bits */
-static const rec_offs DATA_MASK= 0x3fff;
+static constexpr rec_offs DATA_MASK= 0x3fff;
 /** 2 upper bits */
-static const rec_offs TYPE_MASK= ~DATA_MASK;
+static constexpr rec_offs TYPE_MASK= ~DATA_MASK;
 inline field_type_t get_type(rec_offs n)
 {
   return static_cast<field_type_t>(n & TYPE_MASK);
 }
 inline void set_type(rec_offs &n, field_type_t type)
 {
-  n= (n & DATA_MASK) | static_cast<rec_offs>(type);
+  n= static_cast<rec_offs>((n & DATA_MASK) | type);
 }
 inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; }
 inline rec_offs combine(rec_offs value, field_type_t type)
 {
-  return get_value(value) | static_cast<rec_offs>(type);
+  return static_cast<rec_offs>(get_value(value) | type);
 }
 
 /** Compact flag ORed to the extra size returned by rec_get_offsets() */
-const rec_offs REC_OFFS_COMPACT= ~(rec_offs(~0) >> 1);
+constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1));
 /** External flag in offsets returned by rec_get_offsets() */
-const rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
+constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
 /** Default value flag in offsets returned by rec_get_offsets() */
-const rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
-const rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
+constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
+constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
 /******************************************************//**
 The following function is used to get the pointer of the next chained record
 on the same page.
@@ -241,15 +241,6 @@ rec_get_n_owned_old(
 	const rec_t*	rec)	/*!< in: old-style physical record */
 	MY_ATTRIBUTE((warn_unused_result));
 /******************************************************//**
-The following function is used to set the number of owned records. */
-UNIV_INLINE
-void
-rec_set_n_owned_old(
-/*================*/
-	rec_t*	rec,		/*!< in: old-style physical record */
-	ulint	n_owned)	/*!< in: the number of owned */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************//**
 The following function is used to get the number of records owned by the
 previous directory record.
 @return number of owned records */
@@ -259,45 +250,18 @@ rec_get_n_owned_new(
 /*================*/
 	const rec_t*	rec)	/*!< in: new-style physical record */
 	MY_ATTRIBUTE((warn_unused_result));
-/******************************************************//**
-The following function is used to set the number of owned records. */
-UNIV_INLINE
-void
-rec_set_n_owned_new(
-/*================*/
-	rec_t*		rec,	/*!< in/out: new-style physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		n_owned)/*!< in: the number of owned */
-	MY_ATTRIBUTE((nonnull(1)));
+
 /******************************************************//**
 The following function is used to retrieve the info bits of
 a record.
 @return info bits */
 UNIV_INLINE
-ulint
+byte
 rec_get_info_bits(
 /*==============*/
 	const rec_t*	rec,	/*!< in: physical record */
 	ulint		comp)	/*!< in: nonzero=compact page format */
 	MY_ATTRIBUTE((warn_unused_result));
-/******************************************************//**
-The following function is used to set the info bits of a record. */
-UNIV_INLINE
-void
-rec_set_info_bits_old(
-/*==================*/
-	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	bits)	/*!< in: info bits */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************//**
-The following function is used to set the info bits of a record. */
-UNIV_INLINE
-void
-rec_set_info_bits_new(
-/*==================*/
-	rec_t*	rec,	/*!< in/out: new-style physical record */
-	ulint	bits)	/*!< in: info bits */
-	MY_ATTRIBUTE((nonnull));
 
 /** Determine the status bits of a non-REDUNDANT record.
 @param[in]	rec	ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
@@ -314,13 +278,11 @@ rec_get_status(const rec_t* rec)
 /** Set the status bits of a non-REDUNDANT record.
 @param[in,out]	rec	ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
 @param[in]	bits	status bits */
-inline
-void
-rec_set_status(rec_t* rec, byte bits)
+inline void rec_set_status(rec_t *rec, byte bits)
 {
-	ut_ad(bits <= REC_STATUS_INSTANT);
-	rec[-REC_NEW_STATUS] = (rec[-REC_NEW_STATUS] & ~REC_NEW_STATUS_MASK)
-		| bits;
+  ut_ad(bits <= REC_STATUS_INSTANT);
+  rec[-REC_NEW_STATUS]= static_cast<byte>((rec[-REC_NEW_STATUS] &
+                                           ~REC_NEW_STATUS_MASK) | bits);
 }
 
 /** Get the length of added field count in a REC_STATUS_INSTANT record.
@@ -361,7 +323,7 @@ inline void rec_set_n_add_field(byte*& header, ulint n_add)
 	if (n_add < 0x80) {
 		*header-- = byte(n_add);
 	} else {
-		*header-- = byte(n_add) | 0x80;
+		*header-- = byte(byte(n_add) | 0x80);
 		*header-- = byte(n_add >> 7);
 	}
 }
@@ -369,9 +331,9 @@ inline void rec_set_n_add_field(byte*& header, ulint n_add)
 /******************************************************//**
 The following function is used to retrieve the info and status
 bits of a record.  (Only compact records have status bits.)
-@return info bits */
+@return info and status bits */
 UNIV_INLINE
-ulint
+byte
 rec_get_info_and_status_bits(
 /*=========================*/
 	const rec_t*	rec,	/*!< in: physical record */
@@ -399,25 +361,6 @@ rec_get_deleted_flag(
 	ulint		comp)	/*!< in: nonzero=compact page format */
 	MY_ATTRIBUTE((warn_unused_result));
 /******************************************************//**
-The following function is used to set the deleted bit. */
-UNIV_INLINE
-void
-rec_set_deleted_flag_old(
-/*=====================*/
-	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	flag)	/*!< in: nonzero if delete marked */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************//**
-The following function is used to set the deleted bit. */
-UNIV_INLINE
-void
-rec_set_deleted_flag_new(
-/*=====================*/
-	rec_t*		rec,	/*!< in/out: new-style physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		flag)	/*!< in: nonzero if delete marked */
-	MY_ATTRIBUTE((nonnull(1)));
-/******************************************************//**
 The following function tells if a new-style record is a node pointer.
 @return TRUE if node pointer */
 UNIV_INLINE
@@ -437,16 +380,6 @@ rec_get_heap_no_old(
 	const rec_t*	rec)	/*!< in: physical record */
 	MY_ATTRIBUTE((warn_unused_result));
 /******************************************************//**
-The following function is used to set the heap number
-field in an old-style record. */
-UNIV_INLINE
-void
-rec_set_heap_no_old(
-/*================*/
-	rec_t*	rec,	/*!< in: physical record */
-	ulint	heap_no)/*!< in: the heap number */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************//**
 The following function is used to get the order number
 of a new-style record in the heap of the index page.
 @return heap order number */
@@ -457,16 +390,6 @@ rec_get_heap_no_new(
 	const rec_t*	rec)	/*!< in: physical record */
 	MY_ATTRIBUTE((warn_unused_result));
 /******************************************************//**
-The following function is used to set the heap number
-field in a new-style record. */
-UNIV_INLINE
-void
-rec_set_heap_no_new(
-/*================*/
-	rec_t*	rec,	/*!< in/out: physical record */
-	ulint	heap_no)/*!< in: the heap number */
-	MY_ATTRIBUTE((nonnull));
-/******************************************************//**
 The following function is used to test whether the data offsets
 in the record are stored in one-byte or two-byte format.
 @return TRUE if 1-byte form */
@@ -918,26 +841,6 @@ rec_offs_n_extern(
 /*==============*/
 	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
 	MY_ATTRIBUTE((warn_unused_result));
-/***********************************************************//**
-This is used to modify the value of an already existing field in a record.
-The previous value must have exactly the same size as the new value. If len
-is UNIV_SQL_NULL then the field is treated as an SQL null.
-For records in ROW_FORMAT=COMPACT (new-style records), len must not be
-UNIV_SQL_NULL unless the field already is SQL null. */
-UNIV_INLINE
-void
-rec_set_nth_field(
-/*==============*/
-	rec_t*		rec,	/*!< in: record */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		n,	/*!< in: index number of the field */
-	const void*	data,	/*!< in: pointer to the data if not SQL null */
-	ulint		len)	/*!< in: length of the data or UNIV_SQL_NULL.
-				If not SQL null, must have the same
-				length as the previous value.
-				If SQL null, previous value must be
-				SQL null. */
-	MY_ATTRIBUTE((nonnull(1,2)));
 /**********************************************************//**
 The following function returns the data size of an old-style physical
 record, that is the sum of field lengths. SQL null fields
diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl
index 6cecd9f1f08..30c72a7415a 100644
--- a/storage/innobase/include/rem0rec.inl
+++ b/storage/innobase/include/rem0rec.inl
@@ -123,27 +123,10 @@ and the shift needed to obtain each bit-field of the record. */
 # error "sum of new-style masks != 0xFFFFFFUL"
 #endif
 
-/***********************************************************//**
-Sets the value of the ith field SQL null bit of an old-style record. */
-void
-rec_set_nth_field_null_bit(
-/*=======================*/
-	rec_t*	rec,	/*!< in: record */
-	ulint	i,	/*!< in: ith field */
-	ibool	val);	/*!< in: value to set */
-/***********************************************************//**
-Sets an old-style record field to SQL null.
-The physical size of the field is not changed. */
-void
-rec_set_nth_field_sql_null(
-/*=======================*/
-	rec_t*	rec,	/*!< in: record */
-	ulint	n);	/*!< in: index of the field */
-
 /******************************************************//**
 Gets a bit field from within 1 byte. */
 UNIV_INLINE
-ulint
+byte
 rec_get_bit_field_1(
 /*================*/
 	const rec_t*	rec,	/*!< in: pointer to record origin */
@@ -151,9 +134,7 @@ rec_get_bit_field_1(
 	ulint		mask,	/*!< in: mask used to filter bits */
 	ulint		shift)	/*!< in: shift right applied after masking */
 {
-	ut_ad(rec);
-
-	return((mach_read_from_1(rec - offs) & mask) >> shift);
+  return static_cast<byte>((*(rec - offs) & mask) >> shift);
 }
 
 /******************************************************//**
@@ -504,19 +485,6 @@ rec_get_n_owned_old(
 }
 
 /******************************************************//**
-The following function is used to set the number of owned records. */
-UNIV_INLINE
-void
-rec_set_n_owned_old(
-/*================*/
-	rec_t*	rec,		/*!< in: old-style physical record */
-	ulint	n_owned)	/*!< in: the number of owned */
-{
-	rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED,
-			    REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
-}
-
-/******************************************************//**
 The following function is used to get the number of records owned by the
 previous directory record.
 @return number of owned records */
@@ -531,85 +499,38 @@ rec_get_n_owned_new(
 }
 
 /******************************************************//**
-The following function is used to set the number of owned records. */
-UNIV_INLINE
-void
-rec_set_n_owned_new(
-/*================*/
-	rec_t*		rec,	/*!< in/out: new-style physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		n_owned)/*!< in: the number of owned */
-{
-	rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
-			    REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
-	if (page_zip && rec_get_status(rec) != REC_STATUS_SUPREMUM) {
-		page_zip_rec_set_owned(page_zip, rec, n_owned);
-	}
-}
-
-/******************************************************//**
 The following function is used to retrieve the info bits of a record.
 @return info bits */
 UNIV_INLINE
-ulint
+byte
 rec_get_info_bits(
 /*==============*/
 	const rec_t*	rec,	/*!< in: physical record */
 	ulint		comp)	/*!< in: nonzero=compact page format */
 {
-	const ulint	val = rec_get_bit_field_1(
+	return rec_get_bit_field_1(
 		rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
 		REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
-	return(val);
-}
-
-/******************************************************//**
-The following function is used to set the info bits of a record. */
-UNIV_INLINE
-void
-rec_set_info_bits_old(
-/*==================*/
-	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	bits)	/*!< in: info bits */
-{
-	rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS,
-			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
-}
-/******************************************************//**
-The following function is used to set the info bits of a record. */
-UNIV_INLINE
-void
-rec_set_info_bits_new(
-/*==================*/
-	rec_t*	rec,	/*!< in/out: new-style physical record */
-	ulint	bits)	/*!< in: info bits */
-{
-	rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS,
-			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
 }
 
 /******************************************************//**
 The following function is used to retrieve the info and status
 bits of a record.  (Only compact records have status bits.)
-@return info bits */
+@return info and status bits */
 UNIV_INLINE
-ulint
+byte
 rec_get_info_and_status_bits(
 /*=========================*/
 	const rec_t*	rec,	/*!< in: physical record */
 	ulint		comp)	/*!< in: nonzero=compact page format */
 {
-	ulint	bits;
-	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
-			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
-	if (comp) {
-		bits = rec_get_info_bits(rec, TRUE)
-			| ulint(rec_get_status(rec));
-	} else {
-		bits = rec_get_info_bits(rec, FALSE);
-		ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
-	}
-	return(bits);
+  compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+                        & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+  if (comp)
+    return static_cast<byte>(rec_get_info_bits(rec, TRUE) |
+                             rec_get_status(rec));
+  else
+    return rec_get_info_bits(rec, FALSE);
 }
 /******************************************************//**
 The following function is used to set the info and status
@@ -624,7 +545,9 @@ rec_set_info_and_status_bits(
 	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
 			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
 	rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
-	rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK);
+	rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK,
+			    REC_NEW_INFO_BITS,
+			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
 }
 
 /******************************************************//**
@@ -649,55 +572,6 @@ rec_get_deleted_flag(
 }
 
 /******************************************************//**
-The following function is used to set the deleted bit. */
-UNIV_INLINE
-void
-rec_set_deleted_flag_old(
-/*=====================*/
-	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	flag)	/*!< in: nonzero if delete marked */
-{
-	ulint	val;
-
-	val = rec_get_info_bits(rec, FALSE);
-
-	if (flag) {
-		val |= REC_INFO_DELETED_FLAG;
-	} else {
-		val &= ~REC_INFO_DELETED_FLAG;
-	}
-
-	rec_set_info_bits_old(rec, val);
-}
-
-/******************************************************//**
-The following function is used to set the deleted bit. */
-UNIV_INLINE
-void
-rec_set_deleted_flag_new(
-/*=====================*/
-	rec_t*		rec,	/*!< in/out: new-style physical record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		flag)	/*!< in: nonzero if delete marked */
-{
-	ulint	val;
-
-	val = rec_get_info_bits(rec, TRUE);
-
-	if (flag) {
-		val |= REC_INFO_DELETED_FLAG;
-	} else {
-		val &= ~REC_INFO_DELETED_FLAG;
-	}
-
-	rec_set_info_bits_new(rec, val);
-
-	if (page_zip) {
-		page_zip_rec_set_deleted(page_zip, rec, flag);
-	}
-}
-
-/******************************************************//**
 The following function tells if a new-style record is a node pointer.
 @return TRUE if node pointer */
 UNIV_INLINE
@@ -724,20 +598,6 @@ rec_get_heap_no_old(
 }
 
 /******************************************************//**
-The following function is used to set the heap number
-field in an old-style record. */
-UNIV_INLINE
-void
-rec_set_heap_no_old(
-/*================*/
-	rec_t*	rec,	/*!< in: physical record */
-	ulint	heap_no)/*!< in: the heap number */
-{
-	rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO,
-			    REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
-}
-
-/******************************************************//**
 The following function is used to get the order number
 of a new-style record in the heap of the index page.
 @return heap order number */
@@ -752,20 +612,6 @@ rec_get_heap_no_new(
 }
 
 /******************************************************//**
-The following function is used to set the heap number
-field in a new-style record. */
-UNIV_INLINE
-void
-rec_set_heap_no_new(
-/*================*/
-	rec_t*	rec,	/*!< in/out: physical record */
-	ulint	heap_no)/*!< in: the heap number */
-{
-	rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO,
-			    REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
-}
-
-/******************************************************//**
 The following function is used to test whether the data offsets in the record
 are stored in one-byte or two-byte format.
 @return TRUE if 1-byte form */
@@ -1139,50 +985,6 @@ rec_get_nth_field_size(
 	return(next_os - os);
 }
 
-/***********************************************************//**
-This is used to modify the value of an already existing field in a record.
-The previous value must have exactly the same size as the new value. If len
-is UNIV_SQL_NULL then the field is treated as an SQL null.
-For records in ROW_FORMAT=COMPACT (new-style records), len must not be
-UNIV_SQL_NULL unless the field already is SQL null. */
-UNIV_INLINE
-void
-rec_set_nth_field(
-/*==============*/
-	rec_t*		rec,	/*!< in: record */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		n,	/*!< in: index number of the field */
-	const void*	data,	/*!< in: pointer to the data
-				if not SQL null */
-	ulint		len)	/*!< in: length of the data or UNIV_SQL_NULL */
-{
-	byte*	data2;
-	ulint	len2;
-
-	ut_ad(rec_offs_validate(rec, NULL, offsets));
-	ut_ad(!rec_offs_nth_default(offsets, n));
-
-	if (len == UNIV_SQL_NULL) {
-		if (!rec_offs_nth_sql_null(offsets, n)) {
-			ut_a(!rec_offs_comp(offsets));
-			rec_set_nth_field_sql_null(rec, n);
-		}
-
-		return;
-	}
-
-	data2 = (byte*)rec_get_nth_field(rec, offsets, n, &len2);
-	if (len2 == UNIV_SQL_NULL) {
-		ut_ad(!rec_offs_comp(offsets));
-		rec_set_nth_field_null_bit(rec, n, FALSE);
-		ut_ad(len == rec_get_nth_field_size(rec, n));
-	} else {
-		ut_ad(len2 == len);
-	}
-
-	ut_memcpy(data2, data, len);
-}
-
 /**********************************************************//**
 The following function returns the data size of an old-style physical
 record, that is the sum of field lengths. SQL null fields
@@ -1321,7 +1123,7 @@ rec_copy(
 	extra_len = rec_offs_extra_size(offsets);
 	data_len = rec_offs_data_size(offsets);
 
-	ut_memcpy(buf, rec - extra_len, extra_len + data_len);
+	memcpy(buf, rec - extra_len, extra_len + data_len);
 
 	return((byte*) buf + extra_len);
 }
@@ -1367,8 +1169,8 @@ rec_get_converted_size(
 		ut_ad(dtuple->n_fields > 1);
 	} else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
 		   == REC_STATUS_NODE_PTR) {
-		ut_ad(dtuple->n_fields
-		      == dict_index_get_n_unique_in_tree_nonleaf(index) + 1);
+		ut_ad(dtuple->n_fields - 1
+		      == dict_index_get_n_unique_in_tree_nonleaf(index));
 	} else if (index->table->id == DICT_INDEXES_ID) {
 		/* The column SYS_INDEXES.MERGE_THRESHOLD was
 		instantly added in MariaDB 10.2.2 (MySQL 5.7). */
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
index 11a164f5130..0e4075a94c3 100644
--- a/storage/innobase/include/rem0types.h
+++ b/storage/innobase/include/rem0types.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -40,11 +40,12 @@ typedef unsigned short int rec_offs;
 
 /* Maximum number of user defined fields/columns. The reserved columns
 are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR.
-We need "* 2" because mlog_parse_index() creates a dummy table object
-possibly, with some of the system columns in it, and then adds the 3
-system columns (again) using dict_table_add_system_columns(). The problem
-is that mlog_parse_index() cannot recognize the system columns by
-just having n_fields, n_uniq and the lengths of the columns. */
+Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index()
+possibly created a dummy table object with some of the system columns
+in it, and then added the 3 system columns (again) using
+dict_table_add_system_columns().
+For now, we will keep this limitation to maintain file format compatibility
+with older versions. */
 #define REC_MAX_N_USER_FIELDS	(REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2)
 
 /* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
index 0189bb7a4ff..99c85601d5d 100644
--- a/storage/innobase/include/row0ftsort.h
+++ b/storage/innobase/include/row0ftsort.h
@@ -33,6 +33,7 @@ Created 10/13/2010 Jimmy Yang
 #include "rem0types.h"
 #include "row0merge.h"
 #include "btr0bulk.h"
+#include "srv0srv.h"
 
 /** This structure defineds information the scan thread will fetch
 and put to the linked list for parallel tokenization/sort threads
@@ -65,7 +66,6 @@ struct fts_psort_common_t {
 	trx_t*			trx;		/*!< transaction */
 	fts_psort_t*		all_info;	/*!< all parallel sort info */
 	os_event_t		sort_event;	/*!< sort event */
-	os_event_t		merge_event;	/*!< merge event */
 	ibool			opt_doc_id_size;/*!< whether to use 4 bytes
 						instead of 8 bytes integer to
 						store Doc ID during sort, if
@@ -81,17 +81,13 @@ struct fts_psort_t {
 						/*!< sort file */
 	row_merge_block_t*	merge_block[FTS_NUM_AUX_INDEX];
 						/*!< buffer to write to file */
-	row_merge_block_t*	block_alloc[FTS_NUM_AUX_INDEX];
-						/*!< buffer to allocated */
 	row_merge_block_t*	crypt_block[FTS_NUM_AUX_INDEX];
 						/*!< buffer to crypt data */
-	row_merge_block_t*	crypt_alloc[FTS_NUM_AUX_INDEX];
-						/*!< buffer to allocated */
-	ulint			child_status;	/*!< child thread status */
-	ulint			state;		/*!< parent thread state */
+	ulint			child_status;	/*!< child task status */
+	ulint			state;		/*!< parent state */
 	fts_doc_list_t		fts_doc_list;	/*!< doc list to process */
 	fts_psort_common_t*	psort_common;	/*!< ptr to all psort info */
-	os_thread_t		thread_hdl;	/*!< thread handler */
+	tpool::waitable_task*	task;	/*!< threadpool task */
 	dberr_t			error;		/*!< db error during psort */
 	ulint			memory_used;	/*!< memory used by fts_doc_list */
 	ib_mutex_t		mutex;		/*!< mutex for fts_doc_list */
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
index 35826e2f4d5..978a3f906c0 100644
--- a/storage/innobase/include/row0log.h
+++ b/storage/innobase/include/row0log.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index 3252af0062b..1d7f9bb145b 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -205,22 +205,6 @@ row_merge_file_destroy_low(
 	const pfs_os_file_t&	fd);	/*!< in: merge file descriptor */
 
 /*********************************************************************//**
-Rename the tables in the data dictionary.  The data dictionary must
-have been locked exclusively by the caller, because the transaction
-will not be committed.
-@return error code or DB_SUCCESS */
-dberr_t
-row_merge_rename_tables_dict(
-/*=========================*/
-	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
-					tmp_name */
-	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
-					old_table->name */
-	const char*	tmp_name,	/*!< in: new name for old_table */
-	trx_t*		trx)		/*!< in/out: dictionary transaction */
-	MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/*********************************************************************//**
 Rename an index in the dictionary that was created. The data
 dictionary must have been locked exclusively by the caller, because
 the transaction will not be committed.
@@ -282,12 +266,6 @@ row_merge_drop_table(
 	dict_table_t*	table)		/*!< in: table instance to drop */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
 
-/** Write an MLOG_INDEX_LOAD record to indicate in the redo-log
-that redo-logging of individual index pages was disabled, and
-the flushing of such pages to the data files was completed.
-@param[in]	index	an index tree on which redo logging was disabled */
-void row_merge_write_redo(const dict_index_t* index);
-
 /** Build indexes on a table by reading a clustered index, creating a temporary
 file containing index entries, merge sorting these index entries and inserting
 sorted index entries to indexes.
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
index 51afbedd649..04ac6bb9d70 100644
--- a/storage/innobase/include/row0mysql.h
+++ b/storage/innobase/include/row0mysql.h
@@ -271,8 +271,8 @@ row_update_for_mysql(
 	row_prebuilt_t*		prebuilt)
 	MY_ATTRIBUTE((warn_unused_result));
 
-/** This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
-session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
 Before calling this function row_search_for_mysql() must have
 initialized prebuilt->new_rec_locks to store the information which new
 record locks really were set. This function removes a newly set
@@ -685,8 +685,8 @@ struct row_prebuilt_t {
 					updated */
 	dtuple_t*	clust_ref;	/*!< prebuilt dtuple used in
 					sel/upd/del */
-	ulint		select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
-	ulint		stored_select_lock_type;/*!< this field is used to
+	lock_mode	select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+	lock_mode	stored_select_lock_type;/*!< this field is used to
 					remember the original select_lock_type
 					that was decided in ha_innodb.cc,
 					::store_lock(), ::external_lock(),
@@ -694,8 +694,9 @@ struct row_prebuilt_t {
 	ulint		row_read_type;	/*!< ROW_READ_WITH_LOCKS if row locks
 					should be the obtained for records
 					under an UPDATE or DELETE cursor.
-					If innodb_locks_unsafe_for_binlog
-					is TRUE, this can be set to
+					At READ UNCOMMITTED or
+					READ COMMITTED isolation level,
+					this can be set to
 					ROW_READ_TRY_SEMI_CONSISTENT, so that
 					if the row under an UPDATE or DELETE
 					cursor was locked by another
@@ -717,8 +718,7 @@ struct row_prebuilt_t {
 					cases; note that this breaks
 					serializability. */
 	ulint		new_rec_locks;	/*!< normally 0; if
-					srv_locks_unsafe_for_binlog is
-					TRUE or session is using READ
+					the session is using READ
 					COMMITTED or READ UNCOMMITTED
 					isolation level, set in
 					row_search_for_mysql() if we set a new
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
index 1505fb9663a..091d80adec5 100644
--- a/storage/innobase/include/row0purge.h
+++ b/storage/innobase/include/row0purge.h
@@ -32,9 +32,11 @@ Created 3/14/1997 Heikki Tuuri
 #include "btr0pcur.h"
 #include "trx0types.h"
 #include "row0types.h"
-#include "ut0vec.h"
 #include "row0mysql.h"
+#include "mysqld.h"
+#include <queue>
 
+class MDL_ticket;
 /** Determines if it is possible to remove a secondary index entry.
 Removal is possible if the secondary index entry does not refer to any
 not delete marked version of a clustered index record where DB_TRX_ID
@@ -79,6 +81,15 @@ row_purge_step(
 	que_thr_t*	thr)	/*!< in: query thread */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
 
+/** Info required to purge a record */
+struct trx_purge_rec_t
+{
+  /** Record to purge */
+  trx_undo_rec_t *undo_rec;
+  /** File pointer to undo record */
+  roll_ptr_t roll_ptr;
+};
+
 /* Purge node structure */
 
 struct purge_node_t{
@@ -86,7 +97,6 @@ struct purge_node_t{
 	/*----------------------*/
 	/* Local storage for this graph node */
 	roll_ptr_t	roll_ptr;/* roll pointer to undo log record */
-	ib_vector_t*    undo_recs;/*!< Undo recs to purge */
 
 	undo_no_t	undo_no;/*!< undo number of the record */
 
@@ -127,21 +137,36 @@ public:
 #endif
 	trx_id_t	trx_id;	/*!< trx id for this purging record */
 
-	/** Virtual column information about opening of MariaDB table.
-	It resets after processing each undo log record. */
-	purge_vcol_info_t	vcol_info;
+	/** meta-data lock for the table name */
+	MDL_ticket*		mdl_ticket;
+
+	/** table id of the previous undo log record */
+	table_id_t		last_table_id;
+
+	/** THD used for purge; taken from current_thd in start() */
+	THD*			purge_thd;
+
+	/** number of undo log records for which the MDL was retained */
+	int			mdl_hold_recs;
+
+	/** Undo recs to purge */
+	std::queue<trx_purge_rec_t>	undo_recs;
 
 	/** Constructor */
 	explicit purge_node_t(que_thr_t* parent) :
 		common(QUE_NODE_PURGE, parent),
-		undo_recs(NULL),
 		unavailable_table_id(0),
+		table(NULL),
 		heap(mem_heap_create(256)),
 #ifdef UNIV_DEBUG
 		in_progress(false),
 #endif
-		vcol_info()
-	{}
+		mdl_ticket(NULL),
+		last_table_id(0),
+		purge_thd(NULL),
+		mdl_hold_recs(0)
+	{
+	}
 
 #ifdef UNIV_DEBUG
 	/***********************************************************//**
@@ -154,11 +179,6 @@ public:
 	bool validate_pcur();
 #endif
 
-	/** Whether purge failed to open the maria table for virtual column
-	computation.
-	@return true if the table failed to open. */
-	bool vcol_op_failed() const { return !vcol_info.validate(); }
-
 	/** Determine if a table should be skipped in purge.
 	@param[in]	table_id	table identifier
 	@return	whether to skip the table lookup and processing */
@@ -177,33 +197,72 @@ public:
 		def_trx_id = limit;
 	}
 
-	/** Start processing an undo log record. */
-	void start()
-	{
-		ut_ad(in_progress);
-		DBUG_ASSERT(common.type == QUE_NODE_PURGE);
-
-		table = NULL;
-		row = NULL;
-		ref = NULL;
-		index = NULL;
-		update = NULL;
-		found_clust = FALSE;
-		rec_type = ULINT_UNDEFINED;
-		cmpl_info = ULINT_UNDEFINED;
-	}
+  /** Start processing an undo log record. */
+  void start()
+  {
+    ut_ad(in_progress);
+    DBUG_ASSERT(common.type == QUE_NODE_PURGE);
 
-	/** Reset the state at end
-	@return the query graph parent */
-	que_node_t* end()
-	{
-		DBUG_ASSERT(common.type == QUE_NODE_PURGE);
-		undo_recs = NULL;
-		ut_d(in_progress = false);
-		vcol_info.reset();
-		mem_heap_empty(heap);
-		return common.parent;
-	}
+    row= nullptr;
+    ref= nullptr;
+    index= nullptr;
+    update= nullptr;
+    found_clust= FALSE;
+    rec_type= ULINT_UNDEFINED;
+    cmpl_info= ULINT_UNDEFINED;
+    if (!purge_thd)
+      purge_thd= current_thd;
+  }
+
+
+  /** Close the existing table and release the MDL for it. */
+  void close_table()
+  {
+    last_table_id= 0;
+    if (!table)
+    {
+      ut_ad(!mdl_ticket);
+      return;
+    }
+
+    innobase_reset_background_thd(purge_thd);
+    dict_table_close(table, false, false, purge_thd, mdl_ticket);
+    table= nullptr;
+    mdl_ticket= nullptr;
+  }
+
+
+  /** Retain the MDL for the table id, or release it by closing the table.
+  @param[in]	table_id	table id to be processed
+  @return whether the MDL was retained */
+  bool retain_mdl(table_id_t table_id)
+  {
+    ut_ad(table_id);
+    if (last_table_id == table_id && mdl_hold_recs < 100)
+    {
+      ut_ad(table);
+      mdl_hold_recs++;
+      return true;
+    }
+
+    mdl_hold_recs= 0;
+    close_table();
+    return false;
+  }
+
+
+  /** Reset the state at end
+  @return the query graph parent */
+  que_node_t* end()
+  {
+    DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+    close_table();
+    ut_ad(undo_recs.empty());
+    ut_d(in_progress= false);
+    purge_thd= nullptr;
+    mem_heap_empty(heap);
+    return common.parent;
+  }
 };
 
 #endif
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index a683a2178e4..eb83a4bcad6 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -342,7 +342,7 @@ struct sel_node_t{
 	ibool		set_x_locks;	/*!< TRUE if the cursor is for update or
 					delete, which means that a row x-lock
 					should be placed on the cursor row */
-	ulint		row_lock_mode;	/*!< LOCK_X or LOCK_S */
+	lock_mode	row_lock_mode;	/*!< LOCK_X or LOCK_S */
 	ulint		n_tables;	/*!< number of tables */
 	ulint		fetch_table;	/*!< number of the next table to access
 					in the join */
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
index 048b161b884..5e737c1c14f 100644
--- a/storage/innobase/include/row0types.h
+++ b/storage/innobase/include/row0types.h
@@ -52,97 +52,3 @@ struct row_log_t;
 
 /* MySQL data types */
 struct TABLE;
-
-/** Purge virtual column node information. */
-struct purge_vcol_info_t
-{
-private:
-	/** Is there a possible need to evaluate virtual columns? */
-	bool	requested;
-	/** Do we have to evaluate virtual columns (using mariadb_table)? */
-	bool	used;
-
-	/** True if it is used for the first time. */
-	bool	first_use;
-
-	/** MariaDB table opened for virtual column computation. */
-	TABLE*	mariadb_table;
-
-public:
-	/** Default constructor */
-	purge_vcol_info_t() :
-		requested(false), used(false), first_use(false),
-		mariadb_table(NULL)
-	{}
-	/** Reset the state. */
-	void reset()
-	{
-		requested = false;
-		used = false;
-		first_use = false;
-		mariadb_table = NULL;
-	}
-
-	/** Validate the virtual column information.
-	@return true if the mariadb table opened successfully
-	or doesn't try to calculate virtual column. */
-	bool validate() const { return !used || mariadb_table; }
-
-	/** @return the table handle for evaluating virtual columns */
-	TABLE* table() const { return mariadb_table; }
-
-	/** Set the table handle for evaluating virtual columns.
-	@param[in]	table	table handle */
-	void set_table(TABLE* table)
-	{
-		ut_ad(!table || is_first_fetch());
-		mariadb_table = table;
-	}
-
-	/** Note that virtual column information may be needed. */
-	void set_requested()
-	{
-		ut_ad(!used);
-		ut_ad(!first_use);
-		ut_ad(!mariadb_table);
-		requested = true;
-	}
-
-	/** @return whether the virtual column information may be needed */
-	bool is_requested() const { return requested; }
-
-	/** Note that the virtual column information is needed. */
-	void set_used()
-	{
-		ut_ad(requested);
-
-		if (first_use) {
-			first_use = false;
-			ut_ad(used);
-			return;
-		}
-
-		if (!used) {
-			first_use = used = true;
-		}
-	}
-
-	/** @return whether the virtual column information is needed */
-	bool is_used() const
-	{
-		ut_ad(!first_use || used);
-		ut_ad(!used || requested);
-		ut_ad(used || !mariadb_table);
-		return used;
-	}
-
-	/** Check whether it fetches mariadb table for the first time.
-	@return true if first time tries to open mariadb table. */
-	bool is_first_fetch() const
-	{
-		ut_ad(!first_use || used);
-		ut_ad(!used || requested);
-		ut_ad(used || !mariadb_table);
-		return first_use;
-	}
-};
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
index 2615bf55a04..9ee5d77f5e5 100644
--- a/storage/innobase/include/row0upd.h
+++ b/storage/innobase/include/row0upd.h
@@ -75,7 +75,7 @@ void
 upd_field_set_field_no(
 /*===================*/
 	upd_field_t*	upd_field,	/*!< in: update vector field */
-	ulint		field_no,	/*!< in: field number in a clustered
+	uint16_t	field_no,	/*!< in: field number in a clustered
 					index */
 	dict_index_t*	index);
 
@@ -87,7 +87,7 @@ UNIV_INLINE
 void
 upd_field_set_v_field_no(
 	upd_field_t*	upd_field,
-	ulint		field_no,
+	uint16_t	field_no,
 	dict_index_t*	index);
 /*********************************************************************//**
 Returns a field of an update vector by field_no.
@@ -97,24 +97,10 @@ const upd_field_t*
 upd_get_field_by_field_no(
 /*======================*/
 	const upd_t*	update,	/*!< in: update vector */
-	ulint		no,	/*!< in: field_no */
+	uint16_t	no,	/*!< in: field_no */
 	bool		is_virtual) /*!< in: if it is a virtual column */
 	MY_ATTRIBUTE((warn_unused_result));
 /*********************************************************************//**
-Updates the trx id and roll ptr field in a clustered index record when
-a row is updated or marked deleted. */
-UNIV_INLINE
-void
-row_upd_rec_sys_fields(
-/*===================*/
-	rec_t*		rec,	/*!< in/out: record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	dict_index_t*	index,	/*!< in: clustered index */
-	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	const trx_t*	trx,	/*!< in: transaction */
-	roll_ptr_t	roll_ptr);/*!< in: DB_ROLL_PTR to the undo log */
-/*********************************************************************//**
 Creates an update node for a query graph.
 @return own: update node */
 upd_node_t*
@@ -122,17 +108,6 @@ upd_node_create(
 /*============*/
 	mem_heap_t*	heap);	/*!< in: mem heap where created */
 /***********************************************************//**
-Writes to the redo log the new values of the fields occurring in the index. */
-void
-row_upd_index_write_log(
-/*====================*/
-	const upd_t*	update,	/*!< in: update vector */
-	byte*		log_ptr,/*!< in: pointer to mlog buffer: must
-				contain at least MLOG_BUF_MARGIN bytes
-				of free space; the buffer is closed
-				within this function */
-	mtr_t*		mtr);	/*!< in: mtr into whose log to write */
-/***********************************************************//**
 Returns TRUE if row update changes size of some field in index or if some
 field to be updated is stored externally in rec or update.
 @return TRUE if the update changes the size of some field in index or
@@ -151,21 +126,6 @@ row_upd_changes_disowned_external(
 /*==============================*/
 	const upd_t*	update)	/*!< in: update vector */
 	MY_ATTRIBUTE((nonnull, warn_unused_result));
-/***********************************************************//**
-Replaces the new column values stored in the update vector to the
-record given. No field size changes are allowed. This function is
-usually invoked on a clustered index. The only use case for a
-secondary index is row_ins_sec_index_entry_by_modify() or its
-counterpart in ibuf_insert_to_index_page(). */
-void
-row_upd_rec_in_place(
-/*=================*/
-	rec_t*		rec,	/*!< in/out: record where replaced */
-	dict_index_t*	index,	/*!< in: the index the record belongs to */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	const upd_t*	update,	/*!< in: update vector */
-	page_zip_des_t*	page_zip);/*!< in: compressed page with enough space
-				available, or NULL */
 
 /***************************************************************//**
 Builds an update vector from those fields which in a secondary index entry
@@ -340,52 +300,17 @@ que_thr_t*
 row_upd_step(
 /*=========*/
 	que_thr_t*	thr);	/*!< in: query thread */
-/*********************************************************************//**
-Parses the log data of system field values.
-@return log data end or NULL */
-byte*
-row_upd_parse_sys_vals(
-/*===================*/
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	ulint*		pos,	/*!< out: TRX_ID position in record */
-	trx_id_t*	trx_id,	/*!< out: trx id */
-	roll_ptr_t*	roll_ptr);/*!< out: roll ptr */
-/*********************************************************************//**
-Updates the trx id and roll ptr field in a clustered index record in database
-recovery. */
-void
-row_upd_rec_sys_fields_in_recovery(
-/*===============================*/
-	rec_t*		rec,	/*!< in/out: record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		pos,	/*!< in: TRX_ID position in rec */
-	trx_id_t	trx_id,	/*!< in: transaction id */
-	roll_ptr_t	roll_ptr);/*!< in: roll ptr of the undo log record */
-/*********************************************************************//**
-Parses the log data written by row_upd_index_write_log.
-@return log data end or NULL */
-byte*
-row_upd_index_parse(
-/*================*/
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	mem_heap_t*	heap,	/*!< in: memory heap where update vector is
-				built */
-	upd_t**		update_out);/*!< out: update vector */
-
 
 /* Update vector field */
 struct upd_field_t{
-	unsigned	field_no:16;	/*!< field number in an index, usually
+	uint16_t	field_no;	/*!< field number in an index, usually
 					the clustered index, but in updating
 					a secondary index record in btr0cur.cc
 					this is the position in the secondary
 					index. If this field is a virtual
 					column, then field_no represents
 					the nth virtual	column in the table */
-	unsigned	orig_len:16;	/*!< original length of the locally
+	uint16_t	orig_len;	/*!< original length of the locally
 					stored part of an externally stored
 					column, or 0 */
 	que_node_t*	exp;		/*!< expression for calculating a new
@@ -408,7 +333,7 @@ struct upd_field_t{
 /* Update vector structure */
 struct upd_t{
 	mem_heap_t*	heap;		/*!< heap from which memory allocated */
-	ulint		info_bits;	/*!< new value of info bits to record;
+	byte		info_bits;	/*!< new value of info bits to record;
 					default is 0 */
 	dtuple_t*	old_vrow;	/*!< pointer to old row, used for
 					virtual column update now */
@@ -450,7 +375,7 @@ struct upd_t{
 
         /** Determine if the given field_no is modified.
 	@return true if modified, false otherwise.  */
-	bool is_modified(const ulint field_no) const
+	bool is_modified(uint16_t field_no) const
 	{
 		for (ulint i = 0; i < n_fields; ++i) {
 			if (field_no == fields[i].field_no) {
diff --git a/storage/innobase/include/row0upd.inl b/storage/innobase/include/row0upd.inl
index fffb7650da3..13aacf3f003 100644
--- a/storage/innobase/include/row0upd.inl
+++ b/storage/innobase/include/row0upd.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2018, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -93,11 +93,11 @@ void
 upd_field_set_field_no(
 /*===================*/
 	upd_field_t*	upd_field,	/*!< in: update vector field */
-	ulint		field_no,	/*!< in: field number in a clustered
+	uint16_t	field_no,	/*!< in: field number in a clustered
 					index */
 	dict_index_t*	index)		/*!< in: index */
 {
-	upd_field->field_no = unsigned(field_no);
+	upd_field->field_no = field_no;
 	upd_field->orig_len = 0;
 	dict_col_copy_type(dict_index_get_nth_col(index, field_no),
 			   dfield_get_type(&upd_field->new_val));
@@ -111,11 +111,11 @@ UNIV_INLINE
 void
 upd_field_set_v_field_no(
 	upd_field_t*	upd_field,
-	ulint		field_no,
+	uint16_t	field_no,
 	dict_index_t*	index)
 {
 	ut_a(field_no < dict_table_get_n_v_cols(index->table));
-	upd_field->field_no = unsigned(field_no);
+	upd_field->field_no = field_no;
 	upd_field->orig_len = 0;
 
 	dict_col_copy_type(&dict_table_get_nth_v_col(
@@ -131,7 +131,7 @@ const upd_field_t*
 upd_get_field_by_field_no(
 /*======================*/
 	const upd_t*	update,	/*!< in: update vector */
-	ulint		no,	/*!< in: field_no */
+	uint16_t	no,	/*!< in: field_no */
 	bool		is_virtual) /*!< in: if it is virtual column */
 {
 	ulint	i;
@@ -151,47 +151,3 @@ upd_get_field_by_field_no(
 
 	return(NULL);
 }
-
-/*********************************************************************//**
-Updates the trx id and roll ptr field in a clustered index record when
-a row is updated or marked deleted. */
-UNIV_INLINE
-void
-row_upd_rec_sys_fields(
-/*===================*/
-	rec_t*		rec,	/*!< in/out: record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be updated, or NULL */
-	dict_index_t*	index,	/*!< in: clustered index */
-	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	const trx_t*	trx,	/*!< in: transaction */
-	roll_ptr_t	roll_ptr)/*!< in: DB_ROLL_PTR to the undo log */
-{
-	ut_ad(index->is_primary());
-	ut_ad(rec_offs_validate(rec, index, offsets));
-
-	if (UNIV_LIKELY_NULL(page_zip)) {
-		page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets,
-						   index->db_trx_id(),
-						   trx->id, roll_ptr);
-	} else {
-		ulint	offset = index->trx_id_offset;
-
-		if (!offset) {
-			offset = row_get_trx_id_offset(index, offsets);
-		}
-
-		compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
-
-		/* During IMPORT the trx id in the record can be in the
-		future, if the .ibd file is being imported from another
-		instance. During IMPORT roll_ptr will be 0. */
-		ut_ad(roll_ptr == 0
-		      || lock_check_trx_id_sanity(
-			      trx_read_trx_id(rec + offset),
-			      rec, index, offsets));
-
-		trx_write_trx_id(rec + offset, trx->id);
-		trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
-	}
-}
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
index 032801335f8..d54384f837c 100644
--- a/storage/innobase/include/row0vers.h
+++ b/storage/innobase/include/row0vers.h
@@ -70,7 +70,6 @@ this case we return TRUE.
 @param[in]	ientry		secondary index entry
 @param[in]	roll_ptr	roll_ptr for the purge record
 @param[in]	trx_id		transaction ID on the purging record
-@param[in,out]	vcol_info	virtual column information for purge thread.
 @return TRUE if earlier version should have */
 bool
 row_vers_old_has_index_entry(
@@ -80,8 +79,7 @@ row_vers_old_has_index_entry(
 	dict_index_t*		index,
 	const dtuple_t*		ientry,
 	roll_ptr_t		roll_ptr,
-	trx_id_t		trx_id,
-	purge_vcol_info_t*	vcol_info=NULL);
+	trx_id_t		trx_id);
 
 /*****************************************************************//**
 Constructs the version of a clustered index record which a consistent
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
index 00000000000..f3d005ff764
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
@@ -0,0 +1,122 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on std::atomic */
+class rw_lock
+{
+  /** The lock word */
+  std::atomic<uint32_t> lock;
+
+protected:
+  /** Available lock */
+  static constexpr uint32_t UNLOCKED= 0;
+  /** Flag to indicate that write_lock() is being held */
+  static constexpr uint32_t WRITER= 1U << 31;
+  /** Flag to indicate that write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_WAITING= 1U << 30;
+  /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+  /** Start waiting for an exclusive lock. */
+  void write_lock_wait_start()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    __asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
+#else
+    lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
+#endif
+  }
+  /** Try to acquire a shared lock.
+  @param l the value of the lock word
+  @return whether the lock was acquired */
+  bool read_trylock(uint32_t &l)
+  {
+    l= UNLOCKED;
+    while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire,
+                                         std::memory_order_relaxed))
+    {
+      DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l));
+      if (l & WRITER_PENDING)
+        return false;
+    }
+    return true;
+  }
+  /** Poll once for a pending exclusive lock; does not block.
+  @return whether the exclusive lock was acquired */
+  bool write_lock_poll()
+  {
+    auto l= WRITER_WAITING;
+    if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                     std::memory_order_relaxed))
+      return true;
+    if (!(l & WRITER_WAITING))
+      /* write_lock() must have succeeded for another thread */
+      write_lock_wait_start();
+    return false;
+  }
+
+public:
+  /** Default constructor */
+  rw_lock() : lock(UNLOCKED) {}
+
+  /** Release a shared lock */
+  void read_unlock()
+  {
+    IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release);
+    DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */
+    DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+  }
+  /** Release an exclusive lock */
+  void write_unlock()
+  {
+    IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
+    DBUG_ASSERT(l & WRITER); /* the write lock must have existed */
+  }
+  /** Try to acquire a shared lock.
+  @return whether the lock was acquired */
+  bool read_trylock() { uint32_t l; return read_trylock(l); }
+  /** Try to acquire an exclusive lock.
+  @return whether the lock was acquired */
+  bool write_trylock()
+  {
+    auto l= UNLOCKED;
+    return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+
+  /** @return whether an exclusive lock is being held by any thread */
+  bool is_write_locked() const
+  { return !!(lock.load(std::memory_order_relaxed) & WRITER); }
+  /** @return whether a shared lock is being held by any thread */
+  bool is_read_locked() const
+  {
+    auto l= lock.load(std::memory_order_relaxed);
+    return (l & ~WRITER_PENDING) && !(l & WRITER);
+  }
+  /** @return whether any lock is being held by any thread */
+  bool is_locked() const
+  { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
+};
diff --git a/storage/innobase/include/srv0conc.h b/storage/innobase/include/srv0conc.h
deleted file mode 100644
index d24107735ed..00000000000
--- a/storage/innobase/include/srv0conc.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
-
-Portions of this file contain modifications contributed and copyrighted by
-Google, Inc. Those modifications are gratefully acknowledged and are described
-briefly in the InnoDB documentation. The contributions by Google are
-incorporated with their permission, and subject to the conditions contained in
-the file COPYING.Google.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file srv/srv0conc.h
-
-InnoDB concurrency manager header file
-
-Created 2011/04/18 Sunny Bains
-*******************************************************/
-
-#ifndef srv_conc_h
-#define srv_conc_h
-
-/** We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. srv_start() sets the value. */
-extern	ulint	srv_max_n_threads;
-
-/** The following controls how many threads we let inside InnoDB concurrently:
-threads waiting for locks are not counted into the number because otherwise
-we could get a deadlock. Value of 0 will disable the concurrency check. */
-
-extern ulong	srv_thread_concurrency;
-
-struct row_prebuilt_t;
-/*********************************************************************//**
-Puts an OS thread to wait if there are too many concurrent threads
-(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue.
-@param[in,out]	prebuilt	row prebuilt handler */
-void
-srv_conc_enter_innodb(
-	row_prebuilt_t*	prebuilt);
-
-/*********************************************************************//**
-This lets a thread enter InnoDB regardless of the number of threads inside
-InnoDB. This must be called when a thread ends a lock wait. */
-void
-srv_conc_force_enter_innodb(
-/*========================*/
-	trx_t*	trx);		/*!< in: transaction object associated with
-				the thread */
-
-/*********************************************************************//**
-This must be called when a thread exits InnoDB in a lock wait or at the
-end of an SQL statement. */
-void
-srv_conc_force_exit_innodb(
-/*=======================*/
-	trx_t*	trx);		/*!< in: transaction object associated with
-				the thread */
-
-/*********************************************************************//**
-Get the count of threads waiting inside InnoDB. */
-ulint
-srv_conc_get_waiting_threads(void);
-/*==============================*/
-
-/*********************************************************************//**
-Get the count of threads active inside InnoDB. */
-ulint
-srv_conc_get_active_threads(void);
-/*==============================*/
-
-#endif /* srv_conc_h */
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index bfc24f43441..0070537cc2a 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -195,18 +195,9 @@ enum monitor_id_t {
 	MONITOR_FLUSH_N_TO_FLUSH_REQUESTED,
 
 	MONITOR_FLUSH_N_TO_FLUSH_BY_AGE,
-	MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
-	MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
-
-	MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
-	MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
-	MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
-	MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
-	MONITOR_FLUSH_AVG_TIME,
+	MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
 
 	MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
-	MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
-	MONITOR_FLUSH_AVG_PASS,
 
 	MONITOR_LRU_GET_FREE_LOOPS,
 	MONITOR_LRU_GET_FREE_WAITS,
@@ -234,9 +225,6 @@ enum monitor_id_t {
 	MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
 	MONITOR_LRU_BATCH_EVICT_COUNT,
 	MONITOR_LRU_BATCH_EVICT_PAGES,
-	MONITOR_LRU_SINGLE_FLUSH_SCANNED,
-	MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
-	MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
 	MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
 	MONITOR_LRU_GET_FREE_SEARCH,
 	MONITOR_LRU_SEARCH_SCANNED,
@@ -326,7 +314,6 @@ enum monitor_id_t {
 	MONITOR_LSN_CHECKPOINT_AGE,
 	MONITOR_OVLD_BUF_OLDEST_LSN,
 	MONITOR_OVLD_MAX_AGE_ASYNC,
-	MONITOR_OVLD_MAX_AGE_SYNC,
 	MONITOR_PENDING_LOG_FLUSH,
 	MONITOR_PENDING_CHECKPOINT_WRITE,
 	MONITOR_LOG_IO,
@@ -398,14 +385,10 @@ enum monitor_id_t {
 	MONITOR_MASTER_ACTIVE_LOOPS,
 	MONITOR_MASTER_IDLE_LOOPS,
 	MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
-	MONITOR_SRV_IBUF_MERGE_MICROSECOND,
 	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
-	MONITOR_SRV_MEM_VALIDATE_MICROSECOND,
-	MONITOR_SRV_PURGE_MICROSECOND,
 	MONITOR_SRV_DICT_LRU_MICROSECOND,
 	MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
 	MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE,
-	MONITOR_SRV_CHECKPOINT_MICROSECOND,
 	MONITOR_OVLD_SRV_DBLWR_WRITES,
 	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
 	MONITOR_OVLD_SRV_PAGE_SIZE,
@@ -654,14 +637,14 @@ Use MONITOR_DEC if appropriate mutex protection exists.
 		}							\
 	}
 
-#ifdef HAVE_valgrind
+#ifdef HAVE_MEM_CHECK
 # define MONITOR_CHECK_DEFINED(value) do {	\
     mon_type_t m __attribute__((unused))= value;        \
 	MEM_CHECK_DEFINED(&m, sizeof m);	\
 } while (0)
-#else /* HAVE_valgrind */
+#else /* HAVE_MEM_CHECK */
 # define MONITOR_CHECK_DEFINED(value) (void) 0
-#endif /* HAVE_valgrind */
+#endif /* HAVE_MEM_CHECK */
 
 #define	MONITOR_INC_VALUE(monitor, value)				\
 	MONITOR_CHECK_DEFINED(value);					\
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 498efe67f40..15df5d4cfbc 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -39,18 +39,17 @@ The server main program
 Created 10/10/1995 Heikki Tuuri
 *******************************************************/
 
-#ifndef srv0srv_h
-#define srv0srv_h
+#pragma once
 
 #include "log0log.h"
-#include "os0event.h"
 #include "que0types.h"
 #include "trx0types.h"
-#include "srv0conc.h"
 #include "fil0fil.h"
 
 #include "mysql/psi/mysql_stage.h"
 #include "mysql/psi/psi.h"
+#include <tpool.h>
+#include <memory>
 
 /** Global counters used inside InnoDB. */
 struct srv_stats_t
@@ -75,33 +74,16 @@ struct srv_stats_t
 	/** Amount of data written to the log files in bytes */
 	lsn_ctr_1_t		os_log_written;
 
-	/** Number of writes being done to the log files.
-	Protected by log_sys.write_mutex. */
+	/** Number of writes being done to the log files */
 	ulint_ctr_1_t		os_log_pending_writes;
 
 	/** We increase this counter, when we don't have enough
 	space in the log buffer and have to flush it */
 	ulint_ctr_1_t		log_waits;
 
-	/** Count the number of times the doublewrite buffer was flushed */
-	ulint_ctr_1_t		dblwr_writes;
-
-	/** Store the number of pages that have been flushed to the
-	doublewrite buffer */
-	ulint_ctr_1_t		dblwr_pages_written;
-
 	/** Store the number of write requests issued */
 	ulint_ctr_1_t		buf_pool_write_requests;
 
-	/** Store the number of times when we had to wait for a free page
-	in the buffer pool. It happens when the buffer pool is full and we
-	need to make a flush, in order to be able to read or create a page. */
-	ulint_ctr_1_t		buf_pool_wait_free;
-
-	/** Count the number of pages that were written from buffer
-	pool to the disk */
-	ulint_ctr_1_t		buf_pool_flushed;
-
 	/** Number of buffer pool reads that led to the reading of
 	a disk page */
 	ulint_ctr_1_t		buf_pool_reads;
@@ -179,9 +161,6 @@ struct srv_stats_t
 	/** Number of encryption_get_latest_key_version calls */
 	ulint_ctr_64_t		n_key_requests;
 
-	/** Number of log scrub operations */
-	ulint_ctr_64_t		n_log_scrubs;
-
 	/** Number of spaces in keyrotation list */
 	ulint_ctr_64_t		key_rotation_list_length;
 
@@ -190,28 +169,20 @@ struct srv_stats_t
 
 	/** Number of temporary tablespace blocks decrypted */
 	ulint_ctr_64_t		n_temp_blocks_decrypted;
+
+	/** Number of lock deadlocks */
+	ulint_ctr_1_t		lock_deadlock_count;
 };
 
+/** We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. srv_start() sets the value. */
+extern ulint srv_max_n_threads;
+
 extern const char*	srv_main_thread_op_info;
 
 /** Prefix used by MySQL to indicate pre-5.1 table name encoding */
 extern const char	srv_mysql50_table_name_prefix[10];
 
-/** Event to signal srv_monitor_thread. Not protected by a mutex.
-Set after setting srv_print_innodb_monitor. */
-extern os_event_t	srv_monitor_event;
-
-/** Event to signal the shutdown of srv_error_monitor_thread.
-Not protected by a mutex. */
-extern os_event_t	srv_error_event;
-
-/** Event for waking up buf_dump_thread. Not protected by a mutex.
-Set on shutdown or by buf_dump_start() or buf_load_start(). */
-extern os_event_t	srv_buf_dump_event;
-
-/** The buffer pool resize thread waits on this event. */
-extern os_event_t	srv_buf_resize_event;
-
 /** The buffer pool dump/load file name */
 #define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
 extern char*		srv_buf_dump_filename;
@@ -250,19 +221,11 @@ recovery and open all tables in RO mode instead of RW mode. We don't
 sync the max trx id to disk either. */
 extern my_bool	srv_read_only_mode;
 /** Set if InnoDB operates in read-only mode or innodb-force-recovery
-is greater than SRV_FORCE_NO_TRX_UNDO. */
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
 extern my_bool	high_level_read_only;
 /** store to its own file each table created by an user; data
 dictionary tables are in the system tablespace 0 */
 extern my_bool	srv_file_per_table;
-/** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */
-extern	ulong	srv_thread_sleep_delay;
-/** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/
-extern	ulong	srv_adaptive_max_sleep_delay;
-
-/** Place locks to records only i.e. do not use next-key locking except
-on duplicate key checking and foreign key checking */
-extern ibool	srv_locks_unsafe_for_binlog;
 
 /** Sort buffer size in index creation */
 extern ulong	srv_sort_buf_size;
@@ -271,7 +234,7 @@ extern unsigned long long	srv_online_max_size;
 
 /* If this flag is TRUE, then we will use the native aio of the
 OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads.
+use simulated aio.
 Currently we support native aio on windows and linux */
 extern my_bool	srv_use_native_aio;
 extern my_bool	srv_numa_interleave;
@@ -291,33 +254,11 @@ extern char*	srv_undo_dir;
 /** Number of undo tablespaces to use. */
 extern ulong	srv_undo_tablespaces;
 
-/** The number of UNDO tablespaces that are open and ready to use. */
-extern ulint	srv_undo_tablespaces_open;
-
 /** The number of UNDO tablespaces that are active (hosting some rollback
 segment). It is quite possible that some of the tablespaces doesn't host
 any of the rollback-segment based on configuration used. */
 extern ulint	srv_undo_tablespaces_active;
 
-/** Undo tablespaces starts with space_id. */
-extern	ulint	srv_undo_space_id_start;
-
-/** Check whether given space id is undo tablespace id
-@param[in]	space_id	space id to check
-@return true if it is undo tablespace else false. */
-inline
-bool
-srv_is_undo_tablespace(ulint space_id)
-{
-	return srv_undo_space_id_start > 0
-		&& space_id >= srv_undo_space_id_start
-		&& space_id < (srv_undo_space_id_start
-			       + srv_undo_tablespaces_open);
-}
-
-/** The number of undo segments to use */
-extern ulong	srv_undo_logs;
-
 /** Maximum size of undo tablespace. */
 extern unsigned long long	srv_max_undo_log_size;
 
@@ -334,12 +275,12 @@ extern my_bool	srv_undo_log_truncate;
 /* Enables or disables this prefix optimization.  Disabled by default. */
 extern my_bool	srv_prefix_index_cluster_optimization;
 
-/** Default size of UNDO tablespace while it is created new. */
-extern const ulint	SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
+constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+  UNIV_PAGE_SIZE_DEF;
 
 extern char*	srv_log_group_home_dir;
 
-extern ulong	srv_n_log_files;
 /** The InnoDB redo log file size, or 0 when changing the redo log format
 at startup (while disallowing writes to the redo log). */
 extern ulonglong	srv_log_file_size;
@@ -360,12 +301,6 @@ extern ulint		srv_buf_pool_size;
 /** Requested buffer pool chunk size. Each buffer pool instance consists
 of one or more chunks. */
 extern ulong		srv_buf_pool_chunk_unit;
-/** Requested number of buffer pool instances */
-extern ulong		srv_buf_pool_instances;
-/** Default number of buffer pool instances */
-extern const ulong	srv_buf_pool_instances_default;
-/** Number of locks to protect buf_pool->page_hash */
-extern ulong	srv_n_page_hash_locks;
 /** Scan depth for LRU flush batch i.e.: number of blocks scanned*/
 extern ulong	srv_LRU_scan_depth;
 /** Whether or not to flush neighbors of a block */
@@ -385,11 +320,11 @@ extern ulong srv_buf_pool_load_pages_abort;
 /** Lock table size in bytes */
 extern ulint	srv_lock_table_size;
 
-extern ulint	srv_n_file_io_threads;
+extern uint	srv_n_file_io_threads;
 extern my_bool	srv_random_read_ahead;
 extern ulong	srv_read_ahead_threshold;
-extern ulong	srv_n_read_io_threads;
-extern ulong	srv_n_write_io_threads;
+extern uint	srv_n_read_io_threads;
+extern uint	srv_n_write_io_threads;
 
 /* Defragmentation, Origianlly facebook default value is 100, but it's too high */
 #define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
@@ -411,10 +346,6 @@ The real value is set based on the value of io_capacity. */
 #define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT	(~0UL)
 #define SRV_MAX_IO_CAPACITY_LIMIT		(~0UL)
 extern ulong    srv_max_io_capacity;
-/* Returns the number of IO operations that is X percent of the
-capacity. PCT_IO(5) -> returns the number of IO operations that
-is 5% of the max where max is srv_io_capacity.  */
-#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0)))
 
 /* The "innodb_stats_method" setting, decides how InnoDB is going
 to treat NULL value when collecting statistics. It is not defined
@@ -423,9 +354,7 @@ extern ulong	srv_innodb_stats_method;
 
 extern ulint	srv_max_n_open_files;
 
-extern ulong	srv_n_page_cleaners;
-
-extern double	srv_max_dirty_pages_pct;
+extern double	srv_max_buf_pool_modified_pct;
 extern double	srv_max_dirty_pages_pct_lwm;
 
 extern double	srv_adaptive_flushing_lwm;
@@ -433,17 +362,11 @@ extern ulong	srv_flushing_avg_loops;
 
 extern ulong	srv_force_recovery;
 
-extern uint	srv_fast_shutdown;	/*!< If this is 1, do not do a
-					purge and index buffer merge.
-					If this 2, do not even flush the
-					buffer pool to data files at the
-					shutdown: we effectively 'crash'
-					InnoDB (but lose no committed
-					transactions). */
-
-/** Signal to shut down InnoDB (NULL if shutdown was signaled, or if
-running in innodb_read_only mode, srv_read_only_mode) */
-extern std::atomic<st_my_thread_var *> srv_running;
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transactions (to be done on restart). */
+extern uint	srv_fast_shutdown;
 
 extern ibool	srv_innodb_status;
 
@@ -456,20 +379,16 @@ extern unsigned long long	srv_stats_modified_counter;
 extern my_bool			srv_stats_sample_traditional;
 
 extern my_bool	srv_use_doublewrite_buf;
-extern ulong	srv_doublewrite_batch_size;
 extern ulong	srv_checksum_algorithm;
 
-extern double	srv_max_buf_pool_modified_pct;
 extern my_bool	srv_force_primary_key;
 
-extern double	srv_max_buf_pool_modified_pct;
 extern ulong	srv_max_purge_lag;
 extern ulong	srv_max_purge_lag_delay;
 
-extern ulong	srv_replication_delay;
-
 extern my_bool	innodb_encrypt_temporary_tables;
 
+extern my_bool  srv_immediate_scrub_data_uncompressed;
 /*-------------------------------------------*/
 
 /** Modes of operation */
@@ -480,8 +399,6 @@ enum srv_operation_mode {
 	SRV_OPERATION_BACKUP,
 	/** Mariabackup restoring a backup for subsequent --copy-back */
 	SRV_OPERATION_RESTORE,
-	/** Mariabackup restoring a backup with rolling back prepared XA's*/
-	SRV_OPERATION_RESTORE_ROLLBACK_XA,
 	/** Mariabackup restoring the incremental part of a backup */
 	SRV_OPERATION_RESTORE_DELTA,
 	/** Mariabackup restoring a backup for subsequent --export */
@@ -491,52 +408,25 @@ enum srv_operation_mode {
 /** Current mode of operation */
 extern enum srv_operation_mode srv_operation;
 
-inline bool is_mariabackup_restore()
-{
-	/* To rollback XA's trx_sys must be initialized, the rest is the same
-	as regular backup restore, that is why we join this two operations in
-	the most cases. */
-	return srv_operation == SRV_OPERATION_RESTORE
-	       || srv_operation == SRV_OPERATION_RESTORE_ROLLBACK_XA;
-}
-
-inline bool is_mariabackup_restore_or_export()
-{
-	return is_mariabackup_restore()
-	       || srv_operation == SRV_OPERATION_RESTORE_EXPORT;
-}
-
 extern my_bool	srv_print_innodb_monitor;
 extern my_bool	srv_print_innodb_lock_monitor;
 extern ibool	srv_print_verbose_log;
 
 extern bool	srv_monitor_active;
-extern bool	srv_error_monitor_active;
-
-/* TRUE during the lifetime of the buffer pool dump/load thread */
-extern bool	srv_buf_dump_thread_active;
-
-/* true during the lifetime of the buffer pool resize thread */
-extern bool	srv_buf_resize_thread_active;
-
-/* TRUE during the lifetime of the stats thread */
-extern bool	srv_dict_stats_thread_active;
 
-/* TRUE if enable log scrubbing */
-extern my_bool	srv_scrub_log;
 
 extern ulong	srv_n_spin_wait_rounds;
-extern ulong	srv_n_free_tickets_to_enter;
-extern ulong	srv_thread_sleep_delay;
 extern uint	srv_spin_wait_delay;
 
 extern ulint	srv_truncated_status_writes;
 /** Number of initialized rollback segments for persistent undo log */
 extern ulong	srv_available_undo_logs;
-
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-extern my_bool	srv_ibuf_disable_background_merge;
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/** Iterations of the loop bounded by 'srv_active' label. */
+extern ulint	srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+extern ulint	srv_main_idle_loops;
+/** Log writes involving flush. */
+extern ulint	srv_log_writes_and_flush;
 
 #ifdef UNIV_DEBUG
 extern my_bool	innodb_evict_tables_on_commit_debug;
@@ -547,21 +437,16 @@ extern my_bool	srv_purge_view_update_only_debug;
 extern my_bool	srv_master_thread_disabled_debug;
 /** InnoDB system tablespace to set during recovery */
 extern uint	srv_sys_space_size_debug;
-/** whether redo log files have been created at startup */
-extern bool	srv_log_files_created;
+/** whether redo log file has been created at startup */
+extern bool	srv_log_file_created;
 #endif /* UNIV_DEBUG */
 
 extern ulint	srv_dml_needed_delay;
 
 #define SRV_MAX_N_IO_THREADS	130
 
-/* Array of English strings describing the current state of an
-i/o handler thread */
-extern const char* srv_io_thread_op_info[];
-extern const char* srv_io_thread_function[];
-
-/* the number of purge threads to use from the worker pool (currently 0 or 1) */
-extern ulong srv_n_purge_threads;
+/** innodb_purge_threads; the number of purge tasks to use */
+extern uint srv_n_purge_threads;
 
 /* the number of pages to purge in one batch */
 extern ulong srv_purge_batch_size;
@@ -591,34 +476,18 @@ extern ulong	srv_fatal_semaphore_wait_threshold;
 /** Buffer pool dump status frequence in percentages */
 extern ulong srv_buf_dump_status_frequency;
 
-#define srv_max_purge_threads 32
-
 # ifdef UNIV_PFS_THREAD
-/* Keys to register InnoDB threads with performance schema */
-extern mysql_pfs_key_t	buf_dump_thread_key;
-extern mysql_pfs_key_t	dict_stats_thread_key;
-extern mysql_pfs_key_t	io_handler_thread_key;
-extern mysql_pfs_key_t	io_ibuf_thread_key;
-extern mysql_pfs_key_t	io_log_thread_key;
-extern mysql_pfs_key_t	io_read_thread_key;
-extern mysql_pfs_key_t	io_write_thread_key;
 extern mysql_pfs_key_t	page_cleaner_thread_key;
-extern mysql_pfs_key_t	recv_writer_thread_key;
-extern mysql_pfs_key_t	srv_error_monitor_thread_key;
-extern mysql_pfs_key_t	srv_lock_timeout_thread_key;
-extern mysql_pfs_key_t	srv_master_thread_key;
-extern mysql_pfs_key_t	srv_monitor_thread_key;
-extern mysql_pfs_key_t	srv_purge_thread_key;
-extern mysql_pfs_key_t	srv_worker_thread_key;
 extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t	thread_pool_thread_key;
 
 /* This macro register the current thread and its key with performance
 schema */
 #  define pfs_register_thread(key)			\
-do {								\
-	struct PSI_thread* psi = PSI_CALL_new_thread(key, NULL, 0);\
-	/* JAN: TODO: MYSQL 5.7 PSI                             \
-	PSI_CALL_set_thread_os_id(psi);	*/		\
+do {							\
+	struct PSI_thread* psi __attribute__((unused))	\
+		= PSI_CALL_new_thread(key, NULL, 0);	\
+	PSI_CALL_set_thread_os_id(psi);			\
 	PSI_CALL_set_thread(psi);			\
 } while (0)
 
@@ -634,14 +503,10 @@ do {								\
 
 #ifdef HAVE_PSI_STAGE_INTERFACE
 /** Performance schema stage event for monitoring ALTER TABLE progress
-everything after flush log_make_checkpoint(). */
+in ha_innobase::commit_inplace_alter_table(). */
 extern PSI_stage_info	srv_stage_alter_table_end;
 
 /** Performance schema stage event for monitoring ALTER TABLE progress
-log_make_checkpoint(). */
-extern PSI_stage_info	srv_stage_alter_table_flush;
-
-/** Performance schema stage event for monitoring ALTER TABLE progress
 row_merge_insert_index_tuples(). */
 extern PSI_stage_info	srv_stage_alter_table_insert;
 
@@ -665,37 +530,6 @@ extern PSI_stage_info	srv_stage_alter_table_read_pk_internal_sort;
 extern PSI_stage_info	srv_stage_buffer_pool_load;
 #endif /* HAVE_PSI_STAGE_INTERFACE */
 
-
-/** Alternatives for innodb_flush_method */
-enum srv_flush_t {
-	SRV_FSYNC = 0,	/*!< fsync, the default */
-	SRV_O_DSYNC,	/*!< open log files in O_SYNC mode */
-	SRV_LITTLESYNC,	/*!< do not call os_file_flush()
-				when writing data files, but do flush
-				after writing to log files */
-	SRV_NOSYNC,	/*!< do not flush after writing */
-	SRV_O_DIRECT,	/*!< invoke os_file_set_nocache() on
-				data files. This implies using
-				non-buffered IO but still using fsync,
-				the reason for which is that some FS
-				do not flush meta-data when
-				unbuffered IO happens */
-	SRV_O_DIRECT_NO_FSYNC
-				/*!< do not use fsync() when using
-				direct IO i.e.: it can be set to avoid
-				the fsync() call that we make when
-				using SRV_UNIX_O_DIRECT. However, in
-				this case user/DBA should be sure about
-				the integrity of the meta-data */
-#ifdef _WIN32
-	,SRV_ALL_O_DIRECT_FSYNC
-				/*!< Traditional Windows appoach to open 
-				all files without caching, and do FileFlushBuffers()*/
-#endif
-};
-/** innodb_flush_method */
-extern ulong srv_file_flush_method;
-
 /** Alternatives for srv_force_recovery. Non-zero values are intended
 to help the user get a damaged database up so that he can dump intact
 tables and rows with SELECT INTO OUTFILE. The database must not otherwise
@@ -733,17 +567,6 @@ enum srv_stats_method_name_enum {
 
 typedef enum srv_stats_method_name_enum		srv_stats_method_name_t;
 
-/** Types of threads existing in the system. */
-enum srv_thread_type {
-	SRV_NONE,			/*!< None */
-	SRV_WORKER,			/*!< threads serving parallelized
-					queries and queries released from
-					lock wait */
-	SRV_PURGE,			/*!< Purge coordinator thread */
-	SRV_MASTER			/*!< the master thread, (whose type
-					number must be biggest) */
-};
-
 /*********************************************************************//**
 Boots Innobase server. */
 void
@@ -753,36 +576,10 @@ srv_boot(void);
 Frees the data structures created in srv_init(). */
 void
 srv_free(void);
-/*==========*/
-/*********************************************************************//**
-Sets the info describing an i/o thread current state. */
-void
-srv_set_io_thread_op_info(
-/*======================*/
-	ulint		i,	/*!< in: the 'segment' of the i/o thread */
-	const char*	str);	/*!< in: constant char string describing the
-				state */
-/*********************************************************************//**
-Resets the info describing an i/o thread current state. */
-void
-srv_reset_io_thread_op_info();
 
-/** Wake up the purge threads if there is work to do. */
+/** Wake up the purge if there is work to do. */
 void
 srv_wake_purge_thread_if_not_active();
-/** Wake up the InnoDB master thread if it was suspended (not sleeping). */
-void
-srv_active_wake_master_thread_low();
-
-#define srv_active_wake_master_thread()					\
-	do {								\
-		if (!srv_read_only_mode) {				\
-			srv_active_wake_master_thread_low();		\
-		}							\
-	} while (0)
-/** Wake up the master thread if it is suspended or being suspended. */
-void
-srv_wake_master_thread();
 
 /******************************************************************//**
 Outputs to a file the output of the InnoDB Monitor.
@@ -805,19 +602,12 @@ void
 srv_export_innodb_status(void);
 /*==========================*/
 /*******************************************************************//**
-Get current server activity count. We don't hold srv_sys::mutex while
-reading this value as it is only used in heuristics.
+Get current server activity count.
 @return activity count. */
 ulint
 srv_get_activity_count(void);
 /*========================*/
-/*******************************************************************//**
-Check if there has been any activity.
-@return FALSE if no change in activity counter. */
-ibool
-srv_check_activity(
-/*===============*/
-	ulint		old_activity_count);	/*!< old activity count */
+
 /******************************************************************//**
 Increment the server activity counter. */
 void
@@ -832,85 +622,50 @@ srv_que_task_enqueue_low(
 /*=====================*/
 	que_thr_t*	thr);	/*!< in: query thread */
 
-/**********************************************************************//**
-Check whether any background thread is active. If so, return the thread
-type.
-@return SRV_NONE if all are are suspended or have exited, thread
-type if any are still active. */
-enum srv_thread_type
-srv_get_active_thread_type(void);
-/*============================*/
+/**
+Flag that is set whenever innodb_purge_threads changes.
+It is read and reset in srv_do_purge().
+
+It is an Atomic_counter<int> rather than a plain bool because
+unprotected reads are used. An atomic with relaxed memory
+order is all that is needed, to please Thread Sanitizer.
+*/
+extern Atomic_counter<int> srv_purge_thread_count_changed;
+
+#ifdef UNIV_DEBUG
+/** @return whether purge or master task is active */
+bool srv_any_background_activity();
+#endif
 
 extern "C" {
 
-/*********************************************************************//**
-A thread which prints the info output by various InnoDB monitors.
-@return a dummy parameter */
-os_thread_ret_t
-DECLARE_THREAD(srv_monitor_thread)(
-/*===============================*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
 
-/*********************************************************************//**
-The master thread controlling the server.
-@return a dummy parameter */
-os_thread_ret_t
-DECLARE_THREAD(srv_master_thread)(
-/*==============================*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
-
-/*************************************************************************
-A thread which prints warnings about semaphore waits which have lasted
-too long. These can be used to track bugs which cause hangs.
-@return a dummy parameter */
-os_thread_ret_t
-DECLARE_THREAD(srv_error_monitor_thread)(
-/*=====================================*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
+/** Periodic task which prints the info output by various InnoDB monitors. */
+void srv_monitor_task(void*);
 
-/*********************************************************************//**
-Purge coordinator thread that schedules the purge tasks.
-@return a dummy parameter */
-os_thread_ret_t
-DECLARE_THREAD(srv_purge_coordinator_thread)(
-/*=========================================*/
-	void*	arg MY_ATTRIBUTE((unused)));	/*!< in: a dummy parameter
-						required by os_thread_create */
 
-/*********************************************************************//**
-Worker thread that reads tasks from the work queue and executes them.
-@return a dummy parameter */
-os_thread_ret_t
-DECLARE_THREAD(srv_worker_thread)(
-/*==============================*/
-	void*	arg MY_ATTRIBUTE((unused)));	/*!< in: a dummy parameter
-						required by os_thread_create */
-} /* extern "C" */
+/** The periodic master task controlling the server. */
+void srv_master_callback(void*);
 
-/**********************************************************************//**
-Get count of tasks in the queue.
-@return number of tasks in queue */
-ulint
-srv_get_task_queue_length(void);
-/*===========================*/
 
-/** Ensure that a given number of threads of the type given are running
-(or are already terminated).
-@param[in]	type	thread type
-@param[in]	n	number of threads that have to run */
-void
-srv_release_threads(enum srv_thread_type type, ulint n);
+/**
+Complete the shutdown tasks such as background DROP TABLE,
+and optionally change buffer merge (on innodb_fast_shutdown=0). */
+void srv_shutdown(bool ibuf_merge);
 
-/** Wakeup the purge threads. */
-void
-srv_purge_wakeup();
+} /* extern "C" */
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length();
+#endif
 
 /** Shut down the purge threads. */
 void srv_purge_shutdown();
 
+/** Initialize the purge tasks. */
+void srv_init_purge_tasks();
+
 #ifdef UNIV_DEBUG
 /** Disables master thread. It's used by:
 	SET GLOBAL innodb_master_thread_disabled_debug = 1 (0).
@@ -922,14 +677,6 @@ srv_master_thread_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
 
 /** Status variables to be passed to MySQL */
 struct export_var_t{
-	ulint innodb_data_pending_reads;	/*!< Pending reads */
-	ulint innodb_data_pending_writes;	/*!< Pending writes */
-	ulint innodb_data_pending_fsyncs;	/*!< Pending fsyncs */
-	ulint innodb_data_fsyncs;		/*!< Number of fsyncs so far */
-	ulint innodb_data_read;			/*!< Data bytes read */
-	ulint innodb_data_writes;		/*!< I/O write requests */
-	ulint innodb_data_written;		/*!< Data bytes written */
-	ulint innodb_data_reads;		/*!< I/O read requests */
 	char  innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
 	char  innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
 	char  innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
@@ -944,28 +691,44 @@ struct export_var_t{
 #ifdef UNIV_DEBUG
 	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
 #endif /* UNIV_DEBUG */
-	ulint innodb_buffer_pool_read_requests;	/*!< buf_pool->stat.n_page_gets */
+	ulint innodb_buffer_pool_pages_made_not_young;
+	ulint innodb_buffer_pool_pages_made_young;
+	ulint innodb_buffer_pool_pages_old;
+	ulint innodb_buffer_pool_read_requests;	/*!< buf_pool.stat.n_page_gets */
 	ulint innodb_buffer_pool_reads;		/*!< srv_buf_pool_reads */
-	ulint innodb_buffer_pool_wait_free;	/*!< srv_buf_pool_wait_free */
-	ulint innodb_buffer_pool_pages_flushed;	/*!< srv_buf_pool_flushed */
-	ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+	ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */
 	ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
 	ulint innodb_buffer_pool_read_ahead;	/*!< srv_read_ahead */
 	ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+	ulint innodb_checkpoint_age;
+	ulint innodb_checkpoint_max_age;
+	ulint innodb_data_pending_reads;	/*!< Pending reads */
+	ulint innodb_data_pending_writes;	/*!< Pending writes */
+	ulint innodb_data_pending_fsyncs;	/*!< Pending fsyncs */
+	ulint innodb_data_fsyncs;		/*!< Number of fsyncs so far */
+	ulint innodb_data_read;			/*!< Data bytes read */
+	ulint innodb_data_writes;		/*!< I/O write requests */
+	ulint innodb_data_written;		/*!< Data bytes written */
+	ulint innodb_data_reads;		/*!< I/O read requests */
 	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
 	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
-	ibool innodb_have_atomic_builtins;	/*!< HAVE_ATOMIC_BUILTINS */
+	ulint innodb_deadlocks;
+	ulint innodb_history_list_length;
 	ulint innodb_log_waits;			/*!< srv_log_waits */
 	ulint innodb_log_write_requests;	/*!< srv_log_write_requests */
 	ulint innodb_log_writes;		/*!< srv_log_writes */
+	lsn_t innodb_lsn_current;
+	lsn_t innodb_lsn_flushed;
+	lsn_t innodb_lsn_last_checkpoint;
+	trx_id_t innodb_max_trx_id;
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_mem_adaptive_hash;
+#endif
+	ulint innodb_mem_dictionary;
 	lsn_t innodb_os_log_written;		/*!< srv_os_log_written */
-	ulint innodb_os_log_fsyncs;		/*!< fil_n_log_flushes */
+	ulint innodb_os_log_fsyncs;		/*!< n_log_flushes */
 	ulint innodb_os_log_pending_writes;	/*!< srv_os_log_pending_writes */
-	ulint innodb_os_log_pending_fsyncs;	/*!< fil_n_pending_log_flushes */
-	ulint innodb_page_size;			/*!< srv_page_size */
-	ulint innodb_pages_created;		/*!< buf_pool->stat.n_pages_created */
-	ulint innodb_pages_read;		/*!< buf_pool->stat.n_pages_read*/
-	ulint innodb_pages_written;		/*!< buf_pool->stat.n_pages_written */
+	ulint innodb_os_log_pending_fsyncs;	/*!< n_pending_log_flushes */
 	ulint innodb_row_lock_waits;		/*!< srv_n_lock_wait_count */
 	ulint innodb_row_lock_current_waits;	/*!< srv_n_lock_wait_current_count */
 	int64_t innodb_row_lock_time;		/*!< srv_n_lock_wait_time
@@ -983,10 +746,8 @@ struct export_var_t{
 	ulint innodb_system_rows_inserted; /*!< srv_n_system_rows_inserted */
 	ulint innodb_system_rows_updated; /*!< srv_n_system_rows_updated */
 	ulint innodb_system_rows_deleted; /*!< srv_n_system_rows_deleted*/
-	ulint innodb_num_open_files;		/*!< fil_system_t::n_open */
 	ulint innodb_truncated_status_writes;	/*!< srv_truncated_status_writes */
-	ulint innodb_available_undo_logs;       /*!< srv_available_undo_logs
-						*/
+
 	/** Number of undo tablespace truncation operations */
 	ulong innodb_undo_truncations;
 	ulint innodb_defragment_compression_failures; /*!< Number of
@@ -1051,25 +812,12 @@ struct export_var_t{
 	ulint innodb_encryption_rotation_estimated_iops;
 	int64_t innodb_encryption_key_requests;
 	int64_t innodb_key_rotation_list_length;
-
-	ulint innodb_scrub_page_reorganizations;
-	ulint innodb_scrub_page_splits;
-	ulint innodb_scrub_page_split_failures_underflow;
-	ulint innodb_scrub_page_split_failures_out_of_filespace;
-	ulint innodb_scrub_page_split_failures_missing_index;
-	ulint innodb_scrub_page_split_failures_unknown;
-	int64_t innodb_scrub_log;
 };
 
 /** Thread slot in the thread table.  */
 struct srv_slot_t{
-	srv_thread_type type;			/*!< thread type: user,
-						utility etc. */
 	ibool		in_use;			/*!< TRUE if this slot
 						is in use */
-	ibool		suspended;		/*!< TRUE if the thread is
-						waiting for the event of this
-						slot */
  	/** time(NULL) when the thread was suspended.
  	FIXME: Use my_interval_timer() or similar, to avoid bogus
  	timeouts in lock_wait_check_and_cancel() or lock_wait_suspend_thread()
@@ -1088,33 +836,25 @@ struct srv_slot_t{
 						to do */
 	que_thr_t*	thr;			/*!< suspended query thread
 						(only used for user threads) */
-#ifdef UNIV_DEBUG
-	struct debug_sync_t {
-		UT_LIST_NODE_T(debug_sync_t)
-			debug_sync_list;
-		char str[1];
-	};
-	UT_LIST_BASE_NODE_T(debug_sync_t)
-		debug_sync;
-	rw_lock_t debug_sync_lock;
-#endif
 };
 
-#ifdef UNIV_DEBUG
-typedef void srv_slot_callback_t(srv_slot_t*, const void*);
+extern tpool::thread_pool *srv_thread_pool;
+extern std::unique_ptr<tpool::timer> srv_master_timer;
+extern std::unique_ptr<tpool::timer> srv_monitor_timer;
 
-void srv_for_each_thread(srv_thread_type type,
-			 srv_slot_callback_t callback,
-			 const void *arg);
-#endif
+/** The interval at which srv_monitor_task is invoked, in milliseconds */
+constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */
 
-#ifdef WITH_WSREP
-UNIV_INTERN
-void
-wsrep_srv_conc_cancel_wait(
-/*==================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-#endif /* WITH_WSREP */
+static inline void srv_monitor_timer_schedule_now()
+{ /* NOTE(review): presumably set_time(initial delay, period), both in ms */
+  srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL); /* re-arm the timer */
+}
+static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t,
+                                            void (*func)(void*), int period)
+{ /* period is passed on as set_time()'s second argument (ms -- TODO confirm) */
+  t.reset(srv_thread_pool->create_timer(func)); /* t owns the new timer */
+  t->set_time(0, period); /* NOTE(review): create_timer() result not checked */
+}
 
-#endif
+void srv_thread_pool_init();
+void srv_thread_pool_end();
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
index c1be0016bb3..324e3f0478d 100644
--- a/storage/innobase/include/srv0start.h
+++ b/storage/innobase/include/srv0start.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ Starts the Innobase database server
 Created 10/10/1995 Heikki Tuuri
 *******************************************************/
 
-#ifndef srv0start_h
-#define srv0start_h
+#pragma once
 
 #include "log0log.h"
 #include "ut0byte.h"
@@ -33,10 +32,6 @@ Created 10/10/1995 Heikki Tuuri
 // Forward declaration
 struct dict_table_t;
 
-/** If buffer pool is less than the size,
-only one buffer pool instance is used. */
-#define BUF_POOL_SIZE_THRESHOLD		(1024 * 1024 * 1024)
-
 /** Open the configured number of dedicated undo tablespaces.
 @param[in]	create_new_db	whether the database is being initialized
 @return DB_SUCCESS or error code */
@@ -48,6 +43,12 @@ srv_undo_tablespaces_init(bool create_new_db);
 @return DB_SUCCESS or error code */
 dberr_t srv_start(bool create_new_db);
 
+/**
+  Shutdown purge to make sure that there is no possibility that we call any
+  plugin code (e.g., audit) inside virtual column computation.
+*/
+void innodb_preshutdown();
+
 /** Shut down InnoDB. */
 void innodb_shutdown();
 
@@ -92,8 +93,6 @@ srv_get_encryption_data_filename(
 
 /** Log sequence number at shutdown */
 extern	lsn_t	srv_shutdown_lsn;
-/** Log sequence number immediately after startup */
-extern	lsn_t	srv_start_lsn;
 
 /** TRUE if the server is being started */
 extern	bool	srv_is_being_started;
@@ -113,11 +112,6 @@ enum srv_shutdown_t {
 	SRV_SHUTDOWN_INITIATED,
 	SRV_SHUTDOWN_CLEANUP,	/*!< Cleaning up in
 				logs_empty_and_mark_files_at_shutdown() */
-	SRV_SHUTDOWN_FLUSH_PHASE,/*!< At this phase the master and the
-				purge threads must have completed their
-				work. Once we enter this phase the
-				page_cleaner can clean up the buffer
-				pool and exit */
 	SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
 				the buffer pool can be freed: flush
 				all file spaces and close all files */
@@ -133,4 +127,3 @@ extern	enum srv_shutdown_t	srv_shutdown_state;
 
 /** Files comprising the system tablespace */
 extern pfs_os_file_t	files[1000];
-#endif
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
index 99f980e28e0..60dabf419c0 100644
--- a/storage/innobase/include/sync0arr.h
+++ b/storage/innobase/include/sync0arr.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2018, MariaDB Corporation.
+Copyright (c) 2015, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -79,10 +79,9 @@ sync_array_free_cell(
 	sync_array_t*	arr,	/*!< in: wait array */
 	sync_cell_t*&	cell);	/*!< in: the reserved cell */
 
-/**********************************************************************//**
-Note that one of the wait objects was signalled. */
-void
-sync_array_object_signalled();
+/** Count of how many times a wait object has been signalled */
+extern ulint sg_count;
+#define sync_array_object_signalled() (++sg_count) /* parenthesized expansion */
 
 /**********************************************************************//**
 Prints warnings of long semaphore waits to stderr.
diff --git a/storage/innobase/include/sync0debug.h b/storage/innobase/include/sync0debug.h
index 55ea99cd47b..07e985465e0 100644
--- a/storage/innobase/include/sync0debug.h
+++ b/storage/innobase/include/sync0debug.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -44,10 +44,6 @@ void
 sync_check_close();
 
 #ifdef UNIV_DEBUG
-/** Enable sync order checking. */
-void
-sync_check_enable();
-
 /** Check if it is OK to acquire the latch.
 @param[in]	latch	latch type */
 void
diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h
index 94f49ff628c..68397827891 100644
--- a/storage/innobase/include/sync0policy.h
+++ b/storage/innobase/include/sync0policy.h
@@ -220,7 +220,8 @@ public:
 
 		meta.get_counter()->single_register(&m_count);
 
-		sync_file_created_register(this, filename, uint16_t(line));
+		m_filename = filename;
+		m_line = line;
 	}
 
 	/** Called when the mutex is destroyed. */
@@ -230,8 +231,6 @@ public:
 		latch_meta_t&	meta = sync_latch_get_meta(m_id);
 
 		meta.get_counter()->single_deregister(&m_count);
-
-		sync_file_created_deregister(this);
 	}
 
 	/** Called after a successful mutex acquire.
@@ -272,13 +271,21 @@ public:
 
   /** @return the string representation */
   std::string to_string() const
-  { return sync_mutex_to_string(get_id(), sync_file_created_get(this)); }
+  {
+    /* Describe the creation site as "filename:line". */
+    return sync_mutex_to_string(
+        get_id(),
+        std::string(m_filename) + ":" + std::to_string(m_line));
+  }
 
 #ifdef UNIV_DEBUG
   MutexDebug<Mutex> context;
 #endif
 
 private:
+  const char *m_filename; /* creation-site file name; no initializer here */
+  uint32_t m_line; /* creation-site line number; no initializer here */
+
   /** The user visible counters, registered with the meta-data. */
   latch_meta_t::CounterType::Count m_count;
 
@@ -286,91 +293,4 @@ private:
 	latch_id_t		m_id;
 };
 
-/** Track agregate metrics policy, used by the page mutex. There are just
-too many of them to count individually. */
-template <typename Mutex>
-class BlockMutexPolicy
-{
-public:
-	/** Called when the mutex is "created". Note: Not from the constructor
-	but when the mutex is initialised.
-	@param[in]	id              Mutex ID */
-	void init(const Mutex&, latch_id_t id, const char*, uint32)
-		UNIV_NOTHROW
-	{
-		/* It can be LATCH_ID_BUF_BLOCK_MUTEX or
-		LATCH_ID_BUF_POOL_ZIP. Unfortunately, they
-		are mapped to the same mutex type in the
-		buffer pool code. */
-
-		m_id = id;
-
-		latch_meta_t&	meta = sync_latch_get_meta(m_id);
-
-		ut_ad(meta.get_id() == id);
-
-		m_count = meta.get_counter()->sum_register();
-	}
-
-	/** Called when the mutex is destroyed. */
-	void destroy()
-		UNIV_NOTHROW
-	{
-		m_count = NULL;
-	}
-
-	/** Called after a successful mutex acquire.
-	@param[in]	n_spins		Number of times the thread did
-					spins while trying to acquire the mutex
-	@param[in]	n_waits		Number of times the thread waited
-					in some type of OS queue */
-	void add(
-		uint32_t	n_spins,
-		uint32_t	n_waits)
-		UNIV_NOTHROW
-	{
-		if (!m_count->m_enabled) {
-
-			return;
-		}
-
-		m_count->m_spins += n_spins;
-		m_count->m_waits += n_waits;
-
-		++m_count->m_calls;
-	}
-
-	/** Print the information about the latch
-	@return the string representation */
-	std::string print() const
-		UNIV_NOTHROW;
-
-	/** @return the latch ID */
-	latch_id_t get_id() const
-	{
-		return(m_id);
-	}
-
-
-  /**
-    I don't think it makes sense to keep track of the file name
-    and line number for each block mutex. Too much of overhead. Use the
-    latch id to figure out the location from the source.
-
-    @return the string representation
-  */
-  std::string to_string() const
-  { return(sync_mutex_to_string(get_id(), "buf0buf.cc:0")); }
-
-#ifdef UNIV_DEBUG
-  MutexDebug<Mutex> context;
-#endif
-
-private:
-  /** The user visible counters, registered with the meta-data. */
-  latch_meta_t::CounterType::Count *m_count;
-
-	/** Latch meta data ID */
-	latch_id_t		m_id;
-};
 #endif /* sync0policy_h */
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
index 87d123fc7a9..73ce55af1f2 100644
--- a/storage/innobase/include/sync0rw.h
+++ b/storage/innobase/include/sync0rw.h
@@ -36,6 +36,7 @@ Created 9/11/1995 Heikki Tuuri
 
 #include "os0event.h"
 #include "ut0mutex.h"
+#include "ilist.h"
 
 /** Counters for RW locks. */
 struct rw_lock_stats_t {
@@ -105,9 +106,7 @@ struct rw_lock_t;
 struct rw_lock_debug_t;
 #endif /* UNIV_DEBUG */
 
-typedef UT_LIST_BASE_NODE_T(rw_lock_t)	rw_lock_list_t;
-
-extern rw_lock_list_t			rw_lock_list;
+extern ilist<rw_lock_t> rw_lock_list;
 extern ib_mutex_t			rw_lock_list_mutex;
 
 /** Counters for RW locks. */
@@ -320,8 +319,7 @@ rw_lock_validate(
 	const rw_lock_t*	lock);	/*!< in: rw-lock */
 #endif /* UNIV_DEBUG */
 /******************************************************************//**
-Low-level function which tries to lock an rw-lock in s-mode. Performs no
-spinning.
+Low-level function which tries to lock an rw-lock in s-mode.
 @return TRUE if success */
 UNIV_INLINE
 ibool
@@ -563,11 +561,14 @@ readers, a writer may queue for x-lock by decrementing lock_word: no
 new readers will be let in while the thread waits for readers to
 exit. */
 
-struct rw_lock_t
+struct rw_lock_t :
 #ifdef UNIV_DEBUG
-	: public latch_t
+	public latch_t,
 #endif /* UNIV_DEBUG */
+	public ilist_node<>
 {
+  ut_d(bool created= false;)
+
   /** Holds the state of the lock. */
   Atomic_relaxed<int32_t> lock_word;
 
@@ -610,9 +611,6 @@ struct rw_lock_t
 	/** Count of os_waits. May not be accurate */
 	uint32_t	count_os_wait;
 
-	/** All allocated rw locks are put into a list */
-	UT_LIST_NODE_T(rw_lock_t) list;
-
 #ifdef UNIV_PFS_RWLOCK
 	/** The instrumentation hook */
 	struct PSI_rwlock*	pfs_psi;
diff --git a/storage/innobase/include/sync0rw.inl b/storage/innobase/include/sync0rw.inl
index 603e902d01c..169cbdd9aa5 100644
--- a/storage/innobase/include/sync0rw.inl
+++ b/storage/innobase/include/sync0rw.inl
@@ -220,13 +220,22 @@ rw_lock_lock_word_decr(
 
 			return(true);
 		}
+
+		/* Note that lock_copy was reloaded above. We will
+		keep trying if a spurious conflict occurred, typically
+		caused by concurrent executions of
+		rw_lock_s_lock(). */
+
+		/* Note: unlike this implementation, rw_lock::read_lock()
+		allows concurrent calls without a spin loop */
 	}
+
+	/* A real conflict was detected. */
 	return(false);
 }
 
 /******************************************************************//**
-Low-level function which tries to lock an rw-lock in s-mode. Performs no
-spinning.
+Low-level function which tries to lock an rw-lock in s-mode.
 @return TRUE if success */
 UNIV_INLINE
 ibool
@@ -333,7 +342,7 @@ rw_lock_x_lock_func_nowait(
 	ut_d(rw_lock_add_debug_info(lock, 0, RW_LOCK_X, file_name, line));
 
 	lock->last_x_file_name = file_name;
-	lock->last_x_line = line;
+	lock->last_x_line = line & ((1 << 14) - 1);
 
 	ut_ad(rw_lock_validate(lock));
 
@@ -529,8 +538,6 @@ pfs_rw_lock_x_lock_func(
 
 		/* Record the acquisition of a read-write lock in exclusive
 		mode in performance schema */
-/* MySQL 5.7 New PSI */
-#define PSI_RWLOCK_EXCLUSIVELOCK PSI_RWLOCK_WRITELOCK
 
 		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
 			&state, lock->pfs_psi, PSI_RWLOCK_EXCLUSIVELOCK,
@@ -570,7 +577,6 @@ pfs_rw_lock_x_lock_func_nowait(
 		/* Record the acquisition of a read-write trylock in exclusive
 		mode in performance schema */
 
-#define PSI_RWLOCK_TRYEXCLUSIVELOCK PSI_RWLOCK_TRYWRITELOCK
 		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
 			&state, lock->pfs_psi, PSI_RWLOCK_TRYEXCLUSIVELOCK,
 			file_name, static_cast<uint>(line));
@@ -624,7 +630,6 @@ pfs_rw_lock_s_lock_func(
 		PSI_rwlock_locker*	locker;
 		PSI_rwlock_locker_state	state;
 
-#define  PSI_RWLOCK_SHAREDLOCK  PSI_RWLOCK_READLOCK
 		/* Instrumented to inform we are aquiring a shared rwlock */
 		locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
 			&state, lock->pfs_psi, PSI_RWLOCK_SHAREDLOCK,
@@ -659,7 +664,6 @@ pfs_rw_lock_sx_lock_func(
 		PSI_rwlock_locker*	locker;
 		PSI_rwlock_locker_state	state;
 
-#define PSI_RWLOCK_SHAREDEXCLUSIVELOCK PSI_RWLOCK_WRITELOCK
 		/* Instrumented to inform we are aquiring a shared rwlock */
 		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
 			&state, lock->pfs_psi, PSI_RWLOCK_SHAREDEXCLUSIVELOCK,
@@ -696,7 +700,6 @@ pfs_rw_lock_s_lock_low(
 		PSI_rwlock_locker*	locker;
 		PSI_rwlock_locker_state	state;
 
-#define PSI_RWLOCK_TRYSHAREDLOCK PSI_RWLOCK_TRYREADLOCK
 		/* Instrumented to inform we are aquiring a shared rwlock */
 		locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
 			&state, lock->pfs_psi, PSI_RWLOCK_TRYSHAREDLOCK,
@@ -736,7 +739,6 @@ pfs_rw_lock_sx_lock_low(
 		PSI_rwlock_locker*	locker;
 		PSI_rwlock_locker_state	state;
 
-#define PSI_RWLOCK_TRYSHAREDEXCLUSIVELOCK PSI_RWLOCK_TRYWRITELOCK
 		/* Instrumented to inform we are aquiring a shared
 		exclusive rwlock */
 		locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index c9cf963b840..b7f3cff2925 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -37,22 +37,9 @@ Created 9/5/1995 Heikki Tuuri
 
 #include "univ.i"
 
-#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
-
-/* By default, buffer mutexes and rwlocks will be excluded from
-instrumentation due to their large number of instances. */
-# define PFS_SKIP_BUFFER_MUTEX_RWLOCK
-
-/* By default, event->mutex will also be excluded from instrumentation */
-# define PFS_SKIP_EVENT_MUTEX
-
-#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
-
 #ifdef UNIV_PFS_MUTEX
 /* Key defines to register InnoDB mutexes with performance schema */
-extern mysql_pfs_key_t	buffer_block_mutex_key;
 extern mysql_pfs_key_t	buf_pool_mutex_key;
-extern mysql_pfs_key_t	buf_pool_zip_mutex_key;
 extern mysql_pfs_key_t	dict_foreign_err_mutex_key;
 extern mysql_pfs_key_t	dict_sys_mutex_key;
 extern mysql_pfs_key_t	fil_system_mutex_key;
@@ -60,20 +47,15 @@ extern mysql_pfs_key_t	flush_list_mutex_key;
 extern mysql_pfs_key_t	fts_delete_mutex_key;
 extern mysql_pfs_key_t	fts_doc_id_mutex_key;
 extern mysql_pfs_key_t	fts_pll_tokenize_mutex_key;
-extern mysql_pfs_key_t	hash_table_mutex_key;
 extern mysql_pfs_key_t	ibuf_bitmap_mutex_key;
 extern mysql_pfs_key_t	ibuf_mutex_key;
 extern mysql_pfs_key_t	ibuf_pessimistic_insert_mutex_key;
 extern mysql_pfs_key_t	log_sys_mutex_key;
-extern mysql_pfs_key_t	log_sys_write_mutex_key;
 extern mysql_pfs_key_t	log_cmdq_mutex_key;
 extern mysql_pfs_key_t	log_flush_order_mutex_key;
-extern mysql_pfs_key_t	mutex_list_mutex_key;
 extern mysql_pfs_key_t	recalc_pool_mutex_key;
-extern mysql_pfs_key_t	page_cleaner_mutex_key;
 extern mysql_pfs_key_t	purge_sys_pq_mutex_key;
 extern mysql_pfs_key_t	recv_sys_mutex_key;
-extern mysql_pfs_key_t	recv_writer_mutex_key;
 extern mysql_pfs_key_t	rtr_active_mutex_key;
 extern mysql_pfs_key_t	rtr_match_mutex_key;
 extern mysql_pfs_key_t	rtr_path_mutex_key;
@@ -84,7 +66,6 @@ extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
 extern mysql_pfs_key_t	rw_lock_debug_mutex_key;
 # endif /* UNIV_DEBUG */
 extern mysql_pfs_key_t	rw_lock_list_mutex_key;
-extern mysql_pfs_key_t	rw_lock_mutex_key;
 extern mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
 extern mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
 extern mysql_pfs_key_t	srv_monitor_file_mutex_key;
@@ -95,26 +76,19 @@ extern mysql_pfs_key_t	trx_pool_manager_mutex_key;
 extern mysql_pfs_key_t	lock_mutex_key;
 extern mysql_pfs_key_t	lock_wait_mutex_key;
 extern mysql_pfs_key_t	trx_sys_mutex_key;
-extern mysql_pfs_key_t	srv_sys_mutex_key;
 extern mysql_pfs_key_t	srv_threads_mutex_key;
-extern mysql_pfs_key_t	event_mutex_key;
-extern mysql_pfs_key_t	event_manager_mutex_key;
 extern mysql_pfs_key_t	sync_array_mutex_key;
 extern mysql_pfs_key_t	thread_mutex_key;
 extern mysql_pfs_key_t  row_drop_list_mutex_key;
 extern mysql_pfs_key_t	rw_trx_hash_element_mutex_key;
+extern mysql_pfs_key_t	read_view_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
 #ifdef UNIV_PFS_RWLOCK
 /* Following are rwlock keys used to register with MySQL
 performance schema */
 extern	mysql_pfs_key_t btr_search_latch_key;
-extern	mysql_pfs_key_t	buf_block_lock_key;
-# ifdef UNIV_DEBUG
-extern	mysql_pfs_key_t	buf_block_debug_latch_key;
-# endif /* UNIV_DEBUG */
 extern	mysql_pfs_key_t	dict_operation_lock_key;
-extern	mysql_pfs_key_t	checkpoint_lock_key;
 extern	mysql_pfs_key_t	fil_space_latch_key;
 extern	mysql_pfs_key_t	fts_cache_rw_lock_key;
 extern	mysql_pfs_key_t	fts_cache_init_rw_lock_key;
@@ -123,7 +97,6 @@ extern	mysql_pfs_key_t	trx_purge_latch_key;
 extern	mysql_pfs_key_t	index_tree_rw_lock_key;
 extern	mysql_pfs_key_t	index_online_log_key;
 extern  mysql_pfs_key_t trx_sys_rw_lock_key;
-extern  mysql_pfs_key_t hash_table_locks_key;
 #endif /* UNIV_PFS_RWLOCK */
 
 /** Prints info of the sync system.
diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h
index 1bd52547d93..feb1e3b45ef 100644
--- a/storage/innobase/include/sync0types.h
+++ b/storage/innobase/include/sync0types.h
@@ -147,7 +147,7 @@ V
 lock_sys_mutex				Mutex protecting lock_sys_t
 |
 V
-trx_sys.mutex				Mutex protecting trx_sys_t
+trx_sys.mutex				Mutex protecting trx_sys.trx_list
 |
 V
 Threads mutex				Background thread scheduling mutex
@@ -186,19 +186,8 @@ enum latch_level_t {
 	RW_LOCK_X,
 	RW_LOCK_NOT_LOCKED,
 
-	SYNC_MONITOR_MUTEX,
-
 	SYNC_ANY_LATCH,
 
-	SYNC_DOUBLEWRITE,
-
-	SYNC_BUF_FLUSH_LIST,
-
-	SYNC_BUF_BLOCK,
-	SYNC_BUF_PAGE_HASH,
-
-	SYNC_BUF_POOL,
-
 	SYNC_POOL,
 	SYNC_POOL_MANAGER,
 
@@ -210,15 +199,11 @@ enum latch_level_t {
 	SYNC_FTS_OPTIMIZE,
 	SYNC_FTS_CACHE_INIT,
 	SYNC_RECV,
-	SYNC_LOG_FLUSH_ORDER,
-	SYNC_LOG,
-	SYNC_LOG_WRITE,
-	SYNC_PAGE_CLEANER,
 	SYNC_PURGE_QUEUE,
 	SYNC_TRX_SYS_HEADER,
-	SYNC_THREADS,
 	SYNC_TRX,
 	SYNC_RW_TRX_HASH_ELEMENT,
+	SYNC_READ_VIEW,
 	SYNC_TRX_SYS,
 	SYNC_LOCK_SYS,
 	SYNC_LOCK_WAIT_SYS,
@@ -258,8 +243,6 @@ enum latch_level_t {
 
 	SYNC_TRX_I_S_RWLOCK,
 
-	SYNC_RECV_WRITER,
-
 	/** Level is varying. Only used with buffer pool page locks, which
 	do not have a fixed level, but instead have their level set after
 	the page is locked; see e.g.  ibuf_bitmap_get_map_page(). */
@@ -274,34 +257,21 @@ enum latch_level_t {
 };
 
 /** Each latch has an ID. This id is used for creating the latch and to look
-up its meta-data. See sync0debug.c. */
+up its meta-data. See sync0debug.cc. */
 enum latch_id_t {
 	LATCH_ID_NONE = 0,
-	LATCH_ID_BUF_BLOCK_MUTEX,
-	LATCH_ID_BUF_POOL,
-	LATCH_ID_BUF_POOL_ZIP,
 	LATCH_ID_DICT_FOREIGN_ERR,
 	LATCH_ID_DICT_SYS,
-	LATCH_ID_FILE_FORMAT_MAX,
 	LATCH_ID_FIL_SYSTEM,
-	LATCH_ID_FLUSH_LIST,
 	LATCH_ID_FTS_DELETE,
 	LATCH_ID_FTS_DOC_ID,
 	LATCH_ID_FTS_PLL_TOKENIZE,
-	LATCH_ID_HASH_TABLE_MUTEX,
 	LATCH_ID_IBUF_BITMAP,
 	LATCH_ID_IBUF,
 	LATCH_ID_IBUF_PESSIMISTIC_INSERT,
-	LATCH_ID_LOG_SYS,
-	LATCH_ID_LOG_WRITE,
-	LATCH_ID_LOG_FLUSH_ORDER,
-	LATCH_ID_LIST,
-	LATCH_ID_MUTEX_LIST,
-	LATCH_ID_PAGE_CLEANER,
 	LATCH_ID_PURGE_SYS_PQ,
 	LATCH_ID_RECALC_POOL,
 	LATCH_ID_RECV_SYS,
-	LATCH_ID_RECV_WRITER,
 	LATCH_ID_REDO_RSEG,
 	LATCH_ID_NOREDO_RSEG,
 	LATCH_ID_RW_LOCK_DEBUG,
@@ -309,28 +279,18 @@ enum latch_id_t {
 	LATCH_ID_RTR_MATCH_MUTEX,
 	LATCH_ID_RTR_PATH_MUTEX,
 	LATCH_ID_RW_LOCK_LIST,
-	LATCH_ID_RW_LOCK_MUTEX,
 	LATCH_ID_SRV_INNODB_MONITOR,
 	LATCH_ID_SRV_MISC_TMPFILE,
 	LATCH_ID_SRV_MONITOR_FILE,
-	LATCH_ID_BUF_DBLWR,
 	LATCH_ID_TRX_POOL,
 	LATCH_ID_TRX_POOL_MANAGER,
 	LATCH_ID_TRX,
 	LATCH_ID_LOCK_SYS,
 	LATCH_ID_LOCK_SYS_WAIT,
 	LATCH_ID_TRX_SYS,
-	LATCH_ID_SRV_SYS,
 	LATCH_ID_SRV_SYS_TASKS,
 	LATCH_ID_PAGE_ZIP_STAT_PER_INDEX,
-	LATCH_ID_EVENT_MANAGER,
-	LATCH_ID_EVENT_MUTEX,
 	LATCH_ID_SYNC_ARRAY_MUTEX,
-	LATCH_ID_OS_AIO_READ_MUTEX,
-	LATCH_ID_OS_AIO_WRITE_MUTEX,
-	LATCH_ID_OS_AIO_LOG_MUTEX,
-	LATCH_ID_OS_AIO_IBUF_MUTEX,
-	LATCH_ID_OS_AIO_SYNC_MUTEX,
 	LATCH_ID_ROW_DROP_LIST,
 	LATCH_ID_INDEX_ONLINE_LOG,
 	LATCH_ID_WORK_QUEUE,
@@ -338,7 +298,6 @@ enum latch_id_t {
 	LATCH_ID_BUF_BLOCK_LOCK,
 	LATCH_ID_BUF_BLOCK_DEBUG,
 	LATCH_ID_DICT_OPERATION,
-	LATCH_ID_CHECKPOINT,
 	LATCH_ID_FIL_SPACE,
 	LATCH_ID_FTS_CACHE,
 	LATCH_ID_FTS_CACHE_INIT,
@@ -347,18 +306,14 @@ enum latch_id_t {
 	LATCH_ID_IBUF_INDEX_TREE,
 	LATCH_ID_INDEX_TREE,
 	LATCH_ID_DICT_TABLE_STATS,
-	LATCH_ID_HASH_TABLE_RW_LOCK,
-	LATCH_ID_BUF_CHUNK_MAP_LATCH,
-	LATCH_ID_SYNC_DEBUG_MUTEX,
-	LATCH_ID_SCRUB_STAT_MUTEX,
 	LATCH_ID_DEFRAGMENT_MUTEX,
 	LATCH_ID_BTR_DEFRAGMENT_MUTEX,
 	LATCH_ID_FIL_CRYPT_STAT_MUTEX,
 	LATCH_ID_FIL_CRYPT_DATA_MUTEX,
 	LATCH_ID_FIL_CRYPT_THREADS_MUTEX,
 	LATCH_ID_RW_TRX_HASH_ELEMENT,
-	LATCH_ID_TEST_MUTEX,
-	LATCH_ID_MAX = LATCH_ID_TEST_MUTEX
+	LATCH_ID_READ_VIEW,
+	LATCH_ID_MAX = LATCH_ID_READ_VIEW
 };
 
 #ifndef UNIV_INNOCHECKSUM
@@ -934,27 +889,6 @@ sync_latch_get_name(latch_level_t level);
 const char*
 sync_basename(const char* filename);
 
-/** Register a latch, called when it is created
-@param[in]	ptr		Latch instance that was created
-@param[in]	filename	Filename where it was created
-@param[in]	line		Line number in filename */
-void
-sync_file_created_register(
-	const void*	ptr,
-	const char*	filename,
-	uint16_t	line);
-
-/** Deregister a latch, called when it is destroyed
-@param[in]	ptr		Latch to be destroyed */
-void
-sync_file_created_deregister(const void* ptr);
-
-/** Get the string where the file was created. Its format is "name:line"
-@param[in]	ptr		Latch instance
-@return created information or "" if can't be found */
-std::string
-sync_file_created_get(const void* ptr);
-
 #ifdef UNIV_DEBUG
 
 /** All (ordered) latches, used in debugging, must derive from this class. */
@@ -1032,9 +966,7 @@ struct sync_checker : public sync_check_functor_t
 	{
 		if (some_allowed) {
 			switch (level) {
-			case SYNC_RECV_WRITER:
-				/* This only happens in
-				recv_apply_hashed_log_recs. */
+			case SYNC_FSP:
 			case SYNC_DICT:
 			case SYNC_DICT_OPERATION:
 			case SYNC_FTS_CACHE:
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
index ea2f9a18fcc..40160ce4362 100644
--- a/storage/innobase/include/trx0i_s.h
+++ b/storage/innobase/include/trx0i_s.h
@@ -31,6 +31,7 @@ Created July 17, 2007 Vasil Dimov
 
 #include "trx0types.h"
 #include "dict0types.h"
+#include "buf0types.h"
 
 /** The maximum amount of memory that can be consumed by innodb_trx,
 innodb_locks and innodb_lock_waits information schema tables. */
@@ -45,17 +46,9 @@ i_s_trx_row_t::trx_query */
 #define TRX_I_S_TRX_QUERY_MAX_LEN	1024
 
 /** The maximum length of a string that can be stored in
-i_s_trx_row_t::trx_operation_state */
-#define TRX_I_S_TRX_OP_STATE_MAX_LEN	64
-
-/** The maximum length of a string that can be stored in
 i_s_trx_row_t::trx_foreign_key_error */
 #define TRX_I_S_TRX_FK_ERROR_MAX_LEN	256
 
-/** The maximum length of a string that can be stored in
-i_s_trx_row_t::trx_isolation_level */
-#define TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN	16
-
 /** Safely copy strings in to the INNODB_TRX table's
 string based columns */
 #define TRX_I_S_STRING_COPY(data, field, constraint, tcache)	\
@@ -94,23 +87,19 @@ struct i_s_hash_chain_t {
 /** This structure represents INFORMATION_SCHEMA.innodb_locks row */
 struct i_s_locks_row_t {
 	trx_id_t	lock_trx_id;	/*!< transaction identifier */
-	const char*	lock_mode;	/*!< lock mode from
-					lock_get_mode_str() */
-	const char*	lock_type;	/*!< lock type from
-					lock_get_type_str() */
 	const char*	lock_table;	/*!< table name from
 					lock_get_table_name() */
-	const char*	lock_index;	/*!< index name from
-					lock_rec_get_index_name() */
-	/** Information for record locks.  All these are
-	ULINT_UNDEFINED for table locks. */
-	/* @{ */
-	ulint		lock_space;	/*!< tablespace identifier */
-	ulint		lock_page;	/*!< page number within the_space */
-	ulint		lock_rec;	/*!< heap number of the record
-					on the page */
-	const char*	lock_data;	/*!< (some) content of the record */
-	/* @} */
+	/** index name of a record lock; NULL for table locks */
+	const char*	lock_index;
+	/** page identifier of the record; (0,0) if !lock_index */
+	page_id_t	lock_page;
+	/** heap number of the record; 0 if !lock_index */
+	uint16_t	lock_rec;
+	/** lock mode corresponding to lock_mode_values_typelib */
+	uint8_t		lock_mode;
+	/** (some) content of the record, if available in the buffer pool;
+	NULL if !lock_index */
+	const char*	lock_data;
 
 	/** The following are auxiliary and not included in the table */
 	/* @{ */
@@ -151,18 +140,15 @@ struct i_s_trx_row_t {
 					trx->lock_heap) */
 	ulint		trx_rows_locked;/*!< lock_number_of_rows_locked() */
 	uintmax_t	trx_rows_modified;/*!< trx_t::undo_no */
-	ulint		trx_concurrency_tickets;
-					/*!< n_tickets_to_enter_innodb in
-					trx_t */
-	const char*	trx_isolation_level;
-					/*!< isolation_level in trx_t */
-	ibool		trx_unique_checks;
+	uint		trx_isolation_level;
+					/*!< trx_t::isolation_level */
+	bool		trx_unique_checks;
 					/*!< check_unique_secondary in trx_t*/
-	ibool		trx_foreign_key_checks;
+	bool		trx_foreign_key_checks;
 					/*!< check_foreigns in trx_t */
 	const char*	trx_foreign_key_error;
 					/*!< detailed_error in trx_t */
-	ulint		trx_is_read_only;
+	bool		trx_is_read_only;
 					/*!< trx_t::read_only */
 	bool		trx_is_autocommit_non_locking;
 					/*!< trx:t::is_autocommit_non_locking()
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index a5a27555f5d..ef94207617d 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -36,16 +36,6 @@ Created 3/26/1996 Heikki Tuuri
 which needs no purge */
 extern trx_undo_rec_t	trx_purge_dummy_rec;
 
-/********************************************************************//**
-Calculates the file address of an undo log header when we have the file
-address of its history list node.
-@return file address of the log */
-UNIV_INLINE
-fil_addr_t
-trx_purge_get_log_from_hist(
-/*========================*/
-	fil_addr_t	node_addr);	/*!< in: file address of the history
-					list node of the log */
 /** Prepend the history list with an undo log.
 Remove the undo log segment from the rseg slot if it is too big for reuse.
 @param[in]	trx		transaction
@@ -53,20 +43,12 @@ Remove the undo log segment from the rseg slot if it is too big for reuse.
 @param[in,out]	mtr		mini-transaction */
 void
 trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
-/*******************************************************************//**
-This function runs a purge batch.
+/**
+Run a purge batch.
+@param n_tasks   number of purge tasks to submit to the queue
+@param truncate  whether to truncate the history at the end of the batch
 @return number of undo log pages handled in the batch */
-ulint
-trx_purge(
-/*======*/
-	ulint	n_purge_threads,	/*!< in: number of purge tasks to
-					submit to task queue. */
-	bool	truncate		/*!< in: truncate history if true */
-#ifdef UNIV_DEBUG
-	, srv_slot_t *slot		/*!< in/out: purge coordinator
-					thread slot */
-#endif
-);
+ulint trx_purge(ulint n_tasks, bool truncate);
 
 /** Rollback segements from a given transaction with trx-no
 scheduled for purge. */
@@ -141,14 +123,13 @@ private:
 class purge_sys_t
 {
 public:
-	/** signal state changes; os_event_reset() and os_event_set()
-	are protected by rw_lock_x_lock(latch) */
-	MY_ALIGNED(CACHE_LINE_SIZE)
-	os_event_t	event;
 	/** latch protecting view, m_enabled */
 	MY_ALIGNED(CACHE_LINE_SIZE)
-	rw_lock_t	latch;
+	mutable rw_lock_t		latch;
 private:
+	/** The purge will not remove undo logs which are >= this view */
+	MY_ALIGNED(CACHE_LINE_SIZE)
+	ReadViewBase	view;
 	/** whether purge is enabled; protected by latch and std::atomic */
 	std::atomic<bool>		m_enabled;
 	/** number of pending stop() calls without resume() */
@@ -156,12 +137,6 @@ private:
 public:
 	que_t*		query;		/*!< The query graph which will do the
 					parallelized purge operation */
-	MY_ALIGNED(CACHE_LINE_SIZE)
-	ReadView	view;		/*!< The purge will not remove undo logs
-					which are >= this view (purge view) */
-	/** Number of not completed tasks. Accessed by srv_purge_coordinator
-	and srv_worker_thread by std::atomic. */
-	std::atomic<ulint>	n_tasks;
 
 	/** Iterator to the undo log records of committed transactions */
 	struct iterator
@@ -191,15 +166,15 @@ public:
 					to purge */
 	trx_rseg_t*	rseg;		/*!< Rollback segment for the next undo
 					record to purge */
-	ulint		page_no;	/*!< Page number for the next undo
+	uint32_t	page_no;	/*!< Page number for the next undo
 					record to purge, page number of the
 					log header, if dummy record */
-	ulint		offset;		/*!< Page offset for the next undo
+	uint32_t	hdr_page_no;	/*!< Header page of the undo log where
+					the next record to purge belongs */
+	uint16_t	offset;		/*!< Page offset for the next undo
 					record to purge, 0 if the dummy
 					record */
-	ulint		hdr_page_no;	/*!< Header page of the undo log where
-					the next record to purge belongs */
-	ulint		hdr_offset;	/*!< Header byte offset on the page */
+	uint16_t	hdr_offset;	/*!< Header byte offset on the page */
 
 
 	TrxUndoRsegsIterator
@@ -220,6 +195,8 @@ public:
 		fil_space_t*	last;
 	} truncate;
 
+	/** Heap for reading the undo log records */
+	mem_heap_t*	heap;
   /**
     Constructor.
 
@@ -227,8 +204,7 @@ public:
     uninitialised. Real initialisation happens in create().
   */
 
-  purge_sys_t() : event(NULL), m_enabled(false), n_tasks(0) {}
-
+  purge_sys_t(): m_enabled(false), heap(nullptr) {}
 
   /** Create the instance */
   void create();
@@ -257,23 +233,36 @@ public:
     m_enabled.store(false, std::memory_order_relaxed);
   }
 
-  /** @return whether the purge coordinator thread is active */
-  bool running();
+  /** @return whether the purge tasks are active */
+  bool running() const;
   /** Stop purge during FLUSH TABLES FOR EXPORT */
   void stop();
   /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
   void resume();
+  /** Wrapper for ReadViewBase::changes_visible(); latch must be S-locked. */
+  bool changes_visible(trx_id_t id, const table_name_t &name) const
+  {
+    ut_ad(rw_lock_own(&latch, RW_LOCK_S));
+    return view.changes_visible(id, name);
+  }
+  /** Wrapper for ReadViewBase::low_limit_no(); the latch check is disabled. */
+  trx_id_t low_limit_no() const
+  {
+#if 0 /* Unfortunately this assertion does not hold, see MDEV-22718. */
+    ut_ad(rw_lock_own(&latch, RW_LOCK_S));
+#endif
+    return view.low_limit_no();
+  }
+  /** Call trx_sys.clone_oldest_view(&view) while holding latch in X mode. */
+  void clone_oldest_view()
+  {
+    rw_lock_x_lock(&latch);
+    trx_sys.clone_oldest_view(&view);
+    rw_lock_x_unlock(&latch);
+  }
 };
 
 /** The global data structure coordinating a purge */
 extern purge_sys_t	purge_sys;
 
-/** Info required to purge a record */
-struct trx_purge_rec_t {
-	trx_undo_rec_t*	undo_rec;	/*!< Record to purge */
-	roll_ptr_t	roll_ptr;	/*!< File pointr to UNDO record */
-};
-
-#include "trx0purge.inl"
-
 #endif /* trx0purge_h */
diff --git a/storage/innobase/include/trx0purge.inl b/storage/innobase/include/trx0purge.inl
deleted file mode 100644
index e460676d58e..00000000000
--- a/storage/innobase/include/trx0purge.inl
+++ /dev/null
@@ -1,42 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/trx0purge.ic
-Purge old versions
-
-Created 3/26/1996 Heikki Tuuri
-*******************************************************/
-
-#include "trx0undo.h"
-
-/********************************************************************//**
-Calculates the file address of an undo log header when we have the file
-address of its history list node.
-@return file address of the log */
-UNIV_INLINE
-fil_addr_t
-trx_purge_get_log_from_hist(
-/*========================*/
-	fil_addr_t	node_addr)	/*!< in: file address of the history
-					list node of the log */
-{
-	node_addr.boffset -= TRX_UNDO_HISTORY_NODE;
-
-	return(node_addr);
-}
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
index fa084ff274c..665c987adff 100644
--- a/storage/innobase/include/trx0rec.h
+++ b/storage/innobase/include/trx0rec.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -110,7 +110,7 @@ trx_undo_update_rec_get_sys_cols(
 					general parameters */
 	trx_id_t*	trx_id,		/*!< out: trx id */
 	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
-	ulint*		info_bits);	/*!< out: info bits state */
+	byte*		info_bits);	/*!< out: info bits state */
 /*******************************************************************//**
 Builds an update vector based on a remaining part of an undo log record.
 @return remaining part of the record, NULL if an error detected, which
@@ -132,7 +132,7 @@ trx_undo_update_rec_get_update(
 				the update vector */
 	trx_id_t	trx_id,	/*!< in: transaction id from this undorecord */
 	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
-	ulint		info_bits,/*!< in: info bits from this undo record */
+	byte		info_bits,/*!< in: info bits from this undo record */
 	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
 				needed is allocated */
 	upd_t**		upd);	/*!< out, own: update vector */
@@ -240,23 +240,6 @@ trx_undo_prev_version_build(
 				into this function by purge thread or not.
 				And if we read "after image" of undo log */
 
-/** Parse MLOG_UNDO_INSERT.
-@param[in]	ptr	log record
-@param[in]	end_ptr	end of log record buffer
-@param[in,out]	page	page or NULL
-@return	end of log record
-@retval	NULL	if the log record is incomplete */
-byte*
-trx_undo_parse_add_undo_rec(
-	const byte*	ptr,
-	const byte*	end_ptr,
-	page_t*		page);
-/** Erase the unused undo log page end.
-@param[in,out]	undo_page	undo log page
-@return whether the page contained something */
-bool
-trx_undo_erase_page_end(page_t* undo_page);
-
 /** Read from an undo log record a non-virtual column value.
 @param[in,out]	ptr		pointer to remaining part of the undo record
 @param[in,out]	field		stored field
@@ -264,12 +247,8 @@ trx_undo_erase_page_end(page_t* undo_page);
 @param[in,out]	orig_len	original length of the locally stored part
 of an externally stored column, or 0
 @return remaining part of undo log record after reading these values */
-byte*
-trx_undo_rec_get_col_val(
-        const byte*     ptr,
-        const byte**    field,
-        ulint*          len,
-        ulint*          orig_len);
+byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+                               uint32_t *len, uint32_t *orig_len);
 
 /** Read virtual column value from undo log
 @param[in]	table		the table
@@ -292,7 +271,7 @@ info, and verify the column is still indexed, and output its position
 @param[in,out]	is_undo_log	his function is used to parse both undo log,
 				and online log for virtual columns. So
 				check to see if this is undo log
-@param[out]	field_no	the column number
+@param[out]	field_no	the column number, or FIL_NULL if not indexed
 @return remaining part of undo log record after reading these values */
 const byte*
 trx_undo_read_v_idx(
@@ -300,7 +279,7 @@ trx_undo_read_v_idx(
 	const byte*		ptr,
 	bool			first_v_col,
 	bool*			is_undo_log,
-	ulint*			field_no);
+	uint32_t*		field_no);
 
 /* Types of an undo log record: these have to be smaller than 16, as the
 compilation info multiplied by 16 is ORed to this value in an undo log
@@ -327,6 +306,16 @@ record */
 /** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
 extern const dtuple_t trx_undo_metadata;
 
+/** Read the table id from an undo log record.
+@param[in]      rec        Undo log record
+@return table id stored as a part of undo log record */
+inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
+{
+  rec+= 3; /* skip 3 bytes; NOTE(review): presumably a fixed header - confirm */
+  mach_read_next_much_compressed(&rec); /* skip the field before the table id */
+  return mach_read_next_much_compressed(&rec); /* this field is the table id */
+}
+
 #include "trx0rec.inl"
 
 #endif /* trx0rec_h */
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
index d9ea6c19d11..6a562dcb425 100644
--- a/storage/innobase/include/trx0roll.h
+++ b/storage/innobase/include/trx0roll.h
@@ -94,17 +94,6 @@ trx_rollback_last_sql_stat_for_mysql(
 	trx_t*	trx)	/*!< in/out: transaction */
 	MY_ATTRIBUTE((nonnull));
 /*******************************************************************//**
-Rollback a transaction to a given savepoint or do a complete rollback.
-@return error code or DB_SUCCESS */
-dberr_t
-trx_rollback_to_savepoint(
-/*======================*/
-	trx_t*		trx,	/*!< in: transaction handle */
-	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
-				partial rollback requested, or NULL for
-				complete rollback */
-	MY_ATTRIBUTE((nonnull(1)));
-/*******************************************************************//**
 Rolls back a transaction back to a named savepoint. Modifications after the
 savepoint are undone but InnoDB does NOT release the corresponding locks
 which are stored in memory. If a lock is 'implicit', that is, a new inserted
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 32153f82198..d08ed709b14 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -36,8 +36,8 @@ Created 3/26/1996 Heikki Tuuri
 @param[in,out]	mtr		mini-transaction
 @return rollback segment header, page x-latched */
 UNIV_INLINE
-trx_rsegf_t*
-trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr);
+buf_block_t*
+trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr);
 
 /** Gets a newly created rollback segment header.
 @param[in]	space		space where placed
@@ -45,29 +45,12 @@ trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr);
 @param[in,out]	mtr		mini-transaction
 @return rollback segment header, page x-latched */
 UNIV_INLINE
-trx_rsegf_t*
+buf_block_t*
 trx_rsegf_get_new(
 	ulint			space,
-	ulint			page_no,
+	uint32_t		page_no,
 	mtr_t*			mtr);
 
-/***************************************************************//**
-Sets the file page number of the nth undo log slot. */
-UNIV_INLINE
-void
-trx_rsegf_set_nth_undo(
-/*===================*/
-	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
-	ulint		n,	/*!< in: index of slot */
-	ulint		page_no,/*!< in: page number of the undo log segment */
-	mtr_t*		mtr);	/*!< in: mtr */
-/****************************************************************//**
-Looks for a free slot for an undo log segment.
-@return slot index or ULINT_UNDEFINED if not found */
-UNIV_INLINE
-ulint
-trx_rsegf_undo_find_free(const trx_rsegf_t* rsegf);
-
 /** Create a rollback segment header.
 @param[in,out]	space		system, undo, or temporary tablespace
 @param[in]	rseg_id		rollback segment identifier
@@ -103,17 +86,6 @@ trx_rseg_create(ulint space_id)
 void
 trx_temp_rseg_create();
 
-/********************************************************************
-Get the number of unique rollback tablespaces in use except space id 0.
-The last space id will be the sentinel value ULINT_UNDEFINED. The array
-will be sorted on space id. Note: space_ids should have have space for
-TRX_SYS_N_RSEGS + 1 elements.
-@return number of unique rollback tablespaces in use. */
-ulint
-trx_rseg_get_n_undo_tablespaces(
-/*============================*/
-	ulint*		space_ids);	/*!< out: array of space ids of
-					UNDO tablespaces */
 /* Number of undo log slots in a rollback segment file copy */
 #define TRX_RSEG_N_SLOTS	(srv_page_size / 16)
 
@@ -135,10 +107,10 @@ struct trx_rseg_t {
 	fil_space_t*			space;
 
 	/** page number of the rollback segment header */
-	ulint				page_no;
+	uint32_t			page_no;
 
 	/** current size in pages */
-	ulint				curr_size;
+	uint32_t			curr_size;
 
 	/*--------------------------------------------------------*/
 	/* Fields for undo logs */
@@ -173,7 +145,7 @@ struct trx_rseg_t {
   uint16_t last_offset() const
   { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
 
-  void set_last_commit(ulint last_offset, trx_id_t trx_no)
+  void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
   {
     last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
   }
@@ -252,15 +224,13 @@ If no binlog information is present, the first byte is NUL. */
 /*-------------------------------------------------------------*/
 
 /** Read the page number of an undo log slot.
-@param[in]	rsegf	rollback segment header
-@param[in]	n	slot number */
-inline
-uint32_t
-trx_rsegf_get_nth_undo(const trx_rsegf_t* rsegf, ulint n)
+@param[in]      rseg_header     rollback segment header
+@param[in]      n               slot number */
+inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
 {
-	ut_ad(n < TRX_RSEG_N_SLOTS);
-	return mach_read_from_4(rsegf + TRX_RSEG_UNDO_SLOTS
-				+ n * TRX_RSEG_SLOT_SIZE);
+  ut_ad(n < TRX_RSEG_N_SLOTS);
+  return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+                          n * TRX_RSEG_SLOT_SIZE + rseg_header->frame);
 }
 
 #ifdef WITH_WSREP
@@ -270,7 +240,7 @@ trx_rsegf_get_nth_undo(const trx_rsegf_t* rsegf, ulint n)
 @param[in,out]	mtr		mini-transaction */
 void
 trx_rseg_update_wsrep_checkpoint(
-	trx_rsegf_t*	rseg_header,
+	buf_block_t*	rseg_header,
 	const XID*	xid,
 	mtr_t*		mtr);
 
@@ -292,7 +262,7 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid);
 /** Upgrade a rollback segment header page to MariaDB 10.3 format.
 @param[in,out]	rseg_header	rollback segment header page
 @param[in,out]	mtr		mini-transaction */
-void trx_rseg_format_upgrade(trx_rsegf_t* rseg_header, mtr_t* mtr);
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr);
 
 /** Update the offset information about the end of the binlog entry
 which corresponds to the transaction just being committed.
@@ -301,8 +271,8 @@ up to which replication has proceeded.
 @param[in,out]	rseg_header	rollback segment header
 @param[in]	trx		committing transaction
 @param[in,out]	mtr		mini-transaction */
-void
-trx_rseg_update_binlog_offset(byte* rseg_header, const trx_t* trx, mtr_t* mtr);
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+                                   mtr_t *mtr);
 
 #include "trx0rseg.inl"
 
diff --git a/storage/innobase/include/trx0rseg.inl b/storage/innobase/include/trx0rseg.inl
index 0cff8fa1f5c..b293d9f1ae1 100644
--- a/storage/innobase/include/trx0rseg.inl
+++ b/storage/innobase/include/trx0rseg.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2018, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -33,8 +33,8 @@ Created 3/26/1996 Heikki Tuuri
 @param[in,out]	mtr		mini-transaction
 @return rollback segment header, page x-latched */
 UNIV_INLINE
-trx_rsegf_t*
-trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr)
+buf_block_t*
+trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr)
 {
 	ut_ad(space == fil_system.sys_space || space == fil_system.temp_space
 	      || srv_is_undo_tablespace(space->id)
@@ -44,8 +44,7 @@ trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr)
 					  0, RW_X_LATCH, mtr);
 
 	buf_block_dbg_add_level(block, SYNC_RSEG_HEADER);
-
-	return TRX_RSEG + block->frame;
+	return block;
 }
 
 /** Gets a newly created rollback segment header.
@@ -54,14 +53,13 @@ trx_rsegf_get(fil_space_t* space, ulint page_no, mtr_t* mtr)
 @param[in,out]	mtr		mini-transaction
 @return rollback segment header, page x-latched */
 UNIV_INLINE
-trx_rsegf_t*
+buf_block_t*
 trx_rsegf_get_new(
 	ulint			space,
-	ulint			page_no,
+	uint32_t		page_no,
 	mtr_t*			mtr)
 {
 	buf_block_t*	block;
-	trx_rsegf_t*	header;
 
 	ut_ad(space <= srv_undo_tablespaces_active || space == SRV_TMP_SPACE_ID
 	      || !srv_was_started);
@@ -70,54 +68,5 @@ trx_rsegf_get_new(
 	block = buf_page_get(page_id_t(space, page_no), 0, RW_X_LATCH, mtr);
 
 	buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
-
-	header = TRX_RSEG + buf_block_get_frame(block);
-
-	return(header);
-}
-
-/***************************************************************//**
-Sets the file page number of the nth undo log slot. */
-UNIV_INLINE
-void
-trx_rsegf_set_nth_undo(
-/*===================*/
-	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
-	ulint		n,	/*!< in: index of slot */
-	ulint		page_no,/*!< in: page number of the undo log segment */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	ut_a(n < TRX_RSEG_N_SLOTS);
-
-	mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE,
-			 page_no, MLOG_4BYTES, mtr);
-}
-
-/****************************************************************//**
-Looks for a free slot for an undo log segment.
-@return slot index or ULINT_UNDEFINED if not found */
-UNIV_INLINE
-ulint
-trx_rsegf_undo_find_free(const trx_rsegf_t* rsegf)
-{
-	ulint		i;
-	ulint		page_no;
-	ulint		max_slots = TRX_RSEG_N_SLOTS;
-
-#ifdef UNIV_DEBUG
-	if (trx_rseg_n_slots_debug) {
-		max_slots = ut_min(static_cast<ulint>(trx_rseg_n_slots_debug),
-				   static_cast<ulint>(TRX_RSEG_N_SLOTS));
-	}
-#endif
-
-	for (i = 0; i < max_slots; i++) {
-		page_no = trx_rsegf_get_nth_undo(rsegf, i);
-
-		if (page_no == FIL_NULL) {
-			return(i);
-		}
-	}
-
-	return(ULINT_UNDEFINED);
+	return block;
 }
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index 323994fa14f..424e4447b41 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -41,16 +41,14 @@ Created 3/26/1996 Heikki Tuuri
 #ifdef WITH_WSREP
 #include "trx0xa.h"
 #endif /* WITH_WSREP */
-
-typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t;
+#include "ilist.h"
 
 /** Checks if a page address is the trx sys header page.
 @param[in]	page_id	page id
 @return true if trx sys header page */
-inline bool trx_sys_hdr_page(const page_id_t& page_id)
+inline bool trx_sys_hdr_page(const page_id_t page_id)
 {
-	return(page_id.space() == TRX_SYS_SPACE
-	       && page_id.page_no() == TRX_SYS_PAGE_NO);
+  return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
 }
 
 /*****************************************************************//**
@@ -340,9 +338,6 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
 constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
 /** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
 constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
-
-/** Size of the doublewrite block in pages */
-#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE	FSP_EXTENT_SIZE
 /* @} */
 
 trx_t* current_trx();
@@ -362,6 +357,13 @@ struct rw_trx_hash_element_t
 
 
   trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
+
+  /**
+    Transaction serialization number.
+
+    Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY
+    state. Initially set to TRX_ID_MAX.
+  */
   Atomic_counter<trx_id_t> no;
   trx_t *trx;
   ib_mutex_t mutex;
@@ -377,6 +379,10 @@ class rw_trx_hash_t
   LF_HASH hash;
 
 
+  template <typename T>
+  using walk_action= my_bool(rw_trx_hash_element_t *element, T *action);
+
+
   /**
     Constructor callback for lock-free allocator.
 
@@ -479,18 +485,19 @@ class rw_trx_hash_t
   }
 
 
-  struct eliminate_duplicates_arg
+  template <typename T> struct eliminate_duplicates_arg
   {
     trx_ids_t ids;
-    my_hash_walk_action action;
-    void *argument;
-    eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg):
+    walk_action<T> *action;
+    T *argument;
+    eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg):
       action(act), argument(arg) { ids.reserve(size); }
   };
 
 
+  template <typename T>
   static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
-                                      eliminate_duplicates_arg *arg)
+                                      eliminate_duplicates_arg<T> *arg)
   {
     for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
     {
@@ -517,15 +524,16 @@ class rw_trx_hash_t
   }
 
 
-  struct debug_iterator_arg
+  template <typename T> struct debug_iterator_arg
   {
-    my_hash_walk_action action;
-    void *argument;
+    walk_action<T> *action;
+    T *argument;
   };
 
 
+  template <typename T>
   static my_bool debug_iterator(rw_trx_hash_element_t *element,
-                                debug_iterator_arg *arg)
+                                debug_iterator_arg<T> *arg)
   {
     mutex_enter(&element->mutex);
     if (element->trx)
@@ -740,23 +748,28 @@ public:
       @retval 1 iteration was interrupted (action returned 1)
   */
 
-  int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument)
+  template <typename T>
+  int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr)
   {
     LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
     ut_a(pins);
 #ifdef UNIV_DEBUG
-    debug_iterator_arg debug_arg= { action, argument };
-    action= reinterpret_cast<my_hash_walk_action>(debug_iterator);
-    argument= &debug_arg;
+    debug_iterator_arg<T> debug_arg= { action, argument };
+    action= reinterpret_cast<decltype(action)>(debug_iterator<T>);
+    argument= reinterpret_cast<T*>(&debug_arg);
 #endif
-    int res= lf_hash_iterate(&hash, pins, action, argument);
+    int res= lf_hash_iterate(&hash, pins,
+                             reinterpret_cast<my_hash_walk_action>(action),
+                             const_cast<void*>(static_cast<const void*>
+                             (argument)));
     if (!caller_trx)
       lf_hash_put_pins(pins);
     return res;
   }
 
 
-  int iterate(my_hash_walk_action action, void *argument)
+  template <typename T>
+  int iterate(walk_action<T> *action, T *argument= nullptr)
   {
     return iterate(current_trx(), action, argument);
   }
@@ -768,21 +781,73 @@ public:
     @sa iterate()
   */
 
-  int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action,
-                      void *argument)
+  template <typename T>
+  int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action,
+                      T *argument= nullptr)
   {
-    eliminate_duplicates_arg arg(size() + 32, action, argument);
-    return iterate(caller_trx, reinterpret_cast<my_hash_walk_action>
-                   (eliminate_duplicates), &arg);
+    eliminate_duplicates_arg<T> arg(size() + 32, action, argument);
+    return iterate(caller_trx, eliminate_duplicates<T>, &arg);
   }
 
 
-  int iterate_no_dups(my_hash_walk_action action, void *argument)
+  template <typename T>
+  int iterate_no_dups(walk_action<T> *action, T *argument= nullptr)
   {
     return iterate_no_dups(current_trx(), action, argument);
   }
 };
 
+class thread_safe_trx_ilist_t /* an ilist<trx_t> guarded by its own mutex */
+{
+public:
+  void create() { mutex_create(LATCH_ID_TRX_SYS, &mutex); } /* create the mutex */
+  void close() { mutex_free(&mutex); } /* free the mutex */
+  /** @return whether trx_list is empty, checked while holding the mutex */
+  bool empty() const
+  {
+    mutex_enter(&mutex);
+    auto result= trx_list.empty();
+    mutex_exit(&mutex);
+    return result;
+  }
+  /** Insert trx at the front of trx_list while holding the mutex. */
+  void push_front(trx_t &trx)
+  {
+    mutex_enter(&mutex);
+    trx_list.push_front(trx);
+    mutex_exit(&mutex);
+  }
+  /** Remove trx from trx_list while holding the mutex. */
+  void remove(trx_t &trx)
+  {
+    mutex_enter(&mutex);
+    trx_list.remove(trx);
+    mutex_exit(&mutex);
+  }
+  /** Invoke callback(const trx_t&) on each element; the mutex is held throughout. */
+  template <typename Callable> void for_each(Callable &&callback) const
+  {
+    mutex_enter(&mutex);
+    for (const auto &trx : trx_list)
+      callback(trx);
+    mutex_exit(&mutex);
+  }
+  /** Invoke callback(trx_t&) on each element; the mutex is held throughout. */
+  template <typename Callable> void for_each(Callable &&callback)
+  {
+    mutex_enter(&mutex);
+    for (auto &trx : trx_list)
+      callback(trx);
+    mutex_exit(&mutex);
+  }
+  /* freeze() acquires the mutex and unfreeze() releases it; use them in pairs. */
+  void freeze() const { mutex_enter(&mutex); }
+  void unfreeze() const { mutex_exit(&mutex); }
+
+private:
+  alignas(CACHE_LINE_SIZE) mutable TrxSysMutex mutex; /* locked by every method */
+  alignas(CACHE_LINE_SIZE) ilist<trx_t> trx_list; /* accessed only under mutex */
+};
 
 /** The transaction system central memory data structure. */
 class trx_sys_t
@@ -813,11 +878,8 @@ public:
   */
   MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len;
 
-  /** Mutex protecting trx_list. */
-  MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
-
   /** List of all transactions. */
-  MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t trx_list;
+  thread_safe_trx_ilist_t trx_list;
 
 	MY_ALIGNED(CACHE_LINE_SIZE)
 	/** Temporary rollback segments */
@@ -875,8 +937,7 @@ public:
   trx_id_t get_min_trx_id()
   {
     trx_id_t id= get_max_trx_id();
-    rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
-                        (get_min_trx_id_callback), &id);
+    rw_trx_hash.iterate(get_min_trx_id_callback, &id);
     return id;
   }
 
@@ -931,8 +992,7 @@ public:
   */
   void assign_new_trx_no(trx_t *trx)
   {
-    trx->no= get_new_trx_id_no_refresh();
-    trx->rw_trx_hash_element->no= trx->no;
+    trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
     refresh_rw_trx_hash_version();
   }
 
@@ -956,13 +1016,12 @@ public:
     @param[in,out] caller_trx used to get access to rw_trx_hash_pins
     @param[out]    ids        array to store registered transaction identifiers
     @param[out]    max_trx_id variable to store m_max_trx_id value
-    @param[out]    mix_trx_no variable to store min(trx->no) value
+    @param[out]    min_trx_no variable to store min(no) value
   */
 
   void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
                     trx_id_t *min_trx_no)
   {
-    ut_ad(!mutex_own(&mutex));
     snapshot_ids_arg arg(ids);
 
     while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
@@ -971,9 +1030,7 @@ public:
 
     ids->clear();
     ids->reserve(rw_trx_hash.size() + 32);
-    rw_trx_hash.iterate(caller_trx,
-                        reinterpret_cast<my_hash_walk_action>(copy_one_id),
-                        &arg);
+    rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
 
     *max_trx_id= arg.m_id;
     *min_trx_no= arg.m_no;
@@ -1061,9 +1118,7 @@ public:
   */
   void register_trx(trx_t *trx)
   {
-    mutex_enter(&mutex);
-    UT_LIST_ADD_FIRST(trx_list, trx);
-    mutex_exit(&mutex);
+    trx_list.push_front(*trx);
   }
 
 
@@ -1074,9 +1129,7 @@ public:
   */
   void deregister_trx(trx_t *trx)
   {
-    mutex_enter(&mutex);
-    UT_LIST_REMOVE(trx_list, trx);
-    mutex_exit(&mutex);
+    trx_list.remove(*trx);
   }
 
 
@@ -1087,7 +1140,7 @@ public:
     in. This function is called by purge thread to determine whether it should
     purge the delete marked record or not.
   */
-  void clone_oldest_view();
+  void clone_oldest_view(ReadViewBase *view) const;
 
 
   /** @return the number of active views */
@@ -1095,14 +1148,11 @@ public:
   {
     size_t count= 0;
 
-    mutex_enter(&mutex);
-    for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
-         trx= UT_LIST_GET_NEXT(trx_list, trx))
-    {
-      if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN)
+    trx_list.for_each([&count](const trx_t &trx) {
+      if (trx.read_view.is_open())
         ++count;
-    }
-    mutex_exit(&mutex);
+    });
+
     return count;
   }
 
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index b84f458374e..68a9812468f 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -35,13 +35,13 @@ Created 3/26/1996 Heikki Tuuri
 #include "ut0vec.h"
 #include "fts0fts.h"
 #include "read0types.h"
+#include "ilist.h"
 
 #include <vector>
 #include <set>
 
 // Forward declaration
 struct mtr_t;
-class FlushObserver;
 struct rw_trx_hash_element_t;
 
 /******************************************************************//**
@@ -233,8 +233,7 @@ trx_commit_step(
 	que_thr_t*	thr);	/*!< in: query thread */
 
 /**********************************************************************//**
-Prints info about a transaction.
-Caller must hold trx_sys.mutex. */
+Prints info about a transaction. */
 void
 trx_print_low(
 /*==========*/
@@ -254,7 +253,6 @@ trx_print_low(
 
 /**********************************************************************//**
 Prints info about a transaction.
-The caller must hold lock_sys.mutex and trx_sys.mutex.
 When possible, use trx_print() instead. */
 void
 trx_print_latched(
@@ -296,7 +294,7 @@ trx_set_dict_operation(
 
 /**********************************************************************//**
 Determines if a transaction is in the given state.
-The caller must hold trx_sys.mutex, or it must be the thread
+The caller must hold trx->mutex, or it must be the thread
 that is serving a running transaction.
 A running RW transaction must be in trx_sys.rw_trx_hash.
 @return TRUE if trx->state == state */
@@ -420,8 +418,11 @@ code and no mutex is required when the query thread is no longer waiting. */
 /** The locks and state of an active transaction. Protected by
 lock_sys.mutex, trx->mutex or both. */
 struct trx_lock_t {
-	ulint		n_active_thrs;	/*!< number of active query threads */
-
+#ifdef UNIV_DEBUG
+	/** number of active query threads; at most 1, except for the
+	dummy transaction in trx_purge() */
+	ulint n_active_thrs;
+#endif
 	trx_que_t	que_state;	/*!< valid when trx->state
 					== TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
 					TRX_QUE_LOCK_WAIT, ... */
@@ -642,7 +643,7 @@ struct trx_rsegs_t {
 	trx_temp_undo_t	m_noredo;
 };
 
-struct trx_t {
+struct trx_t : ilist_node<> {
 private:
   /**
     Count of references.
@@ -663,14 +664,6 @@ public:
 
 	trx_id_t	id;		/*!< transaction id */
 
-	trx_id_t	no;		/*!< transaction serialization number:
-					max trx id shortly before the
-					transaction is moved to
-					COMMITTED_IN_MEMORY state.
-					Protected by trx_sys_t::mutex
-					when trx is in rw_trx_hash. Initially
-					set to TRX_ID_MAX. */
-
 	/** State of the trx from the point of view of concurrency control
 	and the valid state transitions.
 
@@ -710,7 +703,7 @@ public:
 	XA (2PC) transactions are always treated as non-autocommit.
 
 	Transitions to ACTIVE or NOT_STARTED occur when transaction
-	is not in rw_trx_hash (no trx_sys.mutex needed).
+	is not in rw_trx_hash.
 
 	Autocommit non-locking read-only transactions move between states
 	without holding any mutex. They are not in rw_trx_hash.
@@ -726,7 +719,7 @@ public:
 	in rw_trx_hash.
 
 	ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
-	The transition ACTIVE->PREPARED is protected by trx_sys.mutex.
+	The transition ACTIVE->PREPARED is protected by trx->mutex.
 
 	ACTIVE->COMMITTED is possible when the transaction is in
 	rw_trx_hash.
@@ -764,7 +757,7 @@ public:
 	const char*	op_info;	/*!< English text describing the
 					current operation, or an empty
 					string */
-	ulint		isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+	uint		isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
 	bool		check_foreigns;	/*!< normally TRUE, but if the user
 					wants to suppress foreign key checks,
 					(in table imports, for example) we
@@ -804,17 +797,6 @@ public:
 	ulint		duplicates;	/*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
 	trx_dict_op_t	dict_operation;	/**< @see enum trx_dict_op_t */
 
-	/* Fields protected by the srv_conc_mutex. */
-	bool		declared_to_be_inside_innodb;
-					/*!< this is TRUE if we have declared
-					this transaction in
-					srv_conc_enter_innodb to be inside the
-					InnoDB engine */
-	ib_uint32_t	n_tickets_to_enter_innodb;
-					/*!< this can be > 0 only when
-					declared_to_... is TRUE; when we come
-					to srv_conc_innodb_enter, if the value
-					here is > 0, we decrement this by 1 */
 	ib_uint32_t	dict_operation_lock_mode;
 					/*!< 0, RW_S_LATCH, or RW_X_LATCH:
 					the latch mode trx currently holds
@@ -850,10 +832,6 @@ public:
 					/*!< how many tables the current SQL
 					statement uses, except those
 					in consistent read */
-	/*------------------------------*/
-	UT_LIST_NODE_T(trx_t) trx_list;	/*!< list of all transactions;
-					protected by trx_sys.mutex */
-	/*------------------------------*/
 	dberr_t		error_state;	/*!< 0 if no error, otherwise error
 					number; NOTE That ONLY the thread
 					doing the transaction is allowed to
@@ -947,15 +925,6 @@ public:
 	/*------------------------------*/
 	char*		detailed_error;	/*!< detailed error message for last
 					error, or empty. */
-private:
-	/** flush observer used to track flushing of non-redo logged pages
-	during bulk create index */
-	FlushObserver*	flush_observer;
-public:
-#ifdef WITH_WSREP
-	os_event_t	wsrep_event;	/* event waited for in srv_conc_slot */
-#endif /* WITH_WSREP */
-
 	rw_trx_hash_element_t *rw_trx_hash_element;
 	LF_PINS *rw_trx_hash_pins;
 	ulint		magic_n;
@@ -983,20 +952,6 @@ public:
 		return(assign_temp_rseg());
 	}
 
-	/** Set the innodb_log_optimize_ddl page flush observer
-	@param[in,out]	space	tablespace
-	@param[in,out]	stage	performance_schema accounting */
-	void set_flush_observer(fil_space_t* space, ut_stage_alter_t* stage);
-
-	/** Remove the flush observer */
-	void remove_flush_observer();
-
-	/** @return the flush observer */
-	FlushObserver* get_flush_observer() const
-	{
-		return flush_observer;
-	}
-
   /** Transition to committed state, to release implicit locks. */
   inline void commit_state();
 
@@ -1007,17 +962,26 @@ public:
   @param[in]	table_id	table identifier */
   void evict_table(table_id_t table_id);
 
+  /** Initiate rollback.
+  @param savept     savepoint to which to roll back
+  @return error code or DB_SUCCESS */
+  dberr_t rollback(trx_savept_t *savept= nullptr);
+  /** Roll back an active transaction.
+  @param savept     savepoint to which to roll back */
+  inline void rollback_low(trx_savept_t *savept= nullptr);
+  /** Finish rollback.
+  @return whether the rollback was completed normally
+  @retval false if the rollback was aborted by shutdown */
+  inline bool rollback_finish();
 private:
   /** Mark a transaction committed in the main memory data structures. */
   inline void commit_in_memory(const mtr_t *mtr);
-public:
-  /** Commit the transaction. */
-  void commit();
-
   /** Commit the transaction in a mini-transaction.
   @param mtr  mini-transaction (if there are any persistent modifications) */
   void commit_low(mtr_t *mtr= nullptr);
-
+public:
+  /** Commit the transaction. */
+  void commit();
 
 
   bool is_referenced() const { return n_ref > 0; }
@@ -1069,7 +1033,6 @@ public:
     ut_ad(dict_operation == TRX_DICT_OP_NONE);
   }
 
-
   /** @return whether this is a non-locking autocommit transaction */
   bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
 
diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl
index 72310cbba06..93c9591e0c2 100644
--- a/storage/innobase/include/trx0trx.inl
+++ b/storage/innobase/include/trx0trx.inl
@@ -26,7 +26,7 @@ Created 3/26/1996 Heikki Tuuri
 
 /**********************************************************************//**
 Determines if a transaction is in the given state.
-The caller must hold trx_sys.mutex, or it must be the thread
+The caller must hold trx->mutex, or it must be the thread
 that is serving a running transaction.
 A running RW transaction must be in trx_sys.rw_trx_hash.
 @return TRUE if trx->state == state */
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
index 58b746ca189..99a9c66c839 100644
--- a/storage/innobase/include/trx0types.h
+++ b/storage/innobase/include/trx0types.h
@@ -48,6 +48,8 @@ static const ulint TRX_SYS_SPACE = 0;
 /** Random value to check for corruption of trx_t */
 static const ulint TRX_MAGIC_N = 91118598;
 
+constexpr uint innodb_purge_threads_MAX= 32;
+
 /** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
 enum trx_que_t {
 	TRX_QUE_RUNNING,		/*!< transaction is running */
@@ -119,8 +121,6 @@ struct trx_savept_t{
 
 /** File objects */
 /* @{ */
-/** Rollback segment header */
-typedef byte	trx_rsegf_t;
 /** Undo segment header */
 typedef byte	trx_usegf_t;
 /** Undo log header */
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
index 55b73f45faa..1ae23856087 100644
--- a/storage/innobase/include/trx0undo.h
+++ b/storage/innobase/include/trx0undo.h
@@ -46,10 +46,10 @@ UNIV_INLINE
 roll_ptr_t
 trx_undo_build_roll_ptr(
 /*====================*/
-	ibool	is_insert,	/*!< in: TRUE if insert undo log */
+	bool	is_insert,	/*!< in: true if insert undo log */
 	ulint	rseg_id,	/*!< in: rollback segment id */
-	ulint	page_no,	/*!< in: page number */
-	ulint	offset);	/*!< in: offset of the undo entry within page */
+	uint32_t page_no,	/*!< in: page number */
+	uint16_t offset);	/*!< in: offset of the undo entry within page */
 /***********************************************************************//**
 Decodes a roll pointer. */
 UNIV_INLINE
@@ -57,16 +57,16 @@ void
 trx_undo_decode_roll_ptr(
 /*=====================*/
 	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
-	ibool*		is_insert,	/*!< out: TRUE if insert undo log */
+	bool*		is_insert,	/*!< out: true if insert undo log */
 	ulint*		rseg_id,	/*!< out: rollback segment id */
-	ulint*		page_no,	/*!< out: page number */
-	ulint*		offset);	/*!< out: offset of the undo
+	uint32_t*	page_no,	/*!< out: page number */
+	uint16_t*	offset);	/*!< out: offset of the undo
 					entry within page */
 /***********************************************************************//**
-Returns TRUE if the roll pointer is of the insert type.
-@return TRUE if insert undo log */
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
 UNIV_INLINE
-ibool
+bool
 trx_undo_roll_ptr_is_insert(
 /*========================*/
 	roll_ptr_t	roll_ptr);	/*!< in: roll pointer */
@@ -101,7 +101,7 @@ inline roll_ptr_t trx_read_roll_ptr(const byte* ptr)
 @param[in,out]	mtr		mini-transaction
 @return pointer to page x-latched */
 UNIV_INLINE
-page_t*
+buf_block_t*
 trx_undo_page_get(const page_id_t page_id, mtr_t* mtr);
 
 /** Gets an undo log page and s-latches it.
@@ -109,56 +109,58 @@ trx_undo_page_get(const page_id_t page_id, mtr_t* mtr);
 @param[in,out]	mtr		mini-transaction
 @return pointer to page s-latched */
 UNIV_INLINE
-page_t*
+buf_block_t*
 trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr);
 
-/******************************************************************//**
-Returns the next undo log record on the page in the specified log, or
-NULL if none exists.
-@return pointer to record, NULL if none */
-UNIV_INLINE
-trx_undo_rec_t*
-trx_undo_page_get_next_rec(
-/*=======================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo log record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset);/*!< in: undo log header offset on page */
-/***********************************************************************//**
-Gets the previous record in an undo log.
-@return undo log record, the page s-latched, NULL if none */
+/** Get the next undo log record on the same page, in the specified log.
+@param[in]      undo_page       undo log page
+@param[in]      rec             undo record offset in the page
+@param[in]      page_no         undo log header page number
+@param[in]      offset          undo log header offset on page
+@return pointer to the next record on undo_page, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+                           uint32_t page_no, uint16_t offset);
+/** Get the previous record in an undo log.
+@param[in,out]  block   undo log page
+@param[in]      rec     undo record offset in the page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      shared  latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out]  mtr     mini-transaction
+@return undo log record, the page latched, NULL if none */
 trx_undo_rec_t*
-trx_undo_get_prev_rec(
-/*==================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset,	/*!< in: undo log header offset on page */
-	bool		shared,	/*!< in: true=S-latch, false=X-latch */
-	mtr_t*		mtr);	/*!< in: mtr */
-/***********************************************************************//**
-Gets the next record in an undo log.
-@return undo log record, the page s-latched, NULL if none */
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+                      uint16_t offset, bool shared, mtr_t *mtr);
+/** Get the next record in an undo log.
+@param[in,out]  block   undo log page
+@param[in]      rec     undo record offset in the page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in,out]  mtr     mini-transaction
+@return undo log record, the page latched, NULL if none */
 trx_undo_rec_t*
-trx_undo_get_next_rec(
-/*==================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset,	/*!< in: undo log header offset on page */
-	mtr_t*		mtr);	/*!< in: mtr */
-
-/** Gets the first record in an undo log.
-@param[in]	space		undo log header space
-@param[in]	page_no		undo log header page number
-@param[in]	offset		undo log header offset on page
-@param[in]	mode		latching mode: RW_S_LATCH or RW_X_LATCH
-@param[in,out]	mtr		mini-transaction
+trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+                      uint16_t offset, mtr_t *mtr);
+
+/** Get the first record in an undo log.
+@param[in]      space   undo log header space
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      mode    latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out]     block   undo log page
+@param[in,out]  mtr     mini-transaction
 @return undo log record, the page latched, NULL if none */
 trx_undo_rec_t*
-trx_undo_get_first_rec(
-	fil_space_t*		space,
-	ulint			page_no,
-	ulint			offset,
-	ulint			mode,
-	mtr_t*			mtr);
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+                       uint16_t offset, ulint mode, buf_block_t*& block,
+                       mtr_t *mtr);
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out]	block	undo log page */
+void trx_undo_page_init(const buf_block_t &block);
 
 /** Allocate an undo log page.
 @param[in,out]	undo	undo log
@@ -193,8 +195,8 @@ freed, but emptied, if all the records there are below the limit.
 void
 trx_undo_truncate_start(
 	trx_rseg_t*	rseg,
-	ulint		hdr_page_no,
-	ulint		hdr_offset,
+	uint32_t	hdr_page_no,
+	uint16_t	hdr_offset,
 	undo_no_t	limit);
 /** Mark that an undo log header belongs to a data dictionary transaction.
 @param[in]	trx	dictionary transaction
@@ -227,7 +229,7 @@ trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
 /******************************************************************//**
 Sets the state of the undo log segment at a transaction finish.
 @return undo log segment header page, x-latched */
-page_t*
+buf_block_t*
 trx_undo_set_state_at_finish(
 /*=========================*/
 	trx_undo_t*	undo,	/*!< in: undo log memory copy */
@@ -252,38 +254,6 @@ void trx_undo_commit_cleanup(trx_undo_t *undo);
 void
 trx_undo_free_at_shutdown(trx_t *trx);
 
-/** Parse MLOG_UNDO_INIT.
-@param[in]	ptr	log record
-@param[in]	end_ptr	end of log record buffer
-@param[in,out]	page	page or NULL
-@param[in,out]	mtr	mini-transaction
-@return	end of log record
-@retval	NULL	if the log record is incomplete */
-byte*
-trx_undo_parse_page_init(const byte* ptr, const byte* end_ptr, page_t* page);
-/** Parse MLOG_UNDO_HDR_REUSE for crash-upgrade from MariaDB 10.2.
-@param[in]	ptr	redo log record
-@param[in]	end_ptr	end of log buffer
-@param[in,out]	page	undo page or NULL
-@return end of log record or NULL */
-byte*
-trx_undo_parse_page_header_reuse(
-	const byte*	ptr,
-	const byte*	end_ptr,
-	page_t*		page);
-
-/** Parse the redo log entry of an undo log page header create.
-@param[in]	ptr	redo log record
-@param[in]	end_ptr	end of log buffer
-@param[in,out]	page	page frame or NULL
-@param[in,out]	mtr	mini-transaction or NULL
-@return end of log record or NULL */
-byte*
-trx_undo_parse_page_header(
-	const byte*	ptr,
-	const byte*	end_ptr,
-	page_t*		page,
-	mtr_t*		mtr);
 /** Read an undo log when starting up the database.
 @param[in,out]	rseg		rollback segment
 @param[in]	id		rollback segment slot
@@ -329,20 +299,20 @@ struct trx_undo_t {
 					id */
 	trx_rseg_t*	rseg;		/*!< rseg where the undo log belongs */
 	/*-----------------------------*/
-	ulint		hdr_page_no;	/*!< page number of the header page in
+	uint32_t	hdr_page_no;	/*!< page number of the header page in
 					the undo log */
-	ulint		hdr_offset;	/*!< header offset of the undo log on
-				       	the page */
-	ulint		last_page_no;	/*!< page number of the last page in the
+	uint32_t	last_page_no;	/*!< page number of the last page in the
 					undo log; this may differ from
 					top_page_no during a rollback */
-	ulint		size;		/*!< current size in pages */
+	uint16_t	hdr_offset;	/*!< header offset of the undo log on
+				       	the page */
+	uint32_t	size;		/*!< current size in pages */
 	/*-----------------------------*/
-	ulint		top_page_no;	/*!< page number where the latest undo
+	uint32_t	top_page_no;	/*!< page number where the latest undo
 					log record was catenated; during
 					rollback the page from which the latest
 					undo record was chosen */
-	ulint		top_offset;	/*!< offset of the latest undo record,
+	uint16_t	top_offset;	/*!< offset of the latest undo record,
 					i.e., the topmost element in the undo
 					log if we think of it as a stack */
 	undo_no_t	top_undo_no;	/*!< undo number of the latest record
@@ -473,14 +443,6 @@ which purge would not result in removing delete-marked records. */
 /** Size of the undo log header without XID information */
 #define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
 
-/* Note: the writing of the undo log old header is coded by a log record
-MLOG_UNDO_HDR_CREATE. The appending of an XID to the
-header is logged separately. In this sense, the XID is not really a member
-of the undo log header. TODO: do not append the XID to the log header if XA
-is not needed by the user. The XID wastes about 150 bytes of space in every
-undo log. In the history list we may have millions of undo logs, which means
-quite a large overhead. */
-
 /** X/Open XA Transaction Identification (XID) */
 /* @{ */
 /** xid_t::formatID */
diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl
index 6d1ec16869e..43af932708e 100644
--- a/storage/innobase/include/trx0undo.inl
+++ b/storage/innobase/include/trx0undo.inl
@@ -34,22 +34,17 @@ UNIV_INLINE
 roll_ptr_t
 trx_undo_build_roll_ptr(
 /*====================*/
-	ibool	is_insert,	/*!< in: TRUE if insert undo log */
+	bool	is_insert,	/*!< in: true if insert undo log */
 	ulint	rseg_id,	/*!< in: rollback segment id */
-	ulint	page_no,	/*!< in: page number */
-	ulint	offset)		/*!< in: offset of the undo entry within page */
+	uint32_t page_no,	/*!< in: page number */
+	uint16_t offset)		/*!< in: offset of the undo entry within page */
 {
-	roll_ptr_t	roll_ptr;
-	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
-	ut_ad(is_insert == 0 || is_insert == 1);
-	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
-	ut_ad(offset < 65536);
-
-	roll_ptr = (roll_ptr_t) is_insert << ROLL_PTR_INSERT_FLAG_POS
-		| (roll_ptr_t) rseg_id << ROLL_PTR_RSEG_ID_POS
-		| (roll_ptr_t) page_no << ROLL_PTR_PAGE_POS
-		| offset;
-	return(roll_ptr);
+  compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+  ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+
+  return roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS |
+    roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS |
+    roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS | offset;
 }
 
 /***********************************************************************//**
@@ -59,35 +54,32 @@ void
 trx_undo_decode_roll_ptr(
 /*=====================*/
 	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
-	ibool*		is_insert,	/*!< out: TRUE if insert undo log */
+	bool*		is_insert,	/*!< out: true if insert undo log */
 	ulint*		rseg_id,	/*!< out: rollback segment id */
-	ulint*		page_no,	/*!< out: page number */
-	ulint*		offset)		/*!< out: offset of the undo
+	uint32_t*	page_no,	/*!< out: page number */
+	uint16_t*	offset)		/*!< out: offset of the undo
 					entry within page */
 {
-	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
-	ut_ad(roll_ptr < (1ULL << 56));
-	*offset = (ulint) roll_ptr & 0xFFFF;
-	roll_ptr >>= 16;
-	*page_no = (ulint) roll_ptr & 0xFFFFFFFF;
-	roll_ptr >>= 32;
-	*rseg_id = (ulint) roll_ptr & 0x7F;
-	roll_ptr >>= 7;
-	*is_insert = (ibool) roll_ptr; /* TRUE==1 */
+  compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+  ut_ad(roll_ptr < (1ULL << 56));
+  *offset= static_cast<uint16_t>(roll_ptr);
+  *page_no= static_cast<uint32_t>(roll_ptr >> 16);
+  *rseg_id= static_cast<ulint>(roll_ptr >> 48 & 0x7F);
+  *is_insert= static_cast<bool>(roll_ptr >> 55);
 }
 
 /***********************************************************************//**
-Returns TRUE if the roll pointer is of the insert type.
-@return TRUE if insert undo log */
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
 UNIV_INLINE
-ibool
+bool
 trx_undo_roll_ptr_is_insert(
 /*========================*/
 	roll_ptr_t	roll_ptr)	/*!< in: roll pointer */
 {
 	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
 	ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1)));
-	return((ibool) (roll_ptr >> ROLL_PTR_INSERT_FLAG_POS));
+	return static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS);
 }
 
 /***********************************************************************//**
@@ -108,14 +100,13 @@ trx_undo_trx_id_is_insert(
 @param[in,out]	mtr		mini-transaction
 @return pointer to page x-latched */
 UNIV_INLINE
-page_t*
+buf_block_t*
 trx_undo_page_get(const page_id_t page_id, mtr_t* mtr)
 {
 	buf_block_t*	block = buf_page_get(page_id, 0, RW_X_LATCH, mtr);
 
 	buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
-
-	return(buf_block_get_frame(block));
+	return block;
 }
 
 /** Gets an undo log page and s-latches it.
@@ -123,14 +114,14 @@ trx_undo_page_get(const page_id_t page_id, mtr_t* mtr)
 @param[in,out]	mtr		mini-transaction
 @return pointer to page s-latched */
 UNIV_INLINE
-page_t*
+buf_block_t*
 trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr)
 {
 	buf_block_t*	block = buf_page_get(page_id, 0, RW_S_LATCH, mtr);
 
 	buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
-	return(buf_block_get_frame(block));
+	return block;
 }
 
 /** Determine the end offset of undo log records of an undo log page.
@@ -139,46 +130,29 @@ trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr)
 @param[in]	offset		undo log header offset
 @return end offset */
 inline
-uint16_t
-trx_undo_page_get_end(const page_t* undo_page, ulint page_no, ulint offset)
+uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no,
+                               uint16_t offset)
 {
-	if (page_no == page_get_page_no(undo_page)) {
-		if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG
-						    + offset + undo_page)) {
-			return end;
-		}
-	}
-
-	return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-				+ undo_page);
+  if (page_no == undo_page->page.id().page_no())
+    if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset +
+					undo_page->frame))
+      return end;
+
+  return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+			  undo_page->frame);
 }
 
-/******************************************************************//**
-Returns the next undo log record on the page in the specified log, or
-NULL if none exists.
-@return pointer to record, NULL if none */
-UNIV_INLINE
-trx_undo_rec_t*
-trx_undo_page_get_next_rec(
-/*=======================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo log record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset)	/*!< in: undo log header offset on page */
+/** Get the next undo log record on the same page, in the specified log.
+@param[in]      undo_page       undo log page
+@param[in]      rec             undo record offset in the page
+@param[in]      page_no         undo log header page number
+@param[in]      offset          undo log header offset on page
+@return pointer to the next record on undo_page, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+                           uint32_t page_no, uint16_t offset)
 {
-	page_t*	undo_page;
-	ulint	end;
-	ulint	next;
-
-	undo_page = (page_t*) ut_align_down(rec, srv_page_size);
-
-	end = trx_undo_page_get_end(undo_page, page_no, offset);
-
-	next = mach_read_from_2(rec);
-
-	if (next == end) {
-
-		return(NULL);
-	}
-
-	return(undo_page + next);
+  uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset);
+  uint16_t next= mach_read_from_2(undo_page->frame + rec);
+  return next == end ? nullptr : undo_page->frame + next;
 }
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index ddb27c107ff..3c8dfb2ddf6 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -78,6 +78,7 @@ support cross-platform development and expose comonly used SQL names. */
 
 #include <my_global.h>
 #include "my_counter.h"
+#include <m_string.h>
 
 /* JAN: TODO: missing 5.7 header */
 #ifdef HAVE_MY_THREAD_H
@@ -85,7 +86,6 @@ support cross-platform development and expose comonly used SQL names. */
 #endif
 
 #ifndef UNIV_INNOCHECKSUM
-# include <m_string.h>
 # include <mysqld_error.h>
 #endif /* !UNIV_INNOCHECKSUM */
 
@@ -114,8 +114,7 @@ HAVE_PSI_INTERFACE is defined. */
 # define UNIV_PFS_IO
 # define UNIV_PFS_THREAD
 
-// JAN: TODO: MySQL 5.7 PSI
-// # include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */
+# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */
 # ifdef HAVE_PSI_MEMORY_INTERFACE
 #  define UNIV_PFS_MEMORY
 # endif /* HAVE_PSI_MEMORY_INTERFACE */
@@ -129,7 +128,6 @@ be excluded from instrumentation. */
 
 # define PFS_IS_INSTRUMENTED(key)	((key) != PFS_NOT_INSTRUMENTED)
 
-/* JAN: TODO: missing 5.7 header */
 #ifdef HAVE_PFS_THREAD_PROVIDER_H
 /* For PSI_MUTEX_CALL() and similar. */
 #include "pfs_thread_provider.h"
@@ -137,7 +135,6 @@ be excluded from instrumentation. */
 
 #include "mysql/psi/mysql_thread.h"
 /* For PSI_FILE_CALL(). */
-/* JAN: TODO: missing 5.7 header */
 #ifdef HAVE_PFS_FILE_PROVIDER_H
 #include "pfs_file_provider.h"
 #endif
@@ -184,8 +181,6 @@ using the call command. */
 						some debug print functions */
 #define UNIV_AHI_DEBUG				/* Enable adaptive hash index
 						debugging without UNIV_DEBUG */
-#define UNIV_BUF_DEBUG				/* Enable buffer pool
-						debugging without UNIV_DEBUG */
 #define UNIV_BLOB_LIGHT_DEBUG			/* Enable off-page column
 						debugging without UNIV_DEBUG */
 #define UNIV_DEBUG_LOCK_VALIDATE		/* Enable
@@ -193,9 +188,6 @@ using the call command. */
 						assertions. */
 #define UNIV_LRU_DEBUG				/* debug the buffer pool LRU */
 #define UNIV_HASH_DEBUG				/* debug HASH_ macros */
-#define UNIV_LOG_LSN_DEBUG			/* write LSN to the redo log;
-this will break redo log file compatibility, but it may be useful when
-debugging redo log application problems. */
 #define UNIV_IBUF_DEBUG				/* debug the insert buffer */
 #define UNIV_PERF_DEBUG                         /* debug flag that enables
                                                 light weight performance
@@ -408,16 +400,19 @@ typedef ssize_t lint;
 #ifdef _WIN32
 /* Use the integer types and formatting strings defined in Visual Studio. */
 # define UINT32PF	"%u"
-# define INT64PF	"%lld"
 # define UINT64scan     "llu"
 # define UINT64PFx	"%016llx"
 #elif defined __APPLE__
 /* Apple prefers to call the 64-bit types 'long long'
 in both 32-bit and 64-bit environments. */
 # define UINT32PF	"%" PRIu32
-# define INT64PF	"%lld"
 # define UINT64scan     "llu"
 # define UINT64PFx	"%016llx"
+#elif defined _AIX
+/* Workaround for macro expansion trouble */
+# define UINT32PF      "%u"
+# define UINT64scan    "lu"
+# define UINT64PFx     "%016lx"
 #else
 /* Use the integer types and formatting strings defined in the C99 standard. */
 # define UINT32PF	"%" PRIu32
@@ -445,8 +440,6 @@ typedef	ib_uint64_t		lsn_t;
 /** The 'undefined' value for a ulint */
 #define ULINT_UNDEFINED		((ulint)(-1))
 
-#define ULONG_UNDEFINED		((ulong)(-1))
-
 /** The 'undefined' value for a ib_uint64_t */
 #define UINT64_UNDEFINED	((ib_uint64_t)(-1))
 
@@ -584,8 +577,6 @@ typedef void* os_thread_ret_t;
 extern ulong	srv_page_size_shift;
 extern ulong	srv_page_size;
 
-static const size_t UNIV_SECTOR_SIZE = 512;
-
 /* Dimension of spatial object we support so far. It has its root in
 myisam/sp_defs.h. We only support 2 dimension data */
 #define SPDIMS          2
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
index 1a90b523b2d..7ed64c61c4b 100644
--- a/storage/innobase/include/ut0byte.h
+++ b/storage/innobase/include/ut0byte.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -26,8 +27,6 @@ Created 1/20/1994 Heikki Tuuri
 #ifndef ut0byte_h
 #define ut0byte_h
 
-
-
 #include "univ.i"
 
 /*******************************************************//**
@@ -61,37 +60,38 @@ ut_uint64_align_up(
 	ib_uint64_t	 n,		/*!< in: number to be rounded */
 	ulint		 align_no);	/*!< in: align by this number
 					which must be a power of 2 */
-/*********************************************************//**
-The following function rounds up a pointer to the nearest aligned address.
-@return aligned pointer */
-UNIV_INLINE
-void*
-ut_align(
-/*=====*/
-	const void*	ptr,		/*!< in: pointer */
-	ulint		align_no);	/*!< in: align by this number */
-/*********************************************************//**
-The following function rounds down a pointer to the nearest
-aligned address.
+/** Round down a pointer to the nearest aligned address.
+@param ptr        pointer
+@param alignment  a power of 2
 @return aligned pointer */
-UNIV_INLINE
-void*
-ut_align_down(
-/*==========*/
-	const void*	ptr,		/*!< in: pointer */
-	ulint		align_no)	/*!< in: align by this number */
-		MY_ATTRIBUTE((const));
-/*********************************************************//**
-The following function computes the offset of a pointer from the nearest
-aligned address.
+static inline void *ut_align_down(void *ptr, size_t alignment)
+{
+  ut_ad(alignment > 0);
+  ut_ad(ut_is_2pow(alignment));
+  ut_ad(ptr);
+  static_assert(sizeof ptr == sizeof(size_t), "compatibility");
+
+  return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) &
+                                 ~(alignment - 1));
+}
+
+static inline const void *ut_align_down(const void *ptr, size_t alignment)
+{
+  return ut_align_down(const_cast<void*>(ptr), alignment);
+}
+
+/** Compute the offset of a pointer from the nearest aligned address.
+@param ptr        pointer
+@param alignment  a power of 2
 @return distance from aligned pointer */
-UNIV_INLINE
-ulint
-ut_align_offset(
-/*============*/
-	const void*	ptr,		/*!< in: pointer */
-	ulint		align_no)	/*!< in: align by this number */
-			MY_ATTRIBUTE((const));
+inline size_t ut_align_offset(const void *ptr, size_t alignment)
+{
+  ut_ad(alignment > 0);
+  ut_ad(ut_is_2pow(alignment));
+  ut_ad(ptr);
+  return reinterpret_cast<size_t>(ptr) & (alignment - 1);
+}
+
 /*****************************************************************//**
 Gets the nth bit of a ulint.
 @return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
diff --git a/storage/innobase/include/ut0byte.inl b/storage/innobase/include/ut0byte.inl
index 6dd48090c3a..a4b5d4a7d8f 100644
--- a/storage/innobase/include/ut0byte.inl
+++ b/storage/innobase/include/ut0byte.inl
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -74,65 +75,6 @@ ut_uint64_align_up(
 	return((n + align_1) & ~align_1);
 }
 
-/*********************************************************//**
-The following function rounds up a pointer to the nearest aligned address.
-@return aligned pointer */
-UNIV_INLINE ATTRIBUTE_ACCESS((none,1))
-void*
-ut_align(
-/*=====*/
-	const void*	ptr,		/*!< in: pointer */
-	ulint		align_no)	/*!< in: align by this number */
-{
-	ut_ad(align_no > 0);
-	ut_ad(((align_no - 1) & align_no) == 0);
-	ut_ad(ptr);
-
-	ut_ad(sizeof(void*) == sizeof(ulint));
-
-	return((void*)((((ulint) ptr) + align_no - 1) & ~(align_no - 1)));
-}
-
-/*********************************************************//**
-The following function rounds down a pointer to the nearest
-aligned address.
-@return aligned pointer */
-UNIV_INLINE ATTRIBUTE_ACCESS((none,1))
-void*
-ut_align_down(
-/*==========*/
-	const void*	ptr,		/*!< in: pointer */
-	ulint		align_no)	/*!< in: align by this number */
-{
-	ut_ad(align_no > 0);
-	ut_ad(((align_no - 1) & align_no) == 0);
-	ut_ad(ptr);
-
-	ut_ad(sizeof(void*) == sizeof(ulint));
-
-	return((void*)(((ulint) ptr) & ~(align_no - 1)));
-}
-
-/*********************************************************//**
-The following function computes the offset of a pointer from the nearest
-aligned address.
-@return distance from aligned pointer */
-UNIV_INLINE ATTRIBUTE_ACCESS((none,1))
-ulint
-ut_align_offset(
-/*============*/
-	const void*	ptr,		/*!< in: pointer */
-	ulint		align_no)	/*!< in: align by this number */
-{
-	ut_ad(align_no > 0);
-	ut_ad(((align_no - 1) & align_no) == 0);
-	ut_ad(ptr);
-
-	ut_ad(sizeof(void*) == sizeof(ulint));
-
-	return(((ulint) ptr) & (align_no - 1));
-}
-
 /*****************************************************************//**
 Gets the nth bit of a ulint.
 @return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h
index f2c1b7e82b6..0cbccb976e2 100644
--- a/storage/innobase/include/ut0crc32.h
+++ b/storage/innobase/include/ut0crc32.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2018, MariaDB Corporation.
+Copyright (c) 2016, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -28,26 +28,10 @@ Created Aug 10, 2011 Vasil Dimov
 #define ut0crc32_h
 
 #include "univ.i"
-
-/********************************************************************//**
-Initializes the data structures used by ut_crc32*(). Does not do any
-allocations, would not hurt if called twice, but would be pointless. */
-void
-ut_crc32_init();
-/*===========*/
-
-/********************************************************************//**
-Calculates CRC32.
-@param ptr - data over which to calculate CRC32.
-@param len - data length in bytes.
-@return CRC32 (CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41,
-or 0x1EDC6F41 without the high-order bit) */
-typedef uint32_t	(*ut_crc32_func_t)(const byte* ptr, ulint len);
-
-/** Pointer to CRC32 calculation function. */
-extern ut_crc32_func_t	ut_crc32;
-
-/** Text description of CRC32 implementation */
-extern const char*	ut_crc32_implementation;
+#include <my_sys.h>
+static inline uint32_t ut_crc32(const byte *s, size_t size)
+{
+  return my_crc32c(0, s, size);
+}
 
 #endif /* ut0crc32_h */
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
index 99811c400da..a5ed72f9f02 100644
--- a/storage/innobase/include/ut0mem.h
+++ b/storage/innobase/include/ut0mem.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,62 +27,7 @@ Created 5/30/1994 Heikki Tuuri
 #ifndef ut0mem_h
 #define ut0mem_h
 
-#include "os0event.h"
-#include "ut0mutex.h"
-
-/** Wrapper for memcpy(3).  Copy memory area when the source and
-target are not overlapping.
-@param[in,out]	dest	copy to
-@param[in]	src	copy from
-@param[in]	n	number of bytes to copy
-@return dest */
-UNIV_INLINE
-void*
-ut_memcpy(void* dest, const void* src, ulint n);
-
-/** Wrapper for memmove(3).  Copy memory area when the source and
-target are overlapping.
-@param[in,out]	dest	Move to
-@param[in]	src	Move from
-@param[in]	n	number of bytes to move
-@return dest */
-UNIV_INLINE
-void*
-ut_memmove(void* dest, const void* sour, ulint n);
-
-/** Wrapper for memcmp(3).  Compare memory areas.
-@param[in]	str1	first memory block to compare
-@param[in]	str2	second memory block to compare
-@param[in]	n	number of bytes to compare
-@return negative, 0, or positive if str1 is smaller, equal,
-		or greater than str2, respectively. */
-UNIV_INLINE
-int
-ut_memcmp(const void* str1, const void* str2, ulint n);
-
-/** Wrapper for strcpy(3).  Copy a NUL-terminated string.
-@param[in,out]	dest	Destination to copy to
-@param[in]	src	Source to copy from
-@return dest */
-UNIV_INLINE
-char*
-ut_strcpy(char* dest, const char* src);
-
-/** Wrapper for strlen(3).  Determine the length of a NUL-terminated string.
-@param[in]	str	string
-@return length of the string in bytes, excluding the terminating NUL */
-UNIV_INLINE
-ulint
-ut_strlen(const char* str);
-
-/** Wrapper for strcmp(3).  Compare NUL-terminated strings.
-@param[in]	str1	first string to compare
-@param[in]	str2	second string to compare
-@return negative, 0, or positive if str1 is smaller, equal,
-		or greater than str2, respectively. */
-UNIV_INLINE
-int
-ut_strcmp(const char* str1, const char* str2);
+#include "univ.i"
 
 /********************************************************************
 Concatenate 3 strings.*/
diff --git a/storage/innobase/include/ut0mem.inl b/storage/innobase/include/ut0mem.inl
index 8c8788a38aa..cc95a03642e 100644
--- a/storage/innobase/include/ut0mem.inl
+++ b/storage/innobase/include/ut0mem.inl
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2019, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,78 +27,6 @@ Created 5/30/1994 Heikki Tuuri
 #include "ut0byte.h"
 #include "mach0data.h"
 
-/** Wrapper for memcpy(3).  Copy memory area when the source and
-target are not overlapping.
-@param[in,out]	dest	copy to
-@param[in]	src	copy from
-@param[in]	n	number of bytes to copy
-@return dest */
-UNIV_INLINE
-void*
-ut_memcpy(void* dest, const void* src, ulint n)
-{
-	return(memcpy(dest, src, n));
-}
-
-/** Wrapper for memmove(3).  Copy memory area when the source and
-target are overlapping.
-@param[in,out]	dest	Move to
-@param[in]	src	Move from
-@param[in]	n	number of bytes to move
-@return dest */
-UNIV_INLINE
-void*
-ut_memmove(void* dest, const void* src, ulint n)
-{
-	return(memmove(dest, src, n));
-}
-
-/** Wrapper for memcmp(3).  Compare memory areas.
-@param[in]	str1	first memory block to compare
-@param[in]	str2	second memory block to compare
-@param[in]	n	number of bytes to compare
-@return negative, 0, or positive if str1 is smaller, equal,
-		or greater than str2, respectively. */
-UNIV_INLINE
-int
-ut_memcmp(const void* str1, const void* str2, ulint n)
-{
-	return(memcmp(str1, str2, n));
-}
-
-/** Wrapper for strcpy(3).  Copy a NUL-terminated string.
-@param[in,out]	dest	Destination to copy to
-@param[in]	src	Source to copy from
-@return dest */
-UNIV_INLINE
-char*
-ut_strcpy(char* dest, const char* src)
-{
-	return(strcpy(dest, src));
-}
-
-/** Wrapper for strlen(3).  Determine the length of a NUL-terminated string.
-@param[in]	str	string
-@return length of the string in bytes, excluding the terminating NUL */
-UNIV_INLINE
-ulint
-ut_strlen(const char* str)
-{
-	return(strlen(str));
-}
-
-/** Wrapper for strcmp(3).  Compare NUL-terminated strings.
-@param[in]	str1	first string to compare
-@param[in]	str2	second string to compare
-@return negative, 0, or positive if str1 is smaller, equal,
-		or greater than str2, respectively. */
-UNIV_INLINE
-int
-ut_strcmp(const char* str1, const char* str2)
-{
-	return(strcmp(str1, str2));
-}
-
 /**********************************************************************//**
 Converts a raw binary data to a NUL-terminated hex string. The output is
 truncated if there is not enough space in "hex", make sure "hex_size" is at
diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h
index e1b6d129b9d..cb43583c21d 100644
--- a/storage/innobase/include/ut0mutex.h
+++ b/storage/innobase/include/ut0mutex.h
@@ -24,11 +24,8 @@ Policy based mutexes.
 Created 2012-03-24 Sunny Bains.
 ***********************************************************************/
 
+#pragma once
 #ifndef UNIV_INNOCHECKSUM
-
-#ifndef ut0mutex_h
-#define ut0mutex_h
-
 #include "sync0policy.h"
 #include "ib0mutex.h"
 
@@ -38,32 +35,23 @@ Created 2012-03-24 Sunny Bains.
 @param[in]	T		The resulting typedef alias */
 #define UT_MUTEX_TYPE(M, P, T) typedef PolicyMutex<M<P> > T;
 
-# ifdef HAVE_IB_LINUX_FUTEX
+# ifdef __linux__
 UT_MUTEX_TYPE(TTASFutexMutex, GenericPolicy, FutexMutex);
-UT_MUTEX_TYPE(TTASFutexMutex, BlockMutexPolicy, BlockFutexMutex);
-# endif /* HAVE_IB_LINUX_FUTEX */
+# endif /* __linux__ */
 
 UT_MUTEX_TYPE(TTASMutex, GenericPolicy, SpinMutex);
-UT_MUTEX_TYPE(TTASMutex, BlockMutexPolicy, BlockSpinMutex);
-
 UT_MUTEX_TYPE(OSTrackMutex, GenericPolicy, SysMutex);
-UT_MUTEX_TYPE(OSTrackMutex, BlockMutexPolicy, BlockSysMutex);
-
 UT_MUTEX_TYPE(TTASEventMutex, GenericPolicy, SyncArrayMutex);
-UT_MUTEX_TYPE(TTASEventMutex, BlockMutexPolicy, BlockSyncArrayMutex);
 
 #ifdef MUTEX_FUTEX
 /** The default mutex type. */
 typedef FutexMutex ib_mutex_t;
-typedef BlockFutexMutex ib_bpmutex_t;
 #define MUTEX_TYPE	"Uses futexes"
 #elif defined(MUTEX_SYS)
 typedef SysMutex ib_mutex_t;
-typedef BlockSysMutex ib_bpmutex_t;
 #define MUTEX_TYPE	"Uses system mutexes"
 #elif defined(MUTEX_EVENT)
 typedef SyncArrayMutex ib_mutex_t;
-typedef BlockSyncArrayMutex ib_bpmutex_t;
 #define MUTEX_TYPE	"Uses event mutexes"
 #else
 #error "ib_mutex_t type is unknown"
@@ -187,6 +175,4 @@ void mutex_destroy(
 	mutex->destroy();
 }
 
-#endif /* ut0mutex_h */
-
 #endif /* UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
index a190b872549..e8469db9dc3 100644
--- a/storage/innobase/include/ut0new.h
+++ b/storage/innobase/include/ut0new.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -128,16 +128,14 @@ InnoDB:
 #include <stdlib.h> /* malloc() */
 #include <string.h> /* strlen(), strrchr(), strncmp() */
 
+#include <my_sys.h> /* my_large_free/malloc() */
+
 #include "my_global.h" /* needed for headers from mysql/psi/ */
 
-/* JAN: TODO: missing 5.7 header */
-#ifdef HAVE_MYSQL_MEMORY_H
 #include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
-#endif
 
 #include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
 
-#include "os0proc.h" /* os_mem_alloc_large() */
 #include "os0thread.h" /* os_thread_sleep() */
 #include "ut0ut.h" /* ut_strcmp_functor, ut_basename_noext() */
 
@@ -146,9 +144,15 @@ InnoDB:
 	" operating system. Note that on most 32-bit computers the process" \
 	" memory space is limited to 2 GB or 4 GB."
 
+/** The total amount of memory currently allocated from the operating
+system with allocate_large() */
+extern Atomic_counter<ulint> os_total_large_mem_allocated;
+
 /** Maximum number of retries to allocate memory. */
 extern const size_t	alloc_max_retries;
 
+constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU;
+
 /** Keys for registering allocations with performance schema.
 Pointers to these variables are supplied to PFS code via the pfs_info[]
 array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)().
@@ -179,13 +183,13 @@ ut_new_boot();
 
 #ifdef UNIV_PFS_MEMORY
 
-/** Retrieve a memory key (registered with PFS), given a portion of the file
-name of the caller.
-@param[in]	file	portion of the filename - basename without an extension
-@return registered memory key or PSI_NOT_INSTRUMENTED if not found */
-PSI_memory_key
-ut_new_get_key_by_file(
-	const char*	file);
+/**
+Retrieve a memory key (registered with PFS),
+given AUTOEVENT_IDX of the caller
+
+@param[in] autoevent_idx - AUTOEVENT_IDX value of the caller
+@return registered memory key or PSI_NOT_INSTRUMENTED */
+PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx);
 
 #endif /* UNIV_PFS_MEMORY */
 
@@ -232,38 +236,19 @@ struct ut_new_pfx_t {
 #endif
 };
 
-static inline void ut_allocate_trace_dontdump(void *ptr, size_t	bytes,
-					      bool
-#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DONTDUMP)
-					      dontdump
-#endif
-					      , ut_new_pfx_t* pfx,
-					      const char*
-#ifdef UNIV_PFS_MEMORY
-					      file
-#endif
-
-					      )
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump)
 {
 	ut_a(ptr != NULL);
 
-#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DONTDUMP)
-	if (dontdump && madvise(ptr, bytes, MADV_DONTDUMP)) {
+	if (dontdump && madvise(ptr, m_size, MADV_DONTDUMP)) {
 		ib::warn() << "Failed to set memory to " DONTDUMP_STR ": "
 			   << strerror(errno)
 			   << " ptr " << ptr
-			   << " size " << bytes;
-	}
-#endif
-	if (pfx != NULL) {
-#ifdef UNIV_PFS_MEMORY
-		allocate_trace(bytes, file, pfx);
-#endif /* UNIV_PFS_MEMORY */
-		pfx->m_size = bytes;
+			   << " size " << m_size;
 	}
 }
 
-#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
 static inline void ut_dodump(void* ptr, size_t m_size)
 {
 	if (ptr && madvise(ptr, m_size, MADV_DODUMP)) {
@@ -274,6 +259,7 @@ static inline void ut_dodump(void* ptr, size_t m_size)
 	}
 }
 #else
+static inline void ut_dontdump(void *, size_t, bool) {}
 static inline void ut_dodump(void*, size_t) {}
 #endif
 
@@ -310,10 +296,14 @@ public:
 		     other
 #endif
 		     )
+	{
 #ifdef UNIV_PFS_MEMORY
-		: m_key(other.m_key)
+		const PSI_memory_key other_key = other.get_mem_key();
+
+		m_key = (other_key != mem_key_std)
+			? other_key
+			: PSI_NOT_INSTRUMENTED;
 #endif /* UNIV_PFS_MEMORY */
-	{
 	}
 
 	/** Return the maximum number of objects that can be allocated by
@@ -330,7 +320,7 @@ public:
 #endif /* UNIV_PFS_MEMORY */
 	}
 
-	pointer allocate(size_type n) { return allocate(n, NULL, NULL); }
+	pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); }
 
 	/** Allocate a chunk of memory that can hold 'n_elements' objects of
 	type 'T' and trace the allocation.
@@ -348,9 +338,9 @@ public:
 	allocate(
 		size_type	n_elements,
 		const_pointer,
-		const char*
+		uint32_t
 #ifdef UNIV_PFS_MEMORY
-		file /*!< file name of the caller */
+		autoevent_idx /* AUTOEVENT_IDX of the caller */
 #endif
 		,
 		bool		set_to_zero = false,
@@ -412,7 +402,7 @@ public:
 #ifdef UNIV_PFS_MEMORY
 		ut_new_pfx_t*	pfx = static_cast<ut_new_pfx_t*>(ptr);
 
-		allocate_trace(total_bytes, file, pfx);
+		allocate_trace(total_bytes, autoevent_idx, pfx);
 
 		return(reinterpret_cast<pointer>(pfx + 1));
 #else
@@ -422,7 +412,7 @@ public:
 
 	/** Free a memory allocated by allocate() and trace the deallocation.
 	@param[in,out]	ptr		pointer to memory to free */
-	void deallocate(pointer ptr, size_type)
+	void deallocate(pointer ptr, size_type n_elements = 0)
 	{
 #ifdef UNIV_PFS_MEMORY
 		if (ptr == NULL) {
@@ -494,7 +484,7 @@ public:
 	reallocate(
 		void*		ptr,
 		size_type	n_elements,
-		const char*	file)
+		uint32_t	autoevent_idx)
 	{
 		if (n_elements == 0) {
 			deallocate(static_cast<pointer>(ptr));
@@ -502,7 +492,7 @@ public:
 		}
 
 		if (ptr == NULL) {
-			return(allocate(n_elements, NULL, file, false, false));
+			return(allocate(n_elements, NULL, autoevent_idx, false, false));
 		}
 
 		if (n_elements > max_size()) {
@@ -545,7 +535,7 @@ public:
 		deallocate_trace(pfx_new);
 
 		/* pfx_new is set here to describe the new block. */
-		allocate_trace(total_bytes, file, pfx_new);
+		allocate_trace(total_bytes, autoevent_idx, pfx_new);
 
 		return(reinterpret_cast<pointer>(pfx_new + 1));
 	}
@@ -561,9 +551,10 @@ public:
 	pointer
 	new_array(
 		size_type	n_elements,
-		const char*	file)
+		uint32_t autoevent_idx
+		)
 	{
-		T*	p = allocate(n_elements, NULL, file, false, false);
+		T*	p = allocate(n_elements, NULL, autoevent_idx, false, false);
 
 		if (p == NULL) {
 			return(NULL);
@@ -639,13 +630,22 @@ public:
 		ulint	n_bytes = n_elements * sizeof(T);
 
 		pointer	ptr = reinterpret_cast<pointer>(
-			os_mem_alloc_large(&n_bytes));
+			my_large_malloc(&n_bytes, MYF(0)));
 
 		if (ptr == NULL) {
 			return NULL;
 		}
 
-		ut_allocate_trace_dontdump(ptr, n_bytes, dontdump, pfx, NULL);
+		ut_dontdump(ptr, n_bytes, dontdump);
+
+		if (pfx != NULL) {
+#ifdef UNIV_PFS_MEMORY
+			allocate_trace(n_bytes, 0, pfx);
+#endif /* UNIV_PFS_MEMORY */
+			pfx->m_size = n_bytes;
+		}
+
+		os_total_large_mem_allocated += n_bytes;
 
 		return(ptr);
 	}
@@ -665,40 +665,26 @@ public:
 	void
 	deallocate_large(
 		pointer			ptr,
-		const ut_new_pfx_t*
-#ifdef UNIV_PFS_MEMORY
-		pfx
-#endif
-		,
-		size_t			size)
+		const ut_new_pfx_t*	pfx)
 	{
+		size_t size = pfx->m_size;
 #ifdef UNIV_PFS_MEMORY
 		if (pfx) {
 			deallocate_trace(pfx);
 		}
 #endif /* UNIV_PFS_MEMORY */
+		os_total_large_mem_allocated -= size;
 
-		os_mem_free_large(ptr, size);
+		my_large_free(ptr, size);
 	}
 
 	void
 	deallocate_large_dodump(
 		pointer			ptr,
-		const ut_new_pfx_t*
-#ifdef UNIV_PFS_MEMORY
-		pfx
-#endif
-		,
-		size_t			size)
+		const ut_new_pfx_t*	pfx)
 	{
-		ut_dodump(ptr, size);
-		deallocate_large(ptr,
-#ifdef UNIV_PFS_MEMORY
-		pfx,
-#else
-		NULL,
-#endif
-		size);
+		ut_dodump(ptr, pfx->m_size);
+		deallocate_large(ptr, pfx);
 	}
 
 #ifdef UNIV_PFS_MEMORY
@@ -707,25 +693,16 @@ public:
 	@return performance schema key */
 	PSI_memory_key
 	get_mem_key(
-		const char*	file) const
+		uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const
 	{
 		if (m_key != PSI_NOT_INSTRUMENTED) {
 			return(m_key);
 		}
 
-		if (file == NULL) {
+		if (autoevent_idx == INVALID_AUTOEVENT_IDX) {
 			return(mem_key_std);
 		}
-
-		/* e.g. "btr0cur", derived from "/path/to/btr0cur.cc" */
-		char		keyname[FILENAME_MAX];
-		const size_t	len = ut_basename_noext(file, keyname,
-							sizeof(keyname));
-		/* If sizeof(keyname) was not enough then the output would
-		be truncated, assert that this did not happen. */
-		ut_a(len < sizeof(keyname));
-
-		const PSI_memory_key	key = ut_new_get_key_by_file(keyname);
+		const PSI_memory_key	key = ut_new_get_key_by_file(autoevent_idx);
 
 		if (key != PSI_NOT_INSTRUMENTED) {
 			return(key);
@@ -767,16 +744,16 @@ private:
 	   corresponds to "file", that will be used (see ut_new_boot())
 	4. Otherwise, the name associated with mem_key_other will be used.
 	@param[in]	size	number of bytes that were allocated
-	@param[in]	file	file name of the caller or NULL if unknown
+	@param[in]	autoevent_idx	autoevent_idx of the caller
 	@param[out]	pfx	placeholder to store the info which will be
 	needed when freeing the memory */
 	void
 	allocate_trace(
 		size_t		size,
-		const char*	file,
+		const uint32_t autoevent_idx,
 		ut_new_pfx_t*	pfx)
 	{
-		const PSI_memory_key	key = get_mem_key(file);
+		const PSI_memory_key	key = get_mem_key(autoevent_idx);
 
 		pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, & pfx->m_owner);
 		pfx->m_size = size;
@@ -826,6 +803,128 @@ operator!=(
 
 #ifdef UNIV_PFS_MEMORY
 
+/*
+ constexpr trickery ahead.
+
+ Compute AUTOEVENT_IDX at compile time.
+ (index in the auto_event_names array, corresponding to basename of __FILE__)
+
+ The tricks are necessary to reduce the cost of looking up the
+ PSI_memory_key for an auto event.
+*/
+
+static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash)
+{
+  return
+    *s == '\0' ? last_slash :
+    *s == '/' || *s == '\\' ? cexpr_basename_helper(s + 1, s + 1) :
+    cexpr_basename_helper(s + 1, last_slash);
+}
+
+static constexpr const char* cexpr_basename(const char* filename)
+{
+  return cexpr_basename_helper(filename, filename);
+}
+
+static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b)
+{
+  return  *a == 0 || *a == '.' ? (*b == 0 || *b == '.')
+    : *a == *b ? cexpr_strequal_ignore_dot(a + 1, b + 1) : false;
+}
+
+constexpr const char* const auto_event_names[] =
+{
+  "btr0btr",
+  "btr0buf",
+  "btr0bulk",
+  "btr0cur",
+  "btr0pcur",
+  "btr0sea",
+  "buf0buf",
+  "buf0dblwr",
+  "buf0dump",
+  "dict0dict",
+  "dict0mem",
+  "dict0stats",
+  "eval0eval",
+  "fil0crypt",
+  "fil0fil",
+  "fsp0file",
+  "fts0ast",
+  "fts0blex",
+  "fts0config",
+  "fts0file",
+  "fts0fts",
+  "fts0opt",
+  "fts0pars",
+  "fts0que",
+  "fts0sql",
+  "fts0tlex",
+  "gis0sea",
+  "ha_innodb",
+  "handler0alter",
+  "hash0hash",
+  "i_s",
+  "lexyy",
+  "lock0lock",
+  "mem0mem",
+  "os0event",
+  "os0file",
+  "pars0lex",
+  "rem0rec",
+  "row0ftsort",
+  "row0import",
+  "row0log",
+  "row0merge",
+  "row0mysql",
+  "row0sel",
+  "srv0start",
+  "sync0arr",
+  "sync0debug",
+  "sync0rw",
+  "sync0start",
+  "sync0types",
+  "trx0i_s",
+  "trx0i_s",
+  "trx0roll",
+  "trx0rseg",
+  "trx0seg",
+  "trx0trx",
+  "trx0undo",
+  "ut0list",
+  "ut0mem",
+  "ut0new",
+  "ut0pool",
+  "ut0rbt",
+  "ut0wqueue",
+  "xtrabackup",
+  nullptr
+};
+
+constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0)
+{
+  return !auto_event_names[idx] ? INVALID_AUTOEVENT_IDX :
+    cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx :
+    cexpr_lookup_auto_event_name(name, idx + 1);
+}
+
+/*
+ The AUTOEVENT_IDX macro.
+
+ Note that a static_assert fails the build when the basename of
+ __FILE__ is not registered in the auto_event_names array.
+ If you run into this assert, add the basename to the array.
+
+ The odd-looking lambda is used to force evaluation at compile time.
+*/
+#define AUTOEVENT_IDX []()\
+{\
+  constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \
+  static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\
+  return idx; \
+}()
+
+
 /** Allocate, trace the allocation and construct an object.
 Use this macro instead of 'new' within InnoDB.
 For example: instead of
@@ -843,7 +942,7 @@ pointer must be passed to UT_DELETE() when no longer needed.
 	object if the passed in pointer is NULL, e.g. if allocate() has
 	failed to allocate memory and has returned NULL. */ \
 	::new(ut_allocator<byte>(key).allocate( \
-		sizeof expr, NULL, __FILE__, false, false)) expr
+		sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr
 
 /** Allocate, trace the allocation and construct an object.
 Use this macro instead of 'new' within InnoDB and instead of UT_NEW()
@@ -865,6 +964,7 @@ We can't instantiate ut_allocator without having the type of the object, thus
 we redirect this to a templated function. */
 #define UT_DELETE(ptr)		ut_delete(ptr)
 
+
 /** Destroy and account object created by UT_NEW() or UT_NEW_NOKEY().
 @param[in,out]	ptr	pointer to the object */
 template <typename T>
@@ -891,7 +991,7 @@ The returned pointer must be passed to UT_DELETE_ARRAY().
 @param[in]	key		performance schema memory tracing key
 @return pointer to the first allocated object or NULL */
 #define UT_NEW_ARRAY(type, n_elements, key) \
-	ut_allocator<type>(key).new_array(n_elements, __FILE__)
+	ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX)
 
 /** Allocate and account 'n_elements' objects of type 'type'.
 Use this macro to allocate memory within InnoDB instead of 'new[]' and
@@ -922,39 +1022,35 @@ ut_delete_array(
 
 #define ut_malloc(n_bytes, key)		static_cast<void*>( \
 	ut_allocator<byte>(key).allocate( \
-		n_bytes, NULL, __FILE__, false, false))
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
 
-#define ut_malloc_dontdump(n_bytes) static_cast<void*>( \
-	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate_large( \
-		n_bytes, true))
+#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>( \
+	ut_allocator<byte>(key).allocate_large( \
+		n_bytes, NULL, true))
 
 #define ut_zalloc(n_bytes, key)		static_cast<void*>( \
 	ut_allocator<byte>(key).allocate( \
-		n_bytes, NULL, __FILE__, true, false))
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
 
 #define ut_malloc_nokey(n_bytes)	static_cast<void*>( \
 	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
-		n_bytes, NULL, __FILE__, false, false))
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
 
 #define ut_zalloc_nokey(n_bytes)	static_cast<void*>( \
 	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
-		n_bytes, NULL, __FILE__, true, false))
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
 
 #define ut_zalloc_nokey_nofatal(n_bytes)	static_cast<void*>( \
 	ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate( \
-		n_bytes, NULL, __FILE__, true, false))
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
 
 #define ut_realloc(ptr, n_bytes)	static_cast<void*>( \
 	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate( \
-		ptr, n_bytes, __FILE__))
+		ptr, n_bytes, AUTOEVENT_IDX))
 
 #define ut_free(ptr)	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
 	reinterpret_cast<byte*>(ptr))
 
-#define ut_free_dodump(ptr, size) static_cast<void*>( \
-	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate_large( \
-		ptr, NULL, size, true))
-
 #else /* UNIV_PFS_MEMORY */
 
 /* Fallbacks when memory tracing is disabled at compile time. */
@@ -977,11 +1073,15 @@ ut_delete_array(
 
 #define ut_malloc_nokey(n_bytes)	::malloc(n_bytes)
 
-static inline void *ut_malloc_dontdump(size_t n_bytes)
+static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
 {
-	void *ptr = os_mem_alloc_large(&n_bytes);
+	void *ptr = my_large_malloc(&n_bytes, MYF(0));
 
-	ut_allocate_trace_dontdump(ptr, n_bytes, true, NULL, NULL);
+	/* ut_dontdump() asserts ptr != NULL, so skip it when allocation failed. */
+	if (ptr) {
+		ut_dontdump(ptr, n_bytes, true);
+		os_total_large_mem_allocated += n_bytes;
+	}
 	return ptr;
 }
 
@@ -993,12 +1093,13 @@ static inline void *ut_malloc_dontdump(size_t n_bytes)
 
 #define ut_free(ptr)			::free(ptr)
 
+#endif /* UNIV_PFS_MEMORY */
+
 static inline void ut_free_dodump(void *ptr, size_t size)
 {
 	ut_dodump(ptr, size);
-	os_mem_free_large(ptr, size);
+	os_total_large_mem_allocated -= size;
+	my_large_free(ptr, size);
 }
 
-#endif /* UNIV_PFS_MEMORY */
-
 #endif /* ut0new_h */
diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h
index 1eb7810a1bb..17fbd91b7ef 100644
--- a/storage/innobase/include/ut0stage.h
+++ b/storage/innobase/include/ut0stage.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -41,8 +41,6 @@ Created Nov 12, 2014 Vasil Dimov
 
 #ifdef HAVE_PSI_STAGE_INTERFACE
 
-typedef void PSI_stage_progress;
-
 /** Class used to report ALTER TABLE progress via performance_schema.
 The only user of this class is the ALTER TABLE code and it calls the methods
 in the following order
@@ -62,9 +60,6 @@ if any new indexes are being added, for each one:
   being_phase_log_index()
     multiple times:
       inc() // once per log-block applied
-begin_phase_flush()
-    multiple times:
-      inc() // once per page flushed
 begin_phase_log_table()
     multiple times:
       inc() // once per log-block applied
@@ -86,7 +81,6 @@ public:
 		m_n_pk_recs(0),
 		m_n_pk_pages(0),
 		m_n_recs_processed(0),
-		m_n_flush_pages(0),
 		m_cur_phase(NOT_STARTED)
 	{
 	}
@@ -134,13 +128,6 @@ public:
 	void
 	begin_phase_insert();
 
-	/** Flag the beginning of the flush phase.
-	@param[in]	n_flush_pages	this many pages are going to be
-	flushed */
-	void
-	begin_phase_flush(
-		ulint	n_flush_pages);
-
 	/** Flag the beginning of the log index phase. */
 	void
 	begin_phase_log_index();
@@ -166,7 +153,6 @@ private:
 		const PSI_stage_info*	new_stage);
 
 	/** Performance schema accounting object. */
-	/* TODO: MySQL 5.7 PSI */
 	PSI_stage_progress*	m_progress;
 
 	/** Old table PK. Used for calculating the estimate. */
@@ -195,16 +181,12 @@ private:
 	recs-per-page records. */
 	ulint			m_n_recs_processed;
 
-	/** Number of pages to flush. */
-	ulint			m_n_flush_pages;
-
 	/** Current phase. */
 	enum {
 		NOT_STARTED = 0,
 		READ_PK = 1,
 		SORT = 2,
 		INSERT = 3,
-		FLUSH = 4,
 		/* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h
 		LOG_INDEX = 5,
 		LOG_TABLE = 6, */
@@ -222,13 +204,12 @@ ut_stage_alter_t::~ut_stage_alter_t()
 		return;
 	}
 
-	/* TODO: MySQL 5.7 PSI: Set completed = estimated before we quit.
+	/* Set completed = estimated before we quit. */
 	mysql_stage_set_work_completed(
 		m_progress,
 		mysql_stage_get_work_estimated(m_progress));
 
 	mysql_end_stage();
-	*/
 }
 
 /** Flag an ALTER TABLE start (read primary key phase).
@@ -243,12 +224,10 @@ ut_stage_alter_t::begin_phase_read_pk(
 
 	m_cur_phase = READ_PK;
 
-	/* TODO: MySQL 5.7 PSI
 	m_progress = mysql_set_stage(
 		srv_stage_alter_table_read_pk_internal_sort.m_key);
 
 	mysql_stage_set_work_completed(m_progress, 0);
-	*/
 	reestimate();
 }
 
@@ -268,7 +247,7 @@ ut_stage_alter_t::n_pk_recs_inc()
 current phase. */
 inline
 void
-ut_stage_alter_t::inc(ulint)
+ut_stage_alter_t::inc(ulint inc_val)
 {
 	if (m_progress == NULL) {
 		return;
@@ -282,14 +261,12 @@ ut_stage_alter_t::inc(ulint)
 		ut_error;
 	case READ_PK:
 		m_n_pk_pages++;
-#if 0 /* TODO: MySQL 5.7 PSI */
 		ut_ad(inc_val == 1);
 		/* Overall the read pk phase will read all the pages from the
 		PK and will do work, proportional to the number of added
 		indexes, thus when this is called once per read page we
 		increment with 1 + m_n_sort_indexes */
 		inc_val = 1 + m_n_sort_indexes;
-#endif
 		break;
 	case SORT:
 		multi_factor = m_sort_multi_factor;
@@ -303,13 +280,15 @@ ut_stage_alter_t::inc(ulint)
 		(double) N records per page, then the work_completed
 	        should be incremented on the inc() calls round(k*N),
 		for k=1,2,3... */
-		const double	every_nth = m_n_recs_per_page * multi_factor;
+		const double	every_nth = m_n_recs_per_page *
+			static_cast<double>(multi_factor);
 
 		const ulint	k = static_cast<ulint>(
-			round(m_n_recs_processed / every_nth));
+			round(static_cast<double>(m_n_recs_processed) /
+			      every_nth));
 
 		const ulint	nth = static_cast<ulint>(
-			round(k * every_nth));
+			round(static_cast<double>(k) * every_nth));
 
 		should_proceed = m_n_recs_processed == nth;
 
@@ -317,8 +296,6 @@ ut_stage_alter_t::inc(ulint)
 
 		break;
 	}
-	case FLUSH:
-		break;
 	/* JAN: TODO: MySQL 5.7
 	case LOG_INDEX:
 		break;
@@ -332,9 +309,7 @@ ut_stage_alter_t::inc(ulint)
 	}
 
 	if (should_proceed) {
-		/* TODO: MySQL 5.7 PSI
 		mysql_stage_inc_work_completed(m_progress, inc_val);
-		*/
 		reestimate();
 	}
 }
@@ -355,7 +330,8 @@ ut_stage_alter_t::end_phase_read_pk()
 		m_n_recs_per_page = 1.0;
 	} else {
 		m_n_recs_per_page = std::max(
-			static_cast<double>(m_n_pk_recs) / m_n_pk_pages,
+			static_cast<double>(m_n_pk_recs)
+			/ static_cast<double>(m_n_pk_pages),
 			1.0);
 	}
 }
@@ -387,21 +363,6 @@ ut_stage_alter_t::begin_phase_insert()
 	change_phase(&srv_stage_alter_table_insert);
 }
 
-/** Flag the beginning of the flush phase.
-@param[in]	n_flush_pages	this many pages are going to be
-flushed */
-inline
-void
-ut_stage_alter_t::begin_phase_flush(
-	ulint	n_flush_pages)
-{
-	m_n_flush_pages = n_flush_pages;
-
-	reestimate();
-
-	change_phase(&srv_stage_alter_table_flush);
-}
-
 /** Flag the beginning of the log index phase. */
 inline
 void
@@ -438,12 +399,10 @@ ut_stage_alter_t::reestimate()
 	/* During the log table phase we calculate the estimate as
 	work done so far + log size remaining. */
 	if (m_cur_phase == LOG_INNODB_TABLE) {
-		/* TODO: MySQL 5.7 PSI
 		mysql_stage_set_work_estimated(
 			m_progress,
 			mysql_stage_get_work_completed(m_progress)
 			+ row_log_estimate_work(m_pk));
-		*/
 		return;
 	}
 
@@ -458,29 +417,19 @@ ut_stage_alter_t::reestimate()
 		? m_n_pk_pages
 		: m_pk->stat_n_leaf_pages;
 
-	/* If flush phase has not started yet and we do not know how
-	many pages are to be flushed, then use a wild guess - the
-	number of pages in the PK / 2. */
-	if (m_n_flush_pages == 0) {
-		m_n_flush_pages = n_pk_pages / 2;
-	}
-
 	ulonglong	estimate __attribute__((unused))
 		= n_pk_pages
 		* (1 /* read PK */
 		   + m_n_sort_indexes /* row_merge_buf_sort() inside the
 				      read PK per created index */
 		   + m_n_sort_indexes * 2 /* sort & insert per created index */)
-		+ m_n_flush_pages
 		+ row_log_estimate_work(m_pk);
 
 	/* Prevent estimate < completed */
-	/* TODO: MySQL 5.7 PSI
 	estimate = std::max(estimate,
 			    mysql_stage_get_work_completed(m_progress));
 
 	mysql_stage_set_work_estimated(m_progress, estimate);
-	*/
 }
 
 /** Change the current phase.
@@ -500,8 +449,6 @@ ut_stage_alter_t::change_phase(
 		m_cur_phase = SORT;
 	} else if (new_stage == &srv_stage_alter_table_insert) {
 		m_cur_phase = INSERT;
-	} else if (new_stage == &srv_stage_alter_table_flush) {
-		m_cur_phase = FLUSH;
 	/* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */
 	} else if (new_stage == &srv_stage_alter_table_log_index) {
 		m_cur_phase = LOG_INNODB_INDEX;
@@ -513,7 +460,6 @@ ut_stage_alter_t::change_phase(
 		ut_error;
 	}
 
-	/* TODO: MySQL 5.7 PSI
 	const ulonglong	c = mysql_stage_get_work_completed(m_progress);
 	const ulonglong	e = mysql_stage_get_work_estimated(m_progress);
 
@@ -521,7 +467,6 @@ ut_stage_alter_t::change_phase(
 
 	mysql_stage_set_work_completed(m_progress, c);
 	mysql_stage_set_work_estimated(m_progress, e);
-	*/
 }
 #else /* HAVE_PSI_STAGE_INTERFACE */
 
@@ -542,8 +487,6 @@ public:
 
 	void begin_phase_insert() {}
 
-	void begin_phase_flush(ulint) {}
-
 	void begin_phase_log_index() {}
 
 	void begin_phase_log_table() {}
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index 7031fc47f6a..4ae4739ecfc 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -117,7 +117,7 @@ when m is a power of two.  In other words, rounds n up to m * k.
 @return n rounded up to the smallest possible integer multiple of m */
 #define UT_CALC_ALIGN(n, m) ((n + m - 1) & ~(m - 1))
 template <typename T> inline T ut_calc_align(T n, T m)
-{ return UT_CALC_ALIGN(n, m); }
+{ return static_cast<T>(UT_CALC_ALIGN(n, m)); }
 
 /*************************************************************//**
 Calculates fast the 2-logarithm of a number, rounded upward to an
@@ -136,14 +136,6 @@ ulint
 ut_2_exp(
 /*=====*/
 	ulint	n);	/*!< in: number */
-/*************************************************************//**
-Calculates fast the number rounded up to the nearest power of 2.
-@return first power of 2 which is >= n */
-ulint
-ut_2_power_up(
-/*==========*/
-	ulint	n)	/*!< in: number != 0 */
-	MY_ATTRIBUTE((const));
 
 /**********************************************************//**
 Returns the number of milliseconds since some epoch.  The
@@ -164,7 +156,7 @@ store the given number of bits.
 /** Determines if a number is zero or a power of two.
 @param[in]	n	number
 @return nonzero if n is zero or a power of two; zero otherwise */
-#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1)))
+#define ut_is_2pow(n) (!((n) & ((n) - 1)))
 
 /** Functor that compares two C strings. Can be used as a comparator for
 e.g. std::map that uses char* as keys. */
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
index d9cc7aec9c9..26838c95443 100644
--- a/storage/innobase/include/ut0wqueue.h
+++ b/storage/innobase/include/ut0wqueue.h
@@ -48,9 +48,6 @@ struct ib_wqueue_t
 	ib_list_t*	items;
 	/** ib_list_len(*items) */
 	size_t		length;
-	/** event we use to signal additions to list;
-	os_event_set() and os_event_reset() are protected by the mutex */
-	os_event_t	event;
 };
 
 /****************************************************************//**
@@ -81,23 +78,6 @@ ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap,
 @return whether the queue is empty */
 bool ib_wqueue_is_empty(ib_wqueue_t* wq);
 
-/****************************************************************//**
-Wait for a work item to appear in the queue.
-@return work item */
-void*
-ib_wqueue_wait(
-/*===========*/
-	ib_wqueue_t*	wq);		/*!< in: work queue */
-
-/********************************************************************
-Wait for a work item to appear in the queue for specified time. */
-void*
-ib_wqueue_timedwait(
-/*================*/
-					/* out: work item or NULL on timeout*/
-	ib_wqueue_t*	wq,		/* in: work queue */
-	ulint		wait_in_usecs); /* in: wait time in micro seconds */
-
 /********************************************************************
 Return first item on work queue or NULL if queue is empty
 @return work item or NULL */
diff --git a/storage/innobase/innodb.cmake b/storage/innobase/innodb.cmake
index dede8e8d3cd..65d26aa6799 100644
--- a/storage/innobase/innodb.cmake
+++ b/storage/innobase/innodb.cmake
@@ -83,31 +83,6 @@ IF(INNODB_COMPILER_HINTS)
 ENDIF()
 ADD_FEATURE_INFO(INNODB_COMPILER_HINTS INNODB_COMPILER_HINTS "InnoDB compiled with compiler hints")
 
-SET(MUTEXTYPE "event" CACHE STRING "Mutex type: event, sys or futex")
-
-IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
-# After: WL#5825 Using C++ Standard Library with MySQL code
-#       we no longer use -fno-exceptions
-#	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
-
-# Add -Wconversion if compiling with GCC
-## As of Mar 15 2011 this flag causes 3573+ warnings. If you are reading this
-## please fix them and enable the following code:
-#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion")
-
-  IF (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64" OR
-      CMAKE_SYSTEM_PROCESSOR MATCHES "i386" AND
-      CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.6)
-    INCLUDE(CheckCXXCompilerFlag)
-    CHECK_CXX_COMPILER_FLAG("-fno-builtin-memcmp" HAVE_NO_BUILTIN_MEMCMP)
-    IF (HAVE_NO_BUILTIN_MEMCMP)
-      # Work around http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
-      SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0cmp.cc
-	PROPERTIES COMPILE_FLAGS -fno-builtin-memcmp)
-    ENDIF()
-  ENDIF()
-ENDIF()
-
 # Enable InnoDB's UNIV_DEBUG in debug builds
 SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG")
 
@@ -147,61 +122,28 @@ IF(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE)
  ADD_DEFINITIONS(-DHAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE=1)
 ENDIF()
 
+IF (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR
+    CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion -Wno-sign-conversion")
+  SET_SOURCE_FILES_PROPERTIES(fts/fts0pars.cc
+    PROPERTIES COMPILE_FLAGS -Wno-conversion)
+ENDIF()
+
 IF(NOT MSVC)
   # Work around MDEV-18417, MDEV-18656, MDEV-18417
   IF(WITH_ASAN AND CMAKE_COMPILER_IS_GNUCC AND
      CMAKE_C_COMPILER_VERSION VERSION_LESS "6.0.0")
     SET_SOURCE_FILES_PROPERTIES(trx/trx0rec.cc PROPERTIES COMPILE_FLAGS -O1)
   ENDIF()
-
-# Only use futexes on Linux if GCC atomics are available
-IF(NOT MSVC AND NOT CMAKE_CROSSCOMPILING)
-  CHECK_C_SOURCE_RUNS(
-  "
-  #include <stdio.h>
-  #include <unistd.h>
-  #include <errno.h>
-  #include <assert.h>
-  #include <linux/futex.h>
-  #include <unistd.h>
-  #include <sys/syscall.h>
-
-   int futex_wait(int* futex, int v) {
-	return(syscall(SYS_futex, futex, FUTEX_WAIT_PRIVATE, v, NULL, NULL, 0));
-   }
-
-   int futex_signal(int* futex) {
-	return(syscall(SYS_futex, futex, FUTEX_WAKE, 1, NULL, NULL, 0));
-   }
-
-  int main() {
-	int	ret;
-	int	m = 1;
-
-	/* It is setup to fail and return EWOULDBLOCK. */
-	ret = futex_wait(&m, 0);
-	assert(ret == -1 && errno == EWOULDBLOCK);
-	/* Shouldn't wake up any threads. */
-	assert(futex_signal(&m) == 0);
-
-	return(0);
-  }"
-  HAVE_IB_LINUX_FUTEX)
-ENDIF()
-
-IF(HAVE_IB_LINUX_FUTEX)
-  ADD_DEFINITIONS(-DHAVE_IB_LINUX_FUTEX=1)
 ENDIF()
 
-ENDIF(NOT MSVC)
-
 CHECK_FUNCTION_EXISTS(vasprintf  HAVE_VASPRINTF)
 
 SET(MUTEXTYPE "event" CACHE STRING "Mutex type: event, sys or futex")
 
 IF(MUTEXTYPE MATCHES "event")
   ADD_DEFINITIONS(-DMUTEX_EVENT)
-ELSEIF(MUTEXTYPE MATCHES "futex" AND DEFINED HAVE_IB_LINUX_FUTEX)
+ELSEIF(MUTEXTYPE MATCHES "futex" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
   ADD_DEFINITIONS(-DMUTEX_FUTEX)
 ELSE()
    ADD_DEFINITIONS(-DMUTEX_SYS)
@@ -222,16 +164,6 @@ IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
     PROPERTIES COMPILE_FLAGS -xO3)
 ENDIF()
 
-# Avoid generating Hardware Capabilities due to crc32 instructions
-IF(CMAKE_SYSTEM_NAME MATCHES "SunOS" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i386")
-  MY_CHECK_CXX_COMPILER_FLAG("-Wa,-nH")
-  IF(have_CXX__Wa__nH)
-    ADD_COMPILE_FLAGS(
-      ut/ut0crc32.cc
-      COMPILE_FLAGS "-Wa,-nH"
-    )
-  ENDIF()
-ENDIF()
 
 IF(MSVC)
   # Avoid "unreferenced label" warning in generated file
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 9949ebded8e..07fff9b21ad 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -462,28 +462,24 @@ void lock_sys_t::create(ulint n_cells)
 
 	mutex_create(LATCH_ID_LOCK_SYS_WAIT, &wait_mutex);
 
-	timeout_event = os_event_create(0);
 
-	rec_hash = hash_create(n_cells);
-	prdt_hash = hash_create(n_cells);
-	prdt_page_hash = hash_create(n_cells);
+	rec_hash.create(n_cells);
+	prdt_hash.create(n_cells);
+	prdt_page_hash.create(n_cells);
 
 	if (!srv_read_only_mode) {
 		lock_latest_err_file = os_file_create_tmpfile();
 		ut_a(lock_latest_err_file);
 	}
+	timeout_timer_active = false;
 }
 
 /** Calculates the fold value of a lock: used in migrating the hash table.
 @param[in]	lock	record lock object
 @return	folded value */
-static
-ulint
-lock_rec_lock_fold(
-	const lock_t*	lock)
+static ulint lock_rec_lock_fold(const lock_t *lock)
 {
-	return(lock_rec_fold(lock->un_member.rec_lock.space,
-			     lock->un_member.rec_lock.page_no));
+  return lock->un_member.rec_lock.page_id.fold();
 }
 
 
@@ -498,49 +494,23 @@ void lock_sys_t::resize(ulint n_cells)
 
 	mutex_enter(&mutex);
 
-	hash_table_t* old_hash = rec_hash;
-	rec_hash = hash_create(n_cells);
-	HASH_MIGRATE(old_hash, rec_hash, lock_t, hash,
+	hash_table_t old_hash(rec_hash);
+	rec_hash.create(n_cells);
+	HASH_MIGRATE(&old_hash, &rec_hash, lock_t, hash,
 		     lock_rec_lock_fold);
-	hash_table_free(old_hash);
+	old_hash.free();
 
 	old_hash = prdt_hash;
-	prdt_hash = hash_create(n_cells);
-	HASH_MIGRATE(old_hash, prdt_hash, lock_t, hash,
+	prdt_hash.create(n_cells);
+	HASH_MIGRATE(&old_hash, &prdt_hash, lock_t, hash,
 		     lock_rec_lock_fold);
-	hash_table_free(old_hash);
+	old_hash.free();
 
 	old_hash = prdt_page_hash;
-	prdt_page_hash = hash_create(n_cells);
-	HASH_MIGRATE(old_hash, prdt_page_hash, lock_t, hash,
+	prdt_page_hash.create(n_cells);
+	HASH_MIGRATE(&old_hash, &prdt_page_hash, lock_t, hash,
 		     lock_rec_lock_fold);
-	hash_table_free(old_hash);
-
-	/* need to update block->lock_hash_val */
-	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-		buf_page_t*	bpage;
-		bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
-
-		while (bpage != NULL) {
-			if (buf_page_get_state(bpage)
-			    == BUF_BLOCK_FILE_PAGE) {
-				buf_block_t*	block;
-				block = reinterpret_cast<buf_block_t*>(
-					bpage);
-
-				block->lock_hash_val
-					= lock_rec_hash(
-						bpage->id.space(),
-						bpage->id.page_no());
-			}
-			bpage = UT_LIST_GET_NEXT(LRU, bpage);
-		}
-		buf_pool_mutex_exit(buf_pool);
-	}
-
+	old_hash.free();
 	mutex_exit(&mutex);
 }
 
@@ -553,15 +523,13 @@ void lock_sys_t::close()
 	if (!m_initialised) return;
 
 	if (lock_latest_err_file != NULL) {
-		fclose(lock_latest_err_file);
+		my_fclose(lock_latest_err_file, MYF(MY_WME));
 		lock_latest_err_file = NULL;
 	}
 
-	hash_table_free(rec_hash);
-	hash_table_free(prdt_hash);
-	hash_table_free(prdt_page_hash);
-
-	os_event_destroy(timeout_event);
+	rec_hash.free();
+	prdt_hash.free();
+	prdt_page_hash.free();
 
 	mutex_destroy(&mutex);
 	mutex_destroy(&wait_mutex);
@@ -724,7 +692,7 @@ lock_rec_has_to_wait(
 	bool		for_locking,
 				/*!< in is called locking or releasing */
 	const trx_t*	trx,	/*!< in: trx of new lock */
-	ulint		type_mode,/*!< in: precise mode of the new lock
+	unsigned	type_mode,/*!< in: precise mode of the new lock
 				to set: LOCK_S or LOCK_X, possibly
 				ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
 				LOCK_INSERT_INTENTION */
@@ -906,26 +874,6 @@ lock_rec_find_set_bit(
 }
 
 /*********************************************************************//**
-Determines if there are explicit record locks on a page.
-@return an explicit record lock on the page, or NULL if there are none */
-lock_t*
-lock_rec_expl_exist_on_page(
-/*========================*/
-	ulint	space,	/*!< in: space id */
-	ulint	page_no)/*!< in: page number */
-{
-	lock_t*	lock;
-
-	lock_mutex_enter();
-	/* Only used in ibuf pages, so rec_hash is good enough */
-	lock = lock_rec_get_first_on_page_addr(lock_sys.rec_hash,
-					       space, page_no);
-	lock_mutex_exit();
-
-	return(lock);
-}
-
-/*********************************************************************//**
 Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock
 pointer in the transaction! This function is used in lock object creation
 and resetting. */
@@ -946,7 +894,7 @@ lock_rec_bitmap_reset(
 
 	ut_ad((lock_rec_get_n_bits(lock) % 8) == 0);
 
-	memset(&lock[1], 0, n_bytes);
+	memset(reinterpret_cast<void*>(&lock[1]), 0, n_bytes);
 }
 
 /*********************************************************************//**
@@ -978,35 +926,21 @@ lock_rec_get_prev(
 	ulint		heap_no)/*!< in: heap number of the record */
 {
 	lock_t*		lock;
-	ulint		space;
-	ulint		page_no;
 	lock_t*		found_lock	= NULL;
-	hash_table_t*	hash;
 
 	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
 
-	space = in_lock->un_member.rec_lock.space;
-	page_no = in_lock->un_member.rec_lock.page_no;
-
-	hash = lock_hash_get(in_lock->type_mode);
-
-	for (lock = lock_rec_get_first_on_page_addr(hash, space, page_no);
-	     /* No op */;
+	for (lock = lock_sys.get_first(*lock_hash_get(in_lock->type_mode),
+				       in_lock->un_member.rec_lock.page_id);
+	     lock != in_lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
-
-		ut_ad(lock);
-
-		if (lock == in_lock) {
-
-			return(found_lock);
-		}
-
 		if (lock_rec_get_nth_bit(lock, heap_no)) {
-
 			found_lock = lock;
 		}
 	}
+
+	return found_lock;
 }
 
 /*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
@@ -1036,7 +970,7 @@ lock_rec_has_expl(
 	      || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
 	ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
 
-	for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 
@@ -1089,8 +1023,8 @@ lock_rec_other_has_expl_req(
 		return(NULL);
 	}
 
-	for (lock_t* lock = lock_rec_get_first(lock_sys.rec_hash,
-						     block, heap_no);
+	for (lock_t* lock = lock_rec_get_first(&lock_sys.rec_hash,
+					       block, heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 
@@ -1146,7 +1080,7 @@ static
 lock_t*
 lock_rec_other_has_conflicting(
 /*===========================*/
-	ulint			mode,	/*!< in: LOCK_S or LOCK_X,
+	unsigned		mode,	/*!< in: LOCK_S or LOCK_X,
 					possibly ORed to LOCK_GAP or
 					LOC_REC_NOT_GAP,
 					LOCK_INSERT_INTENTION */
@@ -1161,7 +1095,7 @@ lock_rec_other_has_conflicting(
 
 	bool	is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
 
-	for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 
@@ -1323,8 +1257,7 @@ static void check_trx_state(const trx_t *trx)
 without checking for deadlocks or conflicts.
 @param[in]	type_mode	lock mode and wait flag; type will be replaced
 				with LOCK_REC
-@param[in]	space		tablespace id
-@param[in]	page_no		index page number
+@param[in]	page_id		page identifier (tablespace id and page number)
 @param[in]	page		R-tree index page, or NULL
 @param[in]	heap_no		record heap number in the index page
 @param[in]	index		the index tree
@@ -1337,9 +1270,8 @@ lock_rec_create_low(
 	lock_t*		c_lock,	/*!< conflicting lock */
 	que_thr_t*	thr,	/*!< thread owning trx */
 #endif
-	ulint		type_mode,
-	ulint		space,
-	ulint		page_no,
+	unsigned	type_mode,
+	const page_id_t	page_id,
 	const page_t*	page,
 	ulint		heap_no,
 	dict_index_t*	index,
@@ -1406,10 +1338,9 @@ lock_rec_create_low(
 	}
 
 	lock->trx = trx;
-	lock->type_mode = (type_mode & ~LOCK_TYPE_MASK) | LOCK_REC;
+	lock->type_mode = (type_mode & unsigned(~LOCK_TYPE_MASK)) | LOCK_REC;
 	lock->index = index;
-	lock->un_member.rec_lock.space = uint32_t(space);
-	lock->un_member.rec_lock.page_no = uint32_t(page_no);
+	lock->un_member.rec_lock.page_id = page_id;
 
 	if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) {
 		lock->un_member.rec_lock.n_bits = uint32_t(n_bytes * 8);
@@ -1486,11 +1417,11 @@ lock_rec_create_low(
 	    && innodb_lock_schedule_algorithm
 	    == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
 	    && !thd_is_replication_slave_thread(trx->mysql_thd)) {
-		HASH_PREPEND(lock_t, hash, lock_sys.rec_hash,
-			     lock_rec_fold(space, page_no), lock);
+		HASH_PREPEND(lock_t, hash, &lock_sys.rec_hash,
+			     page_id.fold(), lock);
 	} else {
 		HASH_INSERT(lock_t, hash, lock_hash_get(type_mode),
-			    lock_rec_fold(space, page_no), lock);
+			    page_id.fold(), lock);
 	}
 
 	if (!holds_trx_mutex) {
@@ -1544,21 +1475,15 @@ static
 dberr_t
 lock_rec_insert_by_trx_age(
 	lock_t	*in_lock) /*!< in: lock to be insert */{
-	ulint				space;
-	ulint				page_no;
-	ulint				rec_fold;
 	lock_t*				node;
 	lock_t*				next;
 	hash_table_t*		hash;
 	hash_cell_t*		cell;
 
 	ut_ad(!in_lock->trx->is_wsrep());
-	space = in_lock->un_member.rec_lock.space;
-	page_no = in_lock->un_member.rec_lock.page_no;
-	rec_fold = lock_rec_fold(space, page_no);
+	const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
 	hash = lock_hash_get(in_lock->type_mode);
-	cell = hash_get_nth_cell(hash,
-				 hash_calc_hash(rec_fold, hash));
+	cell = &hash->array[hash->calc_hash(page_id.fold())];
 
 	node = (lock_t *) cell->node;
 	// If in_lock is not a wait lock, we insert it to the head of the list.
@@ -1600,9 +1525,6 @@ bool
 lock_queue_validate(
 	const lock_t	*in_lock) /*!< in: lock whose hash list is to be validated */
 {
-	ulint				space;
-	ulint				page_no;
-	ulint				rec_fold;
 	hash_table_t*		hash;
 	hash_cell_t*		cell;
 	lock_t*				next;
@@ -1612,12 +1534,9 @@ lock_queue_validate(
 		return true;
 	}
 
-	space = in_lock->un_member.rec_lock.space;
-	page_no = in_lock->un_member.rec_lock.page_no;
-	rec_fold = lock_rec_fold(space, page_no);
+	const page_id_t	page_id(in_lock->un_member.rec_lock.page_id);
 	hash = lock_hash_get(in_lock->type_mode);
-	cell = hash_get_nth_cell(hash,
-			hash_calc_hash(rec_fold, hash));
+	cell = &hash->array[hash->calc_hash(page_id.fold())];
 	next = (lock_t *) cell->node;
 	while (next != NULL) {
 		// If this is a granted lock, check that there's no wait lock before it.
@@ -1647,8 +1566,7 @@ lock_rec_insert_to_head(
 	}
 
 	hash = lock_hash_get(in_lock->type_mode);
-	cell = hash_get_nth_cell(hash,
-			hash_calc_hash(rec_fold, hash));
+	cell = &hash->array[hash->calc_hash(rec_fold)];
 	node = (lock_t *) cell->node;
 	if (node != in_lock) {
 		cell->node = in_lock;
@@ -1679,7 +1597,7 @@ lock_rec_enqueue_waiting(
 #ifdef WITH_WSREP
 	lock_t*			c_lock,	/*!< conflicting lock */
 #endif
-	ulint			type_mode,
+	unsigned		type_mode,
 	const buf_block_t*	block,
 	ulint			heap_no,
 	dict_index_t*		index,
@@ -1764,7 +1682,7 @@ lock_rec_enqueue_waiting(
 	    == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
 	    && !prdt
 	    && !thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
-		HASH_DELETE(lock_t, hash, lock_sys.rec_hash,
+		HASH_DELETE(lock_t, hash, &lock_sys.rec_hash,
 			    lock_rec_lock_fold(lock), lock);
 		dberr_t res = lock_rec_insert_by_trx_age(lock);
 		if (res != DB_SUCCESS) {
@@ -1776,6 +1694,36 @@ lock_rec_enqueue_waiting(
 }
 
 /*********************************************************************//**
+Looks for a record lock struct of a suitable type, set by the same trx on the
+same page. This can be used to save space when a new record lock should be set
+on a page: no new struct is needed if a suitable old one is found.
+@return lock or NULL */
+static inline
+lock_t*
+lock_rec_find_similar_on_page(
+	ulint           type_mode,      /*!< in: lock type_mode field */
+	ulint           heap_no,        /*!< in: heap number of the record */
+	lock_t*         lock,           /*!< in: lock_sys.get_first() */
+	const trx_t*    trx)            /*!< in: transaction */
+{
+	ut_ad(lock_mutex_own());
+
+	for (/* No op */;
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
+
+		if (lock->trx == trx
+		    && lock->type_mode == type_mode
+		    && lock_rec_get_n_bits(lock) > heap_no) {
+
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
 Adds a record lock request in the record queue. The request is normally
 added as the last in the queue, but if there are no waiting lock requests
 on the record, and the request to be added is not a waiting request, we
@@ -1787,7 +1735,7 @@ static
 void
 lock_rec_add_to_queue(
 /*==================*/
-	ulint			type_mode,/*!< in: lock mode, wait, gap
+	unsigned		type_mode,/*!< in: lock mode, wait, gap
 					etc. flags; type is ignored
 					and replaced by LOCK_REC */
 	const buf_block_t*	block,	/*!< in: buffer block containing
@@ -1854,11 +1802,11 @@ lock_rec_add_to_queue(
 
 	lock_t*		lock;
 	lock_t*		first_lock;
-	hash_table_t*	hash = lock_hash_get(type_mode);
 
 	/* Look for a waiting lock request on the same record or on a gap */
 
-	for (first_lock = lock = lock_rec_get_first_on_page(hash, block);
+	for (first_lock = lock = lock_sys.get_first(*lock_hash_get(type_mode),
+						    block->page.id());
 	     lock != NULL;
 	     lock = lock_rec_get_next_on_page(lock)) {
 
@@ -1908,7 +1856,7 @@ lock_rec_lock(
 					if no wait is necessary: we
 					assume that the caller will
 					set an implicit lock */
-	ulint			mode,	/*!< in: lock mode: LOCK_X or
+	unsigned		mode,	/*!< in: lock mode: LOCK_X or
 					LOCK_S possibly ORed to either
 					LOCK_GAP or LOCK_REC_NOT_GAP */
 	const buf_block_t*	block,	/*!< in: buffer block containing
@@ -1935,7 +1883,9 @@ lock_rec_lock(
   ut_ad((LOCK_MODE_MASK & mode) != LOCK_X ||
          lock_table_has(trx, index->table, LOCK_IX));
 
-  if (lock_t *lock= lock_rec_get_first_on_page(lock_sys.rec_hash, block))
+  if (lock_table_has(trx, index->table,
+                     static_cast<lock_mode>(LOCK_MODE_MASK & mode)));
+  else if (lock_t *lock= lock_sys.get_first(block->page.id()))
   {
     trx_mutex_enter(trx);
     if (lock_rec_get_next_on_page(lock) ||
@@ -2016,28 +1966,22 @@ lock_rec_has_to_wait_in_queue(
 	const lock_t*	wait_lock)	/*!< in: waiting record lock */
 {
 	const lock_t*	lock;
-	ulint		space;
-	ulint		page_no;
 	ulint		heap_no;
 	ulint		bit_mask;
 	ulint		bit_offset;
-	hash_table_t*	hash;
 
 	ut_ad(wait_lock);
 	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_wait(wait_lock));
 	ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
 
-	space = wait_lock->un_member.rec_lock.space;
-	page_no = wait_lock->un_member.rec_lock.page_no;
 	heap_no = lock_rec_find_set_bit(wait_lock);
 
 	bit_offset = heap_no / 8;
 	bit_mask = static_cast<ulint>(1) << (heap_no % 8);
 
-	hash = lock_hash_get(wait_lock->type_mode);
-
-	for (lock = lock_rec_get_first_on_page_addr(hash, space, page_no);
+	for (lock = lock_sys.get_first(*lock_hash_get(wait_lock->type_mode),
+				       wait_lock->un_member.rec_lock.page_id);
 	     lock != wait_lock;
 	     lock = lock_rec_get_next_on_page_const(lock)) {
 		const byte*	p = (const byte*) &lock[1];
@@ -2135,27 +2079,22 @@ lock_rec_cancel(
 	trx_mutex_exit(lock->trx);
 }
 
-static
-void
-lock_grant_and_move_on_page(ulint rec_fold, ulint space, ulint page_no)
+static void lock_grant_and_move_on_page(ulint rec_fold, const page_id_t id)
 {
 	lock_t*		lock;
 	lock_t*		previous = static_cast<lock_t*>(
-		hash_get_nth_cell(lock_sys.rec_hash,
-				  hash_calc_hash(rec_fold, lock_sys.rec_hash))
-		->node);
+		lock_sys.rec_hash.array[lock_sys.rec_hash.calc_hash(rec_fold)].
+		node);
 	if (previous == NULL) {
 		return;
 	}
-	if (previous->un_member.rec_lock.space == space &&
-		previous->un_member.rec_lock.page_no == page_no) {
+	if (previous->un_member.rec_lock.page_id == id) {
 		lock = previous;
 	}
 	else {
 		while (previous->hash &&
-				(previous->hash->un_member.rec_lock.space != space ||
-				previous->hash->un_member.rec_lock.page_no != page_no)) {
-					previous = previous->hash;
+		       (previous->hash->un_member.rec_lock.page_id != id)) {
+			previous = previous->hash;
 		}
 		lock = previous->hash;
 	}
@@ -2167,8 +2106,7 @@ lock_grant_and_move_on_page(ulint rec_fold, ulint space, ulint page_no)
 		/* If the lock is a wait lock on this page, and it does not need to wait. */
 		ut_ad(!lock->trx->is_wsrep());
 		if (lock_get_wait(lock)
-		    && lock->un_member.rec_lock.space == space
-		    && lock->un_member.rec_lock.page_no == page_no
+		    && lock->un_member.rec_lock.page_id == id
 		    && !lock_rec_has_to_wait_in_queue(lock)) {
 			lock_grant(lock);
 
@@ -2195,22 +2133,19 @@ to a lock. NOTE: all record locks contained in in_lock are removed.
 @param[in,out]	in_lock		record lock */
 static void lock_rec_dequeue_from_page(lock_t* in_lock)
 {
-	ulint		space;
-	ulint		page_no;
 	hash_table_t*	lock_hash;
 
 	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
 	/* We may or may not be holding in_lock->trx->mutex here. */
 
-	space = in_lock->un_member.rec_lock.space;
-	page_no = in_lock->un_member.rec_lock.page_no;
+	const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
 
 	in_lock->index->table->n_rec_locks--;
 
 	lock_hash = lock_hash_get(in_lock->type_mode);
 
-	ulint rec_fold = lock_rec_fold(space, page_no);
+	const ulint rec_fold = page_id.fold();
 
 	HASH_DELETE(lock_t, hash, lock_hash, rec_fold, in_lock);
 	UT_LIST_REMOVE(in_lock->trx->lock.trx_locks, in_lock);
@@ -2220,14 +2155,13 @@ static void lock_rec_dequeue_from_page(lock_t* in_lock)
 
 	if (innodb_lock_schedule_algorithm
 	    == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS
-	    || lock_hash != lock_sys.rec_hash
+	    || lock_hash != &lock_sys.rec_hash
 	    || thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) {
 		/* Check if waiting locks in the queue can now be granted:
 		grant locks if there are no conflicting locks ahead. Stop at
 		the first X lock that is waiting or has been granted. */
 
-		for (lock_t* lock = lock_rec_get_first_on_page_addr(
-			     lock_hash, space, page_no);
+		for (lock_t* lock = lock_sys.get_first(*lock_hash, page_id);
 		     lock != NULL;
 		     lock = lock_rec_get_next_on_page(lock)) {
 
@@ -2242,7 +2176,7 @@ static void lock_rec_dequeue_from_page(lock_t* in_lock)
 			}
 		}
 	} else {
-		lock_grant_and_move_on_page(rec_fold, space, page_no);
+		lock_grant_and_move_on_page(rec_fold, page_id);
 	}
 }
 
@@ -2255,8 +2189,6 @@ lock_rec_discard(
 					record locks which are contained
 					in this lock object are removed */
 {
-	ulint		space;
-	ulint		page_no;
 	trx_lock_t*	trx_lock;
 
 	ut_ad(lock_mutex_own());
@@ -2264,13 +2196,10 @@ lock_rec_discard(
 
 	trx_lock = &in_lock->trx->lock;
 
-	space = in_lock->un_member.rec_lock.space;
-	page_no = in_lock->un_member.rec_lock.page_no;
-
 	in_lock->index->table->n_rec_locks--;
 
 	HASH_DELETE(lock_t, hash, lock_hash_get(in_lock->type_mode),
-			    lock_rec_fold(space, page_no), in_lock);
+		    in_lock->un_member.rec_lock.page_id.fold(), in_lock);
 
 	UT_LIST_REMOVE(trx_lock->trx_locks, in_lock);
 
@@ -2282,29 +2211,19 @@ lock_rec_discard(
 Removes record lock objects set on an index page which is discarded. This
 function does not move locks, or check for waiting locks, therefore the
 lock bitmaps must already be reset when this function is called. */
-static
-void
-lock_rec_free_all_from_discard_page_low(
-/*====================================*/
-	ulint		space,
-	ulint		page_no,
-	hash_table_t*	lock_hash)
+static void lock_rec_free_all_from_discard_page_low(const page_id_t id,
+                                                    hash_table_t *lock_hash)
 {
-	lock_t*	lock;
-	lock_t*	next_lock;
-
-	lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no);
-
-	while (lock != NULL) {
-		ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
-		ut_ad(!lock_get_wait(lock));
-
-		next_lock = lock_rec_get_next_on_page(lock);
+  lock_t *lock= lock_sys.get_first(*lock_hash, id);
 
-		lock_rec_discard(lock);
-
-		lock = next_lock;
-	}
+  while (lock)
+  {
+    ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+    ut_ad(!lock_get_wait(lock));
+    lock_t *next_lock= lock_rec_get_next_on_page(lock);
+    lock_rec_discard(lock);
+    lock= next_lock;
+  }
 }
 
 /*************************************************************//**
@@ -2316,20 +2235,10 @@ lock_rec_free_all_from_discard_page(
 /*================================*/
 	const buf_block_t*	block)	/*!< in: page to be discarded */
 {
-	ulint	space;
-	ulint	page_no;
-
-	ut_ad(lock_mutex_own());
-
-	space = block->page.id.space();
-	page_no = block->page.id.page_no();
-
-	lock_rec_free_all_from_discard_page_low(
-		space, page_no, lock_sys.rec_hash);
-	lock_rec_free_all_from_discard_page_low(
-		space, page_no, lock_sys.prdt_hash);
-	lock_rec_free_all_from_discard_page_low(
-		space, page_no, lock_sys.prdt_page_hash);
+  const page_id_t page_id(block->page.id());
+  lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.rec_hash);
+  lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.prdt_hash);
+  lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.prdt_page_hash);
 }
 
 /*============= RECORD LOCK MOVING AND INHERITING ===================*/
@@ -2374,12 +2283,12 @@ lock_rec_reset_and_release_wait(
 	ulint			heap_no)/*!< in: heap number of record */
 {
 	lock_rec_reset_and_release_wait_low(
-		lock_sys.rec_hash, block, heap_no);
+		&lock_sys.rec_hash, block, heap_no);
 
 	lock_rec_reset_and_release_wait_low(
-		lock_sys.prdt_hash, block, PAGE_HEAP_NO_INFIMUM);
+		&lock_sys.prdt_hash, block, PAGE_HEAP_NO_INFIMUM);
 	lock_rec_reset_and_release_wait_low(
-		lock_sys.prdt_page_hash, block, PAGE_HEAP_NO_INFIMUM);
+		&lock_sys.prdt_page_hash, block, PAGE_HEAP_NO_INFIMUM);
 }
 
 /*************************************************************//**
@@ -2406,25 +2315,22 @@ lock_rec_inherit_to_gap(
 
 	ut_ad(lock_mutex_own());
 
-	/* If srv_locks_unsafe_for_binlog is TRUE or session is using
-	READ COMMITTED isolation level, we do not want locks set
+	/* At READ UNCOMMITTED or READ COMMITTED isolation level,
+	we do not want locks set
 	by an UPDATE or a DELETE to be inherited as gap type locks. But we
 	DO want S-locks/X-locks(taken for replace) set by a consistency
 	constraint to be inherited also then. */
 
-	for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 
 		if (!lock_rec_get_insert_intention(lock)
-		    && !((srv_locks_unsafe_for_binlog
-			  || lock->trx->isolation_level
-			  <= TRX_ISO_READ_COMMITTED)
-			 && lock_get_mode(lock) ==
-			 (lock->trx->duplicates ? LOCK_S : LOCK_X))) {
+		    && (lock->trx->isolation_level > TRX_ISO_READ_COMMITTED
+			|| lock_get_mode(lock) !=
+			(lock->trx->duplicates ? LOCK_S : LOCK_X))) {
 			lock_rec_add_to_queue(
-				LOCK_REC | LOCK_GAP
-				| ulint(lock_get_mode(lock)),
+				LOCK_REC | LOCK_GAP | lock_get_mode(lock),
 				heir_block, heir_heap_no, lock->index,
 				lock->trx, FALSE);
 		}
@@ -2451,7 +2357,7 @@ lock_rec_inherit_to_gap_if_gap_lock(
 
 	lock_mutex_enter();
 
-	for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 
@@ -2460,8 +2366,7 @@ lock_rec_inherit_to_gap_if_gap_lock(
 			|| !lock_rec_get_rec_not_gap(lock))) {
 
 			lock_rec_add_to_queue(
-				LOCK_REC | LOCK_GAP
-				| ulint(lock_get_mode(lock)),
+				LOCK_REC | LOCK_GAP | lock_get_mode(lock),
 				block, heir_heap_no, lock->index,
 				lock->trx, FALSE);
 		}
@@ -2496,15 +2401,15 @@ lock_rec_move_low(
 	/* If the lock is predicate lock, it resides on INFIMUM record */
 	ut_ad(lock_rec_get_first(
 		lock_hash, receiver, receiver_heap_no) == NULL
-	      || lock_hash == lock_sys.prdt_hash
-	      || lock_hash == lock_sys.prdt_page_hash);
+	      || lock_hash == &lock_sys.prdt_hash
+	      || lock_hash == &lock_sys.prdt_page_hash);
 
 	for (lock = lock_rec_get_first(lock_hash,
 				       donator, donator_heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next(donator_heap_no, lock)) {
 
-		const ulint	type_mode = lock->type_mode;
+		const auto type_mode = lock->type_mode;
 
 		lock_rec_reset_nth_bit(lock, donator_heap_no);
 
@@ -2520,8 +2425,8 @@ lock_rec_move_low(
 			lock->index, lock->trx, FALSE);
 	}
 
-	ut_ad(lock_rec_get_first(lock_sys.rec_hash,
-				 donator, donator_heap_no) == NULL);
+	ut_ad(!lock_rec_get_first(&lock_sys.rec_hash,
+				  donator, donator_heap_no));
 }
 
 /** Move all the granted locks to the front of the given lock list.
@@ -2575,7 +2480,7 @@ lock_rec_move(
 	ulint			donator_heap_no)/*!< in: heap_no of the record
                                                 which gives the locks */
 {
-	lock_rec_move_low(lock_sys.rec_hash, receiver, donator,
+	lock_rec_move_low(&lock_sys.rec_hash, receiver, donator,
 			  receiver_heap_no, donator_heap_no);
 }
 
@@ -2600,7 +2505,7 @@ lock_move_reorganize_page(
 	lock_mutex_enter();
 
 	/* FIXME: This needs to deal with predicate lock too */
-	lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block);
+	lock = lock_sys.get_first(block->page.id());
 
 	if (lock == NULL) {
 		lock_mutex_exit();
@@ -2733,11 +2638,12 @@ lock_move_rec_list_end(
 	table to the end of the hash chain, and lock_rec_add_to_queue
 	does not reuse locks if there are waiters in the queue. */
 
-	for (lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); lock;
+	for (lock = lock_sys.get_first(block->page.id());
+	     lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
 		const rec_t*	rec1	= rec;
 		const rec_t*	rec2;
-		const ulint	type_mode = lock->type_mode;
+		const auto	type_mode = lock->type_mode;
 
 		if (comp) {
 			if (page_offset(rec1) == PAGE_NEW_INFIMUM) {
@@ -2848,11 +2754,12 @@ lock_move_rec_list_start(
 
 	lock_mutex_enter();
 
-	for (lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); lock;
+	for (lock = lock_sys.get_first(block->page.id());
+	     lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
 		const rec_t*	rec1;
 		const rec_t*	rec2;
-		const ulint	type_mode = lock->type_mode;
+		const auto	type_mode = lock->type_mode;
 
 		if (comp) {
 			rec1 = page_rec_get_next_low(
@@ -2960,12 +2867,13 @@ lock_rtr_move_rec_list(
 
 	lock_mutex_enter();
 
-	for (lock = lock_rec_get_first_on_page(lock_sys.rec_hash, block); lock;
+	for (lock = lock_sys.get_first(block->page.id());
+	     lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
 		ulint		moved = 0;
 		const rec_t*	rec1;
 		const rec_t*	rec2;
-		const ulint	type_mode = lock->type_mode;
+		const auto	type_mode = lock->type_mode;
 
 		/* Copy lock requests on user records to new page and
 		reset the lock bits on the old */
@@ -3072,13 +2980,11 @@ lock_update_merge_right(
 	waiting transactions */
 
 	lock_rec_reset_and_release_wait_low(
-		lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
+		&lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
 
 	/* there should exist no page lock on the left page,
 	otherwise, it will be blocked from merge */
-	ut_ad(!lock_rec_get_first_on_page_addr(lock_sys.prdt_page_hash,
-					       left_block->page.id.space(),
-					       left_block->page.id.page_no()));
+	ut_ad(!lock_sys.get_first_prdt_page(left_block->page.id()));
 
 	lock_rec_free_all_from_discard_page(left_block);
 
@@ -3186,7 +3092,7 @@ lock_update_merge_left(
 		releasing waiting transactions */
 
 		lock_rec_reset_and_release_wait_low(
-			lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
+			&lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
 	}
 
 	/* Move the locks from the supremum of right page to the supremum
@@ -3197,10 +3103,7 @@ lock_update_merge_left(
 
 	/* there should exist no page lock on the right page,
 	otherwise, it will be blocked from merge */
-	ut_ad(!lock_rec_get_first_on_page_addr(
-		      lock_sys.prdt_page_hash,
-		      right_block->page.id.space(),
-		      right_block->page.id.page_no()));
+	ut_ad(!lock_sys.get_first_prdt_page(right_block->page.id()));
 
 	lock_rec_free_all_from_discard_page(right_block);
 
@@ -3248,13 +3151,13 @@ lock_update_discard(
 	const page_t*	page = block->frame;
 	const rec_t*	rec;
 	ulint		heap_no;
+	const page_id_t	page_id(block->page.id());
 
 	lock_mutex_enter();
 
-	if (lock_rec_get_first_on_page(lock_sys.rec_hash, block)) {
-		ut_ad(!lock_rec_get_first_on_page(lock_sys.prdt_hash, block));
-		ut_ad(!lock_rec_get_first_on_page(lock_sys.prdt_page_hash,
-						  block));
+	if (lock_sys.get_first(page_id)) {
+		ut_ad(!lock_sys.get_first_prdt(page_id));
+		ut_ad(!lock_sys.get_first_prdt_page(page_id));
 		/* Inherit all the locks on the page to the record and
 		reset all the locks on the page */
 
@@ -3288,16 +3191,13 @@ lock_update_discard(
 			} while (heap_no != PAGE_HEAP_NO_SUPREMUM);
 		}
 
-		lock_rec_free_all_from_discard_page_low(
-			block->page.id.space(), block->page.id.page_no(),
-			lock_sys.rec_hash);
+		lock_rec_free_all_from_discard_page_low(page_id,
+							&lock_sys.rec_hash);
 	} else {
+		lock_rec_free_all_from_discard_page_low(page_id,
+							&lock_sys.prdt_hash);
 		lock_rec_free_all_from_discard_page_low(
-			block->page.id.space(), block->page.id.page_no(),
-			lock_sys.prdt_hash);
-		lock_rec_free_all_from_discard_page_low(
-			block->page.id.space(), block->page.id.page_no(),
-			lock_sys.prdt_page_hash);
+			page_id, &lock_sys.prdt_page_hash);
 	}
 
 	lock_mutex_exit();
@@ -3446,7 +3346,7 @@ lock_table_create(
 /*==============*/
 	dict_table_t*	table,	/*!< in/out: database table
 				in dictionary cache */
-	ulint		type_mode,/*!< in: lock mode possibly ORed with
+	unsigned	type_mode,/*!< in: lock mode possibly ORed with
 				LOCK_WAIT */
 	trx_t*		trx	/*!< in: trx */
 #ifdef WITH_WSREP
@@ -3687,7 +3587,7 @@ static
 dberr_t
 lock_table_enqueue_waiting(
 /*=======================*/
-	ulint		mode,	/*!< in: lock mode this transaction is
+	unsigned	mode,	/*!< in: lock mode this transaction is
 				requesting */
 	dict_table_t*	table,	/*!< in/out: table */
 	que_thr_t*	thr	/*!< in: query thread */
@@ -3724,7 +3624,7 @@ lock_table_enqueue_waiting(
 #endif /* WITH_WSREP */
 
 	/* Enqueue the lock request that will wait to be granted */
-	lock = lock_table_create(table, ulint(mode) | LOCK_WAIT, trx
+	lock = lock_table_create(table, mode | LOCK_WAIT, trx
 #ifdef WITH_WSREP
 				 , c_lock
 #endif
@@ -3818,7 +3718,7 @@ be granted immediately, the query thread is put to wait.
 dberr_t
 lock_table(
 /*=======*/
-	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+	unsigned	flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
 				does nothing */
 	dict_table_t*	table,	/*!< in/out: database table
 				in dictionary cache */
@@ -3883,14 +3783,14 @@ lock_table(
 	mode: this trx may have to wait */
 
 	if (wait_for != NULL) {
-		err = lock_table_enqueue_waiting(ulint(mode) | flags, table,
+		err = lock_table_enqueue_waiting(flags | mode, table,
 						 thr
 #ifdef WITH_WSREP
 						 , wait_for
 #endif
 						 );
 	} else {
-		lock_table_create(table, ulint(mode) | flags, trx);
+		lock_table_create(table, flags | mode, trx);
 
 		ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
 
@@ -4026,7 +3926,7 @@ lock_table_for_trx(
 		que_fork_get_first_thr(
 			static_cast<que_fork_t*>(que_node_get_parent(thr))));
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 run_again:
 	thr->run_node = thr;
@@ -4037,7 +3937,7 @@ run_again:
 	trx->error_state = err;
 
 	if (UNIV_LIKELY(err == DB_SUCCESS)) {
-		que_thr_stop_for_mysql_no_error(thr, trx);
+		thr->stop_no_error();
 	} else {
 		que_thr_stop_for_mysql(thr);
 
@@ -4056,22 +3956,15 @@ run_again:
 static
 void
 lock_grant_and_move_on_rec(
-	hash_table_t*	lock_hash,
 	lock_t*			first_lock,
 	ulint			heap_no)
 {
 	lock_t*		lock;
-	lock_t*		previous;
-	ulint		space;
-	ulint		page_no;
-	ulint		rec_fold;
-
-	space = first_lock->un_member.rec_lock.space;
-	page_no = first_lock->un_member.rec_lock.page_no;
-	rec_fold = lock_rec_fold(space, page_no);
-
-	previous = (lock_t *) hash_get_nth_cell(lock_hash,
-							hash_calc_hash(rec_fold, lock_hash))->node;
+	const page_id_t	page_id(first_lock->un_member.rec_lock.page_id);
+	const ulint	rec_fold= page_id.fold();
+	lock_t*		previous = static_cast<lock_t*>(
+		lock_sys.rec_hash.array[lock_sys.hash(page_id)]
+		.node);
 	if (previous == NULL) {
 		return;
 	}
@@ -4089,8 +3982,7 @@ lock_grant_and_move_on_rec(
 	while (lock) {
 		ut_ad(!lock->trx->is_wsrep());
 		/* If the lock is a wait lock on this page, and it does not need to wait. */
-		if (lock->un_member.rec_lock.space == space
-			&& lock->un_member.rec_lock.page_no == page_no
+		if (lock->un_member.rec_lock.page_id == page_id
 			&& lock_rec_get_nth_bit(lock, heap_no)
 			&& lock_get_wait(lock)
 			&& !lock_rec_has_to_wait_in_queue(lock)) {
@@ -4143,7 +4035,7 @@ lock_rec_unlock(
 	lock_mutex_enter();
 	trx_mutex_enter(trx);
 
-	first_lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	first_lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 
 	/* Find the last lock with the same lock_mode and transaction
 	on the record. */
@@ -4194,7 +4086,7 @@ released:
 			}
 		}
 	} else {
-		lock_grant_and_move_on_rec(lock_sys.rec_hash, first_lock, heap_no);
+		lock_grant_and_move_on_rec(first_lock, heap_no);
 	}
 
 	lock_mutex_exit();
@@ -4327,6 +4219,65 @@ void lock_release(trx_t* trx)
 #endif
 }
 
+/** Release non-exclusive locks on XA PREPARE (gap or non-LOCK_X record locks; LOCK_IS and LOCK_S table locks),
+and release possible other transactions waiting because of these locks. @param trx transaction; its mutex must not be held by the caller */
+void lock_release_on_prepare(trx_t *trx)
+{
+  ulint count= 0; /* locks dequeued since the lock mutex was last re-acquired */
+  lock_mutex_enter();
+  ut_ad(!trx_mutex_own(trx));
+
+  for (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; ) /* walk from the tail of the list */
+  {
+    ut_ad(lock->trx == trx);
+
+    if (lock_get_type_low(lock) == LOCK_REC)
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      if (lock_rec_get_gap(lock) || lock_get_mode(lock) != LOCK_X) /* gap lock, or any mode other than LOCK_X */
+        lock_rec_dequeue_from_page(lock);
+      else
+      {
+        ut_ad(trx->dict_operation ||
+              lock->index->table->id >= DICT_HDR_FIRST_ID);
+retain_lock: /* shared "keep this lock" path; also the goto target of the table-lock switch below */
+        lock= UT_LIST_GET_PREV(trx_locks, lock); /* step to the previous entry without dequeuing */
+        continue; /* skips the release counter: nothing was dequeued */
+      }
+    }
+    else
+    {
+      ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table); /* 'table' is referenced only inside ut_ad() checks */
+      ut_ad(!table->is_temporary());
+
+      switch (lock_get_mode(lock)) {
+      case LOCK_IS:
+      case LOCK_S:
+        lock_table_dequeue(lock); /* LOCK_IS and LOCK_S table locks are released */
+        break;
+      case LOCK_IX:
+      case LOCK_X:
+        ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+        /* fall through: LOCK_IX and LOCK_X are retained, like any other mode */
+      default:
+        goto retain_lock;
+      }
+    }
+
+    if (++count == LOCK_RELEASE_INTERVAL) /* reached only after a lock was dequeued */
+    {
+      lock_mutex_exit(); /* drop and immediately re-take the lock mutex; presumably to let waiting threads in -- TODO confirm */
+      count= 0;
+      lock_mutex_enter();
+    }
+
+    lock= UT_LIST_GET_LAST(trx->lock.trx_locks); /* after a dequeue, restart from the current tail of the list */
+  }
+
+  lock_mutex_exit();
+}
+
 /* True if a lock mode is S or X */
 #define IS_LOCK_S_OR_X(lock) \
 	(lock_get_mode(lock) == LOCK_S \
@@ -4425,19 +4376,15 @@ lock_table_print(FILE* file, const lock_t* lock)
 @param[in,out]	mtr	mini-transaction for accessing the record */
 static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
 {
-	ulint			space;
-	ulint			page_no;
-
 	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(lock) == LOCK_REC);
 
-	space = lock->un_member.rec_lock.space;
-	page_no = lock->un_member.rec_lock.page_no;
+	const page_id_t page_id(lock->un_member.rec_lock.page_id);
 
-	fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu "
-		"index %s of table ",
-		(ulong) space, (ulong) page_no,
-		(ulong) lock_rec_get_n_bits(lock),
+	fprintf(file, "RECORD LOCKS space id %u page no %u n bits " ULINTPF
+		" index %s of table ",
+		page_id.space(), page_id.page_no(),
+		lock_rec_get_n_bits(lock),
 		lock->index->name());
 	ut_print_name(file, lock->trx, lock->index->table->name.m_name);
 	fprintf(file, " trx id " TRX_ID_FMT, trx_get_id_for_print(lock->trx));
@@ -4474,8 +4421,7 @@ static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
 	rec_offs_init(offsets_);
 
 	mtr.start();
-	const buf_block_t* block = buf_page_try_get(page_id_t(space, page_no),
-						    &mtr);
+	const buf_block_t* block = buf_page_try_get(page_id, &mtr);
 
 	for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) {
 
@@ -4523,21 +4469,18 @@ http://bugs.mysql.com/36942 */
 /*********************************************************************//**
 Calculates the number of record lock structs in the record lock hash table.
 @return number of record locks */
-static
-ulint
-lock_get_n_rec_locks(void)
-/*======================*/
+static ulint lock_get_n_rec_locks()
 {
 	ulint	n_locks	= 0;
 	ulint	i;
 
 	ut_ad(lock_mutex_own());
 
-	for (i = 0; i < hash_get_n_cells(lock_sys.rec_hash); i++) {
+	for (i = 0; i < lock_sys.rec_hash.n_cells; i++) {
 		const lock_t*	lock;
 
 		for (lock = static_cast<const lock_t*>(
-				HASH_GET_FIRST(lock_sys.rec_hash, i));
+			     HASH_GET_FIRST(&lock_sys.rec_hash, i));
 		     lock != 0;
 		     lock = static_cast<const lock_t*>(
 				HASH_GET_NEXT(hash, lock))) {
@@ -4618,15 +4561,7 @@ lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now)
 	fprintf(file, "---");
 
 	trx_print_latched(file, trx, 600);
-
-	/* Note: read_view->get_state() check is race condition. But it
-	should "kind of work" because read_view is freed only at shutdown.
-	Worst thing that may happen is that it'll get transferred to
-	another thread and print wrong values. */
-
-	if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN) {
-		trx->read_view.print_limits(file);
-	}
+	trx->read_view.print_limits(file);
 
 	if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
 
@@ -4689,15 +4624,14 @@ struct lock_print_info
     purge_trx(purge_sys.query ? purge_sys.query->trx : NULL)
   {}
 
-  void operator()(const trx_t* trx) const
+  void operator()(const trx_t &trx) const
   {
-    ut_ad(mutex_own(&trx_sys.mutex));
-    if (UNIV_UNLIKELY(trx == purge_trx))
+    if (UNIV_UNLIKELY(&trx == purge_trx))
       return;
-    lock_trx_print_wait_and_mvcc_state(file, trx, now);
+    lock_trx_print_wait_and_mvcc_state(file, &trx, now);
 
-    if (trx->will_lock && srv_print_innodb_lock_monitor)
-      lock_trx_print_locks(file, trx);
+    if (trx.will_lock && srv_print_innodb_lock_monitor)
+      lock_trx_print_locks(file, &trx);
   }
 
   FILE* const file;
@@ -4717,11 +4651,8 @@ lock_print_info_all_transactions(
 	ut_ad(lock_mutex_own());
 
 	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
-	const time_t now = time(NULL);
 
-	mutex_enter(&trx_sys.mutex);
-	ut_list_map(trx_sys.trx_list, lock_print_info(file, now));
-	mutex_exit(&trx_sys.mutex);
+	trx_sys.trx_list.for_each(lock_print_info(file, time(nullptr)));
 	lock_mutex_exit();
 
 	ut_ad(lock_validate());
@@ -4841,7 +4772,7 @@ lock_rec_queue_validate(
 
 	if (!page_rec_is_user_rec(rec)) {
 
-		for (lock = lock_rec_get_first(lock_sys.rec_hash,
+		for (lock = lock_rec_get_first(&lock_sys.rec_hash,
 					       block, heap_no);
 		     lock != NULL;
 		     lock = lock_rec_get_next_const(heap_no, lock)) {
@@ -4933,7 +4864,7 @@ func_exit:
 		mutex_exit(&impl_trx->mutex);
 	}
 
-	for (lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 	     lock != NULL;
 	     lock = lock_rec_get_next_const(heap_no, lock)) {
 		ut_ad(!lock->trx->read_only
@@ -5004,19 +4935,15 @@ lock_rec_validate_page(
 	rec_offs*	offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	ut_ad(!lock_mutex_own());
-
 	lock_mutex_enter();
 loop:
-	lock = lock_rec_get_first_on_page_addr(
-		lock_sys.rec_hash,
-		block->page.id.space(), block->page.id.page_no());
+	lock = lock_sys.get_first(block->page.id());
 
 	if (!lock) {
 		goto function_exit;
 	}
 
-	ut_ad(!block->page.file_page_was_freed);
+	DBUG_ASSERT(block->page.status != buf_page_t::FREED);
 
 	for (i = 0; i < nth_lock; i++) {
 
@@ -5083,25 +5010,21 @@ lock_rec_validate(
 /*==============*/
 	ulint		start,		/*!< in: lock_sys.rec_hash
 					bucket */
-	ib_uint64_t*	limit)		/*!< in/out: upper limit of
+	page_id_t*	limit)		/*!< in/out: upper limit of
 					(space, page_no) */
 {
 	ut_ad(lock_mutex_own());
 
 	for (const lock_t* lock = static_cast<const lock_t*>(
-			HASH_GET_FIRST(lock_sys.rec_hash, start));
+		     HASH_GET_FIRST(&lock_sys.rec_hash, start));
 	     lock != NULL;
 	     lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
 
-		ib_uint64_t	current;
-
 		ut_ad(!lock->trx->read_only
 		      || !lock->trx->is_autocommit_non_locking());
 		ut_ad(lock_get_type(lock) == LOCK_REC);
 
-		current = ut_ull_create(
-			lock->un_member.rec_lock.space,
-			lock->un_member.rec_lock.page_no);
+		page_id_t current(lock->un_member.rec_lock.page_id);
 
 		if (current > *limit) {
 			*limit = current + 1;
@@ -5114,17 +5037,12 @@ lock_rec_validate(
 
 /*********************************************************************//**
 Validate a record lock's block */
-static
-void
-lock_rec_block_validate(
-/*====================*/
-	ulint		space_id,
-	ulint		page_no)
+static void lock_rec_block_validate(const page_id_t page_id)
 {
 	/* The lock and the block that it is referring to may be freed at
 	this point. We pass BUF_GET_POSSIBLY_FREED to skip a debug check.
 	If the lock exists in lock_rec_validate_page() we assert
-	!block->page.file_page_was_freed. */
+	block->page.status != FREED. */
 
 	buf_block_t*	block;
 	mtr_t		mtr;
@@ -5134,12 +5052,12 @@ lock_rec_block_validate(
 	discard or rebuild a tablespace do hold an exclusive table
 	lock, which would conflict with any locks referring to the
 	tablespace from other transactions. */
-	if (fil_space_t* space = fil_space_acquire(space_id)) {
+	if (fil_space_t* space = fil_space_t::get(page_id.space())) {
 		dberr_t err = DB_SUCCESS;
 		mtr_start(&mtr);
 
 		block = buf_page_get_gen(
-			page_id_t(space_id, page_no),
+			page_id,
 			space->zip_size(),
 			RW_X_LATCH, NULL,
 			BUF_GET_POSSIBLY_FREED,
@@ -5148,11 +5066,10 @@ lock_rec_block_validate(
 		if (err != DB_SUCCESS) {
 			ib::error() << "Lock rec block validate failed for tablespace "
 				   << space->name
-				   << " space_id " << space_id
-				   << " page_no " << page_no << " err " << err;
+				   << page_id << " err " << err;
 		}
 
-		if (block) {
+		if (block && block->page.status != buf_page_t::FREED) {
 			buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
 
 			ut_ad(lock_rec_validate_page(block));
@@ -5193,43 +5110,33 @@ bool
 lock_validate()
 /*===========*/
 {
-	typedef	std::pair<ulint, ulint>		page_addr_t;
-	typedef std::set<
-		page_addr_t,
-		std::less<page_addr_t>,
-		ut_allocator<page_addr_t> >	page_addr_set;
-
-	page_addr_set	pages;
+	std::set<page_id_t> pages;
 
 	lock_mutex_enter();
 
 	/* Validate table locks */
-	trx_sys.rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
-				    (lock_validate_table_locks), 0);
+	trx_sys.rw_trx_hash.iterate(lock_validate_table_locks);
 
 	/* Iterate over all the record locks and validate the locks. We
-	don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex.
-	Release both mutexes during the validation check. */
+	don't want to hog the lock_sys_t::mutex. Release it during the
+	validation check. */
 
-	for (ulint i = 0; i < hash_get_n_cells(lock_sys.rec_hash); i++) {
-		ib_uint64_t	limit = 0;
+	for (ulint i = 0; i < lock_sys.rec_hash.n_cells; i++) {
+		page_id_t limit(0, 0);
 
 		while (const lock_t* lock = lock_rec_validate(i, &limit)) {
 			if (lock_rec_find_set_bit(lock) == ULINT_UNDEFINED) {
 				/* The lock bitmap is empty; ignore it. */
 				continue;
 			}
-			const lock_rec_t& l = lock->un_member.rec_lock;
-			pages.insert(std::make_pair(l.space, l.page_no));
+			pages.insert(lock->un_member.rec_lock.page_id);
 		}
 	}
 
 	lock_mutex_exit();
 
-	for (page_addr_set::const_iterator it = pages.begin();
-	     it != pages.end();
-	     ++it) {
-		lock_rec_block_validate((*it).first, (*it).second);
+	for (page_id_t page_id : pages) {
+		lock_rec_block_validate(page_id);
 	}
 
 	return(true);
@@ -5292,7 +5199,7 @@ lock_rec_insert_check_and_lock(
 	BTR_NO_LOCKING_FLAG and skip the locking altogether. */
 	ut_ad(lock_table_has(trx, index->table, LOCK_IX));
 
-	lock = lock_rec_get_first(lock_sys.rec_hash, block, heap_no);
+	lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
 
 	if (lock == NULL) {
 		/* We optimize CPU time usage in the simplest case */
@@ -5329,7 +5236,7 @@ lock_rec_insert_check_and_lock(
 	had to wait for their insert. Both had waiting gap type lock requests
 	on the successor, which produced an unnecessary deadlock. */
 
-	const ulint	type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION;
+	const unsigned	type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION;
 
 	if (
 #ifdef WITH_WSREP
@@ -5500,9 +5407,7 @@ static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx,
     lock_rec_other_trx_holds_expl_arg arg= { page_rec_get_heap_no(rec), block,
                                              trx };
     trx_sys.rw_trx_hash.iterate(caller_trx,
-                                reinterpret_cast<my_hash_walk_action>
-                                (lock_rec_other_trx_holds_expl_callback),
-                                &arg);
+                                lock_rec_other_trx_holds_expl_callback, &arg);
     lock_mutex_exit();
   }
 }
@@ -5765,7 +5670,7 @@ lock_sec_rec_read_check_and_lock(
 					records: LOCK_S or LOCK_X; the
 					latter is possible in
 					SELECT FOR UPDATE */
-	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
@@ -5794,7 +5699,9 @@ lock_sec_rec_read_check_and_lock(
 	if the max trx id for the page >= min trx id for the trx list or a
 	database recovery is running. */
 
-	if (!page_rec_is_supremum(rec)
+	trx_t *trx = thr_get_trx(thr);
+	if (!lock_table_has(trx, index->table, LOCK_X)
+	    && !page_rec_is_supremum(rec)
 	    && page_get_max_trx_id(block->frame) >= trx_sys.get_min_trx_id()
 	    && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec,
 					     index, offsets)
@@ -5804,7 +5711,6 @@ lock_sec_rec_read_check_and_lock(
 	}
 
 #ifdef WITH_WSREP
-	trx_t *trx= thr_get_trx(thr);
 	/* If transaction scanning an unique secondary key is wsrep
 	high priority thread (brute force) this scanning may involve
 	GAP-locking in the index. As this locking happens also when
@@ -5816,7 +5722,7 @@ lock_sec_rec_read_check_and_lock(
 		trx->wsrep_UK_scan= true;
 #endif /* WITH_WSREP */
 
-	err = lock_rec_lock(FALSE, ulint(mode) | gap_mode,
+	err = lock_rec_lock(FALSE, gap_mode | mode,
 			    block, heap_no, index, thr);
 
 #ifdef WITH_WSREP
@@ -5853,7 +5759,7 @@ lock_clust_rec_read_check_and_lock(
 					records: LOCK_S or LOCK_X; the
 					latter is possible in
 					SELECT FOR UPDATE */
-	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
@@ -5878,15 +5784,17 @@ lock_clust_rec_read_check_and_lock(
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	if (heap_no != PAGE_HEAP_NO_SUPREMUM
-	    && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec,
+	trx_t *trx = thr_get_trx(thr);
+	if (!lock_table_has(trx, index->table, LOCK_X)
+	    && heap_no != PAGE_HEAP_NO_SUPREMUM
+	    && lock_rec_convert_impl_to_expl(trx, block, rec,
 					     index, offsets)
 	    && gap_mode == LOCK_REC_NOT_GAP) {
 		/* We already hold an implicit exclusive lock. */
 		return DB_SUCCESS;
 	}
 
-	err = lock_rec_lock(FALSE, ulint(mode) | gap_mode,
+	err = lock_rec_lock(FALSE, gap_mode | mode,
 			    block, heap_no, index, thr);
 
 	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
@@ -5921,7 +5829,7 @@ lock_clust_rec_read_check_and_lock_alt(
 					records: LOCK_S or LOCK_X; the
 					latter is possible in
 					SELECT FOR UPDATE */
-	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
@@ -6045,71 +5953,6 @@ lock_get_trx_id(
 }
 
 /*******************************************************************//**
-Gets the mode of a lock in a human readable string.
-The string should not be free()'d or modified.
-@return lock mode */
-const char*
-lock_get_mode_str(
-/*==============*/
-	const lock_t*	lock)	/*!< in: lock */
-{
-	ibool	is_gap_lock;
-
-	is_gap_lock = lock_get_type_low(lock) == LOCK_REC
-		&& lock_rec_get_gap(lock);
-
-	switch (lock_get_mode(lock)) {
-	case LOCK_S:
-		if (is_gap_lock) {
-			return("S,GAP");
-		} else {
-			return("S");
-		}
-	case LOCK_X:
-		if (is_gap_lock) {
-			return("X,GAP");
-		} else {
-			return("X");
-		}
-	case LOCK_IS:
-		if (is_gap_lock) {
-			return("IS,GAP");
-		} else {
-			return("IS");
-		}
-	case LOCK_IX:
-		if (is_gap_lock) {
-			return("IX,GAP");
-		} else {
-			return("IX");
-		}
-	case LOCK_AUTO_INC:
-		return("AUTO_INC");
-	default:
-		return("UNKNOWN");
-	}
-}
-
-/*******************************************************************//**
-Gets the type of a lock in a human readable string.
-The string should not be free()'d or modified.
-@return lock type */
-const char*
-lock_get_type_str(
-/*==============*/
-	const lock_t*	lock)	/*!< in: lock */
-{
-	switch (lock_get_type_low(lock)) {
-	case LOCK_REC:
-		return("RECORD");
-	case LOCK_TABLE:
-		return("TABLE");
-	default:
-		return("UNKNOWN");
-	}
-}
-
-/*******************************************************************//**
 Gets the table on which the lock is.
 @return table */
 UNIV_INLINE
@@ -6185,32 +6028,6 @@ lock_rec_get_index_name(
 	return(lock->index->name);
 }
 
-/*******************************************************************//**
-For a record lock, gets the tablespace number on which the lock is.
-@return tablespace number */
-ulint
-lock_rec_get_space_id(
-/*==================*/
-	const lock_t*	lock)	/*!< in: lock */
-{
-	ut_a(lock_get_type_low(lock) == LOCK_REC);
-
-	return(lock->un_member.rec_lock.space);
-}
-
-/*******************************************************************//**
-For a record lock, gets the page number on which the lock is.
-@return page number */
-ulint
-lock_rec_get_page_no(
-/*=================*/
-	const lock_t*	lock)	/*!< in: lock */
-{
-	ut_a(lock_get_type_low(lock) == LOCK_REC);
-
-	return(lock->un_member.rec_lock.page_no);
-}
-
 /*********************************************************************//**
 Cancels a waiting lock request and releases possible other transactions
 waiting behind it. */
@@ -6410,10 +6227,7 @@ lock_table_has_locks(
 
 #ifdef UNIV_DEBUG
 	if (!has_locks) {
-		trx_sys.rw_trx_hash.iterate(
-			reinterpret_cast<my_hash_walk_action>
-			(lock_table_locks_lookup),
-			const_cast<dict_table_t*>(table));
+		trx_sys.rw_trx_hash.iterate(lock_table_locks_lookup, table);
 	}
 #endif /* UNIV_DEBUG */
 
@@ -6442,14 +6256,6 @@ lock_trx_lock_list_init(
 	UT_LIST_INIT(*lock_list, &lock_t::trx_locks);
 }
 
-/*******************************************************************//**
-Set the lock system timeout event. */
-void
-lock_set_timeout_event()
-/*====================*/
-{
-	os_event_set(lock_sys.timeout_event);
-}
 
 #ifdef UNIV_DEBUG
 /*******************************************************************//**
@@ -6532,8 +6338,9 @@ lock_trx_has_expl_x_lock(
 
 	lock_mutex_enter();
 	ut_ad(lock_table_has(trx, table, LOCK_IX));
-	ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no,
-				trx));
+	ut_ad(lock_table_has(trx, table, LOCK_X)
+	      || lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no,
+				   trx));
 	lock_mutex_exit();
 	return(true);
 }
@@ -6668,12 +6475,6 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const
 	const lock_t*	lock = m_wait_lock;
 
 	if (lock_get_type_low(lock) == LOCK_REC) {
-		hash_table_t*	lock_hash;
-
-		lock_hash = lock->type_mode & LOCK_PREDICATE
-			? lock_sys.prdt_hash
-			: lock_sys.rec_hash;
-
 		/* We are only interested in records that match the heap_no. */
 		*heap_no = lock_rec_find_set_bit(lock);
 
@@ -6681,10 +6482,11 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const
 		ut_ad(*heap_no != ULINT_UNDEFINED);
 
 		/* Find the locks on the page. */
-		lock = lock_rec_get_first_on_page_addr(
-			lock_hash,
-			lock->un_member.rec_lock.space,
-			lock->un_member.rec_lock.page_no);
+		lock = lock_sys.get_first(
+			lock->type_mode & LOCK_PREDICATE
+			? lock_sys.prdt_hash
+			: lock_sys.rec_hash,
+			lock->un_member.rec_lock.page_id);
 
 		/* Position on the first lock on the physical record.*/
 		if (!lock_rec_get_nth_bit(lock, *heap_no)) {
@@ -6999,6 +6801,7 @@ DeadlockChecker::check_and_resolve(const lock_t* lock, trx_t* trx)
 			rollback_print(victim_trx, lock);
 
 			MONITOR_INC(MONITOR_DEADLOCK);
+			srv_stats.lock_deadlock_count.inc();
 
 			break;
 
@@ -7011,6 +6814,7 @@ DeadlockChecker::check_and_resolve(const lock_t* lock, trx_t* trx)
 			lock_deadlock_found = true;
 
 			MONITOR_INC(MONITOR_DEADLOCK);
+			srv_stats.lock_deadlock_count.inc();
 		}
 
 	} while (victim_trx != NULL && victim_trx != trx);
diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc
index 9827243177d..1eb96a0dcf0 100644
--- a/storage/innobase/lock/lock0prdt.cc
+++ b/storage/innobase/lock/lock0prdt.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -153,7 +153,7 @@ bool
 lock_prdt_has_to_wait(
 /*==================*/
 	const trx_t*	trx,	/*!< in: trx of new lock */
-	ulint		type_mode,/*!< in: precise mode of the new lock
+	unsigned	type_mode,/*!< in: precise mode of the new lock
 				to set: LOCK_S or LOCK_X, possibly
 				ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
 				LOCK_INSERT_INTENTION */
@@ -228,7 +228,7 @@ lock_t*
 lock_prdt_has_lock(
 /*===============*/
 	ulint			precise_mode,	/*!< in: LOCK_S or LOCK_X */
-	ulint			type_mode,	/*!< in: LOCK_PREDICATE etc. */
+	unsigned		type_mode,	/*!< in: LOCK_PREDICATE etc. */
 	const buf_block_t*	block,		/*!< in: buffer block
 						containing the record */
 	lock_prdt_t*		prdt,		/*!< in: The predicate to be
@@ -285,7 +285,7 @@ static
 lock_t*
 lock_prdt_other_has_conflicting(
 /*============================*/
-	ulint			mode,	/*!< in: LOCK_S or LOCK_X,
+	unsigned		mode,	/*!< in: LOCK_S or LOCK_X,
 					possibly ORed to LOCK_PREDICATE or
 					LOCK_PRDT_PAGE, LOCK_INSERT_INTENTION */
 	const buf_block_t*	block,	/*!< in: buffer block containing
@@ -385,7 +385,7 @@ static
 lock_t*
 lock_prdt_find_on_page(
 /*===================*/
-	ulint			type_mode,	/*!< in: lock type_mode field */
+	unsigned		type_mode,	/*!< in: lock type_mode field */
 	const buf_block_t*	block,		/*!< in: buffer block */
 	lock_prdt_t*		prdt,		/*!< in: MBR with the lock */
 	const trx_t*		trx)		/*!< in: transaction */
@@ -394,7 +394,8 @@ lock_prdt_find_on_page(
 
 	ut_ad(lock_mutex_own());
 
-	for (lock = lock_rec_get_first_on_page(lock_hash_get(type_mode), block);
+	for (lock = lock_sys.get_first(*lock_hash_get(type_mode),
+				       block->page.id());
 	     lock != NULL;
 	     lock = lock_rec_get_next_on_page(lock)) {
 
@@ -423,7 +424,7 @@ static
 lock_t*
 lock_prdt_add_to_queue(
 /*===================*/
-	ulint			type_mode,/*!< in: lock mode, wait, predicate
+	unsigned		type_mode,/*!< in: lock mode, wait, predicate
 					etc. flags; type is ignored
 					and replaced by LOCK_REC */
 	const buf_block_t*	block,	/*!< in: buffer block containing
@@ -457,7 +458,8 @@ lock_prdt_add_to_queue(
 
 	lock_t*		lock;
 
-	for (lock = lock_rec_get_first_on_page(lock_hash_get(type_mode), block);
+	for (lock = lock_sys.get_first(*lock_hash_get(type_mode),
+				       block->page.id());
 	     lock != NULL;
 	     lock = lock_rec_get_next_on_page(lock)) {
 
@@ -541,7 +543,7 @@ lock_prdt_insert_check_and_lock(
 	lock_t*		lock;
 
 	/* Only need to check locks on prdt_hash */
-	lock = lock_rec_get_first(lock_sys.prdt_hash, block, PRDT_HEAPNO);
+	lock = lock_rec_get_first(&lock_sys.prdt_hash, block, PRDT_HEAPNO);
 
 	if (lock == NULL) {
 		lock_mutex_exit();
@@ -619,16 +621,12 @@ lock_prdt_update_parent(
         buf_block_t*    right_block,	/*!< in/out: the new half page */
         lock_prdt_t*	left_prdt,	/*!< in: MBR on the old page */
         lock_prdt_t*	right_prdt,	/*!< in: MBR on the new page */
-	ulint		space,		/*!< in: parent space id */
-	ulint		page_no)	/*!< in: parent page number */
+	const page_id_t	page_id)	/*!< in: parent page */
 {
-	lock_t*		lock;
-
 	lock_mutex_enter();
 
 	/* Get all locks in parent */
-	for (lock = lock_rec_get_first_on_page_addr(
-			lock_sys.prdt_hash, space, page_no);
+	for (lock_t *lock = lock_sys.get_first_prdt(page_id);
 	     lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
 		lock_prdt_t*	lock_prdt;
@@ -675,21 +673,15 @@ lock_prdt_update_split_low(
 	buf_block_t*	new_block,	/*!< in/out: the new half page */
 	lock_prdt_t*	prdt,		/*!< in: MBR on the old page */
 	lock_prdt_t*	new_prdt,	/*!< in: MBR on the new page */
-	ulint		space,		/*!< in: space id */
-	ulint		page_no,	/*!< in: page number */
-	ulint		type_mode)	/*!< in: LOCK_PREDICATE or
+	const page_id_t	page_id,	/*!< in: page identifier */
+	unsigned	type_mode)	/*!< in: LOCK_PREDICATE or
 					LOCK_PRDT_PAGE */
 {
 	lock_t*		lock;
 
-	lock_mutex_enter();
-
-	for (lock = lock_rec_get_first_on_page_addr(
-			lock_hash_get(type_mode), space, page_no);
+	for (lock = lock_sys.get_first(*lock_hash_get(type_mode), page_id);
 	     lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
-		ut_ad(lock);
-
 		/* First dealing with Page Lock */
 		if (lock->type_mode & LOCK_PRDT_PAGE) {
 			/* Duplicate the lock to new page */
@@ -739,8 +731,6 @@ lock_prdt_update_split_low(
 			trx_mutex_exit(lock->trx);
 		}
 	}
-
-	lock_mutex_exit();
 }
 
 /**************************************************************//**
@@ -751,14 +741,17 @@ lock_prdt_update_split(
 	buf_block_t*	new_block,	/*!< in/out: the new half page */
 	lock_prdt_t*	prdt,		/*!< in: MBR on the old page */
 	lock_prdt_t*	new_prdt,	/*!< in: MBR on the new page */
-	ulint		space,		/*!< in: space id */
-	ulint		page_no)	/*!< in: page number */
+	const page_id_t	page_id)	/*!< in: page identifier */
 {
+	lock_mutex_enter();
+
 	lock_prdt_update_split_low(new_block, prdt, new_prdt,
-				   space, page_no, LOCK_PREDICATE);
+				   page_id, LOCK_PREDICATE);
 
 	lock_prdt_update_split_low(new_block, NULL, NULL,
-				   space, page_no, LOCK_PRDT_PAGE);
+				   page_id, LOCK_PRDT_PAGE);
+
+	lock_mutex_exit();
 }
 
 /*********************************************************************//**
@@ -775,7 +768,7 @@ lock_init_prdt_from_mbr(
 
 	if (heap != NULL) {
 		prdt->data = mem_heap_alloc(heap, sizeof(*mbr));
-		ut_memcpy(prdt->data, mbr, sizeof(*mbr));
+		memcpy(prdt->data, mbr, sizeof(*mbr));
 	} else {
 		prdt->data = static_cast<void*>(mbr);
 	}
@@ -797,7 +790,7 @@ lock_prdt_lock(
 				records: LOCK_S or LOCK_X; the
 				latter is possible in
 				SELECT FOR UPDATE */
-	ulint		type_mode,
+	unsigned	type_mode,
 				/*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
 	que_thr_t*	thr)	/*!< in: query thread
 				(can be NULL if BTR_NO_LOCKING_FLAG) */
@@ -814,7 +807,7 @@ lock_prdt_lock(
 	ut_ad(!dict_index_is_online_ddl(index));
 	ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
 
-	hash_table_t*	hash = type_mode == LOCK_PREDICATE
+	const hash_table_t& hash = type_mode == LOCK_PREDICATE
 		? lock_sys.prdt_hash
 		: lock_sys.prdt_page_hash;
 
@@ -825,15 +818,15 @@ lock_prdt_lock(
 
 	lock_mutex_enter();
 
-	const ulint	prdt_mode = ulint(mode) | type_mode;
-	lock_t*		lock = lock_rec_get_first_on_page(hash, block);
+	const unsigned	prdt_mode = type_mode | mode;
+	lock_t*		lock = lock_sys.get_first(hash, block->page.id());
 
 	if (lock == NULL) {
 		lock = lock_rec_create(
 #ifdef WITH_WSREP
 			NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
 #endif
-			ulint(mode) | type_mode, block, PRDT_HEAPNO,
+			prdt_mode, block, PRDT_HEAPNO,
 			index, trx, FALSE);
 
 		status = LOCK_REC_SUCCESS_CREATED;
@@ -865,7 +858,7 @@ lock_prdt_lock(
 						NULL, /* FIXME: replicate
 						      SPATIAL INDEX locks */
 #endif
-						ulint(mode) | type_mode,
+						prdt_mode,
 						block, PRDT_HEAPNO,
 						index, thr, prdt);
 				} else {
@@ -905,9 +898,7 @@ Acquire a "Page" lock on a block
 @return	DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
 dberr_t
 lock_place_prdt_page_lock(
-/*======================*/
-	ulint		space,		/*!< in: space for the page to lock */
-	ulint		page_no,	/*!< in: page number */
+	const page_id_t	page_id,	/*!< in: page identifier */
 	dict_index_t*	index,		/*!< in: secondary index */
 	que_thr_t*	thr)		/*!< in: query thread */
 {
@@ -924,9 +915,7 @@ lock_place_prdt_page_lock(
 
 	lock_mutex_enter();
 
-	const lock_t*	lock = lock_rec_get_first_on_page_addr(
-		lock_sys.prdt_page_hash, space, page_no);
-
+	const lock_t*	lock = lock_sys.get_first_prdt_page(page_id);
 	const ulint	mode = LOCK_S | LOCK_PRDT_PAGE;
 	trx_t*		trx = thr_get_trx(thr);
 
@@ -952,7 +941,7 @@ lock_place_prdt_page_lock(
 #ifdef WITH_WSREP
 			NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
 #endif
-			mode, space, page_no, NULL, PRDT_HEAPNO,
+			mode, page_id, NULL, PRDT_HEAPNO,
 			index, trx, FALSE);
 
 #ifdef PRDT_DIAG
@@ -967,25 +956,19 @@ lock_place_prdt_page_lock(
 
 /** Check whether there are R-tree Page lock on a page
 @param[in]	trx	trx to test the lock
-@param[in]	space	space id for the page
-@param[in]	page_no	page number
+@param[in]	page_id	page identifier
 @return	true if there is none */
-bool
-lock_test_prdt_page_lock(
-	const trx_t*    trx,
-	ulint           space,
-	ulint           page_no)
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id)
 {
 	lock_t*		lock;
 
 	lock_mutex_enter();
 
-	lock = lock_rec_get_first_on_page_addr(
-		lock_sys.prdt_page_hash, space, page_no);
+	lock = lock_sys.get_first_prdt_page(page_id);
 
 	lock_mutex_exit();
 
-	return(lock == NULL || trx == lock->trx);
+	return(!lock || trx == lock->trx);
 }
 
 /*************************************************************//**
@@ -999,20 +982,14 @@ lock_prdt_rec_move(
 	const buf_block_t*	donator)	/*!< in: buffer block containing
 						the donating record */
 {
-	lock_t* lock;
-
-	if (!lock_sys.prdt_hash) {
-		return;
-	}
-
 	lock_mutex_enter();
 
-	for (lock = lock_rec_get_first(lock_sys.prdt_hash,
-				       donator, PRDT_HEAPNO);
+	for (lock_t *lock = lock_rec_get_first(&lock_sys.prdt_hash,
+					       donator, PRDT_HEAPNO);
 	     lock != NULL;
 	     lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
 
-		const ulint     type_mode = lock->type_mode;
+		const auto type_mode = lock->type_mode;
 		lock_prdt_t*	lock_prdt = lock_get_prdt_from_lock(lock);
 
 		lock_rec_reset_nth_bit(lock, PRDT_HEAPNO);
@@ -1036,15 +1013,10 @@ lock_prdt_page_free_from_discard(
 {
 	lock_t*	lock;
 	lock_t*	next_lock;
-	ulint	space;
-	ulint	page_no;
 
 	ut_ad(lock_mutex_own());
 
-	space = block->page.id.space();
-	page_no = block->page.id.page_no();
-
-	lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no);
+	lock = lock_sys.get_first(*lock_hash, block->page.id());
 
 	while (lock != NULL) {
 		next_lock = lock_rec_get_next_on_page(lock);
diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc
index 5eb03f668b3..8182d1230ed 100644
--- a/storage/innobase/lock/lock0wait.cc
+++ b/storage/innobase/lock/lock0wait.cc
@@ -36,6 +36,7 @@ Created 25/5/2010 Sunny Bains
 #include "row0mysql.h"
 #include "srv0start.h"
 #include "lock0priv.h"
+#include "srv0srv.h"
 
 /*********************************************************************//**
 Print the contents of the lock_sys_t::waiting_threads array. */
@@ -51,12 +52,10 @@ lock_wait_table_print(void)
 	for (ulint i = 0; i < srv_max_n_threads; i++, ++slot) {
 
 		fprintf(stderr,
-			"Slot %lu: thread type %lu,"
-			" in use %lu, susp %lu, timeout %lu, time %lu\n",
+			"Slot %lu:"
+			" in use %lu, timeout %lu, time %lu\n",
 			(ulong) i,
-			(ulong) slot->type,
 			(ulong) slot->in_use,
-			(ulong) slot->suspended,
 			slot->wait_timeout,
 			(ulong) difftime(time(NULL), slot->suspend_time));
 	}
@@ -154,7 +153,6 @@ lock_wait_table_reserve_slot(
 			}
 
 			os_event_reset(slot->event);
-			slot->suspended = TRUE;
 			slot->suspend_time = time(NULL);
 			slot->wait_timeout = wait_timeout;
 
@@ -164,7 +162,10 @@ lock_wait_table_reserve_slot(
 
 			ut_ad(lock_sys.last_slot
 			      <= lock_sys.waiting_threads + srv_max_n_threads);
-
+			if (!lock_sys.timeout_timer_active) {
+				lock_sys.timeout_timer_active = true;
+				lock_sys.timeout_timer->set_time(1000, 0);
+			}
 			return(slot);
 		}
 	}
@@ -191,7 +192,7 @@ wsrep_is_BF_lock_timeout(
 	const trx_t*	trx)
 {
 	bool long_wait= (trx->error_state != DB_DEADLOCK &&
-			 trx->is_wsrep() &&
+			 srv_monitor_timer && trx->is_wsrep() &&
 			 wsrep_thd_is_BF(trx->mysql_thd, false));
 	bool was_wait= true;
 
@@ -223,9 +224,9 @@ lock_wait_suspend_thread(
 {
 	srv_slot_t*	slot;
 	trx_t*		trx;
-	ibool		was_declared_inside_innodb;
 	ulong		lock_wait_timeout;
 
+	ut_a(lock_sys.timeout_timer.get());
 	trx = thr_get_trx(thr);
 
 	if (trx->mysql_thd != 0) {
@@ -317,16 +318,6 @@ lock_wait_suspend_thread(
 
 	/* Suspend this thread and wait for the event. */
 
-	was_declared_inside_innodb = trx->declared_to_be_inside_innodb;
-
-	if (was_declared_inside_innodb) {
-		/* We must declare this OS thread to exit InnoDB, since a
-		possible other thread holding a lock which this thread waits
-		for must be allowed to enter, sooner or later */
-
-		srv_conc_force_exit_innodb(trx);
-	}
-
 	/* Unknown is also treated like a record lock */
 	if (lock_type == ULINT_UNDEFINED || lock_type == LOCK_REC) {
 		thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
@@ -342,13 +333,6 @@ lock_wait_suspend_thread(
 	/* After resuming, reacquire the data dictionary latch if
 	necessary. */
 
-	if (was_declared_inside_innodb) {
-
-		/* Return back inside InnoDB */
-
-		srv_conc_force_enter_innodb(trx);
-	}
-
 	if (had_dict_lock) {
 
 		row_mysql_freeze_data_dictionary(trx);
@@ -449,7 +433,6 @@ lock_wait_check_and_cancel(
 {
 	ut_ad(lock_wait_mutex_own());
 	ut_ad(slot->in_use);
-	ut_ad(slot->suspended);
 
 	double wait_time = difftime(time(NULL), slot->suspend_time);
 	trx_t* trx = thr_get_trx(slot->thr);
@@ -489,67 +472,31 @@ lock_wait_check_and_cancel(
 	}
 }
 
-/*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(lock_wait_timeout_thread)(void*)
+/** A task which wakes up threads whose lock wait may have lasted too long */
+void lock_wait_timeout_task(void*)
 {
-	int64_t		sig_count = 0;
-	os_event_t	event = lock_sys.timeout_event;
-
-	ut_ad(!srv_read_only_mode);
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_lock_timeout_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-	do {
-		srv_slot_t*	slot;
-
-		/* When someone is waiting for a lock, we wake up every second
-		and check if a timeout has passed for a lock wait */
-
-		os_event_wait_time_low(event, 1000000, sig_count);
-		sig_count = os_event_reset(event);
-
-		if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
-			break;
-		}
-
-		lock_wait_mutex_enter();
-
-		/* Check all slots for user threads that are waiting
-	       	on locks, and if they have exceeded the time limit. */
-
-		for (slot = lock_sys.waiting_threads;
-		     slot < lock_sys.last_slot;
-		     ++slot) {
-
-			/* We are doing a read without the lock mutex
-			and/or the trx mutex. This is OK because a slot
-		       	can't be freed or reserved without the lock wait
-		       	mutex. */
-
-			if (slot->in_use) {
-				lock_wait_check_and_cancel(slot);
-			}
-		}
-
-		sig_count = os_event_reset(event);
-
-		lock_wait_mutex_exit();
-
-	} while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
-
-	lock_sys.timeout_thread_active = false;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
+  lock_wait_mutex_enter();
+
+  /* Check all slots for user threads that are waiting
+  on locks, and if they have exceeded the time limit. */
+  bool any_slot_in_use= false;
+  for (srv_slot_t *slot= lock_sys.waiting_threads;
+       slot < lock_sys.last_slot; ++slot)
+  {
+    /* We are doing a read without the lock mutex and/or the trx
+    mutex. This is OK because a slot can't be freed or reserved
+    without the lock wait mutex. */
+    if (slot->in_use)
+    {
+      any_slot_in_use= true;
+      lock_wait_check_and_cancel(slot);
+    }
+  }
+
+  if (any_slot_in_use)
+    lock_sys.timeout_timer->set_time(1000, 0);
+  else
+    lock_sys.timeout_timer_active= false;
+
+  lock_wait_mutex_exit();
 }
-
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
index 36cfe259526..dbf41c7dc3f 100644
--- a/storage/innobase/log/log0crypt.cc
+++ b/storage/innobase/log/log0crypt.cc
@@ -25,12 +25,11 @@ Modified           Jan Lindström jan.lindstrom@mariadb.com
 MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
 *******************************************************/
 #include <my_global.h>
-#include "m_string.h"
 #include "log0crypt.h"
 #include <mysql/service_my_crypt.h>
+#include "assume_aligned.h"
 
 #include "log0crypt.h"
-#include "srv0start.h" // for srv_start_lsn
 #include "log0recv.h"  // for recv_sys
 
 /** innodb_encrypt_log: whether to encrypt the redo log */
@@ -39,22 +38,15 @@ my_bool srv_encrypt_log;
 /** Redo log encryption key ID */
 #define LOG_DEFAULT_ENCRYPTION_KEY 1
 
-struct aes_block_t {
-	byte		bytes[MY_AES_BLOCK_SIZE];
-};
-
 struct crypt_info_t {
 	ulint		checkpoint_no; /*!< checkpoint no; 32 bits */
 	uint		key_version;   /*!< mysqld key version */
 	/** random string for encrypting the key */
-	aes_block_t	crypt_msg;
+	alignas(8) byte	crypt_msg[MY_AES_BLOCK_SIZE];
 	/** the secret key */
-	aes_block_t	crypt_key;
+	alignas(8) byte crypt_key[MY_AES_BLOCK_SIZE];
 	/** a random string for the per-block initialization vector */
-	union {
-		uint32_t	word;
-		byte		bytes[4];
-	} crypt_nonce;
+	alignas(4) byte	crypt_nonce[4];
 };
 
 /** The crypt info */
@@ -93,7 +85,7 @@ static bool init_crypt_key(crypt_info_t* info, bool upgrade = false)
 	byte	mysqld_key[MY_AES_MAX_KEY_LENGTH];
 	uint	keylen = sizeof mysqld_key;
 
-	compile_time_assert(16 == sizeof info->crypt_key.bytes);
+	compile_time_assert(16 == sizeof info->crypt_key);
 	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
 
 	if (uint rc = encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY,
@@ -117,8 +109,8 @@ static bool init_crypt_key(crypt_info_t* info, bool upgrade = false)
 	uint dst_len;
 	int err= my_aes_crypt(MY_AES_ECB,
 			      ENCRYPTION_FLAG_NOPAD | ENCRYPTION_FLAG_ENCRYPT,
-			      info->crypt_msg.bytes, MY_AES_BLOCK_SIZE,
-			      info->crypt_key.bytes, &dst_len,
+			      info->crypt_msg, MY_AES_BLOCK_SIZE,
+			      info->crypt_key, &dst_len,
 			      mysqld_key, keylen, NULL, 0);
 
 	if (err != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) {
@@ -143,42 +135,34 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op)
 	ut_ad(ulint(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
 	ut_a(info.key_version);
 
-	uint32_t aes_ctr_iv[MY_AES_BLOCK_SIZE / sizeof(uint32_t)];
-	compile_time_assert(sizeof(uint32_t) == 4);
+	alignas(8) byte aes_ctr_iv[MY_AES_BLOCK_SIZE];
 
 #define LOG_CRYPT_HDR_SIZE 4
 	lsn &= ~lsn_t(OS_FILE_LOG_BLOCK_SIZE - 1);
 
 	for (const byte* const end = buf + size; buf != end;
 	     buf += OS_FILE_LOG_BLOCK_SIZE, lsn += OS_FILE_LOG_BLOCK_SIZE) {
-		uint32_t dst[(OS_FILE_LOG_BLOCK_SIZE - LOG_CRYPT_HDR_SIZE
-			      - LOG_BLOCK_CHECKSUM)
-			     / sizeof(uint32_t)];
+		alignas(4) byte dst[OS_FILE_LOG_BLOCK_SIZE - LOG_CRYPT_HDR_SIZE
+				    - LOG_BLOCK_CHECKSUM];
 
 		/* The log block number is not encrypted. */
-		*aes_ctr_iv =
-#ifdef WORDS_BIGENDIAN
-			~LOG_BLOCK_FLUSH_BIT_MASK
-#else
-			~(LOG_BLOCK_FLUSH_BIT_MASK >> 24)
-#endif
-			& (*dst = *reinterpret_cast<const uint32_t*>(
-				   buf + LOG_BLOCK_HDR_NO));
-#if LOG_BLOCK_HDR_NO + 4 != LOG_CRYPT_HDR_SIZE
-# error "LOG_BLOCK_HDR_NO has been moved; redo log format affected!"
-#endif
-		aes_ctr_iv[1] = info.crypt_nonce.word;
-		mach_write_to_8(reinterpret_cast<byte*>(aes_ctr_iv + 2), lsn);
+		memcpy_aligned<4>(dst, buf + LOG_BLOCK_HDR_NO, 4);
+		memcpy_aligned<4>(aes_ctr_iv, buf + LOG_BLOCK_HDR_NO, 4);
+		*aes_ctr_iv &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+		static_assert(LOG_BLOCK_HDR_NO + 4 == LOG_CRYPT_HDR_SIZE,
+			      "compatibility");
+		memcpy_aligned<4>(aes_ctr_iv + 4, info.crypt_nonce, 4);
+		mach_write_to_8(my_assume_aligned<8>(aes_ctr_iv + 8), lsn);
 		ut_ad(log_block_get_start_lsn(lsn,
 					      log_block_get_hdr_no(buf))
 		      == lsn);
 		byte* key_ver = &buf[OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_KEY
 				     - LOG_BLOCK_CHECKSUM];
-		const uint dst_size
-			= log_sys.log.format == log_t::FORMAT_ENC_10_4
+		const size_t dst_size
+			= log_sys.has_encryption_key_rotation()
 			? sizeof dst - LOG_BLOCK_KEY
 			: sizeof dst;
-		if (log_sys.log.format == log_t::FORMAT_ENC_10_4) {
+		if (log_sys.has_encryption_key_rotation()) {
 			const uint key_version = info.key_version;
 			switch (op) {
 			case LOG_ENCRYPT_ROTATE_KEY:
@@ -214,11 +198,11 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op)
 
 		uint dst_len;
 		int rc = encryption_crypt(
-			buf + LOG_CRYPT_HDR_SIZE, dst_size,
+			buf + LOG_CRYPT_HDR_SIZE, static_cast<uint>(dst_size),
 			reinterpret_cast<byte*>(dst), &dst_len,
-			const_cast<byte*>(info.crypt_key.bytes),
+			const_cast<byte*>(info.crypt_key),
 			MY_AES_BLOCK_SIZE,
-			reinterpret_cast<byte*>(aes_ctr_iv), sizeof aes_ctr_iv,
+			aes_ctr_iv, sizeof aes_ctr_iv,
 			op == LOG_DECRYPT
 			? ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD
 			: ENCRYPTION_FLAG_ENCRYPT | ENCRYPTION_FLAG_NOPAD,
@@ -238,37 +222,31 @@ The random parameters will be persisted in the log checkpoint pages.
 @see log_crypt_write_checkpoint_buf()
 @see log_crypt_read_checkpoint_buf()
 @return whether the operation succeeded */
-UNIV_INTERN
-bool
-log_crypt_init()
+bool log_crypt_init()
 {
-	info.key_version = encryption_key_get_latest_version(
-		LOG_DEFAULT_ENCRYPTION_KEY);
-
-	if (info.key_version == ENCRYPTION_KEY_VERSION_INVALID) {
-		ib::error() << "innodb_encrypt_log: cannot get key version";
-		info.key_version = 0;
-		return false;
-	}
-
-	if (my_random_bytes(tmp_iv, MY_AES_BLOCK_SIZE) != MY_AES_OK
-	    || my_random_bytes(info.crypt_msg.bytes, sizeof info.crypt_msg)
-	    != MY_AES_OK
-	    || my_random_bytes(info.crypt_nonce.bytes, sizeof info.crypt_nonce)
-	    != MY_AES_OK) {
-		ib::error() << "innodb_encrypt_log: my_random_bytes() failed";
-		return false;
-	}
-
-	return init_crypt_key(&info);
+  info.key_version=
+    encryption_key_get_latest_version(LOG_DEFAULT_ENCRYPTION_KEY);
+
+  if (info.key_version == ENCRYPTION_KEY_VERSION_INVALID)
+    ib::error() << "log_crypt_init(): cannot get key version";
+  else if (my_random_bytes(tmp_iv, MY_AES_BLOCK_SIZE) != MY_AES_OK ||
+           my_random_bytes(info.crypt_msg, sizeof info.crypt_msg) !=
+           MY_AES_OK ||
+           my_random_bytes(info.crypt_nonce, sizeof info.crypt_nonce) !=
+           MY_AES_OK)
+    ib::error() << "log_crypt_init(): my_random_bytes() failed";
+  else if (init_crypt_key(&info))
+    goto func_exit;
+
+  info.key_version= 0;
+func_exit:
+  return info.key_version != 0;
 }
 
 /** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
 @param[in]	buf	checkpoint buffer
 @return	whether the operation was successful */
-UNIV_INTERN
-bool
-log_crypt_101_read_checkpoint(const byte* buf)
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf)
 {
 	buf += 20 + 32 * 9;
 
@@ -284,15 +262,14 @@ log_crypt_101_read_checkpoint(const byte* buf)
 			}
 		}
 		if (infos_used >= UT_ARR_SIZE(infos)) {
-			ut_ad(!"too many checkpoint pages");
+			ut_ad("too many checkpoint pages" == 0);
 			goto next_slot;
 		}
 		infos_used++;
 		info.checkpoint_no = checkpoint_no;
 		info.key_version = mach_read_from_4(buf + 4);
-		memcpy(info.crypt_msg.bytes, buf + 8, MY_AES_BLOCK_SIZE);
-		memcpy(info.crypt_nonce.bytes, buf + 24,
-		       sizeof info.crypt_nonce);
+		memcpy(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE);
+		memcpy(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce);
 
 		if (!init_crypt_key(&info, true)) {
 			return false;
@@ -305,14 +282,11 @@ next_slot:
 }
 
 /** Decrypt a MariaDB 10.1 redo log block.
-@param[in,out]	buf	log block
+@param[in,out]	buf		log block
+@param[in]	start_lsn	server start LSN
 @return	whether the decryption was successful */
-UNIV_INTERN
-bool
-log_crypt_101_read_block(byte* buf)
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn)
 {
-	ut_ad(log_block_calc_checksum_format_0(buf)
-	      != log_block_get_checksum(buf));
 	const uint32_t checkpoint_no
 		= uint32_t(log_block_get_checkpoint_no(buf));
 	const crypt_info_t* info = infos;
@@ -346,16 +320,16 @@ found:
 	/* The log block header is not encrypted. */
 	memcpy(dst, buf, LOG_BLOCK_HDR_SIZE);
 
-	memcpy(aes_ctr_iv, info->crypt_nonce.bytes, 3);
+	memcpy(aes_ctr_iv, info->crypt_nonce, 3);
 	mach_write_to_8(aes_ctr_iv + 3,
-			log_block_get_start_lsn(srv_start_lsn, log_block_no));
+			log_block_get_start_lsn(start_lsn, log_block_no));
 	memcpy(aes_ctr_iv + 11, buf, 4);
-	aes_ctr_iv[11] &= ~(LOG_BLOCK_FLUSH_BIT_MASK >> 24);
+	aes_ctr_iv[11] &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
 	aes_ctr_iv[15] = 0;
 
 	int rc = encryption_crypt(buf + LOG_BLOCK_HDR_SIZE, src_len,
 				  dst + LOG_BLOCK_HDR_SIZE, &dst_len,
-				  const_cast<byte*>(info->crypt_key.bytes),
+				  const_cast<byte*>(info->crypt_key),
 				  MY_AES_BLOCK_SIZE,
 				  aes_ctr_iv, MY_AES_BLOCK_SIZE,
 				  ENCRYPTION_FLAG_DECRYPT
@@ -378,15 +352,15 @@ void
 log_crypt_write_checkpoint_buf(byte* buf)
 {
 	ut_ad(info.key_version);
-	compile_time_assert(16 == sizeof info.crypt_msg.bytes);
+	compile_time_assert(16 == sizeof info.crypt_msg);
 	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
 	compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE
 			    - LOG_CHECKPOINT_CRYPT_NONCE
 			    == sizeof info.crypt_nonce);
 
-	memcpy(buf + LOG_CHECKPOINT_CRYPT_MESSAGE, info.crypt_msg.bytes,
+	memcpy(buf + LOG_CHECKPOINT_CRYPT_MESSAGE, info.crypt_msg,
 	       MY_AES_BLOCK_SIZE);
-	memcpy(buf + LOG_CHECKPOINT_CRYPT_NONCE, info.crypt_nonce.bytes,
+	memcpy(buf + LOG_CHECKPOINT_CRYPT_NONCE, info.crypt_nonce,
 	       sizeof info.crypt_nonce);
 	mach_write_to_4(buf + LOG_CHECKPOINT_CRYPT_KEY, info.key_version);
 }
@@ -394,9 +368,7 @@ log_crypt_write_checkpoint_buf(byte* buf)
 /** Read the checkpoint crypto (version, msg and iv) info.
 @param[in]	buf	checkpoint buffer
 @return	whether the operation was successful */
-UNIV_INTERN
-bool
-log_crypt_read_checkpoint_buf(const byte* buf)
+bool log_crypt_read_checkpoint_buf(const byte* buf)
 {
 	info.checkpoint_no = mach_read_from_4(buf + (LOG_CHECKPOINT_NO + 4));
 	info.key_version = mach_read_from_4(buf + LOG_CHECKPOINT_CRYPT_KEY);
@@ -404,15 +376,15 @@ log_crypt_read_checkpoint_buf(const byte* buf)
 #if MY_AES_BLOCK_SIZE != 16
 # error "MY_AES_BLOCK_SIZE != 16; redo log checkpoint format affected"
 #endif
-	compile_time_assert(16 == sizeof info.crypt_msg.bytes);
+	compile_time_assert(16 == sizeof info.crypt_msg);
 	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
 	compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE
 			    - LOG_CHECKPOINT_CRYPT_NONCE
 			    == sizeof info.crypt_nonce);
 
-	memcpy(info.crypt_msg.bytes, buf + LOG_CHECKPOINT_CRYPT_MESSAGE,
+	memcpy(info.crypt_msg, buf + LOG_CHECKPOINT_CRYPT_MESSAGE,
 	       MY_AES_BLOCK_SIZE);
-	memcpy(info.crypt_nonce.bytes, buf + LOG_CHECKPOINT_CRYPT_NONCE,
+	memcpy(info.crypt_nonce, buf + LOG_CHECKPOINT_CRYPT_NONCE,
 	       sizeof info.crypt_nonce);
 
 	return init_crypt_key(&info);
@@ -441,7 +413,7 @@ log_tmp_block_encrypt(
 
 	int rc = encryption_crypt(
 		src, uint(size), dst, &dst_len,
-		const_cast<byte*>(info.crypt_key.bytes), MY_AES_BLOCK_SIZE,
+		const_cast<byte*>(info.crypt_key), MY_AES_BLOCK_SIZE,
 		reinterpret_cast<byte*>(iv), uint(sizeof iv),
 		encrypt
 		? ENCRYPTION_FLAG_ENCRYPT|ENCRYPTION_FLAG_NOPAD
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index bf75b3b7c86..39d606c120a 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -37,13 +37,11 @@ Created 12/9/1995 Heikki Tuuri
 
 #include "log0log.h"
 #include "log0crypt.h"
-#include "mem0mem.h"
 #include "buf0buf.h"
 #include "buf0flu.h"
 #include "lock0lock.h"
 #include "log0recv.h"
 #include "fil0fil.h"
-#include "dict0boot.h"
 #include "dict0stats_bg.h"
 #include "btr0defragment.h"
 #include "srv0srv.h"
@@ -53,51 +51,19 @@ Created 12/9/1995 Heikki Tuuri
 #include "trx0roll.h"
 #include "srv0mon.h"
 #include "sync0sync.h"
+#include "buf0dump.h"
+#include "log0sync.h"
 
 /*
 General philosophy of InnoDB redo-logs:
 
-1) Every change to a contents of a data page must be done
-through mtr, which in mtr_commit() writes log records
-to the InnoDB redo log.
-
-2) Normally these changes are performed using a mlog_write_ulint()
-or similar function.
-
-3) In some page level operations only a code number of a
-c-function and its parameters are written to the log to
-reduce the size of the log.
-
-  3a) You should not add parameters to these kind of functions
-  (e.g. trx_undo_header_create())
-
-  3b) You should not add such functionality which either change
-  working when compared with the old or are dependent on data
-  outside of the page. These kind of functions should implement
-  self-contained page transformation and it should be unchanged
-  if you don't have very essential reasons to change log
-  semantics or format.
-
-*/
+Every change to the contents of a data page must be done
+through mtr_t, and mtr_t::commit() will write log records
+to the InnoDB redo log. */
 
 /** Redo log system */
 log_t	log_sys;
 
-/** Whether to require checksums on the redo log pages */
-my_bool	innodb_log_checksums;
-
-/* Next log block number to do dummy record filling if no log records written
-for a while */
-static ulint		next_lbn_to_pad = 0;
-
-/* These control how often we print warnings if the last checkpoint is too
-old */
-static bool	log_has_printed_chkp_warning = false;
-static time_t	log_last_warning_time;
-
-static bool	log_has_printed_chkp_margine_warning = false;
-static time_t	log_last_margine_warning_time;
-
 /* A margin for free space in the log buffer before a log entry is catenated */
 #define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)
 
@@ -106,70 +72,21 @@ static time_t	log_last_margine_warning_time;
 #define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN		\
 				 + (4U << srv_page_size_shift))
 
-/* This parameter controls asynchronous making of a new checkpoint; the value
-should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
-
-#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32
-
-/* This parameter controls synchronous preflushing of modified buffer pages */
-#define LOG_POOL_PREFLUSH_RATIO_SYNC	16
-
-/* The same ratio for asynchronous preflushing; this value should be less than
-the previous */
-#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8
-
-/* Codes used in unlocking flush latches */
-#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
-#define LOG_UNLOCK_FLUSH_LOCK		2
-
-/** Event to wake up log_scrub_thread */
-os_event_t	log_scrub_event;
-/** Whether log_scrub_thread is active */
-bool		log_scrub_thread_active;
-
-extern "C" UNIV_INTERN
-os_thread_ret_t
-DECLARE_THREAD(log_scrub_thread)(void*);
-
-/****************************************************************//**
-Returns the oldest modified block lsn in the pool, or log_sys.lsn if none
-exists.
-@return LSN of oldest modification */
-static
-lsn_t
-log_buf_pool_get_oldest_modification(void)
-/*======================================*/
-{
-	lsn_t	lsn;
-
-	ut_ad(log_mutex_own());
-
-	lsn = buf_pool_get_oldest_modification();
-
-	if (!lsn) {
-
-		lsn = log_sys.lsn;
-	}
-
-	return(lsn);
-}
-
 /** Extends the log buffer.
 @param[in]	len	requested minimum size in bytes */
 void log_buffer_extend(ulong len)
 {
-	const ulong new_buf_size = ut_calc_align(len, srv_page_size);
-	byte* new_buf = static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
-	TRASH_ALLOC(new_buf, new_buf_size);
-	byte* new_flush_buf =
-		static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
-	TRASH_ALLOC(new_flush_buf, new_buf_size);
+	const size_t new_buf_size = ut_calc_align(len, srv_page_size);
+	byte* new_buf = static_cast<byte*>
+		(ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME));
+	byte* new_flush_buf = static_cast<byte*>
+		(ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME));
 
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
 
 	if (len <= srv_log_buffer_size) {
 		/* Already extended enough by the others */
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 		ut_free_dodump(new_buf, new_buf_size);
 		ut_free_dodump(new_flush_buf, new_buf_size);
 		return;
@@ -182,16 +99,16 @@ void log_buffer_extend(ulong len)
 	byte* old_buf = log_sys.buf;
 	byte* old_flush_buf = log_sys.flush_buf;
 	const ulong old_buf_size = srv_log_buffer_size;
-
-	srv_log_buffer_size = new_buf_size;
+	srv_log_buffer_size = static_cast<ulong>(new_buf_size);
 	log_sys.buf = new_buf;
 	log_sys.flush_buf = new_flush_buf;
-	memcpy(new_buf, old_buf, log_sys.buf_free);
+	memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(new_buf, old_buf,
+					       log_sys.buf_free);
 
 	log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
 		- LOG_BUF_FLUSH_MARGIN;
 
-	log_mutex_exit();
+	mysql_mutex_unlock(&log_sys.mutex);
 
 	ut_free_dodump(old_buf, old_buf_size);
 	ut_free_dodump(old_flush_buf, old_buf_size);
@@ -200,279 +117,8 @@ void log_buffer_extend(ulong len)
 		<< new_buf_size << ".";
 }
 
-/** Calculate actual length in redo buffer and file including
-block header and trailer.
-@param[in]	len	length to write
-@return actual length to write including header and trailer. */
-static inline
-ulint
-log_calculate_actual_len(
-	ulint len)
-{
-	ut_ad(log_mutex_own());
-
-	const ulint	framing_size = log_sys.framing_size();
-	/* actual length stored per block */
-	const ulint	len_per_blk = OS_FILE_LOG_BLOCK_SIZE - framing_size;
-
-	/* actual data length in last block already written */
-	ulint	extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE);
-
-	ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
-	extra_len -= LOG_BLOCK_HDR_SIZE;
-
-	/* total extra length for block header and trailer */
-	extra_len = ((len + extra_len) / len_per_blk) * framing_size;
-
-	return(len + extra_len);
-}
-
-/** Check margin not to overwrite transaction log from the last checkpoint.
-If would estimate the log write to exceed the log_group_capacity,
-waits for the checkpoint is done enough.
-@param[in]	len	length of the data to be written */
-
-void
-log_margin_checkpoint_age(
-	ulint	len)
-{
-	ulint	margin = log_calculate_actual_len(len);
-
-	ut_ad(log_mutex_own());
-
-	if (margin > log_sys.log_group_capacity) {
-		/* return with warning output to avoid deadlock */
-		if (!log_has_printed_chkp_margine_warning
-		    || difftime(time(NULL),
-				log_last_margine_warning_time) > 15) {
-			log_has_printed_chkp_margine_warning = true;
-			log_last_margine_warning_time = time(NULL);
-
-			ib::error() << "The transaction log files are too"
-				" small for the single transaction log (size="
-				<< len << "). So, the last checkpoint age"
-				" might exceed the log group capacity "
-				<< log_sys.log_group_capacity << ".";
-		}
-
-		return;
-	}
-
-	/* Our margin check should ensure that we never reach this condition.
-	Try to do checkpoint once. We cannot keep waiting here as it might
-	result in hang in case the current mtr has latch on oldest lsn */
-	if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin
-	    > log_sys.log_group_capacity) {
-		/* The log write of 'len' might overwrite the transaction log
-		after the last checkpoint. Makes checkpoint. */
-
-		bool	flushed_enough = false;
-
-		if (log_sys.lsn - log_buf_pool_get_oldest_modification()
-		    + margin
-		    <= log_sys.log_group_capacity) {
-			flushed_enough = true;
-		}
-
-		log_sys.check_flush_or_checkpoint = true;
-		log_mutex_exit();
-
-		DEBUG_SYNC_C("margin_checkpoint_age_rescue");
-
-		if (!flushed_enough) {
-			os_thread_sleep(100000);
-		}
-		log_checkpoint(true);
-
-		log_mutex_enter();
-	}
-
-	return;
-}
-
-/** Open the log for log_write_low. The log must be closed with log_close.
-@param[in]	len	length of the data to be written
-@return start lsn of the log record */
-lsn_t
-log_reserve_and_open(
-	ulint	len)
-{
-	ulint	len_upper_limit;
-#ifdef UNIV_DEBUG
-	ulint	count			= 0;
-#endif /* UNIV_DEBUG */
-
-loop:
-	ut_ad(log_mutex_own());
-
-	/* Calculate an upper limit for the space the string may take in the
-	log buffer */
-
-	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
-			  + (5 * len) / 4;
-
-	if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) {
-		log_mutex_exit();
-
-		DEBUG_SYNC_C("log_buf_size_exceeded");
-
-		/* Not enough free space, do a write of the log buffer */
-		log_buffer_sync_in_background(false);
-
-		srv_stats.log_waits.inc();
-
-		ut_ad(++count < 50);
-
-		log_mutex_enter();
-		goto loop;
-	}
-
-	return(log_sys.lsn);
-}
-
-/************************************************************//**
-Writes to the log the string given. It is assumed that the caller holds the
-log mutex. */
-void
-log_write_low(
-/*==========*/
-	const byte*	str,		/*!< in: string */
-	ulint		str_len)	/*!< in: string length */
-{
-	ulint	len;
-
-	ut_ad(log_mutex_own());
-	const ulint trailer_offset = log_sys.trailer_offset();
-part_loop:
-	/* Calculate a part length */
-
-	ulint data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
-
-	if (data_len <= trailer_offset) {
-
-		/* The string fits within the current log block */
-
-		len = str_len;
-	} else {
-		data_len = trailer_offset;
-
-		len = trailer_offset
-			- log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
-	}
-
-	memcpy(log_sys.buf + log_sys.buf_free, str, len);
-
-	str_len -= len;
-	str = str + len;
-
-	byte* log_block = static_cast<byte*>(
-		ut_align_down(log_sys.buf + log_sys.buf_free,
-			      OS_FILE_LOG_BLOCK_SIZE));
-
-	log_block_set_data_len(log_block, data_len);
-
-	if (data_len == trailer_offset) {
-		/* This block became full */
-		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
-		log_block_set_checkpoint_no(log_block,
-					    log_sys.next_checkpoint_no);
-		len += log_sys.framing_size();
-
-		log_sys.lsn += len;
-
-		/* Initialize the next block header */
-		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE,
-			       log_sys.lsn);
-	} else {
-		log_sys.lsn += len;
-	}
-
-	log_sys.buf_free += ulong(len);
-
-	ut_ad(log_sys.buf_free <= srv_log_buffer_size);
-
-	if (str_len > 0) {
-		goto part_loop;
-	}
-
-	srv_stats.log_write_requests.inc();
-}
-
-/************************************************************//**
-Closes the log.
-@return lsn */
-lsn_t
-log_close(void)
-/*===========*/
-{
-	byte*		log_block;
-	ulint		first_rec_group;
-	lsn_t		oldest_lsn;
-	lsn_t		lsn;
-	lsn_t		checkpoint_age;
-
-	ut_ad(log_mutex_own());
-
-	lsn = log_sys.lsn;
-
-	log_block = static_cast<byte*>(
-		ut_align_down(log_sys.buf + log_sys.buf_free,
-			      OS_FILE_LOG_BLOCK_SIZE));
-
-	first_rec_group = log_block_get_first_rec_group(log_block);
-
-	if (first_rec_group == 0) {
-		/* We initialized a new log block which was not written
-		full by the current mtr: the next mtr log record group
-		will start within this block at the offset data_len */
-
-		log_block_set_first_rec_group(
-			log_block, log_block_get_data_len(log_block));
-	}
-
-	if (log_sys.buf_free > log_sys.max_buf_free) {
-		log_sys.check_flush_or_checkpoint = true;
-	}
-
-	checkpoint_age = lsn - log_sys.last_checkpoint_lsn;
-
-	if (checkpoint_age >= log_sys.log_group_capacity) {
-		DBUG_EXECUTE_IF(
-			"print_all_chkp_warnings",
-			log_has_printed_chkp_warning = false;);
-
-		if (!log_has_printed_chkp_warning
-		    || difftime(time(NULL), log_last_warning_time) > 15) {
-
-			log_has_printed_chkp_warning = true;
-			log_last_warning_time = time(NULL);
-
-			ib::error() << "The age of the last checkpoint is "
-				<< checkpoint_age << ", which exceeds the log"
-				" group capacity "
-				<< log_sys.log_group_capacity
-				<< ".";
-		}
-	}
-
-	if (checkpoint_age <= log_sys.max_modified_age_sync) {
-		goto function_exit;
-	}
-
-	oldest_lsn = buf_pool_get_oldest_modification();
-
-	if (!oldest_lsn
-	    || lsn - oldest_lsn > log_sys.max_modified_age_sync
-	    || checkpoint_age > log_sys.max_checkpoint_age_async) {
-		log_sys.check_flush_or_checkpoint = true;
-	}
-function_exit:
-
-	return(lsn);
-}
-
 /** Calculate the recommended highest values for lsn - last_checkpoint_lsn
-and lsn - buf_get_oldest_modification().
+and lsn - buf_pool.get_oldest_modification().
 @param[in]	file_size	requested innodb_log_file_size
 @retval true on success
 @retval false if the smallest log group is too small to
@@ -480,11 +126,16 @@ accommodate the number of OS threads in the database server */
 bool
 log_set_capacity(ulonglong file_size)
 {
+	/* Margin for the free space in the smallest log, before a new query
+	step which modifies the database, is started */
+	const size_t LOG_CHECKPOINT_FREE_PER_THREAD = 4U
+						      << srv_page_size_shift;
+	const size_t LOG_CHECKPOINT_EXTRA_FREE = 8U << srv_page_size_shift;
+
 	lsn_t		margin;
 	ulint		free;
 
-	lsn_t smallest_capacity = (file_size - LOG_FILE_HDR_SIZE)
-		* srv_n_log_files;
+	lsn_t smallest_capacity = file_size - LOG_FILE_HDR_SIZE;
 	/* Add extra safety */
 	smallest_capacity -= smallest_capacity / 10;
 
@@ -493,35 +144,27 @@ log_set_capacity(ulonglong file_size)
 	by single query steps: running out of free log space is a serious
 	system error which requires rebooting the database. */
 
-	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
+	free = LOG_CHECKPOINT_FREE_PER_THREAD * 10
 		+ LOG_CHECKPOINT_EXTRA_FREE;
 	if (free >= smallest_capacity / 2) {
-		ib::error() << "Cannot continue operation. ib_logfiles are too"
-			" small for innodb_thread_concurrency="
-			<< srv_thread_concurrency << ". The combined size of"
-			" ib_logfiles should be bigger than"
-			" 200 kB * innodb_thread_concurrency. "
-			<< INNODB_PARAMETERS_MSG;
-		return(false);
+		ib::error() << "Cannot continue operation because log file is "
+			       "too small. Increase innodb_log_file_size "
+			       "or decrease innodb_thread_concurrency. "
+			    << INNODB_PARAMETERS_MSG;
+		return false;
 	}
 
 	margin = smallest_capacity - free;
 	margin = margin - margin / 10;	/* Add still some extra safety */
 
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
 
-	log_sys.log_group_capacity = smallest_capacity;
+	log_sys.log_capacity = smallest_capacity;
 
-	log_sys.max_modified_age_async = margin
-		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
-	log_sys.max_modified_age_sync = margin
-		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
-
-	log_sys.max_checkpoint_age_async = margin - margin
-		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
+	log_sys.max_modified_age_async = margin - margin / 8;
 	log_sys.max_checkpoint_age = margin;
 
-	log_mutex_exit();
+	mysql_mutex_unlock(&log_sys.mutex);
 
 	return(true);
 }
@@ -533,145 +176,381 @@ void log_t::create()
   ut_ad(!is_initialised());
   m_initialised= true;
 
-  mutex_create(LATCH_ID_LOG_SYS, &mutex);
-  mutex_create(LATCH_ID_LOG_WRITE, &write_mutex);
-  mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex);
+  mysql_mutex_init(log_sys_mutex_key, &mutex, nullptr);
+  mysql_mutex_init(log_flush_order_mutex_key, &flush_order_mutex, nullptr);
 
   /* Start the lsn from one log block from zero: this way every
   log record has a non-zero start lsn, a fact which we will use */
 
-  lsn= LOG_START_LSN;
+  set_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE);
+  set_flushed_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE);
 
   ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
   ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);
 
-  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
+  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size,
+                                             PSI_INSTRUMENT_ME));
   TRASH_ALLOC(buf, srv_log_buffer_size);
-  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
+  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size,
+                                                   PSI_INSTRUMENT_ME));
   TRASH_ALLOC(flush_buf, srv_log_buffer_size);
 
   max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
     LOG_BUF_FLUSH_MARGIN;
-  check_flush_or_checkpoint= true;
+  set_check_flush_or_checkpoint();
 
   n_log_ios_old= n_log_ios;
   last_printout_time= time(NULL);
 
   buf_next_to_write= 0;
-  write_lsn= lsn;
-  flushed_to_disk_lsn= 0;
-  n_pending_flushes= 0;
-  flush_event = os_event_create("log_flush_event");
-  os_event_set(flush_event);
+  last_checkpoint_lsn= write_lsn= LOG_START_LSN;
   n_log_ios= 0;
   n_log_ios_old= 0;
-  log_group_capacity= 0;
+  log_capacity= 0;
   max_modified_age_async= 0;
-  max_modified_age_sync= 0;
-  max_checkpoint_age_async= 0;
   max_checkpoint_age= 0;
   next_checkpoint_no= 0;
   next_checkpoint_lsn= 0;
-  append_on_checkpoint= NULL;
-  n_pending_checkpoint_writes= 0;
-
-  last_checkpoint_lsn= lsn;
-  rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK);
+  checkpoint_pending= false;
 
-  log_block_init(buf, lsn);
+  log_block_init(buf, LOG_START_LSN);
   log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);
 
   buf_free= LOG_BLOCK_HDR_SIZE;
-  lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
+  checkpoint_buf= static_cast<byte*>
+    (aligned_malloc(OS_FILE_LOG_BLOCK_SIZE, OS_FILE_LOG_BLOCK_SIZE));
+}
+
+file_os_io::file_os_io(file_os_io &&rhs) : m_fd(rhs.m_fd)
+{ /* NOTE(review): m_durable_writes is not copied from rhs - confirm */
+  rhs.m_fd= OS_FILE_CLOSED; /* rhs gives up the handle it handed to us */
+}
+
+file_os_io &file_os_io::operator=(file_os_io &&rhs)
+{
+  std::swap(m_fd, rhs.m_fd); /* rhs is left holding our previous handle */
+  return *this; /* NOTE(review): m_durable_writes is not exchanged - confirm */
+}
 
-  MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn);
+file_os_io::~file_os_io() noexcept
+{
+  if (is_opened())
+    close();
+}
+
+dberr_t file_os_io::open(const char *path, bool read_only) noexcept
+{
+  ut_ad(!is_opened());
+
+  bool success;
+  auto tmp_fd= os_file_create(
+      innodb_log_file_key, path, OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+      OS_FILE_NORMAL, OS_LOG_FILE, read_only, &success);
+  if (!success)
+    return DB_ERROR;
+  /* success is known to be true here, so only now adopt the handle. */
+  m_durable_writes= srv_file_flush_method == SRV_O_DSYNC;
+  m_fd= tmp_fd;
+  return DB_SUCCESS;
+}
+
+dberr_t file_os_io::rename(const char *old_path, const char *new_path) noexcept
+{
+  return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
+                                                                 : DB_ERROR;
+}
 
-  log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log;
-  if (log_scrub_thread_active) {
-    log_scrub_event= os_event_create("log_scrub_event");
-    os_thread_create(log_scrub_thread, NULL, NULL);
+dberr_t file_os_io::close() noexcept
+{
+  if (!os_file_close(m_fd))
+    return DB_ERROR;
+
+  m_fd= OS_FILE_CLOSED;
+  return DB_SUCCESS;
+}
+
+dberr_t file_os_io::read(os_offset_t offset, span<byte> buf) noexcept
+{
+  return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size());
+}
+
+dberr_t file_os_io::write(const char *path, os_offset_t offset,
+                          span<const byte> buf) noexcept
+{
+  return os_file_write(IORequestWrite, path, m_fd, buf.data(), offset,
+                       buf.size());
+}
+
+dberr_t file_os_io::flush() noexcept
+{
+  return os_file_flush(m_fd) ? DB_SUCCESS : DB_ERROR;
+}
+
+#ifdef HAVE_PMEM
+
+#include <libpmem.h>
+
+/** Non-copyable, non-movable owner of one memory-mapped file */
+class mapped_file_t
+{
+public:
+  mapped_file_t()= default;
+  mapped_file_t(const mapped_file_t &)= delete;
+  mapped_file_t &operator=(const mapped_file_t &)= delete;
+  mapped_file_t(mapped_file_t &&)= delete;
+  mapped_file_t &operator=(mapped_file_t &&)= delete;
+  ~mapped_file_t() noexcept;
+  /** Map the whole file (with MAP_SYNC if nvme); DB_ERROR on failure */
+  dberr_t map(const char *path, bool read_only= false,
+              bool nvme= false) noexcept;
+  dberr_t unmap() noexcept;
+  byte *data() noexcept { return m_area.data(); }
+  /** the mapped bytes; empty while nothing is mapped */
+private:
+  span<byte> m_area;
+};
+
+mapped_file_t::~mapped_file_t() noexcept
+{
+  if (!m_area.empty())
+    unmap();
+}
+
+dberr_t mapped_file_t::map(const char *path, bool read_only,
+                           bool nvme) noexcept
+{
+  auto fd= mysql_file_open(innodb_log_file_key, path,
+                           read_only ? O_RDONLY : O_RDWR, MYF(MY_WME));
+  if (fd == -1)
+    return DB_ERROR;
+  /* NOTE(review): the size lookup below is not checked for failure - confirm */
+  const auto file_size= size_t{os_file_get_size(path).m_total_size};
+  /* MAP_SYNC is only valid together with MAP_SHARED_VALIDATE (see below). */
+  const int nvme_flag= nvme ? MAP_SYNC : 0;
+  void *ptr=
+      my_mmap(0, file_size, read_only ? PROT_READ : PROT_READ | PROT_WRITE,
+              MAP_SHARED_VALIDATE | nvme_flag, fd, 0);
+  mysql_file_close(fd, MYF(MY_WME)); /* a mapping outlives its descriptor */
+
+  if (ptr == MAP_FAILED)
+    return DB_ERROR;
+
+  m_area= {static_cast<byte *>(ptr), file_size};
+  return DB_SUCCESS;
+}
+
+dberr_t mapped_file_t::unmap() noexcept
+{
+  ut_ad(!m_area.empty());
+
+  if (my_munmap(m_area.data(), m_area.size()))
+    return DB_ERROR;
+
+  m_area= {};
+  return DB_SUCCESS;
+}
+
+static bool is_pmem(const char *path) noexcept
+{
+  mapped_file_t mf; /* probe only: ~mapped_file_t() unmaps on return */
+  return mf.map(path, true, true) == DB_SUCCESS;
+}
+
+class file_pmem_io final : public file_io
+{
+public:
+  file_pmem_io() noexcept : file_io(true) {}
+
+  dberr_t open(const char *path, bool read_only) noexcept final
+  {
+    return m_file.map(path, read_only, true);
   }
+  dberr_t rename(const char *old_path, const char *new_path) noexcept final
+  {
+    return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
+                                                                   : DB_ERROR;
+  }
+  dberr_t close() noexcept final { return m_file.unmap(); }
+  dberr_t read(os_offset_t offset, span<byte> buf) noexcept final
+  {
+    memcpy(buf.data(), m_file.data() + offset, buf.size());
+    return DB_SUCCESS;
+  }
+  dberr_t write(const char *, os_offset_t offset,
+                span<const byte> buf) noexcept final
+  {
+    pmem_memcpy_persist(m_file.data() + offset, buf.data(), buf.size());
+    return DB_SUCCESS;
+  }
+  dberr_t flush() noexcept final
+  {
+    ut_ad(0);
+    return DB_SUCCESS;
+  }
+
+private:
+  mapped_file_t m_file;
+};
+#endif
+
+dberr_t log_file_t::open(bool read_only) noexcept
+{
+  ut_a(!is_opened());
+
+#ifdef HAVE_PMEM
+  auto ptr= is_pmem(m_path.c_str())
+                ? std::unique_ptr<file_io>(new file_pmem_io)
+                : std::unique_ptr<file_io>(new file_os_io);
+#else
+  auto ptr= std::unique_ptr<file_io>(new file_os_io);
+#endif
+
+  if (dberr_t err= ptr->open(m_path.c_str(), read_only))
+    return err;
+
+  m_file= std::move(ptr);
+  return DB_SUCCESS;
 }
 
-/** Initialize the redo log.
-@param[in]	n_files		number of files */
-void log_t::files::create(ulint n_files)
+bool log_file_t::is_opened() const noexcept
 {
-  ut_ad(n_files <= SRV_N_LOG_FILES_MAX);
-  ut_ad(this == &log_sys.log);
-  ut_ad(log_sys.is_initialised());
+  return static_cast<bool>(m_file);
+}
 
-  this->n_files= n_files;
-  format= srv_encrypt_log ? log_t::FORMAT_ENC_10_4 : log_t::FORMAT_10_4;
-  subformat= 2;
-  file_size= srv_log_file_size;
-  lsn= LOG_START_LSN;
-  lsn_offset= LOG_FILE_HDR_SIZE;
+dberr_t log_file_t::rename(std::string new_path) noexcept
+{
+  if (dberr_t err= m_file->rename(m_path.c_str(), new_path.c_str()))
+    return err;
+
+  m_path = std::move(new_path);
+  return DB_SUCCESS;
 }
 
-/******************************************************//**
-Writes a log file header to a log file space. */
-static
-void
-log_file_header_flush(
-	ulint		nth_file,	/*!< in: header to the nth file in the
-					log file space */
-	lsn_t		start_lsn)	/*!< in: log file data starts at this
-					lsn */
+dberr_t log_file_t::close() noexcept
 {
-	lsn_t	dest_offset;
+  ut_a(is_opened());
 
-	ut_ad(log_write_mutex_own());
-	ut_ad(!recv_no_log_write);
-	ut_a(nth_file < log_sys.log.n_files);
-	ut_ad(log_sys.log.format == log_t::FORMAT_10_4
-	      || log_sys.log.format == log_t::FORMAT_ENC_10_4);
+  if (dberr_t err= m_file->close())
+    return err;
 
-	// man 2 open suggests this buffer to be aligned by 512 for O_DIRECT
-	MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
-	byte buf[OS_FILE_LOG_BLOCK_SIZE] = {0};
+  m_file.reset();
+  return DB_SUCCESS;
+}
 
-	mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
-	mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
-	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
-	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
-	       LOG_HEADER_CREATOR_CURRENT);
-	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
-	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
-	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
+dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
+{
+  ut_ad(is_opened());
+  return m_file->read(offset, buf);
+}
 
-	dest_offset = nth_file * log_sys.log.file_size;
+bool log_file_t::writes_are_durable() const noexcept
+{
+  return m_file->writes_are_durable();
+}
 
-	DBUG_PRINT("ib_log", ("write " LSN_PF
-			      " file " ULINTPF " header",
-			      start_lsn, nth_file));
+dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
+{
+  ut_ad(is_opened());
+  return m_file->write(m_path.c_str(), offset, buf);
+}
 
-	log_sys.n_log_ios++;
+dberr_t log_file_t::flush() noexcept
+{
+  ut_ad(is_opened());
+  return m_file->flush();
+}
 
-	srv_stats.os_log_pending_writes.inc();
+void log_t::file::open_file(std::string path)
+{
+  fd= log_file_t(std::move(path));
+  if (const dberr_t err= fd.open(srv_read_only_mode))
+    ib::fatal() << "open(" << fd.get_path() << ") returned " << err;
+}
 
-	const ulint	page_no = ulint(dest_offset >> srv_page_size_shift);
+/** Update the log block checksum. */
+static void log_block_store_checksum(byte* block)
+{
+  log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
+}
 
-	fil_io(IORequestLogWrite, true,
-	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
-	       0,
-	       ulint(dest_offset & (srv_page_size - 1)),
-	       OS_FILE_LOG_BLOCK_SIZE, buf, NULL);
+void log_t::file::write_header_durable(lsn_t lsn)
+{
+  ut_ad(lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+  ut_ad(!recv_no_log_write);
+  ut_ad(log_sys.log.format == log_t::FORMAT_10_5 ||
+        log_sys.log.format == log_t::FORMAT_ENC_10_5);
+  /* Build the header block in log_sys.checkpoint_buf, zeroed first. */
+  byte *buf= log_sys.checkpoint_buf;
+  memset_aligned<OS_FILE_LOG_BLOCK_SIZE>(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
+  /* Fill in format, subformat, start LSN and creator; then checksum. */
+  mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
+  mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
+  mach_write_to_8(buf + LOG_HEADER_START_LSN, lsn);
+  strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
+         LOG_HEADER_CREATOR_CURRENT);
+  ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR >=
+        sizeof LOG_HEADER_CREATOR_CURRENT);
+  log_block_store_checksum(buf);
+
+  DBUG_PRINT("ib_log", ("write " LSN_PF, lsn));
+  /* The header goes to file offset 0; flush unless writes are durable. */
+  log_sys.log.write(0, {buf, OS_FILE_LOG_BLOCK_SIZE});
+  if (!log_sys.log.writes_are_durable())
+    log_sys.log.flush();
+}
 
-	srv_stats.os_log_pending_writes.dec();
+void log_t::file::read(os_offset_t offset, span<byte> buf)
+{
+  if (const dberr_t err= fd.read(offset, buf))
+    ib::fatal() << "read(" << fd.get_path() << ") returned "<< err;
 }
 
-/******************************************************//**
-Stores a 4-byte checksum to the trailer checksum field of a log block
-before writing it to a log file. This checksum is used in recovery to
-check the consistency of a log block. */
-static
-void
-log_block_store_checksum(
-/*=====================*/
-	byte*	block)	/*!< in/out: pointer to a log block */
+bool log_t::file::writes_are_durable() const noexcept
 {
-	log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
+  return fd.writes_are_durable();
+}
+
+void log_t::file::write(os_offset_t offset, span<byte> buf)
+{
+  srv_stats.os_log_pending_writes.inc();
+  if (const dberr_t err= fd.write(offset, buf))
+    ib::fatal() << "write(" << fd.get_path() << ") returned " << err;
+  srv_stats.os_log_pending_writes.dec();
+  srv_stats.os_log_written.add(buf.size());
+  srv_stats.log_writes.inc();
+  log_sys.n_log_ios++;
+}
+
+void log_t::file::flush()
+{
+  log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire);
+  if (const dberr_t err= fd.flush())
+    ib::fatal() << "flush(" << fd.get_path() << ") returned " << err;
+  log_sys.pending_flushes.fetch_sub(1, std::memory_order_release);
+  log_sys.flushes.fetch_add(1, std::memory_order_release);
+}
+
+void log_t::file::close_file()
+{
+  if (fd.is_opened())
+  {
+    if (const dberr_t err= fd.close())
+      ib::fatal() << "close(" << fd.get_path() << ") returned " << err;
+  }
+  fd.free();                                    // Free path
+}
+
+/** Initialize the redo log. */
+void log_t::file::create()
+{
+  ut_ad(this == &log_sys.log);
+  ut_ad(log_sys.is_initialised());
+
+  format= srv_encrypt_log ? log_t::FORMAT_ENC_10_5 : log_t::FORMAT_10_5;
+  subformat= 2;
+  file_size= srv_log_file_size;
+  lsn= LOG_START_LSN;
+  lsn_offset= LOG_FILE_HDR_SIZE;
 }
 
 /******************************************************//**
@@ -694,11 +573,10 @@ log_write_buf(
 					header */
 {
 	ulint		write_len;
-	bool		write_header	= new_data_offset == 0;
 	lsn_t		next_offset;
 	ulint		i;
 
-	ut_ad(log_write_mutex_own());
+	ut_ad(log_write_lock_own());
 	ut_ad(!recv_no_log_write);
 	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
 	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
@@ -711,19 +589,6 @@ loop:
 
 	next_offset = log_sys.log.calc_lsn_offset(start_lsn);
 
-	if (write_header
-	    && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) {
-		/* We start to write a new log file instance in the group */
-
-		ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX);
-
-		log_file_header_flush(
-			ulint(next_offset / log_sys.log.file_size), start_lsn);
-		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
-
-		srv_stats.log_writes.inc();
-	}
-
 	if ((next_offset % log_sys.log.file_size) + len
 	    > log_sys.log.file_size) {
 		/* if the above condition holds, then the below expression
@@ -764,55 +629,23 @@ loop:
 		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
 	}
 
-	log_sys.n_log_ios++;
-
-	srv_stats.os_log_pending_writes.inc();
-
-	ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX);
-
-	const ulint	page_no = ulint(next_offset >> srv_page_size_shift);
-
-	fil_io(IORequestLogWrite, true,
-	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
-	       0,
-	       ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL);
-
-	srv_stats.os_log_pending_writes.dec();
-
-	srv_stats.os_log_written.add(write_len);
-	srv_stats.log_writes.inc();
+	log_sys.log.write(next_offset, {buf, write_len});
 
 	if (write_len < len) {
 		start_lsn += write_len;
 		len -= write_len;
 		buf += write_len;
-
-		write_header = true;
-
 		goto loop;
 	}
 }
 
-/** Flush the recently written changes to the log file.
-and invoke log_mutex_enter(). */
-static void log_write_flush_to_disk_low()
+/** Make the written log durable and advance the flushed LSN to lsn. */
+static void log_write_flush_to_disk_low(lsn_t lsn)
 {
-	ut_a(log_sys.n_pending_flushes);
-
-	bool	do_flush = srv_file_flush_method != SRV_O_DSYNC;
-
-	if (do_flush) {
-		fil_flush(SRV_LOG_SPACE_FIRST_ID);
-	}
-
-	log_mutex_enter();
-	if (do_flush) {
-		log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn;
-	}
-
-	log_sys.n_pending_flushes--;
-
-	os_event_set(log_sys.flush_event);
+  if (!log_sys.log.writes_are_durable())
+    log_sys.log.flush();
+  ut_a(lsn >= log_sys.get_flushed_lsn());
+  log_sys.set_flushed_lsn(lsn);
 }
 
 /** Swap log buffers, and copy the content of last block
@@ -822,16 +655,17 @@ static inline
 void
 log_buffer_switch()
 {
-	ut_ad(log_mutex_own());
-	ut_ad(log_write_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
+	ut_ad(log_write_lock_own());
 
-	ulong		area_end = ut_calc_align(
-		log_sys.buf_free, ulong(OS_FILE_LOG_BLOCK_SIZE));
+	size_t		area_end = ut_calc_align<size_t>(
+		log_sys.buf_free, OS_FILE_LOG_BLOCK_SIZE);
 
 	/* Copy the last block to new buf */
-	ut_memcpy(log_sys.flush_buf,
-		  log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
-		  OS_FILE_LOG_BLOCK_SIZE);
+	memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
+		log_sys.flush_buf,
+		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+		OS_FILE_LOG_BLOCK_SIZE);
 
 	std::swap(log_sys.buf, log_sys.flush_buf);
 
@@ -839,90 +673,32 @@ log_buffer_switch()
 	log_sys.buf_next_to_write = log_sys.buf_free;
 }
 
-/** Ensure that the log has been written to the log file up to a given
-log entry (such as that of a transaction commit). Start a new write, or
-wait and check if an already running write is covering the request.
-@param[in]	lsn		log sequence number that should be
-included in the redo log file write
-@param[in]	flush_to_disk	whether the written log should also
-be flushed to the file system
-@param[in]	rotate_key	whether to rotate the encryption key */
-void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
-{
-#ifdef UNIV_DEBUG
-	ulint		loop_count	= 0;
-#endif /* UNIV_DEBUG */
-	byte*           write_buf;
-	lsn_t           write_lsn;
-
-	ut_ad(!srv_read_only_mode);
-	ut_ad(!rotate_key || flush_to_disk);
+/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
+log writes have been completed. */
+void log_flush_notify(lsn_t flush_lsn);
 
-	if (recv_no_ibuf_operations) {
-		/* Recovery is running and no operations on the log files are
-		allowed yet (the variable name .._no_ibuf_.. is misleading) */
-
-		return;
-	}
-
-loop:
-	ut_ad(++loop_count < 128);
-
-#if UNIV_WORD_SIZE > 7
-	/* We can do a dirty read of LSN. */
-	/* NOTE: Currently doesn't do dirty read for
-	(flush_to_disk == true) case, because the log_mutex
-	contention also works as the arbitrator for write-IO
-	(fsync) bandwidth between log files and data files. */
-	if (!flush_to_disk && log_sys.write_lsn >= lsn) {
-		return;
-	}
-#endif
+/**
+Writes the log buffer to the log file,
+which is the "write" part of log_write_up_to().
 
-	log_write_mutex_enter();
+This function does not flush anything.
 
-	lsn_t	limit_lsn = flush_to_disk
-		? log_sys.flushed_to_disk_lsn
-		: log_sys.write_lsn;
+Note: the caller must hold log_sys.mutex, and this function
+releases that mutex before it returns.
 
-	if (limit_lsn >= lsn) {
-		log_write_mutex_exit();
+*/
+static void log_write(bool rotate_key)
+{
+	mysql_mutex_assert_owner(&log_sys.mutex);
+	lsn_t write_lsn;
+	if (log_sys.buf_free == log_sys.buf_next_to_write) {
+		/* Nothing to write */
+		mysql_mutex_unlock(&log_sys.mutex);
 		return;
 	}
 
 	ut_ad(!recv_no_log_write);
 
-	/* If it is a write call we should just go ahead and do it
-	as we checked that write_lsn is not where we'd like it to
-	be. If we have to flush as well then we check if there is a
-	pending flush and based on that we wait for it to finish
-	before proceeding further. */
-	if (flush_to_disk
-	    && (log_sys.n_pending_flushes > 0
-		|| !os_event_is_set(log_sys.flush_event))) {
-		/* Figure out if the current flush will do the job
-		for us. */
-		bool work_done = log_sys.current_flush_lsn >= lsn;
-
-		log_write_mutex_exit();
-
-		os_event_wait(log_sys.flush_event);
-
-		if (work_done) {
-			return;
-		} else {
-			goto loop;
-		}
-	}
-
-	log_mutex_enter();
-	if (!flush_to_disk
-	    && log_sys.buf_free == log_sys.buf_next_to_write) {
-		/* Nothing to write and no flush to disk requested */
-		log_mutex_exit_all();
-		return;
-	}
-
 	ulint		start_offset;
 	ulint		end_offset;
 	ulint		area_start;
@@ -932,20 +708,8 @@ loop:
 
 	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
 			      log_sys.write_lsn,
-			      log_sys.lsn));
-	if (flush_to_disk) {
-		log_sys.n_pending_flushes++;
-		log_sys.current_flush_lsn = log_sys.lsn;
-		os_event_reset(log_sys.flush_event);
-
-		if (log_sys.buf_free == log_sys.buf_next_to_write) {
-			/* Nothing to write, flush only */
-			log_mutex_exit_all();
-			log_write_flush_to_disk_low();
-			log_mutex_exit();
-			return;
-		}
-	}
+			      log_sys.get_lsn()));
+
 
 	start_offset = log_sys.buf_next_to_write;
 	end_offset = log_sys.buf_free;
@@ -961,14 +725,14 @@ loop:
 		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
 		log_sys.next_checkpoint_no);
 
-	write_lsn = log_sys.lsn;
-	write_buf = log_sys.buf;
+	write_lsn = log_sys.get_lsn();
+	byte *write_buf = log_sys.buf;
 
 	log_buffer_switch();
 
 	log_sys.log.set_fields(log_sys.write_lsn);
 
-	log_mutex_exit();
+	mysql_mutex_unlock(&log_sys.mutex);
 	/* Erase the end of the last log block. */
 	memset(write_buf + end_offset, 0,
 	       ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
@@ -996,8 +760,7 @@ loop:
 	if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
 		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
 					       "InnoDB log write: "
-					       LSN_PF "," LSN_PF,
-					       log_sys.write_lsn, lsn);
+					       LSN_PF, log_sys.write_lsn);
 	}
 
 	if (log_sys.is_encrypted()) {
@@ -1006,7 +769,7 @@ loop:
 			  rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT);
 	}
 
-	/* Do the write to the log files */
+	/* Do the write to the log file */
 	log_write_buf(
 		write_buf + area_start, area_end - area_start + pad_size,
 #ifdef UNIV_DEBUG
@@ -1017,292 +780,137 @@ loop:
 		start_offset - area_start);
 	srv_stats.log_padded.add(pad_size);
 	log_sys.write_lsn = write_lsn;
-
-
-	if (srv_file_flush_method == SRV_O_DSYNC) {
-		/* O_SYNC means the OS did not buffer the log file at all:
-		so we have also flushed to disk what we have written */
-		log_sys.flushed_to_disk_lsn = log_sys.write_lsn;
-	}
-
-	log_write_mutex_exit();
-
-	if (flush_to_disk) {
-		log_write_flush_to_disk_low();
-		ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn;
-		log_mutex_exit();
-
-		innobase_mysql_log_notify(flush_lsn);
-	}
+	return;
 }
 
-/** write to the log file up to the last log entry.
-@param[in]	sync	whether we want the written log
-also to be flushed to disk. */
-void log_buffer_flush_to_disk(bool sync)
+static group_commit_lock write_lock;
+static group_commit_lock flush_lock;
+
+#ifdef UNIV_DEBUG
+bool log_write_lock_own()
 {
-  ut_ad(!srv_read_only_mode);
-  log_write_up_to(log_get_lsn(), sync);
+  return write_lock.is_owner();
 }
+#endif
 
-
-/** Durably write the log and release log_sys.mutex */
-ATTRIBUTE_COLD void log_write_and_flush()
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param[in]	lsn		log sequence number that should be
+included in the redo log file write
+@param[in]	flush_to_disk	whether the written log should also
+be flushed to the file system
+@param[in]	rotate_key	whether to rotate the encryption key */
+void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
 {
   ut_ad(!srv_read_only_mode);
-  ut_ad(!recv_no_log_write);
-  ut_ad(!recv_recovery_is_on());
-
-  /* The following code is adapted from log_write_up_to(). */
-  DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
-                        log_sys.write_lsn, log_sys.lsn));
-  log_sys.n_pending_flushes++;
-  log_sys.current_flush_lsn= log_sys.lsn;
-  os_event_reset(log_sys.flush_event);
-  ut_ad(log_sys.buf_free != log_sys.buf_next_to_write);
-  ulint start_offset= log_sys.buf_next_to_write;
-  ulint end_offset= log_sys.buf_free;
-  ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
-  ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
-  ulong write_ahead_size= srv_log_write_ahead_size;
-
-  log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
-  log_block_set_checkpoint_no(log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
-                              log_sys.next_checkpoint_no);
-  lsn_t write_lsn= log_sys.lsn;
-  byte *write_buf= log_sys.buf;
-
-  ut_ad(area_end - area_start > 0);
-
-  log_buffer_switch();
-
-  log_sys.log.set_fields(log_sys.write_lsn);
-
-  /* Erase the end of the last log block. */
-  memset(write_buf + end_offset, 0,
-         ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
-  /* Calculate pad_size if needed. */
-  ulint pad_size= 0;
-  if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE)
-  {
-    lsn_t end_offset=
-      log_sys.log.calc_lsn_offset(ut_uint64_align_up(write_lsn,
-                                                     OS_FILE_LOG_BLOCK_SIZE));
-    ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size);
+  ut_ad(!rotate_key || flush_to_disk);
+  ut_ad(lsn != LSN_MAX);
 
-    if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit)
-    {
-      /* The first block in the unit was initialized after the last
-      writing. Needs to be written padded data once. */
-      pad_size= std::min<ulint>(ulint(write_ahead_size) - end_offset_in_unit,
-                                srv_log_buffer_size - area_end);
-      memset(write_buf + area_end, 0, pad_size);
-    }
+  if (recv_no_ibuf_operations)
+  {
+    /* Recovery is running and no operations on the log file are
+    allowed yet (the variable name .._no_ibuf_.. is misleading) */
+    return;
   }
 
-  if (log_sys.is_encrypted())
-    log_crypt(write_buf + area_start, log_sys.write_lsn,
-              area_end - area_start);
-
-  /* Do the write to the log files */
-  log_write_buf(write_buf + area_start, area_end - area_start + pad_size,
-#ifdef UNIV_DEBUG
-                pad_size,
-#endif /* UNIV_DEBUG */
-                ut_uint64_align_down(log_sys.write_lsn,
-                                     OS_FILE_LOG_BLOCK_SIZE),
-                start_offset - area_start);
-  srv_stats.log_padded.add(pad_size);
-  log_sys.write_lsn= write_lsn;
-
-  log_write_mutex_exit();
-
-  /* Code adapted from log_write_flush_to_disk_low() */
-
-  ut_a(log_sys.n_pending_flushes);
+  if (flush_to_disk &&
+    flush_lock.acquire(lsn) != group_commit_lock::ACQUIRED)
+  {
+    return;
+  }
 
-  if (srv_file_flush_method != SRV_O_DSYNC)
-    fil_flush(SRV_LOG_SPACE_FIRST_ID);
+  if (write_lock.acquire(lsn) == group_commit_lock::ACQUIRED)
+  {
+    mysql_mutex_lock(&log_sys.mutex);
+    lsn_t write_lsn= log_sys.get_lsn();
+    write_lock.set_pending(write_lsn);
 
-  log_sys.flushed_to_disk_lsn= log_sys.current_flush_lsn;
+    log_write(rotate_key);
 
-  log_sys.n_pending_flushes--;
+    ut_a(log_sys.write_lsn == write_lsn);
+    write_lock.release(write_lsn);
+  }
 
-  os_event_set(log_sys.flush_event);
+  if (!flush_to_disk)
+  {
+    return;
+  }
 
-  const lsn_t flush_lsn= log_sys.flushed_to_disk_lsn;
-  log_mutex_exit();
+  /* Flush up to the highest written LSN. */
+  auto flush_lsn = write_lock.value();
+  flush_lock.set_pending(flush_lsn);
+  log_write_flush_to_disk_low(flush_lsn);
+  flush_lock.release(flush_lsn);
 
-  innobase_mysql_log_notify(flush_lsn);
+  log_flush_notify(flush_lsn);
 }
 
-/****************************************************************//**
-This functions writes the log buffer to the log file and if 'flush'
-is set it forces a flush of the log file as well. This is meant to be
-called from background master thread only as it does not wait for
-the write (+ possible flush) to finish. */
-void
-log_buffer_sync_in_background(
-/*==========================*/
-	bool	flush)	/*!< in: flush the logs to disk */
+/** Write to the log file up to the last log entry.
+@param sync  whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool sync)
 {
-	lsn_t	lsn;
-
-	log_mutex_enter();
-
-	lsn = log_sys.lsn;
+  ut_ad(!srv_read_only_mode);
+  log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync);
+}
 
-	if (flush
-	    && log_sys.n_pending_flushes > 0
-	    && log_sys.current_flush_lsn >= lsn) {
-		/* The write + flush will write enough */
-		log_mutex_exit();
-		return;
-	}
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare()
+{
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
 
-	log_mutex_exit();
+  while (flush_lock.acquire(log_sys.get_lsn() + 1) !=
+         group_commit_lock::ACQUIRED);
+  while (write_lock.acquire(log_sys.get_lsn() + 1) !=
+         group_commit_lock::ACQUIRED);
+}
 
-	log_write_up_to(lsn, flush);
+/** Durably write the log and release log_sys.mutex */
+ATTRIBUTE_COLD void log_write_and_flush()
+{
+  ut_ad(!srv_read_only_mode);
+  auto lsn= log_sys.get_lsn();
+  write_lock.set_pending(lsn);
+  log_write(false);
+  ut_a(log_sys.write_lsn == lsn);
+  write_lock.release(lsn);
+
+  lsn= write_lock.value();
+  flush_lock.set_pending(lsn);
+  log_write_flush_to_disk_low(lsn);
+  flush_lock.release(lsn);
 }
 
 /********************************************************************
 
 Tries to establish a big enough margin of free space in the log buffer, such
 that a new log entry can be catenated without an immediate need for a flush. */
-static
-void
-log_flush_margin(void)
-/*==================*/
+ATTRIBUTE_COLD static void log_flush_margin()
 {
 	lsn_t	lsn	= 0;
 
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
 
 	if (log_sys.buf_free > log_sys.max_buf_free) {
 		/* We can write during flush */
-		lsn = log_sys.lsn;
+		lsn = log_sys.get_lsn();
 	}
 
-	log_mutex_exit();
+	mysql_mutex_unlock(&log_sys.mutex);
 
 	if (lsn) {
 		log_write_up_to(lsn, false);
 	}
 }
 
-/** Advances the smallest lsn for which there are unflushed dirty blocks in the
-buffer pool.
-NOTE: this function may only be called if the calling thread owns no
-synchronization objects!
-@param[in]	new_oldest	try to advance oldest_modified_lsn at least to
-this lsn
-@return false if there was a flush batch of the same type running,
-which means that we could not start this flush batch */
-static bool log_preflush_pool_modified_pages(lsn_t new_oldest)
+/** Write checkpoint info to the log header and release log_sys.mutex.
+@param[in]	end_lsn	start LSN of the FILE_CHECKPOINT mini-transaction */
+ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn)
 {
-	bool	success;
-
-	if (recv_recovery_is_on()) {
-		/* If the recovery is running, we must first apply all
-		log records to their respective file pages to get the
-		right modify lsn values to these pages: otherwise, there
-		might be pages on disk which are not yet recovered to the
-		current lsn, and even after calling this function, we could
-		not know how up-to-date the disk version of the database is,
-		and we could not make a new checkpoint on the basis of the
-		info on the buffer pool only. */
-		recv_apply_hashed_log_recs(true);
-	}
-
-	if (new_oldest == LSN_MAX
-	    || !buf_page_cleaner_is_active
-	    || srv_is_being_started) {
-
-		ulint	n_pages;
-
-		success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);
-
-		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-
-		if (!success) {
-			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
-		}
-
-		MONITOR_INC_VALUE_CUMULATIVE(
-			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
-			MONITOR_FLUSH_SYNC_COUNT,
-			MONITOR_FLUSH_SYNC_PAGES,
-			n_pages);
-	} else {
-		/* better to wait for flushed by page cleaner */
-
-		if (srv_flush_sync) {
-			/* wake page cleaner for IO burst */
-			buf_flush_request_force(new_oldest);
-		}
-
-		buf_flush_wait_flushed(new_oldest);
-
-		success = true;
-	}
-
-	return(success);
-}
-
-/******************************************************//**
-Completes a checkpoint. */
-static
-void
-log_complete_checkpoint(void)
-/*=========================*/
-{
-	ut_ad(log_mutex_own());
-	ut_ad(log_sys.n_pending_checkpoint_writes == 0);
-
-	log_sys.next_checkpoint_no++;
-
-	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
-	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
-		    log_sys.lsn - log_sys.last_checkpoint_lsn);
-
-	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
-			      ", flushed to " LSN_PF,
-			      log_sys.last_checkpoint_lsn,
-			      log_sys.flushed_to_disk_lsn));
-
-	rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT);
-}
-
-/** Complete an asynchronous checkpoint write. */
-void log_t::complete_checkpoint()
-{
-	ut_ad(this == &log_sys);
-	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
-
-	log_mutex_enter();
-
-	ut_ad(n_pending_checkpoint_writes > 0);
-
-	if (!--n_pending_checkpoint_writes) {
-		log_complete_checkpoint();
-	}
-
-	log_mutex_exit();
-}
-
-/** Write checkpoint info to the log header.
-@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
-static
-void
-log_group_checkpoint(lsn_t end_lsn)
-{
-	lsn_t		lsn_offset;
-
 	ut_ad(!srv_read_only_mode);
-	ut_ad(log_mutex_own());
 	ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
-	ut_ad(end_lsn <= log_sys.lsn);
-	ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn
+	ut_ad(end_lsn <= log_sys.get_lsn());
+	ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= log_sys.get_lsn()
 	      || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
 
 	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
@@ -1311,7 +919,7 @@ log_group_checkpoint(lsn_t end_lsn)
 			      log_sys.next_checkpoint_lsn));
 
 	byte* buf = log_sys.checkpoint_buf;
-	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
+	memset_aligned<OS_FILE_LOG_BLOCK_SIZE>(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
 
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);
@@ -1320,320 +928,85 @@ log_group_checkpoint(lsn_t end_lsn)
 		log_crypt_write_checkpoint_buf(buf);
 	}
 
-	lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
+	lsn_t lsn_offset
+		= log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
 	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
 			srv_log_buffer_size);
 	mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);
 
-	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
-
-	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
-
-	log_sys.n_log_ios++;
-
-	MONITOR_INC(MONITOR_LOG_IO);
+	log_block_store_checksum(buf);
 
 	ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
 	ut_ad(LOG_CHECKPOINT_2 < srv_page_size);
 
-	if (log_sys.n_pending_checkpoint_writes++ == 0) {
-		rw_lock_x_lock_gen(&log_sys.checkpoint_lock,
-				   LOG_CHECKPOINT);
-	}
+	ut_ad(!log_sys.checkpoint_pending);
+	log_sys.checkpoint_pending = true;
+
+	mysql_mutex_unlock(&log_sys.mutex);
 
 	/* Note: We alternate the physical place of the checkpoint info.
 	See the (next_checkpoint_no & 1) below. */
 
-	fil_io(IORequestLogWrite, false,
-	       page_id_t(SRV_LOG_SPACE_FIRST_ID, 0),
-	       0,
-	       (log_sys.next_checkpoint_no & 1)
-	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
-	       OS_FILE_LOG_BLOCK_SIZE,
-	       buf, reinterpret_cast<void*>(1) /* checkpoint write */);
-}
+	log_sys.log.write((log_sys.next_checkpoint_no & 1) ? LOG_CHECKPOINT_2
+							   : LOG_CHECKPOINT_1,
+			  {buf, OS_FILE_LOG_BLOCK_SIZE});
 
-/** Read a log group header page to log_sys.checkpoint_buf.
-@param[in]	header	0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */
-void log_header_read(ulint header)
-{
-	ut_ad(log_mutex_own());
+	log_sys.log.flush();
 
-	log_sys.n_log_ios++;
+	mysql_mutex_lock(&log_sys.mutex);
 
-	MONITOR_INC(MONITOR_LOG_IO);
-
-	fil_io(IORequestLogRead, true,
-	       page_id_t(SRV_LOG_SPACE_FIRST_ID,
-			 header >> srv_page_size_shift),
-	       0, header & (srv_page_size - 1),
-	       OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL);
-}
+	ut_ad(log_sys.checkpoint_pending);
+	log_sys.checkpoint_pending = false;
 
-/** Write checkpoint info to the log header and invoke log_mutex_exit().
-@param[in]	sync	whether to wait for the write to complete
-@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
-void
-log_write_checkpoint_info(bool sync, lsn_t end_lsn)
-{
-	ut_ad(log_mutex_own());
-	ut_ad(!srv_read_only_mode);
+	log_sys.next_checkpoint_no++;
 
-	log_group_checkpoint(end_lsn);
+	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
 
-	log_mutex_exit();
+	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
+			      ", flushed to " LSN_PF,
+			      lsn_t{log_sys.last_checkpoint_lsn},
+			      log_sys.get_flushed_lsn()));
 
 	MONITOR_INC(MONITOR_NUM_CHECKPOINT);
 
-	if (sync) {
-		/* Wait for the checkpoint write to complete */
-		rw_lock_s_lock(&log_sys.checkpoint_lock);
-		rw_lock_s_unlock(&log_sys.checkpoint_lock);
-
-		DBUG_EXECUTE_IF(
-			"crash_after_checkpoint",
-			DBUG_SUICIDE(););
-	}
-}
-
-/** Set extra data to be written to the redo log during checkpoint.
-@param[in]	buf	data to be appended on checkpoint, or NULL
-@return pointer to previous data to be appended on checkpoint */
-mtr_buf_t*
-log_append_on_checkpoint(
-	mtr_buf_t*	buf)
-{
-	log_mutex_enter();
-	mtr_buf_t*	old = log_sys.append_on_checkpoint;
-	log_sys.append_on_checkpoint = buf;
-	log_mutex_exit();
-	return(old);
-}
-
-/** Make a checkpoint. Note that this function does not flush dirty
-blocks from the buffer pool: it only checks what is lsn of the oldest
-modification in the pool, and writes information about the lsn in
-log files. Use log_make_checkpoint() to flush also the pool.
-@param[in]	sync		whether to wait for the write to complete
-@return true if success, false if a checkpoint write was already running */
-bool log_checkpoint(bool sync)
-{
-	lsn_t	oldest_lsn;
-
-	ut_ad(!srv_read_only_mode);
-
-	DBUG_EXECUTE_IF("no_checkpoint",
-			/* We sleep for a long enough time, forcing
-			the checkpoint doesn't happen any more. */
-			os_thread_sleep(360000000););
-
-	if (recv_recovery_is_on()) {
-		recv_apply_hashed_log_recs(true);
-	}
-
-	switch (srv_file_flush_method) {
-	case SRV_NOSYNC:
-		break;
-	case SRV_O_DSYNC:
-	case SRV_FSYNC:
-	case SRV_LITTLESYNC:
-	case SRV_O_DIRECT:
-	case SRV_O_DIRECT_NO_FSYNC:
-#ifdef _WIN32
-	case SRV_ALL_O_DIRECT_FSYNC:
-#endif
-		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
-	}
-
-	log_mutex_enter();
-
-	oldest_lsn = log_buf_pool_get_oldest_modification();
-
-	/* Because log also contains headers and dummy log records,
-	log_buf_pool_get_oldest_modification() will return log_sys.lsn
-	if the buffer pool contains no dirty buffers.
-	We must make sure that the log is flushed up to that lsn.
-	If there are dirty buffers in the buffer pool, then our
-	write-ahead-logging algorithm ensures that the log has been
-	flushed up to oldest_lsn. */
-
-	ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
-	const lsn_t age = oldest_lsn - log_sys.last_checkpoint_lsn;
-	if (age > SIZE_OF_MLOG_CHECKPOINT + log_sys.framing_size()) {
-		/* Some log has been written since the previous checkpoint. */
-	} else if (age > SIZE_OF_MLOG_CHECKPOINT
-		   && !((log_sys.log.calc_lsn_offset(oldest_lsn)
-			 ^ log_sys.log.calc_lsn_offset(
-				   log_sys.last_checkpoint_lsn))
-			& ~lsn_t{OS_FILE_LOG_BLOCK_SIZE - 1})) {
-		/* Some log has been written to the same log block. */
-	} else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
-		/* MariaDB 10.3 startup expects the redo log file to be
-		logically empty (not even containing a MLOG_CHECKPOINT record)
-		after a clean shutdown. Perform an extra checkpoint at
-		shutdown. */
-	} else {
-		/* Do nothing, because nothing was logged (other than
-		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
-		log_mutex_exit();
-		return(true);
-	}
-
-	ut_ad(!recv_no_log_write);
-
-	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
-	case some log records between the checkpoint and log_sys.lsn
-	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
-	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
-	except on clean shutdown, where the log will be empty after
-	the checkpoint.
-	It is important that we write out the redo log before any
-	further dirty pages are flushed to the tablespace files.  At
-	this point, because log_mutex_own(), mtr_commit() in other
-	threads will be blocked, and no pages can be added to the
-	flush lists. */
-	lsn_t		flush_lsn	= oldest_lsn;
-	const lsn_t	end_lsn		= log_sys.lsn;
-	const bool	do_write
-		= srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
-		|| flush_lsn != end_lsn;
-
-	if (fil_names_clear(flush_lsn, do_write)) {
-		ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT);
-		flush_lsn = log_sys.lsn;
-	}
-
-	log_mutex_exit();
-
-	log_write_up_to(flush_lsn, true, true);
-
-	log_mutex_enter();
-
-	ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn);
-	ut_ad(flush_lsn >= oldest_lsn);
-
-	if (log_sys.last_checkpoint_lsn >= oldest_lsn) {
-		log_mutex_exit();
-		return(true);
-	}
-
-	if (log_sys.n_pending_checkpoint_writes > 0) {
-		/* A checkpoint write is running */
-		log_mutex_exit();
-
-		if (sync) {
-			/* Wait for the checkpoint write to complete */
-			rw_lock_s_lock(&log_sys.checkpoint_lock);
-			rw_lock_s_unlock(&log_sys.checkpoint_lock);
-		}
-
-		return(false);
-	}
-
-	log_sys.next_checkpoint_lsn = oldest_lsn;
-	log_write_checkpoint_info(sync, end_lsn);
-	ut_ad(!log_mutex_own());
-
-	return(true);
-}
-
-/** Make a checkpoint */
-void log_make_checkpoint()
-{
-	/* Preflush pages synchronously */
-
-	while (!log_preflush_pool_modified_pages(LSN_MAX)) {
-		/* Flush as much as we can */
-	}
-
-	while (!log_checkpoint(true)) {
-		/* Force a checkpoint */
-	}
+	mysql_mutex_unlock(&log_sys.mutex);
 }
 
 /****************************************************************//**
-Tries to establish a big enough margin of free space in the log groups, such
+Tries to establish a big enough margin of free space in the log, such
 that a new log entry can be catenated without an immediate need for a
 checkpoint. NOTE: this function may only be called if the calling thread
 owns no synchronization objects! */
-static
-void
-log_checkpoint_margin(void)
-/*=======================*/
+ATTRIBUTE_COLD static void log_checkpoint_margin()
 {
-	lsn_t		age;
-	lsn_t		checkpoint_age;
-	ib_uint64_t	advance;
-	lsn_t		oldest_lsn;
-	bool		success;
-loop:
-	advance = 0;
-
-	log_mutex_enter();
-	ut_ad(!recv_no_log_write);
-
-	if (!log_sys.check_flush_or_checkpoint) {
-		log_mutex_exit();
-		return;
-	}
-
-	oldest_lsn = log_buf_pool_get_oldest_modification();
-
-	age = log_sys.lsn - oldest_lsn;
-
-	if (age > log_sys.max_modified_age_sync) {
-
-		/* A flush is urgent: we have to do a synchronous preflush */
-		advance = age - log_sys.max_modified_age_sync;
-	}
-
-	checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn;
-
-	bool	checkpoint_sync;
-	bool	do_checkpoint;
-
-	if (checkpoint_age > log_sys.max_checkpoint_age) {
-		/* A checkpoint is urgent: we do it synchronously */
-		checkpoint_sync = true;
-		do_checkpoint = true;
-	} else if (checkpoint_age > log_sys.max_checkpoint_age_async) {
-		/* A checkpoint is not urgent: do it asynchronously */
-		do_checkpoint = true;
-		checkpoint_sync = false;
-		log_sys.check_flush_or_checkpoint = false;
-	} else {
-		do_checkpoint = false;
-		checkpoint_sync = false;
-		log_sys.check_flush_or_checkpoint = false;
-	}
-
-	log_mutex_exit();
-
-	if (advance) {
-		lsn_t	new_oldest = oldest_lsn + advance;
-
-		success = log_preflush_pool_modified_pages(new_oldest);
+  while (log_sys.check_flush_or_checkpoint())
+  {
+    mysql_mutex_lock(&log_sys.mutex);
+    ut_ad(!recv_no_log_write);
 
-		/* If the flush succeeded, this thread has done its part
-		and can proceed. If it did not succeed, there was another
-		thread doing a flush at the same time. */
-		if (!success) {
-			log_mutex_enter();
-			log_sys.check_flush_or_checkpoint = true;
-			log_mutex_exit();
-			goto loop;
-		}
-	}
+    if (!log_sys.check_flush_or_checkpoint())
+    {
+func_exit:
+      mysql_mutex_unlock(&log_sys.mutex);
+      return;
+    }
 
-	if (do_checkpoint) {
-		log_checkpoint(checkpoint_sync);
+    const lsn_t lsn= log_sys.get_lsn();
+    const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
+    const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
+    if (lsn <= sync_lsn)
+    {
+      log_sys.set_check_flush_or_checkpoint(false);
+      goto func_exit;
+    }
 
-		if (checkpoint_sync) {
+    mysql_mutex_unlock(&log_sys.mutex);
 
-			goto loop;
-		}
-	}
+    /* We must wait to prevent the tail of the log overwriting the head. */
+    buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
+    os_thread_sleep(10000); /* Sleep 10ms to avoid a thundering herd */
+  }
 }
 
 /**
@@ -1641,29 +1014,21 @@ Checks that there is enough free space in the log to start a new query step.
 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
 function may only be called if the calling thread owns no synchronization
 objects! */
-void
-log_check_margins(void)
+ATTRIBUTE_COLD void log_check_margins()
 {
-	bool	check;
-
-	do {
-		log_flush_margin();
-		log_checkpoint_margin();
-		log_mutex_enter();
-		ut_ad(!recv_no_log_write);
-		check = log_sys.check_flush_or_checkpoint;
-		log_mutex_exit();
-	} while (check);
+  do
+  {
+    log_flush_margin();
+    log_checkpoint_margin();
+    ut_ad(!recv_no_log_write);
+  }
+  while (log_sys.check_flush_or_checkpoint());
 }
 
-/****************************************************************//**
-Makes a checkpoint at the latest lsn and writes it to first page of each
-data file in the database, so that we know that the file spaces contain
-all modifications up to that lsn. This can only be called at database
-shutdown. This function also writes all log in log files to the log archive. */
-void
-logs_empty_and_mark_files_at_shutdown(void)
-/*=======================================*/
+extern void buf_resize_shutdown();
+
+/** Make a checkpoint at the latest lsn on shutdown. */
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
 {
 	lsn_t			lsn;
 	ulint			count = 0;
@@ -1672,33 +1037,35 @@ logs_empty_and_mark_files_at_shutdown(void)
 
 	/* Wait until the master thread and all other operations are idle: our
 	algorithm only works if the server is idle at shutdown */
+	bool do_srv_shutdown = false;
+	if (srv_master_timer) {
+		do_srv_shutdown = srv_fast_shutdown < 2;
+		srv_master_timer.reset();
+	}
+
+	/* Wait for the end of the buffer resize task. */
+	buf_resize_shutdown();
+	dict_stats_shutdown();
+	btr_defragment_shutdown();
 
 	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+
+	if (srv_buffer_pool_dump_at_shutdown &&
+		!srv_read_only_mode && srv_fast_shutdown < 2) {
+		buf_dump_start();
+	}
+	srv_monitor_timer.reset();
+	lock_sys.timeout_timer.reset();
+	if (do_srv_shutdown) {
+		srv_shutdown(srv_fast_shutdown == 0);
+	}
+
+
 loop:
 	ut_ad(lock_sys.is_initialised() || !srv_was_started);
 	ut_ad(log_sys.is_initialised() || !srv_was_started);
 	ut_ad(fil_system.is_initialised() || !srv_was_started);
-	os_event_set(srv_buf_resize_event);
 
-	if (!srv_read_only_mode) {
-		os_event_set(srv_error_event);
-		os_event_set(srv_monitor_event);
-		os_event_set(srv_buf_dump_event);
-		if (lock_sys.timeout_thread_active) {
-			os_event_set(lock_sys.timeout_event);
-		}
-		if (dict_stats_event) {
-			os_event_set(dict_stats_event);
-		} else {
-			ut_ad(!srv_dict_stats_thread_active);
-		}
-		if (recv_sys.flush_start) {
-			/* This is in case recv_writer_thread was never
-			started, or buf_flush_page_cleaner_coordinator
-			failed to notice its termination. */
-			os_event_set(recv_sys.flush_start);
-		}
-	}
 #define COUNT_INTERVAL 600U
 #define CHECK_INTERVAL 100000U
 	os_thread_sleep(CHECK_INTERVAL);
@@ -1731,23 +1098,7 @@ loop:
 	/* We need these threads to stop early in shutdown. */
 	const char* thread_name;
 
-	if (srv_error_monitor_active) {
-		thread_name = "srv_error_monitor_thread";
-	} else if (srv_monitor_active) {
-		thread_name = "srv_monitor_thread";
-	} else if (srv_buf_resize_thread_active) {
-		thread_name = "buf_resize_thread";
-		goto wait_suspend_loop;
-	} else if (srv_dict_stats_thread_active) {
-		thread_name = "dict_stats_thread";
-	} else if (lock_sys.timeout_thread_active) {
-		thread_name = "lock_wait_timeout_thread";
-	} else if (srv_buf_dump_thread_active) {
-		thread_name = "buf_dump_thread";
-		goto wait_suspend_loop;
-	} else if (btr_defragment_thread_active) {
-		thread_name = "btr_defragment_thread";
-	} else if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
+   if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
 		thread_name = "rollback of recovered transactions";
 	} else {
 		thread_name = NULL;
@@ -1769,60 +1120,42 @@ wait_suspend_loop:
 
 	/* Check that the background threads are suspended */
 
-	switch (srv_get_active_thread_type()) {
-	case SRV_NONE:
-		if (!srv_n_fil_crypt_threads_started) {
-			srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
-			break;
-		}
+	ut_ad(!srv_any_background_activity());
+	if (srv_n_fil_crypt_threads_started) {
 		os_event_set(fil_crypt_threads_event);
 		thread_name = "fil_crypt_thread";
 		goto wait_suspend_loop;
-	case SRV_PURGE:
-	case SRV_WORKER:
-		ut_ad(!"purge was not shut down");
-		srv_purge_wakeup();
-		thread_name = "purge thread";
-		goto wait_suspend_loop;
-	case SRV_MASTER:
-		thread_name = "master thread";
+	}
+
+	if (buf_page_cleaner_is_active) {
+		thread_name = "page cleaner thread";
+		pthread_cond_signal(&buf_pool.do_flush_list);
 		goto wait_suspend_loop;
 	}
 
-	/* At this point only page_cleaner should be active. We wait
-	here to let it complete the flushing of the buffer pools
-	before proceeding further. */
+	buf_load_dump_end();
 
-	count = 0;
-	service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
-		"Waiting for page cleaner");
-	while (buf_page_cleaner_is_active) {
-		++count;
-		os_thread_sleep(CHECK_INTERVAL);
-		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
-			service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
-				"Waiting for page cleaner");
-			ib::info() << "Waiting for page_cleaner to "
-				"finish flushing of buffer pool";
-			/* This is a workaround to avoid the InnoDB hang
-			when OS datetime changed backwards */
-			os_event_set(buf_flush_event);
+	if (!buf_pool.is_initialised()) {
+		ut_ad(!srv_was_started);
+	} else if (ulint pending_io = buf_pool.io_pending()) {
+		if (srv_print_verbose_log && count > 600) {
+			ib::info() << "Waiting for " << pending_io << " buffer"
+				" page I/Os to complete";
 			count = 0;
 		}
-	}
 
-	if (log_scrub_thread_active) {
-		ut_ad(!srv_read_only_mode);
-		os_event_set(log_scrub_event);
+		goto loop;
+	} else {
+		buf_flush_buffer_pool();
 	}
 
 	if (log_sys.is_initialised()) {
-		log_mutex_enter();
-		const ulint	n_write	= log_sys.n_pending_checkpoint_writes;
-		const ulint	n_flush	= log_sys.n_pending_flushes;
-		log_mutex_exit();
+		mysql_mutex_lock(&log_sys.mutex);
+		const size_t n_write{log_sys.checkpoint_pending};
+		const size_t n_flush{log_sys.get_pending_flushes()};
+		mysql_mutex_unlock(&log_sys.mutex);
 
-		if (log_scrub_thread_active || n_write || n_flush) {
+		if (n_write || n_flush) {
 			if (srv_print_verbose_log && count > 600) {
 				ib::info() << "Pending checkpoint_writes: "
 					<< n_write
@@ -1834,20 +1167,6 @@ wait_suspend_loop:
 		}
 	}
 
-	ut_ad(!log_scrub_thread_active);
-
-	if (!buf_pool_ptr) {
-		ut_ad(!srv_was_started);
-	} else if (ulint pending_io = buf_pool_check_no_pending_io()) {
-		if (srv_print_verbose_log && count > 600) {
-			ib::info() << "Waiting for " << pending_io << " buffer"
-				" page I/Os to complete";
-			count = 0;
-		}
-
-		goto loop;
-	}
-
 	if (srv_fast_shutdown == 2 || !srv_was_started) {
 		if (!srv_read_only_mode && srv_was_started) {
 			ib::info() << "MySQL has requested a very fast"
@@ -1869,10 +1188,6 @@ wait_suspend_loop:
 		}
 
 		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
-
-		if (fil_system.is_initialised()) {
-			fil_close_all_files();
-		}
 		return;
 	}
 
@@ -1881,41 +1196,42 @@ wait_suspend_loop:
 			"ensuring dirty buffer pool are written to log");
 		log_make_checkpoint();
 
-		log_mutex_enter();
+		mysql_mutex_lock(&log_sys.mutex);
 
-		lsn = log_sys.lsn;
+		lsn = log_sys.get_lsn();
 
-		const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn;
+		const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn
+			&& lsn != log_sys.last_checkpoint_lsn
+			+ SIZE_OF_FILE_CHECKPOINT;
 		ut_ad(lsn >= log_sys.last_checkpoint_lsn);
 
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 
 		if (lsn_changed) {
 			goto loop;
 		}
 
-		/* Ensure that all buffered changes are written to the
-		redo log before fil_close_all_files(). */
-		fil_flush_file_spaces(FIL_TYPE_LOG);
+		log_sys.log.flush();
 	} else {
-		lsn = srv_start_lsn;
+		lsn = recv_sys.recovered_lsn;
 	}
 
 	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
 
 	/* Make some checks that the server really is quiet */
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_ad(!srv_any_background_activity());
 
 	service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
 				       "Free innodb buffer pool");
-	buf_all_freed();
+	ut_d(buf_pool.assert_all_freed());
 
-	ut_a(lsn == log_sys.lsn
+	ut_a(lsn == log_sys.get_lsn()
 	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
 
-	if (lsn < srv_start_lsn) {
+	if (UNIV_UNLIKELY(lsn < recv_sys.recovered_lsn)) {
 		ib::error() << "Shutdown LSN=" << lsn
-			<< " is less than start LSN=" << srv_start_lsn;
+			    << " is less than start LSN="
+			    << recv_sys.recovered_lsn;
 	}
 
 	srv_shutdown_lsn = lsn;
@@ -1929,35 +1245,14 @@ wait_suspend_loop:
 		}
 	}
 
-	fil_close_all_files();
-
 	/* Make some checks that the server really is quiet */
-	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_ad(!srv_any_background_activity());
 
-	ut_a(lsn == log_sys.lsn
+	ut_a(lsn == log_sys.get_lsn()
 	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
 }
 
 /******************************************************//**
-Peeks the current lsn.
-@return TRUE if success, FALSE if could not get the log system mutex */
-ibool
-log_peek_lsn(
-/*=========*/
-	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
-{
-	if (0 == mutex_enter_nowait(&(log_sys.mutex))) {
-		*lsn = log_sys.lsn;
-
-		log_mutex_exit();
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/******************************************************//**
 Prints info of the log. */
 void
 log_print(
@@ -1967,17 +1262,22 @@ log_print(
 	double	time_elapsed;
 	time_t	current_time;
 
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
+
+	const lsn_t lsn= log_sys.get_lsn();
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn);
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 
 	fprintf(file,
 		"Log sequence number " LSN_PF "\n"
 		"Log flushed up to   " LSN_PF "\n"
 		"Pages flushed up to " LSN_PF "\n"
 		"Last checkpoint at  " LSN_PF "\n",
-		log_sys.lsn,
-		log_sys.flushed_to_disk_lsn,
-		log_buf_pool_get_oldest_modification(),
-		log_sys.last_checkpoint_lsn);
+		lsn,
+		log_sys.get_flushed_lsn(),
+		pages_flushed,
+		lsn_t{log_sys.last_checkpoint_lsn});
 
 	current_time = time(NULL);
 
@@ -1992,8 +1292,8 @@ log_print(
 		ULINTPF " pending log flushes, "
 		ULINTPF " pending chkp writes\n"
 		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
-		log_sys.n_pending_flushes,
-		log_sys.n_pending_checkpoint_writes,
+		log_sys.pending_flushes.load(),
+		ulint{log_sys.checkpoint_pending},
 		log_sys.n_log_ios,
 		static_cast<double>(
 			log_sys.n_log_ios - log_sys.n_log_ios_old)
@@ -2002,7 +1302,7 @@ log_print(
 	log_sys.n_log_ios_old = log_sys.n_log_ios;
 	log_sys.last_printout_time = current_time;
 
-	log_mutex_exit();
+	mysql_mutex_unlock(&log_sys.mutex);
 }
 
 /**********************************************************************//**
@@ -2020,114 +1320,57 @@ void log_t::close()
 {
   ut_ad(this == &log_sys);
   if (!is_initialised()) return;
-  m_initialised = false;
+  m_initialised= false;
   log.close();
 
   ut_free_dodump(buf, srv_log_buffer_size);
-  buf = NULL;
+  buf= nullptr;
   ut_free_dodump(flush_buf, srv_log_buffer_size);
-  flush_buf = NULL;
+  flush_buf= nullptr;
 
-  os_event_destroy(flush_event);
-  rw_lock_free(&checkpoint_lock);
-  mutex_free(&mutex);
-  mutex_free(&write_mutex);
-  mutex_free(&log_flush_order_mutex);
-
-  if (!srv_read_only_mode && srv_scrub_log)
-    os_event_destroy(log_scrub_event);
+  mysql_mutex_destroy(&mutex);
+  mysql_mutex_destroy(&flush_order_mutex);
 
   recv_sys.close();
-}
-
-/******************************************************//**
-Pads the current log block full with dummy log records. Used in producing
-consistent archived log files and scrubbing redo log. */
-static
-void
-log_pad_current_log_block(void)
-/*===========================*/
-{
-	byte		b		= MLOG_DUMMY_RECORD;
-	ulint		pad_length;
-	ulint		i;
-	lsn_t		lsn;
-
-	ut_ad(!recv_no_log_write);
-	/* We retrieve lsn only because otherwise gcc crashed on HP-UX */
-	lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
-
-	pad_length = log_sys.trailer_offset()
-		- log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
-	if (pad_length == log_sys.payload_size()) {
-		pad_length = 0;
-	}
-
-	if (pad_length) {
-		srv_stats.n_log_scrubs.inc();
-	}
-
-	for (i = 0; i < pad_length; i++) {
-		log_write_low(&b, 1);
-	}
 
-	lsn = log_sys.lsn;
-
-	log_close();
-
-	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
+  aligned_free(checkpoint_buf);
+  checkpoint_buf= nullptr;
 }
 
-/*****************************************************************//*
-If no log record has been written for a while, fill current log
-block with dummy records. */
-static
-void
-log_scrub()
-/*=========*/
+std::string get_log_file_path(const char *filename)
 {
-	log_mutex_enter();
-	ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn);
+  const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 +
+                     strlen(filename) + /* longest suffix */ 3;
+  std::string path;
+  path.reserve(size);
+  path.assign(srv_log_group_home_dir);
 
-	if (next_lbn_to_pad == cur_lbn)
-	{
-		log_pad_current_log_block();
-	}
-
-	next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn);
-	log_mutex_exit();
-}
+  std::replace(path.begin(), path.end(), OS_PATH_SEPARATOR_ALT,
+	       OS_PATH_SEPARATOR);
 
-/* log scrubbing speed, in bytes/sec */
-UNIV_INTERN ulonglong innodb_scrub_log_speed;
-
-/*****************************************************************//**
-This is the main thread for log scrub. It waits for an event and
-when waked up fills current log block with dummy records and
-sleeps again.
-@return this function does not return, it calls os_thread_exit() */
-extern "C" UNIV_INTERN
-os_thread_ret_t
-DECLARE_THREAD(log_scrub_thread)(void*)
-{
-	ut_ad(!srv_read_only_mode);
+  if (path.back() != OS_PATH_SEPARATOR)
+    path.push_back(OS_PATH_SEPARATOR);
+  path.append(filename);
 
-	while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
-		/* log scrubbing interval in µs. */
-		ulonglong interval = 1000*1000*512/innodb_scrub_log_speed;
-
-		os_event_wait_time(log_scrub_event, static_cast<ulint>(interval));
+  return path;
+}
 
-		log_scrub();
+std::vector<std::string> get_existing_log_files_paths() {
+  std::vector<std::string> result;
 
-		os_event_reset(log_scrub_event);
-	}
+  for (int i= 0; i < 101; i++) {
+    auto path= get_log_file_path(LOG_FILE_NAME_PREFIX)
+                                 .append(std::to_string(i));
+    os_file_stat_t stat;
+    dberr_t err= os_file_get_status(path.c_str(), &stat, false, true);
+    if (err)
+      break;
 
-	log_scrub_thread_active = false;
+    if (stat.type != OS_FILE_TYPE_FILE)
+      break;
 
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit();
+    result.push_back(std::move(path));
+  }
 
-	OS_THREAD_DUMMY_RETURN;
+  return result;
 }
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 4359fb6b308..d9761fe9d85 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -1,8 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -40,13 +39,13 @@ Created 9/20/1997 Heikki Tuuri
 #include "log0crypt.h"
 #include "mem0mem.h"
 #include "buf0buf.h"
+#include "buf0dblwr.h"
 #include "buf0flu.h"
 #include "mtr0mtr.h"
 #include "mtr0log.h"
+#include "page0page.h"
 #include "page0cur.h"
-#include "page0zip.h"
-#include "btr0btr.h"
-#include "btr0cur.h"
+#include "trx0undo.h"
 #include "ibuf0ibuf.h"
 #include "trx0undo.h"
 #include "trx0rec.h"
@@ -54,24 +53,10 @@ Created 9/20/1997 Heikki Tuuri
 #include "buf0rea.h"
 #include "srv0srv.h"
 #include "srv0start.h"
-#include "trx0roll.h"
-#include "row0merge.h"
 #include "fil0pagecompress.h"
 
-/** Log records are stored in the hash table in chunks at most of this size;
-this must be less than srv_page_size as it is stored in the buffer pool */
-#define RECV_DATA_BLOCK_SIZE	(MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t) - REDZONE_SIZE)
-
-/** Read-ahead area in applying log records to file pages */
-#define RECV_READ_AHEAD_AREA	32
-
 /** The recovery system */
 recv_sys_t	recv_sys;
-/** TRUE when applying redo log records during crash recovery; FALSE
-otherwise.  Note that this is FALSE while a background thread is
-rolling back incomplete transactions. */
-volatile bool	recv_recovery_on;
-
 /** TRUE when recv_init_crash_recovery() has been called. */
 bool	recv_needed_recovery;
 #ifdef UNIV_DEBUG
@@ -92,43 +77,460 @@ to file pages already before the recovery is finished: in this case no
 ibuf operations are allowed, as they could modify the pages read in the
 buffer pool before the pages have been recovered to the up-to-date state.
 
-TRUE means that recovery is running and no operations on the log files
+true means that recovery is running and no operations on the log file
 are allowed yet: the variable name is misleading. */
 bool	recv_no_ibuf_operations;
 
-/** The type of the previous parsed redo log record */
-static mlog_id_t	recv_previous_parsed_rec_type;
-/** The offset of the previous parsed redo log record */
-static ulint	recv_previous_parsed_rec_offset;
-/** The 'multi' flag of the previous parsed redo log record */
-static ulint	recv_previous_parsed_rec_is_multi;
-
 /** The maximum lsn we see for a page during the recovery process. If this
 is bigger than the lsn we are able to scan up to, that is an indication that
 the recovery failed and the database may be corrupt. */
 static lsn_t	recv_max_page_lsn;
 
-#ifdef UNIV_PFS_THREAD
-mysql_pfs_key_t	trx_rollback_clean_thread_key;
-mysql_pfs_key_t	recv_writer_thread_key;
-#endif /* UNIV_PFS_THREAD */
+/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */
+struct log_phys_t : public log_rec_t
+{
+  /** start LSN of the mini-transaction (not necessarily of this record) */
+  const lsn_t start_lsn;
+private:
+  /** @return the start of length and data */
+  const byte *start() const
+  {
+    return my_assume_aligned<sizeof(size_t)>
+      (reinterpret_cast<const byte*>(&start_lsn + 1));
+  }
+  /** @return the start of length and data */
+  byte *start()
+  { return const_cast<byte*>(const_cast<const log_phys_t*>(this)->start()); }
+  /** @return the length of the following record */
+  uint16_t len() const { uint16_t i; memcpy(&i, start(), 2); return i; }
+
+  /** @return start of the log records */
+  byte *begin() { return start() + 2; }
+  /** @return end of the log records */
+  byte *end() { byte *e= begin() + len(); ut_ad(!*e); return e; }
+public:
+  /** @return start of the log records */
+  const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); }
+  /** @return end of the log records */
+  const byte *end() const { return const_cast<log_phys_t*>(this)->end(); }
+
+  /** Determine the allocated size of the object.
+  @param len  length of recs, excluding terminating NUL byte
+  @return the total allocation size */
+  static inline size_t alloc_size(size_t len);
+
+  /** Constructor.
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn  mtr_t::commit_lsn() of the mini-transaction
+  @param recs the first log record for the page in the mini-transaction
+  @param size length of recs, in bytes, excluding terminating NUL byte */
+  log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) :
+    log_rec_t(lsn), start_lsn(start_lsn)
+  {
+    ut_ad(start_lsn);
+    ut_ad(start_lsn < lsn);
+    const uint16_t len= static_cast<uint16_t>(size);
+    ut_ad(len == size);
+    memcpy(start(), &len, 2);
+    reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0;
+  }
+
+  /** Append a record to the log.
+  @param recs  log to append
+  @param size  size of the log, in bytes */
+  void append(const byte *recs, size_t size)
+  {
+    ut_ad(start_lsn < lsn);
+    uint16_t l= len();
+    reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0;
+    l= static_cast<uint16_t>(l + size);
+    memcpy(start(), &l, 2);
+  }
+
+  /** Apply an UNDO_APPEND record.
+  @see mtr_t::undo_append()
+  @param block   undo log page
+  @param data    undo log record
+  @param len     length of the undo log record
+  @return whether the operation failed (inconsistency was noticed) */
+  static bool undo_append(const buf_block_t &block, const byte *data,
+                          size_t len)
+  {
+    ut_ad(len > 2);
+    byte *free_p= my_assume_aligned<2>
+      (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame);
+    const uint16_t free= mach_read_from_2(free_p);
+    if (UNIV_UNLIKELY(free < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE ||
+                      free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END))
+    {
+      ib::error() << "Not applying UNDO_APPEND due to corruption on "
+                  << block.page.id();
+      return true;
+    }
+
+    byte *p= block.frame + free;
+    mach_write_to_2(free_p, free + 4 + len);
+    memcpy(p, free_p, 2);
+    p+= 2;
+    memcpy(p, data, len);
+    p+= len;
+    mach_write_to_2(p, free);
+    return false;
+  }
+
+  /** The status of apply() */
+  enum apply_status {
+    /** The page was not affected */
+    APPLIED_NO= 0,
+    /** The page was modified */
+    APPLIED_YES,
+    /** The page was modified, affecting the encryption parameters */
+    APPLIED_TO_ENCRYPTION,
+    /** The page was modified, affecting the tablespace header */
+    APPLIED_TO_FSP_HEADER
+  };
+
+  /** Apply log to a page frame.
+  @param[in,out] block         buffer block
+  @param[in,out] last_offset   last byte offset, for same_page records
+  @return whether any log was applied to the page */
+  apply_status apply(const buf_block_t &block, uint16_t &last_offset) const
+  {
+    const byte * const recs= begin();
+    byte *const frame= block.page.zip.ssize
+      ? block.page.zip.data : block.frame;
+    const size_t size= block.physical_size();
+    apply_status applied= APPLIED_NO;
+
+    for (const byte *l= recs;;)
+    {
+      const byte b= *l++;
+      if (!b)
+        return applied;
+      ut_ad((b & 0x70) != RESERVED);
+      size_t rlen= b & 0xf;
+      if (!rlen)
+      {
+        const size_t lenlen= mlog_decode_varint_length(*l);
+        const uint32_t addlen= mlog_decode_varint(l);
+        ut_ad(addlen != MLOG_DECODE_ERROR);
+        rlen= addlen + 15 - lenlen;
+        l+= lenlen;
+      }
+      if (!(b & 0x80))
+      {
+        /* Skip the page identifier. It has already been validated. */
+        size_t idlen= mlog_decode_varint_length(*l);
+        ut_ad(idlen <= 5);
+        ut_ad(idlen < rlen);
+        ut_ad(mlog_decode_varint(l) == block.page.id().space());
+        l+= idlen;
+        rlen-= idlen;
+        idlen= mlog_decode_varint_length(*l);
+        ut_ad(idlen <= 5);
+        ut_ad(idlen <= rlen);
+        ut_ad(mlog_decode_varint(l) == block.page.id().page_no());
+        l+= idlen;
+        rlen-= idlen;
+        last_offset= 0;
+      }
+
+      switch (b & 0x70) {
+      case FREE_PAGE:
+        ut_ad(last_offset == 0);
+        goto next_not_same_page;
+      case INIT_PAGE:
+        if (UNIV_LIKELY(rlen == 0))
+        {
+          memset_aligned<UNIV_ZIP_SIZE_MIN>(frame, 0, size);
+          mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id().page_no());
+          memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8);
+          mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space());
+          last_offset= FIL_PAGE_TYPE;
+      next_after_applying:
+          if (applied == APPLIED_NO)
+            applied= APPLIED_YES;
+        }
+        else
+        {
+      record_corrupted:
+          if (!srv_force_recovery)
+          {
+            recv_sys.found_corrupt_log= true;
+            return applied;
+          }
+      next_not_same_page:
+          last_offset= 1; /* the next record must not be same_page */
+        }
+      next:
+        l+= rlen;
+        continue;
+      }
 
-/** Is recv_writer_thread active? */
-bool	recv_writer_thread_active;
+      ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) ==
+            block.page.id().page_no());
+      ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) ==
+            block.page.id().space());
+      ut_ad(last_offset <= 1 || last_offset > 8);
+      ut_ad(last_offset <= size);
+
+      switch (b & 0x70) {
+      case OPTION:
+        goto next;
+      case EXTENDED:
+        if (UNIV_UNLIKELY(block.page.id().page_no() < 3 ||
+                          block.page.zip.ssize))
+          goto record_corrupted;
+        static_assert(INIT_ROW_FORMAT_REDUNDANT == 0, "compatiblity");
+        static_assert(INIT_ROW_FORMAT_DYNAMIC == 1, "compatibility");
+        if (UNIV_UNLIKELY(!rlen))
+          goto record_corrupted;
+        switch (const byte subtype= *l) {
+          uint8_t ll;
+          size_t prev_rec, hdr_size;
+        default:
+          goto record_corrupted;
+        case INIT_ROW_FORMAT_REDUNDANT:
+        case INIT_ROW_FORMAT_DYNAMIC:
+          if (UNIV_UNLIKELY(rlen != 1))
+            goto record_corrupted;
+          page_create_low(&block, *l != INIT_ROW_FORMAT_REDUNDANT);
+          break;
+        case UNDO_INIT:
+          if (UNIV_UNLIKELY(rlen != 1))
+            goto record_corrupted;
+          trx_undo_page_init(block);
+          break;
+        case UNDO_APPEND:
+          if (UNIV_UNLIKELY(rlen <= 3))
+            goto record_corrupted;
+          if (undo_append(block, ++l, --rlen) && !srv_force_recovery)
+          {
+page_corrupted:
+            ib::error() << "Set innodb_force_recovery=1 to ignore corruption.";
+            recv_sys.found_corrupt_log= true;
+            return applied;
+          }
+          break;
+        case INSERT_HEAP_REDUNDANT:
+        case INSERT_REUSE_REDUNDANT:
+        case INSERT_HEAP_DYNAMIC:
+        case INSERT_REUSE_DYNAMIC:
+          if (UNIV_UNLIKELY(rlen < 2))
+            goto record_corrupted;
+          rlen--;
+          ll= mlog_decode_varint_length(*++l);
+          if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+            goto record_corrupted;
+          prev_rec= mlog_decode_varint(l);
+          ut_ad(prev_rec != MLOG_DECODE_ERROR);
+          rlen-= ll;
+          l+= ll;
+          ll= mlog_decode_varint_length(*l);
+          static_assert(INSERT_HEAP_REDUNDANT == 4, "compatibility");
+          static_assert(INSERT_REUSE_REDUNDANT == 5, "compatibility");
+          static_assert(INSERT_HEAP_DYNAMIC == 6, "compatibility");
+          static_assert(INSERT_REUSE_DYNAMIC == 7, "compatibility");
+          if (subtype & 2)
+          {
+            size_t shift= 0;
+            if (subtype & 1)
+            {
+              if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+                goto record_corrupted;
+              shift= mlog_decode_varint(l);
+              ut_ad(shift != MLOG_DECODE_ERROR);
+              rlen-= ll;
+              l+= ll;
+              ll= mlog_decode_varint_length(*l);
+            }
+            if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+              goto record_corrupted;
+            size_t enc_hdr_l= mlog_decode_varint(l);
+            ut_ad(enc_hdr_l != MLOG_DECODE_ERROR);
+            rlen-= ll;
+            l+= ll;
+            ll= mlog_decode_varint_length(*l);
+            if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+              goto record_corrupted;
+            size_t hdr_c= mlog_decode_varint(l);
+            ut_ad(hdr_c != MLOG_DECODE_ERROR);
+            rlen-= ll;
+            l+= ll;
+            ll= mlog_decode_varint_length(*l);
+            if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+              goto record_corrupted;
+            size_t data_c= mlog_decode_varint(l);
+            ut_ad(data_c != MLOG_DECODE_ERROR);
+            rlen-= ll;
+            l+= ll;
+            if (page_apply_insert_dynamic(block, subtype & 1, prev_rec,
+                                          shift, enc_hdr_l, hdr_c, data_c,
+                                          l, rlen) && !srv_force_recovery)
+              goto page_corrupted;
+          }
+          else
+          {
+            if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+              goto record_corrupted;
+            size_t header= mlog_decode_varint(l);
+            ut_ad(header != MLOG_DECODE_ERROR);
+            rlen-= ll;
+            l+= ll;
+            ll= mlog_decode_varint_length(*l);
+            if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+              goto record_corrupted;
+            size_t hdr_c= mlog_decode_varint(l);
+            ut_ad(hdr_c != MLOG_DECODE_ERROR);
+            rlen-= ll;
+            l+= ll;
+            ll= mlog_decode_varint_length(*l);
+            if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+              goto record_corrupted;
+            size_t data_c= mlog_decode_varint(l);
+            rlen-= ll;
+            l+= ll;
+            if (page_apply_insert_redundant(block, subtype & 1, prev_rec,
+                                            header, hdr_c, data_c,
+                                            l, rlen) && !srv_force_recovery)
+              goto page_corrupted;
+          }
+          break;
+        case DELETE_ROW_FORMAT_REDUNDANT:
+          if (UNIV_UNLIKELY(rlen < 2 || rlen > 4))
+            goto record_corrupted;
+          rlen--;
+          ll= mlog_decode_varint_length(*++l);
+          if (UNIV_UNLIKELY(ll != rlen))
+            goto record_corrupted;
+          if (page_apply_delete_redundant(block, mlog_decode_varint(l)) &&
+              !srv_force_recovery)
+            goto page_corrupted;
+          break;
+        case DELETE_ROW_FORMAT_DYNAMIC:
+          if (UNIV_UNLIKELY(rlen < 2))
+            goto record_corrupted;
+          rlen--;
+          ll= mlog_decode_varint_length(*++l);
+          if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+            goto record_corrupted;
+          prev_rec= mlog_decode_varint(l);
+          ut_ad(prev_rec != MLOG_DECODE_ERROR);
+          rlen-= ll;
+          l+= ll;
+          ll= mlog_decode_varint_length(*l);
+          if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+            goto record_corrupted;
+          hdr_size= mlog_decode_varint(l);
+          ut_ad(hdr_size != MLOG_DECODE_ERROR);
+          rlen-= ll;
+          l+= ll;
+          ll= mlog_decode_varint_length(*l);
+          if (UNIV_UNLIKELY(ll > 3 || ll != rlen))
+            goto record_corrupted;
+          if (page_apply_delete_dynamic(block, prev_rec, hdr_size,
+                                        mlog_decode_varint(l)) &&
+              !srv_force_recovery)
+            goto page_corrupted;
+          break;
+        }
+        last_offset= FIL_PAGE_TYPE;
+        goto next_after_applying;
+      case WRITE:
+      case MEMSET:
+      case MEMMOVE:
+        if (UNIV_UNLIKELY(last_offset == 1))
+          goto record_corrupted;
+        const size_t olen= mlog_decode_varint_length(*l);
+        if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
+          goto record_corrupted;
+        const uint32_t offset= mlog_decode_varint(l);
+        ut_ad(offset != MLOG_DECODE_ERROR);
+        static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+        if (UNIV_UNLIKELY(offset >= size))
+          goto record_corrupted;
+        if (UNIV_UNLIKELY(offset + last_offset < 8 ||
+                          offset + last_offset >= size))
+          goto record_corrupted;
+        last_offset= static_cast<uint16_t>(last_offset + offset);
+        l+= olen;
+        rlen-= olen;
+        size_t llen= rlen;
+        if ((b & 0x70) == WRITE)
+        {
+          if (UNIV_UNLIKELY(rlen + last_offset > size))
+            goto record_corrupted;
+          memcpy(frame + last_offset, l, llen);
+          if (UNIV_LIKELY(block.page.id().page_no()));
+          else if (llen == 11 + MY_AES_BLOCK_SIZE &&
+                   last_offset == FSP_HEADER_OFFSET + MAGIC_SZ +
+                   fsp_header_get_encryption_offset(block.zip_size()))
+            applied= APPLIED_TO_ENCRYPTION;
+          else if (last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 &&
+                   last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE)
+            applied= APPLIED_TO_FSP_HEADER;
+        next_after_applying_write:
+          ut_ad(llen + last_offset <= size);
+          last_offset= static_cast<uint16_t>(last_offset + llen);
+          goto next_after_applying;
+        }
+        llen= mlog_decode_varint_length(*l);
+        if (UNIV_UNLIKELY(llen > rlen || llen > 3))
+          goto record_corrupted;
+        const uint32_t len= mlog_decode_varint(l);
+        ut_ad(len != MLOG_DECODE_ERROR);
+        if (UNIV_UNLIKELY(len + last_offset > size))
+          goto record_corrupted;
+        l+= llen;
+        rlen-= llen;
+        llen= len;
+        if ((b & 0x70) == MEMSET)
+        {
+          ut_ad(rlen <= llen);
+          if (UNIV_UNLIKELY(rlen != 1))
+          {
+            size_t s;
+            for (s= 0; s < llen; s+= rlen)
+              memcpy(frame + last_offset + s, l, rlen);
+            memcpy(frame + last_offset + s, l, llen - s);
+          }
+          else
+            memset(frame + last_offset, *l, llen);
+          goto next_after_applying_write;
+        }
+        const size_t slen= mlog_decode_varint_length(*l);
+        if (UNIV_UNLIKELY(slen != rlen || slen > 3))
+          goto record_corrupted;
+        uint32_t s= mlog_decode_varint(l);
+        ut_ad(s != MLOG_DECODE_ERROR);
+        if (s & 1)
+          s= last_offset - (s >> 1) - 1;
+        else
+          s= last_offset + (s >> 1) + 1;
+        if (UNIV_LIKELY(s >= 8 && s + llen <= size))
+        {
+          memmove(frame + last_offset, frame + s, llen);
+          goto next_after_applying_write;
+        }
+      }
+      goto record_corrupted;
+    }
+  }
+};
+
+
+inline size_t log_phys_t::alloc_size(size_t len)
+{
+  return len + (1 + 2 + sizeof(log_phys_t));
+}
 
-#ifndef	DBUG_OFF
-/** Return string name of the redo log record type.
-@param[in]	type	record log record enum
-@return string name of record log record */
-static const char* get_mlog_string(mlog_id_t type);
-#endif /* !DBUG_OFF */
 
 /** Tablespace item during recovery */
 struct file_name_t {
-	/** Tablespace file name (MLOG_FILE_NAME) */
+	/** Tablespace file name (FILE_MODIFY) */
 	std::string	name;
 	/** Tablespace object (NULL if not valid or not found) */
-	fil_space_t*	space;
+	fil_space_t*	space = nullptr;
 
 	/** Tablespace status. */
 	enum fil_status {
@@ -144,11 +546,10 @@ struct file_name_t {
 	fil_status	status;
 
 	/** FSP_SIZE of tablespace */
-	ulint		size = 0;
+	uint32_t	size = 0;
 
-	/** the log sequence number of the last observed MLOG_INDEX_LOAD
-	record for the tablespace */
-	lsn_t		enable_lsn = 0;
+	/** Freed pages of tablespace */
+	range_set	freed_ranges;
 
 	/** Dummy flags before they have been read from the .ibd file */
 	static constexpr uint32_t initial_flags = FSP_FLAGS_FCRC32_MASK_MARKER;
@@ -157,16 +558,17 @@ struct file_name_t {
 
 	/** Constructor */
 	file_name_t(std::string name_, bool deleted)
-		: name(std::move(name_)), space(NULL),
-		status(deleted ? DELETED: NORMAL) {}
+		: name(std::move(name_)), status(deleted ? DELETED: NORMAL) {}
 
-	/** Report a MLOG_INDEX_LOAD operation, meaning that
-	mlog_init for any earlier LSN must be skipped.
-	@param lsn	log sequence number of the MLOG_INDEX_LOAD */
-	void mlog_index_load(lsn_t lsn)
-	{
-		if (enable_lsn < lsn) enable_lsn = lsn;
-	}
+  /** Add the freed pages */
+  void add_freed_page(uint32_t page_no) { freed_ranges.add_value(page_no); }
+
+  /** Remove the freed pages */
+  void remove_freed_page(uint32_t page_no)
+  {
+    if (freed_ranges.empty()) return;
+    freed_ranges.remove_value(page_no);
+  }
 };
 
 /** Map of dirty tablespaces during recovery */
@@ -178,55 +580,22 @@ typedef std::map<
 
 static recv_spaces_t	recv_spaces;
 
-/** States of recv_addr_t */
-enum recv_addr_state {
-	/** not yet processed */
-	RECV_NOT_PROCESSED,
-	/** not processed; the page will be reinitialized */
-	RECV_WILL_NOT_READ,
-	/** page is being read */
-	RECV_BEING_READ,
-	/** log records are being applied on the page */
-	RECV_BEING_PROCESSED,
-	/** log records have been applied on the page */
-	RECV_PROCESSED,
-	/** log records have been discarded because the tablespace
-	does not exist */
-	RECV_DISCARDED
-};
-
-/** Hashed page file address struct */
-struct recv_addr_t{
-	/** recovery state of the page */
-	recv_addr_state	state;
-	/** tablespace identifier */
-	unsigned	space:32;
-	/** page number */
-	unsigned	page_no:32;
-	/** list of log records for this page */
-	UT_LIST_BASE_NODE_T(recv_t) rec_list;
-	/** hash node in the hash bucket chain */
-	hash_node_t	addr_hash;
-};
-
-/** Report optimized DDL operation (without redo log),
-corresponding to MLOG_INDEX_LOAD.
-@param[in]	space_id	tablespace identifier
-*/
-void (*log_optimized_ddl_op)(ulint space_id);
+/** The last parsed FILE_RENAME records */
+static std::map<uint32_t,std::string> renamed_spaces;
 
 /** Report an operation to create, delete, or rename a file during backup.
 @param[in]	space_id	tablespace identifier
-@param[in]	flags		tablespace flags (NULL if not create)
+@param[in]	create		whether the file is being created
 @param[in]	name		file name (not NUL-terminated)
 @param[in]	len		length of name, in bytes
 @param[in]	new_name	new file name (NULL if not rename)
 @param[in]	new_len		length of new_name, in bytes (0 if NULL) */
-void (*log_file_op)(ulint space_id, const byte* flags,
+void (*log_file_op)(ulint space_id, bool create,
 		    const byte* name, ulint len,
 		    const byte* new_name, ulint new_len);
 
-/** Information about initializing page contents during redo log processing */
+/** Information about initializing page contents during redo log processing.
+FIXME: Rely on recv_sys.pages! */
 class mlog_init_t
 {
 public:
@@ -237,10 +606,8 @@ public:
 		lsn_t lsn;
 		/** Whether btr_page_create() avoided a read of the page.
 
-		At the end of the last recovery batch, ibuf_merge()
-		will invoke change buffer merge for pages that reside
-		in the buffer pool. (In the last batch, loading pages
-		would trigger change buffer merge.) */
+		At the end of the last recovery batch, mark_ibuf_exist()
+		will mark pages for which this flag is set. */
 		bool created;
 	};
 
@@ -250,23 +617,24 @@ private:
 			 ut_allocator<std::pair<const page_id_t, init> > >
 		map;
 	/** Map of page initialization operations.
-	FIXME: Merge this to recv_sys.addr_hash! */
+	FIXME: Merge this to recv_sys.pages! */
 	map inits;
 public:
 	/** Record that a page will be initialized by the redo log.
-	@param[in]	space		tablespace identifier
-	@param[in]	page_no		page number
-	@param[in]	lsn		log sequence number */
-	void add(ulint space, ulint page_no, lsn_t lsn)
+	@param[in]	page_id		page identifier
+	@param[in]	lsn		log sequence number
+	@return whether the state was changed */
+	bool add(const page_id_t page_id, lsn_t lsn)
 	{
 		ut_ad(mutex_own(&recv_sys.mutex));
 		const init init = { lsn, false };
 		std::pair<map::iterator, bool> p = inits.insert(
-			map::value_type(page_id_t(space, page_no), init));
+			map::value_type(page_id, init));
 		ut_ad(!p.first->second.created);
-		if (!p.second && p.first->second.lsn < init.lsn) {
-			p.first->second = init;
-		}
+		if (p.second) return true;
+		if (p.first->second.lsn >= init.lsn) return false;
+		p.first->second = init;
+		return true;
 	}
 
 	/** Get the last stored lsn of the page id and its respective
@@ -281,6 +649,17 @@ public:
 		return inits.find(page_id)->second;
 	}
 
+	/** Determine if a page will be initialized or freed after a time.
+	@param page_id      page identifier
+	@param lsn          log sequence number
+	@return whether page_id will be freed or initialized after lsn */
+	bool will_avoid_read(page_id_t page_id, lsn_t lsn) const
+	{
+		ut_ad(mutex_own(&recv_sys.mutex));
+		auto i= inits.find(page_id);
+		return i != inits.end() && i->second.lsn > lsn;
+	}
+
 	/** At the end of each recovery batch, reset the 'created' flags. */
 	void reset()
 	{
@@ -291,22 +670,13 @@ public:
 		}
 	}
 
-	/** On the last recovery batch, merge buffered changes to those
-	pages that were initialized by buf_page_create() and still reside
-	in the buffer pool. Stale pages are not allowed in the buffer pool.
-
-	Note: When MDEV-14481 implements redo log apply in the
-	background, we will have to ensure that buf_page_get_gen()
-	will not deliver stale pages to users (pages on which the
-	change buffer was not merged yet).  Normally, the change
-	buffer merge is performed on I/O completion. Maybe, add a
-	flag to buf_page_t and perform the change buffer merge on
-	the first actual access?
+	/** On the last recovery batch, mark whether there exist
+	buffered changes for the pages that were initialized
+	by buf_page_create() and still reside in the buffer pool.
 	@param[in,out]	mtr	dummy mini-transaction */
-	void ibuf_merge(mtr_t& mtr)
+	void mark_ibuf_exist(mtr_t& mtr)
 	{
 		ut_ad(mutex_own(&recv_sys.mutex));
-		ut_ad(!recv_no_ibuf_operations);
 		mtr.start();
 
 		for (const map::value_type& i : inits) {
@@ -314,13 +684,32 @@ public:
 				continue;
 			}
 			if (buf_block_t* block = buf_page_get_low(
-				    i.first, 0, RW_X_LATCH, NULL,
+				    i.first, 0, RW_X_LATCH, nullptr,
 				    BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
-				    &mtr, NULL)) {
+				    &mtr, nullptr, false)) {
+				if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+					switch (fil_page_get_type(
+							block->page.zip.data)) {
+					case FIL_PAGE_INDEX:
+					case FIL_PAGE_RTREE:
+						if (page_zip_decompress(
+							    &block->page.zip,
+							    block->frame,
+							    true)) {
+							break;
+						}
+						ib::error() << "corrupted "
+							    << block->page.id();
+					}
+				}
+				if (recv_no_ibuf_operations) {
+					mtr.commit();
+					mtr.start();
+					continue;
+				}
 				mutex_exit(&recv_sys.mutex);
-				ibuf_merge_or_delete_for_page(
-					block, i.first,
-					block->zip_size());
+				block->page.ibuf_exist = ibuf_page_exists(
+					block->page.id(), block->zip_size());
 				mtr.commit();
 				mtr.start();
 				mutex_enter(&recv_sys.mutex);
@@ -336,92 +725,94 @@ public:
 
 static mlog_init_t mlog_init;
 
-/** Process a MLOG_CREATE2 record that indicates that a tablespace
-is being shrunk in size.
-@param[in]	space_id	tablespace identifier
-@param[in]	pages		trimmed size of the file, in pages
-@param[in]	lsn		log sequence number of the operation */
-static void recv_addr_trim(ulint space_id, unsigned pages, lsn_t lsn)
+/** Process a record that indicates that a tablespace is
+being shrunk in size.
+@param page_id	first page identifier that is not in the file
+@param lsn	log sequence number of the shrink operation */
+inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
 {
-	DBUG_ENTER("recv_addr_trim");
+	DBUG_ENTER("recv_sys_t::trim");
 	DBUG_LOG("ib_log",
 		 "discarding log beyond end of tablespace "
-		 << page_id_t(space_id, pages) << " before LSN " << lsn);
-	ut_ad(mutex_own(&recv_sys.mutex));
-	for (ulint i = recv_sys.addr_hash->n_cells; i--; ) {
-		hash_cell_t* const cell = hash_get_nth_cell(
-			recv_sys.addr_hash, i);
-		for (recv_addr_t* addr = static_cast<recv_addr_t*>(cell->node),
-			     *next;
-		     addr; addr = next) {
-			next = static_cast<recv_addr_t*>(addr->addr_hash);
-
-			if (addr->space != space_id || addr->page_no < pages) {
-				continue;
-			}
-
-			for (recv_t* recv = UT_LIST_GET_FIRST(addr->rec_list);
-			     recv; ) {
-				recv_t* n = UT_LIST_GET_NEXT(rec_list, recv);
-				if (recv->start_lsn < lsn) {
-					DBUG_PRINT("ib_log",
-						   ("Discarding %s for"
-						    " page %u:%u at " LSN_PF,
-						    get_mlog_string(
-							    recv->type),
-						    addr->space, addr->page_no,
-						    recv->start_lsn));
-					UT_LIST_REMOVE(addr->rec_list, recv);
-				}
-				recv = n;
-			}
+		 << page_id << " before LSN " << lsn);
+	ut_ad(mutex_own(&mutex));
+	for (recv_sys_t::map::iterator p = pages.lower_bound(page_id);
+	     p != pages.end() && p->first.space() == page_id.space();) {
+		recv_sys_t::map::iterator r = p++;
+		if (r->second.trim(lsn)) {
+			pages.erase(r);
 		}
 	}
-	if (fil_space_t* space = fil_space_get(space_id)) {
+	if (fil_space_t* space = fil_space_get(page_id.space())) {
 		ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
 		fil_node_t* file = UT_LIST_GET_FIRST(space->chain);
 		ut_ad(file->is_open());
 		os_file_truncate(file->name, file->handle,
-				 os_offset_t(pages) << srv_page_size_shift,
-				 true);
+				 os_offset_t{page_id.page_no()}
+				 << srv_page_size_shift, true);
 	}
 	DBUG_VOID_RETURN;
 }
 
-/** Process a file name from a MLOG_FILE_* record.
+void recv_sys_t::open_log_files_if_needed()
+{
+  if (!recv_sys.files.empty())
+    return;
+
+  for (auto &&path : get_existing_log_files_paths())
+  {
+    recv_sys.files.emplace_back(std::move(path));
+    ut_a(recv_sys.files.back().open(true) == DB_SUCCESS);
+  }
+}
+
+void recv_sys_t::read(os_offset_t total_offset, span<byte> buf)
+{
+  open_log_files_if_needed();
+
+  size_t file_idx= static_cast<size_t>(total_offset / log_sys.log.file_size);
+  os_offset_t offset= total_offset % log_sys.log.file_size;
+  dberr_t err= recv_sys.files[file_idx].read(offset, buf);
+  ut_a(err == DB_SUCCESS);
+}
+
+inline size_t recv_sys_t::files_size()
+{
+  open_log_files_if_needed();
+  return files.size();
+}
+
+/** Process a file name from a FILE_* record.
 @param[in,out]	name		file name
 @param[in]	len		length of the file name
 @param[in]	space_id	the tablespace ID
-@param[in]	deleted		whether this is a MLOG_FILE_DELETE record */
+@param[in]	deleted		whether this is a FILE_DELETE record */
 static
 void
-fil_name_process(
-	char*	name,
-	ulint	len,
-	ulint	space_id,
-	bool	deleted)
+fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
 {
 	if (srv_operation == SRV_OPERATION_BACKUP) {
 		return;
 	}
 
 	ut_ad(srv_operation == SRV_OPERATION_NORMAL
-	      || is_mariabackup_restore_or_export());
+	      || srv_operation == SRV_OPERATION_RESTORE
+	      || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
 
 	/* We will also insert space=NULL into the map, so that
-	further checks can ensure that a MLOG_FILE_NAME record was
+	further checks can ensure that a FILE_MODIFY record was
 	scanned before applying any page records for the space_id. */
 
 	os_normalize_path(name);
-	file_name_t	fname(std::string(name, len - 1), deleted);
-	std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.insert(
-		std::make_pair(space_id, fname));
+	const file_name_t fname(std::string(name, len), deleted);
+	std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.emplace(
+		space_id, fname);
 	ut_ad(p.first->first == space_id);
 
 	file_name_t&	f = p.first->second;
 
 	if (deleted) {
-		/* Got MLOG_FILE_DELETE */
+		/* Got FILE_DELETE */
 
 		if (!p.second && f.status != file_name_t::DELETED) {
 			f.status = file_name_t::DELETED;
@@ -432,7 +823,7 @@ fil_name_process(
 		}
 
 		ut_ad(f.space == NULL);
-	} else if (p.second // the first MLOG_FILE_NAME or MLOG_FILE_RENAME2
+	} else if (p.second // the first FILE_MODIFY or FILE_RENAME
 		   || f.name != fname.name) {
 		fil_space_t*	space;
 
@@ -473,7 +864,7 @@ same_space:
 		case FIL_LOAD_NOT_FOUND:
 			/* No matching tablespace was found; maybe it
 			was renamed, and we will find a subsequent
-			MLOG_FILE_* record. */
+			FILE_* record. */
 			ut_ad(space == NULL);
 
 			if (srv_force_recovery) {
@@ -527,296 +918,33 @@ same_space:
 	}
 }
 
-/** Parse or process a MLOG_FILE_* record.
-@param[in]	ptr		redo log record
-@param[in]	end		end of the redo log buffer
-@param[in]	page_id		first page number in the file
-@param[in]	type		MLOG_FILE_NAME or MLOG_FILE_DELETE
-or MLOG_FILE_CREATE2 or MLOG_FILE_RENAME2
-@param[in]	apply		whether to apply the record
-@return pointer to next redo log record
-@retval NULL if this log record was truncated */
-static
-byte*
-fil_name_parse(
-	byte*		ptr,
-	const byte*	end,
-	const page_id_t	page_id,
-	mlog_id_t	type,
-	bool		apply)
-{
-	if (type == MLOG_FILE_CREATE2) {
-		if (end < ptr + 4) {
-			return(NULL);
-		}
-		ptr += 4;
-	}
-
-	if (end < ptr + 2) {
-		return(NULL);
-	}
-
-	ulint	len = mach_read_from_2(ptr);
-	ptr += 2;
-	if (end < ptr + len) {
-		return(NULL);
-	}
-
-	/* MLOG_FILE_* records should only be written for
-	user-created tablespaces. The name must be long enough
-	and end in .ibd. */
-	bool corrupt = is_predefined_tablespace(page_id.space())
-		|| len < sizeof "/a.ibd\0"
-		|| (!page_id.page_no() != !memcmp(ptr + len - 5, DOT_IBD, 5));
-
-	if (!corrupt && !memchr(ptr, OS_PATH_SEPARATOR, len)) {
-		if (byte* c = static_cast<byte*>
-		    (memchr(ptr, OS_PATH_SEPARATOR_ALT, len))) {
-			ut_ad(c >= ptr);
-			ut_ad(c < ptr + len);
-			do {
-				*c = OS_PATH_SEPARATOR;
-			} while ((c = static_cast<byte*>
-				  (memchr(ptr, OS_PATH_SEPARATOR_ALT,
-					  len - ulint(c - ptr)))) != NULL);
-		} else {
-			corrupt = true;
-		}
-	}
-
-	byte*	end_ptr	= ptr + len;
-
-	switch (type) {
-	default:
-		ut_ad(0); // the caller checked this
-		/* fall through */
-	case MLOG_FILE_NAME:
-		if (UNIV_UNLIKELY(corrupt)) {
-			ib::error() << "MLOG_FILE_NAME incorrect:" << ptr;
-			recv_sys.found_corrupt_log = true;
-			break;
-		}
-
-		fil_name_process(
-			reinterpret_cast<char*>(ptr), len, page_id.space(),
-			false);
-		break;
-	case MLOG_FILE_DELETE:
-		if (UNIV_UNLIKELY(corrupt)) {
-			ib::error() << "MLOG_FILE_DELETE incorrect:" << ptr;
-			recv_sys.found_corrupt_log = true;
-			break;
-		}
-
-		fil_name_process(reinterpret_cast<char*>(ptr), len,
-				 page_id.space(), true);
-		/* fall through */
-	case MLOG_FILE_CREATE2:
-		if (page_id.page_no()) {
-			ut_ad(page_id.page_no()
-			      == SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
-			ut_a(srv_is_undo_tablespace(page_id.space()));
-			compile_time_assert(
-				UT_ARR_SIZE(recv_sys.truncated_undo_spaces)
-				== TRX_SYS_MAX_UNDO_SPACES);
-			recv_sys_t::trunc& t = recv_sys.truncated_undo_spaces[
-				page_id.space() - srv_undo_space_id_start];
-			t.lsn = recv_sys.recovered_lsn;
-			t.pages = uint32_t(page_id.page_no());
-		} else if (log_file_op) {
-			log_file_op(page_id.space(),
-				    type == MLOG_FILE_CREATE2 ? ptr - 4 : NULL,
-				    ptr, len, NULL, 0);
-		}
-		break;
-	case MLOG_FILE_RENAME2:
-		if (UNIV_UNLIKELY(corrupt)) {
-			ib::error() << "MLOG_FILE_RENAME2 incorrect:" << ptr;
-			recv_sys.found_corrupt_log = true;
-		}
-
-		/* The new name follows the old name. */
-		byte*	new_name = end_ptr + 2;
-		if (end < new_name) {
-			return(NULL);
-		}
-
-		ulint	new_len = mach_read_from_2(end_ptr);
-
-		if (end < end_ptr + 2 + new_len) {
-			return(NULL);
-		}
-
-		end_ptr += 2 + new_len;
-
-		corrupt = corrupt
-			|| new_len < sizeof "/a.ibd\0"
-			|| memcmp(new_name + new_len - 5, DOT_IBD, 5) != 0;
-
-		if (!corrupt && !memchr(new_name, OS_PATH_SEPARATOR, new_len)) {
-			if (byte* c = static_cast<byte*>
-			    (memchr(new_name, OS_PATH_SEPARATOR_ALT,
-				    new_len))) {
-				ut_ad(c >= new_name);
-				ut_ad(c < new_name + new_len);
-				do {
-					*c = OS_PATH_SEPARATOR;
-				} while ((c = static_cast<byte*>
-					  (memchr(ptr, OS_PATH_SEPARATOR_ALT,
-						  new_len
-						  - ulint(c - new_name))))
-					 != NULL);
-			} else {
-				corrupt = true;
-			}
-		}
-
-		if (UNIV_UNLIKELY(corrupt)) {
-			ib::error() << "MLOG_FILE_RENAME2 new_name incorrect:" << ptr
-				    << " new_name: " << new_name;
-			recv_sys.found_corrupt_log = true;
-			break;
-		}
-
-		fil_name_process(
-			reinterpret_cast<char*>(ptr), len,
-			page_id.space(), false);
-		fil_name_process(
-			reinterpret_cast<char*>(new_name), new_len,
-			page_id.space(), false);
-
-		if (log_file_op) {
-			log_file_op(page_id.space(), NULL,
-				    ptr, len, new_name, new_len);
-		}
-
-		if (!apply) {
-			break;
-		}
-		if (!fil_op_replay_rename(
-			    page_id.space(), page_id.page_no(),
-			    reinterpret_cast<const char*>(ptr),
-			    reinterpret_cast<const char*>(new_name))) {
-			recv_sys.found_corrupt_fs = true;
-		}
-	}
-
-	return(end_ptr);
-}
-
 /** Clean up after recv_sys_t::create() */
 void recv_sys_t::close()
 {
-	ut_ad(this == &recv_sys);
-	ut_ad(!recv_writer_thread_active);
-
-	if (is_initialised()) {
-		dblwr.pages.clear();
-
-		if (addr_hash) {
-			hash_table_free(addr_hash);
-			addr_hash = NULL;
-		}
-
-		if (heap) {
-			mem_heap_free(heap);
-			heap = NULL;
-		}
-
-		if (flush_start) {
-			os_event_destroy(flush_start);
-		}
-
-		if (flush_end) {
-			os_event_destroy(flush_end);
-		}
-
-		if (buf) {
-			ut_free_dodump(buf, buf_size);
-			buf = NULL;
-		}
+  ut_ad(this == &recv_sys);
 
-		buf_size = 0;
-		mutex_free(&writer_mutex);
-		mutex_free(&mutex);
-	}
-
-	recv_spaces.clear();
-	mlog_init.clear();
-}
-
-/************************************************************
-Reset the state of the recovery system variables. */
-void
-recv_sys_var_init(void)
-/*===================*/
-{
-	recv_recovery_on = false;
-	recv_needed_recovery = false;
-	recv_lsn_checks_on = false;
-	recv_no_ibuf_operations = false;
-	recv_previous_parsed_rec_type = MLOG_SINGLE_REC_FLAG;
-	recv_previous_parsed_rec_offset	= 0;
-	recv_previous_parsed_rec_is_multi = 0;
-	recv_max_page_lsn = 0;
-}
-
-/******************************************************************//**
-recv_writer thread tasked with flushing dirty pages from the buffer
-pools.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(recv_writer_thread)(
-/*===============================*/
-	void*	arg MY_ATTRIBUTE((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	my_thread_init();
-	ut_ad(!srv_read_only_mode);
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(recv_writer_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "recv_writer thread running, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-
-		/* Wait till we get a signal to clean the LRU list.
-		Bounded by max wait time of 100ms. */
-		int64_t      sig_count = os_event_reset(buf_flush_event);
-		os_event_wait_time_low(buf_flush_event, 100000, sig_count);
-
-		mutex_enter(&recv_sys.writer_mutex);
-
-		if (!recv_recovery_is_on()) {
-			mutex_exit(&recv_sys.writer_mutex);
-			break;
-		}
-
-		/* Flush pages from end of LRU if required */
-		os_event_reset(recv_sys.flush_end);
-		recv_sys.flush_type = BUF_FLUSH_LRU;
-		os_event_set(recv_sys.flush_start);
-		os_event_wait(recv_sys.flush_end);
+  if (is_initialised())
+  {
+    dblwr.pages.clear();
+    ut_d(mutex_enter(&mutex));
+    clear();
+    ut_d(mutex_exit(&mutex));
 
-		mutex_exit(&recv_sys.writer_mutex);
-	}
+    if (buf)
+    {
+      ut_free_dodump(buf, RECV_PARSING_BUF_SIZE);
+      buf= nullptr;
+    }
 
-	recv_writer_thread_active = false;
+    last_stored_lsn= 0;
+    mutex_free(&mutex);
+  }
 
-	my_thread_end();
-	/* We count the number of threads in os_thread_exit().
-	A created thread should always use that to exit and not
-	use return() to exit. */
-	os_thread_exit();
+  recv_spaces.clear();
+  renamed_spaces.clear();
+  mlog_init.clear();
 
-	OS_THREAD_DUMMY_RETURN;
+  close_files();
 }
 
 /** Initialize the redo log recovery subsystem. */
@@ -824,24 +952,13 @@ void recv_sys_t::create()
 {
 	ut_ad(this == &recv_sys);
 	ut_ad(!is_initialised());
-	ut_ad(!flush_start);
-	ut_ad(!flush_end);
 	mutex_create(LATCH_ID_RECV_SYS, &mutex);
-	mutex_create(LATCH_ID_RECV_WRITER, &writer_mutex);
-
-	heap = mem_heap_create_typed(256, MEM_HEAP_FOR_RECV_SYS);
-
-	if (!srv_read_only_mode) {
-		flush_start = os_event_create(0);
-		flush_end = os_event_create(0);
-	}
 
-	flush_type = BUF_FLUSH_LRU;
 	apply_log_recs = false;
 	apply_batch_on = false;
 
-	buf = static_cast<byte*>(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE));
-	buf_size = RECV_PARSING_BUF_SIZE;
+	buf = static_cast<byte*>(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE,
+						    PSI_INSTRUMENT_ME));
 	len = 0;
 	parse_start_lsn = 0;
 	scanned_lsn = 0;
@@ -852,68 +969,144 @@ void recv_sys_t::create()
 	found_corrupt_fs = false;
 	mlog_checkpoint_lsn = 0;
 
-	addr_hash = hash_create(buf_pool_get_curr_size() / 512);
-	n_addrs = 0;
 	progress_time = time(NULL);
 	recv_max_page_lsn = 0;
 
 	memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces);
-	last_stored_lsn = 0;
+	last_stored_lsn = 1;
+	UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU);
 }
 
-/** Empty a fully processed set of stored redo log records. */
-inline void recv_sys_t::empty()
+/** Clear a fully processed set of stored redo log records. */
+inline void recv_sys_t::clear()
 {
-	ut_ad(mutex_own(&mutex));
-	ut_a(n_addrs == 0);
-
-	hash_table_free(addr_hash);
-	mem_heap_empty(heap);
+  ut_ad(mutex_own(&mutex));
+  apply_log_recs= false;
+  apply_batch_on= false;
+  ut_ad(!after_apply || !UT_LIST_GET_LAST(blocks));
+  pages.clear();
 
-	addr_hash = hash_create(buf_pool_get_curr_size() / 512);
+  for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; )
+  {
+    buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block);
+    ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+    UT_LIST_REMOVE(blocks, block);
+    MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+    buf_block_free(block);
+    block= prev_block;
+  }
 }
 
 /** Free most recovery data structures. */
 void recv_sys_t::debug_free()
 {
-	ut_ad(this == &recv_sys);
-	ut_ad(is_initialised());
-	mutex_enter(&mutex);
+  ut_ad(this == &recv_sys);
+  ut_ad(is_initialised());
+  mutex_enter(&mutex);
 
-	hash_table_free(addr_hash);
-	mem_heap_free(heap);
-	ut_free_dodump(buf, buf_size);
+  recovery_on= false;
+  pages.clear();
+  ut_free_dodump(buf, RECV_PARSING_BUF_SIZE);
 
-	buf = NULL;
-	heap = NULL;
-	addr_hash = NULL;
+  buf= nullptr;
 
-	/* wake page cleaner up to progress */
-	if (!srv_read_only_mode) {
-		ut_ad(!recv_recovery_is_on());
-		ut_ad(!recv_writer_thread_active);
-		os_event_reset(buf_flush_event);
-		os_event_set(flush_start);
-	}
+  mutex_exit(&mutex);
+}
+
+inline void *recv_sys_t::alloc(size_t len)
+{
+  ut_ad(mutex_own(&mutex));
+  ut_ad(len);
+  ut_ad(len <= srv_page_size);
+
+  buf_block_t *block= UT_LIST_GET_FIRST(blocks);
+  if (UNIV_UNLIKELY(!block))
+  {
+create_block:
+    block= buf_block_alloc();
+    block->page.access_time= 1U << 16 |
+      ut_calc_align<uint16_t>(static_cast<uint16_t>(len), ALIGNMENT);
+    static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
+    UT_LIST_ADD_FIRST(blocks, block);
+    MEM_MAKE_ADDRESSABLE(block->frame, len);
+    MEM_NOACCESS(block->frame + len, srv_page_size - len);
+    return my_assume_aligned<ALIGNMENT>(block->frame);
+  }
 
-	mutex_exit(&mutex);
+  size_t free_offset= static_cast<uint16_t>(block->page.access_time);
+  ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT));
+  if (UNIV_UNLIKELY(!free_offset))
+  {
+    ut_ad(srv_page_size == 65536);
+    goto create_block;
+  }
+  ut_ad(free_offset <= srv_page_size);
+  free_offset+= len;
+
+  if (free_offset > srv_page_size)
+    goto create_block;
+
+  block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
+    ut_calc_align<uint16_t>(static_cast<uint16_t>(free_offset), ALIGNMENT);
+  MEM_MAKE_ADDRESSABLE(block->frame + free_offset - len, len);
+  return my_assume_aligned<ALIGNMENT>(block->frame + free_offset - len);
 }
 
+
+/** Free a redo log snippet.
+@param data buffer returned by alloc() */
+inline void recv_sys_t::free(const void *data)
+{
+  ut_ad(!ut_align_offset(data, ALIGNMENT));
+  data= page_align(data);
+  ut_ad(mutex_own(&mutex));
+
+  /* MDEV-14481 FIXME: To prevent a race condition with buf_pool.resize(),
+  we must acquire and hold the buffer pool mutex here. */
+  ut_ad(!buf_pool.resize_in_progress());
+
+  auto *chunk= buf_pool.chunks;
+  for (auto i= buf_pool.n_chunks; i--; chunk++)
+  {
+    if (data < chunk->blocks->frame)
+      continue;
+    const size_t offs= (reinterpret_cast<const byte*>(data) -
+                        chunk->blocks->frame) >> srv_page_size_shift;
+    if (offs >= chunk->size)
+      continue;
+    buf_block_t *block= &chunk->blocks[offs];
+    ut_ad(block->frame == data);
+    ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+    ut_ad(static_cast<uint16_t>(block->page.access_time - 1) <
+          srv_page_size);
+    ut_ad(block->page.access_time >= 1U << 16);
+    if (!((block->page.access_time -= 1U << 16) >> 16))
+    {
+      UT_LIST_REMOVE(blocks, block);
+      MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+      buf_block_free(block);
+    }
+    return;
+  }
+  ut_ad(0);
+}
+
+
 /** Read a log segment to log_sys.buf.
 @param[in,out]	start_lsn	in: read area start,
 out: the last read valid lsn
 @param[in]	end_lsn		read area end
 @return	whether no invalid blocks (e.g checksum mismatch) were found */
-bool log_t::files::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn)
+bool log_t::file::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn)
 {
 	ulint	len;
 	bool success = true;
-	ut_ad(log_sys.mutex.is_owned());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	ut_ad(!(*start_lsn % OS_FILE_LOG_BLOCK_SIZE));
 	ut_ad(!(end_lsn % OS_FILE_LOG_BLOCK_SIZE));
 	byte* buf = log_sys.buf;
 loop:
-	lsn_t source_offset = calc_lsn_offset(*start_lsn);
+	lsn_t source_offset = calc_lsn_offset_old(*start_lsn);
 
 	ut_a(end_lsn - *start_lsn <= ULINT_MAX);
 	len = (ulint) (end_lsn - *start_lsn);
@@ -929,17 +1122,9 @@ loop:
 
 	log_sys.n_log_ios++;
 
-	MONITOR_INC(MONITOR_LOG_IO);
-
 	ut_a((source_offset >> srv_page_size_shift) <= ULINT_MAX);
 
-	const ulint	page_no = ulint(source_offset >> srv_page_size_shift);
-
-	fil_io(IORequestLogRead, true,
-	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
-	       0,
-	       ulint(source_offset & (srv_page_size - 1)),
-	       len, buf, NULL);
+	recv_sys.read(source_offset, {buf, len});
 
 	for (ulint l = 0; l < len; l += OS_FILE_LOG_BLOCK_SIZE,
 		     buf += OS_FILE_LOG_BLOCK_SIZE,
@@ -958,36 +1143,34 @@ fail:
 			break;
 		}
 
-		if (innodb_log_checksums || is_encrypted()) {
-			ulint crc = log_block_calc_checksum_crc32(buf);
-			ulint cksum = log_block_get_checksum(buf);
-
-			DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", {
-					 static int block_counter;
-					 if (block_counter++ == 0) {
-						 cksum = crc + 1;
-					 }
-			 });
-
-			DBUG_EXECUTE_IF("log_checksum_mismatch", { cksum = crc + 1; });
-
-			if (crc != cksum) {
-				ib::error_or_warn(srv_operation != SRV_OPERATION_BACKUP)
-					    << "Invalid log block checksum."
-					    << " block: " << block_number
-					    << " checkpoint no: "
-					    << log_block_get_checkpoint_no(buf)
-					    << " expected: " << crc
-					    << " found: " << cksum;
-				goto fail;
-			}
+		ulint crc = log_block_calc_checksum_crc32(buf);
+		ulint cksum = log_block_get_checksum(buf);
 
-			if (is_encrypted()
-			    && !log_crypt(buf, *start_lsn,
-					  OS_FILE_LOG_BLOCK_SIZE,
-					  LOG_DECRYPT)) {
-				goto fail;
-			}
+		DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", {
+				static int block_counter;
+				if (block_counter++ == 0) {
+					cksum = crc + 1;
+				}
+			});
+
+		DBUG_EXECUTE_IF("log_checksum_mismatch", { cksum = crc + 1; });
+
+		if (UNIV_UNLIKELY(crc != cksum)) {
+			ib::error_or_warn(srv_operation!=SRV_OPERATION_BACKUP)
+				<< "Invalid log block checksum. block: "
+				<< block_number
+				<< " checkpoint no: "
+				<< log_block_get_checkpoint_no(buf)
+				<< " expected: " << crc
+				<< " found: " << cksum;
+			goto fail;
+		}
+
+		if (is_encrypted()
+		    && !log_crypt(buf, *start_lsn,
+				  OS_FILE_LOG_BLOCK_SIZE,
+				  LOG_DECRYPT)) {
+			goto fail;
 		}
 
 		ulint dl = log_block_get_data_len(buf);
@@ -1041,8 +1224,8 @@ recv_synchronize_groups()
 	checkpoint info on disk certain */
 
 	if (!srv_read_only_mode) {
-		log_write_checkpoint_info(true, 0);
-		log_mutex_enter();
+		log_write_checkpoint_info(0);
+		mysql_mutex_lock(&log_sys.mutex);
 	}
 }
 
@@ -1058,127 +1241,221 @@ recv_check_log_header_checksum(
 	       == log_block_calc_checksum_crc32(buf));
 }
 
-/** Find the latest checkpoint in the format-0 log header.
-@param[out]	max_field	LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
-@return error code or DB_SUCCESS */
-static MY_ATTRIBUTE((warn_unused_result))
-dberr_t
-recv_find_max_checkpoint_0(ulint* max_field)
+static bool redo_file_sizes_are_correct()
 {
-	ib_uint64_t	max_no = 0;
-	ib_uint64_t	checkpoint_no;
-	byte*		buf	= log_sys.checkpoint_buf;
+  auto paths= get_existing_log_files_paths();
+  auto get_size= [](const std::string &path) {
+    return os_file_get_size(path.c_str()).m_total_size;
+  };
+  os_offset_t size= get_size(paths[0]);
+
+  auto it=
+      std::find_if(paths.begin(), paths.end(), [&](const std::string &path) {
+        return get_size(path) != size;
+      });
+
+  if (it == paths.end())
+    return true;
 
-	ut_ad(log_sys.log.format == 0);
+  ib::error() << "Log file " << *it << " is of different size "
+              << get_size(*it) << " bytes than other log files " << size
+              << " bytes!";
+  return false;
+}
 
-	/** Offset of the first checkpoint checksum */
-	static const uint CHECKSUM_1 = 288;
-	/** Offset of the second checkpoint checksum */
-	static const uint CHECKSUM_2 = CHECKSUM_1 + 4;
-	/** Most significant bits of the checkpoint offset */
-	static const uint OFFSET_HIGH32 = CHECKSUM_2 + 12;
-	/** Least significant bits of the checkpoint offset */
-	static const uint OFFSET_LOW32 = 16;
+/** Calculate the checksum for a log block using the pre-10.2.2 algorithm. */
+inline uint32_t log_block_calc_checksum_format_0(const byte *b)
+{
+  uint32_t sum= 1;
+  const byte *const end= &b[512 - 4];
 
-	bool found = false;
+  for (uint32_t sh= 0; b < end; )
+  {
+    sum&= 0x7FFFFFFFUL;
+    sum+= uint32_t{*b} << sh++;
+    sum+= *b++;
+    if (sh > 24)
+      sh= 0;
+  }
 
-	for (ulint field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
-	     field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
-		log_header_read(field);
-
-		if (static_cast<uint32_t>(ut_fold_binary(buf, CHECKSUM_1))
-		    != mach_read_from_4(buf + CHECKSUM_1)
-		    || static_cast<uint32_t>(
-			    ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
-					   CHECKSUM_2 - LOG_CHECKPOINT_LSN))
-		    != mach_read_from_4(buf + CHECKSUM_2)) {
-			DBUG_LOG("ib_log",
-				 "invalid pre-10.2.2 checkpoint " << field);
-			continue;
-		}
+  return sum;
+}
 
-		checkpoint_no = mach_read_from_8(
-			buf + LOG_CHECKPOINT_NO);
+/** Determine if a redo log from before MariaDB 10.2.2 is clean.
+@return error code
+@retval DB_SUCCESS      if the redo log is clean
+@retval DB_CORRUPTION   if the redo log is corrupted
+@retval DB_ERROR        if the redo log is not empty or has no valid checkpoint */
+ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2()
+{
+  uint64_t max_no= 0;
+  byte *buf= log_sys.buf;
 
-		if (!log_crypt_101_read_checkpoint(buf)) {
-			ib::error() << "Decrypting checkpoint failed";
-			continue;
-		}
+  ut_ad(log_sys.log.format == 0);
 
-		DBUG_PRINT("ib_log",
-			   ("checkpoint " UINT64PF " at " LSN_PF " found",
-			    checkpoint_no,
-			    mach_read_from_8(buf + LOG_CHECKPOINT_LSN)));
+  if (!redo_file_sizes_are_correct())
+    return DB_CORRUPTION;
 
-		if (checkpoint_no >= max_no) {
-			found = true;
-			*max_field = field;
-			max_no = checkpoint_no;
+  /** Offset of the first checkpoint checksum */
+  constexpr uint CHECKSUM_1= 288;
+  /** Offset of the second checkpoint checksum */
+  constexpr uint CHECKSUM_2= CHECKSUM_1 + 4;
+  /** Offset of the checkpoint LSN field */
+  constexpr uint CHECKPOINT_LSN= 8;
+  /** Most significant bits of the checkpoint offset */
+  constexpr uint OFFS_HI= CHECKSUM_2 + 12;
+  /** Least significant bits of the checkpoint offset */
+  constexpr uint OFFS_LO= 16;
 
-			log_sys.log.set_lsn(mach_read_from_8(
-				buf + LOG_CHECKPOINT_LSN));
-			log_sys.log.set_lsn_offset(
-				lsn_t(mach_read_from_4(buf + OFFSET_HIGH32))
-				<< 32
-				| mach_read_from_4(buf + OFFSET_LOW32));
-		}
-	}
+  lsn_t lsn= 0;
 
-	if (found) {
-		return(DB_SUCCESS);
-	}
+  for (ulint field= LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
+       field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1)
+  {
+    log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+    if (static_cast<uint32_t>(ut_fold_binary(buf, CHECKSUM_1)) !=
+        mach_read_from_4(buf + CHECKSUM_1) ||
+        static_cast<uint32_t>(ut_fold_binary(buf + CHECKPOINT_LSN,
+                                             CHECKSUM_2 - CHECKPOINT_LSN)) !=
+        mach_read_from_4(buf + CHECKSUM_2))
+     {
+       DBUG_LOG("ib_log", "invalid pre-10.2.2 checkpoint " << field);
+       continue;
+     }
+
+    if (!log_crypt_101_read_checkpoint(buf))
+    {
+      ib::error() << "Decrypting checkpoint failed";
+      continue;
+    }
+
+    const uint64_t checkpoint_no= mach_read_from_8(buf);
+
+    DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found",
+                          checkpoint_no,
+                          mach_read_from_8(buf + CHECKPOINT_LSN)));
+
+    if (checkpoint_no >= max_no)
+    {
+      max_no= checkpoint_no;
+      lsn= mach_read_from_8(buf + CHECKPOINT_LSN);
+      log_sys.log.set_lsn(lsn);
+      log_sys.log.set_lsn_offset(lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 |
+                                 mach_read_from_4(buf + OFFS_LO));
+    }
+  }
+
+  if (!lsn)
+  {
+    ib::error() << "Upgrade after a crash is not supported."
+            " This redo log was created before MariaDB 10.2.2,"
+            " and we did not find a valid checkpoint."
+            " Please follow the instructions at"
+            " https://mariadb.com/kb/en/library/upgrading/";
+    return DB_ERROR;
+  }
 
-	ib::error() << "Upgrade after a crash is not supported."
-		" This redo log was created before MariaDB 10.2.2,"
-		" and we did not find a valid checkpoint."
-		" Please follow the instructions at"
-		" https://mariadb.com/kb/en/library/upgrading/";
-	return(DB_ERROR);
+  log_sys.set_lsn(lsn);
+  log_sys.set_flushed_lsn(lsn);
+  const lsn_t source_offset= log_sys.log.calc_lsn_offset_old(lsn);
+
+  static constexpr char NO_UPGRADE_RECOVERY_MSG[]=
+    "Upgrade after a crash is not supported."
+    " This redo log was created before MariaDB 10.2.2";
+
+  recv_sys.read(source_offset & ~511, {buf, 512});
+
+  if (log_block_calc_checksum_format_0(buf) != log_block_get_checksum(buf) &&
+      !log_crypt_101_read_block(buf, lsn))
+  {
+    ib::error() << NO_UPGRADE_RECOVERY_MSG << ", and it appears corrupted.";
+    return DB_CORRUPTION;
+  }
+
+  if (mach_read_from_2(buf + 4) == (source_offset & 511))
+  {
+    /* Mark the redo log for upgrading. */
+    srv_log_file_size= 0;
+    recv_sys.parse_start_lsn= recv_sys.recovered_lsn= recv_sys.scanned_lsn=
+      recv_sys.mlog_checkpoint_lsn = lsn;
+    log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn=
+      log_sys.write_lsn= log_sys.current_flush_lsn= lsn;
+    log_sys.next_checkpoint_no= 0;
+    return DB_SUCCESS;
+  }
+
+  if (buf[20 + 32 * 9] == 2)
+    ib::error() << "Cannot decrypt log for upgrading."
+                   " The encrypted log was created before MariaDB 10.2.2.";
+  else
+    ib::error() << NO_UPGRADE_RECOVERY_MSG << ".";
+
+  return DB_ERROR;
 }
 
-/** Determine if a pre-MySQL 5.7.9/MariaDB 10.2.2 redo log is clean.
-@param[in]	lsn	checkpoint LSN
-@param[in]	crypt	whether the log might be encrypted
-@return error code
+/** Calculate the offset of a log sequence number
+in an old redo log file (during upgrade check).
+@param[in]	lsn	log sequence number
+@return byte offset within the log */
+inline lsn_t log_t::file::calc_lsn_offset_old(lsn_t lsn) const
+{
+  const lsn_t size= capacity() * recv_sys.files_size();
+  lsn_t l= lsn - this->lsn;
+  if (longlong(l) < 0)
+  {
+    l= lsn_t(-longlong(l)) % size;
+    l= size - l;
+  }
+
+  l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size);
+  l%= size;
+  return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE));
+}
+
+/** Determine if a redo log from MariaDB 10.2.2+, 10.3, or 10.4 is clean.
+@return	error code
 @retval	DB_SUCCESS	if the redo log is clean
-@retval DB_ERROR	if the redo log is corrupted or dirty */
-static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt)
+@retval	DB_CORRUPTION	if the redo log is corrupted
+@retval	DB_ERROR	if the redo log is not empty or cannot be decrypted */
+static dberr_t recv_log_recover_10_4()
 {
-	log_mutex_enter();
-	const lsn_t	source_offset = log_sys.log.calc_lsn_offset(lsn);
-	log_mutex_exit();
-	const ulint	page_no = ulint(source_offset >> srv_page_size_shift);
+	const lsn_t	lsn = log_sys.log.get_lsn();
+	const lsn_t	source_offset =	log_sys.log.calc_lsn_offset_old(lsn);
 	byte*		buf = log_sys.buf;
 
-	static const char* NO_UPGRADE_RECOVERY_MSG =
-		"Upgrade after a crash is not supported."
-		" This redo log was created before MariaDB 10.2.2";
-
-	fil_io(IORequestLogRead, true,
-	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
-	       0,
-	       ulint((source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1))
-		     & (srv_page_size - 1)),
-	       OS_FILE_LOG_BLOCK_SIZE, buf, NULL);
-
-	if (log_block_calc_checksum_format_0(buf)
-	    != log_block_get_checksum(buf)
-	    && !log_crypt_101_read_block(buf)) {
-		ib::error() << NO_UPGRADE_RECOVERY_MSG
-			<< ", and it appears corrupted.";
-		return(DB_CORRUPTION);
+	if (!redo_file_sizes_are_correct()) {
+		return DB_CORRUPTION;
+	}
+
+	recv_sys.read(source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1),
+		      {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+	ulint crc = log_block_calc_checksum_crc32(buf);
+	ulint cksum = log_block_get_checksum(buf);
+
+	if (UNIV_UNLIKELY(crc != cksum)) {
+		ib::error() << "Invalid log block checksum."
+			    << " block: "
+			    << log_block_get_hdr_no(buf)
+			    << " checkpoint no: "
+			    << log_block_get_checkpoint_no(buf)
+			    << " expected: " << crc
+			    << " found: " << cksum;
+		return DB_CORRUPTION;
+	}
+
+	if (log_sys.log.is_encrypted()
+	    && !log_crypt(buf, lsn & ~511, 512, LOG_DECRYPT)) {
+		return DB_ERROR;
 	}
 
+	/* On a clean shutdown, the redo log will be logically empty
+	after the checkpoint lsn. */
+
 	if (log_block_get_data_len(buf)
-	    == (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) {
-	} else if (crypt) {
-		ib::error() << "Cannot decrypt log for upgrading."
-			" The encrypted log was created"
-			" before MariaDB 10.2.2.";
+	    != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) {
 		return DB_ERROR;
-	} else {
-		ib::error() << NO_UPGRADE_RECOVERY_MSG << ".";
-		return(DB_ERROR);
 	}
 
 	/* Mark the redo log for upgrading. */
@@ -1186,12 +1463,12 @@ static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt)
 	recv_sys.parse_start_lsn = recv_sys.recovered_lsn
 		= recv_sys.scanned_lsn
 		= recv_sys.mlog_checkpoint_lsn = lsn;
+	log_sys.set_lsn(lsn);
+	log_sys.set_flushed_lsn(lsn);
 	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn
-		= log_sys.lsn = log_sys.write_lsn
-		= log_sys.current_flush_lsn = log_sys.flushed_to_disk_lsn
-		= lsn;
+		= log_sys.write_lsn = log_sys.current_flush_lsn = lsn;
 	log_sys.next_checkpoint_no = 0;
-	return(DB_SUCCESS);
+	return DB_SUCCESS;
 }
 
 /** Find the latest checkpoint in the log header.
@@ -1210,7 +1487,7 @@ recv_find_max_checkpoint(ulint* max_field)
 
 	buf = log_sys.checkpoint_buf;
 
-	log_header_read(0);
+	log_sys.log.read(0, {buf, OS_FILE_LOG_BLOCK_SIZE});
 	/* Check the header page checksum. There was no
 	checksum in the first redo log format (version 0). */
 	log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT);
@@ -1231,13 +1508,15 @@ recv_find_max_checkpoint(ulint* max_field)
 
 	switch (log_sys.log.format) {
 	case log_t::FORMAT_3_23:
-		return(recv_find_max_checkpoint_0(max_field));
+		return recv_log_recover_pre_10_2();
 	case log_t::FORMAT_10_2:
 	case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED:
 	case log_t::FORMAT_10_3:
 	case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED:
 	case log_t::FORMAT_10_4:
 	case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED:
+	case log_t::FORMAT_10_5:
+	case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
 		break;
 	default:
 		ib::error() << "Unsupported redo log format."
@@ -1247,8 +1526,7 @@ recv_find_max_checkpoint(ulint* max_field)
 
 	for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
 	     field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
-
-		log_header_read(field);
+		log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE});
 
 		const ulint crc32 = log_block_calc_checksum_crc32(buf);
 		const ulint cksum = log_block_get_checksum(buf);
@@ -1291,7 +1569,7 @@ recv_find_max_checkpoint(ulint* max_field)
 
 	if (*max_field == 0) {
 		/* Before 10.2.2, we could get here during database
-		initialization if we created an ib_logfile0 file that
+		initialization if we created a LOG_FILE_NAME file that
 		was filled with zeroes, and were killed. After
 		10.2.2, we would reject such a file already earlier,
 		when checking the file header. */
@@ -1302,765 +1580,849 @@ recv_find_max_checkpoint(ulint* max_field)
 		return(DB_ERROR);
 	}
 
-	return(DB_SUCCESS);
-}
-
-/** Try to parse a single log record body and also applies it if
-specified.
-@param[in]	type		redo log entry type
-@param[in]	ptr		redo log record body
-@param[in]	end_ptr		end of buffer
-@param[in]	page_id		page identifier
-@param[in]	apply		whether to apply the record
-@param[in,out]	block		buffer block, or NULL if
-a page log record should not be applied
-or if it is a MLOG_FILE_ operation
-@param[in,out]	mtr		mini-transaction, or NULL if
-a page log record should not be applied
-@return log record end, NULL if not a complete record */
-static
-byte*
-recv_parse_or_apply_log_rec_body(
-	mlog_id_t	type,
-	byte*		ptr,
-	byte*		end_ptr,
-	const page_id_t	page_id,
-	bool		apply,
-	buf_block_t*	block,
-	mtr_t*		mtr)
-{
-	ut_ad(!block == !mtr);
-	ut_ad(!apply || recv_sys.mlog_checkpoint_lsn);
-
-	switch (type) {
-	case MLOG_FILE_NAME:
-	case MLOG_FILE_DELETE:
-	case MLOG_FILE_CREATE2:
-	case MLOG_FILE_RENAME2:
-		ut_ad(block == NULL);
-		/* Collect the file names when parsing the log,
-		before applying any log records. */
-		return fil_name_parse(ptr, end_ptr, page_id, type, apply);
-	case MLOG_INDEX_LOAD:
-		if (end_ptr < ptr + 8) {
-			return(NULL);
-		}
-		return(ptr + 8);
-	case MLOG_TRUNCATE:
-		ib::error() << "Cannot crash-upgrade from "
-			"old-style TRUNCATE TABLE";
-		recv_sys.found_corrupt_log = true;
-		return NULL;
-	default:
-		break;
-	}
-
-	dict_index_t*	index	= NULL;
-	page_t*		page;
-	page_zip_des_t*	page_zip;
-#ifdef UNIV_DEBUG
-	ulint		page_type;
-#endif /* UNIV_DEBUG */
-
-	if (block) {
-		/* Applying a page log record. */
-		ut_ad(apply);
-		page = block->frame;
-		page_zip = buf_block_get_page_zip(block);
-		ut_d(page_type = fil_page_get_type(page));
-	} else if (apply
-		   && !is_predefined_tablespace(page_id.space())
-		   && recv_spaces.find(page_id.space()) == recv_spaces.end()) {
-		if (recv_sys.recovered_lsn < recv_sys.mlog_checkpoint_lsn) {
-			/* We have not seen all records between the
-			checkpoint and MLOG_CHECKPOINT. There should be
-			a MLOG_FILE_DELETE for this tablespace later. */
-			recv_spaces.insert(
-				std::make_pair(page_id.space(),
-					       file_name_t("", false)));
-			goto parse_log;
-		}
-
-		ib::error() << "Missing MLOG_FILE_NAME or MLOG_FILE_DELETE"
-			" for redo log record " << type << page_id << " at "
-			    << recv_sys.recovered_lsn << ".";
-		recv_sys.found_corrupt_log = true;
-		return(NULL);
-	} else {
-parse_log:
-		/* Parsing a page log record. */
-		page = NULL;
-		page_zip = NULL;
-		ut_d(page_type = FIL_PAGE_TYPE_ALLOCATED);
-	}
-
-	const byte*	old_ptr = ptr;
-
-	switch (type) {
-#ifdef UNIV_LOG_LSN_DEBUG
-	case MLOG_LSN:
-		/* The LSN is checked in recv_parse_log_rec(). */
-		break;
-#endif /* UNIV_LOG_LSN_DEBUG */
-	case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES:
-	case MLOG_MEMSET:
-#ifdef UNIV_DEBUG
-		if (page && page_type == FIL_PAGE_TYPE_ALLOCATED
-		    && end_ptr >= ptr + 2) {
-			/* It is OK to set FIL_PAGE_TYPE and certain
-			list node fields on an empty page.  Any other
-			write is not OK. */
-
-			/* NOTE: There may be bogus assertion failures for
-			dict_hdr_create(), trx_rseg_header_create(),
-			trx_sys_create_doublewrite_buf(), and
-			trx_sysf_create().
-			These are only called during database creation. */
-			ulint	offs = mach_read_from_2(ptr);
-
-			switch (type) {
-			default:
-				ut_error;
-			case MLOG_2BYTES:
-				/* Note that this can fail when the
-				redo log been written with something
-				older than InnoDB Plugin 1.0.4. */
-				ut_ad(offs == FIL_PAGE_TYPE
-				      || srv_is_undo_tablespace(
-					      page_id.space())
-				      || offs == IBUF_TREE_SEG_HEADER
-				      + IBUF_HEADER + FSEG_HDR_OFFSET
-				      || offs == PAGE_BTR_IBUF_FREE_LIST
-				      + PAGE_HEADER + FIL_ADDR_BYTE
-				      || offs == PAGE_BTR_IBUF_FREE_LIST
-				      + PAGE_HEADER + FIL_ADDR_BYTE
-				      + FIL_ADDR_SIZE
-				      || offs == PAGE_BTR_SEG_LEAF
-				      + PAGE_HEADER + FSEG_HDR_OFFSET
-				      || offs == PAGE_BTR_SEG_TOP
-				      + PAGE_HEADER + FSEG_HDR_OFFSET
-				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
-				      + PAGE_HEADER + FIL_ADDR_BYTE
-				      + 0 /*FLST_PREV*/
-				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
-				      + PAGE_HEADER + FIL_ADDR_BYTE
-				      + FIL_ADDR_SIZE /*FLST_NEXT*/);
-				break;
-			case MLOG_4BYTES:
-				/* Note that this can fail when the
-				redo log been written with something
-				older than InnoDB Plugin 1.0.4. */
-				ut_ad(0
-				      /* fil_crypt_rotate_page() writes this */
-				      || offs == FIL_PAGE_SPACE_ID
-				      || srv_is_undo_tablespace(
-					      page_id.space())
-				      || offs == IBUF_TREE_SEG_HEADER
-				      + IBUF_HEADER + FSEG_HDR_SPACE
-				      || offs == IBUF_TREE_SEG_HEADER
-				      + IBUF_HEADER + FSEG_HDR_PAGE_NO
-				      || offs == PAGE_BTR_IBUF_FREE_LIST
-				      + PAGE_HEADER/* flst_init */
-				      || offs == PAGE_BTR_IBUF_FREE_LIST
-				      + PAGE_HEADER + FIL_ADDR_PAGE
-				      || offs == PAGE_BTR_IBUF_FREE_LIST
-				      + PAGE_HEADER + FIL_ADDR_PAGE
-				      + FIL_ADDR_SIZE
-				      || offs == PAGE_BTR_SEG_LEAF
-				      + PAGE_HEADER + FSEG_HDR_PAGE_NO
-				      || offs == PAGE_BTR_SEG_LEAF
-				      + PAGE_HEADER + FSEG_HDR_SPACE
-				      || offs == PAGE_BTR_SEG_TOP
-				      + PAGE_HEADER + FSEG_HDR_PAGE_NO
-				      || offs == PAGE_BTR_SEG_TOP
-				      + PAGE_HEADER + FSEG_HDR_SPACE
-				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
-				      + PAGE_HEADER + FIL_ADDR_PAGE
-				      + 0 /*FLST_PREV*/
-				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
-				      + PAGE_HEADER + FIL_ADDR_PAGE
-				      + FIL_ADDR_SIZE /*FLST_NEXT*/);
-				break;
-			}
-		}
-#endif /* UNIV_DEBUG */
-		ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip);
-		if (ptr != NULL && page != NULL
-		    && page_id.page_no() == 0 && type == MLOG_4BYTES) {
-			ulint	offs = mach_read_from_2(old_ptr);
-			switch (offs) {
-				fil_space_t*	space;
-				ulint		val;
-			default:
-				break;
-			case FSP_HEADER_OFFSET + FSP_SPACE_FLAGS:
-			case FSP_HEADER_OFFSET + FSP_SIZE:
-			case FSP_HEADER_OFFSET + FSP_FREE_LIMIT:
-			case FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN:
-				space = fil_space_get(page_id.space());
-				ut_a(space != NULL);
-				val = mach_read_from_4(page + offs);
-
-				switch (offs) {
-				case FSP_HEADER_OFFSET + FSP_SPACE_FLAGS:
-					space->flags = val;
-					break;
-				case FSP_HEADER_OFFSET + FSP_SIZE:
-					space->size_in_header = val;
-					break;
-				case FSP_HEADER_OFFSET + FSP_FREE_LIMIT:
-					space->free_limit = val;
-					break;
-				case FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN:
-					space->free_len = val;
-					ut_ad(val == flst_get_len(
-						      page + offs));
-					break;
-				}
-			}
-		}
-		break;
-	case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type == MLOG_COMP_REC_INSERT,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr,
-							block, index, mtr);
-		}
-		break;
-	case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type == MLOG_COMP_REC_CLUST_DELETE_MARK,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = btr_cur_parse_del_mark_set_clust_rec(
-				ptr, end_ptr, page, page_zip, index);
-		}
-		break;
-	case MLOG_REC_SEC_DELETE_MARK:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-		ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr,
-							 page, page_zip);
-		break;
-	case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type == MLOG_COMP_REC_UPDATE_IN_PLACE,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page,
-							    page_zip, index);
-		}
-		break;
-	case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE:
-	case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type == MLOG_COMP_LIST_END_DELETE
-				     || type == MLOG_COMP_LIST_START_DELETE,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = page_parse_delete_rec_list(type, ptr, end_ptr,
-							 block, index, mtr);
-		}
-		break;
-	case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type == MLOG_COMP_LIST_END_COPY_CREATED,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = page_parse_copy_rec_list_to_created_page(
-				ptr, end_ptr, block, index, mtr);
-		}
-		break;
-	case MLOG_PAGE_REORGANIZE:
-	case MLOG_COMP_PAGE_REORGANIZE:
-	case MLOG_ZIP_PAGE_REORGANIZE:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type != MLOG_PAGE_REORGANIZE,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = btr_parse_page_reorganize(
-				ptr, end_ptr, index,
-				type == MLOG_ZIP_PAGE_REORGANIZE,
-				block, mtr);
-		}
-		break;
-	case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE:
-		/* Allow anything in page_type when creating a page. */
-		ut_a(!page_zip);
-		page_parse_create(block, type == MLOG_COMP_PAGE_CREATE, false);
-		break;
-	case MLOG_PAGE_CREATE_RTREE: case MLOG_COMP_PAGE_CREATE_RTREE:
-		page_parse_create(block, type == MLOG_COMP_PAGE_CREATE_RTREE,
-				  true);
-		break;
-	case MLOG_UNDO_INSERT:
-		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
-		ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page);
-		break;
-	case MLOG_UNDO_ERASE_END:
-		if (page) {
-			ut_ad(page_type == FIL_PAGE_UNDO_LOG);
-			trx_undo_erase_page_end(page);
-		}
-		break;
-	case MLOG_UNDO_INIT:
-		/* Allow anything in page_type when creating a page. */
-		ptr = trx_undo_parse_page_init(ptr, end_ptr, page);
-		break;
-	case MLOG_UNDO_HDR_REUSE:
-		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
-		ptr = trx_undo_parse_page_header_reuse(ptr, end_ptr, page);
-		break;
-	case MLOG_UNDO_HDR_CREATE:
-		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
-		ptr = trx_undo_parse_page_header(ptr, end_ptr, page, mtr);
-		break;
-	case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-		/* On a compressed page, MLOG_COMP_REC_MIN_MARK
-		will be followed by MLOG_COMP_REC_DELETE
-		or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL)
-		in the same mini-transaction. */
-		ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip);
-		ptr = btr_parse_set_min_rec_mark(
-			ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK,
-			page, mtr);
-		break;
-	case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-
-		if (NULL != (ptr = mlog_parse_index(
-				     ptr, end_ptr,
-				     type == MLOG_COMP_REC_DELETE,
-				     &index))) {
-			ut_a(!page
-			     || (ibool)!!page_is_comp(page)
-			     == dict_table_is_comp(index->table));
-			ptr = page_cur_parse_delete_rec(ptr, end_ptr,
-							block, index, mtr);
-		}
-		break;
-	case MLOG_IBUF_BITMAP_INIT:
-		/* Allow anything in page_type when creating a page. */
-		if (block) ibuf_bitmap_init_apply(block);
-		break;
-	case MLOG_INIT_FILE_PAGE2:
-		/* Allow anything in page_type when creating a page. */
-		if (block) fsp_apply_init_file_page(block);
-		break;
-	case MLOG_INIT_FREE_PAGE:
-		/* The page can be zero-filled and its previous
-		contents can be ignored. We do not write or apply
-		this record yet. */
-		break;
-	case MLOG_WRITE_STRING:
-		ptr = mlog_parse_string(ptr, end_ptr, page, page_zip);
-		break;
-	case MLOG_ZIP_WRITE_NODE_PTR:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-		ptr = page_zip_parse_write_node_ptr(ptr, end_ptr,
-						    page, page_zip);
-		break;
-	case MLOG_ZIP_WRITE_BLOB_PTR:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-		ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr,
-						    page, page_zip);
-		break;
-	case MLOG_ZIP_WRITE_HEADER:
-		ut_ad(!page || fil_page_type_is_index(page_type));
-		ptr = page_zip_parse_write_header(ptr, end_ptr,
-						  page, page_zip);
-		break;
-	case MLOG_ZIP_PAGE_COMPRESS:
-		/* Allow anything in page_type when creating a page. */
-		ptr = page_zip_parse_compress(ptr, end_ptr, block);
-		break;
-	case MLOG_ZIP_PAGE_COMPRESS_NO_DATA:
-		if (NULL != (ptr = mlog_parse_index(
-				ptr, end_ptr, TRUE, &index))) {
-
-			ut_a(!page || ((ibool)!!page_is_comp(page)
-				== dict_table_is_comp(index->table)));
-			ptr = page_zip_parse_compress_no_data(
-				ptr, end_ptr, page, page_zip, index);
-		}
-		break;
-	case MLOG_ZIP_WRITE_TRX_ID:
-		/* This must be a clustered index leaf page. */
-		ut_ad(!page || page_type == FIL_PAGE_INDEX);
-		ptr = page_zip_parse_write_trx_id(ptr, end_ptr,
-						  page, page_zip);
-		break;
-	case MLOG_FILE_WRITE_CRYPT_DATA:
-		dberr_t err;
-		ptr = const_cast<byte*>(fil_parse_write_crypt_data(ptr, end_ptr, &err));
-
-		if (err != DB_SUCCESS) {
-			recv_sys.found_corrupt_log = TRUE;
-		}
+	switch (log_sys.log.format) {
+	case log_t::FORMAT_10_5:
+	case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
 		break;
 	default:
-		ptr = NULL;
-		ib::error() << "Incorrect log record type "
-			<< ib::hex(unsigned(type));
-
-		recv_sys.found_corrupt_log = true;
-	}
-
-	if (index) {
-		dict_table_t*	table = index->table;
-
-		dict_mem_index_free(index);
-		dict_mem_table_free(table);
+		if (dberr_t err = recv_log_recover_10_4()) {
+			ib::error()
+				<< "Upgrade after a crash is not supported."
+				" The redo log was created with " << creator
+				<< (err == DB_ERROR
+				    ? "." : ", and it appears corrupted.");
+			return err;
+		}
 	}
 
-	return(ptr);
+	return(DB_SUCCESS);
 }
 
-/*********************************************************************//**
-Calculates the fold value of a page file address: used in inserting or
-searching for a log record in the hash table.
-@return folded value */
-UNIV_INLINE
-ulint
-recv_fold(
-/*======*/
-	ulint	space,	/*!< in: space */
-	ulint	page_no)/*!< in: page number */
+/*******************************************************//**
+Calculates the new value for lsn when more data is added to the log. */
+static
+lsn_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+	lsn_t		lsn,	/*!< in: old lsn */
+	ib_uint64_t	len)	/*!< in: this many bytes of data is
+				added, log block headers not included */
 {
-	return(ut_fold_ulint_pair(space, page_no));
+	unsigned frag_len = static_cast<unsigned>(lsn % OS_FILE_LOG_BLOCK_SIZE)
+		- LOG_BLOCK_HDR_SIZE;
+	unsigned payload_size = log_sys.payload_size();
+	ut_ad(frag_len < payload_size);
+	lsn_t lsn_len = len;
+	lsn_len += (lsn_len + frag_len) / payload_size
+		* (OS_FILE_LOG_BLOCK_SIZE - payload_size);
+
+	return(lsn + lsn_len);
 }
 
-/*********************************************************************//**
-Calculates the hash value of a page file address: used in inserting or
-searching for a log record in the hash table.
-@return folded value */
-UNIV_INLINE
-ulint
-recv_hash(
-/*======*/
-	ulint	space,	/*!< in: space */
-	ulint	page_no)/*!< in: page number */
+/** Trim old log records for a page.
+@param start_lsn oldest log sequence number to preserve
+@return whether all the log for the page was trimmed */
+inline bool page_recv_t::trim(lsn_t start_lsn)
 {
-	return(hash_calc_hash(recv_fold(space, page_no), recv_sys.addr_hash));
+  while (log.head)
+  {
+    if (log.head->lsn >= start_lsn) return false;
+    last_offset= 1; /* the next record must not be same_page */
+    log_rec_t *next= log.head->next;
+    recv_sys.free(log.head);
+    log.head= next;
+  }
+  log.tail= nullptr;
+  return true;
 }
 
-/*********************************************************************//**
-Gets the hashed file address struct for a page.
-@return file address struct, NULL if not found from the hash table */
-static
-recv_addr_t*
-recv_get_fil_addr_struct(
-/*=====================*/
-	ulint	space,	/*!< in: space id */
-	ulint	page_no)/*!< in: page number */
-{
-	ut_ad(mutex_own(&recv_sys.mutex));
 
-	recv_addr_t*	recv_addr;
+inline void page_recv_t::recs_t::clear()
+{
+  ut_ad(mutex_own(&recv_sys.mutex));
+  for (const log_rec_t *l= head; l; )
+  {
+    const log_rec_t *next= l->next;
+    recv_sys.free(l);
+    l= next;
+  }
+  head= tail= nullptr;
+}
 
-	for (recv_addr = static_cast<recv_addr_t*>(
-			HASH_GET_FIRST(recv_sys.addr_hash,
-				       recv_hash(space, page_no)));
-	     recv_addr != 0;
-	     recv_addr = static_cast<recv_addr_t*>(
-		     HASH_GET_NEXT(addr_hash, recv_addr))) {
 
-		if (recv_addr->space == space
-		    && recv_addr->page_no == page_no) {
+/** Ignore any earlier redo log records for this page. */
+inline void page_recv_t::will_not_read()
+{
+  ut_ad(state == RECV_NOT_PROCESSED || state == RECV_WILL_NOT_READ);
+  state= RECV_WILL_NOT_READ;
+  log.clear();
+}
 
-			return(recv_addr);
-		}
-	}
 
-	return(NULL);
+/** Register a redo log snippet for a page.
+@param it       page iterator
+@param start_lsn start LSN of the mini-transaction
+@param lsn      @see mtr_t::commit_lsn()
+@param l        redo log snippet @see log_t::FORMAT_10_5
+@param len      length of l, in bytes */
+inline void recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+                            const byte *l, size_t len)
+{
+  ut_ad(mutex_own(&mutex));
+  page_id_t page_id = it->first;
+  page_recv_t &recs= it->second;
+
+  switch (*l & 0x70) {
+  case FREE_PAGE: case INIT_PAGE:
+    recs.will_not_read();
+    mlog_init.add(page_id, start_lsn); /* FIXME: remove this! */
+    /* fall through */
+  default:
+    log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last());
+    if (!tail)
+      break;
+    if (tail->start_lsn != start_lsn)
+      break;
+    ut_ad(tail->lsn == lsn);
+    buf_block_t *block= UT_LIST_GET_LAST(blocks);
+    ut_ad(block);
+    const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1;
+    ut_ad(used >= ALIGNMENT);
+    const byte *end= const_cast<const log_phys_t*>(tail)->end();
+    if (!((reinterpret_cast<size_t>(end + len) ^
+           reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1)))
+    {
+      /* Use already allocated 'padding' bytes */
+append:
+      MEM_MAKE_ADDRESSABLE(end + 1, len);
+      /* Append to the preceding record for the page */
+      tail->append(l, len);
+      return;
+    }
+    if (end <= &block->frame[used - ALIGNMENT] || &block->frame[used] >= end)
+      break; /* Not the last allocated record in the page */
+    const size_t new_used= static_cast<size_t>(end - block->frame + len + 1);
+    ut_ad(new_used > used);
+    if (new_used > srv_page_size)
+      break;
+    block->page.access_time= (block->page.access_time & ~0U << 16) |
+      ut_calc_align<uint16_t>(static_cast<uint16_t>(new_used), ALIGNMENT);
+    goto append;
+  }
+  recs.log.append(new (alloc(log_phys_t::alloc_size(len)))
+                  log_phys_t(start_lsn, lsn, l, len));
 }
 
-/** Store a redo log record for applying.
-@param type	record type
-@param space	tablespace identifier
-@param page_no	page number
-@param body	record body
-@param rec_end	end of record
-@param lsn	start LSN of the mini-transaction
-@param end_lsn	end LSN of the mini-transaction */
-inline void recv_sys_t::add(mlog_id_t type, ulint space, ulint page_no,
-			    byte* body, byte* rec_end, lsn_t lsn,
-			    lsn_t end_lsn)
+/** Store/remove the freed pages in file_name_t of recv_spaces.
+@param[in]	page_id		freed or init page_id
+@param[in]	freed		TRUE if page is freed */
+static void store_freed_or_init_rec(page_id_t page_id, bool freed)
 {
-	ut_ad(type != MLOG_FILE_DELETE);
-	ut_ad(type != MLOG_FILE_CREATE2);
-	ut_ad(type != MLOG_FILE_RENAME2);
-	ut_ad(type != MLOG_FILE_NAME);
-	ut_ad(type != MLOG_DUMMY_RECORD);
-	ut_ad(type != MLOG_CHECKPOINT);
-	ut_ad(type != MLOG_INDEX_LOAD);
-	ut_ad(type != MLOG_TRUNCATE);
-
-	recv_t* recv= static_cast<recv_t*>(mem_heap_alloc(heap, sizeof *recv));
-
-	recv->type = type;
-	recv->len = ulint(rec_end - body);
-	recv->start_lsn = lsn;
-	recv->end_lsn = end_lsn;
-
-	recv_addr_t* recv_addr = recv_get_fil_addr_struct(space, page_no);
-
-	if (recv_addr == NULL) {
-		recv_addr = static_cast<recv_addr_t*>(
-			mem_heap_alloc(heap, sizeof(recv_addr_t)));
-
-		recv_addr->space = space;
-		recv_addr->page_no = page_no;
-		recv_addr->state = RECV_NOT_PROCESSED;
-
-		UT_LIST_INIT(recv_addr->rec_list, &recv_t::rec_list);
-
-		HASH_INSERT(recv_addr_t, addr_hash, addr_hash,
-			    recv_fold(space, page_no), recv_addr);
-		n_addrs++;
-	}
-
-	switch (type) {
-	case MLOG_INIT_FILE_PAGE2:
-	case MLOG_ZIP_PAGE_COMPRESS:
-	case MLOG_INIT_FREE_PAGE:
-		/* Ignore any earlier redo log records for this page. */
-		ut_ad(recv_addr->state == RECV_NOT_PROCESSED
-		      || recv_addr->state == RECV_WILL_NOT_READ);
-		recv_addr->state = RECV_WILL_NOT_READ;
-		mlog_init.add(space, page_no, lsn);
-	default:
-		break;
-	}
-
-	UT_LIST_ADD_LAST(recv_addr->rec_list, recv);
-
-	recv_data_t** prev_field = &recv->data;
-
-	/* Store the log record body in chunks of less than srv_page_size:
-	heap grows into the buffer pool, and bigger chunks could not
-	be allocated */
-
-	while (rec_end > body) {
-		ulint rec_len = ulint(rec_end - body);
-
-		if (rec_len > RECV_DATA_BLOCK_SIZE) {
-			rec_len = RECV_DATA_BLOCK_SIZE;
-		}
-
-		recv_data_t* recv_data = static_cast<recv_data_t*>(
-			mem_heap_alloc(heap, sizeof(recv_data_t) + rec_len));
-
-		*prev_field = recv_data;
-
-		memcpy(recv_data + 1, body, rec_len);
-
-		prev_field = &recv_data->next;
+  uint32_t space_id= page_id.space();
+  uint32_t page_no= page_id.page_no();
+  if (is_predefined_tablespace(space_id))
+  {
+    if (!srv_immediate_scrub_data_uncompressed)
+      return;
+    fil_space_t *space;
+    if (space_id == TRX_SYS_SPACE)
+      space= fil_system.sys_space;
+    else
+      space= fil_space_get(space_id);
 
-		body += rec_len;
-	}
+    space->free_page(page_no, freed);
+    return;
+  }
 
-	*prev_field = NULL;
+  recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+  if (i != recv_spaces.end() && i->first == space_id)
+  {
+    if (freed)
+      i->second.add_freed_page(page_no);
+    else
+      i->second.remove_freed_page(page_no);
+  }
 }
 
-/*********************************************************************//**
-Copies the log record body from recv to buf. */
-static
-void
-recv_data_copy_to_buf(
-/*==================*/
-	byte*	buf,	/*!< in: buffer of length at least recv->len */
-	recv_t*	recv)	/*!< in: log record */
+/** Parse and register one mini-transaction in log_t::FORMAT_10_5.
+@param checkpoint_lsn  the log sequence number of the latest checkpoint
+@param store           whether to store the records
+@param apply           whether to apply file-level log records
+@return whether FILE_CHECKPOINT record was seen the first time,
+or corruption was noticed */
+bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply)
 {
-	recv_data_t*	recv_data;
-	ulint		part_len;
-	ulint		len;
+  mysql_mutex_assert_owner(&log_sys.mutex);
+  ut_ad(mutex_own(&mutex));
+  ut_ad(parse_start_lsn);
+  ut_ad(log_sys.is_physical());
 
-	len = recv->len;
-	recv_data = recv->data;
+  bool last_phase= (*store == STORE_IF_EXISTS);
+  const byte *const end= buf + len;
+loop:
+  const byte *const log= buf + recovered_offset;
+  const lsn_t start_lsn= recovered_lsn;
+  map::iterator cached_pages_it = pages.end();
+
+  /* Check that the entire mini-transaction is included within the buffer */
+  const byte *l;
+  uint32_t rlen;
+  for (l= log; l < end; l+= rlen)
+  {
+    if (!*l)
+      goto eom_found;
+    if (UNIV_LIKELY((*l & 0x70) != RESERVED));
+    else if (srv_force_recovery)
+      ib::warn() << "Ignoring unknown log record at LSN " << recovered_lsn;
+    else
+    {
+malformed:
+      ib::error() << "Malformed log record;"
+                     " set innodb_force_recovery=1 to ignore.";
+corrupted:
+      const size_t trailing_bytes= std::min<size_t>(100, size_t(end - l));
+      ib::info() << "Dump from the start of the mini-transaction (LSN="
+                 << start_lsn << ") to "
+                 << trailing_bytes << " bytes after the record:";
+      ut_print_buf(stderr, log, l - log + trailing_bytes);
+      putc('\n', stderr);
+      found_corrupt_log= true;
+      return true;
+    }
+    rlen= *l++ & 0xf;
+    if (l + (rlen ? rlen : 16) >= end)
+      break;
+    if (!rlen)
+    {
+      rlen= mlog_decode_varint_length(*l);
+      if (l + rlen >= end)
+        break;
+      const uint32_t addlen= mlog_decode_varint(l);
+      if (UNIV_UNLIKELY(addlen == MLOG_DECODE_ERROR))
+      {
+        ib::error() << "Corrupted record length";
+        goto corrupted;
+      }
+      rlen= addlen + 15;
+    }
+  }
 
-	while (len > 0) {
-		if (len > RECV_DATA_BLOCK_SIZE) {
-			part_len = RECV_DATA_BLOCK_SIZE;
-		} else {
-			part_len = len;
-		}
+  /* Not the entire mini-transaction was present. */
+  return false;
 
-		ut_memcpy(buf, ((byte*) recv_data) + sizeof(recv_data_t),
-			  part_len);
-		buf += part_len;
-		len -= part_len;
+eom_found:
+  ut_ad(!*l);
+  ut_d(const byte *const el= l + 1);
+
+  const lsn_t end_lsn= recv_calc_lsn_on_data_add(start_lsn, l + 1 - log);
+  if (UNIV_UNLIKELY(end_lsn > scanned_lsn))
+    /* The log record filled a log block, and we require that also the
+    next log block should have been scanned in */
+    return false;
+
+  ut_d(std::set<page_id_t> freed);
+#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */
+  /* Pages that have been modified in this mini-transaction.
+  If a mini-transaction writes INIT_PAGE for a page, it should not have
+  written any log records for the page. Unfortunately, this does not
+  hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress()
+  can be invoked in a pessimistic operation, even after log has
+  been written for other pages. */
+  ut_d(std::set<page_id_t> modified);
+#endif
 
-		recv_data = recv_data->next;
-	}
+  uint32_t space_id= 0, page_no= 0, last_offset= 0;
+  bool got_page_op= false;
+  for (l= log; l < end; l+= rlen)
+  {
+    const byte *const recs= l;
+    const byte b= *l++;
+
+    if (!b)
+      break;
+    ut_ad(UNIV_LIKELY(b & 0x70) != RESERVED || srv_force_recovery);
+    rlen= b & 0xf;
+    ut_ad(l + rlen < end);
+    ut_ad(rlen || l + 16 < end);
+    if (!rlen)
+    {
+      const uint32_t lenlen= mlog_decode_varint_length(*l);
+      ut_ad(l + lenlen < end);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+    ut_ad(l + rlen < end);
+    uint32_t idlen;
+    if ((b & 0x80) && got_page_op)
+    {
+      /* This record is for the same page as the previous one. */
+      if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE))
+      {
+record_corrupted:
+        /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */
+        if (!srv_force_recovery)
+          goto malformed;
+        ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn;
+        last_offset= 1; /* the next record must not be same_page  */
+        continue;
+      }
+      goto same_page;
+    }
+    last_offset= 0;
+    idlen= mlog_decode_varint_length(*l);
+    if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen))
+    {
+page_id_corrupted:
+      if (!srv_force_recovery)
+      {
+        ib::error() << "Corrupted page identifier at " << recovered_lsn
+                    << "; set innodb_force_recovery=1 to ignore the record.";
+        goto corrupted;
+      }
+      ib::warn() << "Ignoring corrupted page identifier at LSN "
+                 << recovered_lsn;
+      continue;
+    }
+    space_id= mlog_decode_varint(l);
+    if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR))
+      goto page_id_corrupted;
+    l+= idlen;
+    rlen-= idlen;
+    idlen= mlog_decode_varint_length(*l);
+    if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen))
+      goto page_id_corrupted;
+    page_no= mlog_decode_varint(l);
+    if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR))
+      goto page_id_corrupted;
+    l+= idlen;
+    rlen-= idlen;
+    got_page_op = !(b & 0x80);
+    if (got_page_op && apply && !is_predefined_tablespace(space_id))
+    {
+      recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+      if (i != recv_spaces.end() && i->first == space_id);
+      else if (recovered_lsn < mlog_checkpoint_lsn)
+        /* We have not seen all records between the checkpoint and
+        FILE_CHECKPOINT. There should be a FILE_DELETE for this
+        tablespace later. */
+        recv_spaces.emplace_hint(i, space_id, file_name_t("", false));
+      else
+      {
+        const page_id_t id(space_id, page_no);
+        if (!srv_force_recovery)
+        {
+          ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id
+                      << " at " << recovered_lsn
+                      << "; set innodb_force_recovery=1 to ignore the record.";
+          goto corrupted;
+        }
+        ib::warn() << "Ignoring record for " << id << " at " << recovered_lsn;
+        continue;
+      }
+    }
+same_page:
+    DBUG_PRINT("ib_log",
+               ("scan " LSN_PF ": rec %x len %zu page %u:%u",
+                recovered_lsn, b, static_cast<size_t>(l + rlen - recs),
+                space_id, page_no));
+
+    if (got_page_op)
+    {
+      const page_id_t id(space_id, page_no);
+      ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id));
+      ut_ad(freed.find(id) == freed.end());
+      switch (b & 0x70) {
+      case FREE_PAGE:
+        ut_ad(freed.emplace(id).second);
+        last_offset= 1; /* the next record must not be same_page  */
+        goto free_or_init_page;
+      case INIT_PAGE:
+        last_offset= FIL_PAGE_TYPE;
+      free_or_init_page:
+        store_freed_or_init_rec(id, (b & 0x70) == FREE_PAGE);
+        if (UNIV_UNLIKELY(rlen != 0))
+          goto record_corrupted;
+        break;
+      case EXTENDED:
+        if (UNIV_UNLIKELY(!rlen))
+          goto record_corrupted;
+        if (rlen == 1 && *l == TRIM_PAGES)
+        {
+#if 0 /* For now, we can only truncate an undo log tablespace */
+          if (UNIV_UNLIKELY(!space_id || !page_no))
+            goto record_corrupted;
+#else
+          if (!srv_is_undo_tablespace(space_id) ||
+              page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+            goto record_corrupted;
+          static_assert(UT_ARR_SIZE(truncated_undo_spaces) ==
+                        TRX_SYS_MAX_UNDO_SPACES, "compatibility");
+          truncated_undo_spaces[space_id - srv_undo_space_id_start]=
+            { recovered_lsn, page_no };
+#endif
+          last_offset= 1; /* the next record must not be same_page  */
+          continue;
+        }
+        last_offset= FIL_PAGE_TYPE;
+        break;
+      case RESERVED:
+      case OPTION:
+        continue;
+      case WRITE:
+      case MEMMOVE:
+      case MEMSET:
+        if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1))
+          goto record_corrupted;
+        const uint32_t olen= mlog_decode_varint_length(*l);
+        if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
+          goto record_corrupted;
+        const uint32_t offset= mlog_decode_varint(l);
+        ut_ad(offset != MLOG_DECODE_ERROR);
+        static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+        if (UNIV_UNLIKELY(offset >= srv_page_size))
+          goto record_corrupted;
+        last_offset+= offset;
+        if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size))
+          goto record_corrupted;
+        l+= olen;
+        rlen-= olen;
+        if ((b & 0x70) == WRITE)
+        {
+          if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size))
+            goto record_corrupted;
+          if (UNIV_UNLIKELY(!page_no) && apply)
+          {
+            const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE &&
+              last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4;
+            const bool has_flags= last_offset <=
+              FSP_HEADER_OFFSET + FSP_SPACE_FLAGS &&
+              last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + 4;
+            if (has_size || has_flags)
+            {
+              recv_spaces_t::iterator it= recv_spaces.find(space_id);
+              const uint32_t size= has_size
+                ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + l -
+                                   last_offset)
+                : 0;
+              const uint32_t flags= has_flags
+                ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + l -
+                                   last_offset)
+                : file_name_t::initial_flags;
+              if (it == recv_spaces.end())
+                ut_ad(!mlog_checkpoint_lsn || space_id == TRX_SYS_SPACE ||
+                      srv_is_undo_tablespace(space_id));
+              else if (!it->second.space)
+              {
+                if (has_size)
+                  it->second.size= size;
+                if (has_flags)
+                  it->second.flags= flags;
+              }
+              fil_space_set_recv_size_and_flags(space_id, size, flags);
+            }
+          }
+          last_offset+= rlen;
+          break;
+        }
+        uint32_t llen= mlog_decode_varint_length(*l);
+        if (UNIV_UNLIKELY(llen > rlen || llen > 3))
+          goto record_corrupted;
+        const uint32_t len= mlog_decode_varint(l);
+        ut_ad(len != MLOG_DECODE_ERROR);
+        if (UNIV_UNLIKELY(last_offset + len > srv_page_size))
+          goto record_corrupted;
+        l+= llen;
+        rlen-= llen;
+        llen= len;
+        if ((b & 0x70) == MEMSET)
+        {
+          if (UNIV_UNLIKELY(rlen > llen))
+            goto record_corrupted;
+          last_offset+= llen;
+          break;
+        }
+        const uint32_t slen= mlog_decode_varint_length(*l); /* MEMMOVE only */
+        if (UNIV_UNLIKELY(slen != rlen || slen > 3)) /* must end the record */
+          goto record_corrupted;
+        uint32_t s= mlog_decode_varint(l); /* encoded source displacement */
+        ut_ad(s != MLOG_DECODE_ERROR); /* check the decoded value, not slen */
+        if (s & 1) /* lowest bit set: source is before last_offset */
+          s= last_offset - (s >> 1) - 1;
+        else /* lowest bit clear: source is after last_offset */
+          s= last_offset + (s >> 1) + 1;
+        if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size))
+          goto record_corrupted;
+        last_offset+= llen;
+        break;
+      }
+#if 0 && defined UNIV_DEBUG
+      switch (b & 0x70) {
+      case RESERVED:
+      case OPTION:
+        ut_ad(0); /* we did "continue" earlier */
+        break;
+      case FREE_PAGE:
+        break;
+      default:
+        ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE);
+      }
+#endif
+      const bool is_init= (b & 0x70) <= INIT_PAGE;
+      switch (*store) {
+      case STORE_IF_EXISTS:
+        if (fil_space_t *space= fil_space_t::get(space_id))
+        {
+          const auto size= space->get_size();
+          space->release();
+          if (!size)
+            continue;
+        }
+        else
+          continue;
+        /* fall through */
+      case STORE_YES:
+        if (!mlog_init.will_avoid_read(id, start_lsn))
+        {
+          if (cached_pages_it == pages.end() || cached_pages_it->first != id)
+            cached_pages_it= pages.emplace(id, page_recv_t()).first;
+          add(cached_pages_it, start_lsn, end_lsn, recs,
+              static_cast<size_t>(l + rlen - recs));
+        }
+        continue;
+      case STORE_NO:
+        if (!is_init)
+          continue;
+        mlog_init.add(id, start_lsn);
+        map::iterator i= pages.find(id);
+        if (i == pages.end())
+          continue;
+        i->second.log.clear();
+        pages.erase(i);
+      }
+    }
+    else if (rlen)
+    {
+      switch (b & 0xf0) {
+      case FILE_CHECKPOINT:
+        if (space_id == 0 && page_no == 0 && rlen == 8)
+        {
+          const lsn_t lsn= mach_read_from_8(l);
+
+          if (UNIV_UNLIKELY(srv_print_verbose_log == 2))
+            fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n",
+                    lsn, lsn != checkpoint_lsn
+                    ? "ignored"
+                    : mlog_checkpoint_lsn ? "reread" : "read",
+                    recovered_lsn);
+
+          DBUG_PRINT("ib_log", ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF,
+                                lsn, lsn != checkpoint_lsn
+                                ? "ignored"
+                                : mlog_checkpoint_lsn ? "reread" : "read",
+                                recovered_lsn));
+
+          if (lsn == checkpoint_lsn)
+          {
+            /* There can be multiple FILE_CHECKPOINT for the same LSN. */
+            if (mlog_checkpoint_lsn)
+              continue;
+            mlog_checkpoint_lsn= recovered_lsn;
+            l+= 8;
+            recovered_offset= l - buf;
+            return true;
+          }
+          continue;
+        }
+        /* fall through */
+      default:
+        if (!srv_force_recovery)
+          goto malformed;
+        ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn;
+        continue;
+      case FILE_DELETE:
+      case FILE_MODIFY:
+      case FILE_RENAME:
+        if (UNIV_UNLIKELY(page_no != 0))
+        {
+        file_rec_error:
+          if (!srv_force_recovery)
+          {
+            ib::error() << "Corrupted file-level record;"
+                           " set innodb_force_recovery=1 to ignore.";
+            goto corrupted;
+          }
+
+          ib::warn() << "Ignoring corrupted file-level record at LSN "
+                     << recovered_lsn;
+          continue;
+        }
+        /* fall through */
+      case FILE_CREATE:
+        if (UNIV_UNLIKELY(!space_id || page_no))
+          goto file_rec_error;
+        /* There is no terminating NUL character. Names must end in .ibd.
+        For FILE_RENAME, there is a NUL between the two file names. */
+        const char * const fn= reinterpret_cast<const char*>(l);
+        const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen));
+
+        if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME)))
+          goto file_rec_error;
+
+        const char * const fnend= fn2 ? fn2 : fn + rlen;
+        const char * const fn2end= fn2 ? fn + rlen : nullptr;
+
+        if (fn2)
+        {
+          fn2++;
+          if (memchr(fn2, 0, fn2end - fn2))
+            goto file_rec_error;
+          if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4))
+            goto file_rec_error;
+        }
+
+        if (is_predefined_tablespace(space_id))
+          goto file_rec_error;
+        if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
+          goto file_rec_error;
+
+        const char saved_end= fn[rlen];
+        const_cast<char&>(fn[rlen])= '\0';
+        fil_name_process(const_cast<char*>(fn), fnend - fn, space_id,
+                         (b & 0xf0) == FILE_DELETE);
+        if (fn2)
+          fil_name_process(const_cast<char*>(fn2), fn2end - fn2, space_id,
+                           false);
+        if ((b & 0xf0) < FILE_MODIFY && log_file_op)
+          log_file_op(space_id, (b & 0xf0) == FILE_CREATE,
+                      l, static_cast<ulint>(fnend - fn),
+                      reinterpret_cast<const byte*>(fn2),
+                      fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
+        const_cast<char&>(fn[rlen])= saved_end;
+
+        if (fn2 && apply)
+        {
+          const size_t len= fn2end - fn2;
+          auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
+          if (!r.second)
+            r.first->second= std::string{fn2, len};
+        }
+        if (UNIV_UNLIKELY(found_corrupt_fs))
+          return true;
+      }
+    }
+    else
+      goto malformed;
+  }
+
+  ut_ad(l == el);
+  recovered_offset= l - buf;
+  recovered_lsn= end_lsn;
+  if (is_memory_exhausted(store) && last_phase)
+    return false;
+  goto loop;
 }
 
 /** Apply the hashed log records to the page, if the page lsn is less than the
 lsn of a log record.
 @param[in,out]	block		buffer pool page
 @param[in,out]	mtr		mini-transaction
-@param[in,out]	recv_addr	recovery address
+@param[in,out]	p		recovery address
+@param[in,out]	space		tablespace, or NULL if not looked up yet
 @param[in,out]	init		page initialization operation, or NULL */
 static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
-			      recv_addr_t* recv_addr,
+			      const recv_sys_t::map::iterator& p,
+			      fil_space_t* space = NULL,
 			      mlog_init_t::init* init = NULL)
 {
-	page_t*		page;
-	page_zip_des_t*	page_zip;
-
 	ut_ad(mutex_own(&recv_sys.mutex));
 	ut_ad(recv_sys.apply_log_recs);
 	ut_ad(recv_needed_recovery);
-	ut_ad(recv_addr->state != RECV_BEING_PROCESSED);
-	ut_ad(recv_addr->state != RECV_PROCESSED);
 	ut_ad(!init || init->created);
 	ut_ad(!init || init->lsn);
+	ut_ad(block->page.id() == p->first);
+	ut_ad(!p->second.is_being_processed());
+	ut_ad(!space || space->id == block->page.id().space());
+	ut_ad(log_sys.is_physical());
 
 	if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
-		fprintf(stderr, "Applying log to page %u:%u\n",
-			recv_addr->space, recv_addr->page_no);
+		ib::info() << "Applying log to page " << block->page.id();
 	}
 
-	DBUG_LOG("ib_log", "Applying log to page " << block->page.id);
+	DBUG_PRINT("ib_log", ("Applying log to page %u:%u",
+			      block->page.id().space(),
+			      block->page.id().page_no()));
 
-	recv_addr->state = RECV_BEING_PROCESSED;
-	mutex_exit(&recv_sys.mutex);
-
-	page = block->frame;
-	page_zip = buf_block_get_page_zip(block);
+	p->second.state = page_recv_t::RECV_BEING_PROCESSED;
 
-	/* The page may have been modified in the buffer pool.
-	FIL_PAGE_LSN would only be updated right before flushing. */
-	lsn_t page_lsn = buf_page_get_newest_modification(&block->page);
-	if (!page_lsn) {
-		page_lsn = mach_read_from_8(page + FIL_PAGE_LSN);
-	}
+	mutex_exit(&recv_sys.mutex);
 
+	byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
+		? block->page.zip.data
+		: block->frame;
+	const lsn_t page_lsn = init
+		? 0
+		: mach_read_from_8(frame + FIL_PAGE_LSN);
 	bool free_page = false;
 	lsn_t start_lsn = 0, end_lsn = 0;
+	ut_d(lsn_t recv_start_lsn = 0);
 	const lsn_t init_lsn = init ? init->lsn : 0;
 
-	for (recv_t* recv = UT_LIST_GET_FIRST(recv_addr->rec_list);
-	     recv; recv = UT_LIST_GET_NEXT(rec_list, recv)) {
-		ut_ad(recv->start_lsn);
-		end_lsn = recv->end_lsn;
-		ut_ad(end_lsn <= log_sys.log.scanned_lsn);
-
-		if (recv->start_lsn < page_lsn) {
-			/* Ignore this record, because there are later changes
-			for this page. */
-			DBUG_LOG("ib_log", "apply skip "
-				 << get_mlog_string(recv->type)
-				 << " LSN " << recv->start_lsn << " < "
-				 << page_lsn);
-		} else if (recv->start_lsn < init_lsn) {
-			DBUG_LOG("ib_log", "init skip "
-				 << get_mlog_string(recv->type)
-				 << " LSN " << recv->start_lsn << " < "
-				 << init_lsn);
-		} else {
-			if (recv->type == MLOG_INIT_FREE_PAGE) {
-				/* This does not really modify the page. */
-				free_page = true;
-			} else if (!start_lsn) {
-				start_lsn = recv->start_lsn;
-			}
+	bool skipped_after_init = false;
+
+	for (const log_rec_t* recv : p->second.log) {
+		const log_phys_t* l = static_cast<const log_phys_t*>(recv);
+		ut_ad(l->lsn);
+		ut_ad(end_lsn <= l->lsn);
+		ut_ad(l->lsn <= log_sys.log.scanned_lsn);
+
+		ut_ad(l->start_lsn);
+		ut_ad(recv_start_lsn <= l->start_lsn);
+		ut_d(recv_start_lsn = l->start_lsn);
+
+		if (l->start_lsn < page_lsn) {
+			/* This record has already been applied. */
+			DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF
+					      " < " LSN_PF,
+					      block->page.id().space(),
+					      block->page.id().page_no(),
+					      l->start_lsn, page_lsn));
+			skipped_after_init = true;
+			end_lsn = l->lsn;
+			continue;
+		}
 
-			if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
-				fprintf(stderr, "apply " LSN_PF ":"
-					" %d len " ULINTPF " page %u:%u\n",
-					recv->start_lsn, recv->type, recv->len,
-					recv_addr->space, recv_addr->page_no);
-			}
+		if (l->start_lsn < init_lsn) {
+			DBUG_PRINT("ib_log", ("init skip %u:%u LSN " LSN_PF
+					      " < " LSN_PF,
+					      block->page.id().space(),
+					      block->page.id().page_no(),
+					      l->start_lsn, init_lsn));
+			skipped_after_init = false;
+			end_lsn = l->lsn;
+			continue;
+		}
 
-			DBUG_LOG("ib_log", "apply " << recv->start_lsn << ": "
-				 << get_mlog_string(recv->type)
-				 << " len " << recv->len
-				 << " page " << block->page.id);
+		/* There is no need to check LSN for just initialized pages. */
+		if (skipped_after_init) {
+			skipped_after_init = false;
+			ut_ad(end_lsn == page_lsn);
+			if (end_lsn != page_lsn)
+				ib::warn()
+					<< "The last skipped log record LSN "
+					<< end_lsn
+					<< " is not equal to page LSN "
+					<< page_lsn;
+		}
 
-			byte* buf;
+		end_lsn = l->lsn;
 
-			if (recv->len > RECV_DATA_BLOCK_SIZE) {
-				/* We have to copy the record body to
-				a separate buffer */
-				buf = static_cast<byte*>
-					(ut_malloc_nokey(recv->len));
-				recv_data_copy_to_buf(buf, recv);
-			} else {
-				buf = reinterpret_cast<byte*>(recv->data)
-					+ sizeof *recv->data;
-			}
+		if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+			ib::info() << "apply " << l->start_lsn
+				   << ": " << block->page.id();
+		}
 
-			recv_parse_or_apply_log_rec_body(
-				recv->type, buf, buf + recv->len,
-				block->page.id, true, block, &mtr);
+		DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u",
+				      l->start_lsn,
+				      block->page.id().space(),
+				      block->page.id().page_no()));
 
-			end_lsn = recv->start_lsn + recv->len;
-			mach_write_to_8(FIL_PAGE_LSN + page, end_lsn);
-			mach_write_to_8(srv_page_size
-					- FIL_PAGE_END_LSN_OLD_CHKSUM
-					+ page, end_lsn);
+		log_phys_t::apply_status a= l->apply(*block,
+						     p->second.last_offset);
+
+		switch (a) {
+		case log_phys_t::APPLIED_NO:
+			ut_ad(!mtr.has_modifications());
+			free_page = true;
+			start_lsn = 0;
+			continue;
+		case log_phys_t::APPLIED_YES:
+			goto set_start_lsn;
+		case log_phys_t::APPLIED_TO_FSP_HEADER:
+		case log_phys_t::APPLIED_TO_ENCRYPTION:
+			break;
+		}
 
-			if (page_zip) {
-				mach_write_to_8(FIL_PAGE_LSN + page_zip->data,
-						end_lsn);
+		if (fil_space_t* s = space
+		    ? space
+		    : fil_space_t::get(block->page.id().space())) {
+			switch (a) {
+			case log_phys_t::APPLIED_TO_FSP_HEADER:
+				s->flags = mach_read_from_4(
+					FSP_HEADER_OFFSET
+					+ FSP_SPACE_FLAGS + frame);
+				s->size_in_header = mach_read_from_4(
+					FSP_HEADER_OFFSET + FSP_SIZE
+					+ frame);
+				s->free_limit = mach_read_from_4(
+					FSP_HEADER_OFFSET
+					+ FSP_FREE_LIMIT + frame);
+				s->free_len = mach_read_from_4(
+					FSP_HEADER_OFFSET + FSP_FREE
+					+ FLST_LEN + frame);
+				break;
+			default:
+				byte* b= frame
+					+ fsp_header_get_encryption_offset(
+						block->zip_size())
+					+ FSP_HEADER_OFFSET;
+				if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) {
+					break;
+				}
+				b += MAGIC_SZ;
+				if (*b != CRYPT_SCHEME_UNENCRYPTED
+				    && *b != CRYPT_SCHEME_1) {
+					break;
+				}
+				if (b[1] != MY_AES_BLOCK_SIZE) {
+					break;
+				}
+				if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4]
+				    > FIL_ENCRYPTION_OFF) {
+					break;
+				}
+				fil_crypt_parse(s, b);
 			}
 
-			if (recv->len > RECV_DATA_BLOCK_SIZE) {
-				ut_free(buf);
+			if (!space) {
+				s->release();
 			}
 		}
-	}
 
-#ifdef UNIV_ZIP_DEBUG
-	ut_ad(!fil_page_index_page_check(page)
-	      || !page_zip
-	      || page_zip_validate_low(page_zip, page, NULL, FALSE));
-#endif /* UNIV_ZIP_DEBUG */
+set_start_lsn:
+		if (recv_sys.found_corrupt_log && !srv_force_recovery) {
+			break;
+		}
+
+		if (!start_lsn) {
+			start_lsn = l->start_lsn;
+		}
+	}
 
 	if (start_lsn) {
-		log_flush_order_mutex_enter();
-		buf_flush_note_modification(block, start_lsn, end_lsn, NULL);
-		log_flush_order_mutex_exit();
+		ut_ad(end_lsn >= start_lsn);
+		mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn);
+		if (UNIV_LIKELY(frame == block->frame)) {
+			mach_write_to_8(srv_page_size
+					- FIL_PAGE_END_LSN_OLD_CHKSUM
+					+ frame, end_lsn);
+		} else {
+			buf_zip_decompress(block, false);
+		}
+
+		buf_block_modify_clock_inc(block);
+		mysql_mutex_lock(&log_sys.flush_order_mutex);
+		buf_flush_note_modification(block, start_lsn, end_lsn);
+		mysql_mutex_unlock(&log_sys.flush_order_mutex);
 	} else if (free_page && init) {
-		/* There have been no operations than MLOG_INIT_FREE_PAGE.
+		/* There have been no operations that modify the page.
 		Any buffered changes must not be merged. A subsequent
 		buf_page_create() from a user thread should discard
 		any buffered changes. */
 		init->created = false;
 		ut_ad(!mtr.has_modifications());
+		block->page.status = buf_page_t::FREED;
 	}
 
 	/* Make sure that committing mtr does not change the modification
@@ -2077,56 +2439,42 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
 		recv_max_page_lsn = page_lsn;
 	}
 
-	ut_ad(recv_addr->state == RECV_BEING_PROCESSED);
-	recv_addr->state = RECV_PROCESSED;
+	ut_ad(p->second.is_being_processed());
+	ut_ad(!recv_sys.pages.empty());
 
-	ut_a(recv_sys.n_addrs > 0);
-	if (ulint n = --recv_sys.n_addrs) {
-		if (recv_sys.report(now)) {
-			ib::info() << "To recover: " << n << " pages from log";
-			service_manager_extend_timeout(
-				INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
-		}
+	if (recv_sys.report(now)) {
+		const ulint n = recv_sys.pages.size();
+		ib::info() << "To recover: " << n << " pages from log";
+		service_manager_extend_timeout(
+			INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
 	}
 }
 
-/** Reduces recv_sys.n_addrs for the corrupted page.
-This function should called when srv_force_recovery > 0.
-@param[in]	page_id	page id of the corrupted page */
-void recv_recover_corrupt_page(page_id_t page_id)
+/** Remove the buffered log records of a corrupted page, if there are any.
+This function should only be called when innodb_force_recovery is set.
+@param page_id  corrupted page identifier; looked up in recv_sys_t::pages */
+ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id)
 {
-	ut_ad(srv_force_recovery);
-	mutex_enter(&recv_sys.mutex);
-
-	if (!recv_sys.apply_log_recs) {
-	} else if (recv_addr_t* recv_addr = recv_get_fil_addr_struct(
-			   page_id.space(), page_id.page_no())) {
-		switch (recv_addr->state) {
-		case RECV_WILL_NOT_READ:
-			ut_ad(!"wrong state");
-			break;
-		case RECV_BEING_PROCESSED:
-		case RECV_PROCESSED:
-			break;
-		default:
-			recv_addr->state = RECV_PROCESSED;
-			ut_ad(recv_sys.n_addrs);
-			recv_sys.n_addrs--;
-		}
-	}
-
-	mutex_exit(&recv_sys.mutex);
+  mutex_enter(&mutex); /* held for the whole lookup and removal */
+  map::iterator p= pages.find(page_id);
+  if (p != pages.end()) /* nothing to do if the page has no entry */
+  {
+    p->second.log.clear(); /* empty the record list, then drop the entry */
+    pages.erase(p);
+  }
+  mutex_exit(&mutex);
 }
 
 /** Apply any buffered redo log to a page that was just read from a data file.
+@param[in,out]	space	tablespace
 @param[in,out]	bpage	buffer pool page */
-void recv_recover_page(buf_page_t* bpage)
+void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
 {
 	mtr_t mtr;
 	mtr.start();
-	mtr.set_log_mode(MTR_LOG_NONE);
+	mtr.set_log_mode(MTR_LOG_NO_REDO);
 
-	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
 	buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
 
 	/* Move the ownership of the x-latch on the page to
@@ -2134,22 +2482,18 @@ void recv_recover_page(buf_page_t* bpage)
 	x-latch on it.  This is needed for the operations to
 	the page to pass the debug checks. */
 	rw_lock_x_lock_move_ownership(&block->lock);
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-	ibool	success = buf_page_get_known_nowait(
-		RW_X_LATCH, block, BUF_KEEP_OLD,
-		__FILE__, __LINE__, &mtr);
-	ut_a(success);
+	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+	rw_lock_x_lock(&block->lock);
+	mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
 
 	mutex_enter(&recv_sys.mutex);
-	if (!recv_sys.apply_log_recs) {
-	} else if (recv_addr_t* recv_addr = recv_get_fil_addr_struct(
-			   bpage->id.space(), bpage->id.page_no())) {
-		switch (recv_addr->state) {
-		case RECV_BEING_PROCESSED:
-		case RECV_PROCESSED:
-			break;
-		default:
-			recv_recover_page(block, mtr, recv_addr);
+	if (recv_sys.apply_log_recs) {
+		recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id());
+		if (p != recv_sys.pages.end()
+		    && !p->second.is_being_processed()) {
+			recv_recover_page(block, mtr, p, space);
+			p->second.log.clear();
+			recv_sys.pages.erase(p);
 			goto func_exit;
 		}
 	}
@@ -2160,894 +2504,347 @@ func_exit:
 	ut_ad(mtr.has_committed());
 }
 
-/** Reads in pages which have hashed log records, from an area around a given
-page number.
-@param[in]	page_id	page id */
-static void recv_read_in_area(const page_id_t page_id)
+/** Read pages for which log needs to be applied (at most 32 per call).
+@param page_id	first page identifier to read; must be equal to i->first
+@param i        iterator to recv_sys.pages, positioned at page_id */
+static void recv_read_in_area(page_id_t page_id, recv_sys_t::map::iterator i)
 {
-	ulint	page_nos[RECV_READ_AHEAD_AREA];
-	ulint	page_no = page_id.page_no()
-		- (page_id.page_no() % RECV_READ_AHEAD_AREA);
-	ulint*	p = page_nos;
-
-	for (const ulint up_limit = page_no + RECV_READ_AHEAD_AREA;
-	     page_no < up_limit; page_no++) {
-		recv_addr_t* recv_addr = recv_get_fil_addr_struct(
-			page_id.space(), page_no);
-		if (recv_addr
-		    && recv_addr->state == RECV_NOT_PROCESSED
-		    && !buf_page_peek(page_id_t(page_id.space(), page_no))) {
-			recv_addr->state = RECV_BEING_READ;
-			*p++ = page_no;
-		}
-	}
-
-	mutex_exit(&recv_sys.mutex);
-	buf_read_recv_pages(FALSE, page_id.space(), page_nos,
-			    ulint(p - page_nos));
-	mutex_enter(&recv_sys.mutex);
-}
-
-/** This is another low level function for the recovery system
-to create a page which has buffered page intialization redo log records.
-@param[in]	page_id		page to be created using redo logs
-@param[in,out]	recv_addr	Hashed redo logs for the given page id
-@return whether the page creation successfully */
-static buf_block_t* recv_recovery_create_page_low(const page_id_t page_id,
-                                                  recv_addr_t* recv_addr)
-{
-  mtr_t mtr;
-  mlog_init_t::init &i= mlog_init.last(page_id);
-  const lsn_t end_lsn= UT_LIST_GET_LAST(recv_addr->rec_list)->end_lsn;
-
-  if (end_lsn < i.lsn)
-  {
-    DBUG_LOG("ib_log", "skip log for page "
-             << page_id
-             << " LSN " << end_lsn
-             << " < " << i.lsn);
-    recv_addr->state= RECV_PROCESSED;
-ignore:
-    ut_a(recv_sys.n_addrs);
-    recv_sys.n_addrs--;
-    return NULL;
-  }
+  uint32_t page_nos[32]; /* page numbers collected for one read request */
+  ut_ad(page_id == i->first);
+  page_id.set_page_no(ut_2pow_round(page_id.page_no(), 32U));
+  const page_id_t up_limit{page_id + 31}; /* 31 pages past the rounded id */
+  uint32_t* p= page_nos;
 
-  fil_space_t *space= fil_space_acquire_for_io(recv_addr->space);
-  if (!space)
+  for (; i != recv_sys.pages.end() && i->first <= up_limit; i++)
   {
-    recv_addr->state= RECV_PROCESSED;
-    goto ignore;
-  }
-
-  if (space->enable_lsn)
-  {
-init_fail:
-    space->release_for_io();
-    recv_addr->state= RECV_NOT_PROCESSED;
-    return NULL;
+    if (i->second.state == page_recv_t::RECV_NOT_PROCESSED)
+    {
+      i->second.state= page_recv_t::RECV_BEING_READ; /* avoid re-requesting */
+      *p++= i->first.page_no();
+    }
   }
 
-  /* Determine if a tablespace could be for an internal table
-  for FULLTEXT INDEX. For those tables, no MLOG_INDEX_LOAD record
-  used to be written when redo logging was disabled. Hence, we
-  cannot optimize away page reads, because all the redo
-  log records for initializing and modifying the page in the
-  past could be older than the page in the data file.
-
-  The check is too broad, causing all
-  tables whose names start with FTS_ to skip the optimization. */
-
-  if (strstr(space->name, "/FTS_"))
-    goto init_fail;
-
-  mtr.start();
-  mtr.set_log_mode(MTR_LOG_NONE);
-  buf_block_t *block= buf_page_create(page_id, space->zip_size(), &mtr);
-  if (recv_addr->state == RECV_PROCESSED)
-    /* The page happened to exist in the buffer pool, or it was
-    just being read in. Before buf_page_get_with_no_latch() returned,
-    all changes must have been applied to the page already. */
-    mtr.commit();
-  else
+  if (p != page_nos) /* at least one page was collected */
   {
-    i.created= true;
-    buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-    recv_recover_page(block, mtr, recv_addr, &i);
-    ut_ad(mtr.has_committed());
+    mutex_exit(&recv_sys.mutex); /* released while requesting the reads */
+    buf_read_recv_pages(page_id.space(), page_nos, ulint(p - page_nos));
+    mutex_enter(&recv_sys.mutex); /* held again on return */
   }
-
-  space->release_for_io();
-  return block;
 }
 
-/** This is a low level function for the recovery system
-to create a page which has buffered intialized redo log records.
-@param[in]      page_id page to be created using redo logs
-@return whether the page creation successfully */
-buf_block_t* recv_recovery_create_page_low(const page_id_t page_id)
+/** Attempt to initialize a page based on redo log records.
+@param page_id  page identifier
+@param p        iterator pointing to page_id
+@param mtr      mini-transaction
+@param b        pre-allocated buffer pool block
+@return whether the page was successfully initialized */
+inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
+                                            map::iterator &p, mtr_t &mtr,
+                                            buf_block_t *b)
 {
+  ut_ad(mutex_own(&mutex));
+  ut_ad(p->first == page_id);
+  page_recv_t &recs= p->second;
+  ut_ad(recs.state == page_recv_t::RECV_WILL_NOT_READ);
   buf_block_t* block= nullptr;
-  mutex_enter(&recv_sys.mutex);
-  recv_addr_t* recv_addr= recv_get_fil_addr_struct(page_id.space(),
-                                                   page_id.page_no());
-  if (recv_addr && recv_addr->state == RECV_WILL_NOT_READ)
-    block= recv_recovery_create_page_low(page_id, recv_addr);
-  mutex_exit(&recv_sys.mutex);
-  return block;
-}
-
-/** Apply the hash table of stored log records to persistent data pages.
-@param[in]	last_batch	whether the change buffer merge will be
-				performed as part of the operation */
-void recv_apply_hashed_log_recs(bool last_batch)
-{
-	ut_ad(srv_operation == SRV_OPERATION_NORMAL
-	      || is_mariabackup_restore_or_export());
-
-	mutex_enter(&recv_sys.mutex);
-
-	while (recv_sys.apply_batch_on) {
-		bool abort = recv_sys.found_corrupt_log;
-		mutex_exit(&recv_sys.mutex);
-
-		if (abort) {
-			return;
-		}
-
-		os_thread_sleep(500000);
-		mutex_enter(&recv_sys.mutex);
-	}
-
-	ut_ad(!last_batch == log_mutex_own());
-
-	recv_no_ibuf_operations
-		= !last_batch || is_mariabackup_restore_or_export();
-
-	if (ulint n = recv_sys.n_addrs) {
-		if (!log_sys.log.subformat && !srv_force_recovery
-		    && srv_undo_tablespaces_open) {
-			ib::error() << "Recovery of separately logged"
-				" TRUNCATE operations is no longer supported."
-				" Set innodb_force_recovery=1"
-				" if no *trunc.log files exist";
-			recv_sys.found_corrupt_log = true;
-			mutex_exit(&recv_sys.mutex);
-			return;
-		}
-
-		const char* msg = last_batch
-			? "Starting final batch to recover "
-			: "Starting a batch to recover ";
-		ib::info() << msg << n << " pages from redo log.";
-		sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log",
-			   msg, n);
-	}
-	recv_sys.apply_log_recs = true;
-	recv_sys.apply_batch_on = true;
-
-	for (ulint id = srv_undo_tablespaces_open; id--; ) {
-		recv_sys_t::trunc& t = recv_sys.truncated_undo_spaces[id];
-		if (t.lsn) {
-			recv_addr_trim(id + srv_undo_space_id_start, t.pages,
-				       t.lsn);
-		}
-	}
-
-	mtr_t mtr;
-
-	for (ulint i = 0; i < hash_get_n_cells(recv_sys.addr_hash); i++) {
-		for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>(
-			     HASH_GET_FIRST(recv_sys.addr_hash, i));
-		     recv_addr;
-		     recv_addr = static_cast<recv_addr_t*>(
-				HASH_GET_NEXT(addr_hash, recv_addr))) {
-			if (!UT_LIST_GET_LEN(recv_addr->rec_list)) {
-ignore:
-				ut_a(recv_sys.n_addrs);
-				recv_sys.n_addrs--;
-				continue;
-			}
-
-			switch (recv_addr->state) {
-			case RECV_BEING_READ:
-			case RECV_BEING_PROCESSED:
-			case RECV_PROCESSED:
-				continue;
-			case RECV_DISCARDED:
-				goto ignore;
-			case RECV_NOT_PROCESSED:
-			case RECV_WILL_NOT_READ:
-				break;
-			}
-
-			const page_id_t page_id(recv_addr->space,
-						recv_addr->page_no);
-
-			if (recv_addr->state == RECV_NOT_PROCESSED) {
-apply:
-				mtr.start();
-				mtr.set_log_mode(MTR_LOG_NONE);
-				if (buf_block_t* block = buf_page_get_low(
-					    page_id, 0, RW_X_LATCH, NULL,
-					    BUF_GET_IF_IN_POOL,
-					    __FILE__, __LINE__, &mtr, NULL)) {
-					buf_block_dbg_add_level(
-						block, SYNC_NO_ORDER_CHECK);
-					recv_recover_page(block, mtr,
-							  recv_addr);
-					ut_ad(mtr.has_committed());
-				} else {
-					mtr.commit();
-					recv_read_in_area(page_id);
-				}
-			} else if (!recv_recovery_create_page_low(
-					page_id, recv_addr)) {
-				goto apply;
-			}
-		}
-	}
-
-	/* Wait until all the pages have been processed */
-
-	while (recv_sys.n_addrs || buf_get_n_pending_read_ios()) {
-		const bool abort = recv_sys.found_corrupt_log
-			|| recv_sys.found_corrupt_fs;
-
-		if (recv_sys.found_corrupt_fs && !srv_force_recovery) {
-			ib::info() << "Set innodb_force_recovery=1"
-				" to ignore corrupted pages.";
-		}
-
-		mutex_exit(&(recv_sys.mutex));
-
-		if (abort) {
-			return;
-		}
-
-		os_thread_sleep(500000);
-
-		mutex_enter(&(recv_sys.mutex));
-	}
-
-	if (!last_batch) {
-		/* Flush all the file pages to disk and invalidate them in
-		the buffer pool */
-
-		mutex_exit(&(recv_sys.mutex));
-		log_mutex_exit();
-
-		/* Stop the recv_writer thread from issuing any LRU
-		flush batches. */
-		mutex_enter(&recv_sys.writer_mutex);
-
-		/* Wait for any currently run batch to end. */
-		buf_flush_wait_LRU_batch_end();
-
-		os_event_reset(recv_sys.flush_end);
-		recv_sys.flush_type = BUF_FLUSH_LIST;
-		os_event_set(recv_sys.flush_start);
-		os_event_wait(recv_sys.flush_end);
-
-		buf_pool_invalidate();
-
-		/* Allow batches from recv_writer thread. */
-		mutex_exit(&recv_sys.writer_mutex);
-
-		log_mutex_enter();
-		mutex_enter(&(recv_sys.mutex));
-		mlog_init.reset();
-	} else if (!recv_no_ibuf_operations) {
-		/* We skipped this in buf_page_create(). */
-		mlog_init.ibuf_merge(mtr);
-	}
-
-	recv_sys.apply_log_recs = false;
-	recv_sys.apply_batch_on = false;
-
-	recv_sys.empty();
-
-	mutex_exit(&recv_sys.mutex);
-}
-
-/** Parse the redo log to set the space recovery size and flags
-@param[in]	ptr	pointer to parsing redo buffer
-@param[in]	end_ptr	end of the parsing redo buffer
-@param[in]	space	tablespace id */
-static
-void recv_parse_set_size_and_flags(const byte *ptr, byte *end_ptr,
-                                   ulint space)
-{
-  switch (const uint16_t offset= mach_read_from_2(ptr))
+  mlog_init_t::init &i= mlog_init.last(page_id);
+  const lsn_t end_lsn = recs.log.last()->lsn;
+  if (end_lsn < i.lsn)
+    DBUG_LOG("ib_log", "skip log for page " << page_id
+             << " LSN " << end_lsn << " < " << i.lsn);
+  else if (fil_space_t *space= fil_space_t::get(page_id.space()))
   {
-  default:
-    break;
-  case FSP_HEADER_OFFSET + FSP_SIZE:
-  case FSP_HEADER_OFFSET + FSP_SPACE_FLAGS:
-    ptr += 2;
-    ulint val= mach_parse_compressed(&ptr, end_ptr);
-    recv_spaces_t::iterator it= recv_spaces.find(space);
-
-    ut_ad(!recv_sys.mlog_checkpoint_lsn || space == TRX_SYS_SPACE ||
-          srv_is_undo_tablespace(space) || it != recv_spaces.end());
-
-    if (offset == FSP_HEADER_OFFSET + FSP_SIZE)
-      fil_space_set_recv_size_and_flags(
-         space, val, FSP_FLAGS_FCRC32_MASK_MARKER);
-    else
-      fil_space_set_recv_size_and_flags(
-         space, 0, static_cast<uint32_t>(val));
-
-    if (it == recv_spaces.end() || it->second.space)
-      return;
-
-    if (offset == FSP_HEADER_OFFSET + FSP_SIZE)
-      it->second.size= val;
+    mtr.start();
+    mtr.set_log_mode(MTR_LOG_NO_REDO);
+    block= buf_page_create(space, page_id.page_no(), space->zip_size(), &mtr,
+                           b);
+    if (UNIV_UNLIKELY(block != b))
+    {
+      /* The page happened to exist in the buffer pool, or it was just
+      being read in. Before buf_page_get_with_no_latch() returned to
+      buf_page_create(), all changes must have been applied to the
+      page already. */
+      ut_ad(recv_sys.pages.find(page_id) == recv_sys.pages.end());
+      mtr.commit();
+      block= nullptr;
+    }
     else
-      it->second.flags= static_cast<uint32_t>(val);
+    {
+      ut_ad(&recs == &recv_sys.pages.find(page_id)->second);
+      i.created= true;
+      buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+      recv_recover_page(block, mtr, p, space, &i);
+      ut_ad(mtr.has_committed());
+      recs.log.clear();
+      map::iterator r= p++;
+      recv_sys.pages.erase(r);
+    }
+    space->release();
   }
-}
-
-/** Tries to parse a single log record.
-@param[out]	type		log record type
-@param[in]	ptr		pointer to a buffer
-@param[in]	end_ptr		end of the buffer
-@param[out]	space_id	tablespace identifier
-@param[out]	page_no		page number
-@param[in]	apply		whether to apply MLOG_FILE_* records
-@param[out]	body		start of log record body
-@return length of the record, or 0 if the record was not complete */
-static
-ulint
-recv_parse_log_rec(
-	mlog_id_t*	type,
-	byte*		ptr,
-	byte*		end_ptr,
-	ulint*		space,
-	ulint*		page_no,
-	bool		apply,
-	byte**		body)
-{
-	byte*	new_ptr;
-
-	*body = NULL;
-
-	MEM_UNDEFINED(type, sizeof *type);
-	MEM_UNDEFINED(space, sizeof *space);
-	MEM_UNDEFINED(page_no, sizeof *page_no);
-	MEM_UNDEFINED(body, sizeof *body);
 
-	if (ptr == end_ptr) {
-
-		return(0);
-	}
-
-	switch (*ptr) {
-#ifdef UNIV_LOG_LSN_DEBUG
-	case MLOG_LSN | MLOG_SINGLE_REC_FLAG:
-	case MLOG_LSN:
-		new_ptr = mlog_parse_initial_log_record(
-			ptr, end_ptr, type, space, page_no);
-		if (new_ptr != NULL) {
-			const lsn_t	lsn = static_cast<lsn_t>(
-				*space) << 32 | *page_no;
-			ut_a(lsn == recv_sys.recovered_lsn);
-		}
-
-		*type = MLOG_LSN;
-		return(new_ptr - ptr);
-#endif /* UNIV_LOG_LSN_DEBUG */
-	case MLOG_MULTI_REC_END:
-	case MLOG_DUMMY_RECORD:
-		*type = static_cast<mlog_id_t>(*ptr);
-		return(1);
-	case MLOG_CHECKPOINT:
-		if (end_ptr < ptr + SIZE_OF_MLOG_CHECKPOINT) {
-			return(0);
-		}
-		*type = static_cast<mlog_id_t>(*ptr);
-		return(SIZE_OF_MLOG_CHECKPOINT);
-	case MLOG_MULTI_REC_END | MLOG_SINGLE_REC_FLAG:
-	case MLOG_DUMMY_RECORD | MLOG_SINGLE_REC_FLAG:
-	case MLOG_CHECKPOINT | MLOG_SINGLE_REC_FLAG:
-		ib::error() << "Incorrect log record type "
-			<< ib::hex(unsigned(*ptr));
-		recv_sys.found_corrupt_log = true;
-		return(0);
-	}
-
-	new_ptr = mlog_parse_initial_log_record(ptr, end_ptr, type, space,
-						page_no);
-	*body = new_ptr;
-
-	if (UNIV_UNLIKELY(!new_ptr)) {
-
-		return(0);
-	}
-
-	const byte*	old_ptr = new_ptr;
-	new_ptr = recv_parse_or_apply_log_rec_body(
-		*type, new_ptr, end_ptr, page_id_t(*space, *page_no), apply,
-		NULL, NULL);
-
-	if (UNIV_UNLIKELY(new_ptr == NULL)) {
-		return(0);
-	}
-
-	if (*page_no == 0 && *type == MLOG_4BYTES && apply) {
-		recv_parse_set_size_and_flags(old_ptr, end_ptr, *space);
-	}
-
-	return ulint(new_ptr - ptr);
-}
-
-/*******************************************************//**
-Calculates the new value for lsn when more data is added to the log. */
-static
-lsn_t
-recv_calc_lsn_on_data_add(
-/*======================*/
-	lsn_t		lsn,	/*!< in: old lsn */
-	ib_uint64_t	len)	/*!< in: this many bytes of data is
-				added, log block headers not included */
-{
-	unsigned frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
-	unsigned payload_size = log_sys.payload_size();
-	ut_ad(frag_len < payload_size);
-	lsn_t lsn_len = len;
-	lsn_len += (lsn_len + frag_len) / payload_size
-		* (OS_FILE_LOG_BLOCK_SIZE - payload_size);
-
-	return(lsn + lsn_len);
-}
-
-/** Prints diagnostic info of corrupt log.
-@param[in]	ptr	pointer to corrupt log record
-@param[in]	type	type of the log record (could be garbage)
-@param[in]	space	tablespace ID (could be garbage)
-@param[in]	page_no	page number (could be garbage)
-@return whether processing should continue */
-ATTRIBUTE_COLD
-static
-bool
-recv_report_corrupt_log(
-	const byte*	ptr,
-	int		type,
-	ulint		space,
-	ulint		page_no)
-{
-	ib::error() <<
-		"############### CORRUPT LOG RECORD FOUND ##################";
-
-	const ulint ptr_offset = ulint(ptr - recv_sys.buf);
-
-	ib::info() << "Log record type " << type << ", page " << space << ":"
-		<< page_no << ". Log parsing proceeded successfully up to "
-		<< recv_sys.recovered_lsn << ". Previous log record type "
-		<< recv_previous_parsed_rec_type << ", is multi "
-		<< recv_previous_parsed_rec_is_multi << " Recv offset "
-		<< ptr_offset << ", prev "
-		<< recv_previous_parsed_rec_offset;
-
-	ut_ad(ptr <= recv_sys.buf + recv_sys.len);
-
-	const ulint	limit	= 100;
-	const ulint	prev_offset = std::min(recv_previous_parsed_rec_offset,
-					       ptr_offset);
-	const ulint	before = std::min(prev_offset, limit);
-	const ulint	after = std::min(recv_sys.len - ptr_offset, limit);
-
-	ib::info() << "Hex dump starting " << before << " bytes before and"
-		" ending " << after << " bytes after the corrupted record:";
-
-	const byte* start = recv_sys.buf + prev_offset - before;
-
-	ut_print_buf(stderr, start, ulint(ptr - start) + after);
-	putc('\n', stderr);
-
-	if (!srv_force_recovery) {
-		ib::info() << "Set innodb_force_recovery to ignore this error.";
-		return(false);
-	}
-
-	ib::warn() << "The log file may have been corrupt and it is possible"
-		" that the log scan did not proceed far enough in recovery!"
-		" Please run CHECK TABLE on your InnoDB tables to check"
-		" that they are ok! If mysqld crashes after this recovery; "
-		<< FORCE_RECOVERY_MSG;
-	return(true);
+  return block;
 }
 
-/** Report a MLOG_INDEX_LOAD operation.
-@param[in]	space_id	tablespace id
-@param[in]	page_no		page number
-@param[in]	lsn		log sequence number */
-ATTRIBUTE_COLD static void
-recv_mlog_index_load(ulint space_id, ulint page_no, lsn_t lsn)
+/** Attempt to initialize a page based on redo log records.
+@param page_id  page identifier
+@return the recovered block, or nullptr if the page was not initialized */
+buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
 {
-	recv_spaces_t::iterator it = recv_spaces.find(space_id);
-	if (it != recv_spaces.end()) {
-		it->second.mlog_index_load(lsn);
-	}
+  buf_block_t *free_block= buf_LRU_get_free_block(false);
+  buf_block_t *block= nullptr;
 
-	if (log_optimized_ddl_op) {
-		log_optimized_ddl_op(space_id);
-	}
-}
+  mutex_enter(&mutex);
+  map::iterator p= pages.find(page_id);
 
-/** Check whether read redo log memory exceeds the available memory
-of buffer pool. Store last_stored_lsn if it is not in last phase
-@param[in]	store		whether to store page operations
-@param[in]	available_mem	Available memory in buffer pool to
-				read redo logs. */
-static bool recv_sys_heap_check(store_t* store, ulint available_mem)
-{
-  if (*store != STORE_NO && mem_heap_get_size(recv_sys.heap) >= available_mem)
+  if (p != pages.end() && p->second.state == page_recv_t::RECV_WILL_NOT_READ)
   {
-    if (*store == STORE_YES)
-      recv_sys.last_stored_lsn= recv_sys.recovered_lsn;
-
-    *store= STORE_NO;
-    DBUG_PRINT("ib_log",("Ran out of memory and last "
-			 "stored lsn " LSN_PF " last stored offset "
-			 ULINTPF "\n",
-			 recv_sys.recovered_lsn, recv_sys.recovered_offset));
-    return true;
+    mtr_t mtr;
+    block= recover_low(page_id, p, mtr, free_block);
+    ut_ad(!block || block == free_block);
   }
 
-  return false;
+  mutex_exit(&mutex);
+  if (UNIV_UNLIKELY(!block))
+    buf_pool.free_block(free_block);
+  return block;
 }
 
-/** Parse log records from a buffer and optionally store them to a
-hash table to wait merging to file pages.
-@param[in]	checkpoint_lsn		the LSN of the latest checkpoint
-@param[in]	store			whether to store page operations
-@param[in]	available_mem		memory to read the redo logs
-@param[in]	apply			whether to apply the records
-@return whether MLOG_CHECKPOINT record was seen the first time,
-or corruption was noticed */
-bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store,
-			 ulint available_mem, bool apply)
+/** Sort buf_pool.flush_list by descending oldest_modification, under its mutex */
+static void log_sort_flush_list()
 {
-	byte*		ptr;
-	byte*		end_ptr;
-	bool		single_rec;
-	ulint		len;
-	lsn_t		new_recovered_lsn;
-	lsn_t		old_lsn;
-	mlog_id_t	type;
-	ulint		space;
-	ulint		page_no;
-	byte*		body;
-	const bool	last_phase = (*store == STORE_IF_EXISTS);
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
 
-	ut_ad(log_mutex_own());
-	ut_ad(mutex_own(&recv_sys.mutex));
-	ut_ad(recv_sys.parse_start_lsn != 0);
-loop:
-	ptr = recv_sys.buf + recv_sys.recovered_offset;
-
-	end_ptr = recv_sys.buf + recv_sys.len;
-
-	if (ptr == end_ptr) {
-
-		return(false);
-	}
-
-	/* Check for memory overflow and ignore the parsing of remaining
-	redo log records if InnoDB ran out of memory */
-	if (recv_sys_heap_check(store, available_mem) && last_phase) {
-		return false;
-	}
-
-	switch (*ptr) {
-	case MLOG_CHECKPOINT:
-#ifdef UNIV_LOG_LSN_DEBUG
-	case MLOG_LSN:
-#endif /* UNIV_LOG_LSN_DEBUG */
-	case MLOG_DUMMY_RECORD:
-		single_rec = true;
-		break;
-	default:
-		single_rec = !!(*ptr & MLOG_SINGLE_REC_FLAG);
-	}
-
-	if (single_rec) {
-		/* The mtr did not modify multiple pages */
+  const size_t size= UT_LIST_GET_LEN(buf_pool.flush_list);
+  std::unique_ptr<buf_page_t *[]> list(new buf_page_t *[size]);
 
-		old_lsn = recv_sys.recovered_lsn;
+  size_t idx= 0;
+  for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p;
+       p= UT_LIST_GET_NEXT(list, p))
+    list.get()[idx++]= p;
 
-		/* Try to parse a log record, fetching its type, space id,
-		page no, and a pointer to the body of the log record */
+  std::sort(list.get(), list.get() + size,
+            [](const buf_page_t *lhs, const buf_page_t *rhs) {
+              return rhs->oldest_modification() < lhs->oldest_modification();
+            });
 
-		len = recv_parse_log_rec(&type, ptr, end_ptr, &space,
-					 &page_no, apply, &body);
+  UT_LIST_INIT(buf_pool.flush_list, &buf_page_t::list);
 
-		if (UNIV_UNLIKELY(recv_sys.found_corrupt_log)) {
-			recv_report_corrupt_log(ptr, type, space, page_no);
-			return(true);
-		}
-
-		if (UNIV_UNLIKELY(recv_sys.found_corrupt_fs)) {
-			return(true);
-		}
+  for (size_t i= 0; i < size; i++)
+    UT_LIST_ADD_LAST(buf_pool.flush_list, list[i]);
 
-		if (len == 0) {
-			return(false);
-		}
-
-		new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
 
-		if (new_recovered_lsn > recv_sys.scanned_lsn) {
-			/* The log record filled a log block, and we require
-			that also the next log block should have been scanned
-			in */
+/** Apply buffered log to persistent data pages.
+@param last_batch     whether it is possible to write more redo log */
+void recv_sys_t::apply(bool last_batch)
+{
+  ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
+        srv_operation == SRV_OPERATION_RESTORE ||
+        srv_operation == SRV_OPERATION_RESTORE_EXPORT);
 
-			return(false);
-		}
+  mutex_enter(&mutex);
 
-		recv_previous_parsed_rec_type = type;
-		recv_previous_parsed_rec_offset = recv_sys.recovered_offset;
-		recv_previous_parsed_rec_is_multi = 0;
+  while (apply_batch_on)
+  {
+    bool abort= found_corrupt_log;
+    mutex_exit(&mutex);
 
-		recv_sys.recovered_offset += len;
-		recv_sys.recovered_lsn = new_recovered_lsn;
+    if (abort)
+      return;
 
-		switch (type) {
-			lsn_t	lsn;
-		case MLOG_DUMMY_RECORD:
-			/* Do nothing */
-			break;
-		case MLOG_CHECKPOINT:
-			compile_time_assert(SIZE_OF_MLOG_CHECKPOINT == 1 + 8);
-			lsn = mach_read_from_8(ptr + 1);
-
-			if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
-				fprintf(stderr,
-					"MLOG_CHECKPOINT(" LSN_PF ") %s at "
-					LSN_PF "\n", lsn,
-					lsn != checkpoint_lsn ? "ignored"
-					: recv_sys.mlog_checkpoint_lsn
-					? "reread" : "read",
-					recv_sys.recovered_lsn);
-			}
+    os_thread_sleep(500000);
+    mutex_enter(&mutex);
+  }
 
-			DBUG_PRINT("ib_log",
-				   ("MLOG_CHECKPOINT(" LSN_PF ") %s at "
-				    LSN_PF,
-				    lsn,
-				    lsn != checkpoint_lsn ? "ignored"
-				    : recv_sys.mlog_checkpoint_lsn
-				    ? "reread" : "read",
-				    recv_sys.recovered_lsn));
-
-			if (lsn == checkpoint_lsn) {
-				if (recv_sys.mlog_checkpoint_lsn) {
-					/* There can be multiple
-					MLOG_CHECKPOINT lsn for the
-					same checkpoint. */
-					break;
-				}
-				recv_sys.mlog_checkpoint_lsn
-					= recv_sys.recovered_lsn;
-				return(true);
-			}
-			break;
-#ifdef UNIV_LOG_LSN_DEBUG
-		case MLOG_LSN:
-			/* Do not add these records to the hash table.
-			The page number and space id fields are misused
-			for something else. */
-			break;
-#endif /* UNIV_LOG_LSN_DEBUG */
-		default:
-			switch (*store) {
-			case STORE_NO:
-				break;
-			case STORE_IF_EXISTS:
-				if (fil_space_get_flags(space)
-				    == ULINT_UNDEFINED) {
-					break;
-				}
-				/* fall through */
-			case STORE_YES:
-				recv_sys.add(
-					type, space, page_no, body,
-					ptr + len, old_lsn,
-					recv_sys.recovered_lsn);
-			}
-			/* fall through */
-		case MLOG_INDEX_LOAD:
-			if (type == MLOG_INDEX_LOAD) {
-				recv_mlog_index_load(space, page_no, old_lsn);
-			}
-			/* fall through */
-		case MLOG_FILE_NAME:
-		case MLOG_FILE_DELETE:
-		case MLOG_FILE_CREATE2:
-		case MLOG_FILE_RENAME2:
-		case MLOG_TRUNCATE:
-			/* These were already handled by
-			recv_parse_log_rec() and
-			recv_parse_or_apply_log_rec_body(). */
-			DBUG_PRINT("ib_log",
-				("scan " LSN_PF ": log rec %s"
-				" len " ULINTPF
-				" page " ULINTPF ":" ULINTPF,
-				old_lsn, get_mlog_string(type),
-				len, space, page_no));
-		}
-	} else {
-		/* Check that all the records associated with the single mtr
-		are included within the buffer */
-
-		ulint	total_len	= 0;
-		ulint	n_recs		= 0;
-		bool	only_mlog_file	= true;
-		ulint	mlog_rec_len	= 0;
-
-		for (;;) {
-			len = recv_parse_log_rec(
-				&type, ptr, end_ptr, &space, &page_no,
-				false, &body);
-
-			if (UNIV_UNLIKELY(recv_sys.found_corrupt_log)) {
-corrupted_log:
-				recv_report_corrupt_log(
-					ptr, type, space, page_no);
-				return(true);
-			}
+#ifdef SAFE_MUTEX
+  DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex));
+#endif /* SAFE_MUTEX */
 
-			if (ptr == end_ptr) {
-			} else if (type == MLOG_CHECKPOINT
-				   || (*ptr & MLOG_SINGLE_REC_FLAG)) {
-				recv_sys.found_corrupt_log = true;
-				goto corrupted_log;
-			}
+  recv_no_ibuf_operations = !last_batch ||
+    srv_operation == SRV_OPERATION_RESTORE ||
+    srv_operation == SRV_OPERATION_RESTORE_EXPORT;
 
-			if (recv_sys.found_corrupt_fs) {
-				return(true);
-			}
+  mtr_t mtr;
 
-			if (len == 0) {
-				return(false);
-			}
+  if (!pages.empty())
+  {
+    const char *msg= last_batch
+      ? "Starting final batch to recover "
+      : "Starting a batch to recover ";
+    const ulint n= pages.size();
+    ib::info() << msg << n << " pages from redo log.";
+    sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log", msg, n);
 
-			recv_previous_parsed_rec_type = type;
-			recv_previous_parsed_rec_offset
-				= recv_sys.recovered_offset + total_len;
-			recv_previous_parsed_rec_is_multi = 1;
-
-			/* MLOG_FILE_NAME redo log records doesn't make changes
-			to persistent data. If only MLOG_FILE_NAME redo
-			log record exists then reset the parsing buffer pointer
-			by changing recovered_lsn and recovered_offset. */
-			if (type != MLOG_FILE_NAME && only_mlog_file == true) {
-				only_mlog_file = false;
-			}
+    apply_log_recs= true;
+    apply_batch_on= true;
 
-			if (only_mlog_file) {
-				new_recovered_lsn = recv_calc_lsn_on_data_add(
-					recv_sys.recovered_lsn, len);
-				mlog_rec_len += len;
-				recv_sys.recovered_offset += len;
-				recv_sys.recovered_lsn = new_recovered_lsn;
-			}
+    for (auto id= srv_undo_tablespaces_open; id--;)
+    {
+      const trunc& t= truncated_undo_spaces[id];
+      if (t.lsn)
+        trim(page_id_t(id + srv_undo_space_id_start, t.pages), t.lsn);
+    }
 
-			total_len += len;
-			n_recs++;
+    fil_system.extend_to_recv_size();
 
-			ptr += len;
+    buf_block_t *free_block= buf_LRU_get_free_block(false);
 
-			if (type == MLOG_MULTI_REC_END) {
-				DBUG_PRINT("ib_log",
-					   ("scan " LSN_PF
-					    ": multi-log end"
-					    " total_len " ULINTPF
-					    " n=" ULINTPF,
-					    recv_sys.recovered_lsn,
-					    total_len, n_recs));
-				total_len -= mlog_rec_len;
-				break;
-			}
+    for (map::iterator p= pages.begin(); p != pages.end(); )
+    {
+      const page_id_t page_id= p->first;
+      ut_ad(!p->second.log.empty());
+
+      switch (p->second.state) {
+      case page_recv_t::RECV_BEING_READ:
+      case page_recv_t::RECV_BEING_PROCESSED:
+        p++;
+        continue;
+      case page_recv_t::RECV_WILL_NOT_READ:
+        if (UNIV_LIKELY(!!recover_low(page_id, p, mtr, free_block)))
+        {
+          mutex_exit(&mutex);
+          free_block= buf_LRU_get_free_block(false);
+          mutex_enter(&mutex);
+          break;
+        }
+        ut_ad(p == pages.end() || p->first > page_id);
+        continue;
+      case page_recv_t::RECV_NOT_PROCESSED:
+        recv_read_in_area(page_id, p);
+      }
+      p= pages.lower_bound(page_id);
+      /* Ensure that progress will be made. */
+      ut_ad(p == pages.end() || p->first > page_id ||
+            p->second.state >= page_recv_t::RECV_BEING_READ);
+    }
 
-			DBUG_PRINT("ib_log",
-				   ("scan " LSN_PF ": multi-log rec %s"
-				    " len " ULINTPF
-				    " page " ULINTPF ":" ULINTPF,
-				    recv_sys.recovered_lsn,
-				    get_mlog_string(type), len, space, page_no));
-		}
+    buf_pool.free_block(free_block);
 
-		new_recovered_lsn = recv_calc_lsn_on_data_add(
-			recv_sys.recovered_lsn, total_len);
+    /* Wait until all the pages have been processed */
+    while (!pages.empty() || buf_pool.n_pend_reads)
+    {
+      const bool abort= found_corrupt_log || found_corrupt_fs;
 
-		if (new_recovered_lsn > recv_sys.scanned_lsn) {
-			/* The log record filled a log block, and we require
-			that also the next log block should have been scanned
-			in */
+      if (found_corrupt_fs && !srv_force_recovery)
+        ib::info() << "Set innodb_force_recovery=1 to ignore corrupted pages.";
 
-			return(false);
-		}
+      mutex_exit(&mutex);
 
-		/* Add all the records to the hash table */
+      if (abort)
+        return;
+      os_thread_sleep(500000);
+      mutex_enter(&mutex);
+    }
+  }
 
-		ptr = recv_sys.buf + recv_sys.recovered_offset;
+  if (last_batch)
+    /* We skipped this in buf_page_create(). */
+    mlog_init.mark_ibuf_exist(mtr);
+  else
+  {
+    mlog_init.reset();
+    mysql_mutex_unlock(&log_sys.mutex);
+  }
 
-		for (;;) {
-			old_lsn = recv_sys.recovered_lsn;
-			/* This will apply MLOG_FILE_ records. We
-			had to skip them in the first scan, because we
-			did not know if the mini-transaction was
-			completely recovered (until MLOG_MULTI_REC_END). */
-			len = recv_parse_log_rec(
-				&type, ptr, end_ptr, &space, &page_no,
-				apply, &body);
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+  mutex_exit(&mutex);
 
-			if (UNIV_UNLIKELY(recv_sys.found_corrupt_log)
-			    && !recv_report_corrupt_log(
-				    ptr, type, space, page_no)) {
-				return(true);
-			}
+  if (last_batch && srv_operation != SRV_OPERATION_RESTORE &&
+      srv_operation != SRV_OPERATION_RESTORE_EXPORT)
+    log_sort_flush_list();
+  else
+  {
+    /* This is not the final batch of a normal startup: instead of sorting
+    buf_pool.flush_list, call buf_flush_sync_batch() up to recovered_lsn. */
+    buf_flush_sync_batch(recovered_lsn);
+  }
 
-			if (UNIV_UNLIKELY(recv_sys.found_corrupt_fs)) {
-				return(true);
-			}
+  if (!last_batch)
+  {
+    buf_pool_invalidate();
+    mysql_mutex_lock(&log_sys.mutex);
+  }
+#if 1 /* Mariabackup FIXME: Remove or adjust rename_table_in_prepare() */
+  else if (srv_operation != SRV_OPERATION_NORMAL);
+#endif
+  else
+  {
+    /* In the last batch, we will apply any rename operations. */
+    for (auto r : renamed_spaces)
+    {
+      const uint32_t id= r.first;
+      fil_space_t *space= fil_space_t::get(id);
+      if (!space)
+        continue;
+      ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+      const char *old= space->chain.start->name;
+      if (r.second != old)
+      {
+        bool exists;
+        os_file_type_t ftype;
+        const char *new_name= r.second.c_str();
+        if (!os_file_status(new_name, &exists, &ftype) || exists)
+        {
+          ib::error() << "Cannot replay rename of tablespace " << id
+                      << " from '" << old << "' to '" << r.second <<
+                      (exists ? "' because the target file exists" : "'");
+          found_corrupt_fs= true;
+        }
+        else
+        {
+          size_t base= r.second.rfind(OS_PATH_SEPARATOR);
+          ut_ad(base != std::string::npos);
+          size_t start= r.second.rfind(OS_PATH_SEPARATOR, base - 1);
+          if (start == std::string::npos)
+            start= 0;
+          else
+            ++start;
+          /* Keep only databasename/tablename without .ibd suffix */
+          std::string space_name(r.second, start, r.second.size() - start - 4);
+          ut_ad(space_name[base - start] == OS_PATH_SEPARATOR);
+#if OS_PATH_SEPARATOR != '/'
+          space_name[base - start]= '/';
+#endif
+          mysql_mutex_lock(&log_sys.mutex);
+          if (dberr_t err= space->rename(space_name.c_str(), r.second.c_str(),
+                                         false))
+          {
+            ib::error() << "Cannot replay rename of tablespace " << id
+                        << " to '" << r.second << "': " << err;
+            found_corrupt_fs= true;
+          }
+          mysql_mutex_unlock(&log_sys.mutex);
+        }
+      }
+      space->release();
+    }
+    renamed_spaces.clear();
+  }
 
-			ut_a(len != 0);
-			ut_a(!(*ptr & MLOG_SINGLE_REC_FLAG));
-
-			recv_sys.recovered_offset += len;
-			recv_sys.recovered_lsn
-				= recv_calc_lsn_on_data_add(old_lsn, len);
-
-			switch (type) {
-			case MLOG_MULTI_REC_END:
-				/* Found the end mark for the records */
-				goto loop;
-#ifdef UNIV_LOG_LSN_DEBUG
-			case MLOG_LSN:
-				/* Do not add these records to the hash table.
-				The page number and space id fields are misused
-				for something else. */
-				break;
-#endif /* UNIV_LOG_LSN_DEBUG */
-			case MLOG_INDEX_LOAD:
-				recv_mlog_index_load(space, page_no, old_lsn);
-				break;
-			case MLOG_FILE_NAME:
-			case MLOG_FILE_DELETE:
-			case MLOG_FILE_CREATE2:
-			case MLOG_FILE_RENAME2:
-			case MLOG_TRUNCATE:
-				/* These were already handled by
-				recv_parse_log_rec() and
-				recv_parse_or_apply_log_rec_body(). */
-				break;
-			default:
-				switch (*store) {
-				case STORE_NO:
-					break;
-				case STORE_IF_EXISTS:
-					if (fil_space_get_flags(space)
-					    == ULINT_UNDEFINED) {
-						break;
-					}
-					/* fall through */
-				case STORE_YES:
-					recv_sys.add(
-						type, space, page_no,
-						body, ptr + len,
-						old_lsn,
-						new_recovered_lsn);
-				}
-			}
+  mutex_enter(&mutex);
 
-			ptr += len;
-		}
-	}
+  ut_d(after_apply= true);
+  clear();
+  mutex_exit(&mutex);
+}
 
-	goto loop;
+/** Check whether the read redo log blocks have reached 1/3 of buf_pool pages.
+If so, save last_stored_lsn (only when *store==STORE_YES), set *store=STORE_NO.
+@param[in,out] store    whether to store page operations
+@return whether the memory is exhausted (always false if *store==STORE_NO) */
+inline bool recv_sys_t::is_memory_exhausted(store_t *store)
+{
+  if (*store == STORE_NO ||
+      UT_LIST_GET_LEN(blocks) * 3 < buf_pool.get_n_pages())
+    return false;
+  if (*store == STORE_YES)
+    last_stored_lsn= recovered_lsn;
+  *store= STORE_NO;
+  DBUG_PRINT("ib_log",("Ran out of memory and last stored lsn " LSN_PF
+                       " last stored offset " ULINTPF "\n",
+                       recovered_lsn, recovered_offset));
+  return true;
 }
 
 /** Adds data from a new log block to the parsing buffer of recv_sys if
@@ -3104,8 +2901,8 @@ bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn)
 	ut_ad(start_offset <= end_offset);
 
 	if (start_offset < end_offset) {
-		ut_memcpy(recv_sys.buf + recv_sys.len,
-			  log_block + start_offset, end_offset - start_offset);
+		memcpy(recv_sys.buf + recv_sys.len,
+		       log_block + start_offset, end_offset - start_offset);
 
 		recv_sys.len += end_offset - start_offset;
 
@@ -3129,13 +2926,11 @@ void recv_sys_justify_left_parsing_buf()
 /** Scan redo log from a buffer and stores new log data to the parsing buffer.
 Parse and hash the log records if new data found.
 Apply log records automatically when the hash table becomes full.
-@param[in]	available_mem		we let the hash table of recs to
-					grow to this size, at the maximum
-@param[in,out]	store_to_hash		whether the records should be
-					stored to the hash table; this is
+@param[in,out]	store			whether the records should be
+					stored into recv_sys.pages; this is
 					reset if just debug checking is
-					needed, or when the available_mem
-					runs out
+					needed, or when the num_max_blocks in
+					recv_sys runs out
 @param[in]	log_block		log segment
 @param[in]	checkpoint_lsn		latest checkpoint LSN
 @param[in]	start_lsn		buffer start LSN
@@ -3145,8 +2940,7 @@ Apply log records automatically when the hash table becomes full.
 @param[out]	group_scanned_lsn	scanning succeeded upto this lsn
 @return true if not able to scan any more in this log group */
 static bool recv_scan_log_recs(
-	ulint		available_mem,
-	store_t*	store_to_hash,
+	store_t*	store,
 	const byte*	log_block,
 	lsn_t		checkpoint_lsn,
 	lsn_t		start_lsn,
@@ -3160,13 +2954,16 @@ static bool recv_scan_log_recs(
 	bool		more_data	= false;
 	bool		apply		= recv_sys.mlog_checkpoint_lsn != 0;
 	ulint		recv_parsing_buf_size = RECV_PARSING_BUF_SIZE;
-	const bool	last_phase = (*store_to_hash == STORE_IF_EXISTS);
+	const bool	last_phase = (*store == STORE_IF_EXISTS);
 	ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
 	ut_ad(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
 	ut_ad(end_lsn >= start_lsn + OS_FILE_LOG_BLOCK_SIZE);
+	ut_ad(log_sys.is_physical());
 
 	const byte* const	log_end = log_block
 		+ ulint(end_lsn - start_lsn);
+	constexpr ulint sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT;
+
 	do {
 		ut_ad(!finished);
 
@@ -3212,11 +3009,13 @@ static bool recv_scan_log_recs(
 
 		scanned_lsn += data_len;
 
-		if (data_len == LOG_BLOCK_HDR_SIZE + SIZE_OF_MLOG_CHECKPOINT
-		    && scanned_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT
-		    && log_block[LOG_BLOCK_HDR_SIZE] == MLOG_CHECKPOINT
-		    && checkpoint_lsn == mach_read_from_8(LOG_BLOCK_HDR_SIZE
-							  + 1 + log_block)) {
+		if (data_len == LOG_BLOCK_HDR_SIZE + sizeof_checkpoint
+		    && scanned_lsn == checkpoint_lsn + sizeof_checkpoint
+		    && log_block[LOG_BLOCK_HDR_SIZE]
+		    == (FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2))
+		    && checkpoint_lsn == mach_read_from_8(
+			    (LOG_BLOCK_HDR_SIZE + 1 + 2)
+			    + log_block)) {
 			/* The redo log is logically empty. */
 			ut_ad(recv_sys.mlog_checkpoint_lsn == 0
 			      || recv_sys.mlog_checkpoint_lsn
@@ -3229,7 +3028,7 @@ static bool recv_scan_log_recs(
 		}
 
 		if (scanned_lsn > recv_sys.scanned_lsn) {
-			ut_ad(!srv_log_files_created);
+			ut_ad(!srv_log_file_created);
 			if (!recv_needed_recovery) {
 				recv_needed_recovery = true;
 
@@ -3240,8 +3039,8 @@ static bool recv_scan_log_recs(
 				}
 
 				ib::info() << "Starting crash recovery from"
-					" checkpoint LSN="
-					<< recv_sys.scanned_lsn;
+					" checkpoint LSN=" << checkpoint_lsn
+					   << "," << recv_sys.scanned_lsn;
 			}
 
 			/* We were able to find more log data: add it to the
@@ -3250,8 +3049,7 @@ static bool recv_scan_log_recs(
 
 			DBUG_EXECUTE_IF(
 				"reduce_recv_parsing_buf",
-				recv_parsing_buf_size
-					= (70 * 1024);
+				recv_parsing_buf_size = RECV_SCAN_SIZE * 2;
 				);
 
 			if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE
@@ -3299,10 +3097,7 @@ static bool recv_scan_log_recs(
 
 	if (more_data && !recv_sys.found_corrupt_log) {
 		/* Try to parse more log records */
-
-		if (recv_parse_log_recs(checkpoint_lsn,
-					store_to_hash, available_mem,
-					apply)) {
+		if (recv_sys.parse(checkpoint_lsn, store, apply)) {
 			ut_ad(recv_sys.found_corrupt_log
 			      || recv_sys.found_corrupt_fs
 			      || recv_sys.mlog_checkpoint_lsn
@@ -3311,16 +3106,19 @@ static bool recv_scan_log_recs(
 			goto func_exit;
 		}
 
-		recv_sys_heap_check(store_to_hash, available_mem);
+		recv_sys.is_memory_exhausted(store);
 
-		if (recv_sys.recovered_offset > recv_parsing_buf_size / 4) {
+		if (recv_sys.recovered_offset > recv_parsing_buf_size / 4
+		    || (recv_sys.recovered_offset
+			&& recv_sys.len
+			>= recv_parsing_buf_size - RECV_SCAN_SIZE)) {
 			/* Move parsing buffer data to the buffer start */
 			recv_sys_justify_left_parsing_buf();
 		}
 
 		/* Need to re-parse the redo log which're stored
 		in recv_sys.buf */
-		if (last_phase && *store_to_hash == STORE_NO) {
+		if (last_phase && *store == STORE_NO) {
 			finished = false;
 		}
 	}
@@ -3351,38 +3149,27 @@ recv_group_scan_log_recs(
 	mutex_enter(&recv_sys.mutex);
 	recv_sys.len = 0;
 	recv_sys.recovered_offset = 0;
-	recv_sys.n_addrs = 0;
-	recv_sys.empty();
-	srv_start_lsn = *contiguous_lsn;
+	recv_sys.clear();
 	recv_sys.parse_start_lsn = *contiguous_lsn;
 	recv_sys.scanned_lsn = *contiguous_lsn;
 	recv_sys.recovered_lsn = *contiguous_lsn;
 	recv_sys.scanned_checkpoint_no = 0;
-	recv_previous_parsed_rec_type = MLOG_SINGLE_REC_FLAG;
-	recv_previous_parsed_rec_offset	= 0;
-	recv_previous_parsed_rec_is_multi = 0;
 	ut_ad(recv_max_page_lsn == 0);
-	ut_ad(last_phase || !recv_writer_thread_active);
 	mutex_exit(&recv_sys.mutex);
 
 	lsn_t	start_lsn;
 	lsn_t	end_lsn;
-	store_t	store_to_hash	= recv_sys.mlog_checkpoint_lsn == 0
+	store_t	store	= recv_sys.mlog_checkpoint_lsn == 0
 		? STORE_NO : (last_phase ? STORE_IF_EXISTS : STORE_YES);
-	ulint	available_mem = (buf_pool_get_n_pages() * 2 / 3)
-		<< srv_page_size_shift;
 
 	log_sys.log.scanned_lsn = end_lsn = *contiguous_lsn =
 		ut_uint64_align_down(*contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE);
+	ut_d(recv_sys.after_apply = last_phase);
 
 	do {
-		if (last_phase && store_to_hash == STORE_NO) {
-			store_to_hash = STORE_IF_EXISTS;
-			/* We must not allow change buffer
-			merge here, because it would generate
-			redo log records before we have
-			finished the redo log scan. */
-			recv_apply_hashed_log_recs(false);
+		if (last_phase && store == STORE_NO) {
+			store = STORE_IF_EXISTS;
+			recv_sys.apply(false);
 			/* Rescan the redo logs from last stored lsn */
 			end_lsn = recv_sys.recovered_lsn;
 		}
@@ -3392,11 +3179,9 @@ recv_group_scan_log_recs(
 		end_lsn = start_lsn;
 		log_sys.log.read_log_seg(&end_lsn, start_lsn + RECV_SCAN_SIZE);
 	} while (end_lsn != start_lsn
-		 && !recv_scan_log_recs(
-			 available_mem, &store_to_hash, log_sys.buf,
-			 checkpoint_lsn,
-			 start_lsn, end_lsn,
-			 contiguous_lsn, &log_sys.log.scanned_lsn));
+		 && !recv_scan_log_recs(&store, log_sys.buf, checkpoint_lsn,
+					start_lsn, end_lsn, contiguous_lsn,
+					&log_sys.log.scanned_lsn));
 
 	if (recv_sys.found_corrupt_log || recv_sys.found_corrupt_fs) {
 		DBUG_RETURN(false);
@@ -3406,7 +3191,7 @@ recv_group_scan_log_recs(
 			      last_phase ? "rescan" : "scan",
 			      log_sys.log.scanned_lsn));
 
-	DBUG_RETURN(store_to_hash == STORE_NO);
+	DBUG_RETURN(store == STORE_NO);
 }
 
 /** Report a missing tablespace for which page-redo log exists.
@@ -3417,7 +3202,8 @@ static
 dberr_t
 recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i)
 {
-	if (is_mariabackup_restore_or_export()) {
+	if (srv_operation == SRV_OPERATION_RESTORE
+	    || srv_operation == SRV_OPERATION_RESTORE_EXPORT) {
 		if (i->second.name.find(TEMP_TABLE_PATH_PREFIX)
 		    != std::string::npos) {
 			ib::warn() << "Tablespace " << i->first << " was not"
@@ -3460,42 +3246,44 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace)
 {
 	dberr_t err = DB_SUCCESS;
 
-	for (ulint h = 0; h < hash_get_n_cells(recv_sys.addr_hash); h++) {
-		for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>(
-			     HASH_GET_FIRST(recv_sys.addr_hash, h));
-		     recv_addr != 0;
-		     recv_addr = static_cast<recv_addr_t*>(
-			     HASH_GET_NEXT(addr_hash, recv_addr))) {
+	mutex_enter(&recv_sys.mutex);
 
-			const ulint space = recv_addr->space;
+	for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
+	     p != recv_sys.pages.end();) {
+		ut_ad(!p->second.log.empty());
+		const ulint space = p->first.space();
+		if (is_predefined_tablespace(space)) {
+next:
+			p++;
+			continue;
+		}
 
-			if (is_predefined_tablespace(space)) {
-				continue;
-			}
+		recv_spaces_t::iterator i = recv_spaces.find(space);
+		ut_ad(i != recv_spaces.end());
 
-			recv_spaces_t::iterator i = recv_spaces.find(space);
-			ut_ad(i != recv_spaces.end());
-
-			switch (i->second.status) {
-			case file_name_t::MISSING:
-				err = recv_init_missing_space(err, i);
-				i->second.status = file_name_t::DELETED;
-				/* fall through */
-			case file_name_t::DELETED:
-				recv_addr->state = RECV_DISCARDED;
-				/* fall through */
-			case file_name_t::NORMAL:
-				continue;
-			}
-			ut_ad(0);
+		switch (i->second.status) {
+		case file_name_t::NORMAL:
+			goto next;
+		case file_name_t::MISSING:
+			err = recv_init_missing_space(err, i);
+			i->second.status = file_name_t::DELETED;
+			/* fall through */
+		case file_name_t::DELETED:
+			recv_sys_t::map::iterator r = p++;
+			r->second.log.clear();
+			recv_sys.pages.erase(r);
+			continue;
 		}
+		ut_ad(0);
 	}
 
 	if (err != DB_SUCCESS) {
+func_exit:
+		mutex_exit(&recv_sys.mutex);
 		return(err);
 	}
 
-	/* When rescan is not needed, recv_sys.addr_hash will contain the
+	/* When rescan is not needed, recv_sys.pages will contain the
 	entire redo log. If rescan is needed or innodb_force_recovery
 	is set, we can ignore missing tablespaces. */
 	for (const recv_spaces_t::value_type& rs : recv_spaces) {
@@ -3526,7 +3314,8 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace)
 		missing_tablespace = false;
 	}
 
-	return DB_SUCCESS;
+	err = DB_SUCCESS;
+	goto func_exit;
 }
 
 /** Check if all tablespaces were found for crash recovery.
@@ -3555,12 +3344,20 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
 			/* The tablespace was found, and there
 			are some redo log records for it. */
 			fil_names_dirty(rs.second.space);
-			rs.second.space->enable_lsn = rs.second.enable_lsn;
+
+			/* Add the freed page ranges in the respective
+			tablespace */
+			if (!rs.second.freed_ranges.empty()
+			    && (srv_immediate_scrub_data_uncompressed
+				|| rs.second.space->is_compressed())) {
+
+				rs.second.space->add_free_ranges(
+					std::move(rs.second.freed_ranges));
+			}
 		} else if (rs.second.name == "") {
-			ib::error() << "Missing MLOG_FILE_NAME"
-				" or MLOG_FILE_DELETE"
-				" before MLOG_CHECKPOINT for tablespace "
-				<< rs.first;
+			ib::error() << "Missing FILE_CREATE, FILE_DELETE"
+				" or FILE_MODIFY before FILE_CHECKPOINT"
+				" for tablespace " << rs.first;
 			recv_sys.found_corrupt_log = true;
 			return(DB_CORRUPTION);
 		} else {
@@ -3580,7 +3377,6 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
 }
 
 /** Start recovering from a redo log checkpoint.
-@see recv_recovery_from_checkpoint_finish
 @param[in]	flush_lsn	FIL_PAGE_FILE_FLUSH_LSN
 of first system tablespace page
 @return error code or DB_SUCCESS */
@@ -3589,18 +3385,19 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 {
 	ulint		max_cp_field;
 	lsn_t		checkpoint_lsn;
-	bool		rescan;
+	bool		rescan = false;
 	ib_uint64_t	checkpoint_no;
 	lsn_t		contiguous_lsn;
 	byte*		buf;
 	dberr_t		err = DB_SUCCESS;
 
 	ut_ad(srv_operation == SRV_OPERATION_NORMAL
-	      || is_mariabackup_restore_or_export());
-
-	/* Initialize red-black tree for fast insertions into the
-	flush_list during recovery process. */
-	buf_flush_init_flush_rbt();
+	      || srv_operation == SRV_OPERATION_RESTORE
+	      || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+	ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+	ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+	ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+	ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
 
 	if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
 
@@ -3609,22 +3406,21 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 		return(DB_SUCCESS);
 	}
 
-	recv_recovery_on = true;
+	recv_sys.recovery_on = true;
 
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
 
 	err = recv_find_max_checkpoint(&max_cp_field);
 
 	if (err != DB_SUCCESS) {
 
-		srv_start_lsn = recv_sys.recovered_lsn = log_sys.lsn;
-		log_mutex_exit();
+		recv_sys.recovered_lsn = log_sys.get_lsn();
+		mysql_mutex_unlock(&log_sys.mutex);
 		return(err);
 	}
 
-	log_header_read(max_cp_field);
-
 	buf = log_sys.checkpoint_buf;
+	log_sys.log.read(max_cp_field, {buf, OS_FILE_LOG_BLOCK_SIZE});
 
 	checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
 	checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
@@ -3639,13 +3435,12 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 	const lsn_t	end_lsn = mach_read_from_8(
 		buf + LOG_CHECKPOINT_END_LSN);
 
-	ut_ad(recv_sys.n_addrs == 0);
+	ut_ad(recv_sys.pages.empty());
 	contiguous_lsn = checkpoint_lsn;
 	switch (log_sys.log.format) {
 	case 0:
-		log_mutex_exit();
-		return recv_log_format_0_recover(checkpoint_lsn,
-						 buf[20 + 32 * 9] == 2);
+		mysql_mutex_unlock(&log_sys.mutex);
+		return DB_SUCCESS;
 	default:
 		if (end_lsn == 0) {
 			break;
@@ -3655,23 +3450,30 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 			break;
 		}
 		recv_sys.found_corrupt_log = true;
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 		return(DB_ERROR);
 	}
 
-	/* Look for MLOG_CHECKPOINT. */
+	size_t sizeof_checkpoint;
+
+	if (!log_sys.is_physical()) {
+		sizeof_checkpoint = 9/* size of MLOG_CHECKPOINT */;
+		goto completed;
+	}
+
+	/* Look for FILE_CHECKPOINT. */
 	recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false);
 	/* The first scan should not have stored or applied any records. */
-	ut_ad(recv_sys.n_addrs == 0);
+	ut_ad(recv_sys.pages.empty());
 	ut_ad(!recv_sys.found_corrupt_fs);
 
 	if (srv_read_only_mode && recv_needed_recovery) {
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 		return(DB_READ_ONLY);
 	}
 
 	if (recv_sys.found_corrupt_log && !srv_force_recovery) {
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 		ib::warn() << "Log scan aborted at LSN " << contiguous_lsn;
 		return(DB_ERROR);
 	}
@@ -3679,9 +3481,9 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 	if (recv_sys.mlog_checkpoint_lsn == 0) {
 		lsn_t scan_lsn = log_sys.log.scanned_lsn;
 		if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) {
-			log_mutex_exit();
+			mysql_mutex_unlock(&log_sys.mutex);
 			ib::error err;
-			err << "Missing MLOG_CHECKPOINT";
+			err << "Missing FILE_CHECKPOINT";
 			if (end_lsn) {
 				err << " at " << end_lsn;
 			}
@@ -3691,7 +3493,6 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 		}
 
 		log_sys.log.scanned_lsn = checkpoint_lsn;
-		rescan = false;
 	} else {
 		contiguous_lsn = checkpoint_lsn;
 		rescan = recv_group_scan_log_recs(
@@ -3699,7 +3500,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 
 		if ((recv_sys.found_corrupt_log && !srv_force_recovery)
 		    || recv_sys.found_corrupt_fs) {
-			log_mutex_exit();
+			mysql_mutex_unlock(&log_sys.mutex);
 			return(DB_ERROR);
 		}
 	}
@@ -3707,33 +3508,40 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 	/* NOTE: we always do a 'recovery' at startup, but only if
 	there is something wrong we will print a message to the
 	user about recovery: */
+	sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT;
 
-	if (flush_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT
+completed:
+	if (flush_lsn == checkpoint_lsn + sizeof_checkpoint
 	    && recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) {
 		/* The redo log is logically empty. */
 	} else if (checkpoint_lsn != flush_lsn) {
-		ut_ad(!srv_log_files_created);
-
-		if (checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT < flush_lsn) {
-			ib::warn() << "Are you sure you are using the"
-				" right ib_logfiles to start up the database?"
-				" Log sequence number in the ib_logfiles is "
-				<< checkpoint_lsn << ", less than the"
-				" log sequence number in the first system"
-				" tablespace file header, " << flush_lsn << ".";
+		ut_ad(!srv_log_file_created);
+
+		if (checkpoint_lsn + sizeof_checkpoint < flush_lsn) {
+			ib::warn()
+				<< "Are you sure you are using the right "
+				<< LOG_FILE_NAME
+				<< " to start up the database? Log sequence "
+				   "number in the "
+				<< LOG_FILE_NAME << " is " << checkpoint_lsn
+				<< ", less than the log sequence number in "
+				   "the first system tablespace file header, "
+				<< flush_lsn << ".";
 		}
 
 		if (!recv_needed_recovery) {
 
-			ib::info() << "The log sequence number " << flush_lsn
+			ib::info()
+				<< "The log sequence number " << flush_lsn
 				<< " in the system tablespace does not match"
-				" the log sequence number " << checkpoint_lsn
-				<< " in the ib_logfiles!";
+				   " the log sequence number "
+				<< checkpoint_lsn << " in the "
+				<< LOG_FILE_NAME << "!";
 
 			if (srv_read_only_mode) {
 				ib::error() << "innodb_read_only"
 					" prevents crash recovery";
-				log_mutex_exit();
+				mysql_mutex_unlock(&log_sys.mutex);
 				return(DB_READ_ONLY);
 			}
 
@@ -3741,7 +3549,12 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 		}
 	}
 
-	log_sys.lsn = recv_sys.recovered_lsn;
+	log_sys.set_lsn(recv_sys.recovered_lsn);
+	if (UNIV_LIKELY(log_sys.get_flushed_lsn() < recv_sys.recovered_lsn)) {
+		/* This may already have been set by create_log_file()
+		if no logs existed when the server started up. */
+		log_sys.set_flushed_lsn(recv_sys.recovered_lsn);
+	}
 
 	if (recv_needed_recovery) {
 		bool missing_tablespace = false;
@@ -3750,7 +3563,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 			rescan, missing_tablespace);
 
 		if (err != DB_SUCCESS) {
-			log_mutex_exit();
+			mysql_mutex_unlock(&log_sys.mutex);
 			return(err);
 		}
 
@@ -3780,7 +3593,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 					rescan, missing_tablespace);
 
 			if (err != DB_SUCCESS) {
-				log_mutex_exit();
+				mysql_mutex_unlock(&log_sys.mutex);
 				return err;
 			}
 
@@ -3790,16 +3603,11 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 		recv_sys.parse_start_lsn = checkpoint_lsn;
 
 		if (srv_operation == SRV_OPERATION_NORMAL) {
-			buf_dblwr_process();
+			buf_dblwr.recover();
 		}
 
 		ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN);
 
-		/* Spawn the background thread to flush dirty pages
-		from the buffer pools. */
-		recv_writer_thread_active = true;
-		os_thread_create(recv_writer_thread, 0, 0);
-
 		if (rescan) {
 			contiguous_lsn = checkpoint_lsn;
 
@@ -3809,16 +3617,17 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 			if ((recv_sys.found_corrupt_log
 			     && !srv_force_recovery)
 			    || recv_sys.found_corrupt_fs) {
-				log_mutex_exit();
+				mysql_mutex_unlock(&log_sys.mutex);
 				return(DB_ERROR);
 			}
 		}
 	} else {
-		ut_ad(!rescan || recv_sys.n_addrs == 0);
+		ut_ad(!rescan || recv_sys.pages.empty());
 	}
 
-	if (log_sys.log.scanned_lsn < checkpoint_lsn
-	    || log_sys.log.scanned_lsn < recv_max_page_lsn) {
+	if (log_sys.is_physical()
+	    && (log_sys.log.scanned_lsn < checkpoint_lsn
+		|| log_sys.log.scanned_lsn < recv_max_page_lsn)) {
 
 		ib::error() << "We scanned the log up to "
 			<< log_sys.log.scanned_lsn
@@ -3829,10 +3638,11 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 	}
 
 	if (recv_sys.recovered_lsn < checkpoint_lsn) {
-		log_mutex_exit();
+		mysql_mutex_unlock(&log_sys.mutex);
 
 		ib::error() << "Recovered only to lsn:"
-			    << recv_sys.recovered_lsn << " checkpoint_lsn: " << checkpoint_lsn;
+			    << recv_sys.recovered_lsn
+			    << " checkpoint_lsn: " << checkpoint_lsn;
 
 		return(DB_ERROR);
 	}
@@ -3842,40 +3652,37 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 
 	recv_synchronize_groups();
 
-	if (!recv_needed_recovery) {
-		ut_a(checkpoint_lsn == recv_sys.recovered_lsn);
-	} else {
-		srv_start_lsn = recv_sys.recovered_lsn;
-	}
+	ut_ad(recv_needed_recovery
+	      || checkpoint_lsn == recv_sys.recovered_lsn);
 
-	log_sys.buf_free = ulong(log_sys.lsn % OS_FILE_LOG_BLOCK_SIZE);
+	log_sys.write_lsn = log_sys.get_lsn();
+	log_sys.buf_free = log_sys.write_lsn % OS_FILE_LOG_BLOCK_SIZE;
 	log_sys.buf_next_to_write = log_sys.buf_free;
-	log_sys.write_lsn = log_sys.lsn;
 
 	log_sys.last_checkpoint_lsn = checkpoint_lsn;
 
-	if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) {
-		/* Write a MLOG_CHECKPOINT marker as the first thing,
+	if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL
+	    && (~log_t::FORMAT_ENCRYPTED & log_sys.log.format)
+	    == log_t::FORMAT_10_5) {
+		/* Write a FILE_CHECKPOINT marker as the first thing,
 		before generating any other redo log. This ensures
 		that subsequent crash recovery will be possible even
 		if the server were killed soon after this. */
 		fil_names_clear(log_sys.last_checkpoint_lsn, true);
 	}
 
-	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
-		    log_sys.lsn - log_sys.last_checkpoint_lsn);
-
 	log_sys.next_checkpoint_no = ++checkpoint_no;
 
 	mutex_enter(&recv_sys.mutex);
 
 	recv_sys.apply_log_recs = true;
-	recv_no_ibuf_operations = is_mariabackup_restore_or_export();
-	ut_d(recv_no_log_write = recv_no_ibuf_operations);
+	recv_no_ibuf_operations = false;
+	ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE
+	     || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
 
 	mutex_exit(&recv_sys.mutex);
 
-	log_mutex_exit();
+	mysql_mutex_unlock(&log_sys.mutex);
 
 	recv_lsn_checks_on = true;
 
@@ -3886,80 +3693,6 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 	return(DB_SUCCESS);
 }
 
-/** Complete recovery from a checkpoint. */
-void
-recv_recovery_from_checkpoint_finish(void)
-{
-	/* Make sure that the recv_writer thread is done. This is
-	required because it grabs various mutexes and we want to
-	ensure that when we enable sync_order_checks there is no
-	mutex currently held by any thread. */
-	mutex_enter(&recv_sys.writer_mutex);
-
-	/* Free the resources of the recovery system */
-	recv_recovery_on = false;
-
-	/* By acquring the mutex we ensure that the recv_writer thread
-	won't trigger any more LRU batches. Now wait for currently
-	in progress batches to finish. */
-	buf_flush_wait_LRU_batch_end();
-
-	mutex_exit(&recv_sys.writer_mutex);
-
-	ulint count = 0;
-	while (recv_writer_thread_active) {
-		++count;
-		os_thread_sleep(100000);
-		if (srv_print_verbose_log && count > 600) {
-			ib::info() << "Waiting for recv_writer to"
-				" finish flushing of buffer pool";
-			count = 0;
-		}
-	}
-
-	recv_sys.debug_free();
-
-	/* Free up the flush_rbt. */
-	buf_flush_free_flush_rbt();
-}
-
-/********************************************************//**
-Initiates the rollback of active transactions. */
-void
-recv_recovery_rollback_active(void)
-/*===============================*/
-{
-	ut_ad(!recv_writer_thread_active);
-
-	/* Switch latching order checks on in sync0debug.cc, if
-	--innodb-sync-debug=true (default) */
-	ut_d(sync_check_enable());
-
-	/* We can't start any (DDL) transactions if UNDO logging
-	has been disabled, additionally disable ROLLBACK of recovered
-	user transactions. */
-	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
-	    && !srv_read_only_mode) {
-
-		/* Drop partially created indexes. */
-		row_merge_drop_temp_indexes();
-		/* Drop garbage tables. */
-		row_mysql_drop_garbage_tables();
-
-		/* Drop any auxiliary tables that were not dropped when the
-		parent table was dropped. This can happen if the parent table
-		was dropped but the server crashed before the auxiliary tables
-		were dropped. */
-		fts_drop_orphaned_tables();
-
-		/* Rollback the uncommitted transactions which have no user
-		session */
-
-		trx_rollback_is_active = true;
-		os_thread_create(trx_rollback_all_recovered, 0, 0);
-	}
-}
-
 bool recv_dblwr_t::validate_page(const page_id_t page_id,
                                  const byte *page,
                                  const fil_space_t *space,
@@ -4042,7 +3775,7 @@ byte *recv_dblwr_t::find_page(const page_id_t page_id,
     if (lsn <= max_lsn ||
         !validate_page(page_id, page, space, tmp_buf))
     {
-      /* Mark processed for subsequent iterations in buf_dblwr_process() */
+      /* Mark processed for subsequent iterations in buf_dblwr_t::recover() */
       memset(page + FIL_PAGE_LSN, 0, 8);
       continue;
     }
@@ -4052,185 +3785,3 @@ byte *recv_dblwr_t::find_page(const page_id_t page_id,
 
   return result;
 }
-
-#ifndef DBUG_OFF
-/** Return string name of the redo log record type.
-@param[in]	type	record log record enum
-@return string name of record log record */
-static const char* get_mlog_string(mlog_id_t type)
-{
-	switch (type) {
-	case MLOG_SINGLE_REC_FLAG:
-		return("MLOG_SINGLE_REC_FLAG");
-
-	case MLOG_1BYTE:
-		return("MLOG_1BYTE");
-
-	case MLOG_2BYTES:
-		return("MLOG_2BYTES");
-
-	case MLOG_4BYTES:
-		return("MLOG_4BYTES");
-
-	case MLOG_8BYTES:
-		return("MLOG_8BYTES");
-
-	case MLOG_REC_INSERT:
-		return("MLOG_REC_INSERT");
-
-	case MLOG_REC_CLUST_DELETE_MARK:
-		return("MLOG_REC_CLUST_DELETE_MARK");
-
-	case MLOG_REC_SEC_DELETE_MARK:
-		return("MLOG_REC_SEC_DELETE_MARK");
-
-	case MLOG_REC_UPDATE_IN_PLACE:
-		return("MLOG_REC_UPDATE_IN_PLACE");
-
-	case MLOG_REC_DELETE:
-		return("MLOG_REC_DELETE");
-
-	case MLOG_LIST_END_DELETE:
-		return("MLOG_LIST_END_DELETE");
-
-	case MLOG_LIST_START_DELETE:
-		return("MLOG_LIST_START_DELETE");
-
-	case MLOG_LIST_END_COPY_CREATED:
-		return("MLOG_LIST_END_COPY_CREATED");
-
-	case MLOG_PAGE_REORGANIZE:
-		return("MLOG_PAGE_REORGANIZE");
-
-	case MLOG_PAGE_CREATE:
-		return("MLOG_PAGE_CREATE");
-
-	case MLOG_UNDO_INSERT:
-		return("MLOG_UNDO_INSERT");
-
-	case MLOG_UNDO_ERASE_END:
-		return("MLOG_UNDO_ERASE_END");
-
-	case MLOG_UNDO_INIT:
-		return("MLOG_UNDO_INIT");
-
-	case MLOG_UNDO_HDR_REUSE:
-		return("MLOG_UNDO_HDR_REUSE");
-
-	case MLOG_UNDO_HDR_CREATE:
-		return("MLOG_UNDO_HDR_CREATE");
-
-	case MLOG_REC_MIN_MARK:
-		return("MLOG_REC_MIN_MARK");
-
-	case MLOG_IBUF_BITMAP_INIT:
-		return("MLOG_IBUF_BITMAP_INIT");
-
-#ifdef UNIV_LOG_LSN_DEBUG
-	case MLOG_LSN:
-		return("MLOG_LSN");
-#endif /* UNIV_LOG_LSN_DEBUG */
-
-	case MLOG_WRITE_STRING:
-		return("MLOG_WRITE_STRING");
-
-	case MLOG_MULTI_REC_END:
-		return("MLOG_MULTI_REC_END");
-
-	case MLOG_DUMMY_RECORD:
-		return("MLOG_DUMMY_RECORD");
-
-	case MLOG_FILE_DELETE:
-		return("MLOG_FILE_DELETE");
-
-	case MLOG_COMP_REC_MIN_MARK:
-		return("MLOG_COMP_REC_MIN_MARK");
-
-	case MLOG_COMP_PAGE_CREATE:
-		return("MLOG_COMP_PAGE_CREATE");
-
-	case MLOG_COMP_REC_INSERT:
-		return("MLOG_COMP_REC_INSERT");
-
-	case MLOG_COMP_REC_CLUST_DELETE_MARK:
-		return("MLOG_COMP_REC_CLUST_DELETE_MARK");
-
-	case MLOG_COMP_REC_UPDATE_IN_PLACE:
-		return("MLOG_COMP_REC_UPDATE_IN_PLACE");
-
-	case MLOG_COMP_REC_DELETE:
-		return("MLOG_COMP_REC_DELETE");
-
-	case MLOG_COMP_LIST_END_DELETE:
-		return("MLOG_COMP_LIST_END_DELETE");
-
-	case MLOG_COMP_LIST_START_DELETE:
-		return("MLOG_COMP_LIST_START_DELETE");
-
-	case MLOG_COMP_LIST_END_COPY_CREATED:
-		return("MLOG_COMP_LIST_END_COPY_CREATED");
-
-	case MLOG_COMP_PAGE_REORGANIZE:
-		return("MLOG_COMP_PAGE_REORGANIZE");
-
-	case MLOG_FILE_CREATE2:
-		return("MLOG_FILE_CREATE2");
-
-	case MLOG_ZIP_WRITE_NODE_PTR:
-		return("MLOG_ZIP_WRITE_NODE_PTR");
-
-	case MLOG_ZIP_WRITE_BLOB_PTR:
-		return("MLOG_ZIP_WRITE_BLOB_PTR");
-
-	case MLOG_ZIP_WRITE_HEADER:
-		return("MLOG_ZIP_WRITE_HEADER");
-
-	case MLOG_ZIP_PAGE_COMPRESS:
-		return("MLOG_ZIP_PAGE_COMPRESS");
-
-	case MLOG_ZIP_PAGE_COMPRESS_NO_DATA:
-		return("MLOG_ZIP_PAGE_COMPRESS_NO_DATA");
-
-	case MLOG_ZIP_PAGE_REORGANIZE:
-		return("MLOG_ZIP_PAGE_REORGANIZE");
-
-	case MLOG_ZIP_WRITE_TRX_ID:
-		return("MLOG_ZIP_WRITE_TRX_ID");
-
-	case MLOG_FILE_RENAME2:
-		return("MLOG_FILE_RENAME2");
-
-	case MLOG_FILE_NAME:
-		return("MLOG_FILE_NAME");
-
-	case MLOG_CHECKPOINT:
-		return("MLOG_CHECKPOINT");
-
-	case MLOG_PAGE_CREATE_RTREE:
-		return("MLOG_PAGE_CREATE_RTREE");
-
-	case MLOG_COMP_PAGE_CREATE_RTREE:
-		return("MLOG_COMP_PAGE_CREATE_RTREE");
-
-	case MLOG_INIT_FILE_PAGE2:
-		return("MLOG_INIT_FILE_PAGE2");
-
-	case MLOG_INDEX_LOAD:
-		return("MLOG_INDEX_LOAD");
-
-	case MLOG_TRUNCATE:
-		return("MLOG_TRUNCATE");
-
-	case MLOG_MEMSET:
-		return("MLOG_MEMSET");
-
-	case MLOG_INIT_FREE_PAGE:
-		return("MLOG_INIT_FREE_PAGE");
-
-	case MLOG_FILE_WRITE_CRYPT_DATA:
-		return("MLOG_FILE_WRITE_CRYPT_DATA");
-	}
-	DBUG_ASSERT(0);
-	return(NULL);
-}
-#endif /* !DBUG_OFF */
diff --git a/storage/innobase/log/log0sync.cc b/storage/innobase/log/log0sync.cc
new file mode 100644
index 00000000000..2a6e1b8b853
--- /dev/null
+++ b/storage/innobase/log/log0sync.cc
@@ -0,0 +1,309 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+The group commit synchronization used in log_write_up_to()
+works as follows.
+
+For simplicity, let's consider only the write operation; synchronization
+of the flush operation works the same way.
+
+Rules of the game
+
+A thread enters log_write_up_to() with the lsn of the current transaction.
+1. If the last written lsn is not less than the wait lsn (another thread
+   already wrote the log buffer), then there is no need to do anything.
+2. If no other thread is currently writing, write the log buffer,
+   and update the last written lsn.
+3. Otherwise, wait, and go to step 1.
+
+Synchronization can be done in different ways, e.g.
+
+a) Simple mutex locking the entire check and write operation.
+Disadvantage: threads that could continue after the last written
+lsn has been updated still wait.
+
+b) Spinlock, with periodic checks for last written lsn.
+Fixes a) but burns CPU unnecessarily.
+
+c) Mutex / condition variable combo.
+
+Condition variable notifies (broadcast) all waiters, whenever
+last written lsn is changed.
+
+Has a disadvantage of many spurious wakeups, stress on OS scheduler,
+and mutex contention.
+
+d) Something else.
+Make use of the waiter's lsn parameter, and only wake up the "right"
+waiting threads.
+
+We chose d). Even if the implementation is more complicated than the
+alternatives due to the need to maintain a list of waiters, it performs best.
+
+See group_commit_lock implementation for details.
+
+Note that if write operation is very fast, a) or b) can be fine as alternative.
+*/
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#endif
+
+#include <atomic>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <my_cpu.h>
+
+#include <log0types.h>
+#include "log0sync.h"
+#include <mysql/service_thd_wait.h>
+/**
+  Helper class, used in group commit lock.
+
+  Binary semaphore, or (same thing) an auto-reset event.
+  Has state (signalled or not), and provides 2 operations:
+  wait() and wake().
+
+  The implementation uses futex on Linux and WaitOnAddress on Windows,
+  or a mutex/condition variable combo elsewhere.
+*/
+
+class binary_semaphore
+{
+public:
+  /** Wait until semaphore becomes signalled, and atomically reset the state
+  to non-signalled. */
+  void wait();
+  /** Signal the semaphore; a thread blocked in wait() may then proceed. */
+  void wake();
+
+private:
+#if defined(__linux__) || defined (_WIN32)
+  std::atomic<int> m_signalled; /* 1 = signalled, 0 = not signalled */
+  static constexpr std::memory_order mem_order= std::memory_order_acq_rel;
+public:
+  binary_semaphore() :m_signalled(0) {}
+#else
+  std::mutex m_mtx{};
+  std::condition_variable m_cv{};
+  bool m_signalled = false; /* guarded by m_mtx */
+#endif
+};
+
+#if defined (__linux__) || defined (_WIN32) /* atomic-based variant */
+void binary_semaphore::wait()
+{ /* Loop until this thread consumes a signal (m_signalled 1 -> 0). */
+  for (;;)
+  {
+    if (m_signalled.exchange(0, mem_order) == 1)
+    {
+      break; /* was signalled; the exchange has already reset it */
+    }
+#ifdef _WIN32
+    int zero = 0; /* WaitOnAddress() returns at once if value != zero */
+    WaitOnAddress(&m_signalled, &zero, sizeof(m_signalled), INFINITE);
+#else /* Linux: FUTEX_WAIT sleeps only if m_signalled is still 0 */
+    syscall(SYS_futex, &m_signalled, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
+#endif
+  }
+}
+
+void binary_semaphore::wake()
+{
+  if (m_signalled.exchange(1, mem_order) == 0) /* skip if already signalled */
+  {
+#ifdef _WIN32
+    WakeByAddressSingle(&m_signalled);
+#else /* Linux: wake at most one thread sleeping in FUTEX_WAIT */
+    syscall(SYS_futex, &m_signalled, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
+#endif
+  }
+}
+#else /* portable fallback: mutex + condition variable */
+void binary_semaphore::wait()
+{
+  std::unique_lock<std::mutex> lk(m_mtx);
+  while (!m_signalled) /* loop guards against spurious wakeups */
+    m_cv.wait(lk);
+  m_signalled = false; /* consume the signal */
+}
+void binary_semaphore::wake()
+{
+  std::unique_lock<std::mutex> lk(m_mtx);
+  m_signalled = true;
+  m_cv.notify_one();
+}
+#endif
+
+/* Per-thread waiter node, used by group_commit_lock below. */
+struct group_commit_waiter_t
+{
+  lsn_t m_value; /* value this thread passed to acquire() */
+  binary_semaphore m_sema; /* signalled by release() to wake the thread */
+  group_commit_waiter_t* m_next; /* next node in waiters/wakeup list */
+  group_commit_waiter_t() :m_value(), m_sema(), m_next() {}
+};
+
+group_commit_lock::group_commit_lock() :
+  m_mtx(), m_value(0), m_pending_value(0), m_lock(false), m_waiters_list()
+{ /* initial state: unlocked, both values 0, no waiters */
+}
+
+group_commit_lock::value_type group_commit_lock::value() const
+{ /* returns the current value (relaxed atomic load) */
+  return m_value.load(std::memory_order::memory_order_relaxed);
+}
+
+group_commit_lock::value_type group_commit_lock::pending() const
+{ /* returns the pending value (relaxed atomic load) */
+  return m_pending_value.load(std::memory_order::memory_order_relaxed);
+}
+
+void group_commit_lock::set_pending(group_commit_lock::value_type num)
+{ /* publishes the pending value; acquire() uses it as a spin hint */
+  ut_a(num >= value()); /* pending must not be below current value */
+  m_pending_value.store(num, std::memory_order::memory_order_relaxed);
+}
+
+const unsigned int MAX_SPINS = 1; /** max spin iterations in acquire() */
+thread_local group_commit_waiter_t thread_local_waiter; /* one per thread */
+
+group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
+{ /* Returns EXPIRED once value() >= num, or ACQUIRED as the new leader */
+  unsigned int spins = MAX_SPINS;
+
+  for(;;)
+  {
+    if (num <= value())
+    {
+      /* Current value already reached num: no need to wait. */
+      return lock_return_code::EXPIRED;
+    }
+
+    if(spins-- == 0)
+      break;
+    if (num > pending())
+    {
+      /* Longer wait expected (longer than currently running operation),
+        don't spin. */
+      break;
+    }
+    ut_delay(1);
+  }
+
+  thread_local_waiter.m_value = num; /* compared against num in release() */
+  std::unique_lock<std::mutex> lk(m_mtx, std::defer_lock);
+  while (num > value())
+  {
+    lk.lock();
+
+    /* Re-read current value after acquiring the lock. */
+    if (num <= value())
+    {
+      return lock_return_code::EXPIRED; /* lk's destructor unlocks m_mtx */
+    }
+
+    if (!m_lock)
+    {
+      /* Take the lock, become group commit leader. */
+      m_lock = true;
+#ifndef DBUG_OFF
+      m_owner_id = std::this_thread::get_id();
+#endif
+      return lock_return_code::ACQUIRED; /* m_mtx unlocked, m_lock stays set */
+    }
+
+    /* Push this thread's waiter onto the front of the waiters list. */
+    thread_local_waiter.m_next = m_waiters_list;
+    m_waiters_list = &thread_local_waiter;
+    lk.unlock();
+
+    /* Sleep until woken in release(), then re-check the loop condition. */
+    thd_wait_begin(0,THD_WAIT_GROUP_COMMIT);
+    thread_local_waiter.m_sema.wait();
+    thd_wait_end(0);
+
+  }
+  return lock_return_code::EXPIRED; /* value() reached num while waiting */
+}
+
+void group_commit_lock::release(value_type num)
+{ /* Publish num as the new value, drop the lock, wake waiters */
+  std::unique_lock<std::mutex> lk(m_mtx);
+  m_lock = false;
+
+  /* Update current value; it must never decrease. */
+  ut_a(num >= value());
+  m_value.store(num, std::memory_order_relaxed);
+
+  /*
+    Wake waiters whose requested value <= the new current value.
+    Wake one more waiter, who can become the next group commit leader.
+  */
+  group_commit_waiter_t* cur, * prev, * next;
+  group_commit_waiter_t* wakeup_list = nullptr;
+  int extra_wake = 0; /* counts waiters seen with m_value > num */
+
+  for (prev= nullptr, cur= m_waiters_list; cur; cur= next)
+  {
+    next= cur->m_next;
+    if (cur->m_value <= num || extra_wake++ == 0)
+    {
+      /* Move current waiter to wakeup_list. */
+
+      if (!prev)
+      {
+        /* Remove from the start of the list. */
+        m_waiters_list = next;
+      }
+      else
+      {
+        /* Remove from the middle of the list. */
+        prev->m_next= cur->m_next;
+      }
+
+      /* Push entry onto the front of the wakeup list. */
+      cur->m_next = wakeup_list;
+      wakeup_list = cur;
+    }
+    else
+    {
+      prev= cur;
+    }
+  }
+  lk.unlock(); /* wake the threads without holding m_mtx */
+
+  for (cur= wakeup_list; cur; cur= next)
+  {
+    next= cur->m_next; /* read first: a woken thread may relink cur */
+    cur->m_sema.wake();
+  }
+}
+
+#ifndef DBUG_OFF
+bool group_commit_lock::is_owner()
+{ /* true if the lock is held and was taken by the calling thread */
+  return m_lock && std::this_thread::get_id() == m_owner_id;
+}
+#endif /* !DBUG_OFF */
+
diff --git a/storage/innobase/log/log0sync.h b/storage/innobase/log/log0sync.h
new file mode 100644
index 00000000000..40afbf74ecd
--- /dev/null
+++ b/storage/innobase/log/log0sync.h
@@ -0,0 +1,81 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include <atomic>
+#include <thread>
+#include <log0types.h>
+
+struct group_commit_waiter_t;
+
+/**
+Special synchronization primitive, which is helpful for
+performing group commit.
+
+It has a state consisting of
+ - locked (bool)
+ - current value (number). This value is always increasing.
+ - pending value (number). current value can soon become this number
+   This is only used for optimization, does not have to be exact
+
+Operations supported on this semaphore
+
+1.acquire(num):
+- waits until current value reaches or exceeds num, or until lock is granted.
+
+- returns EXPIRED if current_value >= num,
+  or ACQUIRED, if current_value < num and lock is granted.
+
+2.release(num)
+- releases lock
+- sets new current value to num (num must be >= current value)
+- releases some threads waiting in acquire()
+
+3. value()
+- read current value
+
+4. pending()
+- read pending value
+
+5. set_pending(num) - set pending value
+*/
+class group_commit_lock
+{
+  using value_type = lsn_t;
+#ifndef DBUG_OFF
+  std::thread::id m_owner_id{};
+#endif
+  std::mutex m_mtx;
+  std::atomic<value_type> m_value;
+  std::atomic<value_type> m_pending_value;
+  bool m_lock;
+  group_commit_waiter_t* m_waiters_list;
+public:
+  group_commit_lock();
+  enum lock_return_code
+  {
+    ACQUIRED,
+    EXPIRED
+  };
+  lock_return_code acquire(value_type num);
+  void release(value_type num);
+  value_type value() const;
+  value_type pending() const;
+  void set_pending(value_type num);
+#ifndef DBUG_OFF
+  bool is_owner();
+#endif
+};
diff --git a/storage/innobase/mach/mach0data.cc b/storage/innobase/mach/mach0data.cc
deleted file mode 100644
index 85533908d16..00000000000
--- a/storage/innobase/mach/mach0data.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/******************************************************************//**
-@file mach/mach0data.cc
-Utilities for converting data from the database file
-to the machine format.
-
-Created 11/28/1995 Heikki Tuuri
-***********************************************************************/
-
-#include "mach0data.h"
-
-/** Read a 32-bit integer in a compressed form.
-@param[in,out]	ptr	pointer to memory where to read;
-advanced by the number of bytes consumed, or set NULL if out of space
-@param[in]	end_ptr	end of the buffer
-@return unsigned value */
-ib_uint32_t
-mach_parse_compressed(
-	const byte**	ptr,
-	const byte*	end_ptr)
-{
-	ulint	val;
-
-	if (*ptr >= end_ptr) {
-		*ptr = NULL;
-		return(0);
-	}
-
-	val = mach_read_from_1(*ptr);
-
-	if (val < 0x80) {
-		/* 0nnnnnnn (7 bits) */
-		++*ptr;
-		return(static_cast<ib_uint32_t>(val));
-	}
-
-	/* Workaround GCC bug
-	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77673:
-	the compiler moves mach_read_from_4 right to the beginning of the
-	function, causing and out-of-bounds read if we are reading a short
-	integer close to the end of buffer. */
-#if defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__clang__)
-#define DEPLOY_FENCE
-#endif
-
-#ifdef DEPLOY_FENCE
-	__atomic_thread_fence(__ATOMIC_ACQUIRE);
-#endif
-
-	if (val < 0xC0) {
-		/* 10nnnnnn nnnnnnnn (14 bits) */
-		if (end_ptr >= *ptr + 2) {
-			val = mach_read_from_2(*ptr) & 0x3FFF;
-			ut_ad(val > 0x7F);
-			*ptr += 2;
-			return(static_cast<ib_uint32_t>(val));
-		}
-		*ptr = NULL;
-		return(0);
-	}
-
-#ifdef DEPLOY_FENCE
-	__atomic_thread_fence(__ATOMIC_ACQUIRE);
-#endif
-
-	if (val < 0xE0) {
-		/* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
-		if (end_ptr >= *ptr + 3) {
-			val = mach_read_from_3(*ptr) & 0x1FFFFF;
-			ut_ad(val > 0x3FFF);
-			*ptr += 3;
-			return(static_cast<ib_uint32_t>(val));
-		}
-		*ptr = NULL;
-		return(0);
-	}
-
-#ifdef DEPLOY_FENCE
-	__atomic_thread_fence(__ATOMIC_ACQUIRE);
-#endif
-
-	if (val < 0xF0) {
-		/* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
-		if (end_ptr >= *ptr + 4) {
-			val = mach_read_from_4(*ptr) & 0xFFFFFFF;
-			ut_ad(val > 0x1FFFFF);
-			*ptr += 4;
-			return(static_cast<ib_uint32_t>(val));
-		}
-		*ptr = NULL;
-		return(0);
-	}
-
-#ifdef DEPLOY_FENCE
-	__atomic_thread_fence(__ATOMIC_ACQUIRE);
-#endif
-
-#undef DEPLOY_FENCE
-
-	ut_ad(val == 0xF0);
-
-	/* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
-	if (end_ptr >= *ptr + 5) {
-		val = mach_read_from_4(*ptr + 1);
-		ut_ad(val > 0xFFFFFFF);
-		*ptr += 5;
-		return(static_cast<ib_uint32_t>(val));
-	}
-
-	*ptr = NULL;
-	return(0);
-}
diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc
index 03ab4a89f77..6d4593e0ab4 100644
--- a/storage/innobase/mem/mem0mem.cc
+++ b/storage/innobase/mem/mem0mem.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -291,7 +291,7 @@ mem_heap_create_block_func(
 				return(NULL);
 			}
 		} else {
-			buf_block = buf_block_alloc(NULL);
+			buf_block = buf_block_alloc();
 		}
 
 		block = (mem_block_t*) buf_block->frame;
diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc
deleted file mode 100644
index da7088dd7d9..00000000000
--- a/storage/innobase/mtr/mtr0log.cc
+++ /dev/null
@@ -1,729 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file mtr/mtr0log.cc
-Mini-transaction log routines
-
-Created 12/7/1995 Heikki Tuuri
-*******************************************************/
-
-#include "mtr0log.h"
-#include "buf0buf.h"
-#include "dict0dict.h"
-#include "log0recv.h"
-#include "page0page.h"
-#include "buf0dblwr.h"
-#include "dict0boot.h"
-
-/********************************************************//**
-Catenates n bytes to the mtr log. */
-void
-mlog_catenate_string(
-/*=================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	const byte*	str,	/*!< in: string to write */
-	ulint		len)	/*!< in: string length */
-{
-	if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
-
-		return;
-	}
-
-	mtr->get_log()->push(str, ib_uint32_t(len));
-}
-
-/********************************************************//**
-Writes the initial part of a log record consisting of one-byte item
-type and four-byte space and page numbers. Also pushes info
-to the mtr memo that a buffer page has been modified. */
-void
-mlog_write_initial_log_record(
-/*==========================*/
-	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
-				frame holding the file page where
-				modification is made */
-	mlog_id_t	type,	/*!< in: log item type: MLOG_1BYTE, ... */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
-{
-	byte*	log_ptr;
-
-	ut_ad(type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(type));
-	ut_ad(type > MLOG_8BYTES);
-
-	log_ptr = mlog_open(mtr, 11);
-
-	/* If no logging is requested, we may return now */
-	if (log_ptr == NULL) {
-
-		return;
-	}
-
-	log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr);
-
-	mlog_close(mtr, log_ptr);
-}
-
-/********************************************************//**
-Parses an initial log record written by mlog_write_initial_log_record.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_initial_log_record(
-/*==========================*/
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	mlog_id_t*	type,	/*!< out: log record type: MLOG_1BYTE, ... */
-	ulint*		space,	/*!< out: space id */
-	ulint*		page_no)/*!< out: page number */
-{
-	if (end_ptr < ptr + 1) {
-
-		return(NULL);
-	}
-
-	*type = mlog_id_t(*ptr & ~MLOG_SINGLE_REC_FLAG);
-	if (UNIV_UNLIKELY(*type > MLOG_BIGGEST_TYPE
-			  && !EXTRA_CHECK_MLOG_NUMBER(*type))) {
-		recv_sys.found_corrupt_log = true;
-		return NULL;
-	}
-
-	ptr++;
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	*space = mach_parse_compressed(&ptr, end_ptr);
-
-	if (ptr != NULL) {
-		*page_no = mach_parse_compressed(&ptr, end_ptr);
-	}
-
-	return(const_cast<byte*>(ptr));
-}
-
-/********************************************************//**
-Parses a log record written by mlog_write_ulint, mlog_write_ull, mlog_memset.
-@return parsed record end, NULL if not a complete record or a corrupt record */
-byte*
-mlog_parse_nbytes(
-/*==============*/
-	mlog_id_t	type,	/*!< in: log record type: MLOG_1BYTE, ... */
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	byte*		page,	/*!< in: page where to apply the log
-				record, or NULL */
-	void*		page_zip)/*!< in/out: compressed page, or NULL */
-{
-	ulint		offset;
-	ulint		val;
-	ib_uint64_t	dval;
-
-	ut_ad(type <= MLOG_8BYTES || type == MLOG_MEMSET);
-	ut_a(!page || !page_zip
-	     || type == MLOG_MEMSET
-	     || !fil_page_index_page_check(page));
-	if (end_ptr < ptr + 2) {
-		return NULL;
-	}
-
-	offset = mach_read_from_2(ptr);
-	ptr += 2;
-
-	if (UNIV_UNLIKELY(offset >= srv_page_size)) {
-		goto corrupt;
-	}
-
-	switch (type) {
-	case MLOG_MEMSET:
-		if (end_ptr < ptr + 3) {
-			return NULL;
-		}
-		val = mach_read_from_2(ptr);
-		ptr += 2;
-		if (UNIV_UNLIKELY(offset + val > srv_page_size)) {
-			goto corrupt;
-		}
-		if (page) {
-			memset(page + offset, *ptr, val);
-			if (page_zip) {
-				ut_ad(offset + val <= PAGE_DATA
-				      || !fil_page_index_page_check(page));
-				memset(static_cast<page_zip_des_t*>(page_zip)
-				       ->data + offset, *ptr, val);
-			}
-		}
-		return const_cast<byte*>(++ptr);
-	case MLOG_8BYTES:
-		dval = mach_u64_parse_compressed(&ptr, end_ptr);
-
-		if (ptr == NULL) {
-			return NULL;
-		}
-
-		if (page) {
-			if (page_zip) {
-				mach_write_to_8
-					(((page_zip_des_t*) page_zip)->data
-					 + offset, dval);
-			}
-			mach_write_to_8(page + offset, dval);
-		}
-
-		return const_cast<byte*>(ptr);
-	default:
-		val = mach_parse_compressed(&ptr, end_ptr);
-	}
-
-	if (ptr == NULL) {
-		return NULL;
-	}
-
-	switch (type) {
-	case MLOG_1BYTE:
-		if (val > 0xFFUL) {
-			goto corrupt;
-		}
-		if (page) {
-			if (page_zip) {
-				mach_write_to_1
-					(((page_zip_des_t*) page_zip)->data
-					 + offset, val);
-			}
-			mach_write_to_1(page + offset, val);
-		}
-		break;
-	case MLOG_2BYTES:
-		if (val > 0xFFFFUL) {
-			goto corrupt;
-		}
-		if (page) {
-			if (page_zip) {
-				mach_write_to_2
-					(((page_zip_des_t*) page_zip)->data
-					 + offset, val);
-			}
-			mach_write_to_2(page + offset, val);
-		}
-
-		break;
-	case MLOG_4BYTES:
-		if (page) {
-			if (page_zip) {
-				mach_write_to_4
-					(((page_zip_des_t*) page_zip)->data
-					 + offset, val);
-			}
-			mach_write_to_4(page + offset, val);
-		}
-		break;
-	default:
-	corrupt:
-		recv_sys.found_corrupt_log = true;
-		ptr = NULL;
-	}
-
-	return const_cast<byte*>(ptr);
-}
-
-/********************************************************//**
-Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log
-record to the mini-transaction log if mtr is not NULL. */
-void
-mlog_write_ulint(
-/*=============*/
-	byte*		ptr,	/*!< in: pointer where to write */
-	ulint		val,	/*!< in: value to write */
-	mlog_id_t	type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
-{
-	switch (type) {
-	case MLOG_1BYTE:
-		mach_write_to_1(ptr, val);
-		break;
-	case MLOG_2BYTES:
-		mach_write_to_2(ptr, val);
-		break;
-	case MLOG_4BYTES:
-		mach_write_to_4(ptr, val);
-		break;
-	default:
-		ut_error;
-	}
-
-	if (mtr != 0) {
-		byte*	log_ptr = mlog_open(mtr, 11 + 2 + 5);
-
-		/* If no logging is requested, we may return now */
-
-		if (log_ptr != 0) {
-
-			log_ptr = mlog_write_initial_log_record_fast(
-				ptr, type, log_ptr, mtr);
-
-			mach_write_to_2(log_ptr, page_offset(ptr));
-			log_ptr += 2;
-
-			log_ptr += mach_write_compressed(log_ptr, val);
-
-			mlog_close(mtr, log_ptr);
-		}
-	}
-}
-
-/********************************************************//**
-Writes 8 bytes to a file page. Writes the corresponding log
-record to the mini-transaction log, only if mtr is not NULL */
-void
-mlog_write_ull(
-/*===========*/
-	byte*		ptr,	/*!< in: pointer where to write */
-	ib_uint64_t	val,	/*!< in: value to write */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
-{
-	mach_write_to_8(ptr, val);
-
-	if (mtr != 0) {
-		byte*	log_ptr = mlog_open(mtr, 11 + 2 + 9);
-
-		/* If no logging is requested, we may return now */
-		if (log_ptr != 0) {
-
-			log_ptr = mlog_write_initial_log_record_fast(
-				ptr, MLOG_8BYTES, log_ptr, mtr);
-
-			mach_write_to_2(log_ptr, page_offset(ptr));
-			log_ptr += 2;
-
-			log_ptr += mach_u64_write_compressed(log_ptr, val);
-
-			mlog_close(mtr, log_ptr);
-		}
-	}
-}
-
-/********************************************************//**
-Writes a string to a file page buffered in the buffer pool. Writes the
-corresponding log record to the mini-transaction log. */
-void
-mlog_write_string(
-/*==============*/
-	byte*		ptr,	/*!< in: pointer where to write */
-	const byte*	str,	/*!< in: string to write */
-	ulint		len,	/*!< in: string length */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
-{
-	ut_ad(ptr && mtr);
-	ut_a(len < srv_page_size);
-
-	memcpy(ptr, str, len);
-
-	mlog_log_string(ptr, len, mtr);
-}
-
-/********************************************************//**
-Logs a write of a string to a file page buffered in the buffer pool.
-Writes the corresponding log record to the mini-transaction log. */
-void
-mlog_log_string(
-/*============*/
-	byte*	ptr,	/*!< in: pointer written to */
-	ulint	len,	/*!< in: string length */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
-{
-	byte*	log_ptr;
-
-	ut_ad(ptr && mtr);
-	ut_ad(len <= srv_page_size);
-
-	log_ptr = mlog_open(mtr, 30);
-
-	/* If no logging is requested, we may return now */
-	if (log_ptr == NULL) {
-
-		return;
-	}
-
-	log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_WRITE_STRING,
-						     log_ptr, mtr);
-	mach_write_to_2(log_ptr, page_offset(ptr));
-	log_ptr += 2;
-
-	mach_write_to_2(log_ptr, len);
-	log_ptr += 2;
-
-	mlog_close(mtr, log_ptr);
-
-	mlog_catenate_string(mtr, ptr, len);
-}
-
-/********************************************************//**
-Parses a log record written by mlog_write_string.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_string(
-/*==============*/
-	byte*	ptr,	/*!< in: buffer */
-	byte*	end_ptr,/*!< in: buffer end */
-	byte*	page,	/*!< in: page where to apply the log record, or NULL */
-	void*	page_zip)/*!< in/out: compressed page, or NULL */
-{
-	ulint	offset;
-	ulint	len;
-
-	ut_a(!page || !page_zip
-	     || (fil_page_get_type(page) != FIL_PAGE_INDEX
-		 && fil_page_get_type(page) != FIL_PAGE_RTREE));
-
-	if (end_ptr < ptr + 4) {
-
-		return(NULL);
-	}
-
-	offset = mach_read_from_2(ptr);
-	ptr += 2;
-	len = mach_read_from_2(ptr);
-	ptr += 2;
-
-	if (offset >= srv_page_size || len + offset > srv_page_size) {
-		recv_sys.found_corrupt_log = TRUE;
-
-		return(NULL);
-	}
-
-	if (end_ptr < ptr + len) {
-
-		return(NULL);
-	}
-
-	if (page) {
-		if (page_zip) {
-			memcpy(((page_zip_des_t*) page_zip)->data
-				+ offset, ptr, len);
-		}
-		memcpy(page + offset, ptr, len);
-	}
-
-	return(ptr + len);
-}
-
-/** Initialize a string of bytes.
-@param[in,out]	b	buffer page
-@param[in]	ofs	byte offset from block->frame
-@param[in]	len	length of the data to write
-@param[in]	val	the data byte to write
-@param[in,out]	mtr	mini-transaction */
-void
-mlog_memset(buf_block_t* b, ulint ofs, ulint len, byte val, mtr_t* mtr)
-{
-	ut_ad(len);
-	ut_ad(ofs <= ulint(srv_page_size));
-	ut_ad(ofs + len <= ulint(srv_page_size));
-	memset(ofs + b->frame, val, len);
-
-	mtr->set_modified();
-	switch (mtr->get_log_mode()) {
-	case MTR_LOG_NONE:
-	case MTR_LOG_NO_REDO:
-		return;
-	case MTR_LOG_SHORT_INSERTS:
-		ut_ad(0);
-		/* fall through */
-	case MTR_LOG_ALL:
-		break;
-	}
-
-	byte* l = mtr->get_log()->open(11 + 2 + 2 + 1);
-	l = mlog_write_initial_log_record_low(
-		MLOG_MEMSET, b->page.id.space(), b->page.id.page_no(), l, mtr);
-	mach_write_to_2(l, ofs);
-	mach_write_to_2(l + 2, len);
-	l[4] = val;
-	mlog_close(mtr, l + 5);
-}
-
-/** Initialize a string of bytes.
-@param[in,out]	byte	byte address
-@param[in]	len	length of the data to write
-@param[in]	val	the data byte to write
-@param[in,out]	mtr	mini-transaction */
-void mlog_memset(byte* b, ulint len, byte val, mtr_t* mtr)
-{
-	ut_ad(len);
-	ut_ad(page_offset(b) + len <= ulint(srv_page_size));
-	memset(b, val, len);
-
-	mtr->set_modified();
-	switch (mtr->get_log_mode()) {
-	case MTR_LOG_NONE:
-	case MTR_LOG_NO_REDO:
-		return;
-	case MTR_LOG_SHORT_INSERTS:
-		ut_ad(0);
-		/* fall through */
-	case MTR_LOG_ALL:
-		break;
-	}
-
-	byte* l = mtr->get_log()->open(11 + 2 + 2 + 1);
-	l = mlog_write_initial_log_record_fast(b, MLOG_MEMSET, l, mtr);
-	mach_write_to_2(l, page_offset(b));
-	mach_write_to_2(l + 2, len);
-	l[4] = val;
-	mlog_close(mtr, l + 5);
-}
-
-/********************************************************//**
-Opens a buffer for mlog, writes the initial log record and,
-if needed, the field lengths of an index.
-@return buffer, NULL if log mode MTR_LOG_NONE */
-byte*
-mlog_open_and_write_index(
-/*======================*/
-	mtr_t*			mtr,	/*!< in: mtr */
-	const byte*		rec,	/*!< in: index record or page */
-	const dict_index_t*	index,	/*!< in: record descriptor */
-	mlog_id_t		type,	/*!< in: log item type */
-	ulint			size)	/*!< in: requested buffer size in bytes
-					(if 0, calls mlog_close() and
-					returns NULL) */
-{
-	byte*		log_ptr;
-	const byte*	log_start;
-	const byte*	log_end;
-
-	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
-
-	mtr->set_modified();
-	switch (mtr->get_log_mode()) {
-	case MTR_LOG_NONE:
-	case MTR_LOG_NO_REDO:
-		return NULL;
-	case MTR_LOG_SHORT_INSERTS:
-		ut_ad(0);
-		/* fall through */
-	case MTR_LOG_ALL:
-		break;
-	}
-
-	if (!page_rec_is_comp(rec)) {
-		log_start = log_ptr = mtr->get_log()->open(11 + size);
-		log_ptr = mlog_write_initial_log_record_fast(rec, type,
-							     log_ptr, mtr);
-		log_end = log_ptr + 11 + size;
-	} else {
-		ulint	i;
-		bool	is_instant = index->is_instant();
-		ulint	n	= dict_index_get_n_fields(index);
-		ulint	total	= 11 + (is_instant ? 2 : 0) + size + (n + 2) * 2;
-		ulint	alloc	= std::min(total,
-					   ulint(mtr_buf_t::MAX_DATA_SIZE));
-
-		const bool is_leaf = page_is_leaf(page_align(rec));
-
-		/* For spatial index, on non-leaf page, we just keep
-		2 fields, MBR and page no. */
-		if (!is_leaf && dict_index_is_spatial(index)) {
-			n = DICT_INDEX_SPATIAL_NODEPTR_SIZE;
-		}
-
-		log_start = log_ptr = mtr->get_log()->open(alloc);
-		log_end = log_ptr + alloc;
-
-		log_ptr = mlog_write_initial_log_record_fast(
-			rec, type, log_ptr, mtr);
-
-		if (is_instant) {
-			// marked as instant index
-			mach_write_to_2(log_ptr, n | 0x8000);
-
-			log_ptr += 2;
-
-			// record the n_core_fields
-			mach_write_to_2(log_ptr, index->n_core_fields);
-		} else {
-			mach_write_to_2(log_ptr, n);
-		}
-
-		log_ptr += 2;
-		mach_write_to_2(
-			log_ptr, is_leaf
-			? dict_index_get_n_unique_in_tree(index)
-			: dict_index_get_n_unique_in_tree_nonleaf(index));
-		log_ptr += 2;
-
-		for (i = 0; i < n; i++) {
-			dict_field_t*		field;
-			const dict_col_t*	col;
-			ulint			len;
-
-			field = dict_index_get_nth_field(index, i);
-			col = dict_field_get_col(field);
-			len = field->fixed_len;
-			ut_ad(len < 0x7fff);
-			if (len == 0
-			    && (DATA_BIG_COL(col))) {
-				/* variable-length field
-				with maximum length > 255 */
-				len = 0x7fff;
-			}
-			if (col->prtype & DATA_NOT_NULL) {
-				len |= 0x8000;
-			}
-			if (log_ptr + 2 > log_end) {
-				mlog_close(mtr, log_ptr);
-				ut_a(total > ulint(log_ptr - log_start));
-				total -= ulint(log_ptr - log_start);
-				alloc = std::min(
-					total,
-					ulint(mtr_buf_t::MAX_DATA_SIZE));
-
-				log_start = log_ptr = mtr->get_log()->open(
-					alloc);
-				log_end = log_ptr + alloc;
-			}
-			mach_write_to_2(log_ptr, len);
-			log_ptr += 2;
-		}
-	}
-	if (size == 0) {
-		mlog_close(mtr, log_ptr);
-		log_ptr = NULL;
-	} else if (log_ptr + size > log_end) {
-		mlog_close(mtr, log_ptr);
-		log_ptr = mlog_open(mtr, size);
-	}
-	return(log_ptr);
-}
-
-/********************************************************//**
-Parses a log record written by mlog_open_and_write_index.
-@return parsed record end, NULL if not a complete record */
-byte*
-mlog_parse_index(
-/*=============*/
-	byte*		ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	ibool		comp,	/*!< in: TRUE=compact row format */
-	dict_index_t**	index)	/*!< out, own: dummy index */
-{
-	ulint		i, n, n_uniq;
-	dict_table_t*	table;
-	dict_index_t*	ind;
-	ulint		n_core_fields = 0;
-
-	ut_ad(comp == FALSE || comp == TRUE);
-
-	if (comp) {
-		if (end_ptr < ptr + 4) {
-			return(NULL);
-		}
-		n = mach_read_from_2(ptr);
-		ptr += 2;
-		if (n & 0x8000) { /* record after instant ADD COLUMN */
-			n &= 0x7FFF;
-
-			n_core_fields = mach_read_from_2(ptr);
-
-			if (!n_core_fields || n_core_fields > n) {
-				recv_sys.found_corrupt_log = TRUE;
-				return(NULL);
-			}
-
-			ptr += 2;
-
-			if (end_ptr < ptr + 2) {
-				return(NULL);
-			}
-		}
-
-		n_uniq = mach_read_from_2(ptr);
-		ptr += 2;
-		ut_ad(n_uniq <= n);
-		if (end_ptr < ptr + n * 2) {
-			return(NULL);
-		}
-	} else {
-		n = n_uniq = 1;
-	}
-	table = dict_mem_table_create("LOG_DUMMY", NULL, n, 0,
-				      comp ? DICT_TF_COMPACT : 0, 0);
-	ind = dict_mem_index_create(table, "LOG_DUMMY", 0, n);
-	ind->n_uniq = (unsigned int) n_uniq;
-	if (n_uniq != n) {
-		ut_a(n_uniq + DATA_ROLL_PTR <= n);
-		ind->type = DICT_CLUSTERED;
-	}
-	if (comp) {
-		for (i = 0; i < n; i++) {
-			ulint	len = mach_read_from_2(ptr);
-			ptr += 2;
-			/* The high-order bit of len is the NOT NULL flag;
-			the rest is 0 or 0x7fff for variable-length fields,
-			and 1..0x7ffe for fixed-length fields. */
-			dict_mem_table_add_col(
-				table, NULL, NULL,
-				((len + 1) & 0x7fff) <= 1
-				? DATA_BINARY : DATA_FIXBINARY,
-				len & 0x8000 ? DATA_NOT_NULL : 0,
-				len & 0x7fff);
-
-			dict_index_add_col(ind, table,
-					   dict_table_get_nth_col(table, i),
-					   0);
-		}
-		dict_table_add_system_columns(table, table->heap);
-		if (n_uniq != n) {
-			/* Identify DB_TRX_ID and DB_ROLL_PTR in the index. */
-			ut_a(DATA_TRX_ID_LEN
-			     == dict_index_get_nth_col(ind, DATA_TRX_ID - 1
-						       + n_uniq)->len);
-			ut_a(DATA_ROLL_PTR_LEN
-			     == dict_index_get_nth_col(ind, DATA_ROLL_PTR - 1
-						       + n_uniq)->len);
-			ind->fields[DATA_TRX_ID - 1 + n_uniq].col
-				= &table->cols[n + DATA_TRX_ID];
-			ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col
-				= &table->cols[n + DATA_ROLL_PTR];
-		}
-
-		ut_ad(table->n_cols == table->n_def);
-
-		if (n_core_fields) {
-			for (i = n_core_fields; i < n; i++) {
-				ind->fields[i].col->def_val.len
-					= UNIV_SQL_NULL;
-			}
-			ind->n_core_fields = n_core_fields;
-			ind->n_core_null_bytes = UT_BITS_IN_BYTES(
-				ind->get_n_nullable(n_core_fields));
-		} else {
-			ind->n_core_null_bytes = UT_BITS_IN_BYTES(
-				unsigned(ind->n_nullable));
-			ind->n_core_fields = ind->n_fields;
-		}
-	}
-	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
-	ind->cached = TRUE;
-	ut_d(ind->is_dummy = true);
-	*index = ind;
-	return(ptr);
-}
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index b6e6055bbc4..2daada16a91 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -126,7 +126,7 @@ struct Find {
 	mtr_memo_slot_t*m_slot;
 
 	/** Type of the object to look for */
-	ulint		m_type;
+	const ulint	m_type;
 
 	/** The object instance to look for */
 	const void*	m_object;
@@ -205,13 +205,6 @@ private:
 static void memo_slot_release(mtr_memo_slot_t *slot)
 {
   switch (slot->type) {
-#ifdef UNIV_DEBUG
-  default:
-    ut_ad(!"invalid type");
-    break;
-  case MTR_MEMO_MODIFY:
-    break;
-#endif /* UNIV_DEBUG */
   case MTR_MEMO_S_LOCK:
     rw_lock_s_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
     break;
@@ -221,23 +214,32 @@ static void memo_slot_release(mtr_memo_slot_t *slot)
   case MTR_MEMO_SPACE_X_LOCK:
     {
       fil_space_t *space= static_cast<fil_space_t*>(slot->object);
-      space->committed_size= space->size;
+      space->set_committed_size();
       rw_lock_x_unlock(&space->latch);
     }
     break;
   case MTR_MEMO_X_LOCK:
     rw_lock_x_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
     break;
-  case MTR_MEMO_BUF_FIX:
-  case MTR_MEMO_PAGE_S_FIX:
-  case MTR_MEMO_PAGE_SX_FIX:
-  case MTR_MEMO_PAGE_X_FIX:
+  default:
+#ifdef UNIV_DEBUG
+    switch (slot->type & ~MTR_MEMO_MODIFY) {
+    case MTR_MEMO_BUF_FIX:
+    case MTR_MEMO_PAGE_S_FIX:
+    case MTR_MEMO_PAGE_SX_FIX:
+    case MTR_MEMO_PAGE_X_FIX:
+      break;
+    default:
+      ut_ad("invalid type" == 0);
+      break;
+    }
+#endif /* UNIV_DEBUG */
     buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object);
-    buf_page_release_latch(block, slot->type);
+    buf_page_release_latch(block, slot->type & ~MTR_MEMO_MODIFY);
     block->unfix();
     break;
   }
-  slot->object= NULL;
+  slot->object= nullptr;
 }
 
 /** Release the latches acquired by the mini-transaction. */
@@ -248,20 +250,13 @@ struct ReleaseLatches {
     if (!slot->object)
       return true;
     switch (slot->type) {
-#ifdef UNIV_DEBUG
-    default:
-      ut_ad(!"invalid type");
-      break;
-    case MTR_MEMO_MODIFY:
-      break;
-#endif /* UNIV_DEBUG */
     case MTR_MEMO_S_LOCK:
       rw_lock_s_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
       break;
     case MTR_MEMO_SPACE_X_LOCK:
       {
         fil_space_t *space= static_cast<fil_space_t*>(slot->object);
-        space->committed_size= space->size;
+        space->set_committed_size();
         rw_lock_x_unlock(&space->latch);
       }
       break;
@@ -271,12 +266,21 @@ struct ReleaseLatches {
     case MTR_MEMO_SX_LOCK:
       rw_lock_sx_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
       break;
-    case MTR_MEMO_BUF_FIX:
-    case MTR_MEMO_PAGE_S_FIX:
-    case MTR_MEMO_PAGE_SX_FIX:
-    case MTR_MEMO_PAGE_X_FIX:
+    default:
+#ifdef UNIV_DEBUG
+      switch (slot->type & ~MTR_MEMO_MODIFY) {
+      case MTR_MEMO_BUF_FIX:
+      case MTR_MEMO_PAGE_S_FIX:
+      case MTR_MEMO_PAGE_SX_FIX:
+      case MTR_MEMO_PAGE_X_FIX:
+        break;
+      default:
+        ut_ad("invalid type" == 0);
+        break;
+      }
+#endif /* UNIV_DEBUG */
       buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object);
-      buf_page_release_latch(block, slot->type);
+      buf_page_release_latch(block, slot->type & ~MTR_MEMO_MODIFY);
       block->unfix();
       break;
     }
@@ -296,6 +300,50 @@ struct ReleaseAll {
   }
 };
 
+/** Stops iteration is savepoint is reached */
+template <typename Functor> struct TillSavepoint
+{
+
+  /** Constructor
+  @param[in] functor functor which is called if savepoint is not reached
+  @param[in] savepoint savepoint value to rollback
+  @param[in] used current position in slots container */
+  TillSavepoint(const Functor &functor, ulint savepoint, ulint used)
+      : functor(functor),
+        m_slots_count((used - savepoint) / sizeof(mtr_memo_slot_t))
+  {
+    ut_ad(savepoint);
+    ut_ad(used >= savepoint);
+  }
+
+  /** @return true if savepoint is not reached, false otherwise */
+  bool operator()(mtr_memo_slot_t *slot)
+  {
+#ifdef UNIV_DEBUG
+    /** This check is added because the code is invoked only from
+    row_search_mvcc() to release latches acquired during clustered index search
+    for secondary index record. To make it more universal we could add one more
+    member in this functor for debug build to pass only certain slot types,
+    but this is currently not necessary. */
+    switch (slot->type)
+    {
+    case MTR_MEMO_S_LOCK:
+    case MTR_MEMO_PAGE_S_FIX:
+      break;
+    default:
+      ut_a(false);
+    }
+#endif
+    return m_slots_count-- && functor(slot);
+  }
+
+private:
+  /** functor to invoke */
+  const Functor &functor;
+  /** slots count left till savepoint */
+  ulint m_slots_count;
+};
+
 #ifdef UNIV_DEBUG
 /** Check that all slots have been handled. */
 struct DebugCheck {
@@ -308,91 +356,58 @@ struct DebugCheck {
 };
 #endif
 
-/** Release a resource acquired by the mini-transaction. */
-struct ReleaseBlocks {
-	/** Release specific object */
-	ReleaseBlocks(lsn_t start_lsn, lsn_t end_lsn, FlushObserver* observer)
-		:
-		m_end_lsn(end_lsn),
-		m_start_lsn(start_lsn),
-		m_flush_observer(observer)
-	{
-		/* Do nothing */
-	}
-
-	/** Add the modified page to the buffer flush list. */
-	void add_dirty_page_to_flush_list(mtr_memo_slot_t* slot) const
-	{
-		ut_ad(m_end_lsn > 0);
-		ut_ad(m_start_lsn > 0);
-
-		buf_block_t*	block;
-
-		block = reinterpret_cast<buf_block_t*>(slot->object);
-
-		buf_flush_note_modification(block, m_start_lsn,
-					    m_end_lsn, m_flush_observer);
-	}
-
-	/** @return true always. */
-	bool operator()(mtr_memo_slot_t* slot) const
-	{
-		if (slot->object != NULL) {
-
-			if (slot->type == MTR_MEMO_PAGE_X_FIX
-			    || slot->type == MTR_MEMO_PAGE_SX_FIX) {
-
-				add_dirty_page_to_flush_list(slot);
-			}
-		}
-
-		return(true);
-	}
-
-	/** Mini-transaction REDO start LSN */
-	lsn_t		m_end_lsn;
+/** Note the modification of pages latched by the mini-transaction. */
+struct ReleaseBlocks
+{
+  const lsn_t start, end;
+#ifdef UNIV_DEBUG
+  const mtr_buf_t &memo;
 
-	/** Mini-transaction REDO end LSN */
-	lsn_t		m_start_lsn;
+  ReleaseBlocks(lsn_t start, lsn_t end, const mtr_buf_t &memo) :
+    start(start), end(end), memo(memo)
+#else /* UNIV_DEBUG */
+  ReleaseBlocks(lsn_t start, lsn_t end, const mtr_buf_t&) :
+    start(start), end(end)
+#endif /* UNIV_DEBUG */
+  {
+    ut_ad(start);
+    ut_ad(end);
+  }
 
-	/** Flush observer */
-	FlushObserver*	m_flush_observer;
-};
+  /** @return true always */
+  bool operator()(mtr_memo_slot_t* slot) const
+  {
+    if (!slot->object)
+      return true;
+    switch (slot->type) {
+    case MTR_MEMO_PAGE_X_MODIFY:
+    case MTR_MEMO_PAGE_SX_MODIFY:
+      break;
+    default:
+      ut_ad(!(slot->type & MTR_MEMO_MODIFY));
+      return true;
+    }
 
-/** Write the block contents to the REDO log */
-struct mtr_write_log_t {
-	/** Append a block to the redo log buffer.
-	@return whether the appending should continue */
-	bool operator()(const mtr_buf_t::block_t* block) const
-	{
-		log_write_low(block->begin(), block->used());
-		return(true);
-	}
+    buf_flush_note_modification(static_cast<buf_block_t*>(slot->object),
+                                start, end);
+    return true;
+  }
 };
 
-/** Append records to the system-wide redo log buffer.
-@param[in]	log	redo log records */
-void
-mtr_write_log(
-	const mtr_buf_t*	log)
-{
-	const ulint	len = log->size();
-	mtr_write_log_t	write_log;
-
-	ut_ad(!recv_no_log_write);
-	DBUG_PRINT("ib_log",
-		   (ULINTPF " extra bytes written at " LSN_PF,
-		    len, log_sys.lsn));
-
-	log_reserve_and_open(len);
-	log->for_each_block(write_log);
-	log_close();
-}
-
 /** Start a mini-transaction. */
 void mtr_t::start()
 {
+  ut_ad(!m_freed_pages);
+  ut_ad(!m_freed_space);
   MEM_UNDEFINED(this, sizeof *this);
+  MEM_MAKE_DEFINED(&m_freed_space, sizeof m_freed_space);
+  MEM_MAKE_DEFINED(&m_freed_pages, sizeof m_freed_pages);
+
+  ut_d(m_start= true);
+  ut_d(m_commit= false);
+
+  m_last= nullptr;
+  m_last_offset= 0;
 
   new(&m_memo) mtr_buf_t();
   new(&m_log) mtr_buf_t();
@@ -400,27 +415,25 @@ void mtr_t::start()
   m_made_dirty= false;
   m_inside_ibuf= false;
   m_modifications= false;
-  m_n_log_recs= 0;
   m_log_mode= MTR_LOG_ALL;
   ut_d(m_user_space_id= TRX_SYS_SPACE);
-  m_user_space= NULL;
-  m_state= MTR_STATE_ACTIVE;
-  m_flush_observer= NULL;
+  m_user_space= nullptr;
   m_commit_lsn= 0;
+  m_trim_pages= false;
 }
 
 /** Release the resources */
 inline void mtr_t::release_resources()
 {
+  ut_ad(is_active());
   ut_d(m_memo.for_each_block_in_reverse(CIterate<DebugCheck>()));
   m_log.erase();
   m_memo.erase();
-  m_state= MTR_STATE_COMMITTED;
+  ut_d(m_commit= true);
 }
 
 /** Commit a mini-transaction. */
-void
-mtr_t::commit()
+void mtr_t::commit()
 {
   ut_ad(is_active());
   ut_ad(!is_inside_ibuf());
@@ -429,33 +442,69 @@ mtr_t::commit()
   ut_ad(!m_modifications || !recv_no_log_write);
   ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE);
 
-  if (m_modifications
-      && (m_n_log_recs || m_log_mode == MTR_LOG_NO_REDO))
+  if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty()))
   {
     ut_ad(!srv_read_only_mode || m_log_mode == MTR_LOG_NO_REDO);
 
-    lsn_t start_lsn;
+    std::pair<lsn_t,page_flush_ahead> lsns;
 
-    if (const ulint len= prepare_write())
-      start_lsn= finish_write(len);
-    else
-      start_lsn= m_commit_lsn;
+    if (UNIV_LIKELY(m_log_mode == MTR_LOG_ALL))
+    {
+      lsns= do_write();
 
-    if (m_made_dirty)
-      log_flush_order_mutex_enter();
+      if (m_made_dirty)
+        mysql_mutex_lock(&log_sys.flush_order_mutex);
 
-    /* It is now safe to release the log mutex because the
-    flush_order mutex will ensure that we are the first one
-    to insert into the flush list. */
-    log_mutex_exit();
+      /* It is now safe to release log_sys.mutex because the
+      log_sys.flush_order_mutex will ensure that we are the first one
+      to insert into buf_pool.flush_list. */
+      mysql_mutex_unlock(&log_sys.mutex);
+    }
+    else
+    {
+      ut_ad(m_log_mode == MTR_LOG_NO_REDO);
+      ut_ad(m_log.size() == 0);
+      m_commit_lsn= log_sys.get_lsn();
+      lsns= { m_commit_lsn, PAGE_FLUSH_NO };
+      if (UNIV_UNLIKELY(m_made_dirty)) /* This should be IMPORT TABLESPACE */
+        mysql_mutex_lock(&log_sys.flush_order_mutex);
+    }
+
+    if (m_freed_pages)
+    {
+      ut_ad(!m_freed_pages->empty());
+      ut_ad(m_freed_space);
+      ut_ad(memo_contains(*m_freed_space));
+      ut_ad(is_named_space(m_freed_space));
+      /* Update the last freed lsn */
+      m_freed_space->update_last_freed_lsn(m_commit_lsn);
+
+      if (!is_trim_pages())
+        for (const auto &range : *m_freed_pages)
+          m_freed_space->add_free_range(range);
+      else
+        m_freed_space->clear_freed_ranges();
+      delete m_freed_pages;
+      m_freed_pages= nullptr;
+      m_freed_space= nullptr;
+      /* mtr_t::start() will reset m_trim_pages */
+    }
+    else
+      ut_ad(!m_freed_space);
 
     m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
-                                     (ReleaseBlocks(start_lsn, m_commit_lsn,
-                                                    m_flush_observer)));
+                                     (ReleaseBlocks(lsns.first, m_commit_lsn,
+                                                    m_memo)));
     if (m_made_dirty)
-      log_flush_order_mutex_exit();
+      mysql_mutex_unlock(&log_sys.flush_order_mutex);
 
     m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
+
+    if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
+      buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
+
+    if (m_made_dirty)
+      srv_stats.log_write_requests.inc();
   }
   else
     m_memo.for_each_block_in_reverse(CIterate<ReleaseAll>());
@@ -463,14 +512,30 @@ mtr_t::commit()
   release_resources();
 }
 
-#ifdef UNIV_DEBUG
-/** Check that all pages belong to a shrunk tablespace. */
+/** Release latches till savepoint. To simplify the code only
+MTR_MEMO_S_LOCK and MTR_MEMO_PAGE_S_FIX slot types are allowed to be
+released, otherwise it would be necessary to add one more argument in the
+function to point out what slot types are allowed for rollback, and this
+would be overengineering as currently the function is used only in one place
+in the code.
+@param savepoint   savepoint, can be obtained with get_savepoint */
+void mtr_t::rollback_to_savepoint(ulint savepoint)
+{
+  Iterate<TillSavepoint<ReleaseLatches>> iteration(
+      TillSavepoint<ReleaseLatches>(ReleaseLatches(), savepoint,
+                                    get_savepoint()));
+  m_memo.for_each_block_in_reverse(iteration);
+}
+
+/** Shrink a tablespace. */
 struct Shrink
 {
-  const fil_space_t &space;
-  Shrink(const fil_space_t &space) : space(space) {}
+  /** the first non-existing page in the tablespace */
+  const page_id_t high;
 
-  bool operator()(const mtr_memo_slot_t *slot) const
+  Shrink(const fil_space_t &space) : high({space.id, space.size}) {}
+
+  bool operator()(mtr_memo_slot_t *slot) const
   {
     if (!slot->object)
       return true;
@@ -478,30 +543,32 @@ struct Shrink
     default:
       ut_ad("invalid type" == 0);
       return false;
-    case MTR_MEMO_MODIFY:
-      break;
     case MTR_MEMO_SPACE_X_LOCK:
-      ut_ad(&space == slot->object);
+      ut_ad(high.space() == static_cast<fil_space_t*>(slot->object)->id);
       return true;
+    case MTR_MEMO_PAGE_X_MODIFY:
+    case MTR_MEMO_PAGE_SX_MODIFY:
     case MTR_MEMO_PAGE_X_FIX:
     case MTR_MEMO_PAGE_SX_FIX:
-      const buf_page_t &bpage= static_cast<buf_block_t*>(slot->object)->page;
-      const page_id_t &id= bpage.id;
-      if (id.space() == 0 && id.page_no() == TRX_SYS_PAGE_NO)
+      auto &bpage= static_cast<buf_block_t*>(slot->object)->page;
+      ut_ad(bpage.io_fix() == BUF_IO_NONE);
+      const auto id= bpage.id();
+      if (id < high)
       {
-        ut_ad(srv_is_undo_tablespace(space.id));
+        ut_ad(id.space() == high.space() ||
+              (id == page_id_t{0, TRX_SYS_PAGE_NO} &&
+               srv_is_undo_tablespace(high.space())));
         break;
       }
-      ut_ad(id.space() == space.id);
-      ut_ad(id.page_no() < space.size);
-      ut_ad(bpage.state == BUF_BLOCK_FILE_PAGE);
-      ut_ad(!bpage.oldest_modification);
-      break;
+      ut_ad(id.space() == high.space());
+      ut_ad(bpage.state() == BUF_BLOCK_FILE_PAGE);
+      if (bpage.oldest_modification() > 1)
+        bpage.clear_oldest_modification(false);
+      slot->type= static_cast<mtr_memo_type_t>(slot->type & ~MTR_MEMO_MODIFY);
     }
     return true;
   }
 };
-#endif
 
 /** Commit a mini-transaction that is shrinking a tablespace.
 @param space   tablespace that is being shrunk */
@@ -518,28 +585,53 @@ void mtr_t::commit_shrink(fil_space_t &space)
 
   log_write_and_flush_prepare();
 
-  const lsn_t start_lsn= finish_write(prepare_write());
+  const lsn_t start_lsn= do_write().first;
 
-  log_flush_order_mutex_enter();
+  mysql_mutex_lock(&log_sys.flush_order_mutex);
   /* Durably write the reduced FSP_SIZE before truncating the data file. */
   log_write_and_flush();
 
-  os_file_truncate(space.chain.start->name, space.chain.start->handle,
-                   os_offset_t(space.size) << srv_page_size_shift, true);
+  if (m_freed_pages)
+  {
+    ut_ad(!m_freed_pages->empty());
+    ut_ad(m_freed_space == &space);
+    ut_ad(memo_contains(*m_freed_space));
+    ut_ad(is_named_space(m_freed_space));
+    m_freed_space->update_last_freed_lsn(m_commit_lsn);
+
+    if (!is_trim_pages())
+      for (const auto &range : *m_freed_pages)
+        m_freed_space->add_free_range(range);
+    else
+      m_freed_space->clear_freed_ranges();
+    delete m_freed_pages;
+    m_freed_pages= nullptr;
+    m_freed_space= nullptr;
+    /* mtr_t::start() will reset m_trim_pages */
+  }
+  else
+    ut_ad(!m_freed_space);
 
-  ut_d(m_memo.for_each_block_in_reverse(CIterate<Shrink>(space)));
+  m_memo.for_each_block_in_reverse(CIterate<Shrink>{space});
 
   m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
                                    (ReleaseBlocks(start_lsn, m_commit_lsn,
-                                                  m_flush_observer)));
-  log_flush_order_mutex_exit();
+                                                  m_memo)));
+  mysql_mutex_unlock(&log_sys.flush_order_mutex);
 
   mutex_enter(&fil_system.mutex);
   ut_ad(space.is_being_truncated);
-  space.is_being_truncated= false;
+  ut_ad(space.is_stopping());
   space.set_stopping(false);
+  space.is_being_truncated= false;
   mutex_exit(&fil_system.mutex);
 
+  /* Truncate the file before releasing the space.latch. File extension
+  (and any allocation of pages beyond the current intended end of the file)
+  is covered by exclusive space.latch, which we are still holding here. */
+  os_file_truncate(space.chain.start->name, space.chain.start->handle,
+                   os_offset_t{space.size} << srv_page_size_shift, true);
+
   m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
   srv_stats.log_write_requests.inc();
 
@@ -548,56 +640,47 @@ void mtr_t::commit_shrink(fil_space_t &space)
 
 /** Commit a mini-transaction that did not modify any pages,
 but generated some redo log on a higher level, such as
-MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker.
-The caller must invoke log_mutex_enter() and log_mutex_exit().
+FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+The caller must hold log_sys.mutex.
 This is to be used at log_checkpoint().
-@param[in]	checkpoint_lsn		the LSN of the log checkpoint
-@param[in]	write_mlog_checkpoint	Write MLOG_CHECKPOINT marker
-					if it is enabled. */
-void
-mtr_t::commit_checkpoint(
-	lsn_t	checkpoint_lsn,
-	bool	write_mlog_checkpoint)
+@param[in]	checkpoint_lsn		log checkpoint LSN, or 0 */
+void mtr_t::commit_files(lsn_t checkpoint_lsn)
 {
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	ut_ad(is_active());
 	ut_ad(!is_inside_ibuf());
-	ut_ad(get_log_mode() == MTR_LOG_ALL);
+	ut_ad(m_log_mode == MTR_LOG_ALL);
 	ut_ad(!m_made_dirty);
 	ut_ad(m_memo.size() == 0);
 	ut_ad(!srv_read_only_mode);
-	ut_ad(write_mlog_checkpoint || m_n_log_recs > 1);
-
-	switch (m_n_log_recs) {
-	case 0:
-		break;
-	case 1:
-		*m_log.front()->begin() |= MLOG_SINGLE_REC_FLAG;
-		break;
-	default:
-		mlog_catenate_ulint(&m_log, MLOG_MULTI_REC_END, MLOG_1BYTE);
-	}
-
-	if (write_mlog_checkpoint) {
-		byte*	ptr = m_log.push<byte*>(SIZE_OF_MLOG_CHECKPOINT);
-		compile_time_assert(SIZE_OF_MLOG_CHECKPOINT == 1 + 8);
-		*ptr = MLOG_CHECKPOINT;
-		mach_write_to_8(ptr + 1, checkpoint_lsn);
+	ut_ad(!m_freed_space);
+	ut_ad(!m_freed_pages);
+
+	if (checkpoint_lsn) {
+		byte*	ptr = m_log.push<byte*>(SIZE_OF_FILE_CHECKPOINT);
+		compile_time_assert(SIZE_OF_FILE_CHECKPOINT == 3 + 8 + 1);
+		*ptr = FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2);
+		::memset(ptr + 1, 0, 2);
+		mach_write_to_8(ptr + 3, checkpoint_lsn);
+		ptr[3 + 8] = 0;
+	} else {
+		*m_log.push<byte*>(1) = 0;
 	}
 
 	finish_write(m_log.size());
+	srv_stats.log_write_requests.inc();
 	release_resources();
 
-	if (write_mlog_checkpoint) {
+	if (checkpoint_lsn) {
 		DBUG_PRINT("ib_log",
-			   ("MLOG_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
-			    checkpoint_lsn, log_sys.lsn));
+			   ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
+			    checkpoint_lsn, log_sys.get_lsn()));
 	}
 }
 
 #ifdef UNIV_DEBUG
 /** Check if a tablespace is associated with the mini-transaction
-(needed for generating a MLOG_FILE_NAME record)
+(needed for generating a FILE_MODIFY record)
 @param[in]	space	tablespace
 @return whether the mini-transaction is associated with the space */
 bool
@@ -605,12 +688,11 @@ mtr_t::is_named_space(ulint space) const
 {
 	ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
 
-	switch (get_log_mode()) {
+	switch (m_log_mode) {
 	case MTR_LOG_NONE:
 	case MTR_LOG_NO_REDO:
 		return(true);
 	case MTR_LOG_ALL:
-	case MTR_LOG_SHORT_INSERTS:
 		return(m_user_space_id == space
 		       || is_predefined_tablespace(space));
 	}
@@ -619,19 +701,18 @@ mtr_t::is_named_space(ulint space) const
 	return(false);
 }
 /** Check if a tablespace is associated with the mini-transaction
-(needed for generating a MLOG_FILE_NAME record)
+(needed for generating a FILE_MODIFY record)
 @param[in]	space	tablespace
 @return whether the mini-transaction is associated with the space */
 bool mtr_t::is_named_space(const fil_space_t* space) const
 {
   ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
 
-  switch (get_log_mode()) {
+  switch (m_log_mode) {
   case MTR_LOG_NONE:
   case MTR_LOG_NO_REDO:
     return true;
   case MTR_LOG_ALL:
-  case MTR_LOG_SHORT_INSERTS:
     return m_user_space == space || is_predefined_tablespace(space->id);
   }
 
@@ -658,10 +739,9 @@ mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line)
 	} else if ((space = m_user_space) && space_id == space->id) {
 	} else {
 		space = fil_space_get(space_id);
-		ut_ad(get_log_mode() != MTR_LOG_NO_REDO
+		ut_ad(m_log_mode != MTR_LOG_NO_REDO
 		      || space->purpose == FIL_TYPE_TEMPORARY
-		      || space->purpose == FIL_TYPE_IMPORT
-		      || space->redo_skipped_count > 0);
+		      || space->purpose == FIL_TYPE_IMPORT);
 	}
 
 	ut_ad(space);
@@ -714,83 +794,249 @@ mtr_t::release_page(const void* ptr, mtr_memo_type_t type)
 	ut_ad(0);
 }
 
-/** Prepare to write the mini-transaction log to the redo log buffer.
-@return number of bytes to write in finish_write() */
-inline ulint mtr_t::prepare_write()
+static bool log_margin_warned;
+static time_t log_margin_warn_time;
+static bool log_close_warned;
+static time_t log_close_warn_time;
+
+/** Check the margin so as not to overwrite the transaction log written since
+the last checkpoint. If the log write is estimated to exceed the log capacity,
+call log_sys.set_check_flush_or_checkpoint(); this function does not wait.
+@param len   length of the data to be written */
+static void log_margin_checkpoint_age(ulint len)
 {
-	ut_ad(!recv_no_log_write);
+  const ulint framing_size= log_sys.framing_size();
+  /* actual length stored per block */
+  const ulint len_per_blk= OS_FILE_LOG_BLOCK_SIZE - framing_size;
 
-	if (UNIV_UNLIKELY(m_log_mode != MTR_LOG_ALL)) {
-		ut_ad(m_log_mode == MTR_LOG_NO_REDO);
-		ut_ad(m_log.size() == 0);
-		log_mutex_enter();
-		m_commit_lsn = log_sys.lsn;
-		return 0;
-	}
+  /* actual data length in last block already written */
+  ulint extra_len= log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+  ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
+  extra_len-= LOG_BLOCK_HDR_SIZE;
+
+  /* total extra length for block header and trailer */
+  extra_len= ((len + extra_len) / len_per_blk) * framing_size;
+
+  const ulint margin= len + extra_len;
+
+  mysql_mutex_assert_owner(&log_sys.mutex);
+
+  const lsn_t lsn= log_sys.get_lsn();
+
+  if (UNIV_UNLIKELY(margin > log_sys.log_capacity))
+  {
+    time_t t= time(nullptr);
+
+    /* return with warning output to avoid deadlock */
+    if (!log_margin_warned || difftime(t, log_margin_warn_time) > 15)
+    {
+      log_margin_warned= true;
+      log_margin_warn_time= t;
+
+      ib::error() << "innodb_log_file_size is too small "
+                     "for mini-transaction size " << len;
+    }
+  }
+  else if (UNIV_LIKELY(lsn + margin <= log_sys.last_checkpoint_lsn +
+                       log_sys.log_capacity))
+    return;
+
+  log_sys.set_check_flush_or_checkpoint();
+}
+
+
+/** Open the log for log_write_low(). The log must be closed with log_close().
+@param len length of the data to be written
+@return start lsn of the log record */
+static lsn_t log_reserve_and_open(size_t len)
+{
+  for (ut_d(ulint count= 0);;)
+  {
+    mysql_mutex_assert_owner(&log_sys.mutex);
+
+    /* Calculate an upper limit for the space the string may take in
+    the log buffer */
+
+    size_t len_upper_limit= (4 * OS_FILE_LOG_BLOCK_SIZE) +
+      srv_log_write_ahead_size + (5 * len) / 4;
+
+    if (log_sys.buf_free + len_upper_limit <= srv_log_buffer_size)
+      break;
+
+    mysql_mutex_unlock(&log_sys.mutex);
+    DEBUG_SYNC_C("log_buf_size_exceeded");
+
+    /* Not enough free space, do a write of the log buffer */
+    log_write_up_to(log_sys.get_lsn(), false);
+
+    srv_stats.log_waits.inc();
+
+    ut_ad(++count < 50);
+
+    mysql_mutex_lock(&log_sys.mutex);
+  }
+
+  return log_sys.get_lsn();
+}
+
+/** Append data to the log buffer. */
+static void log_write_low(const void *str, size_t size)
+{
+  mysql_mutex_assert_owner(&log_sys.mutex);
+  const ulint trailer_offset= log_sys.trailer_offset();
+
+  do
+  {
+    /* Calculate a part length */
+    size_t len= size;
+    size_t data_len= (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + size;
+
+    if (data_len > trailer_offset)
+    {
+      data_len= trailer_offset;
+      len= trailer_offset - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
+    }
+
+    memcpy(log_sys.buf + log_sys.buf_free, str, len);
+
+    size-= len;
+    str= static_cast<const char*>(str) + len;
+
+    byte *log_block= static_cast<byte*>(ut_align_down(log_sys.buf +
+                                                      log_sys.buf_free,
+                                                      OS_FILE_LOG_BLOCK_SIZE));
+
+    log_block_set_data_len(log_block, data_len);
+    lsn_t lsn= log_sys.get_lsn();
+
+    if (data_len == trailer_offset)
+    {
+      /* This block became full */
+      log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
+      log_block_set_checkpoint_no(log_block, log_sys.next_checkpoint_no);
+      len+= log_sys.framing_size();
+      lsn+= len;
+      /* Initialize the next block header */
+      log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, lsn);
+    }
+    else
+      lsn+= len;
+
+    log_sys.set_lsn(lsn);
+    log_sys.buf_free+= len;
+
+    ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size});
+  }
+  while (size);
+}
+
+/** Close the log at mini-transaction commit.
+@return whether buffer pool flushing is needed */
+static mtr_t::page_flush_ahead log_close(lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&log_sys.mutex);
+  ut_ad(lsn == log_sys.get_lsn());
+
+  byte *log_block= static_cast<byte*>(ut_align_down(log_sys.buf +
+                                                    log_sys.buf_free,
+                                                    OS_FILE_LOG_BLOCK_SIZE));
+
+  if (!log_block_get_first_rec_group(log_block))
+  {
+    /* We initialized a new log block which was not written
+    full by the current mtr: the next mtr log record group
+    will start within this block at the offset data_len */
+    log_block_set_first_rec_group(log_block,
+                                  log_block_get_data_len(log_block));
+  }
+
+  if (log_sys.buf_free > log_sys.max_buf_free)
+    log_sys.set_check_flush_or_checkpoint();
+
+  const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
+
+  if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) &&
+      /* silence message on create_log_file() after the log had been deleted */
+      checkpoint_age != lsn)
+  {
+    time_t t= time(nullptr);
+    if (!log_close_warned || difftime(t, log_close_warn_time) > 15)
+    {
+      log_close_warned= true;
+      log_close_warn_time= t;
+
+      ib::error() << "The age of the last checkpoint is " << checkpoint_age
+                  << ", which exceeds the log capacity "
+                  << log_sys.log_capacity << ".";
+    }
+  }
+  else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async))
+    return mtr_t::PAGE_FLUSH_NO;
+  else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age))
+    return mtr_t::PAGE_FLUSH_ASYNC;
+
+  log_sys.set_check_flush_or_checkpoint();
+  return mtr_t::PAGE_FLUSH_SYNC;
+}
+
+/** Write the block contents to the REDO log */
+struct mtr_write_log
+{
+  /** Append a block to the redo log buffer.
+  @return whether the appending should continue */
+  bool operator()(const mtr_buf_t::block_t *block) const
+  {
+    log_write_low(block->begin(), block->used());
+    return true;
+  }
+};
+
+std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
+{
+	ut_ad(!recv_no_log_write);
+	ut_ad(m_log_mode == MTR_LOG_ALL);
 
 	ulint	len	= m_log.size();
-	ulint	n_recs	= m_n_log_recs;
 	ut_ad(len > 0);
-	ut_ad(n_recs > 0);
 
 	if (len > srv_log_buffer_size / 2) {
 		log_buffer_extend(ulong((len + 1) * 2));
 	}
 
-	ut_ad(m_n_log_recs == n_recs);
-
 	fil_space_t*	space = m_user_space;
 
 	if (space != NULL && is_predefined_tablespace(space->id)) {
-		/* Omit MLOG_FILE_NAME for predefined tablespaces. */
+		/* Omit FILE_MODIFY for predefined tablespaces. */
 		space = NULL;
 	}
 
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
 
-	if (fil_names_write_if_was_clean(space, this)) {
-		/* This mini-transaction was the first one to modify
-		this tablespace since the latest checkpoint, so
-		some MLOG_FILE_NAME records were appended to m_log. */
-		ut_ad(m_n_log_recs > n_recs);
-		mlog_catenate_ulint(&m_log, MLOG_MULTI_REC_END, MLOG_1BYTE);
+	if (fil_names_write_if_was_clean(space)) {
 		len = m_log.size();
 	} else {
 		/* This was not the first time of dirtying a
 		tablespace since the latest checkpoint. */
-
-		ut_ad(n_recs == m_n_log_recs);
-
-		if (n_recs <= 1) {
-			ut_ad(n_recs == 1);
-
-			/* Flag the single log record as the
-			only record in this mini-transaction. */
-			*m_log.front()->begin() |= MLOG_SINGLE_REC_FLAG;
-		} else {
-			/* Because this mini-transaction comprises
-			multiple log records, append MLOG_MULTI_REC_END
-			at the end. */
-
-			mlog_catenate_ulint(&m_log, MLOG_MULTI_REC_END,
-					    MLOG_1BYTE);
-			len++;
-		}
+		ut_ad(len == m_log.size());
 	}
 
+	*m_log.push<byte*>(1) = 0;
+	len++;
+
 	/* check and attempt a checkpoint if exceeding capacity */
 	log_margin_checkpoint_age(len);
 
-	return(len);
+	return finish_write(len);
 }
 
-/** Append the redo log records to the redo log buffer
-@param[in] len	number of bytes to write
-@return start_lsn */
-inline lsn_t mtr_t::finish_write(ulint len)
+/** Append the redo log records to the redo log buffer.
+@param len   number of bytes to write
+@return {start_lsn,flush_ahead} */
+inline std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::finish_write(ulint len)
 {
 	ut_ad(m_log_mode == MTR_LOG_ALL);
-	ut_ad(log_mutex_own());
+	mysql_mutex_assert_owner(&log_sys.mutex);
 	ut_ad(m_log.size() == len);
 	ut_ad(len > 0);
 
@@ -803,19 +1049,21 @@ inline lsn_t mtr_t::finish_write(ulint len)
 		m_commit_lsn = log_reserve_and_write_fast(front->begin(), len,
 							  &start_lsn);
 
-		if (m_commit_lsn) {
-			return start_lsn;
+		if (!m_commit_lsn) {
+			goto piecewise;
 		}
+	} else {
+piecewise:
+		/* Open the database log for log_write_low */
+		start_lsn = log_reserve_and_open(len);
+		mtr_write_log write_log;
+		m_log.for_each_block(write_log);
+		m_commit_lsn = log_sys.get_lsn();
 	}
+	page_flush_ahead flush= log_close(m_commit_lsn);
+	DBUG_EXECUTE_IF("ib_log_flush_ahead", flush = PAGE_FLUSH_SYNC;);
 
-	/* Open the database log for log_write_low */
-	start_lsn = log_reserve_and_open(len);
-
-	mtr_write_log_t	write_log;
-	m_log.for_each_block(write_log);
-
-	m_commit_lsn = log_close();
-	return start_lsn;
+	return std::make_pair(start_lsn, flush);
 }
 
 /** Find out whether a block was not X-latched by the mini-transaction */
@@ -863,34 +1111,43 @@ bool mtr_t::have_x_latch(const buf_block_t &block) const
 }
 
 #ifdef UNIV_DEBUG
-/** Check if memo contains the given item.
-@return	true if contains */
-bool
-mtr_t::memo_contains(
-	const mtr_buf_t*	memo,
-	const void*		object,
-	ulint			type)
+/** Check if we are holding an rw-latch in this mini-transaction
+@param lock   latch to search for
+@param type   held latch type
+@return whether (lock,type) is contained */
+bool mtr_t::memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
 {
-	Iterate<Find> iteration(Find(object, type));
-	if (memo->for_each_block_in_reverse(iteration)) {
-		return(false);
-	}
+  Iterate<Find> iteration(Find(&lock, type));
+  if (m_memo.for_each_block_in_reverse(iteration))
+    return false;
 
-	const rw_lock_t *lock = static_cast<const rw_lock_t*>(object);
-
-	switch (type) {
-	case MTR_MEMO_X_LOCK:
-		ut_ad(rw_lock_own(lock, RW_LOCK_X));
-		break;
-	case MTR_MEMO_SX_LOCK:
-		ut_ad(rw_lock_own(lock, RW_LOCK_SX));
-		break;
-	case MTR_MEMO_S_LOCK:
-		ut_ad(rw_lock_own(lock, RW_LOCK_S));
-		break;
-	}
+  switch (type) {
+  case MTR_MEMO_X_LOCK:
+    ut_ad(rw_lock_own(&lock, RW_LOCK_X));
+    break;
+  case MTR_MEMO_SX_LOCK:
+    ut_ad(rw_lock_own(&lock, RW_LOCK_SX));
+    break;
+  case MTR_MEMO_S_LOCK:
+    ut_ad(rw_lock_own(&lock, RW_LOCK_S));
+    break;
+  default:
+    break;
+  }
 
-	return(true);
+  return true;
+}
+
+/** Check if we are holding exclusive tablespace latch
+@param space  tablespace to search for
+@return whether space.latch is being held */
+bool mtr_t::memo_contains(const fil_space_t& space)
+{
+  Iterate<Find> iteration(Find(&space, MTR_MEMO_SPACE_X_LOCK));
+  if (m_memo.for_each_block_in_reverse(iteration))
+    return false;
+  ut_ad(rw_lock_own(const_cast<rw_lock_t*>(&space.latch), RW_LOCK_X));
+  return true;
 }
 
 /** Debug check for flags */
@@ -982,20 +1239,6 @@ mtr_t::memo_contains_page_flagged(
 		? NULL : iteration.functor.get_block();
 }
 
-/** Mark the given latched page as modified.
-@param[in]	ptr	pointer to within buffer frame */
-void
-mtr_t::memo_modify_page(const byte* ptr)
-{
-	buf_block_t*	block = memo_contains_page_flagged(
-		ptr, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX);
-	ut_ad(block != NULL);
-
-	if (!memo_contains(get_memo(), block, MTR_MEMO_MODIFY)) {
-		memo_push(block, MTR_MEMO_MODIFY);
-	}
-}
-
 /** Print info of an mtr handle. */
 void
 mtr_t::print() const
@@ -1006,3 +1249,42 @@ mtr_t::print() const
 }
 
 #endif /* UNIV_DEBUG */
+
+
+/** Find a block, preferably in MTR_MEMO_MODIFY state */
+struct FindModified
+{
+  mtr_memo_slot_t *found= nullptr;
+  const buf_block_t& block;
+
+  FindModified(const buf_block_t &block) : block(block) {}
+  bool operator()(mtr_memo_slot_t *slot)
+  {
+    if (slot->object != &block)
+      return true;
+    found= slot;
+    return !(slot->type & (MTR_MEMO_MODIFY |
+                           MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+  }
+};
+
+/** Mark the given latched page as modified.
+@param block   page that will be modified */
+void mtr_t::modify(const buf_block_t &block)
+{
+  if (UNIV_UNLIKELY(m_memo.empty()))
+  {
+    /* This must be PageConverter::update_page() in IMPORT TABLESPACE. */
+    ut_ad(!block.page.in_LRU_list);
+    return;
+  }
+
+  Iterate<FindModified> iteration((FindModified(block)));
+  if (UNIV_UNLIKELY(m_memo.for_each_block(iteration)))
+  {
+    ut_ad("modifying an unlatched page" == 0);
+    return;
+  }
+  iteration.functor.found->type= static_cast<mtr_memo_type_t>
+    (iteration.functor.found->type | MTR_MEMO_MODIFY);
+}
diff --git a/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt b/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt
index 40445305fc6..a007f405c6a 100644
--- a/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt
+++ b/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt
@@ -1 +1 @@
---innodb_log_file_size=100M
+--innodb_log_file_size=200M
diff --git a/storage/innobase/mysql-test/storage_engine/col_opt_null.opt b/storage/innobase/mysql-test/storage_engine/col_opt_null.opt
index 40445305fc6..a007f405c6a 100644
--- a/storage/innobase/mysql-test/storage_engine/col_opt_null.opt
+++ b/storage/innobase/mysql-test/storage_engine/col_opt_null.opt
@@ -1 +1 @@
---innodb_log_file_size=100M
+--innodb_log_file_size=200M
diff --git a/storage/innobase/mysql-test/storage_engine/type_blob.opt b/storage/innobase/mysql-test/storage_engine/type_blob.opt
index 40445305fc6..a007f405c6a 100644
--- a/storage/innobase/mysql-test/storage_engine/type_blob.opt
+++ b/storage/innobase/mysql-test/storage_engine/type_blob.opt
@@ -1 +1 @@
---innodb_log_file_size=100M
+--innodb_log_file_size=200M
diff --git a/storage/innobase/mysql-test/storage_engine/type_text.opt b/storage/innobase/mysql-test/storage_engine/type_text.opt
index 40445305fc6..a007f405c6a 100644
--- a/storage/innobase/mysql-test/storage_engine/type_text.opt
+++ b/storage/innobase/mysql-test/storage_engine/type_text.opt
@@ -1 +1 @@
---innodb_log_file_size=100M
+--innodb_log_file_size=200M
diff --git a/storage/innobase/os/os0event.cc b/storage/innobase/os/os0event.cc
index 0676ba5f6c1..f18633ccd45 100644
--- a/storage/innobase/os/os0event.cc
+++ b/storage/innobase/os/os0event.cc
@@ -358,8 +358,8 @@ os_event::wait_time_low(
 
 	if (time_in_usec != OS_SYNC_INFINITE_TIME) {
 		ulonglong usec = ulonglong(time_in_usec) + my_hrtime().val;
-		abstime.tv_sec = usec / 1000000;
-		abstime.tv_nsec = (usec % 1000000) * 1000;
+		abstime.tv_sec = static_cast<time_t>(usec / 1000000);
+		abstime.tv_nsec = static_cast<uint>((usec % 1000000) * 1000);
 	} else {
 		abstime.tv_nsec = 999999999;
 		abstime.tv_sec = (time_t) ULINT_MAX;
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 4ba31760109..42589b3319a 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -53,6 +53,7 @@ Created 10/21/1995 Heikki Tuuri
 #include "os0thread.h"
 
 #include <vector>
+#include <tpool_structs.h>
 
 #ifdef LINUX_NATIVE_AIO
 #include <libaio.h>
@@ -77,596 +78,80 @@ Created 10/21/1995 Heikki Tuuri
 #include <my_sys.h>
 #endif
 
+#include "buf0dblwr.h"
 
-/** Insert buffer segment id */
-static const ulint IO_IBUF_SEGMENT = 0;
+#include <thread>
+#include <chrono>
 
-/** Log segment id */
-static const ulint IO_LOG_SEGMENT = 1;
-
-/** Number of retries for partial I/O's */
-static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
-
-/* This specifies the file permissions InnoDB uses when it creates files in
-Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
-my_umask */
-
-#ifndef _WIN32
-/** Umask for creating files */
-static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
-#else
-/** Umask for creating files */
-static ulint	os_innodb_umask	= 0;
-static HANDLE	data_completion_port;
-static HANDLE	log_completion_port;
-
-static DWORD	fls_sync_io  = FLS_OUT_OF_INDEXES;
-#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
-#endif /* _WIN32 */
-
-/** In simulated aio, merge at most this many consecutive i/os */
-static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;
-
-/** Flag indicating if the page_cleaner is in active state. */
-extern bool buf_page_cleaner_is_active;
-
-/**********************************************************************
-
-InnoDB AIO Implementation:
-=========================
-
-We support native AIO for Windows and Linux. For rest of the platforms
-we simulate AIO by special IO-threads servicing the IO-requests.
-
-Simulated AIO:
-==============
-
-On platforms where we 'simulate' AIO, the following is a rough explanation
-of the high level design.
-There are four io-threads (for ibuf, log, read, write).
-All synchronous IO requests are serviced by the calling thread using
-os_file_write/os_file_read. The Asynchronous requests are queued up
-in an array (there are four such arrays) by the calling thread.
-Later these requests are picked up by the IO-thread and are serviced
-synchronously.
-
-Windows native AIO:
-==================
-
-If srv_use_native_aio is not set then Windows follow the same
-code as simulated AIO. If the flag is set then native AIO interface
-is used. On windows, one of the limitation is that if a file is opened
-for AIO no synchronous IO can be done on it. Therefore we have an
-extra fifth array to queue up synchronous IO requests.
-There are innodb_file_io_threads helper threads. These threads work
-on the four arrays mentioned above in Simulated AIO. No thread is
-required for the sync array.
-If a synchronous IO request is made, it is first queued in the sync
-array. Then the calling thread itself waits on the request, thus
-making the call synchronous.
-If an AIO request is made the calling thread not only queues it in the
-array but also submits the requests. The helper thread then collects
-the completed IO request and calls completion routine on it.
-
-Linux native AIO:
-=================
-
-If we have libaio installed on the system and innodb_use_native_aio
-is set to true we follow the code path of native AIO, otherwise we
-do simulated AIO.
-There are innodb_file_io_threads helper threads. These threads work
-on the four arrays mentioned above in Simulated AIO.
-If a synchronous IO request is made, it is handled by calling
-os_file_write/os_file_read.
-If an AIO request is made the calling thread not only queues it in the
-array but also submits the requests. The helper thread then collects
-the completed IO request and calls completion routine on it.
-
-**********************************************************************/
-
-
-#ifdef UNIV_PFS_IO
-/* Keys to register InnoDB I/O with performance schema */
-mysql_pfs_key_t  innodb_data_file_key;
-mysql_pfs_key_t  innodb_log_file_key;
-mysql_pfs_key_t  innodb_temp_file_key;
-#endif /* UNIV_PFS_IO */
-
-class AIO;
-
-/** The asynchronous I/O context */
-struct Slot {
-
-#ifdef WIN_ASYNC_IO
-	/** Windows control block for the aio request 
-	must be at the very start of Slot, so we can
-	cast Slot* to OVERLAPPED*
-	*/
-	OVERLAPPED		control;
-#endif
-
-	/** index of the slot in the aio array */
-	uint16_t		pos;
-
-	/** true if this slot is reserved */
-	bool			is_reserved;
-
-	/** time when reserved */
-	time_t			reservation_time;
-
-	/** buffer used in i/o */
-	byte*			buf;
-
-	/** Buffer pointer used for actual IO. We advance this
-	when partial IO is required and not buf */
-	byte*			ptr;
-
-	/** OS_FILE_READ or OS_FILE_WRITE */
-	IORequest		type;
-
-	/** file offset in bytes */
-	os_offset_t		offset;
-
-	/** file where to read or write */
-	pfs_os_file_t		file;
-
-	/** file name or path */
-	const char*		name;
-
-	/** used only in simulated aio: true if the physical i/o
-	already made and only the slot message needs to be passed
-	to the caller of os_aio_simulated_handle */
-	bool			io_already_done;
-
-	/*!< file block size */
-	ulint			file_block_size;
-
-	/** The file node for which the IO is requested. */
-	fil_node_t*		m1;
-
-	/** the requester of an aio operation and which can be used
-	to identify which pending aio operation was completed */
-	void*			m2;
-
-	/** AIO completion status */
-	dberr_t			err;
-
-#ifdef WIN_ASYNC_IO
-
-	/** bytes written/read */
-	DWORD			n_bytes;
-
-	/** length of the block to read or write */
-	DWORD			len;
-
-	/** aio array containing this slot */
-	AIO				*array;
-#elif defined(LINUX_NATIVE_AIO)
-	/** Linux control block for aio */
-	struct iocb		control;
-
-	/** AIO return code */
-	int			ret;
-
-	/** bytes written/read. */
-	ssize_t			n_bytes;
-
-	/** length of the block to read or write */
-	ulint			len;
-#else
-	/** length of the block to read or write */
-	ulint			len;
-
-	/** bytes written/read. */
-	ulint			n_bytes;
-#endif /* WIN_ASYNC_IO */
-
-	/** Length of the block before it was compressed */
-	uint32			original_len;
-
-};
-
-/** The asynchronous i/o array structure */
-class AIO {
+/* AIO state kept per I/O direction: cached control blocks and a task group */
+class io_slots
+{
+private:
+	tpool::cache<tpool::aiocb> m_cache;
+	tpool::task_group m_group;
+	int m_max_aio;
 public:
-	/** Constructor
-	@param[in]	id		Latch ID
-	@param[in]	n_slots		Number of slots to configure
-	@param[in]	segments	Number of segments to configure */
-	AIO(latch_id_t id, ulint n_slots, ulint segments);
-
-	/** Destructor */
-	~AIO();
-
-	/** Initialize the instance
-	@return DB_SUCCESS or error code */
-	dberr_t init();
-
-	/** Requests for a slot in the aio array. If no slot is available, waits
-	until not_full-event becomes signaled.
-
-	@param[in]	type	IO context
-	@param[in,out]	m1	message to be passed along with the AIO
-				operation
-	@param[in,out]	m2	message to be passed along with the AIO
-				operation
-	@param[in]	file	file handle
-	@param[in]	name	name of the file or path as a null-terminated
-				string
-	@param[in,out]	buf	buffer where to read or from which to write
-	@param[in]	offset	file offset, where to read from or start writing
-	@param[in]	len	length of the block to read or write
-	@return pointer to slot */
-	Slot* reserve_slot(
-		const IORequest&	type,
-		fil_node_t*		m1,
-		void*			m2,
-		pfs_os_file_t		file,
-		const char*		name,
-		void*			buf,
-		os_offset_t		offset,
-		ulint			len)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** @return number of reserved slots */
-	ulint pending_io_count() const;
-
-	/** Returns a pointer to the nth slot in the aio array.
-	@param[in]	index	Index of the slot in the array
-	@return pointer to slot */
-	const Slot* at(ulint i) const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		ut_a(i < m_slots.size());
-
-		return(&m_slots[i]);
-	}
-
-	/** Non const version */
-	Slot* at(ulint i)
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		ut_a(i < m_slots.size());
-
-		return(&m_slots[i]);
-	}
-
-	/** Frees a slot in the AIO array, assumes caller owns the mutex.
-	@param[in,out]	slot	Slot to release */
-	void release(Slot* slot);
-
-	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
-	@param[in,out]	slot	Slot to release */
-	void release_with_mutex(Slot* slot);
-
-	/** Prints info about the aio array.
-	@param[in,out]	file	Where to print */
-	void print(FILE* file);
-
-	/** @return the number of slots per segment */
-	ulint slots_per_segment() const
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		return(m_slots.size() / m_n_segments);
-	}
-
-	/** @return accessor for n_segments */
-	ulint get_n_segments() const
-		MY_ATTRIBUTE((warn_unused_result))
+	io_slots(int max_submitted_io, int max_callback_concurrency) :
+		m_cache(max_submitted_io),
+		m_group(max_callback_concurrency),
+		m_max_aio(max_submitted_io)
 	{
-		return(m_n_segments);
 	}
-
-#ifdef UNIV_DEBUG
-	/** @return true if the thread owns the mutex */
-	bool is_mutex_owned() const
-		MY_ATTRIBUTE((warn_unused_result))
+	/* Get cached AIO control block */
+	tpool::aiocb* acquire()
 	{
-		return(mutex_own(&m_mutex));
+		return m_cache.get();
 	}
-#endif /* UNIV_DEBUG */
-
-	/** Acquire the mutex */
-	void acquire() const
+	/* Release AIO control block back to cache */
+	void release(tpool::aiocb* aiocb)
 	{
-		mutex_enter(&m_mutex);
+		m_cache.put(aiocb);
 	}
 
-	/** Release the mutex */
-	void release() const
+	bool contains(tpool::aiocb* aiocb)
 	{
-		mutex_exit(&m_mutex);
+		return m_cache.contains(aiocb);
 	}
 
-	/** Write out the state to the file/stream
-	@param[in, out]	file	File to write to */
-	void to_file(FILE* file) const;
-
-#ifdef LINUX_NATIVE_AIO
-	/** Dispatch an AIO request to the kernel.
-	@param[in,out]	slot	an already reserved slot
-	@return true on success. */
-	bool linux_dispatch(Slot* slot)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Accessor for an AIO event
-	@param[in]	index	Index into the array
-	@return the event at the index */
-	io_event* io_events(ulint index)
-		MY_ATTRIBUTE((warn_unused_result))
+	/* Wait for completions of all AIO operations */
+	void wait()
 	{
-		ut_a(index < m_events.size());
-
-		return(&m_events[index]);
+		m_cache.wait();
 	}
 
-	/** Accessor for the AIO context
-	@param[in]	segment	Segment for which to get the context
-	@return the AIO context for the segment */
-	io_context_t io_ctx(ulint segment)
-		MY_ATTRIBUTE((warn_unused_result))
+	size_t pending_io_count()
 	{
-		ut_ad(segment < get_n_segments());
-
-		return(m_aio_ctx[segment]);
+		return (size_t)m_max_aio - m_cache.size();
 	}
 
-	/** Creates an io_context_t for native linux AIO.
-	@param[in]	max_events	number of events
-	@param[out]	io_ctx		io_ctx to initialize.
-	@return true on success. */
-	static bool linux_create_io_ctx(unsigned max_events, io_context_t& io_ctx)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Checks if the system supports native linux aio. On some kernel
-	versions where native aio is supported it won't work on tmpfs. In such
-	cases we can't use native aio as it is not possible to mix simulated
-	and native aio.
-	@return true if supported, false otherwise. */
-	static bool is_linux_native_aio_supported()
-		MY_ATTRIBUTE((warn_unused_result));
-#endif /* LINUX_NATIVE_AIO */
-
-#ifdef WIN_ASYNC_IO
-	HANDLE m_completion_port;
-	/** Wake up all AIO threads in Windows native aio */
-	static void wake_at_shutdown() {
-		AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf };
-		for (size_t i = 0; i < array_elements(all_arrays); i++) {
-			AIO *a = all_arrays[i];
-			if (a) {
-				PostQueuedCompletionStatus(a->m_completion_port, 0,
-					IOCP_SHUTDOWN_KEY, 0);
-			}
-		}
-	}
-#endif /* WIN_ASYNC_IO */
-
-#ifdef _WIN32
-	/** This function can be called if one wants to post a batch of reads
-	and prefers an I/O - handler thread to handle them all at once later.You
-	must call os_aio_simulated_wake_handler_threads later to ensure the
-	threads are not left sleeping! */
-	static void simulated_put_read_threads_to_sleep();
-#endif /* _WIN32 */
-
-	/** Create an instance using new(std::nothrow)
-	@param[in]	id		Latch ID
-	@param[in]	n_slots		The number of AIO request slots
-	@param[in]	segments	The number of segments
-	@return a new AIO instance */
-	static AIO* create(
-		latch_id_t	id,
-		ulint		n_slots,
-		ulint		segments)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Initializes the asynchronous io system. Creates one array each
-	for ibuf and log I/O. Also creates one array each for read and write
-	where each array is divided logically into n_readers and n_writers
-	respectively. The caller must create an i/o handler thread for each
-	segment in these arrays. This function also creates the sync array.
-	No I/O handler thread needs to be created for that
-	@param[in]	n_per_seg	maximum number of pending aio
-					operations allowed per segment
-	@param[in]	n_readers	number of reader threads
-	@param[in]	n_writers	number of writer threads
-	@param[in]	n_slots_sync	number of slots in the sync aio array
-	@return true if AIO sub-system was started successfully */
-	static bool start(
-		ulint		n_per_seg,
-		ulint		n_readers,
-		ulint		n_writers,
-		ulint		n_slots_sync)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Free the AIO arrays */
-	static void shutdown();
-
-	/** Print all the AIO segments
-	@param[in,out]	file		Where to print */
-	static void print_all(FILE* file);
-
-	/** Calculates local segment number and aio array from global
-	segment number.
-	@param[out]	array		AIO wait array
-	@param[in]	segment		global segment number
-	@return local segment number within the aio array */
-	static ulint get_array_and_local_segment(
-		AIO**		array,
-		ulint		segment)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Select the IO slot array
-	@param[in,out]	type		Type of IO, READ or WRITE
-	@param[in]	read_only	true if running in read-only mode
-	@param[in]	mode		IO mode
-	@return slot array or NULL if invalid mode specified */
-	static AIO* select_slot_array(
-		IORequest&		type,
-		bool			read_only,
-		ulint			mode)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Calculates segment number for a slot.
-	@param[in]	array		AIO wait array
-	@param[in]	slot		slot in this array
-	@return segment number (which is the number used by, for example,
-		I/O handler threads) */
-	static ulint get_segment_no_from_slot(
-		const AIO*	array,
-		const Slot*	slot)
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Wakes up a simulated AIO I/O-handler thread if it has something
-	to do.
-	@param[in]	global_segment	the number of the segment in the
-					AIO arrays */
-	static void wake_simulated_handler_thread(ulint global_segment);
-
-	/** Check if it is a read request
-	@param[in]	aio		The AIO instance to check
-	@return true if the AIO instance is for reading. */
-	static bool is_read(const AIO* aio)
-		MY_ATTRIBUTE((warn_unused_result))
+	tpool::task_group* get_task_group()
 	{
-		return(s_reads == aio);
+		return &m_group;
 	}
 
-	/** Wait on an event until no pending writes */
-	static void wait_until_no_pending_writes()
+	~io_slots()
 	{
-		os_event_wait(AIO::s_writes->m_is_empty);
+		wait();
 	}
-
-	/** Print to file
-	@param[in]	file		File to write to */
-	static void print_to_file(FILE* file);
-
-	/** Check for pending IO. Gets the count and also validates the
-	data structures.
-	@return count of pending IO requests */
-	static ulint total_pending_io_count();
-
-private:
-	/** Initialise the slots
-	@return DB_SUCCESS or error code */
-	dberr_t init_slots()
-		MY_ATTRIBUTE((warn_unused_result));
-
-	/** Wakes up a simulated AIO I/O-handler thread if it has something
-	to do for a local segment in the AIO array.
-	@param[in]	global_segment	the number of the segment in the
-					AIO arrays
-	@param[in]	segment		the local segment in the AIO array */
-	void wake_simulated_handler_thread(ulint global_segment, ulint segment);
-
-	/** Prints pending IO requests per segment of an aio array.
-	We probably don't need per segment statistics but they can help us
-	during development phase to see if the IO requests are being
-	distributed as expected.
-	@param[in,out]	file		file where to print
-	@param[in]	segments	pending IO array */
-	void print_segment_info(
-		FILE*		file,
-		const ulint*	segments);
-
-#ifdef LINUX_NATIVE_AIO
-	/** Initialise the Linux native AIO data structures
-	@return DB_SUCCESS or error code */
-	dberr_t init_linux_native_aio()
-		MY_ATTRIBUTE((warn_unused_result));
-#endif /* LINUX_NATIVE_AIO */
-
-private:
-	typedef std::vector<Slot> Slots;
-
-	/** the mutex protecting the aio array */
-	mutable SysMutex	m_mutex;
-
-	/** Pointer to the slots in the array.
-	Number of elements must be divisible by n_threads. */
-	Slots			m_slots;
-
-	/** Number of segments in the aio array of pending aio requests.
-	A thread can wait separately for any one of the segments. */
-	ulint			m_n_segments;
-
-	/** The event which is set to the signaled state when
-	there is space in the aio outside the ibuf segment;
-	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
-	os_event_t		m_not_full;
-
-	/** The event which is set to the signaled state when
-	there are no pending i/os in this array;
-	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
-	os_event_t		m_is_empty;
-
-	/** Number of reserved slots in the AIO array outside
-	the ibuf segment */
-	ulint			m_n_reserved;
-
-
-#if defined(LINUX_NATIVE_AIO)
-	typedef std::vector<io_event> IOEvents;
-
-	/** completion queue for IO. There is one such queue per
-	segment. Each thread will work on one ctx exclusively. */
-	std::vector<io_context_t>		m_aio_ctx;
-
-	/** The array to collect completed IOs. There is one such
-	event for each possible pending IO. The size of the array
-	is equal to m_slots.size(). */
-	IOEvents		m_events;
-#endif /* LINUX_NATIV_AIO */
-
-	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
-	sync AIO. These are NULL when the module has not yet been
-	initialized. */
-
-	/** Insert buffer */
-	static AIO*		s_ibuf;
-
-	/** Redo log */
-	static AIO*		s_log;
-
-	/** Reads */
-	static AIO*		s_reads;
-
-	/** Writes */
-	static AIO*		s_writes;
-
-	/** Synchronous I/O */
-	static AIO*		s_sync;
 };
 
-/** Static declarations */
-AIO*	AIO::s_reads;
-AIO*	AIO::s_writes;
-AIO*	AIO::s_ibuf;
-AIO*	AIO::s_log;
-AIO*	AIO::s_sync;
+static io_slots *read_slots;
+static io_slots *write_slots;
 
-#if defined(LINUX_NATIVE_AIO)
-/** timeout for each io_getevents() call = 500ms. */
-static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;
-
-/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
-static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
-
-/** number of attempts before giving up on io_setup(). */
-static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
-#endif /* LINUX_NATIVE_AIO */
-
-/** Array of events used in simulated AIO */
-static os_event_t*	os_aio_segment_wait_events;
+/** Number of retries for partial I/Os */
+constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
 
-/** Number of asynchronous I/O segments.  Set by os_aio_init(). */
-static ulint		os_aio_n_segments = ULINT_UNDEFINED;
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
 
-/** If the following is true, read i/o handler threads try to
-wait until a batch of new read requests have been posted */
-static bool		os_aio_recommend_sleep_for_read_threads;
+#ifndef _WIN32
+/** Umask for creating files */
+static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+static ulint	os_innodb_umask	= 0;
+#endif /* _WIN32 */
 
 Atomic_counter<ulint> os_n_file_reads;
 static ulint	os_bytes_read_since_printout;
@@ -682,11 +167,12 @@ bool	os_has_said_disk_full;
 /** Default Zip compression level */
 extern uint page_zip_level;
 
-/** Validates the consistency of the aio system.
-@return true if ok */
-static
-bool
-os_aio_validate();
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+mysql_pfs_key_t  innodb_data_file_key;
+mysql_pfs_key_t  innodb_log_file_key;
+mysql_pfs_key_t  innodb_temp_file_key;
+#endif /* UNIV_PFS_IO */
 
 /** Handle errors for file operations.
 @param[in]	name		name of a file or NULL
@@ -750,30 +236,8 @@ static void os_file_handle_rename_error(const char* name, const char* new_name)
 	}
 }
 
-/** Does simulated AIO. This function should be called by an i/o-handler
-thread.
-
-@param[in]	segment	The number of the segment in the aio arrays to wait
-			for; segment 0 is the ibuf i/o thread, segment 1 the
-			log i/o thread, then follow the non-ibuf read threads,
-			and as the last are the non-ibuf write threads
-@param[out]	m1	the messages passed with the AIO request; note that
-			also in the case where the AIO operation failed, these
-			output parameters are valid and can be used to restart
-			the operation, for example
-@param[out]	m2	Callback argument
-@param[in]	type	IO context
-@return DB_SUCCESS or error code */
-static
-dberr_t
-os_aio_simulated_handler(
-	ulint		global_segment,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	type);
 
 #ifdef _WIN32
-static HANDLE win_get_syncio_event();
 
 /**
  Wrapper around Windows DeviceIoControl() function.
@@ -797,7 +261,7 @@ os_win32_device_io_control(
 )
 {
 	OVERLAPPED overlapped = { 0 };
-	overlapped.hEvent = win_get_syncio_event();
+	overlapped.hEvent = tpool::win_get_syncio_event();
 	BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
 		outbuf_size,  NULL, &overlapped);
 
@@ -811,172 +275,49 @@ os_win32_device_io_control(
 
 #endif
 
-#ifdef WIN_ASYNC_IO
-/** This function is only used in Windows asynchronous i/o.
-Waits for an aio operation to complete. This function is used to wait the
-for completed requests. The aio array of pending requests is divided
-into segments. The thread specifies which segment or slot it wants to wait
-for. NOTE: this function will also take care of freeing the aio slot,
-therefore no other thread is allowed to do the freeing!
-@param[in]	segment		The number of the segment in the aio arrays to
-wait for; segment 0 is the ibuf I/O thread,
-segment 1 the log I/O thread, then follow the
-non-ibuf read threads, and as the last are the
-non-ibuf write threads; if this is
-ULINT_UNDEFINED, then it means that sync AIO
-is used, and this parameter is ignored
-@param[in]	pos		this parameter is used only in sync AIO:
-wait for the aio slot at this position
-@param[out]	m1		the messages passed with the AIO request; note
-that also in the case where the AIO operation
-failed, these output parameters are valid and
-can be used to restart the operation,
-for example
-@param[out]	m2		callback message
-@param[out]	type		OS_FILE_WRITE or ..._READ
-@return DB_SUCCESS or error code */
-static
-dberr_t
-os_aio_windows_handler(
-	ulint		segment,
-	ulint		pos,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	type);
-#endif /* WIN_ASYNC_IO */
 
-/** Generic AIO Handler methods. Currently handles IO post processing. */
-class AIOHandler {
-public:
-	/** Do any post processing after a read/write
-	@return DB_SUCCESS or error code. */
-	static dberr_t post_io_processing(Slot* slot);
-};
 
 /** Helper class for doing synchronous file IO. Currently, the objective
 is to hide the OS specific code, so that the higher level functions aren't
 peppered with #ifdef. Makes the code flow difficult to follow.  */
-class SyncFileIO {
+class SyncFileIO
+{
 public:
-	/** Constructor
-	@param[in]	fh	File handle
-	@param[in,out]	buf	Buffer to read/write
-	@param[in]	n	Number of bytes to read/write
-	@param[in]	offset	Offset where to read or write */
-	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
-		:
-		m_fh(fh),
-		m_buf(buf),
-		m_n(static_cast<ssize_t>(n)),
-		m_offset(offset)
-	{
-		ut_ad(m_n > 0);
-	}
-
-	/** Destructor */
-	~SyncFileIO()
-	{
-		/* No op */
-	}
-
-	/** Do the read/write
-	@param[in]	request	The IO context and type
-	@return the number of bytes read/written or negative value on error */
-	ssize_t execute(const IORequest& request);
-
-	/** Do the read/write
-	@param[in,out]	slot	The IO slot, it has the IO context
-	@return the number of bytes read/written or negative value on error */
-	static ssize_t execute(Slot* slot);
-
-	/** Move the read/write offset up to where the partial IO succeeded.
-	@param[in]	n_bytes	The number of bytes to advance */
-	void advance(ssize_t n_bytes)
-	{
-		m_offset += n_bytes;
-
-		ut_ad(m_n >= n_bytes);
-
-		m_n -=  n_bytes;
-
-		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
-	}
+  /** Constructor
+  @param[in]     fh     File handle
+  @param[in,out] buf    Buffer to read/write
+  @param[in]     n      Number of bytes to read/write; must be > 0 (ut_ad)
+  @param[in]     offset Offset where to read or write */
+  SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset) :
+    m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset)
+  { ut_ad(m_n > 0); }
+
+  /** Do the read/write
+  @param[in]	request	The IO context and type
+  @return the number of bytes read/written or negative value on error */
+  ssize_t execute(const IORequest &request);
+
+  /** Advance offset and buffer past a partial IO; shrink the remaining count.
+  @param[in]	n_bytes	The number of bytes to advance; must be <= m_n (ut_ad) */
+  void advance(ssize_t n_bytes)
+  {
+    m_offset+= n_bytes;
+    ut_ad(m_n >= n_bytes);
+    m_n-= n_bytes;
+    m_buf= reinterpret_cast<uchar*>(m_buf) + n_bytes;
+  }
 
 private:
-	/** Open file handle */
-	os_file_t		m_fh;
-
-	/** Buffer to read/write */
-	void*			m_buf;
-
-	/** Number of bytes to read/write */
-	ssize_t			m_n;
-
-	/** Offset from where to read/write */
-	os_offset_t		m_offset;
+  /** Open file handle */
+  const os_file_t m_fh;
+  /** Buffer to read/write */
+  void *m_buf;
+  /** Number of bytes to read/write */
+  ssize_t m_n;
+  /** Offset from where to read/write */
+  os_offset_t m_offset;
 };
 
-/** Do any post processing after a read/write
-@return DB_SUCCESS or error code. */
-dberr_t
-AIOHandler::post_io_processing(Slot* slot)
-{
-	ut_ad(slot->is_reserved);
-
-	/* Total bytes read so far */
-	ulint	n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes;
-
-	return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL);
-}
-
-/** Count the number of free slots
-@return number of reserved slots */
-ulint
-AIO::pending_io_count() const
-{
-	acquire();
-
-#ifdef UNIV_DEBUG
-	ut_a(m_n_segments > 0);
-	ut_a(!m_slots.empty());
-
-	ulint	count = 0;
-
-	for (ulint i = 0; i < m_slots.size(); ++i) {
-
-		const Slot&	slot = m_slots[i];
-
-		if (slot.is_reserved) {
-			++count;
-			ut_a(slot.len > 0);
-		}
-	}
-
-	ut_a(m_n_reserved == count);
-#endif /* UNIV_DEBUG */
-
-	ulint	reserved = m_n_reserved;
-
-	release();
-
-	return(reserved);
-}
-
-#ifdef UNIV_DEBUG
-/** Validates the consistency the aio system some of the time.
-@return true if ok or the check was skipped */
-static
-bool
-os_aio_validate_skip()
-{
-/** Try os_aio_validate() every this many times */
-# define OS_AIO_VALIDATE_SKIP	13
-
-	static Atomic_counter<uint32_t> os_aio_validate_count;
-	return (os_aio_validate_count++ % OS_AIO_VALIDATE_SKIP) || os_aio_validate();
-}
-#endif /* UNIV_DEBUG */
-
 #undef USE_FILE_LOCK
 #ifndef _WIN32
 /* On Windows, mandatory locking is used */
@@ -1024,101 +365,6 @@ os_file_lock(
 }
 #endif /* USE_FILE_LOCK */
 
-/** Calculates local segment number and aio array from global segment number.
-@param[out]	array		aio wait array
-@param[in]	segment		global segment number
-@return local segment number within the aio array */
-ulint
-AIO::get_array_and_local_segment(
-	AIO**		array,
-	ulint		segment)
-{
-	ulint		local_segment;
-	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
-
-	ut_a(segment < os_aio_n_segments);
-
-	if (!srv_read_only_mode && segment < n_extra_segs) {
-
-		/* We don't support ibuf/log IO during read only mode. */
-
-		if (segment == IO_IBUF_SEGMENT) {
-
-			*array = s_ibuf;
-
-		} else if (segment == IO_LOG_SEGMENT) {
-
-			*array = s_log;
-
-		} else {
-			*array = NULL;
-		}
-
-		local_segment = 0;
-
-	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
-
-		*array = s_reads;
-		local_segment = segment - n_extra_segs;
-
-	} else {
-		*array = s_writes;
-
-		local_segment = segment
-			      - (s_reads->m_n_segments + n_extra_segs);
-	}
-
-	return(local_segment);
-}
-
-/** Frees a slot in the aio array. Assumes caller owns the mutex.
-@param[in,out]	slot		Slot to release */
-void
-AIO::release(Slot* slot)
-{
-	ut_ad(is_mutex_owned());
-
-	ut_ad(slot->is_reserved);
-
-	slot->is_reserved = false;
-
-	--m_n_reserved;
-
-	if (m_n_reserved == m_slots.size() - 1) {
-		os_event_set(m_not_full);
-	}
-
-	if (m_n_reserved == 0) {
-		os_event_set(m_is_empty);
-	}
-
-#if defined(LINUX_NATIVE_AIO)
-
-	if (srv_use_native_aio) {
-		memset(&slot->control, 0x0, sizeof(slot->control));
-		slot->ret = 0;
-		slot->n_bytes = 0;
-	} else {
-		/* These fields should not be used if we are not
-		using native AIO. */
-		ut_ad(slot->n_bytes == 0);
-		ut_ad(slot->ret == 0);
-	}
-
-#endif /* WIN_ASYNC_IO */
-}
-
-/** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
-@param[in,out]	slot		Slot to release */
-void
-AIO::release_with_mutex(Slot* slot)
-{
-	acquire();
-
-	release(slot);
-
-	release();
-}
 
 /** Create a temporary file. This function is like tmpfile(3), but
 the temporary file is created in the in the mysql server configuration
@@ -1128,23 +374,14 @@ FILE*
 os_file_create_tmpfile()
 {
 	FILE*	file	= NULL;
-	os_file_t	fd	= innobase_mysql_tmpfile(NULL);
+	File	fd	= mysql_tmpfile("ib");
 
-	if (fd != OS_FILE_CLOSED) {
-#ifdef _WIN32
-		int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0);
-		if (crt_fd != -1) {
-			file = fdopen(crt_fd, "w+b");
-			if (!file) {
-				close(crt_fd);
-			}
-		}
-#else
-		file = fdopen(fd, "w+b");
+	if (fd >= 0) {
+		file = my_fdopen(fd, 0, O_RDWR|O_TRUNC|O_CREAT|FILE_BINARY,
+				 MYF(MY_WME));
 		if (!file) {
-			close(fd);
+			my_close(fd, MYF(MY_WME));
 		}
-#endif
 	}
 
 	if (file == NULL) {
@@ -1265,9 +502,9 @@ os_file_make_data_dir_path(
 		return;
 	}
 
-	ulint	tablename_len = ut_strlen(tablename);
+	ulint	tablename_len = strlen(tablename);
 
-	ut_memmove(++ptr, tablename, tablename_len);
+	memmove(++ptr, tablename, tablename_len);
 
 	ptr[tablename_len] = '\0';
 }
@@ -1476,7 +713,7 @@ os_file_create_subdirs_if_needed(
 	return(success ? DB_SUCCESS : DB_ERROR);
 }
 
-#ifndef _WIN32
+
 
 /** Do the read/write
 @param[in]	request	The IO context and type
@@ -1487,14 +724,24 @@ SyncFileIO::execute(const IORequest& request)
 	ssize_t	n_bytes;
 
 	if (request.is_read()) {
+#ifdef _WIN32
+		n_bytes = tpool::pread(m_fh, m_buf, m_n, m_offset);
+#else
 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
+#endif
 	} else {
 		ut_ad(request.is_write());
+#ifdef _WIN32
+		n_bytes = tpool::pwrite(m_fh, m_buf, m_n, m_offset);
+#else
 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
+#endif
 	}
 
 	return(n_bytes);
 }
+
+#ifndef _WIN32
 /** Free storage space associated with a section of the file.
 @param[in]	fh		Open file handle
 @param[in]	off		Starting offset (SEEK_SET)
@@ -1538,772 +785,17 @@ os_file_punch_hole_posix(
 	return(DB_IO_NO_PUNCH_HOLE);
 }
 
-#if defined(LINUX_NATIVE_AIO)
-
-/** Linux native AIO handler */
-class LinuxAIOHandler {
-public:
-	/**
-	@param[in] global_segment	The global segment*/
-	LinuxAIOHandler(ulint global_segment)
-		:
-		m_global_segment(global_segment)
-	{
-		/* Should never be doing Sync IO here. */
-		ut_a(m_global_segment != ULINT_UNDEFINED);
-
-		/* Find the array and the local segment. */
-
-		m_segment = AIO::get_array_and_local_segment(
-			&m_array, m_global_segment);
-
-		m_n_slots = m_array->slots_per_segment();
-	}
-
-	/** Destructor */
-	~LinuxAIOHandler()
-	{
-		// No op
-	}
-
-	/**
-	Process a Linux AIO request
-	@param[out]	m1		the messages passed with the
-	@param[out]	m2		AIO request; note that in case the
-					AIO operation failed, these output
-					parameters are valid and can be used to
-					restart the operation.
-	@param[out]	request		IO context
-	@return DB_SUCCESS or error code */
-	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
-
-private:
-	/** Resubmit an IO request that was only partially successful
-	@param[in,out]	slot		Request to resubmit
-	@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
-	dberr_t	resubmit(Slot* slot);
-
-	/** Check if the AIO succeeded
-	@param[in,out]	slot		The slot to check
-	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
-		DB_IO_ERROR on all other errors */
-	dberr_t	check_state(Slot* slot);
-
-	/** @return true if a shutdown was detected */
-	bool is_shutdown() const
-	{
-		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
-		       && !buf_page_cleaner_is_active);
-	}
-
-	/** If no slot was found then the m_array->m_mutex will be released.
-	@param[out]	n_pending	The number of pending IOs
-	@return NULL or a slot that has completed IO */
-	Slot* find_completed_slot(ulint* n_pending);
-
-	/** This is called from within the IO-thread. If there are no completed
-	IO requests in the slot array, the thread calls this function to
-	collect more requests from the Linux kernel.
-	The IO-thread waits on io_getevents(), which is a blocking call, with
-	a timeout value. Unless the system is very heavy loaded, keeping the
-	IO-thread very busy, the io-thread will spend most of its time waiting
-	in this function.
-	The IO-thread also exits in this function. It checks server status at
-	each wakeup and that is why we use timed wait in io_getevents(). */
-	void collect();
-
-private:
-	/** Slot array */
-	AIO*			m_array;
-
-	/** Number of slots inthe local segment */
-	ulint			m_n_slots;
-
-	/** The local segment to check */
-	ulint			m_segment;
-
-	/** The global segment */
-	ulint			m_global_segment;
-};
-
-/** Resubmit an IO request that was only partially successful
-@param[in,out]	slot		Request to resubmit
-@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
-dberr_t
-LinuxAIOHandler::resubmit(Slot* slot)
-{
-#ifdef UNIV_DEBUG
-	/* Bytes already read/written out */
-	ulint	n_bytes = slot->ptr - slot->buf;
-
-	ut_ad(m_array->is_mutex_owned());
-
-	ut_ad(n_bytes < slot->original_len);
-	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
-	/* Partial read or write scenario */
-	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
-#endif /* UNIV_DEBUG */
-
-	slot->len -= slot->n_bytes;
-	slot->ptr += slot->n_bytes;
-	slot->offset += slot->n_bytes;
-
-	/* Resetting the bytes read/written */
-	slot->n_bytes = 0;
-	slot->io_already_done = false;
-
-	compile_time_assert(sizeof(off_t) >= sizeof(os_offset_t));
-
-	struct iocb*	iocb = &slot->control;
-
-	if (slot->type.is_read()) {
-
-		io_prep_pread(
-			iocb,
-			slot->file,
-			slot->ptr,
-			slot->len,
-			slot->offset);
-	} else {
-
-		ut_a(slot->type.is_write());
-
-		io_prep_pwrite(
-			iocb,
-			slot->file,
-			slot->ptr,
-			slot->len,
-			slot->offset);
-	}
-
-	iocb->data = slot;
-
-	ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
-	     == 0);
-
-	/* Resubmit an I/O request */
-	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
-	ut_a(ret != -EINVAL);
-
-	if (ret < 0)  {
-		errno = -ret;
-	}
-
-	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
-}
-
-/** Check if the AIO succeeded
-@param[in,out]	slot		The slot to check
-@return DB_SUCCESS, DB_FAIL if the operation should be retried or
-	DB_IO_ERROR on all other errors */
-dberr_t
-LinuxAIOHandler::check_state(Slot* slot)
-{
-	ut_ad(m_array->is_mutex_owned());
-
-	/* Note that it may be that there is more then one completed
-	IO requests. We process them one at a time. We may have a case
-	here to improve the performance slightly by dealing with all
-	requests in one sweep. */
-
-	srv_set_io_thread_op_info(
-		m_global_segment, "processing completed aio requests");
-
-	ut_ad(slot->io_already_done);
-
-	dberr_t	err = DB_SUCCESS;
-
-	if (slot->ret == 0) {
-
-		err = AIOHandler::post_io_processing(slot);
-
-	} else {
-		errno = -slot->ret;
-
-		/* os_file_handle_error does tell us if we should retry
-		this IO. As it stands now, we don't do this retry when
-		reaping requests from a different context than
-		the dispatcher. This non-retry logic is the same for
-		Windows and Linux native AIO.
-		We should probably look into this to transparently
-		re-submit the IO. */
-		os_file_handle_error(slot->name, "Linux aio");
-
-		err = DB_IO_ERROR;
-	}
-
-	return(err);
-}
-
-/** If no slot was found then the m_array->m_mutex will be released.
-@param[out]	n_pending		The number of pending IOs
-@return NULL or a slot that has completed IO */
-Slot*
-LinuxAIOHandler::find_completed_slot(ulint* n_pending)
-{
-	ulint	offset = m_n_slots * m_segment;
-
-	*n_pending = 0;
-
-	m_array->acquire();
-
-	Slot*	slot = m_array->at(offset);
-
-	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
-
-		if (slot->is_reserved) {
-
-			++*n_pending;
-
-			if (slot->io_already_done) {
-
-				/* Something for us to work on.
-				Note: We don't release the mutex. */
-				return(slot);
-			}
-		}
-	}
-
-	m_array->release();
-
-	return(NULL);
-}
-
-/** This function is only used in Linux native asynchronous i/o. This is
-called from within the io-thread. If there are no completed IO requests
-in the slot array, the thread calls this function to collect more
-requests from the kernel.
-The io-thread waits on io_getevents(), which is a blocking call, with
-a timeout value. Unless the system is very heavy loaded, keeping the
-io-thread very busy, the io-thread will spend most of its time waiting
-in this function.
-The io-thread also exits in this function. It checks server status at
-each wakeup and that is why we use timed wait in io_getevents(). */
-void
-LinuxAIOHandler::collect()
-{
-	ut_ad(m_n_slots > 0);
-	ut_ad(m_array != NULL);
-	ut_ad(m_segment < m_array->get_n_segments());
-
-	/* Which io_context_t we are going to use. */
-	io_context_t	io_ctx = m_array->io_ctx(m_segment);
-
-	/* Starting point of the m_segment we will be working on. */
-	ulint	start_pos = m_segment * m_n_slots;
-
-	/* End point. */
-	ulint	end_pos = start_pos + m_n_slots;
-
-	for (;;) {
-		struct io_event*	events;
-
-		/* Which part of event array we are going to work on. */
-		events = m_array->io_events(m_segment * m_n_slots);
-
-		/* Initialize the events. */
-		memset(events, 0, sizeof(*events) * m_n_slots);
-
-		/* The timeout value is arbitrary. We probably need
-		to experiment with it a little. */
-		struct timespec		timeout;
-
-		timeout.tv_sec = 0;
-		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
-
-		int	ret;
-
-		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
-		ut_a(ret != -EINVAL);
-		ut_ad(ret != -EFAULT);
-
-		for (int i = 0; i < ret; ++i) {
-
-			struct iocb*	iocb;
-
-			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
-			ut_a(iocb != NULL);
-
-			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);
-
-			/* Some sanity checks. */
-			ut_a(slot != NULL);
-			ut_a(slot->is_reserved);
-
-			/* We are not scribbling previous segment. */
-			ut_a(slot->pos >= start_pos);
-
-			/* We have not overstepped to next segment. */
-			ut_a(slot->pos < end_pos);
-
-			/* Deallocate unused blocks from file system.
-			This is newer done to page 0 or to log files.*/
-			if (slot->offset > 0
-			    && !slot->type.is_log()
-			    && slot->type.is_write()
-			    && slot->type.punch_hole()) {
-
-				slot->err = slot->type.punch_hole(
-					slot->file,
-					slot->offset, slot->len);
-			} else {
-				slot->err = DB_SUCCESS;
-			}
-
-			/* Mark this request as completed. The error handling
-			will be done in the calling function. */
-			m_array->acquire();
-
-			/* events[i].res2 should always be ZERO */
-			ut_ad(events[i].res2 == 0);
-			slot->io_already_done = true;
-
-			/*Even though events[i].res is an unsigned number
-			in libaio, it is used to return a negative value
-			(negated errno value) to indicate error and a positive
-			value to indicate number of bytes read or written. */
-
-			if (events[i].res > slot->len) {
-				/* failure */
-				slot->n_bytes = 0;
-				slot->ret = events[i].res;
-			} else {
-				/* success */
-				slot->n_bytes = events[i].res;
-				slot->ret = 0;
-			}
-			m_array->release();
-		}
-
-		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
-		    || !buf_page_cleaner_is_active
-		    || ret > 0) {
-
-			break;
-		}
-
-		/* This error handling is for any error in collecting the
-		IO requests. The errors, if any, for any particular IO
-		request are simply passed on to the calling routine. */
-
-		switch (ret) {
-		case -EAGAIN:
-			/* Not enough resources! Try again. */
-
-		case -EINTR:
-			/* Interrupted! The behaviour in case of an interrupt.
-			If we have some completed IOs available then the
-			return code will be the number of IOs. We get EINTR
-			only if there are no completed IOs and we have been
-			interrupted. */
-
-		case 0:
-			/* No pending request! Go back and check again. */
-
-			continue;
-		}
-
-		/* All other errors should cause a trap for now. */
-		ib::fatal()
-			<< "Unexpected ret_code[" << ret
-			<< "] from io_getevents()!";
-
-		break;
-	}
-}
-
-/** Process a Linux AIO request
-@param[out]	m1		the messages passed with the
-@param[out]	m2		AIO request; note that in case the
-				AIO operation failed, these output
-				parameters are valid and can be used to
-				restart the operation.
-@param[out]	request		IO context
-@return DB_SUCCESS or error code */
-dberr_t
-LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
-{
-	dberr_t		err = DB_SUCCESS;
-	Slot*		slot;
-
-	/* Loop until we have found a completed request. */
-	for (;;) {
-
-		ulint	n_pending;
-
-		slot = find_completed_slot(&n_pending);
-
-		if (slot != NULL) {
-
-			ut_ad(m_array->is_mutex_owned());
-
-			err = check_state(slot);
-
-			/* DB_FAIL is not a hard error, we should retry */
-			if (err != DB_FAIL) {
-				break;
-			}
 
-			/* Partial IO, resubmit request for
-			remaining bytes to read/write */
-			err = resubmit(slot);
-
-			if (err != DB_SUCCESS) {
-				break;
-			}
-
-			m_array->release();
-
-		} else if (is_shutdown() && n_pending == 0) {
-
-			/* There is no completed request. If there is
-			no pending request at all, and the system is
-			being shut down, exit. */
-
-			*m1 = NULL;
-			*m2 = NULL;
-
-			return(DB_SUCCESS);
-
-		} else {
-
-			/* Wait for some request. Note that we return
-			from wait if we have found a request. */
-
-			srv_set_io_thread_op_info(
-				m_global_segment,
-				"waiting for completed aio requests");
-
-			collect();
-		}
-	}
-
-	if (err == DB_IO_PARTIAL_FAILED) {
-		/* Aborting in case of submit failure */
-		ib::fatal()
-			<< "Native Linux AIO interface. "
-			"io_submit() call failed when "
-			"resubmitting a partial I/O "
-			"request on the file " << slot->name
-			<< ".";
-	}
-
-	*m1 = slot->m1;
-	*m2 = slot->m2;
-
-	*request = slot->type;
-
-	m_array->release(slot);
-
-	m_array->release();
-
-	return(err);
-}
-
-/** This function is only used in Linux native asynchronous i/o.
-Waits for an aio operation to complete. This function is used to wait for
-the completed requests. The aio array of pending requests is divided
-into segments. The thread specifies which segment or slot it wants to wait
-for. NOTE: this function will also take care of freeing the aio slot,
-therefore no other thread is allowed to do the freeing!
-
-@param[in]	global_seg	segment number in the aio array
-				to wait for; segment 0 is the ibuf
-				i/o thread, segment 1 is log i/o thread,
-				then follow the non-ibuf read threads,
-				and the last are the non-ibuf write
-				threads.
-@param[out]	m1		the messages passed with the
-@param[out]	m2			AIO request; note that in case the
-				AIO operation failed, these output
-				parameters are valid and can be used to
-				restart the operation.
-@param[out]xi	 request	IO context
-@return DB_SUCCESS if the IO was successful */
-static
-dberr_t
-os_aio_linux_handler(
-	ulint		global_segment,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	request)
-{
-	return LinuxAIOHandler(global_segment).poll(m1, m2, request);
-}
-
-/** Dispatch an AIO request to the kernel.
-@param[in,out]	slot		an already reserved slot
-@return true on success. */
-bool
-AIO::linux_dispatch(Slot* slot)
-{
-	ut_a(slot->is_reserved);
-	ut_ad(slot->type.validate());
-
-	/* Find out what we are going to work with.
-	The iocb struct is directly in the slot.
-	The io_context_t is one per segment. */
-
-	ulint		io_ctx_index;
-	struct iocb*	iocb = &slot->control;
-
-	io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
-
-	ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
-	     == 0);
-
-	int	ret = io_submit(io_ctx(io_ctx_index), 1, &iocb);
-	ut_a(ret != -EINVAL);
-
-	/* io_submit() returns number of successfully queued requests
-	or -errno. */
-
-	if (ret != 1) {
-		errno = -ret;
-	}
-
-	return(ret == 1);
-}
-
-/** Creates an io_context_t for native linux AIO.
-@param[in]	max_events	number of events
-@param[out]	io_ctx		io_ctx to initialize.
-@return true on success. */
-bool
-AIO::linux_create_io_ctx(
-	unsigned	max_events,
-	io_context_t&	io_ctx)
-{
-	ssize_t		n_retries = 0;
-
-	for (;;) {
-
-		memset(&io_ctx, 0x0, sizeof(io_ctx));
-
-		/* Initialize the io_ctx. Tell it how many pending
-		IO requests this context will handle. */
-
-		int	ret = io_setup(max_events, &io_ctx);
-		ut_a(ret != -EINVAL);
-
-		if (ret == 0) {
-			/* Success. Return now. */
-			return(true);
-		}
-
-		/* If we hit EAGAIN we'll make a few attempts before failing. */
-
-		switch (ret) {
-		case -EAGAIN:
-			if (n_retries == 0) {
-				/* First time around. */
-				ib::warn()
-					<< "io_setup() failed with EAGAIN."
-					" Will make "
-					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
-					<< " attempts before giving up.";
-			}
-
-			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
-
-				++n_retries;
-
-				ib::warn()
-					<< "io_setup() attempt "
-					<< n_retries << ".";
-
-				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
-
-				continue;
-			}
-
-			/* Have tried enough. Better call it a day. */
-			ib::warn()
-				<< "io_setup() failed with EAGAIN after "
-				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
-				<< " attempts.";
-			break;
-
-		case -ENOSYS:
-			ib::warn()
-				<< "Linux Native AIO interface"
-				" is not supported on this platform. Please"
-				" check your OS documentation and install"
-				" appropriate binary of InnoDB.";
-
-			break;
-
-		default:
-			ib::warn()
-				<< "Linux Native AIO setup"
-				<< " returned following error["
-				<< ret << "]";
-			break;
-		}
-
-		ib::info()
-			<< "You can disable Linux Native AIO by"
-			" setting innodb_use_native_aio = 0 in my.cnf";
-
-		break;
-	}
-
-	return(false);
-}
-
-/** Checks if the system supports native linux aio. On some kernel
-versions where native aio is supported it won't work on tmpfs. In such
-cases we can't use native aio as it is not possible to mix simulated
-and native aio.
-@return: true if supported, false otherwise. */
-bool
-AIO::is_linux_native_aio_supported()
-{
-	int		fd;
-	io_context_t	io_ctx;
-	char		name[1000];
-
-	if (!linux_create_io_ctx(1, io_ctx)) {
-
-		/* The platform does not support native aio. */
-
-		return(false);
-
-	} else if (!srv_read_only_mode) {
-
-		/* Now check if tmpdir supports native aio ops. */
-		fd = innobase_mysql_tmpfile(NULL);
-
-		if (fd < 0) {
-			ib::warn()
-				<< "Unable to create temp file to check"
-				" native AIO support.";
-
-			int ret = io_destroy(io_ctx);
-			ut_a(ret != -EINVAL);
-			ut_ad(ret != -EFAULT);
-
-			return(false);
-		}
-	} else {
-
-		os_normalize_path(srv_log_group_home_dir);
-
-		ulint	dirnamelen = strlen(srv_log_group_home_dir);
-
-		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
-
-		memcpy(name, srv_log_group_home_dir, dirnamelen);
-
-		/* Add a path separator if needed. */
-		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
-
-			name[dirnamelen++] = OS_PATH_SEPARATOR;
-		}
-
-		strcpy(name + dirnamelen, "ib_logfile0");
-
-		fd = open(name, O_RDONLY | O_CLOEXEC);
-
-		if (fd == -1) {
-
-			ib::warn()
-				<< "Unable to open"
-				<< " \"" << name << "\" to check native"
-				<< " AIO read support.";
-
-			int ret = io_destroy(io_ctx);
-			ut_a(ret != EINVAL);
-			ut_ad(ret != EFAULT);
-
-			return(false);
-		}
-	}
-
-	struct io_event	io_event;
-
-	memset(&io_event, 0x0, sizeof(io_event));
-
-	byte*	buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2));
-	byte*	ptr = static_cast<byte*>(ut_align(buf, srv_page_size));
-
-	struct iocb	iocb;
-
-	/* Suppress valgrind warning. */
-	memset(buf, 0x00, srv_page_size * 2);
-	memset(&iocb, 0x0, sizeof(iocb));
-
-	struct iocb*	p_iocb = &iocb;
-
-	if (!srv_read_only_mode) {
-
-		io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
-
-	} else {
-		ut_a(srv_page_size >= 4096);
-		io_prep_pread(p_iocb, fd, ptr, srv_page_size, 0);
-	}
-
-	ut_a(reinterpret_cast<size_t>(p_iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
-	     == 0);
-	int	err = io_submit(io_ctx, 1, &p_iocb);
-	ut_a(err != -EINVAL);
-
-	if (err >= 1) {
-		/* Now collect the submitted IO request. */
-		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
-		ut_a(err != -EINVAL);
-	}
-
-	ut_free(buf);
-	close(fd);
-
-	switch (err) {
-	case 1:
-		{
-			int ret = io_destroy(io_ctx);
-			ut_a(ret != -EINVAL);
-			ut_ad(ret != -EFAULT);
-
-			return(true);
-		}
-
-	case -EINVAL:
-	case -ENOSYS:
-		ib::error()
-			<< "Linux Native AIO not supported. You can either"
-			" move "
-			<< (srv_read_only_mode ? name : "tmpdir")
-			<< " to a file system that supports native"
-			" AIO or you can set innodb_use_native_aio to"
-			" FALSE to avoid this message.";
-
-		/* fall through. */
-	default:
-		ib::error()
-			<< "Linux Native AIO check on "
-			<< (srv_read_only_mode ? name : "tmpdir")
-			<< "returned error[" << -err << "]";
-	}
-
-	int ret = io_destroy(io_ctx);
-	ut_a(ret != -EINVAL);
-	ut_ad(ret != -EFAULT);
-
-	return(false);
-}
-
-#endif /* LINUX_NATIVE_AIO */
 
 /** Retrieves the last error number if an error occurs in a file io function.
 The number should be retrieved before any other OS calls (because they may
 overwrite the error number). If the number is not known to this program,
-the OS error number + OS_FILE_ERROR_MAX is returned.
+the OS error number + OS_FILE_ERROR_MAX is returned.
 @param[in]	report_all_errors	true if we want an error message
 					printed of all errors
 @param[in]	on_error_silent		true then don't print any diagnostic
 					to the log
-@return error number, or OS error number + OS_FILE_ERROR_MAX */
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
 static
 ulint
 os_file_get_last_error_low(
@@ -2383,55 +875,53 @@ os_file_get_last_error_low(
 	return(OS_FILE_ERROR_MAX + err);
 }
 
-/** Wrapper to fsync(2) that retries the call on some errors.
+/** Wrapper to fsync() or fdatasync() that retries the call on some errors.
 Returns the value 0 if successful; otherwise the value -1 is returned and
 the global variable errno is set to indicate the error.
 @param[in]	file		open file handle
 @return 0 if success, -1 otherwise */
-static
-int
-os_file_fsync_posix(
-	os_file_t	file)
+static int os_file_sync_posix(os_file_t file)
 {
-	ulint		failures = 0;
-
-	for (;;) {
-
-		++os_n_fsyncs;
+#if !defined(HAVE_FDATASYNC) || HAVE_DECL_FDATASYNC == 0
+  auto func= fsync;
+  auto func_name= "fsync()";
+#else
+  auto func= fdatasync;
+  auto func_name= "fdatasync()";
+#endif
 
-		int	ret = fsync(file);
+  ulint failures= 0;
 
-		if (ret == 0) {
-			return(ret);
-		}
-
-		switch(errno) {
-		case ENOLCK:
+  for (;;)
+  {
+    ++os_n_fsyncs;
 
-			++failures;
-			ut_a(failures < 1000);
+    int ret= func(file);
 
-			if (!(failures % 100)) {
+    if (ret == 0)
+      return ret;
 
-				ib::warn()
-					<< "fsync(): "
-					<< "No locks available; retrying";
-			}
+    switch (errno)
+    {
+    case ENOLCK:
+      ++failures;
+      ut_a(failures < 1000);
 
-			/* 0.2 sec */
-			os_thread_sleep(200000);
-			break;
+      if (!(failures % 100))
+        ib::warn() << func_name << ": No locks available; retrying";
 
-		case EINTR:
+      std::this_thread::sleep_for(std::chrono::milliseconds(200));
+      break;
 
-			++failures;
-			ut_a(failures < 2000);
-			break;
+    case EINTR:
+      ++failures;
+      ut_a(failures < 2000);
+      break;
 
-		default:
-			ib::fatal() << "fsync() returned " << errno;
-		}
-	}
+    default:
+      ib::fatal() << func_name << " returned " << errno;
+    }
+  }
 }
 
 /** Check the existence and type of the given file.
@@ -2454,7 +944,7 @@ os_file_status_posix(
 
 	if (!ret) {
 		/* file exists, everything OK */
-
+		MSAN_STAT_WORKAROUND(&statinfo);
 	} else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
 		/* file does not exist */
 		return(true);
@@ -2491,7 +981,7 @@ os_file_flush_func(
 {
 	int	ret;
 
-	ret = os_file_fsync_posix(file);
+	ret = os_file_sync_posix(file);
 
 	if (ret == 0) {
 		return(true);
@@ -2758,18 +1248,17 @@ os_file_create_func(
 
 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
 
-#ifdef O_SYNC
-	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
-	O_SYNC because the datasync options seemed to corrupt files in 2001
-	in both Linux and Solaris */
+	/* We let O_DSYNC only affect log files */
 
 	if (!read_only
 	    && type == OS_LOG_FILE
 	    && srv_file_flush_method == SRV_O_DSYNC) {
-
+#ifdef O_DSYNC
+		create_flag |= O_DSYNC;
+#else
 		create_flag |= O_SYNC;
+#endif
 	}
-#endif /* O_SYNC */
 
 	os_file_t	file;
 	bool		retry;
@@ -2801,8 +1290,8 @@ os_file_create_func(
 	/* We disable OS caching (O_DIRECT) only on data files */
 	if (!read_only
 	    && *success
-	    && (type != OS_LOG_FILE
-		&& type != OS_DATA_FILE_NO_O_DIRECT)
+	    && type != OS_LOG_FILE
+	    && type != OS_DATA_FILE_NO_O_DIRECT
 	    && (srv_file_flush_method == SRV_O_DIRECT
 		|| srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
 
@@ -3020,19 +1509,15 @@ Closes a file handle. In case of error, error number can be retrieved with
 os_file_get_last_error.
 @param[in]	file		Handle to close
 @return true if success */
-bool
-os_file_close_func(
-	os_file_t	file)
+bool os_file_close_func(os_file_t file)
 {
-	int	ret = close(file);
+  int ret= close(file);
 
-	if (ret == -1) {
-		os_file_handle_error(NULL, "close");
+  if (!ret)
+    return true;
 
-		return(false);
-	}
-
-	return(true);
+  os_file_handle_error(NULL, "close");
+  return false;
 }
 
 /** Gets a file size.
@@ -3041,8 +1526,10 @@ os_file_close_func(
 os_offset_t
 os_file_get_size(os_file_t file)
 {
-	struct stat statbuf;
-	return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
+  struct stat statbuf;
+  if (fstat(file, &statbuf)) return os_offset_t(-1);
+  MSAN_STAT_WORKAROUND(&statbuf);
+  return statbuf.st_size;
 }
 
 /** Gets a file size.
@@ -3059,6 +1546,7 @@ os_file_get_size(
 	int	ret = stat(filename, &s);
 
 	if (ret == 0) {
+		MSAN_STAT_WORKAROUND(&s);
 		file_size.m_total_size = s.st_size;
 		/* st_blocks is in 512 byte sized blocks */
 		file_size.m_alloc_size = s.st_blocks * 512;
@@ -3103,6 +1591,8 @@ os_file_get_status_posix(
 		return(DB_FAIL);
 	}
 
+	MSAN_STAT_WORKAROUND(statinfo);
+
 	switch (statinfo->st_mode & S_IFMT) {
 	case S_IFDIR:
 		stat_info->type = OS_FILE_TYPE_DIR;
@@ -3182,127 +1672,6 @@ os_file_set_eof(
 
 #include <WinIoCtl.h>
 
-/*
-Windows : Handling synchronous IO on files opened asynchronously.
-
-If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
-a completion port, then every IO on this file would normally be enqueued to the
-completion port. Sometimes however we would like to do a synchronous IO. This is
-possible if we initialitze have overlapped.hEvent with a valid event and set its
-lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)
-
-We'll create this special event once for each thread and store in thread local
-storage.
-*/
-
-
-static void __stdcall win_free_syncio_event(void *data) {
-	if (data) {
-		CloseHandle((HANDLE)data);
-	}
-}
-
-
-/*
-Retrieve per-thread event for doing synchronous io on asyncronously opened files
-*/
-static HANDLE win_get_syncio_event()
-{
-	HANDLE h;
-
-	h = (HANDLE)FlsGetValue(fls_sync_io);
-	if (h) {
-		return h;
-	}
-	h = CreateEventA(NULL, FALSE, FALSE, NULL);
-	ut_a(h);
-	/* Set low-order bit to keeps I/O completion from being queued */
-	h = (HANDLE)((uintptr_t)h | 1);
-	FlsSetValue(fls_sync_io, h);
-	return h;
-}
-
-
-/** Do the read/write
-@param[in]	request	The IO context and type
-@return the number of bytes read/written or negative value on error */
-ssize_t
-SyncFileIO::execute(const IORequest& request)
-{
-	OVERLAPPED	seek;
-
-	memset(&seek, 0x0, sizeof(seek));
-
-	seek.hEvent = win_get_syncio_event();
-	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
-	seek.OffsetHigh = (DWORD) (m_offset >> 32);
-
-	BOOL	ret;
-	DWORD	n_bytes;
-
-	if (request.is_read()) {
-		ret = ReadFile(m_fh, m_buf,
-			static_cast<DWORD>(m_n), NULL, &seek);
-
-	} else {
-		ut_ad(request.is_write());
-		ret = WriteFile(m_fh, m_buf,
-			static_cast<DWORD>(m_n), NULL, &seek);
-	}
-	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
-		/* Wait for async io to complete */
-		ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
-	}
-
-	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
-}
-
-/** Do the read/write
-@param[in,out]	slot	The IO slot, it has the IO context
-@return the number of bytes read/written or negative value on error */
-ssize_t
-SyncFileIO::execute(Slot* slot)
-{
-	BOOL	ret;
-	slot->control.hEvent = win_get_syncio_event();
-	if (slot->type.is_read()) {
-
-		ret = ReadFile(
-			slot->file, slot->ptr, slot->len,
-			NULL, &slot->control);
-
-	} else {
-		ut_ad(slot->type.is_write());
-
-		ret = WriteFile(
-			slot->file, slot->ptr, slot->len,
-			NULL, &slot->control);
-
-	}
-	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
-		/* Wait for async io to complete */
-		ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE);
-	}
-
-	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
-}
-
-/* Startup/shutdown */
-
-struct WinIoInit
-{
-	WinIoInit() {
-		fls_sync_io = FlsAlloc(win_free_syncio_event);
-		ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
-	}
-
-	~WinIoInit() {
-		FlsFree(fls_sync_io);
-	}
-};
-
-/* Ensures proper initialization and shutdown */
-static WinIoInit win_io_init;
 
 
 /** Free storage space associated with a section of the file.
@@ -3377,50 +1746,67 @@ os_file_status_win32(
 	return(true);
 }
 
+/* Dynamically load NtFlushBuffersFileEx, used in os_file_flush_func */
+#include <winternl.h>
+typedef NTSTATUS(WINAPI* pNtFlushBuffersFileEx)(
+  HANDLE FileHandle, ULONG Flags, PVOID Parameters, ULONG ParametersSize,
+  PIO_STATUS_BLOCK IoStatusBlock);
+
+static pNtFlushBuffersFileEx my_NtFlushBuffersFileEx
+  = (pNtFlushBuffersFileEx)GetProcAddress(GetModuleHandle("ntdll"),
+    "NtFlushBuffersFileEx");
+
 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
 function!
 Flushes the write buffers of a given file to the disk.
 @param[in]	file		handle to a file
 @return true if success */
-bool
-os_file_flush_func(
-	os_file_t	file)
+bool os_file_flush_func(os_file_t file)
 {
-	++os_n_fsyncs;
+  ++os_n_fsyncs;
+  static bool disable_datasync;
 
-	BOOL	ret = FlushFileBuffers(file);
+  if (my_NtFlushBuffersFileEx && !disable_datasync)
+  {
+    IO_STATUS_BLOCK iosb{};
+    NTSTATUS status= my_NtFlushBuffersFileEx(
+        file, FLUSH_FLAGS_FILE_DATA_SYNC_ONLY, nullptr, 0, &iosb);
+    if (!status)
+      return true;
+    /*
+      NtFlushBuffersFileEx(FLUSH_FLAGS_FILE_DATA_SYNC_ONLY) might fail
+      before Windows 10, and maybe on non-NTFS. Fall back to FlushFileBuffers().
+    */
+    disable_datasync= true;
+  }
 
-	if (ret) {
-		return(true);
-	}
+  if (FlushFileBuffers(file))
+    return true;
 
-	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
-	actually a raw device, we choose to ignore that error if we are using
-	raw disks */
+  /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+  actually a raw device, we choose to ignore that error if we are using
+  raw disks */
+  if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION)
+    return true;
 
-	if (srv_start_raw_disk_in_use && GetLastError()
-	    == ERROR_INVALID_FUNCTION) {
-		return(true);
-	}
+  os_file_handle_error(nullptr, "flush");
 
-	os_file_handle_error(NULL, "flush");
+  /* It is a fatal error if a file flush does not succeed, because then
+  the database can get corrupt on disk */
+  ut_error;
 
-	/* It is a fatal error if a file flush does not succeed, because then
-	the database can get corrupt on disk */
-	ut_error;
-
-	return(false);
+  return false;
 }
 
 /** Retrieves the last error number if an error occurs in a file io function.
 The number should be retrieved before any other OS calls (because they may
 overwrite the error number). If the number is not known to this program,
-the OS error number + 100 is returned.
+then OS error number + OS_FILE_ERROR_MAX is returned.
 @param[in]	report_all_errors	true if we want an error message printed
 					of all errors
 @param[in]	on_error_silent		true then don't print any diagnostic
 					to the log
-@return error number, or OS error number + 100 */
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
 static
 ulint
 os_file_get_last_error_low(
@@ -3622,8 +2008,8 @@ os_file_create_simple_func(
 
 		file = CreateFile(
 			(LPCTSTR) name, access,
-			FILE_SHARE_READ | FILE_SHARE_DELETE, NULL,
-			create_flag, attributes, NULL);
+			FILE_SHARE_READ | FILE_SHARE_DELETE,
+			NULL, create_flag, attributes, NULL);
 
 		if (file == INVALID_HANDLE_VALUE) {
 
@@ -3742,7 +2128,7 @@ os_file_create_func(
 	);
 
 	DWORD		create_flag;
-	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
+	DWORD		share_mode = read_only
 		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
 		: FILE_SHARE_READ | FILE_SHARE_DELETE;
 
@@ -3758,14 +2144,14 @@ os_file_create_func(
 
 		ut_a(!read_only);
 
-		create_flag = OPEN_EXISTING;
-
 		/* On Windows Physical devices require admin privileges and
 		have to have the write-share mode set. See the remarks
 		section for the CreateFile() function documentation in MSDN. */
 
 		share_mode |= FILE_SHARE_WRITE;
 
+		create_flag = OPEN_EXISTING;
+
 	} else if (create_mode == OS_FILE_OPEN
 		   || create_mode == OS_FILE_OPEN_RETRY) {
 
@@ -3824,20 +2210,19 @@ os_file_create_func(
 
 	switch (srv_file_flush_method)
 	{
-	case SRV_O_DSYNC: 
+	case SRV_O_DSYNC:
 		if (type == OS_LOG_FILE) {
-			/* Map O_SYNC to FILE_WRITE_THROUGH */
+			/* Map O_DSYNC to FILE_FLAG_WRITE_THROUGH */
 			attributes |= FILE_FLAG_WRITE_THROUGH;
 		}
 		break;
 
 	case SRV_O_DIRECT_NO_FSYNC:
 	case SRV_O_DIRECT:
-		if (type == OS_DATA_FILE) {
-			attributes |= FILE_FLAG_NO_BUFFERING;
+		if (type != OS_DATA_FILE) {
+			break;
 		}
-		break;
-
+		/* fall through */
 	case SRV_ALL_O_DIRECT_FSYNC:
 		/*Traditional Windows behavior, no buffering for any files.*/
 		if (type != OS_DATA_FILE_NO_O_DIRECT) {
@@ -3916,18 +2301,9 @@ os_file_create_func(
 		}
 	}
 
-	if (*success && srv_use_native_aio &&  (attributes & FILE_FLAG_OVERLAPPED)) {
-		/* Bind the file handle to completion port. Completion port
-		might not be created yet, in some stages of backup, but
-		must always be there for the server.*/
-		HANDLE port = (type == OS_LOG_FILE) ?
-			log_completion_port : data_completion_port;
-		ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
-		if (port) {
-			ut_a(CreateIoCompletionPort(file, port, 0, 0));
-		}
+	if (*success &&  (attributes & FILE_FLAG_OVERLAPPED) && srv_thread_pool) {
+		srv_thread_pool->bind(file);
 	}
-
 	return(file);
 }
 
@@ -3958,7 +2334,7 @@ os_file_create_simple_no_error_handling_func(
 	DWORD		access;
 	DWORD		create_flag;
 	DWORD		attributes	= 0;
-	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
+	DWORD		share_mode = read_only
 		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
 		: FILE_SHARE_READ | FILE_SHARE_DELETE;
 
@@ -4011,6 +2387,7 @@ os_file_create_simple_no_error_handling_func(
 
 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
 			| FILE_SHARE_READ;
+
 	} else {
 
 		ib::error()
@@ -4183,19 +2560,18 @@ Closes a file handle. In case of error, error number can be retrieved with
 os_file_get_last_error.
 @param[in,own]	file		Handle to a file
 @return true if success */
-bool
-os_file_close_func(
-	os_file_t	file)
+bool os_file_close_func(os_file_t file)
 {
-	ut_a(file);
-
-	if (CloseHandle(file)) {
-		return(true);
-	}
-
-	os_file_handle_error(NULL, "close");
+  ut_ad(file);
+  if (!CloseHandle(file))
+  {
+    os_file_handle_error(NULL, "close");
+    return false;
+  }
 
-	return(false);
+  if(srv_thread_pool)
+    srv_thread_pool->unbind(file);
+  return true;
 }
 
 /** Gets a file size.
@@ -4407,51 +2783,9 @@ os_file_set_eof(
 	return(SetEndOfFile(h));
 }
 
-/** This function can be called if one wants to post a batch of reads and
-prefers an i/o-handler thread to handle them all at once later. You must
-call os_aio_simulated_wake_handler_threads later to ensure the threads
-are not left sleeping! */
-void
-os_aio_simulated_put_read_threads_to_sleep()
-{
-	AIO::simulated_put_read_threads_to_sleep();
-}
-
-/** This function can be called if one wants to post a batch of reads and
-prefers an i/o-handler thread to handle them all at once later. You must
-call os_aio_simulated_wake_handler_threads later to ensure the threads
-are not left sleeping! */
-void
-AIO::simulated_put_read_threads_to_sleep()
-{
-	/* The idea of putting background IO threads to sleep is only for
-	Windows when using simulated AIO. Windows XP seems to schedule
-	background threads too eagerly to allow for coalescing during
-	readahead requests. */
-
-	if (srv_use_native_aio) {
-		/* We do not use simulated AIO: do nothing */
-
-		return;
-	}
-
-	os_aio_recommend_sleep_for_read_threads	= true;
-
-	for (ulint i = 0; i < os_aio_n_segments; i++) {
-		AIO*	array;
-
-		get_array_and_local_segment(&array, i);
-
-		if (array == s_reads) {
-
-			os_event_reset(os_aio_segment_wait_events[i]);
-		}
-	}
-}
-
 #endif /* !_WIN32*/
 
-/** Does a syncronous read or write depending upon the type specified
+/** Does a synchronous read or write depending upon the type specified
 In case of partial reads/writes the function tries
 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
 @param[in]	type,		IO flags
@@ -4490,15 +2824,7 @@ os_file_io(
 
 			bytes_returned += n_bytes;
 
-			if (offset > 0
-			    && !type.is_log()
-			    && type.is_write()
-			    && type.punch_hole()) {
-				*err = type.punch_hole(file, offset, n);
-
-			} else {
-				*err = DB_SUCCESS;
-			}
+			*err = type.maybe_punch_hole(offset, n);
 
 			return(original_n);
 		}
@@ -4509,8 +2835,7 @@ os_file_io(
 
 		bytes_returned += n_bytes;
 
-		if (!type.is_partial_io_warning_disabled()) {
-
+		if (type.type != IORequest::READ_MAYBE_PARTIAL) {
 			const char*	op = type.is_read()
 				? "read" : "written";
 
@@ -4528,7 +2853,7 @@ os_file_io(
 
 	*err = DB_IO_ERROR;
 
-	if (!type.is_partial_io_warning_disabled()) {
+	if (type.type != IORequest::READ_MAYBE_PARTIAL) {
 		ib::warn()
 			<< "Retry attempts for "
 			<< (type.is_read() ? "reading" : "writing")
@@ -4556,7 +2881,6 @@ os_file_pwrite(
 	os_offset_t		offset,
 	dberr_t*		err)
 {
-	ut_ad(type.validate());
 	ut_ad(type.is_write());
 
 	++os_n_file_writes;
@@ -4590,7 +2914,6 @@ os_file_write_func(
 {
 	dberr_t		err;
 
-	ut_ad(type.validate());
 	ut_ad(n > 0);
 
 	ssize_t	n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
@@ -4667,7 +2990,7 @@ static MY_ATTRIBUTE((warn_unused_result))
 dberr_t
 os_file_read_page(
 	const IORequest&	type,
-	os_file_t		file,
+	os_file_t	file,
 	void*			buf,
 	os_offset_t		offset,
 	ulint			n,
@@ -4678,7 +3001,6 @@ os_file_read_page(
 
 	os_bytes_read_since_printout += n;
 
-	ut_ad(type.validate());
 	ut_ad(n > 0);
 
 	ssize_t	n_bytes = os_file_pread(type, file, buf, n, offset, &err);
@@ -4690,17 +3012,19 @@ os_file_read_page(
 	if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
 		return err;
 	}
-
-	ib::error() << "Tried to read " << n << " bytes at offset "
-		    << offset << ", but was only able to read " << n_bytes;
+	int os_err = IF_WIN((int)GetLastError(), errno);
 
 	if (!os_file_handle_error_cond_exit(
 		    NULL, "read", exit_on_err, false)) {
 		ib::fatal()
-			<< "Cannot read from file. OS error number "
-			<< errno << ".";
+			<< "Tried to read " << n << " bytes at offset "
+			<< offset << ", but was only able to read " << n_bytes
+			<< ". Cannot read from file. OS error number "
+			<< os_err << ".";
+	} else {
+		ib::error() << "Tried to read " << n << " bytes at offset "
+		<< offset << ", but was only able to read " << n_bytes;
 	}
-
 	if (err == DB_SUCCESS) {
 		err = DB_IO_ERROR;
 	}
@@ -4875,13 +3199,6 @@ short_warning:
 @return true if the file system supports sparse files */
 IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
 {
-	/* In this debugging mode, we act as if punch hole is supported,
-	then we skip any calls to actually punch a hole.  In this way,
-	Transparent Page Compression is still being tested. */
-	DBUG_EXECUTE_IF("ignore_punch_hole",
-		return(true);
-	);
-
 #ifdef _WIN32
 	FILE_ATTRIBUTE_TAG_INFO info;
 	if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
@@ -4959,6 +3276,7 @@ fallback:
 		if (fstat(file, &statbuf)) {
 			err = errno;
 		} else {
+			MSAN_STAT_WORKAROUND(&statbuf);
 			os_offset_t current_size = statbuf.st_size;
 			if (current_size >= size) {
 				return true;
@@ -5015,12 +3333,8 @@ fallback:
 		<< srv_page_size_shift;
 
 	/* Align the buffer for possible raw i/o */
-	byte*	buf2;
-
-	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size));
-
-	byte*	buf = static_cast<byte*>(ut_align(buf2, srv_page_size));
-
+	byte*	buf = static_cast<byte*>(aligned_malloc(buf_size,
+							srv_page_size));
 	/* Write buffer full of zeros */
 	memset(buf, 0, buf_size);
 
@@ -5034,20 +3348,16 @@ fallback:
 			n_bytes = buf_size;
 		}
 
-		dberr_t		err;
-		IORequest	request(IORequest::WRITE);
-
-		err = os_file_write(
-			request, name, file, buf, current_size, n_bytes);
-
-		if (err != DB_SUCCESS) {
+		if (os_file_write(IORequestWrite, name,
+				  file, buf, current_size, n_bytes) !=
+		    DB_SUCCESS) {
 			break;
 		}
 
 		current_size += n_bytes;
 	}
 
-	ut_free(buf2);
+	aligned_free(buf);
 
 	return(current_size >= size && os_file_flush(file));
 }
@@ -5118,7 +3428,7 @@ Requests a synchronous positioned read operation.
 dberr_t
 os_file_read_no_error_handling_func(
 	const IORequest&	type,
-	os_file_t		file,
+	os_file_t	file,
 	void*			buf,
 	os_offset_t		offset,
 	ulint			n,
@@ -5163,27 +3473,13 @@ os_file_punch_hole(
 #endif /* _WIN32 */
 }
 
-inline bool IORequest::should_punch_hole() const
-{
-	return m_fil_node && m_fil_node->space->punch_hole;
-}
-
 /** Free storage space associated with a section of the file.
-@param[in]	fh		Open file handle
-@param[in]	off		Starting offset (SEEK_SET)
-@param[in]	len		Size of the hole
+@param off   byte offset from the start (SEEK_SET)
+@param len   size of the hole in bytes
 @return DB_SUCCESS or error code */
-dberr_t
-IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
+dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
 {
-	/* In this debugging mode, we act as if punch hole is supported,
-	and then skip any calls to actually punch a hole here.
-	In this way, Transparent Page Compression is still being tested. */
-	DBUG_EXECUTE_IF("ignore_punch_hole",
-		return(DB_SUCCESS);
-	);
-
-	ulint trim_len = get_trim_length(len);
+	ulint trim_len = bpage ? bpage->physical_size() - len : 0;
 
 	if (trim_len == 0) {
 		return(DB_SUCCESS);
@@ -5193,11 +3489,11 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
 
 	/* Check does file system support punching holes for this
 	tablespace. */
-	if (!should_punch_hole()) {
+	if (!node->space->punch_hole) {
 		return DB_IO_NO_PUNCH_HOLE;
 	}
 
-	dberr_t err = os_file_punch_hole(fh, off, trim_len);
+	dberr_t err = os_file_punch_hole(node->handle, off, trim_len);
 
 	if (err == DB_SUCCESS) {
 		srv_stats.page_compressed_trim_op.inc();
@@ -5205,9 +3501,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
 		/* If punch hole is not supported,
 		set space so that it is not used. */
 		if (err == DB_IO_NO_PUNCH_HOLE) {
-			if (m_fil_node) {
-				m_fil_node->space->punch_hole = false;
-			}
+			node->space->punch_hole = false;
 			err = DB_SUCCESS;
 		}
 	}
@@ -5284,1809 +3578,310 @@ os_file_get_status(
 	return(ret);
 }
 
-/**
-Waits for an AIO operation to complete. This function is used to wait the
-for completed requests. The aio array of pending requests is divided
-into segments. The thread specifies which segment or slot it wants to wait
-for. NOTE: this function will also take care of freeing the aio slot,
-therefore no other thread is allowed to do the freeing!
-@param[in]	segment		The number of the segment in the aio arrays to
-				wait for; segment 0 is the ibuf I/O thread,
-				segment 1 the log I/O thread, then follow the
-				non-ibuf read threads, and as the last are the
-				non-ibuf write threads; if this is
-				ULINT_UNDEFINED, then it means that sync AIO
-				is used, and this parameter is ignored
-@param[out]	m1		the messages passed with the AIO request; note
-				that also in the case where the AIO operation
-				failed, these output parameters are valid and
-				can be used to restart the operation,
-				for example
-@param[out]	m2		callback message
-@param[out]	type		OS_FILE_WRITE or ..._READ
-@return DB_SUCCESS or error code */
-dberr_t
-os_aio_handler(
-	ulint		segment,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	request)
-{
-	dberr_t	err;
-
-	if (srv_use_native_aio) {
-		srv_set_io_thread_op_info(segment, "native aio handle");
-
-#ifdef WIN_ASYNC_IO
-
-		err = os_aio_windows_handler(segment, 0, m1, m2, request);
-
-#elif defined(LINUX_NATIVE_AIO)
-
-		err = os_aio_linux_handler(segment, m1, m2, request);
-
-#else
-		ut_error;
-
-		err = DB_ERROR; /* Eliminate compiler warning */
-
-#endif /* WIN_ASYNC_IO */
-
-	} else {
-		srv_set_io_thread_op_info(segment, "simulated aio handle");
-
-		err = os_aio_simulated_handler(segment, m1, m2, request);
-	}
-
-	return(err);
-}
-
-#ifdef WIN_ASYNC_IO
-static HANDLE new_completion_port()
-{
-	HANDLE h = CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0);
-	ut_a(h);
-	return h;
-}
-#endif
-
-/** Constructor
-@param[in]	id		The latch ID
-@param[in]	n		Number of AIO slots
-@param[in]	segments	Number of segments */
-AIO::AIO(
-	latch_id_t	id,
-	ulint		n,
-	ulint		segments)
-	:
-	m_slots(n),
-	m_n_segments(segments),
-	m_n_reserved()
-# ifdef LINUX_NATIVE_AIO
-	,m_events(m_slots.size())
-# endif /* LINUX_NATIVE_AIO */
-#ifdef WIN_ASYNC_IO
-	,m_completion_port(new_completion_port())
-#endif
-{
-	ut_a(n > 0);
-	ut_a(m_n_segments > 0);
-
-	mutex_create(id, &m_mutex);
-
-	m_not_full = os_event_create("aio_not_full");
-	m_is_empty = os_event_create("aio_is_empty");
 
-	memset((void*)&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size());
-#ifdef LINUX_NATIVE_AIO
-	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
-#endif /* LINUX_NATIVE_AIO */
+extern void fil_aio_callback(const IORequest &request);
 
-	os_event_set(m_is_empty);
-}
-
-/** Initialise the slots */
-dberr_t
-AIO::init_slots()
+static void io_callback(tpool::aiocb* cb)
 {
-	for (ulint i = 0; i < m_slots.size(); ++i) {
-		Slot&	slot = m_slots[i];
-
-		slot.pos = static_cast<uint16_t>(i);
-
-		slot.is_reserved = false;
-
-#ifdef WIN_ASYNC_IO
-
-		slot.array = this;
-
-#elif defined(LINUX_NATIVE_AIO)
-
-		slot.ret = 0;
-
-		slot.n_bytes = 0;
-
-		memset(&slot.control, 0x0, sizeof(slot.control));
-
-#endif /* WIN_ASYNC_IO */
-	}
-
-	return(DB_SUCCESS);
-}
-
-#ifdef LINUX_NATIVE_AIO
-/** Initialise the Linux Native AIO interface */
-dberr_t
-AIO::init_linux_native_aio()
-{
-
-	/* Initialize the io_context_t array. One io_context_t
-	per segment in the array. */
-	m_aio_ctx.resize(get_n_segments());
-
-	ulint		max_events = slots_per_segment();
-
-	for (std::vector<io_context_t>::iterator it = m_aio_ctx.begin(),
-						 end = m_aio_ctx.end();
-	     it != end; ++it) {
-
-		if (!linux_create_io_ctx(max_events, *it)) {
-			/* If something bad happened during aio setup
-			we disable linux native aio.
-			This frequently happens when running the test suite
-			with many threads on a system with low fs.aio-max-nr!
-			*/
-
-			ib::warn()
-				<< "Warning: Linux Native AIO disabled "
-				<< "because _linux_create_io_ctx() "
-				<< "failed. To get rid of this warning you can "
-				<< "try increasing system "
-				<< "fs.aio-max-nr to 1048576 or larger or "
-				<< "setting innodb_use_native_aio = 0 in my.cnf";
-
-			for (std::vector<io_context_t>::iterator it2
-			     = m_aio_ctx.begin();
-			     it2 != it; ++it2) {
-				int ret = io_destroy(*it2);
-				ut_a(ret != -EINVAL);
-			}
-
-			m_aio_ctx.clear();
-			srv_use_native_aio = FALSE;
-			return(DB_SUCCESS);
-		}
-	}
+  ut_a(cb->m_err == DB_SUCCESS);
+  const IORequest request(*static_cast<const IORequest*>
+                          (static_cast<const void*>(cb->m_userdata)));
+  /* Return cb back to the cache */
+  if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+  {
+    ut_ad(read_slots->contains(cb));
+    read_slots->release(cb);
+  }
+  else
+  {
+    ut_ad(write_slots->contains(cb));
+    write_slots->release(cb);
+  }
 
-	return(DB_SUCCESS);
+  fil_aio_callback(request);
 }
-#endif /* LINUX_NATIVE_AIO */
 
-/** Initialise the array */
-dberr_t
-AIO::init()
-{
-	ut_a(!m_slots.empty());
-
-
-	if (srv_use_native_aio) {
 #ifdef LINUX_NATIVE_AIO
-		dberr_t	err = init_linux_native_aio();
-
-		if (err != DB_SUCCESS) {
-			return(err);
-		}
-
-#endif /* LINUX_NATIVE_AIO */
-	}
-
-	return(init_slots());
-}
-
-/** Creates an aio wait array. Note that we return NULL in case of failure.
-We don't care about freeing memory here because we assume that a
-failure will result in server refusing to start up.
-@param[in]	id		Latch ID
-@param[in]	n		maximum number of pending AIO operations
-				allowed; n must be divisible by m_n_segments
-@param[in]	n_segments	number of segments in the AIO array
-@return own: AIO array, NULL on failure */
-AIO*
-AIO::create(
-	latch_id_t	id,
-	ulint		n,
-	ulint		n_segments)
-{
-	if ((n % n_segments)) {
-
-		ib::error()
-			<< "Maximum number of AIO operations must be "
-			<< "divisible by number of segments";
-
-		return(NULL);
-	}
-
-	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
-
-	if (array != NULL && array->init() != DB_SUCCESS) {
-
-		UT_DELETE(array);
-
-		array = NULL;
-	}
-
-	return(array);
-}
-
-/** AIO destructor */
-AIO::~AIO()
-{
-	mutex_destroy(&m_mutex);
-
-	os_event_destroy(m_not_full);
-	os_event_destroy(m_is_empty);
-
-#if defined(LINUX_NATIVE_AIO)
-	if (srv_use_native_aio) {
-		for (ulint i = 0; i < m_aio_ctx.size(); i++) {
-			int ret = io_destroy(m_aio_ctx[i]);
-			ut_a(ret != -EINVAL);
-		}
-	}
-#endif /* LINUX_NATIVE_AIO */
-#if defined(WIN_ASYNC_IO)
-	CloseHandle(m_completion_port);
-#endif
-}
+/** Checks if the system supports native linux aio. On some kernel
+versions where native aio is supported it won't work on tmpfs. In such
+cases we can't use native aio.
 
-/** Initializes the asynchronous io system. Creates one array each for ibuf
-and log i/o. Also creates one array each for read and write where each
-array is divided logically into n_readers and n_writers
-respectively. The caller must create an i/o handler thread for each
-segment in these arrays. This function also creates the sync array.
-No i/o handler thread needs to be created for that
-@param[in]	n_per_seg	maximum number of pending aio
-				operations allowed per segment
-@param[in]	n_readers	number of reader threads
-@param[in]	n_writers	number of writer threads
-@param[in]	n_slots_sync	number of slots in the sync aio array
-@return true if the AIO sub-system was started successfully */
-bool
-AIO::start(
-	ulint		n_per_seg,
-	ulint		n_readers,
-	ulint		n_writers,
-	ulint		n_slots_sync)
+@return true if supported, false otherwise */
+static bool is_linux_native_aio_supported()
 {
-#if defined(LINUX_NATIVE_AIO)
-	/* Check if native aio is supported on this system and tmpfs */
-	if (srv_use_native_aio && !is_linux_native_aio_supported()) {
-
-		ib::warn() << "Linux Native AIO disabled.";
-
-		srv_use_native_aio = FALSE;
-	}
-#endif /* LINUX_NATIVE_AIO */
+	File		fd;
+	io_context_t	io_ctx;
+	std::string log_file_path = get_log_file_path();
 
-	srv_reset_io_thread_op_info();
+	memset(&io_ctx, 0, sizeof(io_ctx));
+	if (io_setup(1, &io_ctx)) {
 
-	s_reads = create(
-		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
+		/* The platform does not support native aio. */
 
-	if (s_reads == NULL) {
 		return(false);
-	}
 
-	ulint	start = srv_read_only_mode ? 0 : 2;
-	ulint	n_segs = n_readers + start;
-
-	/* 0 is the ibuf segment and 1 is the redo log segment. */
-	for (ulint i = start; i < n_segs; ++i) {
-		ut_a(i < SRV_MAX_N_IO_THREADS);
-		srv_io_thread_function[i] = "read thread";
 	}
+	else if (!srv_read_only_mode) {
 
-	ulint	n_segments = n_readers;
+		/* Now check if tmpdir supports native aio ops. */
+		fd = mysql_tmpfile("ib");
 
-	if (!srv_read_only_mode) {
+		if (fd < 0) {
+			ib::warn()
+				<< "Unable to create temp file to check"
+				" native AIO support.";
 
-		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
+			int ret = io_destroy(io_ctx);
+			ut_a(ret != -EINVAL);
+			ut_ad(ret != -EFAULT);
 
-		if (s_ibuf == NULL) {
 			return(false);
 		}
+	}
+	else {
+		fd = my_open(log_file_path.c_str(), O_RDONLY | O_CLOEXEC,
+			     MYF(0));
 
-		++n_segments;
+		if (fd == -1) {
 
-		srv_io_thread_function[0] = "insert buffer thread";
+			ib::warn() << "Unable to open \"" << log_file_path
+				   << "\" to check native"
+				   << " AIO read support.";
 
-		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
+			int ret = io_destroy(io_ctx);
+			ut_a(ret != -EINVAL);
+			ut_ad(ret != -EFAULT);
 
-		if (s_log == NULL) {
 			return(false);
 		}
-
-		++n_segments;
-
-		srv_io_thread_function[1] = "log thread";
-
-	} else {
-		s_ibuf = s_log = NULL;
 	}
 
-	s_writes = create(
-		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
+	struct io_event	io_event;
 
-	if (s_writes == NULL) {
-		return(false);
-	}
+	memset(&io_event, 0x0, sizeof(io_event));
 
-#ifdef WIN_ASYNC_IO
-	data_completion_port = s_writes->m_completion_port;
-	log_completion_port =
-		s_log ? s_log->m_completion_port : data_completion_port;
-#endif
+	byte* ptr = static_cast<byte*>(aligned_malloc(srv_page_size,
+						      srv_page_size));
 
-	n_segments += n_writers;
+	struct iocb	iocb;
 
-	for (ulint i = start + n_readers; i < n_segments; ++i) {
-		ut_a(i < SRV_MAX_N_IO_THREADS);
-		srv_io_thread_function[i] = "write thread";
-	}
+	/* Suppress valgrind warning. */
+	memset(ptr, 0, srv_page_size);
+	memset(&iocb, 0x0, sizeof(iocb));
 
-	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
+	struct iocb* p_iocb = &iocb;
 
-	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
+	if (!srv_read_only_mode) {
 
-	if (s_sync == NULL) {
+		io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
 
-		return(false);
 	}
-
-	os_aio_n_segments = n_segments;
-
-	os_aio_validate();
-
-	os_last_printout = time(NULL);
-
-	if (srv_use_native_aio) {
-		return(true);
+	else {
+		ut_a(srv_page_size >= 512);
+		io_prep_pread(p_iocb, fd, ptr, 512, 0);
 	}
 
-	os_aio_segment_wait_events = static_cast<os_event_t*>(
-		ut_zalloc_nokey(
-			n_segments * sizeof *os_aio_segment_wait_events));
-
-	if (os_aio_segment_wait_events == NULL) {
+	int	err = io_submit(io_ctx, 1, &p_iocb);
 
-		return(false);
+	if (err >= 1) {
+		/* Now collect the submitted IO request. */
+		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
 	}
 
-	for (ulint i = 0; i < n_segments; ++i) {
-		os_aio_segment_wait_events[i] = os_event_create(0);
-	}
+	aligned_free(ptr);
+	my_close(fd, MYF(MY_WME));
 
-	return(true);
-}
-
-/** Free the AIO arrays */
-void
-AIO::shutdown()
-{
-	UT_DELETE(s_ibuf);
-	s_ibuf = NULL;
-
-	UT_DELETE(s_log);
-	s_log = NULL;
-
-	UT_DELETE(s_writes);
-	s_writes = NULL;
-
-	UT_DELETE(s_sync);
-	s_sync = NULL;
-
-	UT_DELETE(s_reads);
-	s_reads = NULL;
-}
-
-/** Initializes the asynchronous io system. Creates one array each for ibuf
-and log i/o. Also creates one array each for read and write where each
-array is divided logically into n_readers and n_writers
-respectively. The caller must create an i/o handler thread for each
-segment in these arrays. This function also creates the sync array.
-No i/o handler thread needs to be created for that
-@param[in]	n_readers	number of reader threads
-@param[in]	n_writers	number of writer threads
-@param[in]	n_slots_sync	number of slots in the sync aio array */
-bool
-os_aio_init(
-	ulint		n_readers,
-	ulint		n_writers,
-	ulint		n_slots_sync)
-{
-	/* Maximum number of pending aio operations allowed per segment */
-	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
-
-	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
-}
-
-/** Frees the asynchronous io system. */
-void
-os_aio_free()
-{
-	AIO::shutdown();
-
-	ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio);
-	ut_ad(srv_use_native_aio || os_aio_segment_wait_events
-	      || !srv_was_started);
+	switch (err) {
+	case 1:
+		{
+			int ret = io_destroy(io_ctx);
+			ut_a(ret != -EINVAL);
+			ut_ad(ret != -EFAULT);
 
-	if (!srv_use_native_aio && os_aio_segment_wait_events) {
-		for (ulint i = 0; i < os_aio_n_segments; i++) {
-			os_event_destroy(os_aio_segment_wait_events[i]);
+			return(true);
 		}
 
-		ut_free(os_aio_segment_wait_events);
-		os_aio_segment_wait_events = 0;
-	}
-	os_aio_n_segments = 0;
-}
+	case -EINVAL:
+	case -ENOSYS:
+		ib::warn()
+			<< "Linux Native AIO not supported. You can either"
+			" move "
+			<< (srv_read_only_mode ? log_file_path : "tmpdir")
+			<< " to a file system that supports native"
+			" AIO or you can set innodb_use_native_aio to"
+			" FALSE to avoid this message.";
 
-/** Wakes up all async i/o threads so that they know to exit themselves in
-shutdown. */
-void
-os_aio_wake_all_threads_at_shutdown()
-{
-#ifdef WIN_ASYNC_IO
-	AIO::wake_at_shutdown();
-#elif defined(LINUX_NATIVE_AIO)
-	/* When using native AIO interface the io helper threads
-	wait on io_getevents with a timeout value of 500ms. At
-	each wake up these threads check the server status.
-	No need to do anything to wake them up. */
-#endif /* !WIN_ASYNC_AIO */
-
-	if (srv_use_native_aio) {
-		return;
+		/* fall through. */
+	default:
+		ib::warn()
+			<< "Linux Native AIO check on "
+			<< (srv_read_only_mode ? log_file_path : "tmpdir")
+			<< " returned error[" << -err << "]";
 	}
 
-	/* This loop wakes up all simulated ai/o threads */
-
-	for (ulint i = 0; i < os_aio_n_segments; ++i) {
-
-		os_event_set(os_aio_segment_wait_events[i]);
-	}
-}
+	int ret = io_destroy(io_ctx);
+	ut_a(ret != -EINVAL);
+	ut_ad(ret != -EFAULT);
 
-/** Waits until there are no pending writes in AIO::s_writes. There can
-be other, synchronous, pending writes. */
-void
-os_aio_wait_until_no_pending_writes()
-{
-	AIO::wait_until_no_pending_writes();
+	return(false);
 }
+#endif
 
-/** Calculates segment number for a slot.
-@param[in]	array		AIO wait array
-@param[in]	slot		slot in this array
-@return segment number (which is the number used by, for example,
-	I/O-handler threads) */
-ulint
-AIO::get_segment_no_from_slot(
-	const AIO*	array,
-	const Slot*	slot)
-{
-	ulint	segment;
-	ulint	seg_len;
-
-	if (array == s_ibuf) {
-		ut_ad(!srv_read_only_mode);
-
-		segment = IO_IBUF_SEGMENT;
-
-	} else if (array == s_log) {
-		ut_ad(!srv_read_only_mode);
-
-		segment = IO_LOG_SEGMENT;
-
-	} else if (array == s_reads) {
-		seg_len = s_reads->slots_per_segment();
-
-		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
-	} else {
-		ut_a(array == s_writes);
+int os_aio_init()
+{
+  int max_write_events= int(srv_n_write_io_threads *
+                            OS_AIO_N_PENDING_IOS_PER_THREAD);
+  int max_read_events= int(srv_n_read_io_threads *
+                           OS_AIO_N_PENDING_IOS_PER_THREAD);
+  int max_events= max_read_events + max_write_events;
+  int ret;
+#ifdef LINUX_NATIVE_AIO
+  if (srv_use_native_aio && !is_linux_native_aio_supported())
+    goto disable;
+#endif
 
-		seg_len = s_writes->slots_per_segment();
+  ret= srv_thread_pool->configure_aio(srv_use_native_aio, max_events);
 
-		segment = s_reads->m_n_segments
-			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
-	}
+#ifdef LINUX_NATIVE_AIO
+  if (ret)
+  {
+    ut_ad(srv_use_native_aio);
+disable:
+    ib::warn() << "Linux Native AIO disabled.";
+    srv_use_native_aio= false;
+    ret= srv_thread_pool->configure_aio(false, max_events);
+  }
+#endif
 
-	return(segment);
+  if (!ret)
+  {
+    read_slots= new io_slots(max_read_events, srv_n_read_io_threads);
+    write_slots= new io_slots(max_write_events, srv_n_write_io_threads);
+  }
+  return ret;
 }
 
-/** Requests for a slot in the aio array. If no slot is available, waits until
-not_full-event becomes signaled.
 
-@param[in]	type		IO context
-@param[in,out]	m1		message to be passed along with the AIO
-				operation
-@param[in,out]	m2		message to be passed along with the AIO
-				operation
-@param[in]	file		file handle
-@param[in]	name		name of the file or path as a NUL-terminated
-				string
-@param[in,out]	buf		buffer where to read or from which to write
-@param[in]	offset		file offset, where to read from or start writing
-@param[in]	len		length of the block to read or write
-@return pointer to slot */
-Slot*
-AIO::reserve_slot(
-	const IORequest&	type,
-	fil_node_t*		m1,
-	void*			m2,
-	pfs_os_file_t		file,
-	const char*		name,
-	void*			buf,
-	os_offset_t		offset,
-	ulint			len)
+void os_aio_free()
 {
-	ut_ad(reinterpret_cast<size_t>(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
-	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
-	ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
-
-#ifdef WIN_ASYNC_IO
-	ut_a((len & 0xFFFFFFFFUL) == len);
-#endif /* WIN_ASYNC_IO */
-
-	/* No need of a mutex. Only reading constant fields */
-	ulint		slots_per_seg;
-
-	ut_ad(type.validate());
-
-	slots_per_seg = slots_per_segment();
-
-	/* We attempt to keep adjacent blocks in the same local
-	segment. This can help in merging IO requests when we are
-	doing simulated AIO */
-	ulint		local_seg;
-
-	local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments;
-
-	for (;;) {
-
-		acquire();
-
-		if (m_n_reserved != m_slots.size()) {
-			break;
-		}
-
-		release();
-
-		if (!srv_use_native_aio) {
-			/* If the handler threads are suspended,
-			wake them so that we get more slots */
-
-			os_aio_simulated_wake_handler_threads();
-		}
-
-		os_event_wait(m_not_full);
-	}
-
-	ulint	counter = 0;
-	Slot*	slot = NULL;
-
-	/* We start our search for an available slot from our preferred
-	local segment and do a full scan of the array. We are
-	guaranteed to find a slot in full scan. */
-	for (ulint i = local_seg * slots_per_seg;
-	     counter < m_slots.size();
-	     ++i, ++counter) {
-
-		i %= m_slots.size();
-
-		slot = at(i);
-
-		if (slot->is_reserved == false) {
-			break;
-		}
-	}
-
-	/* We MUST always be able to get hold of a reserved slot. */
-	ut_a(counter < m_slots.size());
-
-	ut_a(slot->is_reserved == false);
-
-	++m_n_reserved;
-
-	if (m_n_reserved == 1) {
-		os_event_reset(m_is_empty);
-	}
-
-	if (m_n_reserved == m_slots.size()) {
-		os_event_reset(m_not_full);
-	}
-
-	slot->is_reserved = true;
-	slot->reservation_time = time(NULL);
-	slot->m1       = m1;
-	slot->m2       = m2;
-	slot->file     = file;
-	slot->name     = name;
-#ifdef _WIN32
-	slot->len      = static_cast<DWORD>(len);
-#else
-	slot->len      = len;
-#endif /* _WIN32 */
-	slot->type     = type;
-	slot->buf      = static_cast<byte*>(buf);
-	slot->ptr      = slot->buf;
-	slot->offset   = offset;
-	slot->err      = DB_SUCCESS;
-	slot->original_len = static_cast<uint32>(len);
-	slot->io_already_done = false;
-	slot->buf      = static_cast<byte*>(buf);
-
-#ifdef WIN_ASYNC_IO
-	{
-		OVERLAPPED*	control;
-
-		control = &slot->control;
-		control->Offset = (DWORD) offset & 0xFFFFFFFF;
-		control->OffsetHigh = (DWORD) (offset >> 32);
-	}
-#elif defined(LINUX_NATIVE_AIO)
-
-	/* If we are not using native AIO skip this part. */
-	if (srv_use_native_aio) {
-
-		off_t		aio_offset;
-
-		/* Check if we are dealing with 64 bit arch.
-		If not then make sure that offset fits in 32 bits. */
-		aio_offset = (off_t) offset;
-
-		ut_a(sizeof(aio_offset) >= sizeof(offset)
-		     || ((os_offset_t) aio_offset) == offset);
-
-		struct iocb*	iocb = &slot->control;
-
-		if (type.is_read()) {
-
-			io_prep_pread(
-				iocb, file, slot->ptr, slot->len, aio_offset);
-		} else {
-			ut_ad(type.is_write());
-
-			io_prep_pwrite(
-				iocb, file, slot->ptr, slot->len, aio_offset);
-		}
-
-		iocb->data = slot;
-
-		slot->n_bytes = 0;
-		slot->ret = 0;
-	}
-#endif /* LINUX_NATIVE_AIO */
-
-	release();
-
-	return(slot);
+  srv_thread_pool->disable_aio();
+  delete read_slots;
+  delete write_slots;
+  read_slots= nullptr;
+  write_slots= nullptr;
 }
 
-/** Wakes up a simulated aio i/o-handler thread if it has something to do.
-@param[in]	global_segment	The number of the segment in the AIO arrays */
-void
-AIO::wake_simulated_handler_thread(ulint global_segment)
+/** Wait until there are no pending asynchronous writes. */
+static void os_aio_wait_until_no_pending_writes_low()
 {
-	ut_ad(!srv_use_native_aio);
-
-	AIO*	array;
-	ulint	segment = get_array_and_local_segment(&array, global_segment);
-
-	array->wake_simulated_handler_thread(global_segment, segment);
-}
-
-/** Wakes up a simulated AIO I/O-handler thread if it has something to do
-for a local segment in the AIO array.
-@param[in]	global_segment	The number of the segment in the AIO arrays
-@param[in]	segment		The local segment in the AIO array */
-void
-AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
-{
-	ut_ad(!srv_use_native_aio);
-
-	ulint	n = slots_per_segment();
-	ulint	offset = segment * n;
-
-	/* Look through n slots after the segment * n'th slot */
-
-	acquire();
-
-	const Slot*	slot = at(offset);
-
-	for (ulint i = 0; i < n; ++i, ++slot) {
-
-		if (slot->is_reserved) {
+  bool notify_wait = write_slots->pending_io_count() > 0;
 
-			/* Found an i/o request */
+  if (notify_wait)
+    tpool::tpool_wait_begin();
 
-			release();
+   write_slots->wait();
 
-			os_event_t	event;
-
-			event = os_aio_segment_wait_events[global_segment];
-
-			os_event_set(event);
-
-			return;
-		}
-	}
-
-	release();
+   if (notify_wait)
+     tpool::tpool_wait_end();
 }
 
-/** Wakes up simulated aio i/o-handler threads if they have something to do. */
-void
-os_aio_simulated_wake_handler_threads()
+/** Wait until there are no pending asynchronous writes. */
+void os_aio_wait_until_no_pending_writes()
 {
-	if (srv_use_native_aio) {
-		/* We do not use simulated aio: do nothing */
-
-		return;
-	}
-
-	os_aio_recommend_sleep_for_read_threads	= false;
-
-	for (ulint i = 0; i < os_aio_n_segments; i++) {
-		AIO::wake_simulated_handler_thread(i);
-	}
+  os_aio_wait_until_no_pending_writes_low();
+  buf_dblwr.wait_flush_buffered_writes();
 }
 
-/** Select the IO slot array
-@param[in,out]	type		Type of IO, READ or WRITE
-@param[in]	read_only	true if running in read-only mode
-@param[in]	mode		IO mode
-@return slot array or NULL if invalid mode specified */
-AIO*
-AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
+/** Wait until there are no pending asynchronous reads. */
+void os_aio_wait_until_no_pending_reads()
 {
-	AIO*	array;
+  const auto notify_wait= read_slots->pending_io_count();
 
-	ut_ad(type.validate());
+  if (notify_wait)
+    tpool::tpool_wait_begin();
 
-	switch (mode) {
-	case OS_AIO_NORMAL:
+  read_slots->wait();
 
-		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
-		break;
-
-	case OS_AIO_IBUF:
-		ut_ad(type.is_read());
-
-		/* Reduce probability of deadlock bugs in connection with ibuf:
-		do not let the ibuf i/o handler sleep */
-
-		type.clear_do_not_wake();
-
-		array = read_only ? AIO::s_reads : AIO::s_ibuf;
-		break;
-
-	case OS_AIO_LOG:
-
-		array = read_only ? AIO::s_reads : AIO::s_log;
-		break;
-
-	case OS_AIO_SYNC:
-
-		array = AIO::s_sync;
-#if defined(LINUX_NATIVE_AIO)
-		/* In Linux native AIO we don't use sync IO array. */
-		ut_a(!srv_use_native_aio);
-#endif /* LINUX_NATIVE_AIO */
-		break;
-
-	default:
-		ut_error;
-		array = NULL; /* Eliminate compiler warning */
-	}
-
-	return(array);
+  if (notify_wait)
+    tpool::tpool_wait_end();
 }
 
-#ifdef WIN_ASYNC_IO
-/** This function is only used in Windows asynchronous i/o.
-Waits for an aio operation to complete. This function is used to wait the
-for completed requests. The aio array of pending requests is divided
-into segments. The thread specifies which segment or slot it wants to wait
-for. NOTE: this function will also take care of freeing the aio slot,
-therefore no other thread is allowed to do the freeing!
-@param[in]	segment		The number of the segment in the aio arrays to
-				wait for; segment 0 is the ibuf I/O thread,
-				segment 1 the log I/O thread, then follow the
-				non-ibuf read threads, and as the last are the
-				non-ibuf write threads; if this is
-				ULINT_UNDEFINED, then it means that sync AIO
-				is used, and this parameter is ignored
-@param[in]	pos		this parameter is used only in sync AIO:
-				wait for the aio slot at this position
-@param[out]	m1		the messages passed with the AIO request; note
-				that also in the case where the AIO operation
-				failed, these output parameters are valid and
-				can be used to restart the operation,
-				for example
-@param[out]	m2		callback message
-@param[out]	type		OS_FILE_WRITE or ..._READ
-@return DB_SUCCESS or error code */
-
-
-
-static
-dberr_t
-os_aio_windows_handler(
-	ulint		segment,
-	ulint		pos,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	type)
+/** Request a read or write.
+@param type		I/O request
+@param buf		buffer
+@param offset		file offset
+@param n		number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n)
 {
-	Slot*		slot= 0;
-	dberr_t		err;
-
-	BOOL		ret;
-	ULONG_PTR	key;
-
-	ut_a(segment != ULINT_UNDEFINED);
-
-	/* NOTE! We only access constant fields in os_aio_array. Therefore
-	we do not have to acquire the protecting mutex yet */
-
-	ut_ad(os_aio_validate_skip());
-	AIO *my_array;
-	AIO::get_array_and_local_segment(&my_array, segment);
-
-	HANDLE port = my_array->m_completion_port;
-	ut_ad(port);
-	for (;;) {
-		DWORD len;
-		ret = GetQueuedCompletionStatus(port, &len, &key,
-		(OVERLAPPED **)&slot, INFINITE);
-
-		/* If shutdown key was received, repost the shutdown message and exit */
-		if (ret && key == IOCP_SHUTDOWN_KEY) {
-			PostQueuedCompletionStatus(port, 0, key, NULL);
-			*m1 = NULL;
-			*m2 = NULL;
-			return (DB_SUCCESS);
-		}
-
-		ut_a(slot);
-
-		if (!ret) {
-			/* IO failed */
-			break;
-		}
-
-		slot->n_bytes= len;
-		ut_a(slot->array);
-		HANDLE slot_port = slot->array->m_completion_port;
-		if (slot_port != port) {
-			/* there are no redirections between data and log */
-			ut_ad(port == data_completion_port);
-			ut_ad(slot_port != log_completion_port);
-
-			/*
-			Redirect completions  to the dedicated completion port
-			and threads.
-
-			"Write array" threads receive write,read and ibuf
-			notifications, read and ibuf completions are redirected.
-
-			Forwarding IO completion this way costs a context switch,
-			and this seems tolerable  since asynchronous reads are by
-			far less frequent.
-			*/
-			ut_a(PostQueuedCompletionStatus(slot_port,
-				len, key, &slot->control));
-		}
-		else {
-			break;
-		}
-	}
-
-	ut_a(slot->is_reserved);
-
-	*m1 = slot->m1;
-	*m2 = slot->m2;
-
-	*type = slot->type;
-
-	bool retry = false;
-
-	if (ret && slot->n_bytes == slot->len) {
-
-		err = DB_SUCCESS;
-
-	} else if (os_file_handle_error(slot->name, "Windows aio")) {
-
-		retry = true;
-
-	} else {
-
-		err = DB_IO_ERROR;
-	}
-
-
-	if (retry) {
-		/* Retry failed read/write operation synchronously. */
-
-#ifdef UNIV_PFS_IO
-		/* This read/write does not go through os_file_read
-		and os_file_write APIs, need to register with
-		performance schema explicitly here. */
-		PSI_file_locker_state	state;
-		struct PSI_file_locker* locker = NULL;
-
-		register_pfs_file_io_begin(
-			&state, locker, slot->file, slot->len,
-			slot->type.is_write()
-			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
-#endif /* UNIV_PFS_IO */
-
-		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
-
-		ssize_t	n_bytes = SyncFileIO::execute(slot);
-
-#ifdef UNIV_PFS_IO
-		register_pfs_file_io_end(locker, slot->len);
-#endif /* UNIV_PFS_IO */
-
-		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
-	}
-
-	if (err == DB_SUCCESS) {
-		err = AIOHandler::post_io_processing(slot);
-	}
-
-	slot->array->release_with_mutex(slot);
-
-	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
-		&& !buf_page_cleaner_is_active
-		&& os_aio_all_slots_free()) {
-			/* Last IO, wakeup other io  threads */
-			AIO::wake_at_shutdown();
-	}
-	return(err);
-}
-#endif /* WIN_ASYNC_IO */
-
-/**
-NOTE! Use the corresponding macro os_aio(), not directly this function!
-Requests an asynchronous i/o operation.
-@param[in,out]	type		IO request context
-@param[in]	mode		IO mode
-@param[in]	name		Name of the file or path as NUL terminated
-				string
-@param[in]	file		Open file handle
-@param[out]	buf		buffer where to read
-@param[in]	offset		file offset where to read
-@param[in]	n		number of bytes to read
-@param[in]	read_only	if true read only mode checks are enforced
-@param[in,out]	m1		Message for the AIO handler, (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-@param[in,out]	m2		message for the AIO handler (can be used to
-				identify a completed AIO operation); ignored
-				if mode is OS_AIO_SYNC
-
-@return DB_SUCCESS or error code */
-dberr_t
-os_aio_func(
-	IORequest&	type,
-	ulint		mode,
-	const char*	name,
-	pfs_os_file_t	file,
-	void*		buf,
-	os_offset_t	offset,
-	ulint		n,
-	bool		read_only,
-	fil_node_t*	m1,
-	void*		m2)
-{
-#ifdef WIN_ASYNC_IO
-	BOOL		ret = TRUE;
-#endif /* WIN_ASYNC_IO */
-
 	ut_ad(n > 0);
 	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
 	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
-	ut_ad(os_aio_validate_skip());
+	ut_ad(type.is_read() || type.is_write());
+	ut_ad(type.node);
+	ut_ad(type.node->is_open());
 
 #ifdef WIN_ASYNC_IO
 	ut_ad((n & 0xFFFFFFFFUL) == n);
 #endif /* WIN_ASYNC_IO */
 
-	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
-			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
-
-	if (mode == OS_AIO_SYNC) {
-		if (type.is_read()) {
-			return(os_file_read_func(type, file, buf, offset, n));
-		}
-
-		ut_ad(type.is_write());
-
-		return(os_file_write_func(type, name, file, buf, offset, n));
+#ifdef UNIV_PFS_IO
+	PSI_file_locker_state state;
+	PSI_file_locker* locker= nullptr;
+	register_pfs_file_io_begin(&state, locker, type.node->handle, n,
+				   type.is_write()
+				   ? PSI_FILE_WRITE : PSI_FILE_READ,
+				   __FILE__, __LINE__);
+#endif /* UNIV_PFS_IO */
+	dberr_t err = DB_SUCCESS;
+
+	if (!type.is_async()) {
+		err = type.is_read()
+			? os_file_read_func(type, type.node->handle,
+					    buf, offset, n)
+			: os_file_write_func(type, type.node->name,
+					     type.node->handle,
+					     buf, offset, n);
+func_exit:
+#ifdef UNIV_PFS_IO
+		register_pfs_file_io_end(locker, n);
+#endif /* UNIV_PFS_IO */
+		return err;
 	}
 
-try_again:
-
-	AIO*	array;
-
-	array = AIO::select_slot_array(type, read_only, mode);
-
-	Slot*	slot;
-
-	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
-
 	if (type.is_read()) {
-
-
-		if (srv_use_native_aio) {
-
-			++os_n_file_reads;
-
-			os_bytes_read_since_printout += n;
-#ifdef WIN_ASYNC_IO
-			ret = ReadFile(
-				file, slot->ptr, slot->len,
-				NULL, &slot->control);
-#elif defined(LINUX_NATIVE_AIO)
-			if (!array->linux_dispatch(slot)) {
-				goto err_exit;
-			}
-#endif /* WIN_ASYNC_IO */
-		} else if (type.is_wake()) {
-			AIO::wake_simulated_handler_thread(
-				AIO::get_segment_no_from_slot(array, slot));
-		}
-	} else if (type.is_write()) {
-
-		if (srv_use_native_aio) {
-			++os_n_file_writes;
-
-#ifdef WIN_ASYNC_IO
-			ret = WriteFile(
-				file, slot->ptr, slot->len,
-				NULL, &slot->control);
-#elif defined(LINUX_NATIVE_AIO)
-			if (!array->linux_dispatch(slot)) {
-				goto err_exit;
-			}
-#endif /* WIN_ASYNC_IO */
-
-		} else if (type.is_wake()) {
-			AIO::wake_simulated_handler_thread(
-				AIO::get_segment_no_from_slot(array, slot));
-		}
+		++os_n_file_reads;
 	} else {
-		ut_error;
-	}
-
-#ifdef WIN_ASYNC_IO
-	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
-		/* aio completed or was queued successfully! */
-		return(DB_SUCCESS);
-	}
-
-	goto err_exit;
-
-#endif /* WIN_ASYNC_IO */
-
-	/* AIO request was queued successfully! */
-	return(DB_SUCCESS);
-
-#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
-err_exit:
-#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
-
-	array->release_with_mutex(slot);
-
-	if (os_file_handle_error(
-		name, type.is_read() ? "aio read" : "aio write")) {
-
-		goto try_again;
-	}
-
-	return(DB_IO_ERROR);
-}
-
-/** Simulated AIO handler for reaping IO requests */
-class SimulatedAIOHandler {
-
-public:
-
-	/** Constructor
-	@param[in,out]	array	The AIO array
-	@param[in]	segment	Local segment in the array */
-	SimulatedAIOHandler(AIO* array, ulint segment)
-		:
-		m_oldest(),
-		m_n_elems(),
-		m_lowest_offset(IB_UINT64_MAX),
-		m_array(array),
-		m_n_slots(),
-		m_segment(segment),
-		m_ptr(),
-		m_buf()
-	{
-		ut_ad(m_segment < 100);
-
-		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
-	}
-
-	/** Destructor */
-	~SimulatedAIOHandler()
-	{
-		if (m_ptr != NULL) {
-			ut_free(m_ptr);
-		}
-	}
-
-	/** Reset the state of the handler
-	@param[in]	n_slots	Number of pending AIO operations supported */
-	void init(ulint n_slots)
-	{
-		m_oldest = 0;
-		m_n_elems = 0;
-		m_n_slots = n_slots;
-		m_lowest_offset = IB_UINT64_MAX;
-
-		if (m_ptr != NULL) {
-			ut_free(m_ptr);
-			m_ptr = m_buf = NULL;
-		}
-
-		m_slots[0] = NULL;
-	}
-
-	/** Check if there is a slot for which the i/o has already been done
-	@param[out]	n_reserved	Number of reserved slots
-	@return the first completed slot that is found. */
-	Slot* check_completed(ulint* n_reserved)
-	{
-		ulint	offset = m_segment * m_n_slots;
-
-		*n_reserved = 0;
-
-		Slot*	slot;
-
-		slot = m_array->at(offset);
-
-		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
-
-			if (slot->is_reserved) {
-
-				if (slot->io_already_done) {
-
-					ut_a(slot->is_reserved);
-
-					return(slot);
-				}
-
-				++*n_reserved;
-			}
-		}
-
-		return(NULL);
-	}
-
-	/** If there are at least 2 seconds old requests, then pick the
-	oldest one to prevent starvation.  If several requests have the
-	same age, then pick the one at the lowest offset.
-	@return true if request was selected */
-	bool select()
-	{
-		if (!select_oldest()) {
-
-			return(select_lowest_offset());
-		}
-
-		return(true);
-	}
-
-	/** Check if there are several consecutive blocks
-	to read or write. Merge them if found. */
-	void merge()
-	{
-		/* if m_n_elems != 0, then we have assigned
-		something valid to consecutive_ios[0] */
-		ut_ad(m_n_elems != 0);
-		ut_ad(first_slot() != NULL);
-
-		Slot*	slot = first_slot();
-
-		while (!merge_adjacent(slot)) {
-			/* No op */
-		}
-	}
-
-	/** We have now collected n_consecutive I/O requests
-	in the array; allocate a single buffer which can hold
-	all data, and perform the I/O
-	@return the length of the buffer */
-	ulint allocate_buffer()
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		ulint	len;
-		Slot*	slot = first_slot();
-
-		ut_ad(m_ptr == NULL);
-
-		if (slot->type.is_read() && m_n_elems > 1) {
-
-			len = 0;
-
-			for (ulint i = 0; i < m_n_elems; ++i) {
-				len += m_slots[i]->len;
-			}
-
-			m_ptr = static_cast<byte*>(
-				ut_malloc_nokey(len + srv_page_size));
-
-			m_buf = static_cast<byte*>(
-				ut_align(m_ptr, srv_page_size));
-
-		} else {
-			len = first_slot()->len;
-			m_buf = first_slot()->buf;
-		}
-
-		return(len);
-	}
-
-	/** We have to compress the individual pages and punch
-	holes in them on a page by page basis when writing to
-	tables that can be compresed at the IO level.
-	@param[in]	len		Value returned by allocate_buffer */
-	void copy_to_buffer(ulint len)
-	{
-		Slot*	slot = first_slot();
-
-		if (len > slot->len && slot->type.is_write()) {
-
-			byte*	ptr = m_buf;
-
-			ut_ad(ptr != slot->buf);
-
-			/* Copy the buffers to the combined buffer */
-			for (ulint i = 0; i < m_n_elems; ++i) {
-
-				slot = m_slots[i];
-
-				memmove(ptr, slot->buf, slot->len);
-
-				ptr += slot->len;
-			}
-		}
+		++os_n_file_writes;
 	}
 
-	/** Do the I/O with ordinary, synchronous i/o functions:
-	@param[in]	len		Length of buffer for IO */
-	void io()
-	{
-		if (first_slot()->type.is_write()) {
+	compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN);
+	io_slots* slots= type.is_read() ? read_slots : write_slots;
+	tpool::aiocb* cb = slots->acquire();
 
-			for (ulint i = 0; i < m_n_elems; ++i) {
-				write(m_slots[i]);
-			}
+	cb->m_buffer = buf;
+	cb->m_callback = (tpool::callback_func)io_callback;
+	cb->m_group = slots->get_task_group();
+	cb->m_fh = type.node->handle.m_file;
+	cb->m_len = (int)n;
+	cb->m_offset = offset;
+	cb->m_opcode = type.is_read() ? tpool::aio_opcode::AIO_PREAD : tpool::aio_opcode::AIO_PWRITE;
+	new (cb->m_userdata) IORequest{type};
 
-		} else {
-
-			for (ulint i = 0; i < m_n_elems; ++i) {
-				read(m_slots[i]);
-			}
-		}
-	}
-
-	/** Mark the i/os done in slots */
-	void done()
-	{
-		for (ulint i = 0; i < m_n_elems; ++i) {
-			m_slots[i]->io_already_done = true;
-		}
-	}
-
-	/** @return the first slot in the consecutive array */
-	Slot* first_slot()
-		MY_ATTRIBUTE((warn_unused_result))
-	{
-		ut_a(m_n_elems > 0);
-
-		return(m_slots[0]);
-	}
-
-	/** Wait for I/O requests
-	@param[in]	global_segment	The global segment
-	@param[in,out]	event		Wait on event if no active requests
-	@return the number of slots */
-	ulint check_pending(
-		ulint		global_segment,
-		os_event_t	event)
-		MY_ATTRIBUTE((warn_unused_result));
-private:
-
-	/** Do the file read
-	@param[in,out]	slot		Slot that has the IO context */
-	void read(Slot* slot)
-	{
-		dberr_t	err = os_file_read(
-			slot->type,
-			slot->file,
-			slot->ptr,
-			slot->offset,
-			slot->len);
-
-		ut_a(err == DB_SUCCESS);
-	}
-
-	/** Do the file read
-	@param[in,out]	slot		Slot that has the IO context */
-	void write(Slot* slot)
-	{
-		dberr_t	err = os_file_write(
-			slot->type,
-			slot->name,
-			slot->file,
-			slot->ptr,
-			slot->offset,
-			slot->len);
-
-		ut_a(err == DB_SUCCESS);
-	}
-
-	/** @return true if the slots are adjacent and can be merged */
-	bool adjacent(const Slot* s1, const Slot* s2) const
-	{
-		return(s1 != s2
-		       && s1->file == s2->file
-		       && s2->offset == s1->offset + s1->len
-		       && s1->type == s2->type);
-	}
-
-	/** @return true if merge limit reached or no adjacent slots found. */
-	bool merge_adjacent(Slot*& current)
-	{
-		Slot*	slot;
-		ulint	offset = m_segment * m_n_slots;
-
-		slot = m_array->at(offset);
-
-		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
-
-			if (slot->is_reserved && adjacent(current, slot)) {
-
-				current = slot;
-
-				/* Found a consecutive i/o request */
-
-				m_slots[m_n_elems] = slot;
-
-				++m_n_elems;
-
-				return(m_n_elems >= m_slots.capacity());
-			}
-		}
-
-		return(true);
-	}
-
-	/** There were no old requests. Look for an I/O request at the lowest
-	offset in the array (we ignore the high 32 bits of the offset in these
-	heuristics) */
-	bool select_lowest_offset()
-	{
-		ut_ad(m_n_elems == 0);
-
-		ulint	offset = m_segment * m_n_slots;
-
-		m_lowest_offset = IB_UINT64_MAX;
-
-		for (ulint i = 0; i < m_n_slots; ++i) {
-			Slot*	slot;
-
-			slot = m_array->at(i + offset);
-
-			if (slot->is_reserved
-			    && slot->offset < m_lowest_offset) {
-
-				/* Found an i/o request */
-				m_slots[0] = slot;
-
-				m_n_elems = 1;
-
-				m_lowest_offset = slot->offset;
-			}
-		}
-
-		return(m_n_elems > 0);
-	}
-
-	/** Select the slot if it is older than the current oldest slot.
-	@param[in]	slot		The slot to check */
-	void select_if_older(Slot* slot)
-	{
-		ulint	age;
-
-		age = (ulint) difftime(time(NULL), slot->reservation_time);
-
-		if ((age >= 2 && age > m_oldest)
-		    || (age >= 2
-			&& age == m_oldest
-			&& slot->offset < m_lowest_offset)) {
-
-			/* Found an i/o request */
-			m_slots[0] = slot;
-
-			m_n_elems = 1;
-
-			m_oldest = age;
-
-			m_lowest_offset = slot->offset;
-		}
-	}
-
-	/** Select th oldest slot in the array
-	@return true if oldest slot found */
-	bool select_oldest()
-	{
-		ut_ad(m_n_elems == 0);
-
-		Slot*	slot;
-		ulint	offset = m_n_slots * m_segment;
-
-		slot = m_array->at(offset);
-
-		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
-
-			if (slot->is_reserved) {
-				select_if_older(slot);
-			}
-		}
-
-		return(m_n_elems > 0);
-	}
-
-	typedef std::vector<Slot*> slots_t;
-
-private:
-	ulint		m_oldest;
-	ulint		m_n_elems;
-	os_offset_t	m_lowest_offset;
-
-	AIO*		m_array;
-	ulint		m_n_slots;
-	ulint		m_segment;
-
-	slots_t		m_slots;
-
-	byte*		m_ptr;
-	byte*		m_buf;
-};
-
-/** Wait for I/O requests
-@return the number of slots */
-ulint
-SimulatedAIOHandler::check_pending(
-	ulint		global_segment,
-	os_event_t	event)
-{
-	/* NOTE! We only access constant fields in os_aio_array.
-	Therefore we do not have to acquire the protecting mutex yet */
-
-	ut_ad(os_aio_validate_skip());
-
-	ut_ad(m_segment < m_array->get_n_segments());
-
-	/* Look through n slots after the segment * n'th slot */
-
-	if (AIO::is_read(m_array)
-	    && os_aio_recommend_sleep_for_read_threads) {
-
-		/* Give other threads chance to add several
-		I/Os to the array at once. */
-
-		srv_set_io_thread_op_info(
-			global_segment, "waiting for i/o request");
-
-		os_event_wait(event);
-
-		return(0);
-	}
-
-	return(m_array->slots_per_segment());
-}
-
-/** Does simulated AIO. This function should be called by an i/o-handler
-thread.
-
-@param[in]	segment	The number of the segment in the aio arrays to wait
-			for; segment 0 is the ibuf i/o thread, segment 1 the
-			log i/o thread, then follow the non-ibuf read threads,
-			and as the last are the non-ibuf write threads
-@param[out]	m1	the messages passed with the AIO request; note that
-			also in the case where the AIO operation failed, these
-			output parameters are valid and can be used to restart
-			the operation, for example
-@param[out]	m2	Callback argument
-@param[in]	type	IO context
-@return DB_SUCCESS or error code */
-static
-dberr_t
-os_aio_simulated_handler(
-	ulint		global_segment,
-	fil_node_t**	m1,
-	void**		m2,
-	IORequest*	type)
-{
-	Slot*		slot;
-	AIO*		array;
-	ulint		segment;
-	os_event_t	event = os_aio_segment_wait_events[global_segment];
-
-	segment = AIO::get_array_and_local_segment(&array, global_segment);
-
-	SimulatedAIOHandler	handler(array, segment);
-
-	for (;;) {
-
-		srv_set_io_thread_op_info(
-			global_segment, "looking for i/o requests (a)");
-
-		ulint	n_slots = handler.check_pending(global_segment, event);
-
-		if (n_slots == 0) {
-			continue;
-		}
-
-		handler.init(n_slots);
-
-		srv_set_io_thread_op_info(
-			global_segment, "looking for i/o requests (b)");
-
-		array->acquire();
-
-		ulint	n_reserved;
-
-		slot = handler.check_completed(&n_reserved);
-
-		if (slot != NULL) {
-
-			break;
-
-		} else if (n_reserved == 0
-			   && !buf_page_cleaner_is_active
-			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
-
-			/* There is no completed request. If there
-			are no pending request at all, and the system
-			is being shut down, exit. */
-
-			array->release();
-
-			*m1 = NULL;
-
-			*m2 = NULL;
-
-			return(DB_SUCCESS);
-
-		} else if (handler.select()) {
-
-			break;
-		}
-
-		/* No I/O requested at the moment */
-
-		srv_set_io_thread_op_info(
-			global_segment, "resetting wait event");
-
-		/* We wait here until tbere are more IO requests
-		for this segment. */
-
-		os_event_reset(event);
-
-		array->release();
-
-		srv_set_io_thread_op_info(
-			global_segment, "waiting for i/o request");
-
-		os_event_wait(event);
-	}
-
-	/** Found a slot that has already completed its IO */
-
-	if (slot == NULL) {
-		/* Merge adjacent requests */
-		handler.merge();
-
-		/* Check if there are several consecutive blocks
-		to read or write */
-
-		srv_set_io_thread_op_info(
-			global_segment, "consecutive i/o requests");
-
-		// Note: We don't support write combining for simulated AIO.
-		//ulint	total_len = handler.allocate_buffer();
-
-		/* We release the array mutex for the time of the I/O: NOTE that
-		this assumes that there is just one i/o-handler thread serving
-		a single segment of slots! */
-
-		array->release();
-
-		// Note: We don't support write combining for simulated AIO.
-		//handler.copy_to_buffer(total_len);
-
-		srv_set_io_thread_op_info(global_segment, "doing file i/o");
-
-		handler.io();
-
-		srv_set_io_thread_op_info(global_segment, "file i/o done");
-
-		array->acquire();
-
-		handler.done();
-
-		/* We return the messages for the first slot now, and if there
-		were several slots, the messages will be returned with
-		subsequent calls of this function */
-
-		slot = handler.first_slot();
-	}
-
-	ut_ad(slot->is_reserved);
-
-	*m1 = slot->m1;
-	*m2 = slot->m2;
-
-	*type = slot->type;
-
-	array->release(slot);
-
-	array->release();
-
-	return(DB_SUCCESS);
-}
-
-/** Get the total number of pending IOs
-@return the total number of pending IOs */
-ulint
-AIO::total_pending_io_count()
-{
-	ulint	count = s_reads->pending_io_count();
-
-	if (s_writes != NULL) {
-		count += s_writes->pending_io_count();
-	}
-
-	if (s_ibuf != NULL) {
-		count += s_ibuf->pending_io_count();
-	}
-
-	if (s_log != NULL) {
-		count += s_log->pending_io_count();
-	}
-
-	if (s_sync != NULL) {
-		count += s_sync->pending_io_count();
-	}
-
-	return(count);
-}
-
-/** Validates the consistency the aio system.
-@return true if ok */
-static
-bool
-os_aio_validate()
-{
-	/* The methods countds and validates, we ignore the count. */
-	AIO::total_pending_io_count();
-
-	return(true);
-}
-
-/** Prints pending IO requests per segment of an aio array.
-We probably don't need per segment statistics but they can help us
-during development phase to see if the IO requests are being
-distributed as expected.
-@param[in,out]	file		File where to print
-@param[in]	segments	Pending IO array */
-void
-AIO::print_segment_info(
-	FILE*		file,
-	const ulint*	segments)
-{
-	ut_ad(m_n_segments > 0);
-
-	if (m_n_segments > 1) {
-
-		fprintf(file, " [");
-
-		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
-
-			if (i != 0) {
-				fprintf(file, ", ");
-			}
-
-			fprintf(file, ULINTPF, *segments);
-		}
-
-		fprintf(file, "] ");
-	}
-}
-
-/** Prints info about the aio array.
-@param[in,out]	file		Where to print */
-void
-AIO::print(FILE* file)
-{
-	ulint	count = 0;
-	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
-
-	mutex_enter(&m_mutex);
-
-	ut_a(!m_slots.empty());
-	ut_a(m_n_segments > 0);
-
-	memset(n_res_seg, 0x0, sizeof(n_res_seg));
-
-	for (ulint i = 0; i < m_slots.size(); ++i) {
-		Slot&	slot = m_slots[i];
-		ulint	segment = (i * m_n_segments) / m_slots.size();
-
-		if (slot.is_reserved) {
-
-			++count;
-
-			++n_res_seg[segment];
-
-			ut_a(slot.len > 0);
-		}
-	}
-
-	ut_a(m_n_reserved == count);
-
-	print_segment_info(file, n_res_seg);
-
-	mutex_exit(&m_mutex);
-}
-
-/** Print all the AIO segments
-@param[in,out]	file		Where to print */
-void
-AIO::print_all(FILE* file)
-{
-	s_reads->print(file);
-
-	if (s_writes != NULL) {
-		fputs(", aio writes:", file);
-		s_writes->print(file);
-	}
-
-	if (s_ibuf != NULL) {
-		fputs(",\n ibuf aio reads:", file);
-		s_ibuf->print(file);
-	}
+	ut_a(reinterpret_cast<size_t>(cb->m_buffer) % OS_FILE_LOG_BLOCK_SIZE
+	     == 0);
+	ut_a(cb->m_len % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a(cb->m_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
 
-	if (s_log != NULL) {
-		fputs(", log i/o's:", file);
-		s_log->print(file);
+	if (srv_thread_pool->submit_io(cb)) {
+		slots->release(cb);
+		os_file_handle_error(type.node->name, type.is_read()
+				     ? "aio read" : "aio write");
+		err = DB_IO_ERROR;
 	}
 
-	if (s_sync != NULL) {
-		fputs(", sync i/o's:", file);
-		s_sync->print(file);
-	}
+	goto func_exit;
 }
 
 /** Prints info of the aio arrays.
@@ -7096,29 +3891,7 @@ os_aio_print(FILE*	file)
 {
 	time_t		current_time;
 	double		time_elapsed;
-	double		avg_bytes_read;
-
-	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
-		fprintf(file, "I/O thread " ULINTPF " state: %s (%s)",
-			i,
-			srv_io_thread_op_info[i],
-			srv_io_thread_function[i]);
-
-#ifndef _WIN32
-		if (!srv_use_native_aio
-		    && os_event_is_set(os_aio_segment_wait_events[i])) {
-			fprintf(file, " ev set");
-		}
-#endif /* _WIN32 */
 
-		fprintf(file, "\n");
-	}
-
-	fputs("Pending normal aio reads:", file);
-
-	AIO::print_all(file);
-
-	putc('\n', file);
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
 
@@ -7128,8 +3901,8 @@ os_aio_print(FILE*	file)
 		ULINTPF " OS file reads, "
 		ULINTPF " OS file writes, "
 		ULINTPF " OS fsyncs\n",
-		fil_n_pending_log_flushes,
-		fil_n_pending_tablespace_flushes,
+		log_sys.get_pending_flushes(),
+		ulint{fil_n_pending_tablespace_flushes},
 		ulint{os_n_file_reads},
 		os_n_file_writes,
 		os_n_fsyncs);
@@ -7143,22 +3916,20 @@ os_aio_print(FILE*	file)
 			n_reads, n_writes);
 	}
 
-	if (os_n_file_reads == os_n_file_reads_old) {
-		avg_bytes_read = 0.0;
-	} else {
-		avg_bytes_read = (double) os_bytes_read_since_printout
-			/ (os_n_file_reads - os_n_file_reads_old);
-	}
+	ulint avg_bytes_read = (os_n_file_reads == os_n_file_reads_old)
+		? 0
+		: os_bytes_read_since_printout
+		/ (os_n_file_reads - os_n_file_reads_old);
 
 	fprintf(file,
 		"%.2f reads/s, " ULINTPF " avg bytes/read,"
 		" %.2f writes/s, %.2f fsyncs/s\n",
-		(os_n_file_reads - os_n_file_reads_old)
+		static_cast<double>(os_n_file_reads - os_n_file_reads_old)
 		/ time_elapsed,
-		(ulint) avg_bytes_read,
-		(os_n_file_writes - os_n_file_writes_old)
+		avg_bytes_read,
+		static_cast<double>(os_n_file_writes - os_n_file_writes_old)
 		/ time_elapsed,
-		(os_n_fsyncs - os_n_fsyncs_old)
+		static_cast<double>(os_n_fsyncs - os_n_fsyncs_old)
 		/ time_elapsed);
 
 	os_n_file_reads_old = os_n_file_reads;
@@ -7188,82 +3959,6 @@ os_aio_refresh_stats()
 	os_last_printout = time(NULL);
 }
 
-/** Checks that all slots in the system have been freed, that is, there are
-no pending io operations.
-@return true if all free */
-bool
-os_aio_all_slots_free()
-{
-	return(AIO::total_pending_io_count() == 0);
-}
-
-#ifdef UNIV_DEBUG
-/** Prints all pending IO for the array
-@param[in]	file	file where to print
-@param[in]	array	array to process */
-void
-AIO::to_file(FILE* file) const
-{
-	acquire();
-
-	fprintf(file, " " ULINTPF "\n", m_n_reserved);
-
-	for (ulint i = 0; i < m_slots.size(); ++i) {
-
-		const Slot&	slot = m_slots[i];
-
-		if (slot.is_reserved) {
-
-			fprintf(file,
-				"%s IO for %s (offset=" UINT64PF
-				", size=%lu)\n",
-				slot.type.is_read() ? "read" : "write",
-				slot.name, slot.offset, (unsigned long)(slot.len));
-		}
-	}
-
-	release();
-}
-
-/** Print pending IOs for all arrays */
-void
-AIO::print_to_file(FILE* file)
-{
-	fprintf(file, "Pending normal aio reads:");
-
-	s_reads->to_file(file);
-
-	if (s_writes != NULL) {
-		fprintf(file, "Pending normal aio writes:");
-		s_writes->to_file(file);
-	}
-
-	if (s_ibuf != NULL) {
-		fprintf(file, "Pending ibuf aio reads:");
-		s_ibuf->to_file(file);
-	}
-
-	if (s_log != NULL) {
-		fprintf(file, "Pending log i/o's:");
-		s_log->to_file(file);
-	}
-
-	if (s_sync != NULL) {
-		fprintf(file, "Pending sync i/o's:");
-		s_sync->to_file(file);
-	}
-}
-
-/** Prints all pending IO
-@param[in]	file		File where to print */
-void
-os_aio_print_pending_io(
-	FILE*	file)
-{
-	AIO::print_to_file(file);
-}
-
-#endif /* UNIV_DEBUG */
 
 /**
 Set the file create umask
@@ -7468,6 +4163,7 @@ void fil_node_t::find_metadata(os_file_t file
 #else
 	struct stat sbuf;
 	if (!statbuf && !fstat(file, &sbuf)) {
+		MSAN_STAT_WORKAROUND(&sbuf);
 		statbuf = &sbuf;
 	}
 	if (statbuf) {
@@ -7501,24 +4197,23 @@ void fil_node_t::find_metadata(os_file_t file
 }
 
 /** Read the first page of a data file.
-@param[in]	first	whether this is the very first read
 @return	whether the page was found valid */
-bool fil_node_t::read_page0(bool first)
+bool fil_node_t::read_page0()
 {
 	ut_ad(mutex_own(&fil_system.mutex));
-	ut_a(space->purpose != FIL_TYPE_LOG);
-	const ulint psize = space->physical_size();
+	const unsigned psize = space->physical_size();
 #ifndef _WIN32
 	struct stat statbuf;
 	if (fstat(handle, &statbuf)) {
 		return false;
 	}
+	MSAN_STAT_WORKAROUND(&statbuf);
 	os_offset_t size_bytes = statbuf.st_size;
 #else
 	os_offset_t size_bytes = os_file_get_size(handle);
 	ut_a(size_bytes != (os_offset_t) -1);
 #endif
-	const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
+	const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
 
 	if (size_bytes < min_size) {
 		ib::error() << "The size of the file " << name
@@ -7527,22 +4222,25 @@ bool fil_node_t::read_page0(bool first)
 		return false;
 	}
 
-	byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize));
-
-	/* Align the memory for file i/o if we might have O_DIRECT set */
-	byte* page = static_cast<byte*>(ut_align(buf2, psize));
-	IORequest request(IORequest::READ);
-	if (os_file_read(request, handle, page, 0, psize) != DB_SUCCESS) {
+	page_t *page= static_cast<byte*>(aligned_malloc(psize, psize));
+	if (os_file_read(IORequestRead, handle, page, 0, psize)
+	    != DB_SUCCESS) {
 		ib::error() << "Unable to read first page of file " << name;
-		ut_free(buf2);
+corrupted:
+		aligned_free(page);
 		return false;
 	}
-	const ulint space_id = fsp_header_get_space_id(page);
+
+	const ulint space_id = memcmp_aligned<2>(
+		FIL_PAGE_SPACE_ID + page,
+		FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4)
+		? ULINT_UNDEFINED
+		: mach_read_from_4(FIL_PAGE_SPACE_ID + page);
 	ulint flags = fsp_header_get_flags(page);
-	const ulint size = fsp_header_get_field(page, FSP_SIZE);
-	const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
-	const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
-					    + page);
+	const uint32_t size = fsp_header_get_field(page, FSP_SIZE);
+	const uint32_t free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
+	const uint32_t free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+					       + page);
 	if (!fil_space_t::is_valid_flags(flags, space->id)) {
 		ulint cflags = fsp_flags_convert_from_101(flags);
 		if (cflags == ULINT_UNDEFINED) {
@@ -7552,8 +4250,7 @@ invalid:
 				<< ib::hex(space->flags)
 				<< " but found " << ib::hex(flags)
 				<< " in the file " << name;
-			ut_free(buf2);
-			return false;
+			goto corrupted;
 		}
 
 		ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
@@ -7574,7 +4271,7 @@ invalid:
 		space->crypt_data = fil_space_read_crypt_data(
 			fil_space_t::zip_size(flags), page);
 	}
-	ut_free(buf2);
+	aligned_free(page);
 
 	if (UNIV_UNLIKELY(space_id != space->id)) {
 		ib::error() << "Expected tablespace id " << space->id
@@ -7583,40 +4280,26 @@ invalid:
 		return false;
 	}
 
-	if (first) {
-		ut_ad(space->id != TRX_SYS_SPACE);
 #ifdef UNIV_LINUX
-		find_metadata(handle, &statbuf);
+	find_metadata(handle, &statbuf);
 #else
-		find_metadata();
+	find_metadata();
 #endif
+	/* Truncate the size to a multiple of extent size. */
+	ulint	mask = psize * FSP_EXTENT_SIZE - 1;
 
-		/* Truncate the size to a multiple of extent size. */
-		ulint	mask = psize * FSP_EXTENT_SIZE - 1;
-
-		if (size_bytes <= mask) {
-			/* .ibd files start smaller than an
-			extent size. Do not truncate valid data. */
-		} else {
-			size_bytes &= ~os_offset_t(mask);
-		}
-
-		space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
-
-		this->size = ulint(size_bytes / psize);
-		space->committed_size = space->size += this->size;
-	} else if (space->id != TRX_SYS_SPACE || space->size_in_header) {
-		/* If this is not the first-time open, do nothing.
-		For the system tablespace, we always get invoked as
-		first=false, so we detect the true first-time-open based
-		on size_in_header and proceed to initialize the data. */
-		return true;
+	if (size_bytes <= mask) {
+		/* .ibd files start smaller than an
+		extent size. Do not truncate valid data. */
 	} else {
-		/* Initialize the size of predefined tablespaces
-		to FSP_SIZE. */
-		space->committed_size = size;
+		size_bytes &= ~os_offset_t(mask);
 	}
 
+	space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
+
+	space->punch_hole = space->is_compressed();
+	this->size = uint32_t(size_bytes / psize);
+	space->set_sizes(this->size);
 	ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
 	ut_ad(space->free_len == 0 || space->free_len == free_len);
 	space->size_in_header = size;
diff --git a/storage/innobase/os/os0proc.cc b/storage/innobase/os/os0proc.cc
deleted file mode 100644
index d22b6ffe887..00000000000
--- a/storage/innobase/os/os0proc.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file os/os0proc.cc
-The interface to the operating system
-process control primitives
-
-Created 9/30/1995 Heikki Tuuri
-*******************************************************/
-
-#include "univ.i"
-#ifdef HAVE_LINUX_LARGE_PAGES
-# include "mysqld.h"
-#endif
-#include "my_valgrind.h"
-
-/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and
-MAP_ANON but MAP_ANON is marked as deprecated */
-#if defined(MAP_ANONYMOUS)
-#define OS_MAP_ANON	MAP_ANONYMOUS
-#elif defined(MAP_ANON)
-#define OS_MAP_ANON	MAP_ANON
-#endif
-
-/** The total amount of memory currently allocated from the operating
-system with os_mem_alloc_large(). */
-Atomic_counter<ulint>	os_total_large_mem_allocated;
-
-/** Converts the current process id to a number.
-@return process id as a number */
-ulint
-os_proc_get_number(void)
-/*====================*/
-{
-#ifdef _WIN32
-	return(static_cast<ulint>(GetCurrentProcessId()));
-#else
-	return(static_cast<ulint>(getpid()));
-#endif
-}
-
-/** Allocates large pages memory.
-@param[in,out]	n	Number of bytes to allocate
-@return allocated memory */
-void*
-os_mem_alloc_large(
-	ulint*	n)
-{
-	void*	ptr;
-	ulint	size;
-#ifdef HAVE_LINUX_LARGE_PAGES
-	int shmid;
-	struct shmid_ds buf;
-
-	if (!my_use_large_pages || !opt_large_page_size) {
-		goto skip;
-	}
-
-	/* Align block size to opt_large_page_size */
-	ut_ad(ut_is_2pow(opt_large_page_size));
-	size = ut_2pow_round(*n + opt_large_page_size - 1,
-			     ulint(opt_large_page_size));
-
-	shmid = shmget(IPC_PRIVATE, (size_t) size, SHM_HUGETLB | SHM_R | SHM_W);
-	if (shmid < 0) {
-		ib::warn() << "Failed to allocate " << size
-			<< " bytes. errno " << errno;
-		ptr = NULL;
-	} else {
-		ptr = shmat(shmid, NULL, 0);
-		if (ptr == (void*)-1) {
-			ib::warn() << "Failed to attach shared memory segment,"
-				" errno " << errno;
-			ptr = NULL;
-		}
-
-		/* Remove the shared memory segment so that it will be
-		automatically freed after memory is detached or
-		process exits */
-		shmctl(shmid, IPC_RMID, &buf);
-	}
-
-	if (ptr) {
-		*n = size;
-		os_total_large_mem_allocated += size;
-		MEM_UNDEFINED(ptr, size);
-		return(ptr);
-	}
-
-	ib::warn() << "Using conventional memory pool";
-skip:
-#endif /* HAVE_LINUX_LARGE_PAGES */
-
-#ifdef _WIN32
-	SYSTEM_INFO	system_info;
-	GetSystemInfo(&system_info);
-
-	/* Align block size to system page size */
-	ut_ad(ut_is_2pow(system_info.dwPageSize));
-	size = *n = ut_2pow_round<ulint>(*n + (system_info.dwPageSize - 1),
-					 system_info.dwPageSize);
-	ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE,
-			   PAGE_READWRITE);
-	if (!ptr) {
-		ib::info() << "VirtualAlloc(" << size << " bytes) failed;"
-			" Windows error " << GetLastError();
-	} else {
-		os_total_large_mem_allocated += size;
-		MEM_UNDEFINED(ptr, size);
-	}
-#else
-	size = getpagesize();
-	/* Align block size to system page size */
-	ut_ad(ut_is_2pow(size));
-	size = *n = ut_2pow_round(*n + (size - 1), size);
-	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-		   MAP_PRIVATE | OS_MAP_ANON, -1, 0);
-	if (UNIV_UNLIKELY(ptr == (void*) -1)) {
-		ib::error() << "mmap(" << size << " bytes) failed;"
-			" errno " << errno;
-		ptr = NULL;
-	} else {
-		os_total_large_mem_allocated += size;
-		MEM_UNDEFINED(ptr, size);
-	}
-#endif
-	return(ptr);
-}
-
-/** Frees large pages memory.
-@param[in]	ptr	pointer returned by os_mem_alloc_large()
-@param[in]	size	size returned by os_mem_alloc_large() */
-void
-os_mem_free_large(
-	void	*ptr,
-	ulint	size)
-{
-	ut_a(os_total_large_mem_allocated >= size);
-
-#ifdef __SANITIZE_ADDRESS__
-	// We could have manually poisoned that memory for ASAN.
-	// And we must unpoison it by ourself as specified in documentation
-	// for __asan_poison_memory_region() in sanitizer/asan_interface.h
-	// munmap() doesn't do it for us automatically.
-	MEM_MAKE_ADDRESSABLE(ptr, size);
-#endif /* __SANITIZE_ADDRESS__ */
-
-#ifdef HAVE_LINUX_LARGE_PAGES
-	if (my_use_large_pages && opt_large_page_size && !shmdt(ptr)) {
-		os_total_large_mem_allocated -= size;
-		return;
-	}
-#endif /* HAVE_LINUX_LARGE_PAGES */
-#ifdef _WIN32
-	/* When RELEASE memory, the size parameter must be 0.
-	Do not use MEM_RELEASE with MEM_DECOMMIT. */
-	if (!VirtualFree(ptr, 0, MEM_RELEASE)) {
-		ib::error() << "VirtualFree(" << ptr << ", " << size
-			<< ") failed; Windows error " << GetLastError();
-	} else {
-		os_total_large_mem_allocated -= size;
-	}
-#elif !defined OS_MAP_ANON
-	ut_free(ptr);
-#else
-# if defined(UNIV_SOLARIS)
-	if (munmap(static_cast<caddr_t>(ptr), size)) {
-# else
-	if (munmap(ptr, size)) {
-# endif /* UNIV_SOLARIS */
-		ib::error() << "munmap(" << ptr << ", " << size << ") failed;"
-			" errno " << errno;
-	} else {
-		os_total_large_mem_allocated -= size;
-	}
-#endif
-}
diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc
index bb5da7df7c4..f3533acfaac 100644
--- a/storage/innobase/os/os0thread.cc
+++ b/storage/innobase/os/os0thread.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2018, MariaDB Corporation.
+Copyright (c) 2017, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,60 +27,11 @@ Created 9/8/1995 Heikki Tuuri
 #include "univ.i"
 #include "srv0srv.h"
 
-/** Number of threads active. */
-Atomic_counter<ulint>	os_thread_count;
-
-/***************************************************************//**
-Compares two thread ids for equality.
-@return TRUE if equal */
-ibool
-os_thread_eq(
-/*=========*/
-	os_thread_id_t	a,	/*!< in: OS thread or thread id */
-	os_thread_id_t	b)	/*!< in: OS thread or thread id */
-{
-#ifdef _WIN32
-	if (a == b) {
-		return(TRUE);
-	}
-
-	return(FALSE);
-#else
-	if (pthread_equal(a, b)) {
-		return(TRUE);
-	}
-
-	return(FALSE);
-#endif
-}
-
-/****************************************************************//**
-Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
-unique for the thread though!
-@return thread identifier as a number */
-ulint
-os_thread_pf(
-/*=========*/
-	os_thread_id_t	a)	/*!< in: OS thread identifier */
-{
-	return((ulint) a);
-}
-
-/*****************************************************************//**
-Returns the thread identifier of current thread. Currently the thread
-identifier in Unix is the thread handle itself. Note that in HP-UX
-pthread_t is a struct of 3 fields.
-@return current thread identifier */
-os_thread_id_t
-os_thread_get_curr_id(void)
-/*=======================*/
-{
 #ifdef _WIN32
-	return(GetCurrentThreadId());
-#else
-	return(pthread_self());
+bool os_thread_eq(os_thread_id_t a, os_thread_id_t b) { return a == b; }
+void os_thread_yield() { SwitchToThread(); }
+os_thread_id_t os_thread_get_curr_id() { return GetCurrentThreadId(); }
 #endif
-}
 
 /****************************************************************//**
 Creates a new thread of execution. The execution starts from
@@ -89,15 +40,7 @@ NOTE: We count the number of threads in os_thread_exit(). A created
 thread should always use that to exit so thatthe thread count will be
 decremented.
 We do not return an error code because if there is one, we crash here. */
-os_thread_t
-os_thread_create_func(
-/*==================*/
-	os_thread_func_t	func,		/*!< in: pointer to function
-						from which to start */
-	void*			arg,		/*!< in: argument to start
-						function */
-	os_thread_id_t*		thread_id)	/*!< out: id of the created
-						thread, or NULL */
+os_thread_t os_thread_create(os_thread_func_t func, void *arg)
 {
 	os_thread_id_t	new_thread_id;
 
@@ -118,8 +61,6 @@ os_thread_create_func(
 
 	CloseHandle(handle);
 
-	os_thread_count++;
-
 	return((os_thread_t)new_thread_id);
 #else /* _WIN32 else */
 
@@ -133,8 +74,6 @@ os_thread_create_func(
 		abort();
 	}
 
-	os_thread_count++;
-
 	ret = pthread_create(&new_thread_id, &attr, func, arg);
 
 	ut_a(ret == 0);
@@ -143,77 +82,29 @@ os_thread_create_func(
 
 #endif /* not _WIN32 */
 
-	ut_a(os_thread_count <= srv_max_n_threads);
-
-	/* Return the thread_id if the caller requests it. */
-	if (thread_id != NULL) {
-		*thread_id = new_thread_id;
-	}
 	return((os_thread_t)new_thread_id);
 }
 
-/** Waits until the specified thread completes and joins it.
-Its return value is ignored.
-@param[in,out]	thread	thread to join */
-void
-os_thread_join(
-	os_thread_id_t	thread)
-{
-#ifdef _WIN32
-	/* Do nothing. */
-#else
-#ifdef UNIV_DEBUG
-	const int	ret =
-#endif /* UNIV_DEBUG */
-	pthread_join(thread, NULL);
-
-	/* Waiting on already-quit threads is allowed. */
-	ut_ad(ret == 0 || ret == ESRCH);
-#endif /* _WIN32 */
-}
-
-/** Exits the current thread.
-@param[in]	detach	if true, the thread will be detached right before
-exiting. If false, another thread is responsible for joining this thread */
-ATTRIBUTE_NORETURN
-void
-os_thread_exit(bool detach)
+/** Detach and terminate the current thread. */
+ATTRIBUTE_NORETURN void os_thread_exit()
 {
 #ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Thread exits, id "
-		<< os_thread_pf(os_thread_get_curr_id());
+	ib::info() << "Thread exits, id " << os_thread_get_curr_id();
 #endif
 
 #ifdef UNIV_PFS_THREAD
 	pfs_delete_thread();
 #endif
 
-	os_thread_count--;
-
 #ifdef _WIN32
 	ExitThread(0);
 #else
-	if (detach) {
-		pthread_detach(pthread_self());
-	}
+	pthread_detach(pthread_self());
 	pthread_exit(NULL);
 #endif
 }
 
 /*****************************************************************//**
-Advises the os to give up remainder of the thread's time slice. */
-void
-os_thread_yield(void)
-/*=================*/
-{
-#if defined(_WIN32)
-	SwitchToThread();
-#else
-	sched_yield();
-#endif
-}
-
-/*****************************************************************//**
 The thread sleeps at least the time given in microseconds. */
 void
 os_thread_sleep(
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index 22a3907d298..cc6b1797d61 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -786,474 +786,496 @@ page_cur_open_on_rnd_user_rec(
 				       ut_rnd_interval(n_recs) + 1);
 }
 
-/** Write a redo log record of inserting a record into an index page.
-@param[in]	insert_rec	inserted record
-@param[in]	rec_size	rec_get_size(insert_rec)
-@param[in]	cursor_rec	predecessor of insert_rec
-@param[in,out]	index		index tree
-@param[in,out]	mtr		mini-transaction */
-void
-page_cur_insert_rec_write_log(
-	const rec_t*	insert_rec,
-	ulint		rec_size,
-	const rec_t*	cursor_rec,
-	dict_index_t*	index,
-	mtr_t*		mtr)
+/**
+Set the number of owned records.
+@param[in,out]  rec     record in block.frame
+@param[in]      n_owned number of records skipped in the sparse page directory
+@param[in]      comp    whether ROW_FORMAT is COMPACT or DYNAMIC */
+static void page_rec_set_n_owned(rec_t *rec, ulint n_owned, bool comp)
 {
-	ulint	cur_rec_size;
-	ulint	extra_size;
-	ulint	cur_extra_size;
-	const byte* ins_ptr;
-	const byte* log_end;
-	ulint	i;
-
-	if (index->table->is_temporary()) {
-		mtr->set_modified();
-		ut_ad(mtr->get_log_mode() == MTR_LOG_NO_REDO);
-		return;
-	}
-
-	ut_a(rec_size < srv_page_size);
-	ut_ad(mtr->is_named_space(index->table->space));
-	ut_ad(page_align(insert_rec) == page_align(cursor_rec));
-	ut_ad(!page_rec_is_comp(insert_rec)
-	      == !dict_table_is_comp(index->table));
-
-	const ulint n_core = page_rec_is_leaf(cursor_rec)
-		? index->n_core_fields : 0;
-
-	{
-		mem_heap_t*	heap		= NULL;
-		rec_offs	cur_offs_[REC_OFFS_NORMAL_SIZE];
-		rec_offs	ins_offs_[REC_OFFS_NORMAL_SIZE];
-
-		rec_offs*	cur_offs;
-		rec_offs*	ins_offs;
-
-		rec_offs_init(cur_offs_);
-		rec_offs_init(ins_offs_);
-
-		cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_,
-					   n_core, ULINT_UNDEFINED, &heap);
-		ins_offs = rec_get_offsets(insert_rec, index, ins_offs_,
-					   n_core, ULINT_UNDEFINED, &heap);
-
-		extra_size = rec_offs_extra_size(ins_offs);
-		cur_extra_size = rec_offs_extra_size(cur_offs);
-		ut_ad(rec_size == rec_offs_size(ins_offs));
-		cur_rec_size = rec_offs_size(cur_offs);
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-	}
-
-	ins_ptr = insert_rec - extra_size;
-
-	i = 0;
-
-	if (cur_extra_size == extra_size) {
-		ulint		min_rec_size = ut_min(cur_rec_size, rec_size);
-
-		const byte*	cur_ptr = cursor_rec - cur_extra_size;
-
-		/* Find out the first byte in insert_rec which differs from
-		cursor_rec; skip the bytes in the record info */
-
-		do {
-			if (*ins_ptr == *cur_ptr) {
-				i++;
-				ins_ptr++;
-				cur_ptr++;
-			} else if ((i < extra_size)
-				   && (i >= extra_size
-				       - page_rec_get_base_extra_size
-				       (insert_rec))) {
-				i = extra_size;
-				ins_ptr = insert_rec;
-				cur_ptr = cursor_rec;
-			} else {
-				break;
-			}
-		} while (i < min_rec_size);
-	}
-
-	byte*	log_ptr;
-
-	if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) {
-
-		if (page_rec_is_comp(insert_rec)) {
-			log_ptr = mlog_open_and_write_index(
-				mtr, insert_rec, index, MLOG_COMP_REC_INSERT,
-				2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
-			if (UNIV_UNLIKELY(!log_ptr)) {
-				/* Logging in mtr is switched off
-				during crash recovery: in that case
-				mlog_open returns NULL */
-				return;
-			}
-		} else {
-			log_ptr = mlog_open(mtr, 11
-					    + 2 + 5 + 1 + 5 + 5
-					    + MLOG_BUF_MARGIN);
-			if (UNIV_UNLIKELY(!log_ptr)) {
-				/* Logging in mtr is switched off
-				during crash recovery: in that case
-				mlog_open returns NULL */
-				return;
-			}
-
-			log_ptr = mlog_write_initial_log_record_fast(
-				insert_rec, MLOG_REC_INSERT, log_ptr, mtr);
-		}
-
-		log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
-		/* Write the cursor rec offset as a 2-byte ulint */
-		mach_write_to_2(log_ptr, page_offset(cursor_rec));
-		log_ptr += 2;
-	} else {
-		log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
-		if (!log_ptr) {
-			/* Logging in mtr is switched off during crash
-			recovery: in that case mlog_open returns NULL */
-			return;
-		}
-		log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
-	}
-
-	if (page_rec_is_comp(insert_rec)) {
-		if (UNIV_UNLIKELY
-		    (rec_get_info_and_status_bits(insert_rec, TRUE)
-		     != rec_get_info_and_status_bits(cursor_rec, TRUE))) {
+  rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+  *rec= static_cast<byte>((*rec & ~REC_N_OWNED_MASK) |
+                          (n_owned << REC_N_OWNED_SHIFT));
+}
 
-			goto need_extra_info;
-		}
-	} else {
-		if (UNIV_UNLIKELY
-		    (rec_get_info_and_status_bits(insert_rec, FALSE)
-		     != rec_get_info_and_status_bits(cursor_rec, FALSE))) {
+/**
+Split a directory slot which owns too many records.
+@param[in,out]  block   index page
+@param[in,out]  slot    the slot that needs to be split */
+static void page_dir_split_slot(const buf_block_t &block,
+                                page_dir_slot_t *slot)
+{
+  ut_ad(slot <= &block.frame[srv_page_size - PAGE_EMPTY_DIR_START]);
+  slot= my_assume_aligned<2>(slot);
 
-			goto need_extra_info;
-		}
-	}
+  const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
 
-	if (extra_size != cur_extra_size || rec_size != cur_rec_size) {
-need_extra_info:
-		/* Write the record end segment length
-		and the extra info storage flag */
-		log_ptr += mach_write_compressed(log_ptr,
-						 2 * (rec_size - i) + 1);
+  ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
+  static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >=
+                PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
 
-		/* Write the info bits */
-		mach_write_to_1(log_ptr,
-				rec_get_info_and_status_bits(
-					insert_rec,
-					page_rec_is_comp(insert_rec)));
-		log_ptr++;
+  /* Find a record approximately in the middle. */
+  const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
 
-		/* Write the record origin offset */
-		log_ptr += mach_write_compressed(log_ptr, extra_size);
+  for (ulint i= n_owned / 2; i--; )
+    rec= page_rec_get_next_const(rec);
 
-		/* Write the mismatch index */
-		log_ptr += mach_write_compressed(log_ptr, i);
+  /* Add a directory slot immediately below this one. */
+  constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+  byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block.frame);
+  const uint16_t n_slots= mach_read_from_2(n_slots_p);
 
-		ut_a(i < srv_page_size);
-		ut_a(extra_size < srv_page_size);
-	} else {
-		/* Write the record end segment length
-		and the extra info storage flag */
-		log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i));
-	}
+  page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
+          (block.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+           n_slots * PAGE_DIR_SLOT_SIZE);
+  ut_ad(slot >= last_slot);
+  memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
+                     slot - last_slot);
 
-	/* Write to the log the inserted index record end segment which
-	differs from the cursor record */
+  const ulint half_owned= n_owned / 2;
 
-	rec_size -= i;
+  mach_write_to_2(n_slots_p, n_slots + 1);
 
-	if (log_ptr + rec_size <= log_end) {
-		memcpy(log_ptr, ins_ptr, rec_size);
-		mlog_close(mtr, log_ptr + rec_size);
-	} else {
-		mlog_close(mtr, log_ptr);
-		ut_a(rec_size < srv_page_size);
-		mlog_catenate_string(mtr, ins_ptr, rec_size);
-	}
+  mach_write_to_2(slot, rec - block.frame);
+  const bool comp= page_is_comp(block.frame) != 0;
+  page_rec_set_n_owned(page_dir_slot_get_rec(slot), half_owned, comp);
+  page_rec_set_n_owned(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
+                       n_owned - half_owned, comp);
 }
 
-/***********************************************************//**
-Parses a log record of a record insert on a page.
-@return end of log record or NULL */
-byte*
-page_cur_parse_insert_rec(
-/*======================*/
-	ibool		is_short,/*!< in: TRUE if short inserts */
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in: page or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr or NULL */
+/**
+Split a directory slot which owns too many records.
+@param[in,out]  block   index page (ROW_FORMAT=COMPRESSED)
+@param[in]      s       the slot that needs to be split
+@param[in,out]  mtr     mini-transaction */
+static void page_zip_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
 {
-	ulint	origin_offset		= 0; /* remove warning */
-	ulint	end_seg_len;
-	ulint	mismatch_index		= 0; /* remove warning */
-	page_t*	page;
-	rec_t*	cursor_rec;
-	byte	buf1[1024];
-	byte*	buf;
-	const byte*	ptr2		= ptr;
-	ulint		info_and_status_bits = 0; /* remove warning */
-	page_cur_t	cursor;
-	mem_heap_t*	heap		= NULL;
-	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
-	rec_offs*	offsets		= offsets_;
-	rec_offs_init(offsets_);
-
-	page = block ? buf_block_get_frame(block) : NULL;
+  ut_ad(block->page.zip.data);
+  ut_ad(page_is_comp(block->frame));
+  ut_ad(s);
 
-	if (is_short) {
-		cursor_rec = page_rec_get_prev(page_get_supremum_rec(page));
-	} else {
-		ulint	offset;
+  page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, s);
+  const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
 
-		/* Read the cursor rec offset as a 2-byte ulint */
+  ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
+  static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >=
+                PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
 
-		if (UNIV_UNLIKELY(end_ptr < ptr + 2)) {
+  /* Find a record approximately in the middle of the
+  records owned by the slot. */
 
-			return(NULL);
-		}
+  const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
 
-		offset = mach_read_from_2(ptr);
-		ptr += 2;
+  for (ulint i= n_owned / 2; i--; )
+    rec= page_rec_get_next_const(rec);
 
-		cursor_rec = page + offset;
+  /* Add a directory slot immediately below this one. */
+  constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+  byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->frame);
+  const uint16_t n_slots= mach_read_from_2(n_slots_p);
 
-		if (offset >= srv_page_size) {
+  page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
+          (block->frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+           n_slots * PAGE_DIR_SLOT_SIZE);
+  memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
+                     slot - last_slot);
 
-			recv_sys.found_corrupt_log = TRUE;
-
-			return(NULL);
-		}
-	}
+  const ulint half_owned= n_owned / 2;
 
-	end_seg_len = mach_parse_compressed(&ptr, end_ptr);
+  mtr->write<2>(*block, n_slots_p, 1U + n_slots);
 
-	if (ptr == NULL) {
+  /* Log changes to the compressed page header and the dense page directory. */
+  memcpy_aligned<2>(&block->page.zip.data[n_slots_f], n_slots_p, 2);
+  mach_write_to_2(slot, page_offset(rec));
+  page_rec_set_n_owned<true>(block, page_dir_slot_get_rec(slot), half_owned,
+                             true, mtr);
+  page_rec_set_n_owned<true>(block,
+                             page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
+                             n_owned - half_owned, true, mtr);
+}
 
-		return(NULL);
-	}
+/**
+Try to balance an underfilled directory slot with an adjacent one,
+so that there are at least the minimum number of records owned by the slot;
+this may result in merging the two slots.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in]	s		the slot to be balanced
+@param[in,out]	mtr		mini-transaction */
+static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
+{
+	ut_ad(block->page.zip.data);
+	ut_ad(page_is_comp(block->frame));
+	ut_ad(s > 0);
 
-	if (end_seg_len >= srv_page_size << 1) {
-		recv_sys.found_corrupt_log = TRUE;
+	const ulint n_slots = page_dir_get_n_slots(block->frame);
 
-		return(NULL);
+	if (UNIV_UNLIKELY(s + 1 == n_slots)) {
+		/* The last directory slot cannot be balanced. */
+		return;
 	}
 
-	if (end_seg_len & 0x1UL) {
-		/* Read the info bits */
-
-		if (end_ptr < ptr + 1) {
-
-			return(NULL);
-		}
-
-		info_and_status_bits = mach_read_from_1(ptr);
-		ptr++;
-
-		origin_offset = mach_parse_compressed(&ptr, end_ptr);
-
-		if (ptr == NULL) {
-
-			return(NULL);
-		}
-
-		ut_a(origin_offset < srv_page_size);
-
-		mismatch_index = mach_parse_compressed(&ptr, end_ptr);
-
-		if (ptr == NULL) {
-
-			return(NULL);
-		}
-
-		ut_a(mismatch_index < srv_page_size);
+	ut_ad(s < n_slots);
+
+	page_dir_slot_t* slot = page_dir_get_nth_slot(block->frame, s);
+	rec_t* const up_rec = const_cast<rec_t*>
+		(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
+	rec_t* const slot_rec = const_cast<rec_t*>
+		(page_dir_slot_get_rec(slot));
+	const ulint up_n_owned = rec_get_n_owned_new(up_rec);
+
+	ut_ad(rec_get_n_owned_new(page_dir_slot_get_rec(slot))
+	      == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+	if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+		compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1
+				    <= PAGE_DIR_SLOT_MAX_N_OWNED);
+		/* Merge the slots. */
+		page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
+		page_rec_set_n_owned<true>(block, up_rec, up_n_owned
+					   + (PAGE_DIR_SLOT_MIN_N_OWNED - 1),
+					   true, mtr);
+		/* Shift the slots */
+		page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+			block->frame, n_slots - 1);
+		memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
+				   slot - last_slot);
+		constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+		byte *n_slots_p= my_assume_aligned<2>
+			(n_slots_f + block->frame);
+		mtr->write<2>(*block, n_slots_p, n_slots - 1);
+		memcpy_aligned<2>(n_slots_f + block->page.zip.data,
+				  n_slots_p, 2);
+		memset_aligned<2>(last_slot, 0, 2);
+		return;
 	}
 
-	if (end_ptr < ptr + (end_seg_len >> 1)) {
+	/* Transfer one record to the underfilled slot */
+	page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
+	rec_t* new_rec = rec_get_next_ptr(slot_rec, TRUE);
+	page_rec_set_n_owned<true>(block, new_rec,
+				   PAGE_DIR_SLOT_MIN_N_OWNED,
+				   true, mtr);
+	mach_write_to_2(slot, page_offset(new_rec));
+	page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
+}
 
-		return(NULL);
-	}
+/**
+Try to balance an underfilled directory slot with an adjacent one,
+so that there are at least the minimum number of records owned by the slot;
+this may result in merging the two slots.
+@param[in,out]	block		index page
+@param[in]	s		the slot to be balanced */
+static void page_dir_balance_slot(const buf_block_t &block, ulint s)
+{
+	const bool comp= page_is_comp(block.frame);
+	ut_ad(!block.page.zip.data);
+	ut_ad(s > 0);
 
-	if (!block) {
+	const ulint n_slots = page_dir_get_n_slots(block.frame);
 
-		return(const_cast<byte*>(ptr + (end_seg_len >> 1)));
+	if (UNIV_UNLIKELY(s + 1 == n_slots)) {
+		/* The last directory slot cannot be balanced. */
+		return;
 	}
 
-	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
-	ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
-
-	/* Read from the log the inserted index record end segment which
-	differs from the cursor record */
-
-	const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
-
-	offsets = rec_get_offsets(cursor_rec, index, offsets, n_core,
-				  ULINT_UNDEFINED, &heap);
-
-	if (!(end_seg_len & 0x1UL)) {
-		info_and_status_bits = rec_get_info_and_status_bits(
-			cursor_rec, page_is_comp(page));
-		origin_offset = rec_offs_extra_size(offsets);
-		mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1);
+	ut_ad(s < n_slots);
+
+	page_dir_slot_t* slot = page_dir_get_nth_slot(block.frame, s);
+	rec_t* const up_rec = const_cast<rec_t*>
+		(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
+	rec_t* const slot_rec = const_cast<rec_t*>
+		(page_dir_slot_get_rec(slot));
+	const ulint up_n_owned = comp
+		? rec_get_n_owned_new(up_rec)
+		: rec_get_n_owned_old(up_rec);
+
+	ut_ad(page_dir_slot_get_n_owned(slot)
+	      == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+	if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+		compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1
+				    <= PAGE_DIR_SLOT_MAX_N_OWNED);
+		/* Merge the slots. */
+		page_rec_set_n_owned(slot_rec, 0, comp);
+		page_rec_set_n_owned(up_rec, up_n_owned
+				     + (PAGE_DIR_SLOT_MIN_N_OWNED - 1), comp);
+		/* Shift the slots */
+		page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+			block.frame, n_slots - 1);
+		memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
+				   slot - last_slot);
+		memset_aligned<2>(last_slot, 0, 2);
+		constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+		byte *n_slots_p= my_assume_aligned<2>
+			(n_slots_f + block.frame);
+		mach_write_to_2(n_slots_p, n_slots - 1);
+		return;
 	}
 
-	end_seg_len >>= 1;
+	/* Transfer one record to the underfilled slot */
+	rec_t* new_rec;
 
-	if (mismatch_index + end_seg_len < sizeof buf1) {
-		buf = buf1;
+	if (comp) {
+		page_rec_set_n_owned(slot_rec, 0, true);
+		new_rec = rec_get_next_ptr(slot_rec, TRUE);
+		page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED, true);
+		page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
 	} else {
-		buf = static_cast<byte*>(
-			ut_malloc_nokey(mismatch_index + end_seg_len));
+		page_rec_set_n_owned(slot_rec, 0, false);
+		new_rec = rec_get_next_ptr(slot_rec, FALSE);
+		page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED,
+				     false);
+		page_rec_set_n_owned(up_rec, up_n_owned - 1, false);
 	}
 
-	/* Build the inserted record to buf */
+	mach_write_to_2(slot, page_offset(new_rec));
+}
 
-        if (UNIV_UNLIKELY(mismatch_index >= srv_page_size)) {
+/** Allocate space for inserting an index record.
+@tparam compressed  whether to also update the ROW_FORMAT=COMPRESSED page (block->page.zip.data)
+@param[in,out]	block		index page
+@param[in]	need		number of bytes needed
+@param[out]	heap_no		record heap number
+@return	pointer to the start of the allocated buffer
+@retval	NULL	if allocation fails */
+template<bool compressed=false>
+static byte* page_mem_alloc_heap(buf_block_t *block, ulint need,
+                                 ulint *heap_no)
+{
+  ut_ad(!compressed || block->page.zip.data);
 
-		ib::fatal() << "is_short " << is_short << ", "
-			<< "info_and_status_bits " << info_and_status_bits
-			<< ", offset " << page_offset(cursor_rec) << ","
-			" o_offset " << origin_offset << ", mismatch index "
-			<< mismatch_index << ", end_seg_len " << end_seg_len
-			<< " parsed len " << (ptr - ptr2);
-	}
+  byte *heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
+                                       block->frame);
 
-	ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index);
-	ut_memcpy(buf + mismatch_index, ptr, end_seg_len);
+  const uint16_t top= mach_read_from_2(heap_top);
 
-	if (page_is_comp(page)) {
-		rec_set_heap_no_new(buf + origin_offset,
-				    PAGE_HEAP_NO_USER_LOW);
-		rec_set_info_and_status_bits(buf + origin_offset,
-					     info_and_status_bits);
-	} else {
-		rec_set_heap_no_old(buf + origin_offset,
-				    PAGE_HEAP_NO_USER_LOW);
-		rec_set_info_bits_old(buf + origin_offset,
-				      info_and_status_bits);
-	}
+  if (need > page_get_max_insert_size(block->frame, 1))
+    return NULL;
 
-	page_cur_position(cursor_rec, block, &cursor);
+  byte *n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + block->frame);
 
-	offsets = rec_get_offsets(buf + origin_offset, index, offsets,
-				  n_core, ULINT_UNDEFINED, &heap);
-	if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor,
-					       buf + origin_offset,
-					       index, offsets, mtr))) {
-		/* The redo log record should only have been written
-		after the write was successful. */
-		ut_error;
-	}
+  const uint16_t h= mach_read_from_2(n_heap);
+  if (UNIV_UNLIKELY((h + 1) & 0x6000))
+  {
+    /* At the minimum record size of 5+2 bytes, we can only reach this
+    condition when using innodb_page_size=64k. */
+    ut_ad((h & 0x7fff) == 8191);
+    ut_ad(srv_page_size == 65536);
+    return NULL;
+  }
 
-	if (buf != buf1) {
+  *heap_no= h & 0x7fff;
+  ut_ad(*heap_no < srv_page_size / REC_N_NEW_EXTRA_BYTES);
+  compile_time_assert(UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES < 0x3fff);
 
-		ut_free(buf);
-	}
+  mach_write_to_2(heap_top, top + need);
+  mach_write_to_2(n_heap, h + 1);
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
+  if (compressed)
+  {
+    ut_ad(h & 0x8000);
+    memcpy_aligned<4>(&block->page.zip.data[PAGE_HEAP_TOP + PAGE_HEADER],
+                      heap_top, 4);
+  }
 
-	return(const_cast<byte*>(ptr + end_seg_len));
+  return &block->frame[top];
 }
 
-/** Reset PAGE_DIRECTION and PAGE_N_DIRECTION.
-@param[in,out]	ptr		the PAGE_DIRECTION_B field
-@param[in,out]	page		index tree page frame
-@param[in]	page_zip	compressed page descriptor, or NULL */
-static inline
-void
-page_direction_reset(byte* ptr, page_t* page, page_zip_des_t* page_zip)
+/** Write log for inserting a B-tree or R-tree record in
+ROW_FORMAT=REDUNDANT.
+@param block      B-tree or R-tree page
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev_rec   byte offset of the predecessor of the record to insert,
+                  starting from PAGE_OLD_INFIMUM
+@param info_bits  info_bits of the record
+@param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+@param hdr_c      number of common record header bytes with prev_rec
+@param data_c     number of common data bytes with prev_rec
+@param hdr        record header bytes to copy to the log
+@param hdr_l      number of copied record header bytes
+@param data       record payload bytes to copy to the log
+@param data_l     number of copied record data bytes */
+inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
+                               ulint prev_rec, byte info_bits,
+                               ulint n_fields_s, size_t hdr_c, size_t data_c,
+                               const byte *hdr, size_t hdr_l,
+                               const byte *data, size_t data_l)
 {
-	ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + page);
-	page_ptr_set_direction(ptr, PAGE_NO_DIRECTION);
-	if (page_zip) {
-		page_zip_write_header(page_zip, ptr, 1, NULL);
-	}
-	ptr = PAGE_HEADER + PAGE_N_DIRECTION + page;
-	*reinterpret_cast<uint16_t*>(ptr) = 0;
-	if (page_zip) {
-		page_zip_write_header(page_zip, ptr, 2, NULL);
-	}
+  ut_ad(!block.page.zip.data);
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+  ut_ad(n_slots >= 2);
+  ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
+  ut_ad(&block.frame[prev_rec + PAGE_OLD_INFIMUM] <= page_end);
+  ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
+        page_end);
+  ut_ad(fil_page_index_page_check(block.frame));
+  ut_ad(!(~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_bits));
+  ut_ad(n_fields_s >= 2);
+  ut_ad((n_fields_s >> 1) <= REC_MAX_N_FIELDS);
+  ut_ad(data_l + data_c <= REDUNDANT_REC_MAX_DATA_SIZE);
+
+  set_modified(block);
+
+  static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility");
+  static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility");
+  n_fields_s= (n_fields_s - 2) << 2 | info_bits >> 4; /* flags in bits 0,1 */
+
+  size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+  static_assert((REC_MAX_N_FIELDS << 1 | 1) <= MIN_3BYTE, "compatibility");
+  len+= n_fields_s < MIN_2BYTE ? 1 : 2;
+  len+= hdr_c < MIN_2BYTE ? 1 : 2;
+  static_assert(REDUNDANT_REC_MAX_DATA_SIZE <= MIN_3BYTE, "compatibility");
+  len+= data_c < MIN_2BYTE ? 1 : 2;
+  len+= hdr_l + data_l; /* the literal header and data bytes */
+
+  const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small);
+
+  if (UNIV_LIKELY(small)) /* write fields, hdr and data in one piece */
+  {
+    ut_d(const byte * const end = l + len);
+    *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT;
+    l= mlog_encode_varint(l, prev_rec);
+    l= mlog_encode_varint(l, n_fields_s);
+    l= mlog_encode_varint(l, hdr_c);
+    l= mlog_encode_varint(l, data_c);
+    ::memcpy(l, hdr, hdr_l);
+    l+= hdr_l;
+    ::memcpy(l, data, data_l);
+    l+= data_l;
+    ut_ad(end == l);
+    m_log.close(l);
+  }
+  else /* write the fields first, then push hdr and data separately */
+  {
+    m_log.close(l);
+    l= m_log.open(len - hdr_l - data_l);
+    ut_d(const byte * const end = l + len - hdr_l - data_l);
+    *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT;
+    l= mlog_encode_varint(l, prev_rec);
+    l= mlog_encode_varint(l, n_fields_s);
+    l= mlog_encode_varint(l, hdr_c);
+    l= mlog_encode_varint(l, data_c);
+    ut_ad(end == l);
+    m_log.close(l);
+    m_log.push(hdr, static_cast<uint32_t>(hdr_l));
+    m_log.push(data, static_cast<uint32_t>(data_l));
+  }
+
+  m_last_offset= FIL_PAGE_TYPE;
 }
 
-/** Increment PAGE_N_DIRECTION.
-@param[in,out]	ptr		the PAGE_DIRECTION_B field
-@param[in,out]	page		index tree page frame
-@param[in]	page_zip	compressed page descriptor, or NULL
-@param[in]	dir		PAGE_RIGHT or PAGE_LEFT */
-static inline
-void
-page_direction_increment(
-	byte*		ptr,
-	page_t*		page,
-	page_zip_des_t*	page_zip,
-	uint		dir)
+/** Write log for inserting a B-tree or R-tree record in
+ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+@param block       B-tree or R-tree page
+@param reuse       false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev_rec    byte offset of the predecessor of the record to insert,
+                   starting from PAGE_NEW_INFIMUM
+@param info_status rec_get_info_and_status_bits()
+@param shift       if reuse: number of bytes the PAGE_FREE is moving; else 0
+@param hdr_c       number of common record header bytes with prev_rec
+@param data_c      number of common data bytes with prev_rec
+@param hdr         record header bytes to copy to the log
+@param hdr_l       number of copied record header bytes
+@param data        record payload bytes to copy to the log
+@param data_l      number of copied record data bytes */
+inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
+                               ulint prev_rec, byte info_status,
+                               ssize_t shift, size_t hdr_c, size_t data_c,
+                               const byte *hdr, size_t hdr_l,
+                               const byte *data, size_t data_l)
 {
-	ut_ad(ptr == PAGE_HEADER + PAGE_DIRECTION_B + page);
-	ut_ad(dir == PAGE_RIGHT || dir == PAGE_LEFT);
-	page_ptr_set_direction(ptr, dir);
-	if (page_zip) {
-		page_zip_write_header(page_zip, ptr, 1, NULL);
-	}
-	page_header_set_field(
-		page, page_zip, PAGE_N_DIRECTION,
-		1U + page_header_get_field(page, PAGE_N_DIRECTION));
-}
-
-/************************************************************//**
-Allocates a block of memory from the heap of an index page.
-@return	pointer to start of allocated buffer, or NULL if allocation fails */
-static
-byte*
-page_mem_alloc_heap(
-/*================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
-				space available for inserting the record,
-				or NULL */
-	ulint		need,	/*!< in: total number of bytes needed */
-	ulint*		heap_no)/*!< out: this contains the heap number
-				of the allocated record
-				if allocation succeeds */
-{
-	byte*	block;
-	ulint	avl_space;
-
-	ut_ad(page && heap_no);
-
-	avl_space = page_get_max_insert_size(page, 1);
-
-	if (avl_space >= need) {
-		const ulint h = page_dir_get_n_heap(page);
-		if (UNIV_UNLIKELY(h >= 8191)) {
-			/* At the minimum record size of 5+2 bytes,
-			we can only reach this condition when using
-			innodb_page_size=64k. */
-			ut_ad(srv_page_size == 65536);
-			return(NULL);
-		}
-		*heap_no = h;
-
-		block = page_header_get_ptr(page, PAGE_HEAP_TOP);
-
-		page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP,
-				    block + need);
-		page_dir_set_n_heap(page, page_zip, 1 + *heap_no);
-
-		return(block);
-	}
+  ut_ad(!block.page.zip.data);
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+  ut_ad(n_slots >= 2);
+  ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
+  ut_ad(&block.frame[prev_rec + PAGE_NEW_INFIMUM] <= page_end);
+  ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
+        page_end);
+  ut_ad(fil_page_index_page_check(block.frame));
+  ut_ad(hdr_l + hdr_c + data_l + data_c <=
+        static_cast<size_t>(page_end - &block.frame[PAGE_NEW_SUPREMUM_END]));
+  ut_ad(reuse || shift == 0);
+#ifdef UNIV_DEBUG
+  switch (~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_status) {
+  default:
+    ut_ad(0);
+    break;
+  case REC_STATUS_NODE_PTR:
+    ut_ad(!page_is_leaf(block.frame));
+    break;
+  case REC_STATUS_INSTANT:
+  case REC_STATUS_ORDINARY:
+    ut_ad(page_is_leaf(block.frame));
+  }
+#endif
 
-	return(NULL);
+  set_modified(block);
+
+  static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility");
+  static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility");
+  static_assert(REC_STATUS_INSTANT == 4, "compatibility");
+
+  const size_t enc_hdr_l= hdr_l << 3 |
+    (info_status & REC_STATUS_INSTANT) | info_status >> 4; /* bit 2; bits 0,1 */
+  size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+  static_assert(REC_MAX_N_FIELDS * 2 < MIN_3BYTE, "compatibility");
+  if (reuse)
+  {
+    if (shift < 0)
+      shift= -shift << 1 | 1; /* negative: magnitude << 1, lowest bit set */
+    else
+      shift<<= 1; /* non-negative: lowest bit clear */
+    len+= static_cast<size_t>(shift) < MIN_2BYTE
+      ? 1 : static_cast<size_t>(shift) < MIN_3BYTE ? 2 : 3;
+  }
+  ut_ad(hdr_c + hdr_l <= REC_MAX_N_FIELDS * 2);
+  len+= hdr_c < MIN_2BYTE ? 1 : 2;
+  len+= enc_hdr_l < MIN_2BYTE ? 1 : enc_hdr_l < MIN_3BYTE ? 2 : 3;
+  len+= data_c < MIN_2BYTE ? 1 : data_c < MIN_3BYTE ? 2 : 3;
+  len+= hdr_l + data_l; /* the literal header and data bytes */
+
+  const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small);
+
+  if (UNIV_LIKELY(small)) /* write fields, hdr and data in one piece */
+  {
+    ut_d(const byte * const end = l + len);
+    *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC;
+    l= mlog_encode_varint(l, prev_rec);
+    if (reuse)
+      l= mlog_encode_varint(l, shift);
+    l= mlog_encode_varint(l, enc_hdr_l);
+    l= mlog_encode_varint(l, hdr_c);
+    l= mlog_encode_varint(l, data_c);
+    ::memcpy(l, hdr, hdr_l);
+    l+= hdr_l;
+    ::memcpy(l, data, data_l);
+    l+= data_l;
+    ut_ad(end == l);
+    m_log.close(l);
+  }
+  else /* write the fields first, then push hdr and data separately */
+  {
+    m_log.close(l);
+    l= m_log.open(len - hdr_l - data_l);
+    ut_d(const byte * const end = l + len - hdr_l - data_l);
+    *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC;
+    l= mlog_encode_varint(l, prev_rec);
+    if (reuse)
+      l= mlog_encode_varint(l, shift);
+    l= mlog_encode_varint(l, enc_hdr_l);
+    l= mlog_encode_varint(l, hdr_c);
+    l= mlog_encode_varint(l, data_c);
+    ut_ad(end == l);
+    m_log.close(l);
+    m_log.push(hdr, static_cast<uint32_t>(hdr_l));
+    m_log.push(data, static_cast<uint32_t>(data_l));
+  }
+
+  m_last_offset= FIL_PAGE_TYPE;
 }
 
 /***********************************************************//**
@@ -1264,214 +1286,399 @@ space available, NULL otherwise. The cursor stays at the same position.
 rec_t*
 page_cur_insert_rec_low(
 /*====================*/
-	rec_t*		current_rec,/*!< in: pointer to current record after
-				which the new record is inserted */
+	const page_cur_t*cur,	/*!< in: page cursor; insert after cur->rec */
 	dict_index_t*	index,	/*!< in: record descriptor */
-	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	const rec_t*	rec,	/*!< in: record to insert after cur */
 	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	byte*		insert_buf;
-	ulint		rec_size;
-	page_t*		page;		/*!< the relevant page */
-	rec_t*		last_insert;	/*!< cursor position at previous
-					insert */
-	rec_t*		free_rec;	/*!< a free record that was reused,
-					or NULL */
-	rec_t*		insert_rec;	/*!< inserted record */
-	ulint		heap_no;	/*!< heap number of the inserted
-					record */
-
-	ut_ad(rec_offs_validate(rec, index, offsets));
-
-	page = page_align(current_rec);
-	ut_ad(dict_table_is_comp(index->table)
-	      == (ibool) !!page_is_comp(page));
-	ut_ad(fil_page_index_page_check(page));
-	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) == index->id
-	      || index->is_dummy
-	      || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index)));
-
-	ut_ad(!page_rec_is_supremum(current_rec));
-
-	/* 1. Get the size of the physical record in the page */
-	rec_size = rec_offs_size(offsets);
-
-#ifdef HAVE_valgrind
-	{
-		const void*	rec_start __attribute__((unused))
-			= rec - rec_offs_extra_size(offsets);
-		ulint		extra_size __attribute__((unused))
-			= rec_offs_extra_size(offsets)
-			- (rec_offs_comp(offsets)
-			   ? REC_N_NEW_EXTRA_BYTES
-			   : REC_N_OLD_EXTRA_BYTES);
-
-		/* All data bytes of the record must be valid. */
-		MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
-		/* The variable-length header must be valid. */
-		MEM_CHECK_DEFINED(rec_start, extra_size);
-	}
-#endif /* HAVE_valgrind */
-
-	/* 2. Try to find suitable space from page memory management */
-
-	free_rec = page_header_get_ptr(page, PAGE_FREE);
-	if (UNIV_LIKELY_NULL(free_rec)) {
-		/* Try to allocate from the head of the free list. */
-		rec_offs	foffsets_[REC_OFFS_NORMAL_SIZE];
-		rec_offs*	foffsets	= foffsets_;
-		mem_heap_t*	heap		= NULL;
-
-		rec_offs_init(foffsets_);
-
-		foffsets = rec_get_offsets(
-			free_rec, index, foffsets,
-			page_is_leaf(page) ? index->n_core_fields : 0,
-			ULINT_UNDEFINED, &heap);
-		if (rec_offs_size(foffsets) < rec_size) {
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
-
-			goto use_heap;
-		}
-
-		insert_buf = free_rec - rec_offs_extra_size(foffsets);
-
-		if (page_is_comp(page)) {
-			heap_no = rec_get_heap_no_new(free_rec);
-			page_mem_alloc_free(page, NULL,
-					rec_get_next_ptr(free_rec, TRUE),
-					rec_size);
-		} else {
-			heap_no = rec_get_heap_no_old(free_rec);
-			page_mem_alloc_free(page, NULL,
-					rec_get_next_ptr(free_rec, FALSE),
-					rec_size);
-		}
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-	} else {
+  buf_block_t* block= cur->block;
+
+  ut_ad(rec_offs_validate(rec, index, offsets));
+  ut_ad(rec_offs_n_fields(offsets) > 0);
+  ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+  ut_ad(!!page_is_comp(block->frame) == !!rec_offs_comp(offsets));
+  ut_ad(fil_page_index_page_check(block->frame));
+  ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) ==
+        index->id ||
+        mtr->is_inside_ibuf());
+  ut_ad(page_dir_get_n_slots(block->frame) >= 2);
+
+  ut_ad(!page_rec_is_supremum(cur->rec));
+
+  /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */
+  ut_ad(mtr->get_log_mode() != MTR_LOG_ALL ||
+        !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE));
+
+  /* 1. Get the size of the physical record in the page */
+  const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef HAVE_MEM_CHECK
+  {
+    const void *rec_start __attribute__((unused))=
+      rec - rec_offs_extra_size(offsets);
+    ulint extra_size __attribute__((unused))=
+      rec_offs_extra_size(offsets) -
+      (page_is_comp(block->frame)
+       ? REC_N_NEW_EXTRA_BYTES
+       : REC_N_OLD_EXTRA_BYTES);
+    /* All data bytes of the record must be valid. */
+    MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+    /* The variable-length header must be valid. */
+    MEM_CHECK_DEFINED(rec_start, extra_size);
+  }
+#endif /* HAVE_MEM_CHECK */
+
+  /* 2. Try to find suitable space from page memory management */
+  bool reuse= false;
+  ssize_t free_offset= 0;
+  ulint heap_no;
+  byte *insert_buf;
+
+  const bool comp= page_is_comp(block->frame);
+  const ulint extra_size= rec_offs_extra_size(offsets);
+
+  if (rec_t* free_rec= page_header_get_ptr(block->frame, PAGE_FREE))
+  {
+    /* Try to reuse the head of PAGE_FREE. */
+    rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
+    mem_heap_t *heap= nullptr;
+
+    rec_offs_init(foffsets_);
+
+    rec_offs *foffsets= rec_get_offsets(free_rec, index, foffsets_,
+                                        page_is_leaf(block->frame)
+                                        ? index->n_core_fields : 0,
+                                        ULINT_UNDEFINED, &heap);
+    const ulint fextra_size= rec_offs_extra_size(foffsets);
+    insert_buf= free_rec - fextra_size;
+    const bool too_small= (fextra_size + rec_offs_data_size(foffsets)) <
+      rec_size;
+    if (UNIV_LIKELY_NULL(heap))
+      mem_heap_free(heap);
+
+    if (too_small)
+      goto use_heap;
+
+    byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+                                          block->frame);
+    if (comp)
+    {
+      heap_no= rec_get_heap_no_new(free_rec);
+      uint16_t next= mach_read_from_2(free_rec - REC_NEXT);
+      mach_write_to_2(page_free, next
+                      ? static_cast<uint16_t>(free_rec + next - block->frame)
+                      : 0);
+    }
+    else
+    {
+      heap_no= rec_get_heap_no_old(free_rec);
+      memcpy(page_free, free_rec - REC_NEXT, 2);
+    }
+
+    static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+
+    byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+    ut_ad(mach_read_from_2(page_garbage) >= rec_size);
+    mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - rec_size);
+    reuse= true;
+    free_offset= extra_size - fextra_size; /* difference of header sizes */
+  }
+  else
+  {
 use_heap:
-		free_rec = NULL;
-		insert_buf = page_mem_alloc_heap(page, NULL,
-						 rec_size, &heap_no);
-
-		if (UNIV_UNLIKELY(insert_buf == NULL)) {
-			return(NULL);
-		}
-	}
-
-	/* 3. Create the record */
-	insert_rec = rec_copy(insert_buf, rec, offsets);
-	rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets);
-
-	/* 4. Insert the record in the linked list of records */
-	ut_ad(current_rec != insert_rec);
-
-	{
-		/* next record after current before the insertion */
-		rec_t*	next_rec = page_rec_get_next(current_rec);
+    insert_buf= page_mem_alloc_heap(block, rec_size, &heap_no);
+
+    if (UNIV_UNLIKELY(!insert_buf))
+      return nullptr;
+  }
+
+  ut_ad(cur->rec != insert_buf + extra_size);
+
+  rec_t *next_rec= block->frame + rec_get_next_offs(cur->rec, comp);
+  ut_ad(next_rec != block->frame);
+
+  /* Update page header fields */
+  byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+                                               block->frame);
+  const uint16_t last_insert= mach_read_from_2(page_last_insert);
+  ut_ad(!last_insert || !comp ||
+        rec_get_node_ptr_flag(block->frame + last_insert) ==
+        rec_get_node_ptr_flag(rec));
+
+  /* Write PAGE_LAST_INSERT */
+  mach_write_to_2(page_last_insert, page_offset(insert_buf + extra_size));
+
+  /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION unless this is FIL_PAGE_RTREE */
+  if (block->frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+  {
+    byte *dir= &block->frame[PAGE_DIRECTION_B + PAGE_HEADER];
+    byte *n= my_assume_aligned<2>
+      (&block->frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+    if (UNIV_UNLIKELY(!last_insert))
+    {
+no_direction:
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+      memset(n, 0, 2);
+    }
+    else if (block->frame + last_insert == cur->rec &&
+             (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+    {
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+      mach_write_to_2(n, mach_read_from_2(n) + 1);
+    }
+    else if (next_rec == block->frame + last_insert &&
+             (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+    {
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+      goto inc_dir;
+    }
+    else
+      goto no_direction;
+  }
+
+  /* Update PAGE_N_RECS. */
+  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                          block->frame);
+
+  mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+  /* Update the preceding record header, the 'owner' record and
+  prepare the record to insert. */
+  rec_t *insert_rec= insert_buf + extra_size;
+  const ulint data_size= rec_offs_data_size(offsets);
+  memcpy(insert_buf, rec - extra_size, extra_size + data_size);
+  size_t hdr_common= 0;
+  ulint n_owned;
+  const byte info_status= static_cast<byte>
+    (rec_get_info_and_status_bits(rec, comp));
+  ut_ad(!(rec_get_info_bits(rec, comp) &
+          ~(REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG)));
+
+  if (comp)
+  {
 #ifdef UNIV_DEBUG
-		if (page_is_comp(page)) {
-			switch (rec_get_status(current_rec)) {
-			case REC_STATUS_ORDINARY:
-			case REC_STATUS_NODE_PTR:
-			case REC_STATUS_INSTANT:
-			case REC_STATUS_INFIMUM:
-				break;
-			case REC_STATUS_SUPREMUM:
-				ut_ad(!"wrong status on current_rec");
-			}
-			switch (rec_get_status(insert_rec)) {
-			case REC_STATUS_ORDINARY:
-			case REC_STATUS_NODE_PTR:
-			case REC_STATUS_INSTANT:
-				break;
-			case REC_STATUS_INFIMUM:
-			case REC_STATUS_SUPREMUM:
-				ut_ad(!"wrong status on insert_rec");
-			}
-			ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
-		}
+    switch (rec_get_status(cur->rec)) {
+    case REC_STATUS_ORDINARY:
+    case REC_STATUS_NODE_PTR:
+    case REC_STATUS_INSTANT:
+    case REC_STATUS_INFIMUM:
+      break;
+    case REC_STATUS_SUPREMUM:
+      ut_ad("wrong status on cur->rec" == 0);
+    }
+    switch (rec_get_status(rec)) {
+    case REC_STATUS_NODE_PTR:
+      ut_ad(!page_is_leaf(block->frame));
+      break;
+    case REC_STATUS_INSTANT:
+      ut_ad(index->is_instant());
+      ut_ad(page_is_leaf(block->frame));
+      if (!rec_is_metadata(rec, true))
+        break;
+      ut_ad(cur->rec == &block->frame[PAGE_NEW_INFIMUM]);
+      break;
+    case REC_STATUS_ORDINARY:
+      ut_ad(page_is_leaf(block->frame));
+      ut_ad(!(rec_get_info_bits(rec, true) & ~REC_INFO_DELETED_FLAG));
+      break;
+    case REC_STATUS_INFIMUM:
+    case REC_STATUS_SUPREMUM:
+      ut_ad("wrong status on rec" == 0);
+    }
+    ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
 #endif
-		page_rec_set_next(insert_rec, next_rec);
-		page_rec_set_next(current_rec, insert_rec);
-	}
-
-	page_header_set_field(page, NULL, PAGE_N_RECS,
-			      1U + page_get_n_recs(page));
-
-	/* 5. Set the n_owned field in the inserted record to zero,
-	and set the heap_no field */
-	if (page_is_comp(page)) {
-		rec_set_n_owned_new(insert_rec, NULL, 0);
-		rec_set_heap_no_new(insert_rec, heap_no);
-	} else {
-		rec_set_n_owned_old(insert_rec, 0);
-		rec_set_heap_no_old(insert_rec, heap_no);
-	}
-
-	MEM_CHECK_DEFINED(rec_get_start(insert_rec, offsets),
-			  rec_offs_size(offsets));
-	/* 6. Update the last insertion info in page header */
-
-	last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
-	ut_ad(!last_insert || !page_is_comp(page)
-	      || rec_get_node_ptr_flag(last_insert)
-	      == rec_get_node_ptr_flag(insert_rec));
-
-	if (!dict_index_is_spatial(index)) {
-		byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + page;
-		if (UNIV_UNLIKELY(last_insert == NULL)) {
-no_direction:
-			page_direction_reset(ptr, page, NULL);
-		} else if (last_insert == current_rec
-			   && page_ptr_get_direction(ptr) != PAGE_LEFT) {
-			page_direction_increment(ptr, page, NULL, PAGE_RIGHT);
-		} else if (page_ptr_get_direction(ptr) != PAGE_RIGHT
-			   && page_rec_get_next(insert_rec) == last_insert) {
-			page_direction_increment(ptr, page, NULL, PAGE_LEFT);
-		} else {
-			goto no_direction;
-		}
-	}
-
-	page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec);
 
-	/* 7. It remains to update the owner record. */
-	{
-		rec_t*	owner_rec	= page_rec_find_owner_rec(insert_rec);
-		ulint	n_owned;
-		if (page_is_comp(page)) {
-			n_owned = rec_get_n_owned_new(owner_rec);
-			rec_set_n_owned_new(owner_rec, NULL, n_owned + 1);
-		} else {
-			n_owned = rec_get_n_owned_old(owner_rec);
-			rec_set_n_owned_old(owner_rec, n_owned + 1);
-		}
-
-		/* 8. Now we have incremented the n_owned field of the owner
-		record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
-		we have to split the corresponding directory slot in two. */
-
-		if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
-			page_dir_split_slot(
-				page, NULL,
-				page_dir_find_owner_slot(owner_rec));
-		}
-	}
-
-	/* 9. Write log record of the insert */
-	if (UNIV_LIKELY(mtr != NULL)) {
-		page_cur_insert_rec_write_log(insert_rec, rec_size,
-					      current_rec, index, mtr);
-	}
+    rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    insert_rec[-REC_NEW_STATUS]= rec[-REC_NEW_STATUS];
+    rec_set_bit_field_2(insert_rec, heap_no,
+                        REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    mach_write_to_2(insert_rec - REC_NEXT,
+                    static_cast<uint16_t>(next_rec - insert_rec));
+    mach_write_to_2(cur->rec - REC_NEXT,
+                    static_cast<uint16_t>(insert_rec - cur->rec));
+    while (!(n_owned= rec_get_n_owned_new(next_rec))) /* find the owner */
+    {
+      next_rec= block->frame + rec_get_next_offs(next_rec, true);
+      ut_ad(next_rec != block->frame);
+    }
+    rec_set_bit_field_1(next_rec, n_owned + 1, REC_NEW_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    if (mtr->get_log_mode() != MTR_LOG_ALL) /* skip writing the log */
+    {
+      mtr->set_modified(*block);
+      goto copied;
+    }
+
+    const byte * const c_start= cur->rec - extra_size;
+    if (extra_size > REC_N_NEW_EXTRA_BYTES &&
+        c_start >=
+        &block->frame[PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES])
+    {
+      /* Find common header bytes with the preceding record. */
+      const byte *r= rec - (REC_N_NEW_EXTRA_BYTES + 1);
+      for (const byte *c= cur->rec - (REC_N_NEW_EXTRA_BYTES + 1);
+           *r == *c && c-- != c_start; r--);
+      hdr_common= static_cast<size_t>((rec - (REC_N_NEW_EXTRA_BYTES + 1)) - r);
+      ut_ad(hdr_common <= extra_size - REC_N_NEW_EXTRA_BYTES);
+    }
+  }
+  else
+  {
+#ifdef UNIV_DEBUG
+    if (!page_is_leaf(block->frame));
+    else if (rec_is_metadata(rec, false))
+    {
+      ut_ad(index->is_instant());
+      ut_ad(cur->rec == &block->frame[PAGE_OLD_INFIMUM]);
+    }
+#endif
+    rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(insert_rec, heap_no,
+                        REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    memcpy(insert_rec - REC_NEXT, cur->rec - REC_NEXT, 2);
+    mach_write_to_2(cur->rec - REC_NEXT, page_offset(insert_rec));
+    while (!(n_owned= rec_get_n_owned_old(next_rec))) /* find the owner */
+    {
+      next_rec= block->frame + rec_get_next_offs(next_rec, false);
+      ut_ad(next_rec != block->frame);
+    }
+    rec_set_bit_field_1(next_rec, n_owned + 1, REC_OLD_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    if (mtr->get_log_mode() != MTR_LOG_ALL) /* skip writing the log */
+    {
+      mtr->set_modified(*block);
+      goto copied;
+    }
+
+    ut_ad(extra_size > REC_N_OLD_EXTRA_BYTES);
+    const byte * const c_start= cur->rec - extra_size;
+    if (c_start >=
+        &block->frame[PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES])
+    {
+      /* Find common header bytes with the preceding record. */
+      const byte *r= rec - (REC_N_OLD_EXTRA_BYTES + 1);
+      for (const byte *c= cur->rec - (REC_N_OLD_EXTRA_BYTES + 1);
+           *r == *c && c-- != c_start; r--);
+      hdr_common= static_cast<size_t>((rec - (REC_N_OLD_EXTRA_BYTES + 1)) - r);
+      ut_ad(hdr_common <= extra_size - REC_N_OLD_EXTRA_BYTES);
+    }
+  }
+
+  /* Insert the record, possibly copying from the preceding record. */
+  ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+  {
+    const byte *r= rec;
+    const byte *c= cur->rec;
+    const byte *c_end= cur->rec + data_size;
+    if (c <= insert_buf && c_end > insert_buf)
+      c_end= insert_buf; /* do not compare into the new record itself */
+    else
+      c_end= std::min<const byte*>(c_end, block->frame + srv_page_size -
+                                   PAGE_DIR - PAGE_DIR_SLOT_SIZE *
+                                   page_dir_get_n_slots(block->frame));
+    size_t data_common;
+    /* Copy common data bytes of the preceding record. */
+    for (; c != c_end && *r == *c; c++, r++);
+    data_common= static_cast<size_t>(r - rec);
+
+    if (comp)
+      mtr->page_insert(*block, reuse,
+                       cur->rec - block->frame - PAGE_NEW_INFIMUM,
+                       info_status, free_offset, hdr_common, data_common,
+                       insert_buf,
+                       extra_size - hdr_common - REC_N_NEW_EXTRA_BYTES,
+                       r, data_size - data_common);
+    else
+      mtr->page_insert(*block, reuse,
+                       cur->rec - block->frame - PAGE_OLD_INFIMUM,
+                       info_status, rec_get_n_fields_old(insert_rec) << 1 |
+                       rec_get_1byte_offs_flag(insert_rec),
+                       hdr_common, data_common,
+                       insert_buf,
+                       extra_size - hdr_common - REC_N_OLD_EXTRA_BYTES,
+                       r, data_size - data_common);
+  }
+
+copied:
+  ut_ad(!memcmp(insert_buf, rec - extra_size, extra_size -
+                (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)));
+  ut_ad(!memcmp(insert_rec, rec, data_size));
+  /* We have incremented the n_owned field of the owner record.
+  If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, we have to split the
+  corresponding directory slot in two. */
+
+  if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+  {
+    const auto owner= page_dir_find_owner_slot(next_rec);
+    page_dir_split_slot(*block, page_dir_get_nth_slot(block->frame, owner));
+  }
+
+  rec_offs_make_valid(insert_buf + extra_size, index,
+                      page_is_leaf(block->frame), offsets);
+  return insert_buf + extra_size;
+}
 
-	return(insert_rec);
+/** Add a slot to the dense page directory.
+@param[in,out]  block   ROW_FORMAT=COMPRESSED page
+@param[in]      index   the index that the page belongs to
+@param[in,out]  mtr     mini-transaction */
+static inline void page_zip_dir_add_slot(buf_block_t *block,
+                                         const dict_index_t *index, mtr_t *mtr)
+{
+  page_zip_des_t *page_zip= &block->page.zip;
+
+  ut_ad(page_is_comp(page_zip->data));
+  MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+  /* Read the old n_dense (n_heap has already been incremented). */
+  ulint n_dense= page_dir_get_n_heap(page_zip->data) - (PAGE_HEAP_NO_USER_LOW +
+                                                        1U);
+
+  byte *dir= page_zip->data + page_zip_get_size(page_zip) -
+    PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+  byte *stored= dir; /* start of the uncompressed area; lowered below */
+
+  if (!page_is_leaf(page_zip->data))
+  {
+    ut_ad(!page_zip->n_blobs);
+    stored-= n_dense * REC_NODE_PTR_SIZE; /* one per dense directory slot */
+  }
+  else if (index->is_clust())
+  {
+    /* Move the BLOB pointer array backwards to make space for the
+    columns DB_TRX_ID,DB_ROLL_PTR and the dense directory slot. */
+
+    stored-= n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+    byte *externs= stored - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+    byte *dst= externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+    ut_ad(!memcmp(dst, field_ref_zero, PAGE_ZIP_CLUST_LEAF_SLOT_SIZE));
+    if (const ulint len = ulint(stored - externs))
+    {
+      memmove(dst, externs, len);
+      mtr->memmove(*block, dst - page_zip->data, externs - page_zip->data,
+                   len); /* NOTE(review): presumably logs the move; verify */
+    }
+  }
+  else
+  {
+    stored-= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+    ut_ad(!memcmp(stored - PAGE_ZIP_DIR_SLOT_SIZE, field_ref_zero,
+                  PAGE_ZIP_DIR_SLOT_SIZE));
+  }
+
+  /* Move the uncompressed area backwards to make space
+  for one directory slot. */
+  if (const ulint len = ulint(dir - stored))
+  {
+    byte* dst = stored - PAGE_ZIP_DIR_SLOT_SIZE;
+    memmove(dst, stored, len);
+    mtr->memmove(*block, dst - page_zip->data, stored - page_zip->data, len);
+  }
 }
 
 /***********************************************************//**
@@ -1493,808 +1700,426 @@ page_cur_insert_rec_zip(
 	dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_t*	rec,	/*!< in: pointer to a physical record */
 	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	byte*		insert_buf;
-	ulint		rec_size;
-	page_t*		page;		/*!< the relevant page */
-	rec_t*		last_insert;	/*!< cursor position at previous
-					insert */
-	rec_t*		free_rec;	/*!< a free record that was reused,
-					or NULL */
-	rec_t*		insert_rec;	/*!< inserted record */
-	ulint		heap_no;	/*!< heap number of the inserted
-					record */
-	page_zip_des_t*	page_zip;
-
-	page_zip = page_cur_get_page_zip(cursor);
-	ut_ad(page_zip);
-
-	ut_ad(rec_offs_validate(rec, index, offsets));
-
-	page = page_cur_get_page(cursor);
-	ut_ad(dict_table_is_comp(index->table));
-	ut_ad(page_is_comp(page));
-	ut_ad(fil_page_index_page_check(page));
-	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) == index->id
-	      || index->is_dummy
-	      || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index)));
-	ut_ad(!page_get_instant(page));
-	ut_ad(!page_cur_is_after_last(cursor));
+  page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor);
+  ut_ad(page_zip);
+  ut_ad(rec_offs_validate(rec, index, offsets));
+
+  ut_ad(index->table->not_redundant());
+  ut_ad(page_is_comp(cursor->block->frame));
+  ut_ad(rec_offs_comp(offsets));
+  ut_ad(fil_page_get_type(cursor->block->frame) == FIL_PAGE_INDEX ||
+        fil_page_get_type(cursor->block->frame) == FIL_PAGE_RTREE);
+  ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + cursor->block->frame) ==
+        index->id || mtr->is_inside_ibuf());
+  ut_ad(!page_get_instant(cursor->block->frame));
+  ut_ad(!page_cur_is_after_last(cursor));
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(page_zip_validate(page_zip, page, index));
+  ut_a(page_zip_validate(page_zip, cursor->block->frame, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	/* 1. Get the size of the physical record in the page */
-	rec_size = rec_offs_size(offsets);
-
-#ifdef HAVE_valgrind
-	{
-		const void*	rec_start __attribute__((unused))
-			= rec - rec_offs_extra_size(offsets);
-		ulint		extra_size __attribute__((unused))
-			= rec_offs_extra_size(offsets)
-			- (rec_offs_comp(offsets)
-			   ? REC_N_NEW_EXTRA_BYTES
-			   : REC_N_OLD_EXTRA_BYTES);
-
-		/* All data bytes of the record must be valid. */
-		MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
-		/* The variable-length header must be valid. */
-		MEM_CHECK_DEFINED(rec_start, extra_size);
-	}
-#endif /* HAVE_valgrind */
-
-	const bool reorg_before_insert = page_has_garbage(page)
-		&& rec_size > page_get_max_insert_size(page, 1)
-		&& rec_size <= page_get_max_insert_size_after_reorganize(
-			page, 1);
-
-	/* 2. Try to find suitable space from page memory management */
-	if (!page_zip_available(page_zip, dict_index_is_clust(index),
-				rec_size, 1)
-	    || reorg_before_insert) {
-		/* The values can change dynamically. */
-		bool	log_compressed	= page_zip_log_pages;
-		ulint	level		= page_zip_level;
+  /* 1. Get the size of the physical record in the page */
+  const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef HAVE_MEM_CHECK
+  {
+    const void *rec_start __attribute__((unused))=
+      rec - rec_offs_extra_size(offsets);
+    ulint extra_size __attribute__((unused))=
+      rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES;
+    /* All data bytes of the record must be valid. */
+    MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+    /* The variable-length header must be valid. */
+    MEM_CHECK_DEFINED(rec_start, extra_size);
+  }
+#endif /* HAVE_MEM_CHECK */
+  const bool reorg_before_insert= page_has_garbage(cursor->block->frame) &&
+    rec_size > page_get_max_insert_size(cursor->block->frame, 1) &&
+    rec_size <= page_get_max_insert_size_after_reorganize(cursor->block->frame,
+                                                          1);
+  constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER;
+  byte* const page_free = my_assume_aligned<4>(page_free_f +
+                                               cursor->block->frame);
+  uint16_t free_rec= 0;
+
+  /* 2. Try to find suitable space from page memory management */
+  ulint heap_no;
+  byte *insert_buf;
+
+  if (reorg_before_insert ||
+      !page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+  {
+    /* SET GLOBAL might be executed concurrently. Sample the value once. */
+    ulint level= page_zip_level;
 #ifdef UNIV_DEBUG
-		rec_t*	cursor_rec	= page_cur_get_rec(cursor);
+    const rec_t * const cursor_rec= page_cur_get_rec(cursor);
 #endif /* UNIV_DEBUG */
 
-		/* If we are not writing compressed page images, we
-		must reorganize the page before attempting the
-		insert. */
-		if (recv_recovery_is_on()) {
-			/* Insert into the uncompressed page only.
-			The page reorganization or creation that we
-			would attempt outside crash recovery would
-			have been covered by a previous redo log record. */
-		} else if (page_is_empty(page)) {
-			ut_ad(page_cur_is_before_first(cursor));
-
-			/* This is an empty page. Recreate it to
-			get rid of the modification log. */
-			page_create_zip(page_cur_get_block(cursor), index,
-					page_header_get_field(page, PAGE_LEVEL),
-					0, mtr);
-			ut_ad(!page_header_get_ptr(page, PAGE_FREE));
-
-			if (page_zip_available(
-				    page_zip, dict_index_is_clust(index),
-				    rec_size, 1)) {
-				goto use_heap;
-			}
-
-			/* The cursor should remain on the page infimum. */
-			return(NULL);
-		} else if (!page_zip->m_nonempty && !page_has_garbage(page)) {
-			/* The page has been freshly compressed, so
-			reorganizing it will not help. */
-		} else if (log_compressed && !reorg_before_insert) {
-			/* Insert into uncompressed page only, and
-			try page_zip_reorganize() afterwards. */
-		} else if (btr_page_reorganize_low(
-				   recv_recovery_is_on(), level,
-				   cursor, index, mtr)) {
-			ut_ad(!page_header_get_ptr(page, PAGE_FREE));
-
-			if (page_zip_available(
-				    page_zip, dict_index_is_clust(index),
-				    rec_size, 1)) {
-				/* After reorganizing, there is space
-				available. */
-				goto use_heap;
-			}
-		} else {
-			ut_ad(cursor->rec == cursor_rec);
-			return(NULL);
-		}
-
-		/* Try compressing the whole page afterwards. */
-		insert_rec = page_cur_insert_rec_low(
-			cursor->rec, index, rec, offsets, NULL);
-
-		/* If recovery is on, this implies that the compression
-		of the page was successful during runtime. Had that not
-		been the case or had the redo logging of compressed
-		pages been enabled during runtime then we'd have seen
-		a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we
-		know that we don't need to reorganize the page. We,
-		however, do need to recompress the page. That will
-		happen when the next redo record is read which must
-		be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must
-		contain a valid compression level value.
-		This implies that during recovery from this point till
-		the next redo is applied the uncompressed and
-		compressed versions are not identical and
-		page_zip_validate will fail but that is OK because
-		we call page_zip_validate only after processing
-		all changes to a page under a single mtr during
-		recovery. */
-		if (insert_rec == NULL) {
-			/* Out of space.
-			This should never occur during crash recovery,
-			because the MLOG_COMP_REC_INSERT should only
-			be logged after a successful operation. */
-			ut_ad(!recv_recovery_is_on());
-			ut_ad(!index->is_dummy);
-		} else if (recv_recovery_is_on()) {
-			/* This should be followed by
-			MLOG_ZIP_PAGE_COMPRESS_NO_DATA,
-			which should succeed. */
-			rec_offs_make_valid(insert_rec, index,
-					    page_is_leaf(page), offsets);
-		} else {
-			ulint	pos = page_rec_get_n_recs_before(insert_rec);
-			ut_ad(pos > 0);
-
-			if (!log_compressed) {
-				if (page_zip_compress(
-					    page_zip, page, index,
-					    level, NULL)) {
-					page_cur_insert_rec_write_log(
-						insert_rec, rec_size,
-						cursor->rec, index, mtr);
-					page_zip_compress_write_log_no_data(
-						level, page, index, mtr);
-
-					rec_offs_make_valid(
-						insert_rec, index,
-						page_is_leaf(page), offsets);
-					return(insert_rec);
-				}
-
-				/* Page compress failed. If this happened on a
-				leaf page, put the data size into the sample
-				buffer. */
-				if (page_is_leaf(page)) {
-					ulint occupied = page_get_data_size(page)
-						+ page_dir_calc_reserved_space(
-								page_get_n_recs(page));
-					index->stat_defrag_data_size_sample[
-						index->stat_defrag_sample_next_slot] =
-								occupied;
-					index->stat_defrag_sample_next_slot =
-						(index->stat_defrag_sample_next_slot
-						 + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE;
-				}
-
-				ut_ad(cursor->rec
-				      == (pos > 1
-					  ? page_rec_get_nth(
-						  page, pos - 1)
-					  : page + PAGE_NEW_INFIMUM));
-			} else {
-				/* We are writing entire page images
-				to the log. Reduce the redo log volume
-				by reorganizing the page at the same time. */
-				if (page_zip_reorganize(
-					    cursor->block, index, mtr)) {
-					/* The page was reorganized:
-					Seek to pos. */
-					if (pos > 1) {
-						cursor->rec = page_rec_get_nth(
-							page, pos - 1);
-					} else {
-						cursor->rec = page
-							+ PAGE_NEW_INFIMUM;
-					}
-
-					insert_rec = page + rec_get_next_offs(
-						cursor->rec, TRUE);
-					rec_offs_make_valid(
-						insert_rec, index,
-						page_is_leaf(page), offsets);
-					return(insert_rec);
-				}
-
-				/* Theoretically, we could try one
-				last resort of btr_page_reorganize_low()
-				followed by page_zip_available(), but
-				that would be very unlikely to
-				succeed. (If the full reorganized page
-				failed to compress, why would it
-				succeed to compress the page, plus log
-				the insert of this record? */
-			}
-
-			/* Out of space: restore the page */
-			if (!page_zip_decompress(page_zip, page, FALSE)) {
-				ut_error; /* Memory corrupted? */
-			}
-			ut_ad(page_validate(page, index));
-			insert_rec = NULL;
-		}
-
-		return(insert_rec);
-	}
-
-	free_rec = page_header_get_ptr(page, PAGE_FREE);
-	if (UNIV_LIKELY_NULL(free_rec)) {
-		/* Try to allocate from the head of the free list. */
-		lint	extra_size_diff;
-		rec_offs	foffsets_[REC_OFFS_NORMAL_SIZE];
-		rec_offs*	foffsets	= foffsets_;
-		mem_heap_t*	heap		= NULL;
-
-		rec_offs_init(foffsets_);
-
-		foffsets = rec_get_offsets(free_rec, index, foffsets,
-					   page_rec_is_leaf(free_rec)
-					   ? index->n_core_fields : 0,
-					   ULINT_UNDEFINED, &heap);
-		if (rec_offs_size(foffsets) < rec_size) {
+    if (page_is_empty(cursor->block->frame))
+    {
+      ut_ad(page_cur_is_before_first(cursor));
+
+      /* This is an empty page. Recreate to remove the modification log. */
+      page_create_zip(cursor->block, index,
+                      page_header_get_field(cursor->block->frame, PAGE_LEVEL),
+                      0, mtr);
+      ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+      if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+        goto use_heap;
+
+      /* The cursor should remain on the page infimum. */
+      return nullptr;
+    }
+
+    if (page_zip->m_nonempty || page_has_garbage(cursor->block->frame))
+    {
+      ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+      if (!page_zip_reorganize(cursor->block, index, level, mtr, true))
+      {
+        ut_ad(cursor->rec == cursor_rec);
+        return nullptr;
+      }
+
+      if (pos)
+        cursor->rec= page_rec_get_nth(cursor->block->frame, pos);
+      else
+        ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->frame));
+
+      ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+      if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+        goto use_heap;
+    }
+
+    /* Try compressing the whole page afterwards. */
+    const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE);
+    rec_t *insert_rec= page_cur_insert_rec_low(cursor, index, rec, offsets,
+                                               mtr);
+    mtr->set_log_mode(log_mode);
+
+    if (insert_rec)
+    {
+      ulint pos= page_rec_get_n_recs_before(insert_rec);
+      ut_ad(pos > 0);
+
+      /* We are writing entire page images to the log.  Reduce the redo
+      log volume by reorganizing the page at the same time. */
+      if (page_zip_reorganize(cursor->block, index, level, mtr))
+      {
+        /* The page was reorganized: Seek to pos. */
+        cursor->rec= pos > 1
+          ? page_rec_get_nth(cursor->block->frame, pos - 1)
+          : cursor->block->frame + PAGE_NEW_INFIMUM;
+        insert_rec= cursor->block->frame + rec_get_next_offs(cursor->rec, 1);
+        rec_offs_make_valid(insert_rec, index,
+                            page_is_leaf(cursor->block->frame), offsets);
+        return insert_rec;
+      }
+
+      /* Theoretically, we could try one last resort of
+      page_zip_reorganize() followed by page_zip_available(), but that
+      would be very unlikely to succeed. (If the full reorganized page
+      failed to compress, why would it succeed to compress the page,
+      plus log the insert of this record?) */
+
+      /* Out of space: restore the page */
+      if (!page_zip_decompress(page_zip, cursor->block->frame, false))
+        ut_error; /* Memory corrupted? */
+      ut_ad(page_validate(cursor->block->frame, index));
+      insert_rec= nullptr;
+    }
+    return insert_rec;
+  }
+
+  free_rec= mach_read_from_2(page_free);
+  if (free_rec)
+  {
+    /* Try to allocate from the head of the free list. */
+    rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
+    mem_heap_t *heap= nullptr;
+
+    rec_offs_init(foffsets_);
+
+    rec_offs *foffsets= rec_get_offsets(cursor->block->frame + free_rec, index,
+                                        foffsets_,
+                                        page_is_leaf(cursor->block->frame)
+                                        ? index->n_core_fields : 0,
+                                        ULINT_UNDEFINED, &heap);
+    insert_buf= cursor->block->frame + free_rec -
+      rec_offs_extra_size(foffsets);
+
+    if (rec_offs_size(foffsets) < rec_size)
+    {
 too_small:
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
-
-			goto use_heap;
-		}
-
-		insert_buf = free_rec - rec_offs_extra_size(foffsets);
-
-		/* On compressed pages, do not relocate records from
-		the free list.  If extra_size would grow, use the heap. */
-		extra_size_diff = lint(rec_offs_extra_size(offsets)
-				       - rec_offs_extra_size(foffsets));
-
-		if (UNIV_UNLIKELY(extra_size_diff < 0)) {
-			/* Add an offset to the extra_size. */
-			if (rec_offs_size(foffsets)
-			    < rec_size - ulint(extra_size_diff)) {
-
-				goto too_small;
-			}
-
-			insert_buf -= extra_size_diff;
-		} else if (UNIV_UNLIKELY(extra_size_diff)) {
-			/* Do not allow extra_size to grow */
-
-			goto too_small;
-		}
-
-		heap_no = rec_get_heap_no_new(free_rec);
-		page_mem_alloc_free(page, page_zip,
-				    rec_get_next_ptr(free_rec, TRUE),
-				    rec_size);
-
-		if (!page_is_leaf(page)) {
-			/* Zero out the node pointer of free_rec,
-			in case it will not be overwritten by
-			insert_rec. */
-
-			ut_ad(rec_size > REC_NODE_PTR_SIZE);
-
-			if (rec_offs_extra_size(foffsets)
-			    + rec_offs_data_size(foffsets) > rec_size) {
-
-				memset(rec_get_end(free_rec, foffsets)
-				       - REC_NODE_PTR_SIZE, 0,
-				       REC_NODE_PTR_SIZE);
-			}
-		} else if (dict_index_is_clust(index)) {
-			/* Zero out the DB_TRX_ID and DB_ROLL_PTR
-			columns of free_rec, in case it will not be
-			overwritten by insert_rec. */
-
-			ulint	trx_id_offs;
-			ulint	len;
-
-			trx_id_offs = rec_get_nth_field_offs(
-				foffsets, index->db_trx_id(), &len);
-			ut_ad(len == DATA_TRX_ID_LEN);
-
-			if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs
-			    + rec_offs_extra_size(foffsets) > rec_size) {
-				/* We will have to zero out the
-				DB_TRX_ID and DB_ROLL_PTR, because
-				they will not be fully overwritten by
-				insert_rec. */
-
-				memset(free_rec + trx_id_offs, 0,
-				       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-			}
-
-			ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN
-			      == rec_get_nth_field(free_rec, foffsets,
-						   index->db_roll_ptr(), &len));
-			ut_ad(len == DATA_ROLL_PTR_LEN);
-		}
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-	} else {
+      if (UNIV_LIKELY_NULL(heap))
+        mem_heap_free(heap);
+      free_rec= 0;
+      goto use_heap;
+    }
+
+    /* On compressed pages, do not relocate records from
+    the free list. If extra_size would grow, use the heap. */
+    const ssize_t extra_size_diff= lint(rec_offs_extra_size(offsets) -
+                                        rec_offs_extra_size(foffsets));
+
+    if (UNIV_UNLIKELY(extra_size_diff < 0))
+    {
+      /* Add an offset to the extra_size. */
+      if (rec_offs_size(foffsets) < rec_size - ssize_t(extra_size_diff))
+        goto too_small;
+
+      insert_buf-= extra_size_diff;
+    }
+    else if (UNIV_UNLIKELY(extra_size_diff))
+      /* Do not allow extra_size to grow */
+      goto too_small;
+
+    byte *const free_rec_ptr= cursor->block->frame + free_rec;
+    heap_no= rec_get_heap_no_new(free_rec_ptr);
+    int16_t next_rec= mach_read_from_2(free_rec_ptr - REC_NEXT);
+    /* With innodb_page_size=64k, int16_t would be unsafe to use here,
+    but that cannot be used with ROW_FORMAT=COMPRESSED. */
+    static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility");
+    if (next_rec)
+    {
+      next_rec= static_cast<int16_t>(next_rec + free_rec);
+      ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} <= next_rec);
+      ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size);
+    }
+
+    byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]);
+    mach_write_to_2(hdr, static_cast<uint16_t>(next_rec));
+    const byte *const garbage= my_assume_aligned<2>(page_free + 2);
+    ut_ad(mach_read_from_2(garbage) >= rec_size);
+    mach_write_to_2(my_assume_aligned<2>(hdr + 2),
+                    mach_read_from_2(garbage) - rec_size);
+    static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+    mtr->memcpy(*cursor->block, page_free, hdr, 4);
+
+    if (!page_is_leaf(cursor->block->frame))
+    {
+      /* Zero out the node pointer of free_rec, in case it will not be
+      overwritten by insert_rec. */
+      ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+      if (rec_offs_size(foffsets) > rec_size)
+        memset(rec_get_end(free_rec_ptr, foffsets) -
+               REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE);
+    }
+    else if (index->is_clust())
+    {
+      /* Zero out DB_TRX_ID,DB_ROLL_PTR in free_rec, in case they will
+      not be overwritten by insert_rec. */
+
+      ulint len;
+      ulint trx_id_offs= rec_get_nth_field_offs(foffsets, index->db_trx_id(),
+                                                &len);
+      ut_ad(len == DATA_TRX_ID_LEN);
+
+      if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs +
+          rec_offs_extra_size(foffsets) > rec_size)
+        memset(free_rec_ptr + trx_id_offs, 0,
+               DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+      ut_ad(free_rec_ptr + trx_id_offs + DATA_TRX_ID_LEN ==
+            rec_get_nth_field(free_rec_ptr, foffsets, index->db_roll_ptr(),
+                              &len));
+      ut_ad(len == DATA_ROLL_PTR_LEN);
+    }
+
+    if (UNIV_LIKELY_NULL(heap))
+      mem_heap_free(heap);
+  }
+  else
+  {
 use_heap:
-		free_rec = NULL;
-		insert_buf = page_mem_alloc_heap(page, page_zip,
-						 rec_size, &heap_no);
-
-		if (UNIV_UNLIKELY(insert_buf == NULL)) {
-			return(NULL);
-		}
-
-		page_zip_dir_add_slot(page_zip, dict_index_is_clust(index));
-	}
-
-	/* 3. Create the record */
-	insert_rec = rec_copy(insert_buf, rec, offsets);
-	rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets);
-
-	/* 4. Insert the record in the linked list of records */
-	ut_ad(cursor->rec != insert_rec);
-
-	{
-		/* next record after current before the insertion */
-		const rec_t*	next_rec = page_rec_get_next_low(
-			cursor->rec, TRUE);
-		ut_ad(rec_get_status(cursor->rec)
-		      <= REC_STATUS_INFIMUM);
-		ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
-		ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
-
-		page_rec_set_next(insert_rec, next_rec);
-		page_rec_set_next(cursor->rec, insert_rec);
-	}
-
-	page_header_set_field(page, page_zip, PAGE_N_RECS,
-			      1U + page_get_n_recs(page));
-
-	/* 5. Set the n_owned field in the inserted record to zero,
-	and set the heap_no field */
-	rec_set_n_owned_new(insert_rec, NULL, 0);
-	rec_set_heap_no_new(insert_rec, heap_no);
-
-	MEM_CHECK_DEFINED(rec_get_start(insert_rec, offsets),
-			  rec_offs_size(offsets));
-
-	page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec);
-
-	/* 6. Update the last insertion info in page header */
-
-	last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
-	ut_ad(!last_insert
-	      || rec_get_node_ptr_flag(last_insert)
-	      == rec_get_node_ptr_flag(insert_rec));
-
-	if (!dict_index_is_spatial(index)) {
-		byte* ptr = PAGE_HEADER + PAGE_DIRECTION_B + page;
-		if (UNIV_UNLIKELY(last_insert == NULL)) {
+    ut_ad(!free_rec);
+    insert_buf= page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no);
+
+    if (UNIV_UNLIKELY(!insert_buf))
+      return insert_buf;
+
+    static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility");
+    mtr->memcpy(*cursor->block, PAGE_HEAP_TOP + PAGE_HEADER, 4);
+    page_zip_dir_add_slot(cursor->block, index, mtr);
+  }
+
+  /* 3. Create the record */
+  byte *insert_rec= rec_copy(insert_buf, rec, offsets);
+  rec_offs_make_valid(insert_rec, index, page_is_leaf(cursor->block->frame),
+                      offsets);
+
+  /* 4. Insert the record in the linked list of records */
+  ut_ad(cursor->rec != insert_rec);
+
+  /* next record after current before the insertion */
+  const rec_t* next_rec = page_rec_get_next_low(cursor->rec, TRUE);
+  ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM);
+  ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+  ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+  mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t>
+                  (next_rec - insert_rec));
+  mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t>
+                  (insert_rec - cursor->rec));
+  byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                     cursor->block->frame);
+  mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs));
+  memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2);
+
+  /* 5. Set the n_owned field in the inserted record to zero,
+  and set the heap_no field */
+  rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+  rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO,
+                      REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+  MEM_CHECK_DEFINED(rec_get_start(insert_rec, offsets),
+                    rec_offs_size(offsets));
+
+  /* 6. Update the last insertion info in page header */
+  byte *last_insert= my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER +
+                                          page_zip->data);
+  const uint16_t last_insert_rec= mach_read_from_2(last_insert);
+  ut_ad(!last_insert_rec ||
+        rec_get_node_ptr_flag(cursor->block->frame + last_insert_rec) ==
+        rec_get_node_ptr_flag(insert_rec));
+  mach_write_to_2(last_insert, page_offset(insert_rec));
+
+  if (!index->is_spatial())
+  {
+    byte *dir= &page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B];
+    ut_ad(!(*dir & ~((1U << 3) - 1)));
+    byte *n= my_assume_aligned<2>
+      (&page_zip->data[PAGE_HEADER + PAGE_N_DIRECTION]);
+    if (UNIV_UNLIKELY(!last_insert_rec))
+    {
 no_direction:
-			page_direction_reset(ptr, page, page_zip);
-		} else if (last_insert == cursor->rec
-			   && page_ptr_get_direction(ptr) != PAGE_LEFT) {
-			page_direction_increment(ptr, page, page_zip,
-						 PAGE_RIGHT);
-		} else if (page_ptr_get_direction(ptr) != PAGE_RIGHT
-			   && page_rec_get_next(insert_rec) == last_insert) {
-			page_direction_increment(ptr, page, page_zip,
-						 PAGE_LEFT);
-		} else {
-			goto no_direction;
-		}
-	}
-
-	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec);
-
-	/* 7. It remains to update the owner record. */
-	{
-		rec_t*	owner_rec	= page_rec_find_owner_rec(insert_rec);
-		ulint	n_owned;
-
-		n_owned = rec_get_n_owned_new(owner_rec);
-		rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1);
-
-		/* 8. Now we have incremented the n_owned field of the owner
-		record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
-		we have to split the corresponding directory slot in two. */
-
-		if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
-			page_dir_split_slot(
-				page, page_zip,
-				page_dir_find_owner_slot(owner_rec));
-		}
-	}
-
-	page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
-
-	/* 9. Write log record of the insert */
-	if (UNIV_LIKELY(mtr != NULL)) {
-		page_cur_insert_rec_write_log(insert_rec, rec_size,
-					      cursor->rec, index, mtr);
-	}
-
-	return(insert_rec);
-}
-
-/**********************************************************//**
-Writes a log record of copying a record list end to a new created page.
-@return 4-byte field where to write the log data length, or NULL if
-logging is disabled */
-UNIV_INLINE
-byte*
-page_copy_rec_list_to_created_page_write_log(
-/*=========================================*/
-	page_t*		page,	/*!< in: index page */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	byte*	log_ptr;
-
-	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
-	ut_ad(mtr->is_named_space(index->table->space));
-
-	log_ptr = mlog_open_and_write_index(mtr, page, index,
-					    page_is_comp(page)
-					    ? MLOG_COMP_LIST_END_COPY_CREATED
-					    : MLOG_LIST_END_COPY_CREATED, 4);
-	if (UNIV_LIKELY(log_ptr != NULL)) {
-		mlog_close(mtr, log_ptr + 4);
-	}
-
-	return(log_ptr);
-}
-
-/**********************************************************//**
-Parses a log record of copying a record list end to a new created page.
-@return end of log record or NULL */
-byte*
-page_parse_copy_rec_list_to_created_page(
-/*=====================================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in: page or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr or NULL */
-{
-	byte*		rec_end;
-	ulint		log_data_len;
-	page_t*		page;
-	page_zip_des_t*	page_zip;
-
-	ut_ad(index->is_dummy);
-
-	if (ptr + 4 > end_ptr) {
-
-		return(NULL);
-	}
-
-	log_data_len = mach_read_from_4(ptr);
-	ptr += 4;
-
-	rec_end = ptr + log_data_len;
-
-	if (rec_end > end_ptr) {
-
-		return(NULL);
-	}
-
-	if (!block) {
-
-		return(rec_end);
-	}
-
-	ut_ad(fil_page_index_page_check(block->frame));
-	/* This function is never invoked on the clustered index root page,
-	except in the redo log apply of
-	page_copy_rec_list_end_to_created_page() which was logged by.
-	page_copy_rec_list_to_created_page_write_log().
-	For other pages, this field must be zero-initialized. */
-	ut_ad(!page_get_instant(block->frame)
-	      || !page_has_siblings(block->frame));
-
-	while (ptr < rec_end) {
-		ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr,
-						block, index, mtr);
-	}
-
-	ut_a(ptr == rec_end);
-
-	page = buf_block_get_frame(block);
-	page_zip = buf_block_get_page_zip(block);
-
-	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
-
-	if (!dict_index_is_spatial(index)) {
-		page_direction_reset(PAGE_HEADER + PAGE_DIRECTION_B + page,
-				     page, page_zip);
-	}
-
-	return(rec_end);
-}
-
-/*************************************************************//**
-Copies records from page to a newly created page, from a given record onward,
-including that record. Infimum and supremum records are not copied.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit(). */
-void
-page_copy_rec_list_end_to_created_page(
-/*===================================*/
-	page_t*		new_page,	/*!< in/out: index page to copy to */
-	rec_t*		rec,		/*!< in: first record to copy */
-	dict_index_t*	index,		/*!< in: record descriptor */
-	mtr_t*		mtr)		/*!< in: mtr */
-{
-	page_dir_slot_t* slot = 0; /* remove warning */
-	byte*	heap_top;
-	rec_t*	insert_rec = 0; /* remove warning */
-	rec_t*	prev_rec;
-	ulint	count;
-	ulint	n_recs;
-	ulint	slot_index;
-	ulint	rec_size;
-	byte*	log_ptr;
-	ulint	log_data_len;
-	mem_heap_t*	heap		= NULL;
-	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
-	rec_offs*	offsets		= offsets_;
-	rec_offs_init(offsets_);
-
-	ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
-	ut_ad(page_align(rec) != new_page);
-	ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page));
-	ut_ad(fil_page_index_page_check(new_page));
-	/* This function is never invoked on the clustered index root page,
-	except in btr_lift_page_up(). */
-	ut_ad(!page_get_instant(new_page) || !page_has_siblings(new_page));
-
-	if (page_rec_is_infimum(rec)) {
-
-		rec = page_rec_get_next(rec);
-	}
-
-	if (page_rec_is_supremum(rec)) {
-
-		return;
-	}
-
-#ifdef UNIV_DEBUG
-	/* To pass the debug tests we have to set these dummy values
-	in the debug version */
-	page_dir_set_n_slots(new_page, NULL, srv_page_size / 2);
-	page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP,
-			    new_page + srv_page_size - 1);
-#endif
-	log_ptr = page_copy_rec_list_to_created_page_write_log(new_page,
-							       index, mtr);
-
-	log_data_len = mtr->get_log()->size();
-
-	/* Individual inserts are logged in a shorter form */
-
-	const mtr_log_t	log_mode = index->table->is_temporary()
-	    || !index->is_readable() /* IMPORT TABLESPACE */
-		? mtr_get_log_mode(mtr)
-		: mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS);
-
-	prev_rec = page_get_infimum_rec(new_page);
-	if (page_is_comp(new_page)) {
-		heap_top = new_page + PAGE_NEW_SUPREMUM_END;
-	} else {
-		heap_top = new_page + PAGE_OLD_SUPREMUM_END;
-	}
-	count = 0;
-	slot_index = 0;
-	n_recs = 0;
-
-	const ulint n_core = page_is_leaf(new_page)
-		? index->n_core_fields : 0;
-
-	do {
-		offsets = rec_get_offsets(rec, index, offsets, n_core,
-					  ULINT_UNDEFINED, &heap);
-		insert_rec = rec_copy(heap_top, rec, offsets);
-
-		if (page_is_comp(new_page)) {
-			rec_set_next_offs_new(prev_rec,
-					      page_offset(insert_rec));
-
-			rec_set_n_owned_new(insert_rec, NULL, 0);
-			rec_set_heap_no_new(insert_rec,
-					    PAGE_HEAP_NO_USER_LOW + n_recs);
-		} else {
-			rec_set_next_offs_old(prev_rec,
-					      page_offset(insert_rec));
-
-			rec_set_n_owned_old(insert_rec, 0);
-			rec_set_heap_no_old(insert_rec,
-					    PAGE_HEAP_NO_USER_LOW + n_recs);
-		}
-
-		count++;
-		n_recs++;
-
-		if (UNIV_UNLIKELY
-		    (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) {
-
-			slot_index++;
-
-			slot = page_dir_get_nth_slot(new_page, slot_index);
-
-			page_dir_slot_set_rec(slot, insert_rec);
-			page_dir_slot_set_n_owned(slot, NULL, count);
-
-			count = 0;
-		}
-
-		rec_size = rec_offs_size(offsets);
-
-		ut_ad(heap_top < new_page + srv_page_size);
-
-		heap_top += rec_size;
-
-		rec_offs_make_valid(insert_rec, index, n_core != 0, offsets);
-		page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
-					      index, mtr);
-		prev_rec = insert_rec;
-		rec = page_rec_get_next(rec);
-	} while (!page_rec_is_supremum(rec));
-
-	ut_ad(n_recs);
-
-	if ((slot_index > 0) && (count + 1
-				 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2
-				 <= PAGE_DIR_SLOT_MAX_N_OWNED)) {
-		/* We can merge the two last dir slots. This operation is
-		here to make this function imitate exactly the equivalent
-		task made using page_cur_insert_rec, which we use in database
-		recovery to reproduce the task performed by this function.
-		To be able to check the correctness of recovery, it is good
-		that it imitates exactly. */
-
-		count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
-
-		page_dir_slot_set_n_owned(slot, NULL, 0);
-
-		slot_index--;
-	}
-
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
-
-	/* Restore the log mode */
-
-	mtr_set_log_mode(mtr, log_mode);
-
-	log_data_len = mtr->get_log()->size() - log_data_len;
-
-	ut_a(log_data_len < 100U << srv_page_size_shift);
-
-	if (log_ptr != NULL) {
-		mach_write_to_4(log_ptr, log_data_len);
-	}
-
-	if (page_is_comp(new_page)) {
-		rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM);
-	} else {
-		rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM);
-	}
-
-	slot = page_dir_get_nth_slot(new_page, 1 + slot_index);
-
-	page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page));
-	page_dir_slot_set_n_owned(slot, NULL, count + 1);
-
-	page_dir_set_n_slots(new_page, NULL, 2 + slot_index);
-	page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top);
-	page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs);
-	page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs);
-
-	*reinterpret_cast<uint16_t*>(PAGE_HEADER + PAGE_LAST_INSERT + new_page)
-		= 0;
-	page_direction_reset(PAGE_HEADER + PAGE_DIRECTION_B + new_page,
-			     new_page, NULL);
-}
-
-/***********************************************************//**
-Writes log record of a record delete on a page. */
-UNIV_INLINE
-void
-page_cur_delete_rec_write_log(
-/*==========================*/
-	rec_t*			rec,	/*!< in: record to be deleted */
-	const dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle */
-{
-	byte*	log_ptr;
-
-	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
-	ut_ad(mtr->is_named_space(index->table->space));
-
-	log_ptr = mlog_open_and_write_index(mtr, rec, index,
-					    page_rec_is_comp(rec)
-					    ? MLOG_COMP_REC_DELETE
-					    : MLOG_REC_DELETE, 2);
-
-	if (!log_ptr) {
-		/* Logging in mtr is switched off during crash recovery:
-		in that case mlog_open returns NULL */
-		return;
-	}
-
-	/* Write the cursor rec offset as a 2-byte ulint */
-	mach_write_to_2(log_ptr, page_offset(rec));
-
-	mlog_close(mtr, log_ptr + 2);
+      *dir= PAGE_NO_DIRECTION;
+      memset(n, 0, 2);
+    }
+    else if (*dir != PAGE_LEFT &&
+             cursor->block->frame + last_insert_rec == cursor->rec)
+    {
+      *dir= PAGE_RIGHT;
+inc_dir:
+      mach_write_to_2(n, mach_read_from_2(n) + 1);
+    }
+    else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) ==
+             cursor->block->frame + last_insert_rec)
+    {
+      *dir= PAGE_LEFT;
+      goto inc_dir;
+    }
+    else
+      goto no_direction;
+  }
+
+  /* Write the header fields in one record. */
+  mtr->memcpy(*cursor->block,
+              my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+                                   cursor->block->frame),
+              my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+                                   page_zip->data),
+              PAGE_N_RECS - PAGE_LAST_INSERT + 2);
+
+  /* 7. It remains to update the owner record. */
+  ulint n_owned;
+
+  while (!(n_owned = rec_get_n_owned_new(next_rec)))
+    next_rec= page_rec_get_next_low(next_rec, true);
+
+  rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1,
+                      REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+  page_zip_dir_insert(cursor, free_rec, insert_rec, mtr);
+
+  /* 8. Now we have incremented the n_owned field of the owner
+  record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+  we have to split the corresponding directory slot in two. */
+  if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+    page_zip_dir_split_slot(cursor->block,
+                            page_dir_find_owner_slot(next_rec), mtr);
+
+  page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr);
+  return insert_rec;
 }
 
-/***********************************************************//**
-Parses log record of a record delete on a page.
-@return pointer to record end or NULL */
-byte*
-page_cur_parse_delete_rec(
-/*======================*/
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in: page or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr or NULL */
+/** Prepend a record to the PAGE_FREE list, or shrink PAGE_HEAP_TOP.
+@param[in,out]  block        index page
+@param[in,out]  rec          record being deleted
+@param[in]      data_size    record payload size, in bytes
+@param[in]      extra_size   record header size, in bytes */
+static void page_mem_free(const buf_block_t &block, rec_t *rec,
+                          size_t data_size, size_t extra_size)
 {
-	ulint		offset;
-	page_cur_t	cursor;
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	/* Read the cursor rec offset as a 2-byte ulint */
-	offset = mach_read_from_2(ptr);
-	ptr += 2;
-
-	if (UNIV_UNLIKELY(offset >= srv_page_size)) {
-		recv_sys.found_corrupt_log = true;
-		return NULL;
-	}
-
-	if (block) {
-		page_t*		page		= buf_block_get_frame(block);
-		mem_heap_t*	heap		= NULL;
-		rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
-		rec_t*		rec		= page + offset;
-		rec_offs_init(offsets_);
-
-		page_cur_position(rec, block, &cursor);
-		ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
-
-		page_cur_delete_rec(&cursor, index,
-				    rec_get_offsets(rec, index, offsets_,
-						    page_rec_is_leaf(rec)
-						    ? index->n_core_fields : 0,
-						    ULINT_UNDEFINED, &heap),
-				    mtr);
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-	}
-
-	return(ptr);
+  ut_ad(page_align(rec) == block.frame);
+  ut_ad(!block.page.zip.data);
+  const rec_t *free= page_header_get_ptr(block.frame, PAGE_FREE);
+
+  const uint16_t n_heap= uint16_t(page_header_get_field(block.frame,
+                                                        PAGE_N_HEAP) - 1);
+  ut_ad(page_get_n_recs(block.frame) < (n_heap & 0x7fff));
+  const bool deleting_top= n_heap == ((n_heap & 0x8000)
+                                      ? (rec_get_heap_no_new(rec) | 0x8000)
+                                      : rec_get_heap_no_old(rec));
+
+  if (deleting_top)
+  {
+    byte *page_heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
+                                              block.frame);
+    const uint16_t heap_top= mach_read_from_2(page_heap_top);
+    const size_t extra_savings= heap_top - page_offset(rec + data_size);
+    ut_ad(extra_savings < heap_top);
+
+    /* When deleting the topmost heap record, do not add it to PAGE_FREE.
+    Instead, decrement PAGE_HEAP_TOP and PAGE_N_HEAP. */
+    mach_write_to_2(page_heap_top, page_offset(rec - extra_size));
+    mach_write_to_2(my_assume_aligned<2>(page_heap_top + 2), n_heap);
+    static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility");
+    if (extra_savings)
+    {
+      byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+                                               block.frame);
+      uint16_t garbage= mach_read_from_2(page_garbage);
+      ut_ad(garbage >= extra_savings);
+      mach_write_to_2(page_garbage, garbage - extra_savings);
+    }
+  }
+  else
+  {
+    byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+                                          block.frame);
+    byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+                                             block.frame);
+    mach_write_to_2(page_free, page_offset(rec));
+    mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) +
+                    extra_size + data_size);
+  }
+
+  memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + block.frame, 0, 2);
+  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                          block.frame);
+  mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) - 1);
+
+  const byte* const end= rec + data_size;
+
+  if (!deleting_top)
+  {
+    uint16_t next= free
+      ? ((n_heap & 0x8000)
+         ? static_cast<uint16_t>(free - rec)
+         : static_cast<uint16_t>(free - block.frame))
+      : uint16_t{0};
+    mach_write_to_2(rec - REC_NEXT, next);
+  }
+  else
+    rec-= extra_size;
+
+  memset(rec, 0, end - rec);
 }
 
 /***********************************************************//**
@@ -2307,13 +2132,9 @@ page_cur_delete_rec(
 	const dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_offs*		offsets,/*!< in: rec_get_offsets(
 					cursor->rec, index) */
-	mtr_t*			mtr)	/*!< in: mini-transaction handle
-					or NULL */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
 {
 	page_dir_slot_t* cur_dir_slot;
-	page_dir_slot_t* prev_slot;
-	page_t*		page;
-	page_zip_des_t*	page_zip;
 	rec_t*		current_rec;
 	rec_t*		prev_rec	= NULL;
 	rec_t*		next_rec;
@@ -2321,36 +2142,31 @@ page_cur_delete_rec(
 	ulint		cur_n_owned;
 	rec_t*		rec;
 
-	page = page_cur_get_page(cursor);
-	page_zip = page_cur_get_page_zip(cursor);
-
 	/* page_zip_validate() will fail here when
 	btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
-	Then, both "page_zip" and "page" would have the min-rec-mark
-	set on the smallest user record, but "page" would additionally
+	Then, both "page_zip" and "block->frame" would have the min-rec-mark
+	set on the smallest user record, but "block->frame" would additionally
 	have it set on the smallest-but-one record.  Because sloppy
 	page_zip_validate_low() only ignores min-rec-flag differences
 	in the smallest user record, it cannot be used here either. */
 
 	current_rec = cursor->rec;
+	buf_block_t* const block = cursor->block;
 	ut_ad(rec_offs_validate(current_rec, index, offsets));
-	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
-	ut_ad(fil_page_index_page_check(page));
-	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) == index->id
-	      || index->is_dummy
-	      || (mtr ? mtr->is_inside_ibuf() : dict_index_is_ibuf(index)));
-	ut_ad(!mtr || mtr->is_named_space(index->table->space));
+	ut_ad(!!page_is_comp(block->frame) == index->table->not_redundant());
+	ut_ad(fil_page_index_page_check(block->frame));
+	ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame)
+	      == index->id
+	      || mtr->is_inside_ibuf());
+	ut_ad(mtr->is_named_space(index->table->space));
 
 	/* The record must not be the supremum or infimum record. */
 	ut_ad(page_rec_is_user_rec(current_rec));
 
-	if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()
+	if (page_get_n_recs(block->frame) == 1
 	    && !rec_is_alter_metadata(current_rec, *index)) {
-		/* Empty the page, unless we are applying the redo log
-		during crash recovery. During normal operation, the
-		page_create_empty() gets logged as one of MLOG_PAGE_CREATE,
-		MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */
-		ut_ad(page_is_leaf(page));
+		/* Empty the page. */
+		ut_ad(page_is_leaf(block->frame));
 		/* Usually, this should be the root page,
 		and the whole index tree should become empty.
 		However, this could also be a call in
@@ -2366,35 +2182,20 @@ page_cur_delete_rec(
 	/* Save to local variables some data associated with current_rec */
 	cur_slot_no = page_dir_find_owner_slot(current_rec);
 	ut_ad(cur_slot_no > 0);
-	cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no);
+	cur_dir_slot = page_dir_get_nth_slot(block->frame, cur_slot_no);
 	cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
 
-	/* 0. Write the log record */
-	if (mtr != 0) {
-		page_cur_delete_rec_write_log(current_rec, index, mtr);
-	}
-
-	/* 1. Reset the last insert info in the page header and increment
-	the modify clock for the frame */
-
-	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
-
-	/* The page gets invalid for optimistic searches: increment the
-	frame modify clock only if there is an mini-transaction covering
-	the change. During IMPORT we allocate local blocks that are not
-	part of the buffer pool. */
+	/* The page gets invalid for btr_pcur_restore_pos().
+	We avoid invoking buf_block_modify_clock_inc(block) because its
+	consistency checks would fail for the dummy block that is being
+	used during IMPORT TABLESPACE. */
+	block->modify_clock++;
 
-	if (mtr != 0) {
-		buf_block_modify_clock_inc(page_cur_get_block(cursor));
-	}
-
-	/* 2. Find the next and the previous record. Note that the cursor is
+	/* Find the next and the previous record. Note that the cursor is
 	left at the next record. */
 
-	ut_ad(cur_slot_no > 0);
-	prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1);
-
-	rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+	rec = const_cast<rec_t*>
+		(page_dir_slot_get_rec(cur_dir_slot + PAGE_DIR_SLOT_SIZE));
 
 	/* rec now points to the record of the previous directory slot. Look
 	for the immediate predecessor of current_rec in a loop. */
@@ -2407,36 +2208,752 @@ page_cur_delete_rec(
 	page_cur_move_to_next(cursor);
 	next_rec = cursor->rec;
 
-	/* 3. Remove the record from the linked list of records */
-
-	page_rec_set_next(prev_rec, next_rec);
-
-	/* 4. If the deleted record is pointed to by a dir slot, update the
+	/* Remove the record from the linked list of records */
+	/* If the deleted record is pointed to by a dir slot, update the
 	record pointer in slot. In the following if-clause we assume that
 	prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED
 	>= 2. */
+	/* Update the number of owned records of the slot */
 
 	compile_time_assert(PAGE_DIR_SLOT_MIN_N_OWNED >= 2);
 	ut_ad(cur_n_owned > 1);
 
-	if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) {
-		page_dir_slot_set_rec(cur_dir_slot, prev_rec);
+	rec_t* slot_rec = const_cast<rec_t*>
+		(page_dir_slot_get_rec(cur_dir_slot));
+
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+		ut_ad(page_is_comp(block->frame));
+		if (current_rec == slot_rec) {
+			page_zip_rec_set_owned(block, prev_rec, 1, mtr);
+			page_zip_rec_set_owned(block, slot_rec, 0, mtr);
+			slot_rec = prev_rec;
+			mach_write_to_2(cur_dir_slot, page_offset(slot_rec));
+		} else if (cur_n_owned == 1
+			   && !page_rec_is_supremum(slot_rec)) {
+			page_zip_rec_set_owned(block, slot_rec, 0, mtr);
+		}
+
+		mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+				(next_rec - prev_rec));
+		slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>(
+			(slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK)
+			| (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+
+		page_header_reset_last_insert(block, mtr);
+		page_zip_dir_delete(block, rec, index, offsets,
+				    page_header_get_ptr(block->frame,
+							PAGE_FREE),
+				    mtr);
+		if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+			page_zip_dir_balance_slot(block, cur_slot_no, mtr);
+		}
+		return;
 	}
 
-	/* 5. Update the number of owned records of the slot */
+	if (current_rec == slot_rec) {
+		slot_rec = prev_rec;
+		mach_write_to_2(cur_dir_slot, page_offset(slot_rec));
+	}
 
-	page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
+	const size_t data_size = rec_offs_data_size(offsets);
+	const size_t extra_size = rec_offs_extra_size(offsets);
 
-	/* 6. Free the memory occupied by the record */
-	page_mem_free(page, page_zip, current_rec, index, offsets);
+	if (page_is_comp(block->frame)) {
+		mtr->page_delete(*block, page_offset(prev_rec)
+				 - PAGE_NEW_INFIMUM,
+				 extra_size - REC_N_NEW_EXTRA_BYTES,
+				 data_size);
+		mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+				(next_rec - prev_rec));
+		slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>(
+			(slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK)
+			| (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+	} else {
+		mtr->page_delete(*block, page_offset(prev_rec)
+				 - PAGE_OLD_INFIMUM);
+		memcpy(prev_rec - REC_NEXT, current_rec - REC_NEXT, 2);
+		slot_rec[-REC_OLD_N_OWNED] = static_cast<byte>(
+			(slot_rec[-REC_OLD_N_OWNED] & ~REC_N_OWNED_MASK)
+			| (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+	}
+
+	page_mem_free(*block, current_rec, data_size, extra_size);
 
-	/* 7. Now we have decremented the number of owned records of the slot.
+	/* Now we have decremented the number of owned records of the slot.
 	If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
 	slots. */
 
 	if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
-		page_dir_balance_slot(page, page_zip, cur_slot_no);
+		page_dir_balance_slot(*block, cur_slot_no);
 	}
+
+	ut_ad(page_is_comp(block->frame)
+	      ? page_simple_validate_new(block->frame)
+	      : page_simple_validate_old(block->frame));
+}
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block      B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr    encoded fixed-size header bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+                                 ulint prev, ulint enc_hdr,
+                                 size_t hdr_c, size_t data_c,
+                                 const void *data, size_t data_len)
+{
+  const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+  byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
+                                          block.frame);
+  const uint16_t h= mach_read_from_2(page_n_heap);
+  const page_id_t id(block.page.id());
+  if (UNIV_UNLIKELY(n_slots < 2 || h < n_slots || h < PAGE_HEAP_NO_USER_LOW ||
+                    h >= srv_page_size / REC_N_OLD_EXTRA_BYTES ||
+                    !fil_page_index_page_check(block.frame) ||
+                    page_get_page_no(block.frame) != id.page_no() ||
+                    mach_read_from_2(my_assume_aligned<2>
+                                     (PAGE_OLD_SUPREMUM - REC_NEXT +
+                                      block.frame))))
+  {
+corrupted:
+    ib::error() << (reuse
+                    ? "Not applying INSERT_REUSE_REDUNDANT"
+                    " due to corruption on "
+                    : "Not applying INSERT_HEAP_REDUNDANT"
+                    " due to corruption on ")
+                << id;
+    return true;
+  }
+
+  byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+  byte * const page_heap_top= my_assume_aligned<2>
+    (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
+  const byte *const heap_bot= &block.frame[PAGE_OLD_SUPREMUM_END];
+  byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+  if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
+    goto corrupted;
+  if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_OLD_SUPREMUM))
+    goto corrupted;
+  if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+                                     PAGE_OLD_INFIMUM))
+    goto corrupted;
+  rec_t * const prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+  if (!prev);
+  else if (UNIV_UNLIKELY(heap_bot + (REC_N_OLD_EXTRA_BYTES + 1) > prev_rec ||
+                         prev_rec > heap_top))
+    goto corrupted;
+  const ulint pn_fields= rec_get_bit_field_2(prev_rec, REC_OLD_N_FIELDS,
+                                             REC_OLD_N_FIELDS_MASK,
+                                             REC_OLD_N_FIELDS_SHIFT);
+  if (UNIV_UNLIKELY(pn_fields == 0 || pn_fields > REC_MAX_N_FIELDS))
+    goto corrupted;
+  const ulint pextra_size= REC_N_OLD_EXTRA_BYTES +
+    (rec_get_1byte_offs_flag(prev_rec) ? pn_fields : pn_fields * 2);
+  if (prev_rec == &block.frame[PAGE_OLD_INFIMUM]);
+  else if (UNIV_UNLIKELY(prev_rec - pextra_size < heap_bot))
+    goto corrupted;
+  if (UNIV_UNLIKELY(hdr_c && prev_rec - hdr_c < heap_bot))
+    goto corrupted;
+  const ulint pdata_size= rec_get_data_size_old(prev_rec);
+  if (UNIV_UNLIKELY(prev_rec + pdata_size > heap_top))
+    goto corrupted;
+  rec_t * const next_rec= block.frame + mach_read_from_2(prev_rec - REC_NEXT);
+  if (next_rec == block.frame + PAGE_OLD_SUPREMUM);
+  else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > next_rec ||
+                         next_rec > heap_top))
+    goto corrupted;
+  const bool is_short= (enc_hdr >> 2) & 1;
+  const ulint n_fields= (enc_hdr >> 3) + 1;
+  if (UNIV_UNLIKELY(n_fields > REC_MAX_N_FIELDS))
+    goto corrupted;
+  const ulint extra_size= REC_N_OLD_EXTRA_BYTES +
+    (is_short ? n_fields : n_fields * 2);
+  hdr_c+= REC_N_OLD_EXTRA_BYTES;
+  if (UNIV_UNLIKELY(hdr_c > extra_size))
+    goto corrupted;
+  if (UNIV_UNLIKELY(extra_size - hdr_c > data_len))
+    goto corrupted;
+  /* We buffer all changes to the record header locally, so that
+  we will avoid modifying the page before all consistency checks
+  have been fulfilled. */
+  alignas(2) byte insert_buf[REC_N_OLD_EXTRA_BYTES + REC_MAX_N_FIELDS * 2];
+
+  ulint n_owned;
+  rec_t *owner_rec= next_rec;
+  for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
+       !(n_owned= rec_get_n_owned_old(owner_rec)); )
+  {
+    owner_rec= block.frame + mach_read_from_2(owner_rec - REC_NEXT);
+    if (owner_rec == &block.frame[PAGE_OLD_SUPREMUM]);
+    else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > owner_rec ||
+                           owner_rec > heap_top))
+      goto corrupted;
+    if (!ns--)
+      goto corrupted; /* Corrupted (cyclic?) next-record list */
+  }
+
+  page_dir_slot_t *owner_slot= last_slot;
+
+  if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)
+    goto corrupted;
+  else
+  {
+    mach_write_to_2(insert_buf, owner_rec - block.frame);
+    static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+    const page_dir_slot_t * const first_slot=
+      page_dir_get_nth_slot(block.frame, 0);
+
+    while (memcmp_aligned<2>(owner_slot, insert_buf, 2))
+      if ((owner_slot+= 2) == first_slot)
+        goto corrupted;
+  }
+
+  memcpy(insert_buf, data, extra_size - hdr_c);
+  byte *insert_rec= &insert_buf[extra_size];
+  memcpy(insert_rec - hdr_c, prev_rec - hdr_c, hdr_c);
+  rec_set_bit_field_1(insert_rec, (enc_hdr & 3) << 4,
+                      REC_OLD_INFO_BITS, REC_INFO_BITS_MASK,
+                      REC_INFO_BITS_SHIFT);
+  rec_set_1byte_offs_flag(insert_rec, is_short);
+  rec_set_n_fields_old(insert_rec, n_fields);
+  rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+  const ulint data_size= rec_get_data_size_old(insert_rec);
+  if (UNIV_UNLIKELY(data_c > data_size))
+    goto corrupted;
+  if (UNIV_UNLIKELY(extra_size - hdr_c + data_size - data_c != data_len))
+    goto corrupted;
+
+  /* Perform final consistency checks and then apply the change to the page. */
+  byte *buf;
+  if (reuse)
+  {
+    byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+                                          block.frame);
+    rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+    if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > free_rec ||
+                      free_rec > heap_top))
+      goto corrupted;
+    const ulint fn_fields= rec_get_n_fields_old(free_rec);
+    const ulint fextra_size= REC_N_OLD_EXTRA_BYTES +
+      (rec_get_1byte_offs_flag(free_rec) ? fn_fields : fn_fields * 2);
+    if (UNIV_UNLIKELY(free_rec - fextra_size < heap_bot))
+      goto corrupted;
+    const ulint fdata_size= rec_get_data_size_old(free_rec);
+    if (UNIV_UNLIKELY(free_rec + fdata_size > heap_top))
+      goto corrupted;
+    if (UNIV_UNLIKELY(extra_size + data_size > fextra_size + fdata_size))
+      goto corrupted;
+    byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+    if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) <
+                      fextra_size + fdata_size))
+      goto corrupted;
+    buf= free_rec - fextra_size;
+    const rec_t *const next_free= block.frame +
+      mach_read_from_2(free_rec - REC_NEXT);
+    if (next_free == block.frame);
+    else if (UNIV_UNLIKELY(next_free < &heap_bot[REC_N_OLD_EXTRA_BYTES + 1] ||
+                           heap_top < next_free))
+      goto corrupted;
+    mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) -
+                    extra_size - data_size);
+    rec_set_bit_field_2(insert_rec, rec_get_heap_no_old(free_rec),
+                        REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    memcpy(page_free, free_rec - REC_NEXT, 2);
+  }
+  else
+  {
+    if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot))
+      goto corrupted;
+    rec_set_bit_field_2(insert_rec, h,
+                        REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    mach_write_to_2(page_n_heap, h + 1);
+    mach_write_to_2(page_heap_top,
+                    mach_read_from_2(page_heap_top) + extra_size + data_size);
+    buf= heap_top;
+  }
+
+  ut_ad(data_size - data_c == data_len - (extra_size - hdr_c));
+  byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+                                               block.frame);
+  const uint16_t last_insert= mach_read_from_2(page_last_insert);
+  memcpy(buf, insert_buf, extra_size);
+  buf+= extra_size;
+  mach_write_to_2(page_last_insert, buf - block.frame);
+  memcpy(prev_rec - REC_NEXT, page_last_insert, 2);
+  memcpy(buf, prev_rec, data_c);
+  memcpy(buf + data_c, static_cast<const byte*>(data) + (extra_size - hdr_c),
+         data_len - (extra_size - hdr_c));
+  rec_set_bit_field_1(owner_rec, n_owned + 1, REC_OLD_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+  /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+  if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+  {
+    byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+    byte *n_dir= my_assume_aligned<2>
+      (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+    if (UNIV_UNLIKELY(!last_insert))
+    {
+no_direction:
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+      memset(n_dir, 0, 2);
+    }
+    else if (block.frame + last_insert == prev_rec &&
+             (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+    {
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+      mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
+    }
+    else if (next_rec == block.frame + last_insert &&
+             (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+    {
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+      goto inc_dir;
+    }
+    else
+      goto no_direction;
+  }
+
+  /* Update PAGE_N_RECS. */
+  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                          block.frame);
+
+  mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+  if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+    page_dir_split_slot(block, owner_slot);
+  ut_ad(page_simple_validate_old(block.frame));
+  return false;
+}
+
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift      if reuse: (bytes the PAGE_FREE is moving) << 1 | is_negative
+@param enc_hdr_l  number of copied record header bytes, plus record type bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+                               ulint prev, ulint shift, ulint enc_hdr_l,
+                               size_t hdr_c, size_t data_c,
+                               const void *data, size_t data_len)
+{
+  const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+  byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
+                                          block.frame);
+  ulint h= mach_read_from_2(page_n_heap);
+  const page_id_t id(block.page.id());
+  if (UNIV_UNLIKELY(n_slots < 2 || h < (PAGE_HEAP_NO_USER_LOW | 0x8000) ||
+                    (h & 0x7fff) >= srv_page_size / REC_N_NEW_EXTRA_BYTES ||
+                    (h & 0x7fff) < n_slots ||
+                    !fil_page_index_page_check(block.frame) ||
+                    page_get_page_no(block.frame) != id.page_no() ||
+                    mach_read_from_2(my_assume_aligned<2>
+                                     (PAGE_NEW_SUPREMUM - REC_NEXT +
+                                      block.frame)) ||
+                    ((enc_hdr_l & REC_STATUS_INSTANT) &&
+                     !page_is_leaf(block.frame)) ||
+                    (enc_hdr_l >> 3) > data_len))
+  {
+corrupted:
+    ib::error() << (reuse
+                    ? "Not applying INSERT_REUSE_DYNAMIC"
+                    " due to corruption on "
+                    : "Not applying INSERT_HEAP_DYNAMIC"
+                    " due to corruption on ")
+                << id;
+    return true;
+  }
+
+  byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+  byte * const page_heap_top= my_assume_aligned<2>
+    (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
+  const byte *const heap_bot= &block.frame[PAGE_NEW_SUPREMUM_END];
+  byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+  if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
+    goto corrupted;
+  if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_NEW_SUPREMUM))
+    goto corrupted;
+  if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+                                     PAGE_NEW_INFIMUM))
+    goto corrupted;
+
+  uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
+  rec_t *prev_rec= block.frame + n;
+  n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
+  if (!prev);
+  else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > prev_rec ||
+                         prev_rec > heap_top))
+    goto corrupted;
+
+  rec_t * const next_rec= block.frame + n;
+  if (next_rec == block.frame + PAGE_NEW_SUPREMUM);
+  else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > next_rec ||
+                         next_rec > heap_top))
+    goto corrupted;
+
+  ulint n_owned;
+  rec_t *owner_rec= next_rec;
+  n= static_cast<uint16_t>(next_rec - block.frame);
+
+  for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
+       !(n_owned= rec_get_n_owned_new(owner_rec)); )
+  {
+    n= static_cast<uint16_t>(n + mach_read_from_2(owner_rec - REC_NEXT));
+    owner_rec= block.frame + n;
+    if (n == PAGE_NEW_SUPREMUM);
+    else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > owner_rec ||
+                           owner_rec > heap_top))
+      goto corrupted;
+    if (!ns--)
+      goto corrupted; /* Corrupted (cyclic?) next-record list */
+  }
+
+  page_dir_slot_t* owner_slot= last_slot;
+
+  if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)
+    goto corrupted;
+  else
+  {
+    static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+    alignas(2) byte slot_buf[2];
+    mach_write_to_2(slot_buf, owner_rec - block.frame);
+    const page_dir_slot_t * const first_slot=
+      page_dir_get_nth_slot(block.frame, 0);
+
+    while (memcmp_aligned<2>(owner_slot, slot_buf, 2))
+      if ((owner_slot+= 2) == first_slot)
+        goto corrupted;
+  }
+
+  const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_c + (enc_hdr_l >> 3);
+  const ulint data_size= data_c + data_len - (enc_hdr_l >> 3);
+
+  /* Perform final consistency checks and then apply the change to the page. */
+  byte *buf;
+  if (reuse)
+  {
+    byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+                                          block.frame);
+    rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+    if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > free_rec ||
+                      free_rec > heap_top))
+      goto corrupted;
+    buf= free_rec - extra_size;
+    if (shift & 1)
+      buf-= shift >> 1;
+    else
+      buf+= shift >> 1;
+
+    if (UNIV_UNLIKELY(heap_bot > buf ||
+                      &buf[extra_size + data_size] > heap_top))
+      goto corrupted;
+    byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+    if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) < extra_size + data_size))
+      goto corrupted;
+    if ((n= mach_read_from_2(free_rec - REC_NEXT)) != 0)
+    {
+      n= static_cast<uint16_t>(n + free_rec - block.frame);
+      if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+                        heap_top < block.frame + n))
+        goto corrupted;
+    }
+    mach_write_to_2(page_free, n);
+    mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) -
+                    (extra_size + data_size));
+    h= rec_get_heap_no_new(free_rec);
+  }
+  else
+  {
+    if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot))
+      goto corrupted;
+    mach_write_to_2(page_n_heap, h + 1);
+    h&= 0x7fff;
+    mach_write_to_2(page_heap_top,
+                    mach_read_from_2(page_heap_top) + extra_size + data_size);
+    buf= heap_top;
+  }
+
+  memcpy(buf, data, (enc_hdr_l >> 3));
+  buf+= enc_hdr_l >> 3;
+  data_len-= enc_hdr_l >> 3;
+  data= &static_cast<const byte*>(data)[enc_hdr_l >> 3];
+
+  memcpy(buf, prev_rec - REC_N_NEW_EXTRA_BYTES - hdr_c, hdr_c);
+  buf+= hdr_c;
+  *buf++= static_cast<byte>((enc_hdr_l & 3) << 4); /* info_bits; n_owned=0 */
+  *buf++= static_cast<byte>(h >> 5); /* MSB of heap number */
+  h= (h & ((1U << 5) - 1)) << 3;
+  static_assert(REC_STATUS_ORDINARY == 0, "compatibility");
+  static_assert(REC_STATUS_INSTANT == 4, "compatibility");
+  if (page_is_leaf(block.frame))
+    h|= enc_hdr_l & REC_STATUS_INSTANT;
+  else
+  {
+    ut_ad(!(enc_hdr_l & REC_STATUS_INSTANT)); /* Checked at the start */
+    h|= REC_STATUS_NODE_PTR;
+  }
+  *buf++= static_cast<byte>(h); /* LSB of heap number, and status */
+  static_assert(REC_NEXT == 2, "compatibility");
+  buf+= REC_NEXT;
+  mach_write_to_2(buf - REC_NEXT, static_cast<uint16_t>(next_rec - buf));
+  byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+                                               block.frame);
+  const uint16_t last_insert= mach_read_from_2(page_last_insert);
+  mach_write_to_2(page_last_insert, buf - block.frame);
+  mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(buf - prev_rec));
+  memcpy(buf, prev_rec, data_c);
+  buf+= data_c;
+  memcpy(buf, data, data_len);
+
+  rec_set_bit_field_1(owner_rec, n_owned + 1, REC_NEW_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+  /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+  if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+  {
+    byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+    byte *n_dir= my_assume_aligned<2>
+      (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+    if (UNIV_UNLIKELY(!last_insert))
+    {
+no_direction:
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+      memset(n_dir, 0, 2);
+    }
+    else if (block.frame + last_insert == prev_rec &&
+             (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+    {
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+      mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
+    }
+    else if (next_rec == block.frame + last_insert &&
+             (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+    {
+      *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+      goto inc_dir;
+    }
+    else
+      goto no_direction;
+  }
+
+  /* Update PAGE_N_RECS. */
+  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                          block.frame);
+
+  mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+  if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+    page_dir_split_slot(block, owner_slot);
+  ut_ad(page_simple_validate_new(block.frame));
+  return false;
+}
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block    B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev     byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev)
+{
+  const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+  ulint n_recs= page_get_n_recs(block.frame);
+  const page_id_t id(block.page.id());
+  /* Validate the page header before following any record pointers. */
+  if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
+                    !fil_page_index_page_check(block.frame) ||
+                    page_get_page_no(block.frame) != id.page_no() ||
+                    mach_read_from_2(my_assume_aligned<2>
+                                     (PAGE_OLD_SUPREMUM - REC_NEXT +
+                                      block.frame)) ||
+                    page_is_comp(block.frame)))
+  {
+corrupted: /* also the target of every later consistency check */
+    ib::error() << "Not applying DELETE_ROW_FORMAT_REDUNDANT"
+                   " due to corruption on " << id;
+    return true;
+  }
+
+  byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1); /* last slot */
+  rec_t *prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+  if (UNIV_UNLIKELY(prev_rec > slot))
+    goto corrupted;
+  uint16_t n= mach_read_from_2(prev_rec - REC_NEXT);
+  rec_t *rec= block.frame + n; /* the record to be deleted */
+  if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+                    slot < rec))
+    goto corrupted;
+  const ulint extra_size= REC_N_OLD_EXTRA_BYTES + rec_get_n_fields_old(rec) *
+    (rec_get_1byte_offs_flag(rec) ? 1 : 2); /* 1 or 2 bytes per field */
+  const ulint data_size= rec_get_data_size_old(rec);
+  if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + extra_size ||
+                    slot < rec + data_size))
+    goto corrupted;
+  /* Find the successor of rec, which may be the supremum. */
+  n= mach_read_from_2(rec - REC_NEXT);
+  rec_t *next= block.frame + n;
+  if (n == PAGE_OLD_SUPREMUM);
+  else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+                         slot < next))
+    goto corrupted;
+  /* Follow the next-record list from rec until a slot owner is found. */
+  rec_t *s= rec; /* becomes the record with nonzero n_owned */
+  ulint slot_owned;
+  for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_old(s)); )
+  {
+    n= mach_read_from_2(s - REC_NEXT);
+    s= block.frame + n;
+    if (n == PAGE_OLD_SUPREMUM);
+    else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+                           slot < s))
+      goto corrupted;
+    if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */
+      goto corrupted;
+  }
+  slot_owned--; /* rec will no longer be counted in this slot */
+
+  /* The first slot is always pointing to the infimum record.
+  Find the directory slot pointing to s. */
+  const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+  alignas(2) byte slot_offs[2];
+  mach_write_to_2(slot_offs, s - block.frame);
+  static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+  /* Scan the directory upwards; reaching first_slot means s has no slot. */
+  while (memcmp_aligned<2>(slot, slot_offs, 2))
+    if ((slot+= 2) == first_slot)
+      goto corrupted;
+  /* If rec itself owns the slot, make prev_rec the new owner. */
+  if (rec == s)
+  {
+    s= prev_rec;
+    mach_write_to_2(slot, s - block.frame);
+  }
+  /* Unlink rec: prev_rec inherits the next-record pointer of rec. */
+  memcpy(prev_rec - REC_NEXT, rec - REC_NEXT, 2);
+  s-= REC_OLD_N_OWNED; /* step back to the byte holding the n_owned bits */
+  *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) |
+                        slot_owned << REC_N_OWNED_SHIFT);
+  page_mem_free(block, rec, data_size, extra_size);
+  /* Rebalance the directory if the slot now owns too few records. */
+  if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
+    page_dir_balance_slot(block, (first_slot - slot) / 2);
+
+  ut_ad(page_simple_validate_old(block.frame));
+  return false;
+}
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+                               size_t hdr_size, size_t data_size)
+{
+  const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+  ulint n_recs= page_get_n_recs(block.frame);
+  const page_id_t id(block.page.id());
+  /* Validate the page header before following any record pointers. */
+  if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
+                    !fil_page_index_page_check(block.frame) ||
+                    page_get_page_no(block.frame) != id.page_no() ||
+                    mach_read_from_2(my_assume_aligned<2>
+                                     (PAGE_NEW_SUPREMUM - REC_NEXT +
+                                      block.frame)) ||
+                    !page_is_comp(block.frame)))
+  {
+corrupted: /* also the target of every later consistency check */
+    ib::error() << "Not applying DELETE_ROW_FORMAT_DYNAMIC"
+                   " due to corruption on " << id;
+    return true;
+  }
+
+  byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1); /* last slot */
+  uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
+  rec_t *prev_rec= block.frame + n;
+  if (UNIV_UNLIKELY(prev_rec > slot))
+    goto corrupted;
+  n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
+  rec_t *rec= block.frame + n; /* the record to be deleted */
+  if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+                    slot < rec))
+    goto corrupted;
+  const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_size;
+  if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + extra_size ||
+                    slot < rec + data_size))
+    goto corrupted;
+  n= static_cast<uint16_t>(n + mach_read_from_2(rec - REC_NEXT));
+  rec_t *next= block.frame + n; /* successor of rec; may be the supremum */
+  if (n == PAGE_NEW_SUPREMUM);
+  else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+                         slot < next))
+    goto corrupted;
+  /* Follow the next-record list from rec until a slot owner is found. */
+  rec_t *s= rec; /* becomes the record with nonzero n_owned */
+  n= static_cast<uint16_t>(rec - block.frame);
+  ulint slot_owned;
+  for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_new(s)); )
+  {
+    const uint16_t next= mach_read_from_2(s - REC_NEXT); /* shadows outer next */
+    if (UNIV_UNLIKELY(next < REC_N_NEW_EXTRA_BYTES ||
+                      next > static_cast<uint16_t>(-REC_N_NEW_EXTRA_BYTES)))
+      goto corrupted; /* offset (mod 65536) is closer than the extra bytes */
+    n= static_cast<uint16_t>(n + next);
+    s= block.frame + n;
+    if (n == PAGE_NEW_SUPREMUM);
+    else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+                           slot < s))
+      goto corrupted;
+    if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */
+      goto corrupted;
+  }
+  slot_owned--; /* rec will no longer be counted in this slot */
+
+  /* The first slot is always pointing to the infimum record.
+  Find the directory slot pointing to s. */
+  const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+  alignas(2) byte slot_offs[2];
+  mach_write_to_2(slot_offs, s - block.frame);
+  static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+  /* Scan the directory upwards; reaching first_slot means s has no slot. */
+  while (memcmp_aligned<2>(slot, slot_offs, 2))
+    if ((slot+= 2) == first_slot)
+      goto corrupted;
+  /* If rec itself owns the slot, make prev_rec the new owner. */
+  if (rec == s)
+  {
+    s= prev_rec;
+    mach_write_to_2(slot, s - block.frame);
+  }
+  /* Unlink rec: store in prev_rec the offset from it to the successor. */
+  mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(next - prev_rec));
+  s-= REC_NEW_N_OWNED; /* step back to the byte holding the n_owned bits */
+  *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) |
+                        slot_owned << REC_N_OWNED_SHIFT);
+  page_mem_free(block, rec, data_size, extra_size);
+  /* Rebalance the directory if the slot now owns too few records. */
+  if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
+    page_dir_balance_slot(block, (first_slot - slot) / 2);
+
+  ut_ad(page_simple_validate_new(block.frame));
+  return false;
 }
 
 #ifdef UNIV_COMPILE_TEST_FUNCS
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index fc33b38beda..1b8b3cb339f 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -197,24 +197,16 @@ page_set_max_trx_id(
 	trx_id_t	trx_id,	/*!< in: transaction id */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction, or NULL */
 {
-	page_t*		page		= buf_block_get_frame(block);
-	ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-
-	/* It is not necessary to write this change to the redo log, as
-	during a database recovery we assume that the max trx id of every
-	page is the maximum trx id assigned before the crash. */
-
-	if (page_zip) {
-		mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
-		page_zip_write_header(page_zip,
-				      page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
-				      8, mtr);
-	} else if (mtr) {
-		mlog_write_ull(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
-			       trx_id, mtr);
-	} else {
-		mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
-	}
+  ut_ad(!mtr || mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!page_zip || page_zip == &block->page.zip);
+  static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+  byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID +
+                                         PAGE_HEADER + block->frame);
+
+  mtr->write<8>(*block, max_trx_id, trx_id);
+  if (UNIV_LIKELY_NULL(page_zip))
+    memcpy_aligned<8>(&page_zip->data[PAGE_MAX_TRX_ID + PAGE_HEADER],
+                      max_trx_id, 8);
 }
 
 /** Persist the AUTO_INCREMENT value on a clustered index root page.
@@ -228,51 +220,23 @@ page_set_max_trx_id(
 void
 page_set_autoinc(
 	buf_block_t*		block,
-	const dict_index_t*	index MY_ATTRIBUTE((unused)),
 	ib_uint64_t		autoinc,
 	mtr_t*			mtr,
 	bool			reset)
 {
-	ut_ad(mtr_memo_contains_flagged(
-		      mtr, block, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
-	ut_ad(index->is_primary());
-	ut_ad(index->page == block->page.id.page_no());
-	ut_ad(index->table->space_id == block->page.id.space());
-
-	byte*	field = PAGE_HEADER + PAGE_ROOT_AUTO_INC
-		+ buf_block_get_frame(block);
-	if (!reset && mach_read_from_8(field) >= autoinc) {
-		/* nothing to update */
-	} else if (page_zip_des_t* page_zip = buf_block_get_page_zip(block)) {
-		mach_write_to_8(field, autoinc);
-		page_zip_write_header(page_zip, field, 8, mtr);
-	} else {
-		mlog_write_ull(field, autoinc, mtr);
-	}
-}
-
-/**********************************************************//**
-Writes a log record of page creation. */
-UNIV_INLINE
-void
-page_create_write_log(
-/*==================*/
-	buf_frame_t*	frame,	/*!< in: a buffer frame where the page is
-				created */
-	mtr_t*		mtr,	/*!< in: mini-transaction handle */
-	ibool		comp,	/*!< in: TRUE=compact page format */
-	bool		is_rtree) /*!< in: whether it is R-tree */
-{
-	mlog_id_t	type;
-
-	if (is_rtree) {
-		type = comp ? MLOG_COMP_PAGE_CREATE_RTREE
-			    : MLOG_PAGE_CREATE_RTREE;
-	} else {
-		type = comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE;
-	}
-
-	mlog_write_initial_log_record(frame, type, mtr);
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+
+  byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC +
+                                    block->frame);
+  ib_uint64_t old= mach_read_from_8(field);
+  if (old == autoinc || (old > autoinc && !reset))
+    return; /* nothing to update */
+
+  mtr->write<8>(*block, field, autoinc);
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->page.zip.data,
+                      field, 8);
 }
 
 /** The page infimum and supremum of an empty page in ROW_FORMAT=REDUNDANT */
@@ -307,17 +271,10 @@ static const byte infimum_supremum_compact[] = {
 	's', 'u', 'p', 'r', 'e', 'm', 'u', 'm'
 };
 
-/**********************************************************//**
-The index page creation function.
-@return pointer to the page */
-static
-page_t*
-page_create_low(
-/*============*/
-	buf_block_t*	block,		/*!< in: a buffer block where the
-					page is created */
-	ulint		comp,		/*!< in: nonzero=compact page format */
-	bool		is_rtree)	/*!< in: if it is an R-Tree page */
+/** Create an index page.
+@param[in,out]	block	buffer block
+@param[in]	comp	whether to use the compact page format */
+void page_create_low(const buf_block_t* block, bool comp)
 {
 	page_t*		page;
 
@@ -326,15 +283,9 @@ page_create_low(
 	compile_time_assert(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE
 			    <= PAGE_DATA);
 
-	buf_block_modify_clock_inc(block);
+	page = block->frame;
 
-	page = buf_block_get_frame(block);
-
-	if (is_rtree) {
-		fil_page_set_type(page, FIL_PAGE_RTREE);
-	} else {
-		fil_page_set_type(page, FIL_PAGE_INDEX);
-	}
+	fil_page_set_type(page, FIL_PAGE_INDEX);
 
 	memset(page + PAGE_HEADER, 0, PAGE_HEADER_PRIV_END);
 	page[PAGE_HEADER + PAGE_N_DIR_SLOTS + 1] = 2;
@@ -367,46 +318,22 @@ page_create_low(
 		page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1]
 			= PAGE_OLD_INFIMUM;
 	}
-
-	return(page);
-}
-
-/** Parses a redo log record of creating a page.
-@param[in,out]	block	buffer block, or NULL
-@param[in]	comp	nonzero=compact page format
-@param[in]	is_rtree whether it is rtree page */
-void
-page_parse_create(
-	buf_block_t*	block,
-	ulint		comp,
-	bool		is_rtree)
-{
-	if (block != NULL) {
-		page_create_low(block, comp, is_rtree);
-	}
 }
 
-/**********************************************************//**
-Create an uncompressed B-tree or R-tree index page.
-@return pointer to the page */
-page_t*
-page_create(
-/*========*/
-	buf_block_t*	block,		/*!< in: a buffer block where the
-					page is created */
-	mtr_t*		mtr,		/*!< in: mini-transaction handle */
-	ulint		comp,		/*!< in: nonzero=compact page format */
-	bool		is_rtree)	/*!< in: whether it is a R-Tree page */
+/** Create an uncompressed index page.
+@param[in,out]	block	buffer block
+@param[in,out]	mtr	mini-transaction
+@param[in]	comp	set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp)
 {
-	ut_ad(mtr->is_named_space(block->page.id.space()));
-	page_create_write_log(buf_block_get_frame(block), mtr, comp, is_rtree);
-	return(page_create_low(block, comp, is_rtree));
+  mtr->page_create(*block, comp);
+  buf_block_modify_clock_inc(block);
+  page_create_low(block, comp);
 }
 
 /**********************************************************//**
-Create a compressed B-tree index page.
-@return pointer to the page */
-page_t*
+Create a compressed B-tree index page. */
+void
 page_create_zip(
 /*============*/
 	buf_block_t*		block,		/*!< in/out: a buffer frame
@@ -419,11 +346,8 @@ page_create_zip(
 	mtr_t*			mtr)		/*!< in/out: mini-transaction
 						handle */
 {
-	page_t*			page;
-	page_zip_des_t*		page_zip = buf_block_get_page_zip(block);
-
 	ut_ad(block);
-	ut_ad(page_zip);
+	ut_ad(buf_block_get_page_zip(block));
 	ut_ad(dict_table_is_comp(index->table));
 
 	/* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for
@@ -442,17 +366,24 @@ page_create_zip(
 	      || !dict_index_is_sec_or_ibuf(index)
 	      || index->table->is_temporary());
 
-	page = page_create_low(block, TRUE, dict_index_is_spatial(index));
-	mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level);
-	mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id);
+	buf_block_modify_clock_inc(block);
+	page_create_low(block, true);
+
+	if (index->is_spatial()) {
+		mach_write_to_2(FIL_PAGE_TYPE + block->frame, FIL_PAGE_RTREE);
+		memset(block->frame + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+		memset(block->page.zip.data + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+	}
 
-	if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) {
+	mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + block->frame, level);
+	mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + block->frame,
+			max_trx_id);
+
+	if (!page_zip_compress(block, index, page_zip_level, mtr)) {
 		/* The compression of a newly created
 		page should always succeed. */
 		ut_error;
 	}
-
-	return(page);
 }
 
 /**********************************************************//**
@@ -465,12 +396,11 @@ page_create_empty(
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	trx_id_t	max_trx_id;
-	page_t*		page	= buf_block_get_frame(block);
 	page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
 
-	ut_ad(fil_page_index_page_check(page));
+	ut_ad(fil_page_index_page_check(block->frame));
 	ut_ad(!index->is_dummy);
-	ut_ad(block->page.id.space() == index->table->space->id);
+	ut_ad(block->page.id().space() == index->table->space->id);
 
 	/* Multiple transactions cannot simultaneously operate on the
 	same temp-table in parallel.
@@ -478,12 +408,12 @@ page_create_empty(
 	for MVCC. */
 	if (dict_index_is_sec_or_ibuf(index)
 	    && !index->table->is_temporary()
-	    && page_is_leaf(page)) {
-		max_trx_id = page_get_max_trx_id(page);
+	    && page_is_leaf(block->frame)) {
+		max_trx_id = page_get_max_trx_id(block->frame);
 		ut_ad(max_trx_id);
-	} else if (block->page.id.page_no() == index->page) {
+	} else if (block->page.id().page_no() == index->page) {
 		/* Preserve PAGE_ROOT_AUTO_INC. */
-		max_trx_id = page_get_max_trx_id(page);
+		max_trx_id = page_get_max_trx_id(block->frame);
 	} else {
 		max_trx_id = 0;
 	}
@@ -491,15 +421,27 @@ page_create_empty(
 	if (page_zip) {
 		ut_ad(!index->table->is_temporary());
 		page_create_zip(block, index,
-				page_header_get_field(page, PAGE_LEVEL),
+				page_header_get_field(block->frame,
+						      PAGE_LEVEL),
 				max_trx_id, mtr);
 	} else {
-		page_create(block, mtr, page_is_comp(page),
-			    dict_index_is_spatial(index));
+		page_create(block, mtr, index->table->not_redundant());
+		if (index->is_spatial()) {
+			static_assert(((FIL_PAGE_INDEX & 0xff00)
+				       | byte(FIL_PAGE_RTREE))
+				      == FIL_PAGE_RTREE, "compatibility");
+			mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+				      byte(FIL_PAGE_RTREE));
+			if (mach_read_from_8(block->frame
+					     + FIL_RTREE_SPLIT_SEQ_NUM)) {
+				mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+					    8, 0);
+			}
+		}
 
 		if (max_trx_id) {
-			mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID + page,
-				       max_trx_id, mtr);
+			mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+				      + block->frame, max_trx_id);
 		}
 	}
 }
@@ -523,7 +465,7 @@ page_copy_rec_list_end_no_locks(
 {
 	page_t*		new_page	= buf_block_get_frame(new_block);
 	page_cur_t	cur1;
-	rec_t*		cur2;
+	page_cur_t	cur2;
 	mem_heap_t*	heap		= NULL;
 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
 	rec_offs*	offsets		= offsets_;
@@ -543,7 +485,7 @@ page_copy_rec_list_end_no_locks(
 	const ulint n_core = page_is_leaf(block->frame)
 		? index->n_core_fields : 0;
 
-	cur2 = page_get_infimum_rec(buf_block_get_frame(new_block));
+	page_cur_set_before_first(new_block, &cur2);
 
 	/* Copy records from the original page to the new page */
 
@@ -551,18 +493,18 @@ page_copy_rec_list_end_no_locks(
 		rec_t*	ins_rec;
 		offsets = rec_get_offsets(cur1.rec, index, offsets, n_core,
 					  ULINT_UNDEFINED, &heap);
-		ins_rec = page_cur_insert_rec_low(cur2, index,
+		ins_rec = page_cur_insert_rec_low(&cur2, index,
 						  cur1.rec, offsets, mtr);
 		if (UNIV_UNLIKELY(!ins_rec)) {
 			ib::fatal() << "Rec offset " << page_offset(rec)
 				<< ", cur1 offset " << page_offset(cur1.rec)
-				<< ", cur2 offset " << page_offset(cur2);
+				<< ", cur2 offset " << page_offset(cur2.rec);
 		}
 
 		page_cur_move_to_next(&cur1);
 		ut_ad(!(rec_get_info_bits(cur1.rec, page_is_comp(new_page))
 			& REC_INFO_MIN_REC_FLAG));
-		cur2 = ins_rec;
+		cur2.rec = ins_rec;
 	}
 
 	if (UNIV_LIKELY_NULL(heap)) {
@@ -593,12 +535,13 @@ page_copy_rec_list_end(
 {
 	page_t*		new_page	= buf_block_get_frame(new_block);
 	page_zip_des_t*	new_page_zip	= buf_block_get_page_zip(new_block);
-	page_t*		page		= page_align(rec);
+	page_t*		page		= block->frame;
 	rec_t*		ret		= page_rec_get_next(
 		page_get_infimum_rec(new_page));
 	ulint		num_moved	= 0;
 	rtr_rec_move_t*	rec_move	= NULL;
 	mem_heap_t*	heap		= NULL;
+	ut_ad(page_align(rec) == page);
 
 #ifdef UNIV_ZIP_DEBUG
 	if (new_page_zip) {
@@ -618,36 +561,37 @@ page_copy_rec_list_end(
 	/* Here, "ret" may be pointing to a user record or the
 	predefined supremum record. */
 
-	mtr_log_t	log_mode = MTR_LOG_NONE;
-
-	if (new_page_zip) {
-		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
-	}
+	const mtr_log_t log_mode = new_page_zip
+		? mtr->set_log_mode(MTR_LOG_NONE) : MTR_LOG_NONE;
+	const bool was_empty = page_dir_get_n_heap(new_page)
+		== PAGE_HEAP_NO_USER_LOW;
+	alignas(2) byte h[PAGE_N_DIRECTION + 2 - PAGE_LAST_INSERT];
+	memcpy_aligned<2>(h, PAGE_HEADER + PAGE_LAST_INSERT + new_page,
+			  sizeof h);
 
-	if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) {
-		page_copy_rec_list_end_to_created_page(new_page, rec,
-						       index, mtr);
-	} else {
-		if (dict_index_is_spatial(index)) {
-			ulint	max_to_move = page_get_n_recs(
-						buf_block_get_frame(block));
-			heap = mem_heap_create(256);
+	if (index->is_spatial()) {
+		ulint	max_to_move = page_get_n_recs(
+			buf_block_get_frame(block));
+		heap = mem_heap_create(256);
 
-			rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc(
-					heap,
-					sizeof (*rec_move) * max_to_move));
+		rec_move = static_cast<rtr_rec_move_t*>(
+			mem_heap_alloc(heap, max_to_move * sizeof *rec_move));
 
-			/* For spatial index, we need to insert recs one by one
-			to keep recs ordered. */
-			rtr_page_copy_rec_list_end_no_locks(new_block,
-							    block, rec, index,
-							    heap, rec_move,
-							    max_to_move,
-							    &num_moved,
-							    mtr);
-		} else {
-			page_copy_rec_list_end_no_locks(new_block, block, rec,
-							index, mtr);
+		/* For spatial index, we need to insert recs one by one
+		to keep recs ordered. */
+		rtr_page_copy_rec_list_end_no_locks(new_block,
+						    block, rec, index,
+						    heap, rec_move,
+						    max_to_move,
+						    &num_moved,
+						    mtr);
+	} else {
+		page_copy_rec_list_end_no_locks(new_block, block, rec,
+						index, mtr);
+		if (was_empty) {
+			mtr->memcpy<mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+						      + PAGE_LAST_INSERT
+						      + new_page, h, sizeof h);
 		}
 	}
 
@@ -661,6 +605,9 @@ page_copy_rec_list_end(
 	if (dict_index_is_sec_or_ibuf(index)
 	    && page_is_leaf(page)
 	    && !index->table->is_temporary()) {
+		ut_ad(!was_empty || page_dir_get_n_heap(new_page)
+		      == PAGE_HEAP_NO_USER_LOW
+		      + page_header_get_field(new_page, PAGE_N_RECS));
 		page_update_max_trx_id(new_block, NULL,
 				       page_get_max_trx_id(page), mtr);
 	}
@@ -668,7 +615,7 @@ page_copy_rec_list_end(
 	if (new_page_zip) {
 		mtr_set_log_mode(mtr, log_mode);
 
-		if (!page_zip_compress(new_page_zip, new_page, index,
+		if (!page_zip_compress(new_block, index,
 				       page_zip_level, mtr)) {
 			/* Before trying to reorganize the page,
 			store the number of preceding records on the page. */
@@ -681,7 +628,8 @@ page_copy_rec_list_end(
 			that is smaller than "ret"). */
 			ut_a(ret_pos > 0);
 
-			if (!page_zip_reorganize(new_block, index, mtr)) {
+			if (!page_zip_reorganize(new_block, index,
+						 page_zip_level, mtr)) {
 
 				if (!page_zip_decompress(new_page_zip,
 							 new_page, FALSE)) {
@@ -697,11 +645,7 @@ page_copy_rec_list_end(
 			} else {
 				/* The page was reorganized:
 				Seek to ret_pos. */
-				ret = new_page + PAGE_NEW_INFIMUM;
-
-				do {
-					ret = rec_get_next_ptr(ret, TRUE);
-				} while (--ret_pos);
+				ret = page_rec_get_nth(new_page, ret_pos);
 			}
 		}
 	}
@@ -750,7 +694,7 @@ page_copy_rec_list_start(
 	page_t*		new_page	= buf_block_get_frame(new_block);
 	page_zip_des_t*	new_page_zip	= buf_block_get_page_zip(new_block);
 	page_cur_t	cur1;
-	rec_t*		cur2;
+	page_cur_t	cur2;
 	mem_heap_t*	heap		= NULL;
 	ulint		num_moved	= 0;
 	rtr_rec_move_t*	rec_move	= NULL;
@@ -776,7 +720,7 @@ page_copy_rec_list_start(
 	page_cur_set_before_first(block, &cur1);
 	page_cur_move_to_next(&cur1);
 
-	cur2 = ret;
+	page_cur_position(ret, new_block, &cur2);
 
 	const ulint n_core = page_rec_is_leaf(rec) ? index->n_core_fields : 0;
 
@@ -802,9 +746,10 @@ page_copy_rec_list_start(
 			offsets = rec_get_offsets(cur1.rec, index, offsets,
 						  n_core,
 						  ULINT_UNDEFINED, &heap);
-			cur2 = page_cur_insert_rec_low(cur2, index,
-						       cur1.rec, offsets, mtr);
-			ut_a(cur2);
+			cur2.rec = page_cur_insert_rec_low(&cur2, index,
+							   cur1.rec, offsets,
+							   mtr);
+			ut_a(cur2.rec);
 
 			page_cur_move_to_next(&cur1);
 			ut_ad(!(rec_get_info_bits(cur1.rec,
@@ -822,8 +767,9 @@ page_copy_rec_list_start(
 	for MVCC. */
 	if (n_core && dict_index_is_sec_or_ibuf(index)
 	    && !index->table->is_temporary()) {
-		page_update_max_trx_id(new_block, NULL,
-				       page_get_max_trx_id(page_align(rec)),
+		page_update_max_trx_id(new_block,
+				       new_page_zip,
+				       page_get_max_trx_id(block->frame),
 				       mtr);
 	}
 
@@ -833,7 +779,7 @@ page_copy_rec_list_start(
 		DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail",
 				goto zip_reorganize;);
 
-		if (!page_zip_compress(new_page_zip, new_page, index,
+		if (!page_zip_compress(new_block, index,
 				       page_zip_level, mtr)) {
 			ulint	ret_pos;
 #ifndef DBUG_OFF
@@ -849,7 +795,8 @@ zip_reorganize:
 			ret_pos == 0. */
 
 			if (UNIV_UNLIKELY
-			    (!page_zip_reorganize(new_block, index, mtr))) {
+			    (!page_zip_reorganize(new_block, index,
+						  page_zip_level, mtr))) {
 
 				if (UNIV_UNLIKELY
 				    (!page_zip_decompress(new_page_zip,
@@ -888,87 +835,6 @@ zip_reorganize:
 	return(ret);
 }
 
-/**********************************************************//**
-Writes a log record of a record list end or start deletion. */
-UNIV_INLINE
-void
-page_delete_rec_list_write_log(
-/*===========================*/
-	rec_t*		rec,	/*!< in: record on page */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mlog_id_t	type,	/*!< in: operation type:
-				MLOG_LIST_END_DELETE, ... */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	byte*	log_ptr;
-	ut_ad(type == MLOG_LIST_END_DELETE
-	      || type == MLOG_LIST_START_DELETE
-	      || type == MLOG_COMP_LIST_END_DELETE
-	      || type == MLOG_COMP_LIST_START_DELETE);
-
-	log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2);
-	if (log_ptr) {
-		/* Write the parameter as a 2-byte ulint */
-		mach_write_to_2(log_ptr, page_offset(rec));
-		mlog_close(mtr, log_ptr + 2);
-	}
-}
-
-/**********************************************************//**
-Parses a log record of a record list end or start deletion.
-@return end of log record or NULL */
-byte*
-page_parse_delete_rec_list(
-/*=======================*/
-	mlog_id_t	type,	/*!< in: MLOG_LIST_END_DELETE,
-				MLOG_LIST_START_DELETE,
-				MLOG_COMP_LIST_END_DELETE or
-				MLOG_COMP_LIST_START_DELETE */
-	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr,/*!< in: buffer end */
-	buf_block_t*	block,	/*!< in/out: buffer block or NULL */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr or NULL */
-{
-	page_t*	page;
-	ulint	offset;
-
-	ut_ad(type == MLOG_LIST_END_DELETE
-	      || type == MLOG_LIST_START_DELETE
-	      || type == MLOG_COMP_LIST_END_DELETE
-	      || type == MLOG_COMP_LIST_START_DELETE);
-
-	/* Read the record offset as a 2-byte ulint */
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	offset = mach_read_from_2(ptr);
-	ptr += 2;
-
-	if (!block) {
-
-		return(ptr);
-	}
-
-	page = buf_block_get_frame(block);
-
-	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
-
-	if (type == MLOG_LIST_END_DELETE
-	    || type == MLOG_COMP_LIST_END_DELETE) {
-		page_delete_rec_list_end(page + offset, block, index,
-					 ULINT_UNDEFINED, ULINT_UNDEFINED,
-					 mtr);
-	} else {
-		page_delete_rec_list_start(page + offset, block, index, mtr);
-	}
-
-	return(ptr);
-}
-
 /*************************************************************//**
 Deletes records from a page from a given record onward, including that record.
 The infimum and supremum records are not deleted. */
@@ -985,201 +851,202 @@ page_delete_rec_list_end(
 				delete, or ULINT_UNDEFINED if not known */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	page_dir_slot_t*slot;
-	ulint		slot_index;
-	rec_t*		last_rec;
-	rec_t*		prev_rec;
-	ulint		n_owned;
-	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
-	page_t*		page		= page_align(rec);
-	mem_heap_t*	heap		= NULL;
-	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
-	rec_offs*	offsets		= offsets_;
-	rec_offs_init(offsets_);
-
-	ut_ad(size == ULINT_UNDEFINED || size < srv_page_size);
-	ut_ad(!page_zip || page_rec_is_comp(rec));
+  ut_ad(size == ULINT_UNDEFINED || size < srv_page_size);
+  ut_ad(page_align(rec) == block->frame);
+  ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+  ut_a(!block->page.zip.data ||
+       page_zip_validate(&block->page.zip, block->frame, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (page_rec_is_supremum(rec)) {
-		ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
-		/* Nothing to do, there are no records bigger than the
-		page supremum. */
-		return;
-	}
-
-	if (recv_recovery_is_on()) {
-		/* If we are replaying a redo log record, we must
-		replay it exactly. Since MySQL 5.6.11, we should be
-		generating a redo log record for page creation if
-		the page would become empty. Thus, this branch should
-		only be executed when applying redo log that was
-		generated by an older version of MySQL. */
-	} else if (page_rec_is_infimum(rec)
-		   || n_recs == page_get_n_recs(page)) {
-delete_all:
-		/* We are deleting all records. */
-		page_create_empty(block, index, mtr);
-		return;
-	} else if (page_is_comp(page)) {
-		if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) {
-			/* We are deleting everything from the first
-			user record onwards. */
-			goto delete_all;
-		}
-	} else {
-		if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) {
-			/* We are deleting everything from the first
-			user record onwards. */
-			goto delete_all;
-		}
-	}
-
-	/* Reset the last insert info in the page header and increment
-	the modify clock for the frame */
-
-	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
-
-	/* The page gets invalid for optimistic searches: increment the
-	frame modify clock */
-
-	buf_block_modify_clock_inc(block);
-
-	page_delete_rec_list_write_log(rec, index, page_is_comp(page)
-				       ? MLOG_COMP_LIST_END_DELETE
-				       : MLOG_LIST_END_DELETE, mtr);
-
-	const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
-
-	if (page_zip) {
-		mtr_log_t	log_mode;
-
-		ut_a(page_is_comp(page));
-		/* Individual deletes are not logged */
-
-		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
-
-		do {
-			page_cur_t	cur;
-			page_cur_position(rec, block, &cur);
+  if (page_rec_is_supremum(rec))
+  {
+    ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
+    /* Nothing to do, there are no records bigger than the page supremum. */
+    return;
+  }
+
+  if (page_rec_is_infimum(rec) || n_recs == page_get_n_recs(block->frame) ||
+      rec == (page_is_comp(block->frame)
+              ? page_rec_get_next_low(block->frame + PAGE_NEW_INFIMUM, 1)
+              : page_rec_get_next_low(block->frame + PAGE_OLD_INFIMUM, 0)))
+  {
+    /* We are deleting all records. */
+    page_create_empty(block, index, mtr);
+    return;
+  }
+
+#if 0 // FIXME: consider deleting the last record as a special case
+  if (page_rec_is_last(rec))
+  {
+    page_cur_t cursor= { index, rec, offsets, block };
+    page_cur_delete_rec(&cursor, index, offsets, mtr);
+    return;
+  }
+#endif
 
-			offsets = rec_get_offsets(rec, index, offsets, n_core,
-						  ULINT_UNDEFINED, &heap);
-			rec = rec_get_next_ptr(rec, TRUE);
+  /* The page becomes invalid for optimistic searches */
+  buf_block_modify_clock_inc(block);
+
+  const ulint n_core= page_is_leaf(block->frame) ? index->n_core_fields : 0;
+  mem_heap_t *heap= nullptr;
+  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+  rec_offs *offsets= offsets_;
+  rec_offs_init(offsets_);
+
+#if 1 // FIXME: remove this, and write minimal amount of log!
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+  {
+    ut_ad(page_is_comp(block->frame));
+    do
+    {
+      page_cur_t cur;
+      page_cur_position(rec, block, &cur);
+      offsets= rec_get_offsets(rec, index, offsets, n_core,
+			       ULINT_UNDEFINED, &heap);
+      rec= rec_get_next_ptr(rec, TRUE);
 #ifdef UNIV_ZIP_DEBUG
-			ut_a(page_zip_validate(page_zip, page, index));
+      ut_a(page_zip_validate(&block->page.zip, block->frame, index));
 #endif /* UNIV_ZIP_DEBUG */
-			page_cur_delete_rec(&cur, index, offsets, mtr);
-		} while (page_offset(rec) != PAGE_NEW_SUPREMUM);
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-
-		/* Restore log mode */
-
-		mtr_set_log_mode(mtr, log_mode);
-		return;
-	}
-
-	prev_rec = page_rec_get_prev(rec);
-
-	last_rec = page_rec_get_prev(page_get_supremum_rec(page));
-
-	bool scrub = srv_immediate_scrub_data_uncompressed;
-	if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED) ||
-	    scrub) {
-		rec_t*		rec2		= rec;
-		/* Calculate the sum of sizes and the number of records */
-		size = 0;
-		n_recs = 0;
-
-		do {
-			ulint	s;
-			offsets = rec_get_offsets(rec2, index, offsets, n_core,
-						  ULINT_UNDEFINED, &heap);
-			s = rec_offs_size(offsets);
-			ut_ad(ulint(rec2 - page) + s
-			      - rec_offs_extra_size(offsets)
-			      < srv_page_size);
-			ut_ad(size + s < srv_page_size);
-			size += s;
-			n_recs++;
-
-			if (scrub) {
-				/* scrub record */
-				memset(rec2, 0, rec_offs_data_size(offsets));
-			}
-
-			rec2 = page_rec_get_next(rec2);
-		} while (!page_rec_is_supremum(rec2));
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-	}
-
-	ut_ad(size < srv_page_size);
-
-	/* Update the page directory; there is no need to balance the number
-	of the records owned by the supremum record, as it is allowed to be
-	less than PAGE_DIR_SLOT_MIN_N_OWNED */
-
-	if (page_is_comp(page)) {
-		rec_t*	rec2	= rec;
-		ulint	count	= 0;
-
-		while (rec_get_n_owned_new(rec2) == 0) {
-			count++;
-
-			rec2 = rec_get_next_ptr(rec2, TRUE);
-		}
-
-		ut_ad(rec_get_n_owned_new(rec2) > count);
-
-		n_owned = rec_get_n_owned_new(rec2) - count;
-		slot_index = page_dir_find_owner_slot(rec2);
-		ut_ad(slot_index > 0);
-		slot = page_dir_get_nth_slot(page, slot_index);
-	} else {
-		rec_t*	rec2	= rec;
-		ulint	count	= 0;
-
-		while (rec_get_n_owned_old(rec2) == 0) {
-			count++;
-
-			rec2 = rec_get_next_ptr(rec2, FALSE);
-		}
-
-		ut_ad(rec_get_n_owned_old(rec2) > count);
-
-		n_owned = rec_get_n_owned_old(rec2) - count;
-		slot_index = page_dir_find_owner_slot(rec2);
-		ut_ad(slot_index > 0);
-		slot = page_dir_get_nth_slot(page, slot_index);
-	}
-
-	page_dir_slot_set_rec(slot, page_get_supremum_rec(page));
-	page_dir_slot_set_n_owned(slot, NULL, n_owned);
-
-	page_dir_set_n_slots(page, NULL, slot_index + 1);
-
-	/* Remove the record chain segment from the record chain */
-	page_rec_set_next(prev_rec, page_get_supremum_rec(page));
-
-	/* Catenate the deleted chain segment to the page free list */
-
-	page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
-	page_header_set_ptr(page, NULL, PAGE_FREE, rec);
-
-	page_header_set_field(page, NULL, PAGE_GARBAGE, size
-			      + page_header_get_field(page, PAGE_GARBAGE));
+      page_cur_delete_rec(&cur, index, offsets, mtr);
+    }
+    while (page_offset(rec) != PAGE_NEW_SUPREMUM);
+
+    if (UNIV_LIKELY_NULL(heap))
+      mem_heap_free(heap);
+    return;
+  }
+#endif
 
-	ut_ad(page_get_n_recs(page) > n_recs);
-	page_header_set_field(page, NULL, PAGE_N_RECS,
-			      (ulint)(page_get_n_recs(page) - n_recs));
+  byte *prev_rec= page_rec_get_prev(rec);
+  byte *last_rec= page_rec_get_prev(page_get_supremum_rec(block->frame));
+
+  // FIXME: consider a special case of shrinking PAGE_HEAP_TOP
+
+  const bool scrub= srv_immediate_scrub_data_uncompressed;
+  if (scrub || size == ULINT_UNDEFINED || n_recs == ULINT_UNDEFINED)
+  {
+    rec_t *rec2= rec;
+    /* Calculate the sum of sizes and the number of records */
+    size= 0;
+    n_recs= 0;
+
+    do
+    {
+      offsets = rec_get_offsets(rec2, index, offsets, n_core,
+                                ULINT_UNDEFINED, &heap);
+      ulint s= rec_offs_size(offsets);
+      ut_ad(ulint(rec2 - block->frame) + s - rec_offs_extra_size(offsets) <
+            srv_page_size);
+      ut_ad(size + s < srv_page_size);
+      size+= s;
+      n_recs++;
+
+      if (scrub)
+        mtr->memset(block, page_offset(rec2), rec_offs_data_size(offsets), 0);
+
+      rec2 = page_rec_get_next(rec2);
+    }
+    while (!page_rec_is_supremum(rec2));
+
+    if (UNIV_LIKELY_NULL(heap))
+      mem_heap_free(heap);
+  }
+
+  ut_ad(size < srv_page_size);
+
+  ulint slot_index, n_owned;
+  {
+    const rec_t *owner_rec= rec;
+    ulint count= 0;
+
+    if (page_is_comp(block->frame))
+      while (!(n_owned= rec_get_n_owned_new(owner_rec)))
+      {
+        count++;
+	owner_rec= rec_get_next_ptr_const(owner_rec, TRUE);
+      }
+    else
+      while (!(n_owned= rec_get_n_owned_old(owner_rec)))
+      {
+        count++;
+	owner_rec= rec_get_next_ptr_const(owner_rec, FALSE);
+      }
+
+    ut_ad(n_owned > count);
+    n_owned-= count;
+    slot_index= page_dir_find_owner_slot(owner_rec);
+    ut_ad(slot_index > 0);
+  }
+
+  mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
+                                 (PAGE_N_DIR_SLOTS + PAGE_HEADER +
+                                  block->frame), slot_index + 1);
+  mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
+                                 (PAGE_LAST_INSERT + PAGE_HEADER +
+                                  block->frame), 0U);
+  /* Catenate the deleted chain segment to the page free list */
+  alignas(4) byte page_header[4];
+  byte *page_free= my_assume_aligned<4>(PAGE_HEADER + PAGE_FREE +
+                                        block->frame);
+  const uint16_t free= page_header_get_field(block->frame, PAGE_FREE);
+  static_assert(PAGE_FREE + 2 == PAGE_GARBAGE, "compatibility");
+
+  mach_write_to_2(page_header, page_offset(rec));
+  mach_write_to_2(my_assume_aligned<2>(page_header + 2),
+                  mach_read_from_2(my_assume_aligned<2>(page_free + 2)) +
+                  size);
+  mtr->memcpy(*block, page_free, page_header, 4);
+
+  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                          block->frame);
+  mtr->write<2>(*block, page_n_recs,
+                ulint{mach_read_from_2(page_n_recs)} - n_recs);
+
+  /* Update the page directory; there is no need to balance the number
+  of the records owned by the supremum record, as it is allowed to be
+  less than PAGE_DIR_SLOT_MIN_N_OWNED */
+  page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, slot_index);
+
+  if (page_is_comp(block->frame))
+  {
+    mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_NEW_SUPREMUM);
+    byte *owned= PAGE_NEW_SUPREMUM - REC_NEW_N_OWNED + block->frame;
+    byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
+                                      n_owned << REC_N_OWNED_SHIFT);
+#if 0 // FIXME: implement minimal logging for ROW_FORMAT=COMPRESSED
+    if (UNIV_LIKELY_NULL(block->page.zip.data))
+    {
+      *owned= new_owned;
+      memcpy_aligned<2>(PAGE_N_DIR_SLOTS + PAGE_HEADER + block->page.zip.data,
+                        PAGE_N_DIR_SLOTS + PAGE_HEADER + block->frame,
+			PAGE_N_RECS + 2 - PAGE_N_DIR_SLOTS);
+      // TODO: the equivalent of page_zip_dir_delete() for all records
+      mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+		      (PAGE_NEW_SUPREMUM - page_offset(prev_rec)));
+      mach_write_to_2(last_rec - REC_NEXT, free
+                    ? static_cast<uint16_t>(free - page_offset(last_rec))
+                    : 0U);
+      return;
+    }
+#endif
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
+    mtr->write<2>(*block, prev_rec - REC_NEXT, static_cast<uint16_t>
+                  (PAGE_NEW_SUPREMUM - page_offset(prev_rec)));
+    mtr->write<2>(*block, last_rec - REC_NEXT, free
+                  ? static_cast<uint16_t>(free - page_offset(last_rec))
+                  : 0U);
+  }
+  else
+  {
+    mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_OLD_SUPREMUM);
+    byte *owned= PAGE_OLD_SUPREMUM - REC_OLD_N_OWNED + block->frame;
+    byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
+                                      n_owned << REC_N_OWNED_SHIFT);
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
+    mtr->write<2>(*block, prev_rec - REC_NEXT, PAGE_OLD_SUPREMUM);
+    mtr->write<2>(*block, last_rec - REC_NEXT, free);
+  }
 }
 
 /*************************************************************//**
@@ -1228,22 +1095,9 @@ page_delete_rec_list_start(
 		return;
 	}
 
-	mlog_id_t	type;
-
-	if (page_rec_is_comp(rec)) {
-		type = MLOG_COMP_LIST_START_DELETE;
-	} else {
-		type = MLOG_LIST_START_DELETE;
-	}
-
-	page_delete_rec_list_write_log(rec, index, type, mtr);
-
 	page_cur_set_before_first(block, &cur1);
 	page_cur_move_to_next(&cur1);
 
-	/* Individual deletes are not logged */
-
-	mtr_log_t	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 	const ulint	n_core = page_rec_is_leaf(rec)
 		? index->n_core_fields : 0;
 
@@ -1257,10 +1111,6 @@ page_delete_rec_list_start(
 	if (UNIV_LIKELY_NULL(heap)) {
 		mem_heap_free(heap);
 	}
-
-	/* Restore log mode */
-
-	mtr_set_log_mode(mtr, log_mode);
 }
 
 /*************************************************************//**
@@ -1354,212 +1204,6 @@ page_move_rec_list_start(
 	return(TRUE);
 }
 
-/**************************************************************//**
-Used to delete n slots from the directory. This function updates
-also n_owned fields in the records, so that the first slot after
-the deleted ones inherits the records of the deleted slots. */
-UNIV_INLINE
-void
-page_dir_delete_slot(
-/*=================*/
-	page_t*		page,	/*!< in/out: the index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		slot_no)/*!< in: slot to be deleted */
-{
-	page_dir_slot_t*	slot;
-	ulint			n_owned;
-	ulint			i;
-	ulint			n_slots;
-
-	ut_ad(!page_zip || page_is_comp(page));
-	ut_ad(slot_no > 0);
-	ut_ad(slot_no + 1 < page_dir_get_n_slots(page));
-
-	n_slots = page_dir_get_n_slots(page);
-
-	/* 1. Reset the n_owned fields of the slots to be
-	deleted */
-	slot = page_dir_get_nth_slot(page, slot_no);
-	n_owned = page_dir_slot_get_n_owned(slot);
-	page_dir_slot_set_n_owned(slot, page_zip, 0);
-
-	/* 2. Update the n_owned value of the first non-deleted slot */
-
-	slot = page_dir_get_nth_slot(page, slot_no + 1);
-	page_dir_slot_set_n_owned(slot, page_zip,
-				  n_owned + page_dir_slot_get_n_owned(slot));
-
-	/* 3. Destroy the slot by copying slots */
-	for (i = slot_no + 1; i < n_slots; i++) {
-		rec_t*	rec = (rec_t*)
-			page_dir_slot_get_rec(page_dir_get_nth_slot(page, i));
-		page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec);
-	}
-
-	/* 4. Zero out the last slot, which will be removed */
-	mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0);
-
-	/* 5. Update the page header */
-	page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1);
-}
-
-/**************************************************************//**
-Used to add n slots to the directory. Does not set the record pointers
-in the added slots or update n_owned values: this is the responsibility
-of the caller. */
-UNIV_INLINE
-void
-page_dir_add_slot(
-/*==============*/
-	page_t*		page,	/*!< in/out: the index page */
-	page_zip_des_t*	page_zip,/*!< in/out: comprssed page, or NULL */
-	ulint		start)	/*!< in: the slot above which the new slots
-				are added */
-{
-	page_dir_slot_t*	slot;
-	ulint			n_slots;
-
-	n_slots = page_dir_get_n_slots(page);
-
-	ut_ad(start < n_slots - 1);
-
-	/* Update the page header */
-	page_dir_set_n_slots(page, page_zip, n_slots + 1);
-
-	/* Move slots up */
-	slot = page_dir_get_nth_slot(page, n_slots);
-	memmove(slot, slot + PAGE_DIR_SLOT_SIZE,
-		(n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE);
-}
-
-/****************************************************************//**
-Splits a directory slot which owns too many records. */
-void
-page_dir_split_slot(
-/*================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
-				uncompressed part will be written, or NULL */
-	ulint		slot_no)/*!< in: the directory slot */
-{
-	rec_t*			rec;
-	page_dir_slot_t*	new_slot;
-	page_dir_slot_t*	prev_slot;
-	page_dir_slot_t*	slot;
-	ulint			i;
-	ulint			n_owned;
-
-	ut_ad(!page_zip || page_is_comp(page));
-	ut_ad(slot_no > 0);
-
-	slot = page_dir_get_nth_slot(page, slot_no);
-
-	n_owned = page_dir_slot_get_n_owned(slot);
-	ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);
-
-	/* 1. We loop to find a record approximately in the middle of the
-	records owned by the slot. */
-
-	prev_slot = page_dir_get_nth_slot(page, slot_no - 1);
-	rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
-
-	for (i = 0; i < n_owned / 2; i++) {
-		rec = page_rec_get_next(rec);
-	}
-
-	ut_ad(n_owned / 2 >= PAGE_DIR_SLOT_MIN_N_OWNED);
-
-	/* 2. We add one directory slot immediately below the slot to be
-	split. */
-
-	page_dir_add_slot(page, page_zip, slot_no - 1);
-
-	/* The added slot is now number slot_no, and the old slot is
-	now number slot_no + 1 */
-
-	new_slot = page_dir_get_nth_slot(page, slot_no);
-	slot = page_dir_get_nth_slot(page, slot_no + 1);
-
-	/* 3. We store the appropriate values to the new slot. */
-
-	page_dir_slot_set_rec(new_slot, rec);
-	page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2);
-
-	/* 4. Finally, we update the number of records field of the
-	original slot */
-
-	page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2));
-}
-
-/*************************************************************//**
-Tries to balance the given directory slot with too few records with the upper
-neighbor, so that there are at least the minimum number of records owned by
-the slot; this may result in the merging of two slots. */
-void
-page_dir_balance_slot(
-/*==================*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		slot_no)/*!< in: the directory slot */
-{
-	page_dir_slot_t*	slot;
-	page_dir_slot_t*	up_slot;
-	ulint			n_owned;
-	ulint			up_n_owned;
-	rec_t*			old_rec;
-	rec_t*			new_rec;
-
-	ut_ad(!page_zip || page_is_comp(page));
-	ut_ad(slot_no > 0);
-
-	slot = page_dir_get_nth_slot(page, slot_no);
-
-	/* The last directory slot cannot be balanced with the upper
-	neighbor, as there is none. */
-
-	if (UNIV_UNLIKELY(slot_no + 1 == page_dir_get_n_slots(page))) {
-
-		return;
-	}
-
-	up_slot = page_dir_get_nth_slot(page, slot_no + 1);
-
-	n_owned = page_dir_slot_get_n_owned(slot);
-	up_n_owned = page_dir_slot_get_n_owned(up_slot);
-
-	ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
-
-	/* If the upper slot has the minimum value of n_owned, we will merge
-	the two slots, therefore we assert: */
-	ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED);
-
-	if (up_n_owned > PAGE_DIR_SLOT_MIN_N_OWNED) {
-
-		/* In this case we can just transfer one record owned
-		by the upper slot to the property of the lower slot */
-		old_rec = (rec_t*) page_dir_slot_get_rec(slot);
-
-		if (page_is_comp(page)) {
-			new_rec = rec_get_next_ptr(old_rec, TRUE);
-
-			rec_set_n_owned_new(old_rec, page_zip, 0);
-			rec_set_n_owned_new(new_rec, page_zip, n_owned + 1);
-		} else {
-			new_rec = rec_get_next_ptr(old_rec, FALSE);
-
-			rec_set_n_owned_old(old_rec, 0);
-			rec_set_n_owned_old(new_rec, n_owned + 1);
-		}
-
-		page_dir_slot_set_rec(slot, new_rec);
-
-		page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned -1);
-	} else {
-		/* In this case we may merge the two slots */
-		page_dir_delete_slot(page, page_zip, slot_no);
-	}
-}
-
 /************************************************************//**
 Returns the nth record of the record list.
 This is the inverse function of page_rec_get_n_recs_before().
@@ -2815,64 +2459,6 @@ page_find_rec_with_heap_no(
 	}
 }
 
-/*******************************************************//**
-Removes the record from a leaf page. This function does not log
-any changes. It is used by the IMPORT tablespace functions.
-The cursor is moved to the next record after the deleted one.
-@return true if success, i.e., the page did not become too empty */
-bool
-page_delete_rec(
-/*============*/
-	const dict_index_t*	index,	/*!< in: The index that the record
-					belongs to */
-	page_cur_t*		pcur,	/*!< in/out: page cursor on record
-					to delete */
-	page_zip_des_t*
-#ifdef UNIV_ZIP_DEBUG
-		page_zip/*!< in: compressed page descriptor */
-#endif
-	,
-	const rec_offs*		offsets)/*!< in: offsets for record */
-{
-	bool		no_compress_needed;
-	buf_block_t*	block = pcur->block;
-	page_t*		page = buf_block_get_frame(block);
-
-	ut_ad(page_is_leaf(page));
-
-	if (!rec_offs_any_extern(offsets)
-	    && ((page_get_data_size(page) - rec_offs_size(offsets)
-		< BTR_CUR_PAGE_COMPRESS_LIMIT(index))
-		|| !page_has_siblings(page)
-		|| (page_get_n_recs(page) < 2))) {
-
-		ulint	root_page_no = dict_index_get_page(index);
-
-		/* The page fillfactor will drop below a predefined
-		minimum value, OR the level in the B-tree contains just
-		one page, OR the page will become empty: we recommend
-		compression if this is not the root page. */
-
-		no_compress_needed = page_get_page_no(page) == root_page_no;
-	} else {
-		no_compress_needed = true;
-	}
-
-	if (no_compress_needed) {
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-
-		page_cur_delete_rec(pcur, index, offsets, 0);
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
-#endif /* UNIV_ZIP_DEBUG */
-	}
-
-	return(no_compress_needed);
-}
-
 /** Get the last non-delete-marked record on a page.
 @param[in]	page	index tree leaf page
 @return the last record, not delete-marked
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index 73d9b0d1fcd..86c3a4dff32 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -35,12 +35,6 @@ Created June 2005 by Marko Makela
 
 using st_::span;
 
-/** A BLOB field reference full of zero, for use in assertions and tests.
-Initially, BLOB field references are set to zero, in
-dtuple_convert_big_rec(). */
-alignas(UNIV_PAGE_SIZE_MIN)
-const byte field_ref_zero[UNIV_PAGE_SIZE_MAX] = { 0, };
-
 #ifndef UNIV_INNOCHECKSUM
 #include "mtr0log.h"
 #include "dict0dict.h"
@@ -65,10 +59,6 @@ page_zip_stat_per_index_t	page_zip_stat_per_index;
 /** Compression level to be used by zlib. Settable by user. */
 uint	page_zip_level;
 
-/** Whether or not to log compressed page images to avoid possible
-compression algorithm changes in zlib. */
-my_bool	page_zip_log_pages;
-
 /* Please refer to ../include/page0zip.ic for a description of the
 compressed page format. */
 
@@ -87,7 +77,7 @@ static const byte infimum_data[] = {
 	0x6d, 0x75, 0x6d, 0x00	/* "infimum\0" */
 };
 /** Extra bytes and data bytes of a supremum record */
-static const byte supremum_extra_data[] = {
+static const byte supremum_extra_data alignas(4) [] = {
 	/* 0x0?, */		/* info_bits=0, n_owned=1..8 */
 	0x00, 0x0b,		/* heap_no=1, status=3 */
 	0x00, 0x00,		/* next=0 */
@@ -96,16 +86,12 @@ static const byte supremum_extra_data[] = {
 };
 
 /** Assert that a block of memory is filled with zero bytes.
-Compare at most sizeof(field_ref_zero) bytes.
 @param b in: memory block
 @param s in: size of the memory block, in bytes */
-#define ASSERT_ZERO(b, s)			\
-	ut_ad(!memcmp(b, field_ref_zero,	\
-		      std::min<size_t>(s, sizeof field_ref_zero)));
+#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s))
 /** Assert that a BLOB pointer is filled with zero bytes.
 @param b in: BLOB pointer */
-#define ASSERT_ZERO_BLOB(b) \
-	ut_ad(!memcmp(b, field_ref_zero, FIELD_REF_SIZE))
+#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE)
 
 /* Enable some extra debugging output.  This code can be enabled
 independently of any UNIV_ debugging conditions. */
@@ -368,66 +354,96 @@ page_zip_dir_get(
 				- PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
 }
 
-/**********************************************************************//**
-Write a log record of compressing an index page. */
-static
-void
-page_zip_compress_write_log(
-/*========================*/
-	const page_zip_des_t*	page_zip,/*!< in: compressed page */
-	const page_t*		page,	/*!< in: uncompressed page */
-	dict_index_t*		index,	/*!< in: index of the B-tree node */
-	mtr_t*			mtr)	/*!< in: mini-transaction */
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in]      b       ROW_FORMAT=COMPRESSED index page
+@param[in]      offset  byte offset from b.zip.data
+@param[in]      len     length of the data to write */
+inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len)
 {
-	byte*	log_ptr;
-	ulint	trailer_size;
-
-	ut_ad(!dict_index_is_ibuf(index));
+  ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX ||
+        fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE);
+  ut_ad(page_zip_simple_validate(&b.page.zip));
+  ut_ad(offset + len <= page_zip_get_size(&b.page.zip));
 
-	log_ptr = mlog_open(mtr, 11 + 2 + 2);
-
-	if (!log_ptr) {
+  memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len);
+  m_last_offset= static_cast<uint16_t>(offset + len);
+}
 
-		return;
-	}
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in]      b       ROW_FORMAT=COMPRESSED index page
+@param[in]      dest    destination within b.zip.data
+@param[in]      str     the data to write
+@param[in]      len     length of the data to write
+@tparam w       write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
+                           ulint len)
+{
+  byte *d= static_cast<byte*>(dest);
+  const byte *s= static_cast<const byte*>(str);
+  ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET);
+  if (w != FORCED)
+  {
+    ut_ad(len);
+    const byte *const end= d + len;
+    while (*d++ == *s++)
+    {
+      if (d == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return;
+      }
+    }
+    s--;
+    d--;
+    len= static_cast<ulint>(end - d);
+  }
+  ::memcpy(d, s, len);
+  zmemcpy(b, d - b.page.zip.data, len);
+}
 
-	/* Read the number of user records. */
-	trailer_size = ulint(page_dir_get_n_heap(page_zip->data))
-		- PAGE_HEAP_NO_USER_LOW;
-	/* Multiply by uncompressed of size stored per record */
-	if (!page_is_leaf(page)) {
-		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
-	} else if (dict_index_is_clust(index)) {
-		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE
-			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
-	} else {
-		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE;
-	}
-	/* Add the space occupied by BLOB pointers. */
-	trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
-	ut_a(page_zip->m_end > PAGE_DATA);
-	compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
-	ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
-
-	log_ptr = mlog_write_initial_log_record_fast((page_t*) page,
-						     MLOG_ZIP_PAGE_COMPRESS,
-						     log_ptr, mtr);
-	mach_write_to_2(log_ptr, ulint(page_zip->m_end - FIL_PAGE_TYPE));
-	log_ptr += 2;
-	mach_write_to_2(log_ptr, trailer_size);
-	log_ptr += 2;
-	mlog_close(mtr, log_ptr);
-
-	/* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */
-	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4);
-	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4);
-	/* Write most of the page header, the compressed stream and
-	the modification log. */
-	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE,
-			     ulint(page_zip->m_end - FIL_PAGE_TYPE));
-	/* Write the uncompressed trailer of the compressed page. */
-	mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip)
-			     - trailer_size, trailer_size);
+/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
+@param[in,out]	block	ROW_FORMAT=COMPRESSED index page
+@param[in]	index	the index that the block belongs to
+@param[in,out]	mtr	mini-transaction */
+static void page_zip_compress_write_log(buf_block_t *block,
+                                        dict_index_t *index, mtr_t *mtr)
+{
+  ut_ad(!index->is_ibuf());
+
+  if (mtr->get_log_mode() != MTR_LOG_ALL)
+  {
+    ut_ad(mtr->get_log_mode() == MTR_LOG_NONE ||
+          mtr->get_log_mode() == MTR_LOG_NO_REDO);
+    return;
+  }
+
+  const page_t *page= block->frame;
+  const page_zip_des_t *page_zip= &block->page.zip;
+  /* Read the number of user records. */
+  ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
+    PAGE_HEAP_NO_USER_LOW;
+  /* Multiply by the uncompressed size stored per record */
+  if (!page_is_leaf(page))
+    trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+  else if (index->is_clust())
+    trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN +
+      DATA_ROLL_PTR_LEN;
+  else
+    trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE;
+  /* Add the space occupied by BLOB pointers. */
+  trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+  ut_a(page_zip->m_end > PAGE_DATA);
+  compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
+  ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+  mtr->init(block);
+  mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV);
+
+  if (trailer_size)
+    mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
+                 trailer_size);
+  block->page.status = buf_page_t::INIT_ON_FLUSH; /* because of mtr_t::init() */
 }
 
 /******************************************************//**
@@ -1228,22 +1244,15 @@ page_zip_compress_clust(
 func_exit:
 	return(err);}
 
-/**********************************************************************//**
-Compress a page.
-@return TRUE on success, FALSE on failure; page_zip will be left
-intact on failure. */
-ibool
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
 page_zip_compress(
-/*==============*/
-	page_zip_des_t*		page_zip,	/*!< in: size; out: data,
-						n_blobs, m_start, m_end,
-						m_nonempty */
-	const page_t*		page,		/*!< in: uncompressed page */
-	dict_index_t*		index,		/*!< in: index of the B-tree
-						node */
-	ulint			level,		/*!< in: commpression level */
-	mtr_t*			mtr)		/*!< in/out: mini-transaction,
-						or NULL */
+	buf_block_t*		block,	/*!< in/out: buffer block */
+	dict_index_t*		index,	/*!< in: index of the B-tree node */
+	ulint			level,	/*!< in: compression level */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
 {
 	z_stream		c_stream;
 	int			err;
@@ -1271,6 +1280,9 @@ page_zip_compress(
 	my_bool			cmp_per_index_enabled;
 	cmp_per_index_enabled	= srv_cmp_per_index_enabled;
 
+	page_t* page = block->frame;
+	page_zip_des_t* page_zip = &block->page.zip;
+
 	ut_a(page_is_comp(page));
 	ut_a(fil_page_index_page_check(page));
 	ut_ad(page_simple_validate_new((page_t*) page));
@@ -1371,7 +1383,7 @@ page_zip_compress(
 	page_zip_set_alloc(&c_stream, heap);
 
 	err = deflateInit2(&c_stream, static_cast<int>(level),
-			   Z_DEFLATED, srv_page_size_shift,
+			   Z_DEFLATED, static_cast<int>(srv_page_size_shift),
 			   MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
 	ut_a(err == Z_OK);
 
@@ -1481,7 +1493,7 @@ err_exit:
 			fclose(logfile);
 		}
 #endif /* PAGE_ZIP_COMPRESS_DBG */
-		if (page_is_leaf(page) && index) {
+		if (page_is_leaf(page)) {
 			dict_index_zip_failure(index);
 		}
 
@@ -1494,7 +1506,7 @@ err_exit:
 				+= time_diff;
 			mutex_exit(&page_zip_stat_per_index_mutex);
 		}
-		return(FALSE);
+		return false;
 	}
 
 	err = deflateEnd(&c_stream);
@@ -1517,27 +1529,26 @@ err_exit:
 #ifdef UNIV_DEBUG
 	page_zip->m_start =
 #endif /* UNIV_DEBUG */
-		page_zip->m_end = unsigned(PAGE_DATA + c_stream.total_out);
+		page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out);
 	page_zip->m_nonempty = FALSE;
-	page_zip->n_blobs = unsigned(n_blobs);
+	page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1);
 	/* Copy those header fields that will not be written
 	in buf_flush_init_for_writing() */
-	memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
-	       FIL_PAGE_LSN - FIL_PAGE_PREV);
-	memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2);
-	memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
-	       PAGE_DATA - FIL_PAGE_DATA);
+	memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+			  FIL_PAGE_LSN - FIL_PAGE_PREV);
+	memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+			  2);
+	memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+			  PAGE_DATA - FIL_PAGE_DATA);
 	/* Copy the rest of the compressed page */
-	memcpy(page_zip->data + PAGE_DATA, buf,
-	       page_zip_get_size(page_zip) - PAGE_DATA);
+	memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf,
+			  page_zip_get_size(page_zip) - PAGE_DATA);
 	mem_heap_free(heap);
 #ifdef UNIV_ZIP_DEBUG
 	ut_a(page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (mtr) {
-		page_zip_compress_write_log(page_zip, page, index, mtr);
-	}
+	page_zip_compress_write_log(block, index, mtr);
 
 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
 
@@ -1567,7 +1578,7 @@ err_exit:
 		dict_index_zip_success(index);
 	}
 
-	return(TRUE);
+	return true;
 }
 
 /**********************************************************************//**
@@ -1634,7 +1645,7 @@ page_zip_fields_decode(
 	table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0,
 				      DICT_TF_COMPACT, 0);
 	index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
-	index->n_uniq = unsigned(n);
+	index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
 	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
 	index->cached = TRUE;
 
@@ -1693,14 +1704,15 @@ fail:
 		if (UNIV_UNLIKELY(index->n_nullable > val)) {
 			goto fail;
 		} else {
-			index->n_nullable = unsigned(val);
+			index->n_nullable = static_cast<unsigned>(val)
+				& dict_index_t::MAX_N_FIELDS;
 		}
 	}
 
 	/* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */
 	index->n_core_fields = index->n_fields;
-	index->n_core_null_bytes
-		= UT_BITS_IN_BYTES(unsigned(index->n_nullable));
+	index->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
 
 	ut_ad(b == end);
 
@@ -2358,7 +2370,7 @@ zlib_done:
 	}
 
 #ifdef UNIV_DEBUG
-	page_zip->m_start = unsigned(PAGE_DATA + d_stream->total_in);
+	page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in);
 #endif /* UNIV_DEBUG */
 
 	/* Apply the modification log. */
@@ -2373,7 +2385,7 @@ zlib_done:
 		if (UNIV_UNLIKELY(!mod_log_ptr)) {
 			return(FALSE);
 		}
-		page_zip->m_end = unsigned(mod_log_ptr - page_zip->data);
+		page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
 		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
 	}
 
@@ -2512,7 +2524,7 @@ zlib_done:
 			     - d_stream->next_out));
 	}
 
-	ut_d(page_zip->m_start = unsigned(PAGE_DATA + d_stream->total_in));
+	ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
 
 	/* Apply the modification log. */
 	{
@@ -2527,7 +2539,7 @@ zlib_done:
 		if (UNIV_UNLIKELY(!mod_log_ptr)) {
 			return(FALSE);
 		}
-		page_zip->m_end = unsigned(mod_log_ptr - page_zip->data);
+		page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
 		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
 	}
 
@@ -2843,7 +2855,7 @@ zlib_done:
 			     - d_stream->next_out));
 	}
 
-	ut_d(page_zip->m_start = unsigned(PAGE_DATA + d_stream->total_in));
+	ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
 
 	/* Apply the modification log. */
 	{
@@ -2858,7 +2870,7 @@ zlib_done:
 		if (UNIV_UNLIKELY(!mod_log_ptr)) {
 			return(FALSE);
 		}
-		page_zip->m_end = unsigned(mod_log_ptr - page_zip->data);
+		page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
 		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
 	}
 
@@ -3001,7 +3013,7 @@ page_zip_decompress_low(
 
 	if (all) {
 		/* Copy the page header. */
-		memcpy(page, page_zip->data, PAGE_DATA);
+		memcpy_aligned<2>(page, page_zip->data, PAGE_DATA);
 	} else {
 		/* Check that the bytes that we skip are identical. */
 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
@@ -3014,9 +3026,10 @@ page_zip_decompress_low(
 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
 
 		/* Copy the mutable parts of the page header. */
-		memcpy(page, page_zip->data, FIL_PAGE_TYPE);
-		memcpy(PAGE_HEADER + page, PAGE_HEADER + page_zip->data,
-		       PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+		memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE);
+		memcpy_aligned<2>(PAGE_HEADER + page,
+				  PAGE_HEADER + page_zip->data,
+				  PAGE_LEVEL - PAGE_N_DIR_SLOTS);
 
 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
 		/* Check that the page headers match after copying. */
@@ -3050,8 +3063,9 @@ zlib_error:
 				      & PAGE_ZIP_DIR_SLOT_MASK);
 	}
 	memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
-	memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
-	       supremum_extra_data, sizeof supremum_extra_data);
+	memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1
+			  + page, supremum_extra_data,
+			  sizeof supremum_extra_data);
 
 	page_zip_set_alloc(&d_stream, heap);
 
@@ -3063,7 +3077,7 @@ zlib_error:
 	d_stream.next_out = page + PAGE_ZIP_START;
 	d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START);
 
-	if (UNIV_UNLIKELY(inflateInit2(&d_stream, srv_page_size_shift)
+	if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift))
 			  != Z_OK)) {
 		ut_error;
 	}
@@ -3262,8 +3276,6 @@ page_zip_validate_low(
 					TRUE=ignore the MIN_REC_FLAG */
 {
 	page_zip_des_t	temp_page_zip;
-	byte*		temp_page_buf;
-	page_t*		temp_page;
 	ibool		valid;
 
 	if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
@@ -3298,9 +3310,8 @@ page_zip_validate_low(
 
 	/* page_zip_decompress() expects the uncompressed page to be
 	srv_page_size aligned. */
-	temp_page_buf = static_cast<byte*>(
-		ut_malloc_nokey(2 << srv_page_size_shift));
-	temp_page = static_cast<byte*>(ut_align(temp_page_buf, srv_page_size));
+	page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size,
+							      srv_page_size));
 
 	MEM_CHECK_DEFINED(page, srv_page_size);
 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
@@ -3457,7 +3468,7 @@ func_exit:
 		page_zip_hexdump(page, srv_page_size);
 		page_zip_hexdump(temp_page, srv_page_size);
 	}
-	ut_free(temp_page_buf);
+	aligned_free(temp_page);
 	return(valid);
 }
 
@@ -3506,22 +3517,24 @@ static
 byte*
 page_zip_write_rec_ext(
 /*===================*/
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
-	const page_t*	page,		/*!< in: page containing rec */
+	buf_block_t*	block,		/*!< in/out: compressed page */
 	const byte*	rec,		/*!< in: record being written */
-	dict_index_t*	index,		/*!< in: record descriptor */
+	const dict_index_t*index,	/*!< in: record descriptor */
 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
 	ulint		create,		/*!< in: nonzero=insert, zero=update */
 	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
 	ulint		heap_no,	/*!< in: heap number of rec */
 	byte*		storage,	/*!< in: end of dense page directory */
-	byte*		data)		/*!< in: end of modification log */
+	byte*		data,		/*!< in: end of modification log */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	const byte*	start	= rec;
 	ulint		i;
 	ulint		len;
 	byte*		externs	= storage;
 	ulint		n_ext	= rec_offs_n_extern(offsets);
+	const page_t* const page = block->frame;
+	page_zip_des_t* const page_zip = &block->page.zip;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
@@ -3535,25 +3548,28 @@ page_zip_write_rec_ext(
 	the BLOB columns of rec if create==TRUE. */
 	ut_ad(data + rec_offs_data_size(offsets)
 	      - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
-	      - n_ext * BTR_EXTERN_FIELD_REF_SIZE
-	      < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs);
+	      - n_ext * FIELD_REF_SIZE
+	      < externs - FIELD_REF_SIZE * page_zip->n_blobs);
 
-	{
+	if (n_ext) {
 		ulint	blob_no = page_zip_get_n_prev_extern(
 			page_zip, rec, index);
-		byte*	ext_end = externs - page_zip->n_blobs
-			* BTR_EXTERN_FIELD_REF_SIZE;
+		byte*	ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE;
 		ut_ad(blob_no <= page_zip->n_blobs);
-		externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+		externs -= blob_no * FIELD_REF_SIZE;
 
 		if (create) {
-			page_zip->n_blobs += static_cast<unsigned>(n_ext);
-			ASSERT_ZERO_BLOB(ext_end - n_ext
-					 * BTR_EXTERN_FIELD_REF_SIZE);
-			memmove(ext_end - n_ext
-				* BTR_EXTERN_FIELD_REF_SIZE,
-				ext_end,
-				ulint(externs - ext_end));
+			page_zip->n_blobs = (page_zip->n_blobs + n_ext)
+				& ((1U << 12) - 1);
+			ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE);
+			if (ulint len = ulint(externs - ext_end)) {
+				byte* ext_start = ext_end
+					- n_ext * FIELD_REF_SIZE;
+				memmove(ext_start, ext_end, len);
+				mtr->memmove(*block,
+					     ext_start - page_zip->data,
+					     ext_end - page_zip->data, len);
+			}
 		}
 
 		ut_a(blob_no + n_ext <= page_zip->n_blobs);
@@ -3585,28 +3601,31 @@ page_zip_write_rec_ext(
 				       + DATA_ROLL_PTR_LEN);
 
 			/* Store trx_id and roll_ptr. */
-			memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
-			       * (heap_no - 1),
-			       src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+			constexpr ulint sys_len = DATA_TRX_ID_LEN
+				+ DATA_ROLL_PTR_LEN;
+			byte* sys = storage - sys_len * (heap_no - 1);
+			memcpy(sys, src, sys_len);
 			i++; /* skip also roll_ptr */
+			mtr->zmemcpy(*block, sys - page_zip->data, sys_len);
 		} else if (rec_offs_nth_extern(offsets, i)) {
 			src = rec_get_nth_field(rec, offsets,
 						i, &len);
 
 			ut_ad(dict_index_is_clust(index));
-			ut_ad(len
-			      >= BTR_EXTERN_FIELD_REF_SIZE);
-			src += len - BTR_EXTERN_FIELD_REF_SIZE;
+			ut_ad(len >= FIELD_REF_SIZE);
+			src += len - FIELD_REF_SIZE;
 
 			ASSERT_ZERO(data, src - start);
 			memcpy(data, start, ulint(src - start));
 			data += src - start;
-			start = src + BTR_EXTERN_FIELD_REF_SIZE;
+			start = src + FIELD_REF_SIZE;
 
 			/* Store the BLOB pointer. */
-			externs -= BTR_EXTERN_FIELD_REF_SIZE;
+			externs -= FIELD_REF_SIZE;
 			ut_ad(data < externs);
-			memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE);
+			memcpy(externs, src, FIELD_REF_SIZE);
+			mtr->zmemcpy(*block, externs - page_zip->data,
+				     FIELD_REF_SIZE);
 		}
 	}
 
@@ -3620,19 +3639,20 @@ page_zip_write_rec_ext(
 	return(data);
 }
 
-/**********************************************************************//**
-Write an entire record on the compressed page.  The data must already
-have been written to the uncompressed page. */
-void
-page_zip_write_rec(
-/*===============*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	rec,	/*!< in: record being written */
-	dict_index_t*	index,	/*!< in: the index the record belongs to */
-	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	ulint		create)	/*!< in: nonzero=insert, zero=update */
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in]	rec		record in the uncompressed page
+@param[in]	index		the index that the page belongs to
+@param[in]	offsets		rec_get_offsets(rec, index)
+@param[in]	create		nonzero=insert, zero=update
+@param[in,out]	mtr		mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+                        const dict_index_t *index, const rec_offs *offsets,
+                        ulint create, mtr_t *mtr)
 {
-	const page_t*	page;
+	const page_t* const page = block->frame;
+	page_zip_des_t* const page_zip = &block->page.zip;
 	byte*		data;
 	byte*		storage;
 	ulint		heap_no;
@@ -3646,8 +3666,6 @@ page_zip_write_rec(
 
 	ut_ad(page_zip->m_start >= PAGE_DATA);
 
-	page = page_align(rec);
-
 	ut_ad(page_zip_header_cmp(page_zip, page));
 	ut_ad(page_simple_validate_new((page_t*) page));
 
@@ -3658,6 +3676,7 @@ page_zip_write_rec(
 
 	slot = page_zip_dir_find(page_zip, page_offset(rec));
 	ut_a(slot);
+	byte s = *slot;
 	/* Copy the delete mark. */
 	if (rec_get_deleted_flag(rec, TRUE)) {
 		/* In delete-marked records, DB_TRX_ID must
@@ -3665,9 +3684,14 @@ page_zip_write_rec(
 		On non-leaf pages, the delete-mark flag is garbage. */
 		ut_ad(!index->is_primary() || !page_is_leaf(page)
 		      || row_get_rec_trx_id(rec, index, offsets));
-		*slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+		s |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
 	} else {
-		*slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+		s &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+	}
+
+	if (s != *slot) {
+		*slot = s;
+		mtr->zmemcpy(*block, slot - page_zip->data, 1);
 	}
 
 	ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
@@ -3712,19 +3736,18 @@ page_zip_write_rec(
 	storage = page_zip_dir_start(page_zip);
 
 	if (page_is_leaf(page)) {
-		ulint		len;
-
 		if (dict_index_is_clust(index)) {
 			/* Store separately trx_id, roll_ptr and
 			the BTR_EXTERN_FIELD_REF of each BLOB column. */
 			if (rec_offs_any_extern(offsets)) {
 				data = page_zip_write_rec_ext(
-					page_zip, page,
+					block,
 					rec, index, offsets, create,
 					index->db_trx_id(), heap_no,
-					storage, data);
+					storage, data, mtr);
 			} else {
 				/* Locate trx_id and roll_ptr. */
+				ulint len;
 				const byte*	src
 					= rec_get_nth_field(rec, offsets,
 							    index->db_trx_id(),
@@ -3742,14 +3765,14 @@ page_zip_write_rec(
 				data += src - rec;
 
 				/* Store trx_id and roll_ptr. */
-				memcpy(storage
-				       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
-				       * (heap_no - 1),
-				       src,
-				       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-
-				src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
-
+				constexpr ulint sys_len
+					= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+				byte* sys = storage - sys_len * (heap_no - 1);
+				memcpy(sys, src, sys_len);
+
+				src += sys_len;
+				mtr->zmemcpy(*block, sys - page_zip->data,
+					     sys_len);
 				/* Log the last bytes of the record. */
 				len = rec_offs_data_size(offsets)
 					- ulint(src - rec);
@@ -3764,7 +3787,7 @@ page_zip_write_rec(
 			ut_ad(!rec_offs_any_extern(offsets));
 
 			/* Log the entire record. */
-			len = rec_offs_data_size(offsets);
+			ulint len = rec_offs_data_size(offsets);
 
 			ASSERT_ZERO(data, len);
 			memcpy(data, rec, len);
@@ -3772,14 +3795,12 @@ page_zip_write_rec(
 		}
 	} else {
 		/* This is a node pointer page. */
-		ulint	len;
-
 		/* Non-leaf nodes should not have any externally
 		stored columns. */
 		ut_ad(!rec_offs_any_extern(offsets));
 
 		/* Copy the data bytes, except node_ptr. */
-		len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+		ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
 		ut_ad(data + len < storage - REC_NODE_PTR_SIZE
 		      * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
 		ASSERT_ZERO(data, len);
@@ -3787,15 +3808,16 @@ page_zip_write_rec(
 		data += len;
 
 		/* Copy the node pointer to the uncompressed area. */
-		memcpy(storage - REC_NODE_PTR_SIZE
-		       * (heap_no - 1),
-		       rec + len,
-		       REC_NODE_PTR_SIZE);
+		byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
+		mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr,
+					       rec + len, REC_NODE_PTR_SIZE);
 	}
 
 	ut_a(!*data);
 	ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
-	page_zip->m_end = unsigned(data - page_zip->data);
+	mtr->zmemcpy(*block, page_zip->m_end,
+		     data - page_zip->data - page_zip->m_end);
+	page_zip->m_end = uint16_t(data - page_zip->data);
 	page_zip->m_nonempty = TRUE;
 
 #ifdef UNIV_ZIP_DEBUG
@@ -3803,89 +3825,28 @@ page_zip_write_rec(
 #endif /* UNIV_ZIP_DEBUG */
 }
 
-/***********************************************************//**
-Parses a log record of writing a BLOB pointer of a record.
-@return end of log record or NULL */
-byte*
-page_zip_parse_write_blob_ptr(
-/*==========================*/
-	byte*		ptr,	/*!< in: redo log buffer */
-	byte*		end_ptr,/*!< in: redo log buffer end */
-	page_t*		page,	/*!< in/out: uncompressed page */
-	page_zip_des_t*	page_zip)/*!< in/out: compressed page */
-{
-	ulint	offset;
-	ulint	z_offset;
-
-	ut_ad(ptr != NULL);
-	ut_ad(end_ptr != NULL);
-	ut_ad(!page == !page_zip);
-
-	if (UNIV_UNLIKELY
-	    (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) {
-
-		return(NULL);
-	}
-
-	offset = mach_read_from_2(ptr);
-	z_offset = mach_read_from_2(ptr + 2);
-
-	if (offset < PAGE_ZIP_START
-	    || offset >= srv_page_size
-	    || z_offset >= srv_page_size) {
-corrupt:
-		recv_sys.found_corrupt_log = TRUE;
-
-		return(NULL);
-	}
-
-	if (page) {
-
-		if (!page_zip || !page_is_leaf(page)) {
-
-			goto corrupt;
-		}
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-
-		memcpy(page + offset,
-		       ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
-		memcpy(page_zip->data + z_offset,
-		       ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-	}
-
-	return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE));
-}
-
 /**********************************************************************//**
 Write a BLOB pointer of a record on the leaf page of a clustered index.
 The information must already have been updated on the uncompressed page. */
 void
 page_zip_write_blob_ptr(
 /*====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
 	const byte*	rec,	/*!< in/out: record whose data is being
 				written */
 	dict_index_t*	index,	/*!< in: index of the page */
 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	ulint		n,	/*!< in: column index */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle,
-				or NULL if no logging is needed */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	const byte*	field;
 	byte*		externs;
-	const page_t*	page	= page_align(rec);
+	const page_t* const page = block->frame;
+	page_zip_des_t* const page_zip = &block->page.zip;
 	ulint		blob_no;
 	ulint		len;
 
-	ut_ad(page_zip != NULL);
-	ut_ad(rec != NULL);
+	ut_ad(page_align(rec) == page);
 	ut_ad(index != NULL);
 	ut_ad(offsets != NULL);
 	ut_ad(page_simple_validate_new((page_t*) page));
@@ -3921,104 +3882,12 @@ page_zip_write_blob_ptr(
 	externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
 	field += len - BTR_EXTERN_FIELD_REF_SIZE;
 
-	memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE);
+	mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field,
+				       BTR_EXTERN_FIELD_REF_SIZE);
 
 #ifdef UNIV_ZIP_DEBUG
 	ut_a(page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
-
-	if (mtr) {
-		byte*	log_ptr	= mlog_open(
-			mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE);
-		if (UNIV_UNLIKELY(!log_ptr)) {
-			return;
-		}
-
-		log_ptr = mlog_write_initial_log_record_fast(
-			(byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr);
-		mach_write_to_2(log_ptr, page_offset(field));
-		log_ptr += 2;
-		mach_write_to_2(log_ptr, ulint(externs - page_zip->data));
-		log_ptr += 2;
-		memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE);
-		log_ptr += BTR_EXTERN_FIELD_REF_SIZE;
-		mlog_close(mtr, log_ptr);
-	}
-}
-
-/***********************************************************//**
-Parses a log record of writing the node pointer of a record.
-@return end of log record or NULL */
-byte*
-page_zip_parse_write_node_ptr(
-/*==========================*/
-	byte*		ptr,	/*!< in: redo log buffer */
-	byte*		end_ptr,/*!< in: redo log buffer end */
-	page_t*		page,	/*!< in/out: uncompressed page */
-	page_zip_des_t*	page_zip)/*!< in/out: compressed page */
-{
-	ulint	offset;
-	ulint	z_offset;
-
-	ut_ad(ptr != NULL);
-	ut_ad(end_ptr!= NULL);
-	ut_ad(!page == !page_zip);
-
-	if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) {
-
-		return(NULL);
-	}
-
-	offset = mach_read_from_2(ptr);
-	z_offset = mach_read_from_2(ptr + 2);
-
-	if (offset < PAGE_ZIP_START
-	    || offset >= srv_page_size
-	    || z_offset >= srv_page_size) {
-corrupt:
-		recv_sys.found_corrupt_log = TRUE;
-
-		return(NULL);
-	}
-
-	if (page) {
-		byte*	storage_end;
-		byte*	field;
-		byte*	storage;
-		ulint	heap_no;
-
-		if (!page_zip || page_is_leaf(page)) {
-
-			goto corrupt;
-		}
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-
-		field = page + offset;
-		storage = page_zip->data + z_offset;
-
-		storage_end = page_zip_dir_start(page_zip);
-
-		heap_no = 1 + ulint(storage_end - storage) / REC_NODE_PTR_SIZE;
-
-		if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE)
-		    || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW)
-		    || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) {
-
-			goto corrupt;
-		}
-
-		memcpy(field, ptr + 4, REC_NODE_PTR_SIZE);
-		memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE);
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-	}
-
-	return(ptr + (2 + 2 + REC_NODE_PTR_SIZE));
 }
 
 /**********************************************************************//**
@@ -4026,18 +3895,17 @@ Write the node pointer of a record on a non-leaf compressed page. */
 void
 page_zip_write_node_ptr(
 /*====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: compressed page */
 	byte*		rec,	/*!< in/out: record */
 	ulint		size,	/*!< in: data size of rec */
 	ulint		ptr,	/*!< in: node pointer */
-	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	byte*	field;
 	byte*	storage;
-#ifdef UNIV_DEBUG
-	page_t*	page	= page_align(rec);
-#endif /* UNIV_DEBUG */
+	page_zip_des_t* const page_zip = &block->page.zip;
 
+	ut_d(const page_t* const page = block->frame);
 	ut_ad(page_simple_validate_new(page));
 	ut_ad(page_zip_simple_validate(page_zip));
 	ut_ad(page_zip_get_size(page_zip)
@@ -4061,38 +3929,20 @@ page_zip_write_node_ptr(
 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
 	compile_time_assert(REC_NODE_PTR_SIZE == 4);
 	mach_write_to_4(field, ptr);
-	memcpy(storage, field, REC_NODE_PTR_SIZE);
-
-	if (mtr) {
-		byte*	log_ptr	= mlog_open(mtr,
-					    11 + 2 + 2 + REC_NODE_PTR_SIZE);
-		if (UNIV_UNLIKELY(!log_ptr)) {
-			return;
-		}
-
-		log_ptr = mlog_write_initial_log_record_fast(
-			field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr);
-		mach_write_to_2(log_ptr, page_offset(field));
-		log_ptr += 2;
-		mach_write_to_2(log_ptr, ulint(storage - page_zip->data));
-		log_ptr += 2;
-		memcpy(log_ptr, field, REC_NODE_PTR_SIZE);
-		log_ptr += REC_NODE_PTR_SIZE;
-		mlog_close(mtr, log_ptr);
-	}
+	mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE);
 }
 
 /** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
-@param[in,out]	page_zip	compressed page
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
 @param[in,out]	rec		record
 @param[in]	offsets		rec_get_offsets(rec, index)
 @param[in]	trx_id_field	field number of DB_TRX_ID (number of PK fields)
 @param[in]	trx_id		DB_TRX_ID value (transaction identifier)
 @param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
-@param[in,out]	mtr		mini-transaction, or NULL to skip logging */
+@param[in,out]	mtr		mini-transaction */
 void
 page_zip_write_trx_id_and_roll_ptr(
-	page_zip_des_t*	page_zip,
+	buf_block_t*	block,
 	byte*		rec,
 	const rec_offs*	offsets,
 	ulint		trx_id_col,
@@ -4100,13 +3950,10 @@ page_zip_write_trx_id_and_roll_ptr(
 	roll_ptr_t	roll_ptr,
 	mtr_t*		mtr)
 {
-	byte*	field;
-	byte*	storage;
-#ifdef UNIV_DEBUG
-	page_t*	page	= page_align(rec);
-#endif /* UNIV_DEBUG */
-	ulint	len;
+	page_zip_des_t* const page_zip = &block->page.zip;
 
+	ut_d(const page_t* const page = block->frame);
+	ut_ad(page_align(rec) == page);
 	ut_ad(page_simple_validate_new(page));
 	ut_ad(page_zip_simple_validate(page_zip));
 	ut_ad(page_zip_get_size(page_zip)
@@ -4121,106 +3968,71 @@ page_zip_write_trx_id_and_roll_ptr(
 
 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
 
-	storage = page_zip_dir_start(page_zip)
-		- (rec_get_heap_no_new(rec) - 1)
-		* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+	constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	const ulint heap_no = rec_get_heap_no_new(rec);
+	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+	byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;
 
 	compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
-	field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+	ulint len;
+	byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
 	ut_ad(len == DATA_TRX_ID_LEN);
 	ut_ad(field + DATA_TRX_ID_LEN
 	      == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
 	ut_ad(len == DATA_ROLL_PTR_LEN);
 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
-	ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+	ut_a(!memcmp(storage, field, sys_len));
 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
 	compile_time_assert(DATA_TRX_ID_LEN == 6);
 	mach_write_to_6(field, trx_id);
 	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
 	mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
-	memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+	len = 0;
+	if (heap_no > PAGE_HEAP_NO_USER_LOW) {
+		byte* prev = storage + sys_len;
+		for (; len < sys_len && prev[len] == field[len]; len++);
+		if (len > 4) {
+			/* We save space by replacing a single record
+
+			WRITE,offset(storage),byte[13]
+
+			with up to two records:
+
+			MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
+			WRITE|0x80,0,byte[13-len]
+
+			The single WRITE record would be x+13 bytes long (x>2).
+			The MEMMOVE record would be x+1+1 = x+2 bytes, and
+			the second WRITE would be 1+1+13-len = 15-len bytes.
+
+			The total size is: x+13 versus x+2+15-len = x+17-len.
+			To save space, we must have len>4. */
+			memcpy(storage, prev, len);
+			mtr->memmove(*block, ulint(storage - page_zip->data),
+				     ulint(storage - page_zip->data) + sys_len,
+				     len);
+			storage += len;
+			field += len;
+			if (UNIV_LIKELY(len < sys_len)) {
+				goto write;
+			}
+		} else {
+			len = 0;
+			goto write;
+		}
+	} else {
+write:
+                mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field,
+					       sys_len - len);
+	}
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+	ut_a(!memcmp(storage - len, field - len, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
 
 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
 			  rec_offs_extra_size(offsets));
 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
-
-	if (mtr) {
-		byte*	log_ptr	= mlog_open(
-			mtr, 11 + 2 + 2 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-		if (UNIV_UNLIKELY(!log_ptr)) {
-			return;
-		}
-
-		log_ptr = mlog_write_initial_log_record_fast(
-			(byte*) field, MLOG_ZIP_WRITE_TRX_ID, log_ptr, mtr);
-		mach_write_to_2(log_ptr, page_offset(field));
-		log_ptr += 2;
-		mach_write_to_2(log_ptr, ulint(storage - page_zip->data));
-		log_ptr += 2;
-		memcpy(log_ptr, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-		log_ptr += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
-		mlog_close(mtr, log_ptr);
-	}
-}
-
-/** Parse a MLOG_ZIP_WRITE_TRX_ID record.
-@param[in]	ptr		redo log buffer
-@param[in]	end_ptr		end of redo log buffer
-@param[in,out]	page		uncompressed page
-@param[in,out]	page_zip	compressed page
-@return end of log record
-@retval	NULL	if the log record is incomplete */
-byte*
-page_zip_parse_write_trx_id(
-	byte*		ptr,
-	byte*		end_ptr,
-	page_t*		page,
-	page_zip_des_t*	page_zip)
-{
-	byte* const end = 2 + 2 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + ptr;
-
-	if (UNIV_UNLIKELY(end_ptr < end)) {
-		return(NULL);
-	}
-
-	uint offset = mach_read_from_2(ptr);
-	uint z_offset = mach_read_from_2(ptr + 2);
-
-	if (offset < PAGE_ZIP_START
-	    || offset >= srv_page_size
-	    || z_offset >= srv_page_size) {
-corrupt:
-		recv_sys.found_corrupt_log = TRUE;
-
-		return(NULL);
-	}
-
-	if (page) {
-		if (!page_zip || !page_is_leaf(page)) {
-			goto corrupt;
-		}
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-
-		byte* field = page + offset;
-		byte* storage = page_zip->data + z_offset;
-
-		if (storage >= page_zip_dir_start(page_zip)) {
-			goto corrupt;
-		}
-
-		memcpy(field, ptr + 4, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-		memcpy(storage, ptr + 4, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-	}
-
-	return end;
 }
 
 /**********************************************************************//**
@@ -4230,22 +4042,26 @@ static
 void
 page_zip_clear_rec(
 /*===============*/
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	buf_block_t*	block,		/*!< in/out: compressed page */
 	byte*		rec,		/*!< in: record to clear */
 	const dict_index_t*	index,	/*!< in: index of rec */
-	const rec_offs*	offsets)	/*!< in: rec_get_offsets(rec, index) */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ulint	heap_no;
-	page_t*	page	= page_align(rec);
 	byte*	storage;
 	byte*	field;
 	ulint	len;
+
+	ut_ad(page_align(rec) == block->frame);
+	page_zip_des_t* const page_zip = &block->page.zip;
+
 	/* page_zip_validate() would fail here if a record
 	containing externally stored columns is being deleted. */
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
 	ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
-	ut_ad(page_zip_header_cmp(page_zip, page));
+	ut_ad(page_zip_header_cmp(page_zip, block->frame));
 
 	heap_no = rec_get_heap_no_new(rec);
 	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
@@ -4255,7 +4071,7 @@ page_zip_clear_rec(
 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
 			  rec_offs_extra_size(offsets));
 
-	if (!page_is_leaf(page)) {
+	if (!page_is_leaf(block->frame)) {
 		/* Clear node_ptr. On the compressed page,
 		there is an array of node_ptr immediately before the
 		dense page directory, at the very end of the page. */
@@ -4266,12 +4082,14 @@ page_zip_clear_rec(
 					    rec_offs_n_fields(offsets) - 1,
 					    &len);
 		ut_ad(len == REC_NODE_PTR_SIZE);
-
 		ut_ad(!rec_offs_any_extern(offsets));
 		memset(field, 0, REC_NODE_PTR_SIZE);
-		memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE,
-		       0, REC_NODE_PTR_SIZE);
-	} else if (dict_index_is_clust(index)) {
+		storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
+		len = REC_NODE_PTR_SIZE;
+clear_page_zip:
+		memset(storage, 0, len);
+		mtr->memset(*block, storage - page_zip->data, len, 0);
+	} else if (index->is_clust()) {
 		/* Clear trx_id and roll_ptr. On the compressed page,
 		there is an array of these fields immediately before the
 		dense page directory, at the very end of the page. */
@@ -4279,14 +4097,9 @@ page_zip_clear_rec(
 			= dict_col_get_clust_pos(
 			dict_table_get_sys_col(
 				index->table, DATA_TRX_ID), index);
-		storage	= page_zip_dir_start(page_zip);
 		field	= rec_get_nth_field(rec, offsets, trx_id_pos, &len);
 		ut_ad(len == DATA_TRX_ID_LEN);
-
 		memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-		memset(storage - (heap_no - 1)
-		       * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
-		       0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
 
 		if (rec_offs_any_extern(offsets)) {
 			ulint	i;
@@ -4305,31 +4118,35 @@ page_zip_clear_rec(
 				}
 			}
 		}
+
+		len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+		storage = page_zip_dir_start(page_zip)
+			- (heap_no - 1)
+			* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+		goto clear_page_zip;
 	} else {
 		ut_ad(!rec_offs_any_extern(offsets));
 	}
 }
 
-/**********************************************************************//**
-Write the "deleted" flag of a record on a compressed page.  The flag must
-already have been written on the uncompressed page. */
-void
-page_zip_rec_set_deleted(
-/*=====================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	rec,	/*!< in: record on the uncompressed page */
-	ulint		flag)	/*!< in: the deleted flag (nonzero=TRUE) */
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out]  block   buffer block
+@param[in,out]  rec     record on a physical index page
+@param[in]      flag    the value of the delete-mark flag
+@param[in,out]  mtr     mini-transaction  */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+                              mtr_t *mtr)
 {
-	byte*	slot = page_zip_dir_find(page_zip, page_offset(rec));
-	ut_a(slot);
-	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
-	if (flag) {
-		*slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
-	} else {
-		*slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
-	}
+  ut_ad(page_align(rec) == block->frame);
+  byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
+  byte b= *slot;
+  if (flag)
+    b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+  else
+    b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+  mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(page_zip_validate(page_zip, page_align(rec), NULL));
+  ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
 #endif /* UNIV_ZIP_DEBUG */
 }
 
@@ -4339,18 +4156,21 @@ must already have been written on the uncompressed page. */
 void
 page_zip_rec_set_owned(
 /*===================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
 	const byte*	rec,	/*!< in: record on the uncompressed page */
-	ulint		flag)	/*!< in: the owned flag (nonzero=TRUE) */
+	ulint		flag,	/*!< in: the owned flag (nonzero=TRUE) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	byte*	slot = page_zip_dir_find(page_zip, page_offset(rec));
-	ut_a(slot);
-	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
-	if (flag) {
-		*slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
-	} else {
-		*slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8);
-	}
+  ut_ad(page_align(rec) == block->frame);
+  page_zip_des_t *const page_zip= &block->page.zip;
+  byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
+  MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+  byte b= *slot;
+  if (flag)
+    b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+  else
+    b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8));
+  mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
 }
 
 /**********************************************************************//**
@@ -4358,23 +4178,27 @@ Insert a record to the dense page directory. */
 void
 page_zip_dir_insert(
 /*================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	const byte*	prev_rec,/*!< in: record after which to insert */
-	const byte*	free_rec,/*!< in: record from which rec was
-				allocated, or NULL */
-	byte*		rec)	/*!< in: record to insert */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	uint16_t	free_rec,/*!< in: record from which rec was
+				allocated, or 0 */
+	byte*		rec,	/*!< in: record to insert */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
+	ut_ad(page_align(cursor->rec) == cursor->block->frame);
+	ut_ad(page_align(rec) == cursor->block->frame);
+	page_zip_des_t *const page_zip= &cursor->block->page.zip;
+
 	ulint	n_dense;
 	byte*	slot_rec;
 	byte*	slot_free;
 
-	ut_ad(prev_rec != rec);
-	ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec);
+	ut_ad(cursor->rec != rec);
+	ut_ad(page_rec_get_next_const(cursor->rec) == rec);
 	ut_ad(page_zip_simple_validate(page_zip));
 
 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
 
-	if (page_rec_is_infimum(prev_rec)) {
+	if (page_rec_is_infimum(cursor->rec)) {
 		/* Use the first slot. */
 		slot_rec = page_zip->data + page_zip_get_size(page_zip);
 	} else {
@@ -4390,7 +4214,7 @@ page_zip_dir_insert(
 		}
 
 		slot_rec = page_zip_dir_find_low(start, end,
-						 page_offset(prev_rec));
+						 page_offset(cursor->rec));
 		ut_a(slot_rec);
 	}
 
@@ -4398,7 +4222,7 @@ page_zip_dir_insert(
 	n_dense = page_dir_get_n_heap(page_zip->data)
 		- (PAGE_HEAP_NO_USER_LOW + 1U);
 
-	if (UNIV_LIKELY_NULL(free_rec)) {
+	if (UNIV_UNLIKELY(free_rec)) {
 		/* The record was allocated from the free list.
 		Shift the dense directory only up to that slot.
 		Note that in this case, n_dense is actually
@@ -4406,8 +4230,8 @@ page_zip_dir_insert(
 		did not increment n_heap. */
 		ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
 		      + PAGE_HEAP_NO_USER_LOW);
-		ut_ad(rec >= free_rec);
-		slot_free = page_zip_dir_find(page_zip, page_offset(free_rec));
+		ut_ad(page_offset(rec) >= free_rec);
+		slot_free = page_zip_dir_find(page_zip, free_rec);
 		ut_ad(slot_free);
 		slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
 	} else {
@@ -4421,305 +4245,194 @@ page_zip_dir_insert(
 			- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
 	}
 
-	/* Shift the dense directory to allocate place for rec. */
-	memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
-		ulint(slot_rec - slot_free));
-
-	/* Write the entry for the inserted record.
-	The "owned" and "deleted" flags must be zero. */
-	mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
-}
-
-/**********************************************************************//**
-Shift the dense page directory and the array of BLOB pointers
-when a record is deleted. */
-void
-page_zip_dir_delete(
-/*================*/
-	page_zip_des_t*		page_zip,	/*!< in/out: compressed page */
-	byte*			rec,		/*!< in: deleted record */
-	const dict_index_t*	index,		/*!< in: index of rec */
-	const rec_offs*		offsets,	/*!< in: rec_get_offsets(rec) */
-	const byte*		free)		/*!< in: previous start of
-						the free list */
-{
-	byte*	slot_rec;
-	byte*	slot_free;
-	ulint	n_ext;
-	page_t*	page	= page_align(rec);
-
-	ut_ad(rec_offs_validate(rec, index, offsets));
-	ut_ad(rec_offs_comp(offsets));
-
-	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
-	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
-	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
-			  rec_offs_extra_size(offsets));
-
-	slot_rec = page_zip_dir_find(page_zip, page_offset(rec));
-
-	ut_a(slot_rec);
-	uint16_t n_recs = page_get_n_recs(page);
-	ut_ad(n_recs);
-	ut_ad(n_recs > 1 || page_get_page_no(page) == index->page);
-	/* This could not be done before page_zip_dir_find(). */
-	page_header_set_field(page, page_zip, PAGE_N_RECS,
-			      n_recs - 1);
-
-	if (UNIV_UNLIKELY(!free)) {
-		/* Make the last slot the start of the free list. */
-		slot_free = page_zip->data + page_zip_get_size(page_zip)
-			- PAGE_ZIP_DIR_SLOT_SIZE
-			* (page_dir_get_n_heap(page_zip->data)
-			   - PAGE_HEAP_NO_USER_LOW);
-	} else {
-		slot_free = page_zip_dir_find_free(page_zip,
-						   page_offset(free));
-		ut_a(slot_free < slot_rec);
-		/* Grow the free list by one slot by moving the start. */
-		slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
-	}
-
-	if (UNIV_LIKELY(slot_rec > slot_free)) {
-		memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE,
-			slot_free,
-			ulint(slot_rec - slot_free));
-	}
-
-	/* Write the entry for the deleted record.
-	The "owned" and "deleted" flags will be cleared. */
-	mach_write_to_2(slot_free, page_offset(rec));
-
-	if (!page_is_leaf(page) || !dict_index_is_clust(index)) {
-		ut_ad(!rec_offs_any_extern(offsets));
-		goto skip_blobs;
-	}
-
-	n_ext = rec_offs_n_extern(offsets);
-	if (UNIV_UNLIKELY(n_ext != 0)) {
-		/* Shift and zero fill the array of BLOB pointers. */
-		ulint	blob_no;
-		byte*	externs;
-		byte*	ext_end;
-
-		blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
-		ut_a(blob_no + n_ext <= page_zip->n_blobs);
-
-		externs = page_zip->data + page_zip_get_size(page_zip)
-			- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
-			* PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
-
-		ext_end = externs - page_zip->n_blobs
-			* BTR_EXTERN_FIELD_REF_SIZE;
-		externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
-
-		page_zip->n_blobs -= static_cast<unsigned>(n_ext);
-		/* Shift and zero fill the array. */
-		memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end,
-			ulint(page_zip->n_blobs - blob_no)
-			* BTR_EXTERN_FIELD_REF_SIZE);
-		memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE);
+	if (const ulint slot_len = ulint(slot_rec - slot_free)) {
+		/* Shift the dense directory to allocate place for rec. */
+		memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
+				   slot_free, slot_len);
+		mtr->memmove(*cursor->block, (slot_free - page_zip->data)
+			     - PAGE_ZIP_DIR_SLOT_SIZE,
+			     slot_free - page_zip->data, slot_len);
 	}
 
-skip_blobs:
-	/* The compression algorithm expects info_bits and n_owned
-	to be 0 for deleted records. */
-	rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
-
-	page_zip_clear_rec(page_zip, rec, index, offsets);
-}
-
-/**********************************************************************//**
-Add a slot to the dense page directory. */
-void
-page_zip_dir_add_slot(
-/*==================*/
-	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
-	ulint		is_clustered)	/*!< in: nonzero for clustered index,
-					zero for others */
-{
-	ulint	n_dense;
-	byte*	dir;
-	byte*	stored;
-
-	ut_ad(page_is_comp(page_zip->data));
-	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
-
-	/* Read the old n_dense (n_heap has already been incremented). */
-	n_dense = page_dir_get_n_heap(page_zip->data)
-		- (PAGE_HEAP_NO_USER_LOW + 1U);
-
-	dir = page_zip->data + page_zip_get_size(page_zip)
-		- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
-
-	if (!page_is_leaf(page_zip->data)) {
-		ut_ad(!page_zip->n_blobs);
-		stored = dir - n_dense * REC_NODE_PTR_SIZE;
-	} else if (is_clustered) {
-		/* Move the BLOB pointer array backwards to make space for the
-		roll_ptr and trx_id columns and the dense directory slot. */
-		byte*	externs;
-
-		stored = dir - n_dense
-			* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
-		externs = stored
-			- page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
-		ASSERT_ZERO(externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE,
-			               PAGE_ZIP_CLUST_LEAF_SLOT_SIZE);
-		memmove(externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE,
-			externs, ulint(stored - externs));
-	} else {
-		stored = dir
-			- page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
-		ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE,
-			    static_cast<size_t>(PAGE_ZIP_DIR_SLOT_SIZE));
+	/* Write the entry for the inserted record.
+	The "owned" flag must be zero. */
+	uint16_t offs = page_offset(rec);
+	if (rec_get_deleted_flag(rec, true)) {
+		offs |= PAGE_ZIP_DIR_SLOT_DEL;
 	}
 
-	/* Move the uncompressed area backwards to make space
-	for one directory slot. */
-	memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, ulint(dir - stored));
+	mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, offs);
+	mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data
+		     - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
 }
 
-/***********************************************************//**
-Parses a log record of writing to the header of a page.
-@return end of log record or NULL */
-byte*
-page_zip_parse_write_header(
-/*========================*/
-	byte*		ptr,	/*!< in: redo log buffer */
-	byte*		end_ptr,/*!< in: redo log buffer end */
-	page_t*		page,	/*!< in/out: uncompressed page */
-	page_zip_des_t*	page_zip)/*!< in/out: compressed page */
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out]  block   index page
+@param[in,out]  rec     record being deleted
+@param[in]      index   the index that the page belongs to
+@param[in]      offsets rec_get_offsets(rec, index)
+@param[in]      free    previous start of the free list, or nullptr if empty
+@param[in,out]  mtr     mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+                         const dict_index_t *index, const rec_offs *offsets,
+                         const byte *free, mtr_t *mtr)
 {
-	ulint	offset;
-	ulint	len;
-
-	ut_ad(ptr != NULL);
-	ut_ad(end_ptr!= NULL);
-	ut_ad(!page == !page_zip);
-
-	if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) {
-
-		return(NULL);
-	}
-
-	offset = (ulint) *ptr++;
-	len = (ulint) *ptr++;
-
-	if (len == 0 || offset + len >= PAGE_DATA) {
-corrupt:
-		recv_sys.found_corrupt_log = TRUE;
-
-		return(NULL);
-	}
-
-	if (end_ptr < ptr + len) {
-
-		return(NULL);
-	}
-
-	if (page) {
-		if (!page_zip) {
-
-			goto corrupt;
-		}
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-
-		memcpy(page + offset, ptr, len);
-		memcpy(page_zip->data + offset, ptr, len);
-
-#ifdef UNIV_ZIP_DEBUG
-		ut_a(page_zip_validate(page_zip, page, NULL));
-#endif /* UNIV_ZIP_DEBUG */
-	}
-
-	return(ptr + len);
-}
-
-/**********************************************************************//**
-Write a log record of writing to the uncompressed header portion of a page. */
-void
-page_zip_write_header_log(
-/*======================*/
-	const byte*	data,	/*!< in: data on the uncompressed page */
-	ulint		length,	/*!< in: length of the data */
-	mtr_t*		mtr)	/*!< in: mini-transaction */
-{
-	byte*	log_ptr	= mlog_open(mtr, 11 + 1 + 1);
-	ulint	offset	= page_offset(data);
-
-	ut_ad(offset < PAGE_DATA);
-	ut_ad(offset + length < PAGE_DATA);
-	compile_time_assert(PAGE_DATA < 256U);
-	ut_ad(length > 0);
-	ut_ad(length < 256);
-
-	/* If no logging is requested, we may return now */
-	if (UNIV_UNLIKELY(!log_ptr)) {
-
-		return;
-	}
-
-	log_ptr = mlog_write_initial_log_record_fast(
-		(byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr);
-	*log_ptr++ = (byte) offset;
-	*log_ptr++ = (byte) length;
-	mlog_close(mtr, log_ptr);
-
-	mlog_catenate_string(mtr, data, length);
+  ut_ad(page_align(rec) == block->frame);
+  page_zip_des_t *const page_zip= &block->page.zip;
+
+  ut_ad(rec_offs_validate(rec, index, offsets));
+  ut_ad(rec_offs_comp(offsets));
+
+  MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+  MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+  MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+		    rec_offs_extra_size(offsets));
+
+  mach_write_to_2(rec - REC_NEXT,
+                  free ? static_cast<uint16_t>(free - rec) : 0);
+  byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+                                        block->frame);
+  mtr->write<2>(*block, page_free, page_offset(rec));
+  byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+                                      block->frame);
+  mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
+                mach_read_from_2(garbage));
+  compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
+  memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
+  byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
+  ut_a(slot_rec);
+  uint16_t n_recs= page_get_n_recs(block->frame);
+  ut_ad(n_recs);
+  ut_ad(n_recs > 1 || page_get_page_no(block->frame) == index->page);
+  /* This could not be done before page_zip_dir_find(). */
+  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+                                          block->frame);
+  mtr->write<2>(*block, page_n_recs, n_recs - 1U);
+  memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
+                    2);
+
+  byte *slot_free;
+
+  if (UNIV_UNLIKELY(!free))
+    /* Make the last slot the start of the free list. */
+    slot_free= page_zip->data + page_zip_get_size(page_zip) -
+      PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) -
+                                PAGE_HEAP_NO_USER_LOW);
+  else
+  {
+    slot_free= page_zip_dir_find_free(page_zip, page_offset(free));
+    ut_a(slot_free < slot_rec);
+    /* Grow the free list by one slot by moving the start. */
+    slot_free+= PAGE_ZIP_DIR_SLOT_SIZE;
+  }
+
+  const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
+  if (slot_len)
+  {
+    memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+                       slot_len);
+    mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
+                 slot_free - page_zip->data, slot_len);
+  }
+
+  /* Write the entry for the deleted record.
+  The "owned" and "deleted" flags will be cleared. */
+  mach_write_to_2(slot_free, page_offset(rec));
+  mtr->zmemcpy(*block, slot_free - page_zip->data, 2);
+
+  if (const ulint n_ext= rec_offs_n_extern(offsets))
+  {
+    ut_ad(index->is_primary());
+    ut_ad(page_is_leaf(block->frame));
+
+    /* Shift and zero fill the array of BLOB pointers. */
+    ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+    ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
+    byte *externs= page_zip->data + page_zip_get_size(page_zip) -
+      (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
+      PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+    byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
+
+    /* Shift and zero fill the array. */
+    if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
+        BTR_EXTERN_FIELD_REF_SIZE)
+    {
+      memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
+      mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
+                   ext_end - page_zip->data, ext_len);
+    }
+    memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
+    mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
+    page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1);
+  }
+
+  /* The compression algorithm expects info_bits and n_owned
+  to be 0 for deleted records. */
+  rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */
+
+  page_zip_clear_rec(block, rec, index, offsets, mtr);
 }
 
 /**********************************************************************//**
 Reorganize and compress a page.  This is a low-level operation for
 compressed pages, to be used when page_zip_compress() fails.
-On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+On success, redo log will be written.
 The function btr_page_reorganize() should be preferred whenever possible.
 IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
 non-clustered index, the caller must update the insert buffer free
 bits in the same mini-transaction in such a way that the modification
 will be redo-logged.
-@return TRUE on success, FALSE on failure; page_zip will be left
-intact on failure, but page will be overwritten. */
-ibool
+@retval true on success
+@retval false on failure; the page frame is restored only if restore is set */
+bool
 page_zip_reorganize(
-/*================*/
 	buf_block_t*	block,	/*!< in/out: page with compressed page;
 				on the compressed page, in: size;
 				out: data, n_blobs,
 				m_start, m_end, m_nonempty */
 	dict_index_t*	index,	/*!< in: index of the B-tree node */
-	mtr_t*		mtr)	/*!< in: mini-transaction */
+	ulint		z_level,/*!< in: compression level */
+	mtr_t*		mtr,	/*!< in: mini-transaction */
+	bool		restore)/*!< whether to restore on failure */
 {
-	buf_pool_t*	buf_pool	= buf_pool_from_block(block);
-	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
 	page_t*		page		= buf_block_get_frame(block);
 	buf_block_t*	temp_block;
 	page_t*		temp_page;
 
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(block->page.zip.data);
 	ut_ad(page_is_comp(page));
 	ut_ad(!dict_index_is_ibuf(index));
 	ut_ad(!index->table->is_temporary());
 	/* Note that page_zip_validate(page_zip, page, index) may fail here. */
 	MEM_CHECK_DEFINED(page, srv_page_size);
-	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+	MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data,
+			  page_zip_get_size(buf_block_get_page_zip(block)));
 
 	/* Disable logging */
 	mtr_log_t	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 
-	temp_block = buf_block_alloc(buf_pool);
+	temp_block = buf_block_alloc();
 	btr_search_drop_page_hash_index(block);
 	temp_page = temp_block->frame;
 
 	/* Copy the old page to temporary space */
-	buf_frame_copy(temp_page, page);
+	memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_block->frame, block->frame,
+					   srv_page_size);
 
 	/* Recreate the page: note that global data on page (possible
 	segment headers, next page-field, etc.) is preserved intact */
 
-	page_create(block, mtr, TRUE, dict_index_is_spatial(index));
+	page_create(block, mtr, true);
+	if (index->is_spatial()) {
+		mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE);
+		memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE,
+				  page + FIL_PAGE_TYPE, 2);
+		memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8);
+		memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8);
+	}
 
 	/* Copy the records from the temporary space to the recreated page;
 	do not copy the lock bits yet */
@@ -4729,8 +4442,8 @@ page_zip_reorganize(
 					index, mtr);
 
 	/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
-	memcpy(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
-	       temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
+	memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+			  temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
 	/* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
 	ut_ad(dict_index_is_clust(index) || !page_is_leaf(temp_page)
 	      || page_get_max_trx_id(page) != 0);
@@ -4744,15 +4457,39 @@ page_zip_reorganize(
 	/* Restore logging. */
 	mtr_set_log_mode(mtr, log_mode);
 
-	if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) {
+	if (!page_zip_compress(block, index, z_level, mtr)) {
+		if (restore) {
+			/* Restore the old page and exit. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+			/* Check that the bytes that we skip are identical. */
+			ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+			ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+				     PAGE_HEADER + PAGE_N_RECS + temp_page,
+				     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+			ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
+				     srv_page_size - FIL_PAGE_DATA_END
+				     + temp_page,
+				     FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+			memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+			       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+			memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+			       srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+			ut_a(!memcmp(page, temp_page, srv_page_size));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+		}
+
 		buf_block_free(temp_block);
-		return(FALSE);
+		return false;
 	}
 
 	lock_move_reorganize_page(block, temp_block);
 
 	buf_block_free(temp_block);
-	return(TRUE);
+	return true;
 }
 
 /**********************************************************************//**
@@ -4762,18 +4499,17 @@ related to the storage of records.  Also copy PAGE_MAX_TRX_ID.
 NOTE: The caller must update the lock table and the adaptive hash index. */
 void
 page_zip_copy_recs(
-/*===============*/
-	page_zip_des_t*		page_zip,	/*!< out: copy of src_zip
-						(n_blobs, m_start, m_end,
-						m_nonempty, data[0..size-1]) */
-	page_t*			page,		/*!< out: copy of src */
+	buf_block_t*		block,		/*!< in/out: buffer block */
 	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
 	const page_t*		src,		/*!< in: page */
 	dict_index_t*		index,		/*!< in: index of the B-tree */
 	mtr_t*			mtr)		/*!< in: mini-transaction */
 {
-	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
-	ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX));
+	page_t* page = block->frame;
+	page_zip_des_t* page_zip = &block->page.zip;
+
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(!dict_index_is_ibuf(index));
 	ut_ad(!index->table->is_temporary());
 #ifdef UNIV_ZIP_DEBUG
@@ -4799,21 +4535,24 @@ page_zip_copy_recs(
 	PAGE_MAX_TRX_ID.  Skip the rest of the page header and
 	trailer.  On the compressed page, there is no trailer. */
 	compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END);
-	memcpy(PAGE_HEADER + page, PAGE_HEADER + src,
-	       PAGE_HEADER_PRIV_END);
-	memcpy(PAGE_DATA + page, PAGE_DATA + src,
-	       srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
-	memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data,
-	       PAGE_HEADER_PRIV_END);
-	memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data,
-	       page_zip_get_size(page_zip) - PAGE_DATA);
+	memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src,
+			  PAGE_HEADER_PRIV_END);
+	memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src,
+			  srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END));
+	memcpy_aligned<2>(PAGE_HEADER + page_zip->data,
+			  PAGE_HEADER + src_zip->data,
+			  PAGE_HEADER_PRIV_END);
+	memcpy_aligned<2>(PAGE_DATA + page_zip->data,
+			  PAGE_DATA + src_zip->data,
+			  page_zip_get_size(page_zip) - PAGE_DATA);
 
 	if (dict_index_is_clust(index)) {
 		/* Reset the PAGE_ROOT_AUTO_INC field when copying
 		from a root page. */
-		memset(PAGE_HEADER + PAGE_ROOT_AUTO_INC + page, 0, 8);
-		memset(PAGE_HEADER + PAGE_ROOT_AUTO_INC + page_zip->data,
-		       0, 8);
+		memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+				  + page, 0, 8);
+		memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+				  + page_zip->data, 0, 8);
 	} else {
 		/* The PAGE_MAX_TRX_ID must be nonzero on leaf pages
 		of secondary indexes, and 0 on others. */
@@ -4840,79 +4579,15 @@ page_zip_copy_recs(
 			rec_t*	rec = page + offs;
 			ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
 			     & REC_INFO_MIN_REC_FLAG);
-			rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG;
+			rec[-REC_N_NEW_EXTRA_BYTES]
+				&= byte(~REC_INFO_MIN_REC_FLAG);
 		}
 	}
 
 #ifdef UNIV_ZIP_DEBUG
 	ut_a(page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
-	page_zip_compress_write_log(page_zip, page, index, mtr);
-}
-
-/** Parse and optionally apply MLOG_ZIP_PAGE_COMPRESS.
-@param[in]	ptr	log record
-@param[in]	end_ptr	end of log
-@param[in,out]	block	ROW_FORMAT=COMPRESSED block, or NULL for parsing only
-@return	end of log record
-@retval	NULL	if the log record is incomplete */
-byte* page_zip_parse_compress(const byte* ptr, const byte* end_ptr,
-			      buf_block_t* block)
-{
-	ulint	size;
-	ulint	trailer_size;
-
-	ut_ad(ptr != NULL);
-	ut_ad(end_ptr!= NULL);
-
-	if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) {
-
-		return(NULL);
-	}
-
-	size = mach_read_from_2(ptr);
-	ptr += 2;
-	trailer_size = mach_read_from_2(ptr);
-	ptr += 2;
-
-	if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) {
-
-		return(NULL);
-	}
-
-	if (block) {
-		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-		page_zip_des_t* page_zip = buf_block_get_page_zip(block);
-		if (!page_zip || page_zip_get_size(page_zip) < size
-		    || block->page.id.page_no() < 3) {
-corrupt:
-			recv_sys.found_corrupt_log = TRUE;
-
-			return(NULL);
-		}
-
-		memset(page_zip->data, 0, page_zip_get_size(page_zip));
-		mach_write_to_4(FIL_PAGE_OFFSET
-				+ page_zip->data, block->page.id.page_no());
-		mach_write_to_4(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
-				+ page_zip->data, block->page.id.space());
-		memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4);
-		memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4);
-		memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size);
-		memset(page_zip->data + FIL_PAGE_TYPE + size, 0,
-		       page_zip_get_size(page_zip) - trailer_size
-		       - (FIL_PAGE_TYPE + size));
-		memcpy(page_zip->data + page_zip_get_size(page_zip)
-		       - trailer_size, ptr + 8 + size, trailer_size);
-
-		if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, block->frame,
-						       TRUE))) {
-
-			goto corrupt;
-		}
-	}
-
-	return(const_cast<byte*>(ptr) + 8 + size + trailer_size);
+	page_zip_compress_write_log(block, index, mtr);
 }
 #endif /* !UNIV_INNOCHECKSUM */
 
diff --git a/storage/innobase/pars/lexyy.cc b/storage/innobase/pars/lexyy.cc
index 1e93ec3ed50..e57a28ce7f1 100644
--- a/storage/innobase/pars/lexyy.cc
+++ b/storage/innobase/pars/lexyy.cc
@@ -260,7 +260,7 @@ static YY_BUFFER_STATE * yy_buffer_stack = NULL; /**< Stack as an array. */
  */
 #define YY_CURRENT_BUFFER ( (yy_buffer_stack) \
                           ? (yy_buffer_stack)[(yy_buffer_stack_top)] \
-                          : NULL)
+                          : 0)
 /* Same as previous macro, but useful when we know that the buffer stack is not
  * NULL or when we need an lvalue. For internal use only.
  */
@@ -816,7 +816,6 @@ Created 12/14/1997 Heikki Tuuri
 #include "pars0grm.h"
 #include "pars0sym.h"
 #include "mem0mem.h"
-#include "os0proc.h"
 
 #define malloc(A)	ut_malloc_nokey(A)
 #define free(A)		ut_free(A)
@@ -1725,7 +1724,7 @@ YY_RULE_SETUP
 {
 			yylval = sym_tab_add_id(pars_sym_tab_global,
 							(byte*) yytext,
-							ut_strlen(yytext));
+							strlen(yytext));
 			return(PARS_ID_TOKEN);
 }
 	YY_BREAK
@@ -1735,7 +1734,7 @@ YY_RULE_SETUP
 {
 			yylval = sym_tab_add_id(pars_sym_tab_global,
 							(byte*) yytext,
-							ut_strlen(yytext));
+							strlen(yytext));
 			return(PARS_TABLE_NAME_TOKEN);
 }
 	YY_BREAK
diff --git a/storage/innobase/pars/pars0lex.l b/storage/innobase/pars/pars0lex.l
index 8b2df6b7940..1ddc5132da1 100644
--- a/storage/innobase/pars/pars0lex.l
+++ b/storage/innobase/pars/pars0lex.l
@@ -58,7 +58,6 @@ Created 12/14/1997 Heikki Tuuri
 #include "pars0grm.h"
 #include "pars0sym.h"
 #include "mem0mem.h"
-#include "os0proc.h"
 
 #define malloc(A)	ut_malloc_nokey(A)
 #define free(A)		ut_free(A)
@@ -476,14 +475,14 @@ In the state 'id', only two actions are possible (defined below). */
 {ID}		{
 			yylval = sym_tab_add_id(pars_sym_tab_global,
 							(byte*) yytext,
-							ut_strlen(yytext));
+							strlen(yytext));
 			return(PARS_ID_TOKEN);
 }
 
 {TABLE_NAME}	{
 			yylval = sym_tab_add_id(pars_sym_tab_global,
 							(byte*) yytext,
-							ut_strlen(yytext));
+							strlen(yytext));
 			return(PARS_TABLE_NAME_TOKEN);
 }
 
diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc
index 0cfd93d4120..e1a913b0179 100644
--- a/storage/innobase/pars/pars0opt.cc
+++ b/storage/innobase/pars/pars0opt.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -387,7 +388,7 @@ opt_calc_index_goodness(
 		}
 	}
 
-	if (goodness >= 4 * dict_index_get_n_unique(index)) {
+	if (goodness / 4 >= dict_index_get_n_unique(index)) {
 		goodness += 1024;
 
 		if (dict_index_is_clust(index)) {
@@ -584,8 +585,8 @@ opt_search_plan_for_table(
 			best_goodness = goodness;
 			n_fields = opt_calc_n_fields_from_goodness(goodness);
 
-			ut_memcpy(best_index_plan, index_plan,
-				  n_fields * sizeof(void*));
+			memcpy(best_index_plan, index_plan,
+			       n_fields * sizeof *index_plan);
 			best_last_op = last_op;
 		}
 
@@ -609,8 +610,8 @@ opt_search_plan_for_table(
 				pars_sym_tab_global->heap,
 				n_fields * sizeof(void*)));
 
-		ut_memcpy(plan->tuple_exps, best_index_plan,
-			  n_fields * sizeof(void*));
+		memcpy(plan->tuple_exps, best_index_plan,
+		       n_fields * sizeof *best_index_plan);
 		if (best_last_op == '='
 		    || best_last_op == PARS_LIKE_TOKEN_EXACT
                     || best_last_op == PARS_LIKE_TOKEN_PREFIX
diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc
index 9f1aae9aa07..2981e31c05f 100644
--- a/storage/innobase/pars/pars0pars.cc
+++ b/storage/innobase/pars/pars0pars.cc
@@ -597,9 +597,8 @@ pars_resolve_exp_variables_and_types(
 			|| (node->token_type == SYM_CURSOR)
 			|| (node->token_type == SYM_FUNCTION))
 		    && node->name
-		    && (sym_node->name_len == node->name_len)
-		    && (ut_memcmp(sym_node->name, node->name,
-				  node->name_len) == 0)) {
+		    && sym_node->name_len == node->name_len
+		    && !memcmp(sym_node->name, node->name, node->name_len)) {
 
 			/* Found a variable or a cursor declared with
 			the same name */
@@ -707,9 +706,9 @@ pars_resolve_exp_columns(
 			const char*		col_name
 				= dict_table_get_col_name(table, i);
 
-			if ((sym_node->name_len == ut_strlen(col_name))
-			    && (0 == ut_memcmp(sym_node->name, col_name,
-					       sym_node->name_len))) {
+			if (sym_node->name_len == strlen(col_name)
+			    && !memcmp(sym_node->name, col_name,
+				       sym_node->name_len)) {
 				/* Found */
 				sym_node->resolved = TRUE;
 				sym_node->token_type = SYM_COLUMN;
@@ -828,7 +827,7 @@ pars_select_all_columns(
 
 			col_node = sym_tab_add_id(pars_sym_tab_global,
 						  (byte*) col_name,
-						  ut_strlen(col_name));
+						  strlen(col_name));
 
 			select_node->select_list = que_node_list_add_last(
 				select_node->select_list, col_node);
@@ -1132,9 +1131,11 @@ pars_process_assign_list(
 
 		col_sym = assign_node->col;
 
-		upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos(
-						clust_index, col_sym->col_no,
-						NULL),
+		ulint field_no = dict_index_get_nth_col_pos(
+			clust_index, col_sym->col_no, NULL);
+		ut_ad(field_no < clust_index->n_fields);
+		upd_field_set_field_no(upd_field,
+				       static_cast<uint16_t>(field_no),
 				       clust_index);
 		upd_field->exp = assign_node->val;
 
@@ -1931,7 +1932,7 @@ pars_stored_procedure_call(
 
 /*************************************************************//**
 Retrieves characters to the lexical analyzer. */
-size_t
+int
 pars_get_lex_chars(
 /*===============*/
 	char*	buf,		/*!< in/out: buffer where to copy */
@@ -1953,7 +1954,7 @@ pars_get_lex_chars(
 
 	pars_sym_tab_global->next_char_pos += len;
 
-	return(len);
+	return static_cast<int>(len);
 }
 
 /*************************************************************//**
diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc
index e98d50ea0fc..121eda36d6d 100644
--- a/storage/innobase/que/que0que.cc
+++ b/storage/innobase/que/que0que.cc
@@ -96,17 +96,6 @@ When the execution of the graph completes, it is like returning
 from a subprocedure: the query thread which requested the operation
 starts running again. */
 
-/**********************************************************************//**
-Moves a thread from another state to the QUE_THR_RUNNING state. Increments
-the n_active_thrs counters of the query graph and transaction.
-***NOTE***: This is the only function in which such a transition is allowed
-to happen! */
-static
-void
-que_thr_move_to_run_state(
-/*======================*/
-	que_thr_t*	thr);	/*!< in: an query thread */
-
 /***********************************************************************//**
 Creates a query graph fork node.
 @return own: fork node */
@@ -166,8 +155,6 @@ que_thr_create(
 
 	thr->common.parent = parent;
 
-	thr->magic_n = QUE_THR_MAGIC_N;
-
 	thr->common.type = QUE_NODE_THR;
 
 	thr->state = QUE_THR_COMMAND_WAIT;
@@ -194,7 +181,6 @@ que_thr_end_lock_wait(
 				QUE_THR_LOCK_WAIT */
 {
 	que_thr_t*	thr;
-	ibool		was_active;
 
 	ut_ad(lock_mutex_own());
 	ut_ad(trx_mutex_own(trx));
@@ -207,9 +193,9 @@ que_thr_end_lock_wait(
 	/* In MySQL this is the only possible state here */
 	ut_a(thr->state == QUE_THR_LOCK_WAIT);
 
-	was_active = thr->is_active;
+	bool was_active = thr->is_active;
 
-	que_thr_move_to_run_state(thr);
+	thr->start_running();
 
 	trx->lock.que_state = TRX_QUE_RUNNING;
 
@@ -231,8 +217,7 @@ que_thr_init_command(
 {
 	thr->run_node = thr;
 	thr->prev_node = thr->common.parent;
-
-	que_thr_move_to_run_state(thr);
+	thr->start_running();
 }
 
 /**********************************************************************//**
@@ -354,12 +339,9 @@ que_fork_start_command(
 	}
 
 	if (suspended_thr) {
-
 		thr = suspended_thr;
-		que_thr_move_to_run_state(thr);
-
+		thr->start_running();
 	} else if (completed_thr) {
-
 		thr = completed_thr;
 		que_thr_init_command(thr);
 	} else {
@@ -428,15 +410,8 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_THR:
-
 		thr = static_cast<que_thr_t*>(node);
-
-		ut_a(thr->magic_n == QUE_THR_MAGIC_N);
-
-		thr->magic_n = QUE_THR_MAGIC_FREED;
-
 		que_graph_free_recursive(thr->child);
-
 		break;
 	case QUE_NODE_UNDO:
 
@@ -472,6 +447,7 @@ que_graph_free_recursive(
 
 		mem_heap_free(purge->heap);
 
+		purge->~purge_node_t();
 		break;
 
 	case QUE_NODE_UPDATE:
@@ -632,35 +608,6 @@ que_thr_node_step(
 }
 
 /**********************************************************************//**
-Moves a thread from another state to the QUE_THR_RUNNING state. Increments
-the n_active_thrs counters of the query graph and transaction if thr was
-not active.
-***NOTE***: This and ..._mysql are  the only functions in which such a
-transition is allowed to happen! */
-static
-void
-que_thr_move_to_run_state(
-/*======================*/
-	que_thr_t*	thr)	/*!< in: an query thread */
-{
-	ut_ad(thr->state != QUE_THR_RUNNING);
-
-	if (!thr->is_active) {
-		trx_t*	trx;
-
-		trx = thr_get_trx(thr);
-
-		thr->graph->n_active_thrs++;
-
-		trx->lock.n_active_thrs++;
-
-		thr->is_active = TRUE;
-	}
-
-	thr->state = QUE_THR_RUNNING;
-}
-
-/**********************************************************************//**
 Stops a query thread if graph or trx is in a state requiring it. The
 conditions are tested in the order (1) graph, (2) trx.
 @return TRUE if stopped */
@@ -723,7 +670,6 @@ que_thr_dec_refer_count(
 					a new query thread */
 {
 	trx_t*		trx;
-	que_fork_t*	fork;
 
 	trx = thr_get_trx(thr);
 
@@ -760,13 +706,8 @@ que_thr_dec_refer_count(
 		}
 	}
 
-	fork = static_cast<que_fork_t*>(thr->common.parent);
-
-	--trx->lock.n_active_thrs;
-
-	--fork->n_active_thrs;
-
-	thr->is_active = FALSE;
+	ut_d(static_cast<que_fork_t*>(thr->common.parent)->set_active(false));
+	thr->is_active = false;
 }
 
 /**********************************************************************//**
@@ -803,64 +744,31 @@ que_thr_stop_for_mysql(
 		}
 	}
 
-	ut_ad(thr->is_active == TRUE);
-	ut_ad(trx->lock.n_active_thrs == 1);
-	ut_ad(thr->graph->n_active_thrs == 1);
-
-	thr->is_active = FALSE;
-	thr->graph->n_active_thrs--;
-
-	trx->lock.n_active_thrs--;
+	ut_ad(thr->is_active);
+	ut_d(thr->set_active(false));
+	thr->is_active= false;
 
 	trx_mutex_exit(trx);
 }
 
-/**********************************************************************//**
-Moves a thread from another state to the QUE_THR_RUNNING state. Increments
-the n_active_thrs counters of the query graph and transaction if thr was
-not active. */
-void
-que_thr_move_to_run_state_for_mysql(
-/*================================*/
-	que_thr_t*	thr,	/*!< in: an query thread */
-	trx_t*		trx)	/*!< in: transaction */
+#ifdef UNIV_DEBUG
+/** Change the 'active' status */
+void que_fork_t::set_active(bool active)
 {
-	ut_a(thr->magic_n == QUE_THR_MAGIC_N);
-
-	if (!thr->is_active) {
-
-		thr->graph->n_active_thrs++;
-
-		trx->lock.n_active_thrs++;
-
-		thr->is_active = TRUE;
-	}
-
-	thr->state = QUE_THR_RUNNING;
-}
-
-/**********************************************************************//**
-A patch for MySQL used to 'stop' a dummy query thread used in MySQL
-select, when there is no error or lock wait. */
-void
-que_thr_stop_for_mysql_no_error(
-/*============================*/
-	que_thr_t*	thr,	/*!< in: query thread */
-	trx_t*		trx)	/*!< in: transaction */
-{
-	ut_ad(thr->state == QUE_THR_RUNNING);
-	ut_ad(thr->is_active == TRUE);
-	ut_ad(trx->lock.n_active_thrs == 1);
-	ut_ad(thr->graph->n_active_thrs == 1);
-	ut_a(thr->magic_n == QUE_THR_MAGIC_N);
-
-	thr->state = QUE_THR_COMPLETED;
-
-	thr->is_active = FALSE;
-	thr->graph->n_active_thrs--;
-
-	trx->lock.n_active_thrs--;
+  if (active)
+  {
+    n_active_thrs++;
+    trx->lock.n_active_thrs++;
+  }
+  else
+  {
+    ut_ad(n_active_thrs);
+    ut_ad(trx->lock.n_active_thrs);
+    n_active_thrs--;
+    trx->lock.n_active_thrs--;
+  }
 }
+#endif
 
 /****************************************************************//**
 Get the first containing loop node (e.g. while_node_t or for_node_t) for the
diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc
index 9ad2eaa1ebe..9047618d01c 100644
--- a/storage/innobase/read/read0read.cc
+++ b/storage/innobase/read/read0read.cc
@@ -161,16 +161,6 @@ but it will never be dereferenced, because the purge view is older
 than any active transaction.
 
 For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
-
-Some additional issues:
-
-What if trx_sys.view_list == NULL and some transaction T1 and Purge both
-try to open read_view at same time. Only one can acquire trx_sys.mutex.
-In which order will the views be opened? Should it matter? If no, why?
-
-The order does not matter. No new transactions can be created and no running
-RW transaction can commit or rollback (or free views). AC-NL-RO transactions
-will mark their views as closed but not actually free their views.
 */
 
 
@@ -180,7 +170,7 @@ will mark their views as closed but not actually free their views.
 
   @param[in,out] trx transaction
 */
-inline void ReadView::snapshot(trx_t *trx)
+inline void ReadViewBase::snapshot(trx_t *trx)
 {
   trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no);
   std::sort(m_ids.begin(), m_ids.end());
@@ -196,74 +186,52 @@ inline void ReadView::snapshot(trx_t *trx)
   View becomes visible to purge thread.
 
   @param[in,out] trx transaction
+
+  Reuses closed view if there were no read-write transactions since (and at)
+  its creation time.
+
+  Original comment states: there is an inherent race here between purge
+  and this thread.
+
+  To avoid this race we should've checked trx_sys.get_max_trx_id() and
+  set m_open atomically under ReadView::m_mutex protection. But we're cutting
+  corners to achieve greater performance.
+
+  There're at least two types of concurrent threads interested in this
+  value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and
+  InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()).
+
+  What bad things can happen because we allow this race?
+
+  Speculative execution may reorder the state change before get_max_trx_id().
+  In this case the purge thread has a short window in which it may clone an
+  outdated view. That is probably not that bad: for a short while it just
+  won't be able to purge things that it was actually allowed to purge.
+
+  This thread may also get suspended after trx_sys.get_max_trx_id() and
+  before m_open is set. A new read-write transaction may get started,
+  committed and purged meanwhile. That is acceptable as well, since this
+  view doesn't see it.
 */
 void ReadView::open(trx_t *trx)
 {
   ut_ad(this == &trx->read_view);
-  switch (state())
-  {
-  case READ_VIEW_STATE_OPEN:
+  if (is_open())
     ut_ad(!srv_read_only_mode);
-    return;
-  case READ_VIEW_STATE_CLOSED:
-    if (srv_read_only_mode)
-      return;
-    /*
-      Reuse closed view if there were no read-write transactions since (and at)
-      its creation time.
-
-      Original comment states: there is an inherent race here between purge
-      and this thread.
-
-      To avoid this race we should've checked trx_sys.get_max_trx_id() and
-      set state to READ_VIEW_STATE_OPEN atomically under trx_sys.mutex
-      protection. But we're cutting edges to achieve great scalability.
-
-      There're at least two types of concurrent threads interested in this
-      value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and
-      InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()).
-
-      What bad things can happen because we allow this race?
-
-      Speculative execution may reorder state change before get_max_trx_id().
-      In this case purge thread has short gap to clone outdated view. Which is
-      probably not that bad: it just won't be able to purge things that it was
-      actually allowed to purge for a short while.
-
-      This thread may as well get suspended after trx_sys.get_max_trx_id() and
-      before state is set to READ_VIEW_STATE_OPEN. New read-write transaction
-      may get started, committed and purged meanwhile. It is acceptable as
-      well, since this view doesn't see it.
-    */
-    if (trx->is_autocommit_non_locking() && m_ids.empty() &&
-        m_low_limit_id == trx_sys.get_max_trx_id())
-      goto reopen;
-
-    /*
-      Can't reuse view, take new snapshot.
-
-      Alas this empty critical section is simplest way to make sure concurrent
-      purge thread completed snapshot copy. Of course purge thread may come
-      again and try to copy once again after we release this mutex, but in
-      this case it is guaranteed to see READ_VIEW_STATE_REGISTERED and thus
-      it'll skip this view.
-
-      This critical section can be replaced with new state, which purge thread
-      would set to inform us to wait until it completes snapshot. However it'd
-      complicate m_state even further.
-    */
-    mutex_enter(&trx_sys.mutex);
-    mutex_exit(&trx_sys.mutex);
-    m_state.store(READ_VIEW_STATE_SNAPSHOT, std::memory_order_relaxed);
-    break;
-  default:
-    ut_ad(0);
+  else if (likely(!srv_read_only_mode))
+  {
+    m_creator_trx_id= trx->id;
+    if (trx->is_autocommit_non_locking() && empty() &&
+        low_limit_id() == trx_sys.get_max_trx_id())
+      m_open.store(true, std::memory_order_relaxed);
+    else
+    {
+      mutex_enter(&m_mutex);
+      snapshot(trx);
+      m_open.store(true, std::memory_order_relaxed);
+      mutex_exit(&m_mutex);
+    }
   }
-
-  snapshot(trx);
-reopen:
-  m_creator_trx_id= trx->id;
-  m_state.store(READ_VIEW_STATE_OPEN, std::memory_order_release);
 }
 
 
@@ -274,21 +242,11 @@ reopen:
   in. This function is called by purge thread to determine whether it should
   purge the delete marked record or not.
 */
-void trx_sys_t::clone_oldest_view()
+void trx_sys_t::clone_oldest_view(ReadViewBase *view) const
 {
-  purge_sys.view.snapshot(0);
-  mutex_enter(&mutex);
+  view->snapshot(nullptr);
   /* Find oldest view. */
-  for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
-       trx= UT_LIST_GET_NEXT(trx_list, trx))
-  {
-    uint32_t state;
-
-    while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT)
-      ut_delay(1);
-
-    if (state == READ_VIEW_STATE_OPEN)
-      purge_sys.view.copy(trx->read_view);
-  }
-  mutex_exit(&mutex);
+  trx_list.for_each([view](const trx_t &trx) {
+                      trx.read_view.append_to(view);
+		    });
 }
diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc
index e4248ed4ca4..6c42aaa0f38 100644
--- a/storage/innobase/rem/rem0cmp.cc
+++ b/storage/innobase/rem/rem0cmp.cc
@@ -131,11 +131,7 @@ TODO: Remove this function. Everything should use MYSQL_TYPE_NEWDECIMAL.
 respectively */
 static ATTRIBUTE_COLD
 int
-cmp_decimal(
-	const byte*	a,
-	unsigned int	a_length,
-	const byte*	b,
-	unsigned int	b_length)
+cmp_decimal(const byte*	a, ulint a_length, const byte* b, ulint b_length)
 {
 	int	swap_flag;
 
@@ -193,378 +189,124 @@ cmp_decimal(
 	return(swap_flag);
 }
 
-/*************************************************************//**
-Innobase uses this function to compare two geometry data fields
-@return	1, 0, -1, if a is greater, equal, less than b, respectively */
-static
-int
-cmp_geometry_field(
-/*===============*/
-	const byte*	a,		/*!< in: data field */
-	unsigned int	a_length,	/*!< in: data field length,
-					not UNIV_SQL_NULL */
-	const byte*	b,		/*!< in: data field */
-	unsigned int	b_length)	/*!< in: data field length,
-					not UNIV_SQL_NULL */
+/** Compare two data fields.
+@param mtype  main type
+@param prtype precise type
+@param data1  data field
+@param len1   length of data1 in bytes, or UNIV_SQL_NULL
+@param data2  data field
+@param len2   length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+static int cmp_data(ulint mtype, ulint prtype, const byte *data1, ulint len1,
+                    const byte *data2, ulint len2)
 {
-	double		x1, x2;
-	double		y1, y2;
+  ut_ad(len1 != UNIV_SQL_DEFAULT);
+  ut_ad(len2 != UNIV_SQL_DEFAULT);
 
-	if (a_length < sizeof(double) || b_length < sizeof(double)) {
-		return(0);
-	}
+  if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL)
+  {
+    if (len1 == len2)
+      return 0;
 
-	/* Try to compare mbr left lower corner (xmin, ymin) */
-	x1 = mach_double_read(a);
-	x2 = mach_double_read(b);
-	y1 = mach_double_read(a + sizeof(double) * SPDIMS);
-	y2 = mach_double_read(b + sizeof(double) * SPDIMS);
-
-	if (x1 > x2) {
-		return(1);
-	} else if (x2 > x1) {
-		return(-1);
-	}
-
-	if (y1 > y2) {
-		return(1);
-	} else if (y2 > y1) {
-		return(-1);
-	}
-
-	/* left lower corner (xmin, ymin) overlaps, now right upper corner */
-	x1 = mach_double_read(a + sizeof(double));
-	x2 = mach_double_read(b + sizeof(double));
-	y1 = mach_double_read(a + sizeof(double) * SPDIMS + sizeof(double));
-	y2 = mach_double_read(b + sizeof(double) * SPDIMS + sizeof(double));
-
-	if (x1 > x2) {
-		return(1);
-	} else if (x2 > x1) {
-		return(-1);
-	}
-
-	if (y1 > y2) {
-		return(1);
-	} else if (y2 > y1) {
-		return(-1);
-	}
-
-	return(0);
-}
-/*************************************************************//**
-Innobase uses this function to compare two gis data fields
-@return	1, 0, -1, if mode == PAGE_CUR_MBR_EQUAL. And return
-1, 0 for rest compare modes, depends on a and b qualifies the
-relationship (CONTAINT, WITHIN etc.) */
-static
-int
-cmp_gis_field(
-/*============*/
-	page_cur_mode_t	mode,		/*!< in: compare mode */
-	const byte*	a,		/*!< in: data field */
-	unsigned int	a_length,	/*!< in: data field length,
-					not UNIV_SQL_NULL */
-	const byte*	b,		/*!< in: data field */
-	unsigned int	b_length)	/*!< in: data field length,
-					not UNIV_SQL_NULL */
-{
-	if (mode == PAGE_CUR_MBR_EQUAL) {
-		return cmp_geometry_field(a, a_length, b, b_length);
-	} else {
-		return rtree_key_cmp(mode, a, int(a_length), b, int(b_length));
-	}
-}
+    /* We define the SQL null to be the smallest possible value of a field. */
+    return len1 == UNIV_SQL_NULL ? -1 : 1;
+  }
 
-/** Compare two data fields.
-@param[in] mtype main type
-@param[in] prtype precise type
-@param[in] a data field
-@param[in] a_length length of a, in bytes (not UNIV_SQL_NULL)
-@param[in] b data field
-@param[in] b_length length of b, in bytes (not UNIV_SQL_NULL)
-@return positive, 0, negative, if a is greater, equal, less than b,
-respectively */
-static int cmp_whole_field(ulint mtype, ulint prtype,
-                           const byte *a, unsigned a_length,
-                           const byte *b, unsigned b_length)
-{
   switch (mtype) {
   default:
     ib::fatal() << "Unknown data type number " << mtype;
-    return 0;
   case DATA_DECIMAL:
-    return cmp_decimal(a, a_length, b, b_length);
+    return cmp_decimal(data1, len1, data2, len2);
   case DATA_DOUBLE:
     {
-      const double af= mach_double_read(a), bf= mach_double_read(b);
+      const double af= mach_double_read(data1), bf= mach_double_read(data2);
       return af > bf ? 1 : bf > af ? -1 : 0;
     }
   case DATA_FLOAT:
     {
-      const float af= mach_float_read(a), bf= mach_float_read(b);
+      const float af= mach_float_read(data1), bf= mach_float_read(data2);
       return af > bf ? 1 : bf > af ? -1 : 0;
     }
-  case DATA_VARCHAR:
-  case DATA_CHAR:
-    /* latin1_swedish_ci is treated as a special case in InnoDB.
-    Because it is a fixed-length encoding (mbminlen=mbmaxlen=1),
-    non-NULL CHAR(n) values will always occupy n bytes and we
-    can invoke strnncollsp() instead of strnncollsp_nchars(). */
-    return my_charset_latin1.coll->strnncollsp(&my_charset_latin1,
-                                               a, a_length, b, b_length);
+  case DATA_FIXBINARY:
+  case DATA_BINARY:
+    if (dtype_get_charset_coll(prtype) != DATA_MYSQL_BINARY_CHARSET_COLL)
+    {
+      if (ulint len= std::min(len1, len2))
+      {
+        if (int cmp= memcmp(data1, data2, len))
+          return cmp;
+        data1+= len;
+        data2+= len;
+        len1-= len;
+        len2-= len;
+      }
+
+      int cmp= 0;
+      if (len1)
+      {
+        const byte *end= &data1[len1];
+        do
+          cmp= static_cast<int>(*data1++ - byte{0x20});
+        while (cmp == 0 && data1 < end);
+      }
+      else if (len2)
+      {
+        const byte *end= &data2[len2];
+        do
+          cmp= static_cast<int>(byte{0x20} - *data2++);
+        while (cmp == 0 && data2 < end);
+      }
+      return cmp;
+    }
+    /* fall through */
+  case DATA_INT:
+  case DATA_SYS_CHILD:
+  case DATA_SYS:
+    break;
+  case DATA_GEOMETRY:
+    ut_ad(prtype & DATA_BINARY_TYPE);
+    if (prtype & DATA_GIS_MBR)
+    {
+      ut_ad(len1 == DATA_MBR_LEN);
+      ut_ad(len2 == DATA_MBR_LEN);
+      return cmp_geometry_field(data1, data2);
+    }
+    break;
   case DATA_BLOB:
-    ut_ad(!(prtype & DATA_BINARY_TYPE)); /* our only caller tested this */
+    if (prtype & DATA_BINARY_TYPE)
+      break;
     /* fall through */
   case DATA_VARMYSQL:
     DBUG_ASSERT(is_strnncoll_compatible(prtype & DATA_MYSQL_TYPE_MASK));
     if (CHARSET_INFO *cs= get_charset(dtype_get_charset_coll(prtype),
                                       MYF(MY_WME)))
-      return cs->coll->strnncollsp(cs, a, a_length, b, b_length);
-    break;
+      return cs->coll->strnncollsp(cs, data1, len1, data2, len2);
+  no_collation:
+    ib::fatal() << "Unable to find charset-collation for " << prtype;
   case DATA_MYSQL:
     DBUG_ASSERT(is_strnncoll_compatible(prtype & DATA_MYSQL_TYPE_MASK));
     if (CHARSET_INFO *cs= get_charset(dtype_get_charset_coll(prtype),
                                       MYF(MY_WME)))
-      return cs->coll->strnncollsp_nchars(cs, a, a_length, b, b_length,
-                                          std::max(a_length, b_length));
+      return cs->coll->strnncollsp_nchars(cs, data1, len1, data2, len2,
+                                          std::max(len1, len2));
+    goto no_collation;
+  case DATA_VARCHAR:
+  case DATA_CHAR:
+    /* latin1_swedish_ci is treated as a special case in InnoDB.
+    Because it is a fixed-length encoding (mbminlen=mbmaxlen=1),
+    non-NULL CHAR(n) values will always occupy n bytes and we
+    can invoke strnncollsp() instead of strnncollsp_nchars(). */
+    return my_charset_latin1.strnncollsp(data1, len1, data2, len2);
   }
 
-  ib::fatal() << "Unable to find charset-collation for " << prtype;
-}
-
-/** Compare two data fields.
-@param[in] mtype main type
-@param[in] prtype precise type
-@param[in] data1 data field
-@param[in] len1 length of data1 in bytes, or UNIV_SQL_NULL
-@param[in] data2 data field
-@param[in] len2 length of data2 in bytes, or UNIV_SQL_NULL
-@return the comparison result of data1 and data2
-@retval 0 if data1 is equal to data2
-@retval negative if data1 is less than data2
-@retval positive if data1 is greater than data2 */
-inline
-int
-cmp_data(
-	ulint		mtype,
-	ulint		prtype,
-	const byte*	data1,
-	ulint		len1,
-	const byte*	data2,
-	ulint		len2)
-{
-	ut_ad(len1 != UNIV_SQL_DEFAULT);
-	ut_ad(len2 != UNIV_SQL_DEFAULT);
-
-	if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) {
-		if (len1 == len2) {
-			return(0);
-		}
-
-		/* We define the SQL null to be the smallest possible
-		value of a field. */
-		return(len1 == UNIV_SQL_NULL ? -1 : 1);
-	}
-
-	ulint	pad;
-
-	switch (mtype) {
-	case DATA_FIXBINARY:
-	case DATA_BINARY:
-		if (dtype_get_charset_coll(prtype)
-		    != DATA_MYSQL_BINARY_CHARSET_COLL) {
-			pad = 0x20;
-			break;
-		}
-		/* fall through */
-	case DATA_INT:
-	case DATA_SYS_CHILD:
-	case DATA_SYS:
-		pad = ULINT_UNDEFINED;
-		break;
-	case DATA_GEOMETRY:
-		ut_ad(prtype & DATA_BINARY_TYPE);
-		pad = ULINT_UNDEFINED;
-		if (prtype & DATA_GIS_MBR) {
-			return(cmp_geometry_field(data1, (unsigned) len1,
-						  data2, (unsigned) len2));
-		}
-		break;
-	case DATA_BLOB:
-		if (prtype & DATA_BINARY_TYPE) {
-			pad = ULINT_UNDEFINED;
-			break;
-		}
-		/* fall through */
-	default:
-		return(cmp_whole_field(mtype, prtype,
-				       data1, (unsigned) len1,
-				       data2, (unsigned) len2));
-	}
-
-	ulint	len;
-	int	cmp;
-
-	if (len1 < len2) {
-		len = len1;
-		len2 -= len;
-		len1 = 0;
-	} else {
-		len = len2;
-		len1 -= len;
-		len2 = 0;
-	}
-
-	if (len) {
-#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64
-		/* Compare the first bytes with a loop to avoid the call
-		overhead of memcmp(). On x86 and x86-64, the GCC built-in
-		(repz cmpsb) seems to be very slow, so we will be calling the
-		libc version. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
-		tracks the slowness of the GCC built-in memcmp().
-
-		We compare up to the first 4..7 bytes with the loop.
-		The (len & 3) is used for "normalizing" or
-		"quantizing" the len parameter for the memcmp() call,
-		in case the whole prefix is equal. On x86 and x86-64,
-		the GNU libc memcmp() of equal strings is faster with
-		len=4 than with len=3.
-
-		On other architectures than the IA32 or AMD64, there could
-		be a built-in memcmp() that is faster than the loop.
-		We only use the loop where we know that it can improve
-		the performance. */
-		for (ulint i = 4 + (len & 3); i > 0; i--) {
-			cmp = int(*data1++) - int(*data2++);
-			if (cmp) {
-				return(cmp);
-			}
-
-			if (!--len) {
-				break;
-			}
-		}
-
-		if (len) {
-#endif /* IA32 or AMD64 */
-			cmp = memcmp(data1, data2, len);
-
-			if (cmp) {
-				return(cmp);
-			}
-
-			data1 += len;
-			data2 += len;
-#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64
-		}
-#endif /* IA32 or AMD64 */
-	}
-
-	cmp = (int) (len1 - len2);
-
-	if (!cmp || pad == ULINT_UNDEFINED) {
-		return(cmp);
-	}
-
-	len = 0;
-
-	if (len1) {
-		do {
-			cmp = static_cast<int>(
-				mach_read_from_1(&data1[len++]) - pad);
-		} while (cmp == 0 && len < len1);
-	} else {
-		ut_ad(len2 > 0);
-
-		do {
-			cmp = static_cast<int>(
-				pad - mach_read_from_1(&data2[len++]));
-		} while (cmp == 0 && len < len2);
-	}
-
-	return(cmp);
-}
-
-/** Compare a GIS data tuple to a physical record.
-@param[in] dtuple data tuple
-@param[in] rec R-tree record
-@param[in] offsets rec_get_offsets(rec)
-@param[in] mode compare mode
-@retval negative if dtuple is less than rec */
-int
-cmp_dtuple_rec_with_gis(
-/*====================*/
-	const dtuple_t*	dtuple,	/*!< in: data tuple */
-	const rec_t*	rec,	/*!< in: physical record which differs from
-				dtuple in some of the common fields, or which
-				has an equal number or more fields than
-				dtuple */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	page_cur_mode_t	mode)	/*!< in: compare mode */
-{
-	const dfield_t*	dtuple_field;	/* current field in logical record */
-	ulint		dtuple_f_len;	/* the length of the current field
-					in the logical record */
-	ulint		rec_f_len;	/* length of current field in rec */
-	const byte*	rec_b_ptr;	/* pointer to the current byte in
-					rec field */
-	int		ret = 0;	/* return value */
-
-	dtuple_field = dtuple_get_nth_field(dtuple, 0);
-	dtuple_f_len = dfield_get_len(dtuple_field);
-
-	rec_b_ptr = rec_get_nth_field(rec, offsets, 0, &rec_f_len);
-	ret = cmp_gis_field(
-		mode, static_cast<const byte*>(dfield_get_data(dtuple_field)),
-		(unsigned) dtuple_f_len, rec_b_ptr, (unsigned) rec_f_len);
-
-	return(ret);
-}
-
-/** Compare a GIS data tuple to a physical record in rtree non-leaf node.
-We need to check the page number field, since we don't store pk field in
-rtree non-leaf node.
-@param[in]	dtuple		data tuple
-@param[in]	rec		R-tree record
-@param[in]	offsets		rec_get_offsets(rec)
-@retval negative if dtuple is less than rec */
-int
-cmp_dtuple_rec_with_gis_internal(
-	const dtuple_t*	dtuple,
-	const rec_t*	rec,
-	const rec_offs*	offsets)
-{
-	const dfield_t*	dtuple_field;	/* current field in logical record */
-	ulint		dtuple_f_len;	/* the length of the current field
-					in the logical record */
-	ulint		rec_f_len;	/* length of current field in rec */
-	const byte*	rec_b_ptr;	/* pointer to the current byte in
-					rec field */
-	int		ret = 0;	/* return value */
-
-	dtuple_field = dtuple_get_nth_field(dtuple, 0);
-	dtuple_f_len = dfield_get_len(dtuple_field);
-
-	rec_b_ptr = rec_get_nth_field(rec, offsets, 0, &rec_f_len);
-	ret = cmp_gis_field(
-		PAGE_CUR_WITHIN,
-		static_cast<const byte*>(dfield_get_data(dtuple_field)),
-		(unsigned) dtuple_f_len, rec_b_ptr, (unsigned) rec_f_len);
-	if (ret != 0) {
-		return(ret);
-	}
-
-	dtuple_field = dtuple_get_nth_field(dtuple, 1);
-	dtuple_f_len = dfield_get_len(dtuple_field);
-	rec_b_ptr = rec_get_nth_field(rec, offsets, 1, &rec_f_len);
+  if (ulint len= std::min(len1, len2))
+    if (int cmp= memcmp(data1, data2, len))
+      return cmp;
 
-	return(cmp_data(dtuple_field->type.mtype,
-			dtuple_field->type.prtype,
-			static_cast<const byte*>(dtuple_field->data),
-			dtuple_f_len,
-			rec_b_ptr,
-			rec_f_len));
+  return len1 > len2 ? 1 : len2 > len1 ? -1 : 0;
 }
 
 /** Compare two data fields.
@@ -1122,7 +864,8 @@ cmp_rec_rec(
 	no need to compare the child page number. */
 	n_fields = std::min(rec_offs_n_fields(offsets1),
 			    rec_offs_n_fields(offsets2));
-	n_fields = std::min(n_fields, dict_index_get_n_unique_in_tree(index));
+	n_fields = std::min<ulint>(n_fields,
+				   dict_index_get_n_unique_in_tree(index));
 
 	for (; cur_field < n_fields; cur_field++) {
 		ulint	mtype;
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
index 08682304410..902f3f2d5ca 100644
--- a/storage/innobase/rem/rem0rec.cc
+++ b/storage/innobase/rem/rem0rec.cc
@@ -167,7 +167,7 @@ rec_get_n_extern_new(
 	ulint		i;
 
 	ut_ad(dict_table_is_comp(index->table));
-	ut_ad(!index->table->supports_instant() || index->is_dummy);
+	ut_ad(!index->table->supports_instant());
 	ut_ad(!index->is_instant());
 	ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY
 	      || rec_get_status(rec) == REC_STATUS_INSTANT);
@@ -293,8 +293,8 @@ rec_init_offsets_comp_ordinary(
 		   : index->n_core_null_bytes);
 
 	if (mblob) {
-		ut_ad(index->is_dummy || index->table->instant);
-		ut_ad(index->is_dummy || index->is_instant());
+		ut_ad(index->table->instant);
+		ut_ad(index->is_instant());
 		ut_ad(rec_offs_n_fields(offsets)
 		      <= ulint(index->n_fields) + 1);
 		ut_ad(!def_val);
@@ -361,7 +361,8 @@ start:
 	do {
 		if (mblob) {
 			if (i == index->first_user_field()) {
-				offs += FIELD_REF_SIZE;
+				offs = static_cast<rec_offs>(offs
+							     + FIELD_REF_SIZE);
 				len = combine(offs, STORED_OFFPAGE);
 				any |= REC_OFFS_EXTERNAL;
 				field--;
@@ -434,12 +435,12 @@ start:
 			more, or when the field is stored externally. */
 			if ((len & 0x80) && DATA_BIG_COL(col)) {
 				/* 1exxxxxxx xxxxxxxx */
-				len <<= 8;
-				len |= *lens--;
-
-				offs += get_value(len);
+				len = static_cast<rec_offs>(len << 8
+							    | *lens--);
+				offs = static_cast<rec_offs>(offs
+							     + get_value(len));
 				if (UNIV_UNLIKELY(len & 0x4000)) {
-					ut_ad(dict_index_is_clust(index));
+					ut_ad(index->is_primary());
 					any |= REC_OFFS_EXTERNAL;
 					len = combine(offs, STORED_OFFPAGE);
 				} else {
@@ -449,15 +450,17 @@ start:
 				continue;
 			}
 
-			len = offs += len;
+			len = offs = static_cast<rec_offs>(offs + len);
 		} else {
-			len = offs += field->fixed_len;
+			len = offs = static_cast<rec_offs>(offs
+							   + field->fixed_len);
 		}
 	} while (field++, rec_offs_base(offsets)[++i] = len,
 		 i < rec_offs_n_fields(offsets));
 
-	*rec_offs_base(offsets)
-		= static_cast<rec_offs>(rec - (lens + 1)) | REC_OFFS_COMPACT | any;
+	*rec_offs_base(offsets) = static_cast<rec_offs>((rec - (lens + 1))
+							| REC_OFFS_COMPACT
+							| any);
 }
 
 #ifdef UNIV_DEBUG
@@ -476,11 +479,12 @@ rec_offs_make_valid(
 	const bool is_alter_metadata = leaf
 		&& rec_is_alter_metadata(rec, *index);
 	ut_ad(is_alter_metadata
-	      || rec_offs_n_fields(offsets)
-	      <= (leaf
-		  ? dict_index_get_n_fields(index)
-		  : dict_index_get_n_unique_in_tree_nonleaf(index) + 1)
-	      || index->is_dummy || dict_index_is_ibuf(index));
+	      || index->is_dummy || index->is_ibuf()
+	      || (leaf
+		  ? rec_offs_n_fields(offsets)
+		  <= dict_index_get_n_fields(index)
+		  : rec_offs_n_fields(offsets) - 1
+		  <= dict_index_get_n_unique_in_tree_nonleaf(index)));
 	const bool is_user_rec = (dict_table_is_comp(index->table)
 				  ? rec_get_heap_no_new(rec)
 				  : rec_get_heap_no_old(rec))
@@ -536,7 +540,7 @@ rec_offs_validate(
 	}
 	if (index) {
 		ut_ad(!memcmp(&index, &offsets[INDEX_OFFSET], sizeof(index)));
-		ulint max_n_fields = std::max(
+		ulint max_n_fields = std::max<ulint>(
 			dict_index_get_n_fields(index),
 			dict_index_get_n_unique_in_tree(index) + 1);
 		if (comp && rec) {
@@ -674,7 +678,8 @@ rec_init_offsets(
 		do {
 			rec_offs len;
 			if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
-				len = offs += REC_NODE_PTR_SIZE;
+				len = offs = static_cast<rec_offs>(
+					offs + REC_NODE_PTR_SIZE);
 				goto resolved;
 			}
 
@@ -717,33 +722,34 @@ rec_init_offsets(
 				if (DATA_BIG_COL(col)) {
 					if (len & 0x80) {
 						/* 1exxxxxxx xxxxxxxx */
-
-						len <<= 8;
-						len |= *lens--;
+						len = static_cast<rec_offs>(
+							len << 8 | *lens--);
 
 						/* B-tree node pointers
 						must not contain externally
 						stored columns.  Thus
 						the "e" flag must be 0. */
 						ut_a(!(len & 0x4000));
-						offs += get_value(len);
+						offs = static_cast<rec_offs>(
+							offs + get_value(len));
 						len = offs;
 
 						goto resolved;
 					}
 				}
 
-				len = offs += len;
+				len = offs = static_cast<rec_offs>(offs + len);
 			} else {
-				len = offs += field->fixed_len;
+				len = offs = static_cast<rec_offs>(
+					offs + field->fixed_len);
 			}
 resolved:
 			rec_offs_base(offsets)[i + 1] = len;
 		} while (++i < rec_offs_n_fields(offsets));
 
 		*rec_offs_base(offsets)
-			= static_cast<rec_offs>(rec - (lens + 1))
-			  | REC_OFFS_COMPACT;
+			= static_cast<rec_offs>((rec - (lens + 1))
+						| REC_OFFS_COMPACT);
 	} else {
 		/* Old-style record: determine extra size and end offsets */
 		offs = REC_N_OLD_EXTRA_BYTES;
@@ -752,29 +758,32 @@ resolved:
 		rec_offs any;
 
 		if (rec_get_1byte_offs_flag(rec)) {
-			offs += static_cast<rec_offs>(n_fields);
+			offs = static_cast<rec_offs>(offs + n_fields);
 			any = offs;
 			/* Determine offsets to fields */
 			do {
 				offs = rec_1_get_field_end_info(rec, i);
 				if (offs & REC_1BYTE_SQL_NULL_MASK) {
-					offs &= ~REC_1BYTE_SQL_NULL_MASK;
+					offs &= static_cast<rec_offs>(
+						~REC_1BYTE_SQL_NULL_MASK);
 					set_type(offs, SQL_NULL);
 				}
 				rec_offs_base(offsets)[1 + i] = offs;
 			} while (++i < n);
 		} else {
-			offs += 2 * static_cast<rec_offs>(n_fields);
+			offs = static_cast<rec_offs>(offs + 2 * n_fields);
 			any = offs;
 			/* Determine offsets to fields */
 			do {
 				offs = rec_2_get_field_end_info(rec, i);
 				if (offs & REC_2BYTE_SQL_NULL_MASK) {
-					offs &= ~REC_2BYTE_SQL_NULL_MASK;
+					offs &= static_cast<rec_offs>(
+						~REC_2BYTE_SQL_NULL_MASK);
 					set_type(offs, SQL_NULL);
 				}
 				if (offs & REC_2BYTE_EXTERN_MASK) {
-					offs &= ~REC_2BYTE_EXTERN_MASK;
+					offs &= static_cast<rec_offs>(
+						~REC_2BYTE_EXTERN_MASK);
 					set_type(offs, STORED_OFFPAGE);
 					any |= REC_OFFS_EXTERNAL;
 				}
@@ -883,8 +892,8 @@ rec_get_offsets_func(
 		ut_ad(!is_user_rec || n_core || index->is_dummy
 		      || dict_index_is_ibuf(index)
 		      || n == n_fields /* dict_stats_analyze_index_level() */
-		      || n
-		      == dict_index_get_n_unique_in_tree_nonleaf(index) + 1);
+		      || n - 1
+		      == dict_index_get_n_unique_in_tree_nonleaf(index));
 		ut_ad(!is_user_rec || !n_core || index->is_dummy
 		      || dict_index_is_ibuf(index)
 		      || n == n_fields /* btr_pcur_restore_position() */
@@ -927,8 +936,8 @@ rec_get_offsets_func(
 		memcpy(&offsets[INDEX_OFFSET], &index, sizeof index);
 #endif /* UNIV_DEBUG */
 		ut_ad(n_core);
-		ut_ad(index->is_dummy || index->table->instant);
-		ut_ad(index->is_dummy || index->is_instant());
+		ut_ad(index->table->instant);
+		ut_ad(index->is_instant());
 		ut_ad(rec_offs_n_fields(offsets)
 		      <= ulint(index->n_fields) + 1);
 		rec_init_offsets_comp_ordinary<true>(rec, index, offsets,
@@ -991,7 +1000,8 @@ rec_get_offsets_reverse(
 	do {
 		rec_offs len;
 		if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
-			len = offs += REC_NODE_PTR_SIZE;
+			len = offs = static_cast<rec_offs>(
+				offs + REC_NODE_PTR_SIZE);
 			goto resolved;
 		}
 
@@ -1031,10 +1041,11 @@ rec_get_offsets_reverse(
 			if (DATA_BIG_COL(col)) {
 				if (len & 0x80) {
 					/* 1exxxxxxx xxxxxxxx */
-					len <<= 8;
-					len |= *lens++;
+					len = static_cast<rec_offs>(
+						len << 8 | *lens++);
 
-					offs += get_value(len);
+					offs = static_cast<rec_offs>(
+						offs + get_value(len));
 					if (UNIV_UNLIKELY(len & 0x4000)) {
 						any_ext = REC_OFFS_EXTERNAL;
 						len = combine(offs,
@@ -1047,9 +1058,10 @@ rec_get_offsets_reverse(
 				}
 			}
 
-			len = offs += len;
+			len = offs = static_cast<rec_offs>(offs + len);
 		} else {
-			len = offs += static_cast<rec_offs>(field->fixed_len);
+			len = offs = static_cast<rec_offs>(offs
+							   + field->fixed_len);
 		}
 resolved:
 		rec_offs_base(offsets)[i + 1] = len;
@@ -1136,7 +1148,7 @@ rec_get_converted_size_comp_prefix_low(
 {
 	ulint	extra_size = temp ? 0 : REC_N_NEW_EXTRA_BYTES;
 	ut_ad(n_fields > 0);
-	ut_ad(n_fields <= dict_index_get_n_fields(index) + mblob);
+	ut_ad(n_fields - mblob <= dict_index_get_n_fields(index));
 	ut_d(ulint n_null = index->n_nullable);
 	ut_ad(status == REC_STATUS_ORDINARY || status == REC_STATUS_NODE_PTR
 	      || status == REC_STATUS_INSTANT);
@@ -1346,61 +1358,6 @@ rec_get_converted_size_comp(
 	return(ULINT_UNDEFINED);
 }
 
-/***********************************************************//**
-Sets the value of the ith field SQL null bit of an old-style record. */
-void
-rec_set_nth_field_null_bit(
-/*=======================*/
-	rec_t*	rec,	/*!< in: record */
-	ulint	i,	/*!< in: ith field */
-	ibool	val)	/*!< in: value to set */
-{
-	ulint	info;
-
-	if (rec_get_1byte_offs_flag(rec)) {
-
-		info = rec_1_get_field_end_info(rec, i);
-
-		if (val) {
-			info = info | REC_1BYTE_SQL_NULL_MASK;
-		} else {
-			info = info & ~REC_1BYTE_SQL_NULL_MASK;
-		}
-
-		rec_1_set_field_end_info(rec, i, info);
-
-		return;
-	}
-
-	info = rec_2_get_field_end_info(rec, i);
-
-	if (val) {
-		info = info | REC_2BYTE_SQL_NULL_MASK;
-	} else {
-		info = info & ~REC_2BYTE_SQL_NULL_MASK;
-	}
-
-	rec_2_set_field_end_info(rec, i, info);
-}
-
-/***********************************************************//**
-Sets an old-style record field to SQL null.
-The physical size of the field is not changed. */
-void
-rec_set_nth_field_sql_null(
-/*=======================*/
-	rec_t*	rec,	/*!< in: record */
-	ulint	n)	/*!< in: index of the field */
-{
-	ulint	offset;
-
-	offset = rec_get_field_start_offs(rec, n);
-
-	data_write_sql_null(rec + offset, rec_get_nth_field_size(rec, n));
-
-	rec_set_nth_field_null_bit(rec, n, TRUE);
-}
-
 /*********************************************************//**
 Builds an old-style physical record out of a data tuple and
 stores it beginning from the start of the given buffer.
@@ -1438,9 +1395,12 @@ rec_convert_dtuple_to_rec_old(
 	rec_set_n_fields_old(rec, n_fields);
 
 	/* Set the info bits of the record */
-	rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple)
-			      & REC_INFO_BITS_MASK);
-	rec_set_heap_no_old(rec, PAGE_HEAP_NO_USER_LOW);
+	rec_set_bit_field_1(rec,
+			    dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK,
+			    REC_OLD_INFO_BITS,
+			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+	rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW, REC_OLD_HEAP_NO,
+			    REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
 
 	/* Store the data and the offsets */
 
@@ -1557,7 +1517,9 @@ rec_convert_dtuple_to_rec_comp(
 		ut_ad(status == REC_STATUS_INSTANT);
 		ut_ad(n_fields == ulint(index->n_fields) + 1);
 		rec_set_n_add_field(nulls, n_fields - 1 - n_core_fields);
-		rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW);
+		rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW,
+				    REC_NEW_HEAP_NO, REC_HEAP_NO_MASK,
+				    REC_HEAP_NO_SHIFT);
 		rec_set_status(rec, REC_STATUS_INSTANT);
 		n_node_ptr_field = ULINT_UNDEFINED;
 		lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
@@ -1572,7 +1534,9 @@ rec_convert_dtuple_to_rec_comp(
 	case REC_STATUS_ORDINARY:
 		ut_ad(n_fields <= dict_index_get_n_fields(index));
 		if (!temp) {
-			rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW);
+			rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW,
+					    REC_NEW_HEAP_NO, REC_HEAP_NO_MASK,
+					    REC_HEAP_NO_SHIFT);
 			rec_set_status(rec, n_fields == n_core_fields
 				       ? REC_STATUS_ORDINARY
 				       : REC_STATUS_INSTANT);
@@ -1594,10 +1558,12 @@ rec_convert_dtuple_to_rec_comp(
 		break;
 	case REC_STATUS_NODE_PTR:
 		ut_ad(!temp);
-		rec_set_heap_no_new(rec, PAGE_HEAP_NO_USER_LOW);
+		rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW,
+				    REC_NEW_HEAP_NO, REC_HEAP_NO_MASK,
+				    REC_HEAP_NO_SHIFT);
 		rec_set_status(rec, status);
-		ut_ad(n_fields
-		      == dict_index_get_n_unique_in_tree_nonleaf(index) + 1);
+		ut_ad(n_fields - 1
+		      == dict_index_get_n_unique_in_tree_nonleaf(index));
 		ut_d(n_null = std::min<uint>(index->n_core_null_bytes * 8U,
 					     index->n_nullable));
 		n_node_ptr_field = n_fields - 1;
@@ -1652,7 +1618,14 @@ start:
 
 			/* set the null flag if necessary */
 			if (dfield_is_null(field)) {
-				*nulls |= null_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+				*nulls |= static_cast<byte>(null_mask);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 				null_mask <<= 1;
 				continue;
 			}
@@ -1687,21 +1660,20 @@ start:
 			ut_ad(DATA_BIG_COL(ifield->col));
 			ut_ad(len <= REC_ANTELOPE_MAX_INDEX_COL_LEN
 					+ BTR_EXTERN_FIELD_REF_SIZE);
-			*lens-- = (byte) (len >> 8) | 0xc0;
-			*lens-- = (byte) len;
+			*lens-- = static_cast<byte>(len >> 8 | 0xc0);
+			*lens-- = static_cast<byte>(len);
 		} else {
 			ut_ad(len <= field->type.len
 			      || DATA_LARGE_MTYPE(field->type.mtype)
 			      || !strcmp(index->name,
 					 FTS_INDEX_TABLE_IND_NAME));
 			if (len < 128 || !DATA_BIG_LEN_MTYPE(
-				field->type.len, field->type.mtype)) {
-
-				*lens-- = (byte) len;
+				    field->type.len, field->type.mtype)) {
+				*lens-- = static_cast<byte>(len);
 			} else {
 				ut_ad(len < 16384);
-				*lens-- = (byte) (len >> 8) | 0x80;
-				*lens-- = (byte) len;
+				*lens-- = static_cast<byte>(len >> 8 | 0x80);
+				*lens-- = static_cast<byte>(len);
 			}
 		}
 
@@ -1757,7 +1729,9 @@ rec_convert_dtuple_to_rec_new(
 			status, false);
 	}
 
-	rec_set_info_bits_new(buf, dtuple->info_bits & ~REC_NEW_STATUS_MASK);
+	rec_set_bit_field_1(buf, dtuple->info_bits & ~REC_NEW_STATUS_MASK,
+			    REC_NEW_INFO_BITS,
+			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
 	return buf;
 }
 
@@ -1932,8 +1906,8 @@ rec_copy_prefix_to_dtuple(
 	rec_offs_init(offsets_);
 
 	ut_ad(n_core <= index->n_core_fields);
-	ut_ad(n_core || n_fields
-	      <= dict_index_get_n_unique_in_tree_nonleaf(index) + 1);
+	ut_ad(n_core || n_fields - 1
+	      <= dict_index_get_n_unique_in_tree_nonleaf(index));
 
 	offsets = rec_get_offsets(rec, index, offsets, n_core,
 				  n_fields, &heap);
@@ -1995,7 +1969,7 @@ rec_copy_prefix_to_buf_old(
 		*buf = static_cast<byte*>(ut_malloc_nokey(prefix_len));
 	}
 
-	ut_memcpy(*buf, rec - area_start, prefix_len);
+	memcpy(*buf, rec - area_start, prefix_len);
 
 	copy_rec = *buf + area_start;
 
@@ -2294,7 +2268,7 @@ rec_print_old(
 	n = rec_get_n_fields_old(rec);
 
 	fprintf(file, "PHYSICAL RECORD: n_fields " ULINTPF ";"
-		" %u-byte offsets; info bits " ULINTPF "\n",
+		" %u-byte offsets; info bits %u\n",
 		n,
 		rec_get_1byte_offs_flag(rec) ? 1 : 2,
 		rec_get_info_bits(rec, FALSE));
@@ -2548,7 +2522,7 @@ rec_print_new(
 	}
 
 	fprintf(file, "PHYSICAL RECORD: n_fields " ULINTPF ";"
-		" compact format; info bits " ULINTPF "\n",
+		" compact format; info bits %u\n",
 		rec_offs_n_fields(offsets),
 		rec_get_info_bits(rec, TRUE));
 
@@ -2786,8 +2760,9 @@ wsrep_rec_get_foreign_key(
 			memcpy(buf, data, len);
 			*buf_len = wsrep_innobase_mysql_sort(
 				(int)(col_f->prtype & DATA_MYSQL_TYPE_MASK),
-				(uint)dtype_get_charset_coll(col_f->prtype),
-				buf, len, *buf_len);
+				dtype_get_charset_coll(col_f->prtype),
+				buf, static_cast<uint>(len),
+				static_cast<uint>(*buf_len));
 		} else { /* new protocol */
 			if (!(col_r->prtype & DATA_NOT_NULL)) {
 				*buf++ = 0;
@@ -2804,7 +2779,7 @@ wsrep_rec_get_foreign_key(
 					}
 					data++;
 				}
-		
+
 				if (!(col_f->prtype & DATA_UNSIGNED)) {
 					buf[len-1] = (byte) (buf[len-1] ^ 128);
 				}
@@ -2816,11 +2791,10 @@ wsrep_rec_get_foreign_key(
 			case DATA_CHAR:
 			case DATA_MYSQL:
 				/* Copy the actual data */
-				ut_memcpy(buf, data, len);
+				memcpy(buf, data, len);
 				len = wsrep_innobase_mysql_sort(
 					(int)
 					(col_f->prtype & DATA_MYSQL_TYPE_MASK),
-					(uint)
 					dtype_get_charset_coll(col_f->prtype),
 					buf, len, *buf_len);
 				break;
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 8d1dbad22cb..bc93ca25195 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -103,7 +103,8 @@ row_merge_create_fts_sort_index(
 		? DATA_VARCHAR : DATA_VARMYSQL;
 	field->col->mbminlen = idx_field->col->mbminlen;
 	field->col->mbmaxlen = idx_field->col->mbmaxlen;
-	field->col->len = HA_FT_MAXCHARLEN * unsigned(field->col->mbmaxlen);
+	field->col->len = static_cast<uint16_t>(
+		HA_FT_MAXCHARLEN * field->col->mbmaxlen);
 
 	field->fixed_len = 0;
 
@@ -216,7 +217,6 @@ row_fts_psort_info_init(
 	common_info->trx = trx;
 	common_info->all_info = psort_info;
 	common_info->sort_event = os_event_create(0);
-	common_info->merge_event = os_event_create(0);
 	common_info->opt_doc_id_size = opt_doc_id_size;
 
 	if (log_tmp_is_encrypted()) {
@@ -253,14 +253,9 @@ row_fts_psort_info_init(
 			}
 
 			/* Need to align memory for O_DIRECT write */
-			psort_info[j].block_alloc[i] =
-				static_cast<row_merge_block_t*>(ut_malloc_nokey(
-					block_size + 1024));
-
 			psort_info[j].merge_block[i] =
 				static_cast<row_merge_block_t*>(
-					ut_align(
-					psort_info[j].block_alloc[i], 1024));
+					aligned_malloc(block_size, 1024));
 
 			if (!psort_info[j].merge_block[i]) {
 				ret = FALSE;
@@ -270,23 +265,17 @@ row_fts_psort_info_init(
 			/* If tablespace is encrypted, allocate additional buffer for
 			encryption/decryption. */
 			if (encrypted) {
-
 				/* Need to align memory for O_DIRECT write */
-				psort_info[j].crypt_alloc[i] =
-					static_cast<row_merge_block_t*>(ut_malloc_nokey(
-							block_size + 1024));
-
 				psort_info[j].crypt_block[i] =
 					static_cast<row_merge_block_t*>(
-						ut_align(
-							psort_info[j].crypt_alloc[i], 1024));
+						aligned_malloc(block_size,
+							       1024));
 
 				if (!psort_info[j].crypt_block[i]) {
 					ret = FALSE;
 					goto func_exit;
 				}
 			} else {
-				psort_info[j].crypt_alloc[i] = NULL;
 				psort_info[j].crypt_block[i] = NULL;
 			}
 		}
@@ -338,19 +327,15 @@ row_fts_psort_info_destroy(
 						psort_info[j].merge_file[i]);
 				}
 
-				ut_free(psort_info[j].block_alloc[i]);
+				aligned_free(psort_info[j].merge_block[i]);
 				ut_free(psort_info[j].merge_file[i]);
-
-				if (psort_info[j].crypt_alloc[i]) {
-					ut_free(psort_info[j].crypt_alloc[i]);
-				}
+				aligned_free(psort_info[j].crypt_block[i]);
 			}
 
 			mutex_free(&psort_info[j].mutex);
 		}
 
 		os_event_destroy(merge_info[0].psort_common->sort_event);
-		os_event_destroy(merge_info[0].psort_common->merge_event);
 		ut_free(merge_info[0].psort_common->dup);
 		ut_free(merge_info[0].psort_common);
 		ut_free(psort_info);
@@ -651,7 +636,7 @@ row_merge_fts_doc_tokenize(
 
 		field->type.mtype = DATA_INT;
 		field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
-		field->type.len = len;
+		field->type.len = static_cast<uint16_t>(field->len);
 		field->type.mbminlen = 0;
 		field->type.mbmaxlen = 0;
 
@@ -675,7 +660,7 @@ row_merge_fts_doc_tokenize(
 
 		field->type.mtype = DATA_INT;
 		field->type.prtype = DATA_NOT_NULL;
-		field->type.len = len;
+		field->type.len = 4;
 		field->type.mbminlen = 0;
 		field->type.mbmaxlen = 0;
 		cur_len += len;
@@ -754,10 +739,9 @@ row_merge_fts_get_next_doc_item(
 /*********************************************************************//**
 Function performs parallel tokenization of the incoming doc strings.
 It also performs the initial in memory sort of the parsed records.
-@return OS_THREAD_DUMMY_RETURN */
+*/
 static
-os_thread_ret_t
-DECLARE_THREAD(fts_parallel_tokenization)(
+void fts_parallel_tokenization(
 /*======================*/
 	void*		arg)	/*!< in: psort_info for the thread */
 {
@@ -1034,11 +1018,11 @@ exit:
 				       crypt_block[i], table->space_id);
 
 		if (error != DB_SUCCESS) {
-			os_file_close(tmpfd[i]);
+			row_merge_file_destroy_low(tmpfd[i]);
 			goto func_exit;
 		}
 
-		os_file_close(tmpfd[i]);
+		row_merge_file_destroy_low(tmpfd[i]);
 	}
 
 func_exit:
@@ -1066,10 +1050,6 @@ func_exit:
 	psort_info->child_status = FTS_CHILD_COMPLETE;
 	os_event_set(psort_info->psort_common->sort_event);
 	psort_info->child_status = FTS_CHILD_EXITING;
-
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
 }
 
 /*********************************************************************//**
@@ -1080,23 +1060,20 @@ row_fts_start_psort(
 	fts_psort_t*	psort_info)	/*!< parallel sort structure */
 {
 	ulint		i = 0;
-	os_thread_id_t	thd_id;
 
 	for (i = 0; i < fts_sort_pll_degree; i++) {
 		psort_info[i].psort_id = i;
-		psort_info[i].thread_hdl =
-			os_thread_create(fts_parallel_tokenization,
-				(void*) &psort_info[i],
-				 &thd_id);
+		psort_info[i].task =
+			new tpool::waitable_task(fts_parallel_tokenization,&psort_info[i]);
+		srv_thread_pool->submit_task(psort_info[i].task);
 	}
 }
 
 /*********************************************************************//**
-Function performs the merge and insertion of the sorted records.
-@return OS_THREAD_DUMMY_RETURN */
+Function performs the merge and insertion of the sorted records. */
 static
-os_thread_ret_t
-DECLARE_THREAD(fts_parallel_merge)(
+void
+fts_parallel_merge(
 /*===============*/
 	void*		arg)		/*!< in: parallel merge info */
 {
@@ -1110,14 +1087,6 @@ DECLARE_THREAD(fts_parallel_merge)(
 	row_fts_merge_insert(psort_info->psort_common->dup->index,
 			     psort_info->psort_common->new_table,
 			     psort_info->psort_common->all_info, id);
-
-	psort_info->child_status = FTS_CHILD_COMPLETE;
-	os_event_set(psort_info->psort_common->merge_event);
-	psort_info->child_status = FTS_CHILD_EXITING;
-
-	os_thread_exit(false);
-
-	OS_THREAD_DUMMY_RETURN;
 }
 
 /*********************************************************************//**
@@ -1129,15 +1098,15 @@ row_fts_start_parallel_merge(
 {
 	ulint		i = 0;
 
-	/* Kick off merge/insert threads */
+	/* Kick off merge/insert tasks */
 	for (i = 0; i <  FTS_NUM_AUX_INDEX; i++) {
 		merge_info[i].psort_id = i;
 		merge_info[i].child_status = 0;
 
-		merge_info[i].thread_hdl = os_thread_create(
+		merge_info[i].task = new tpool::waitable_task(
 			fts_parallel_merge,
-			(void*) &merge_info[i],
-			&merge_info[i].thread_hdl);
+			(void*) &merge_info[i]);
+		srv_thread_pool->submit_task(merge_info[i].task);
 	}
 }
 
@@ -1653,7 +1622,8 @@ row_fts_merge_insert(
 	in order to get the correct aux table names. */
 	index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
 	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
-			index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
+			index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME
+			& ((1U << DICT_TF2_BITS) - 1););
 	fts_table.type = FTS_INDEX_TABLE;
 	fts_table.index_id = index->id;
 	fts_table.table_id = table->id;
@@ -1673,9 +1643,7 @@ row_fts_merge_insert(
 	      == UT_BITS_IN_BYTES(aux_index->n_nullable));
 
 	/* Create bulk load instance */
-	ins_ctx.btr_bulk = UT_NEW_NOKEY(
-		BtrBulk(aux_index, trx, psort_info[0].psort_common->trx
-			->get_flush_observer()));
+	ins_ctx.btr_bulk = UT_NEW_NOKEY(BtrBulk(aux_index, trx));
 
 	/* Create tuple for insert */
 	ins_ctx.tuple = dtuple_create(heap, dict_index_get_n_fields(aux_index));
@@ -1810,9 +1778,5 @@ exit:
 		ib::info() << "InnoDB_FTS: inserted " << count << " records";
 	}
 
-	if (psort_info[0].psort_common->trx->get_flush_observer()) {
-		row_merge_write_redo(aux_index);
-	}
-
 	return(error);
 }
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index cd8e46fe623..15d3b69ccfd 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -251,6 +251,9 @@ public:
 	RecIterator() UNIV_NOTHROW
 	{
 		memset(&m_cur, 0x0, sizeof(m_cur));
+		/* Make page_cur_delete_rec() happy. */
+		m_mtr.start();
+		m_mtr.set_log_mode(MTR_LOG_NO_REDO);
 	}
 
 	/** Position the cursor on the first user record. */
@@ -277,6 +280,8 @@ public:
 		return(page_cur_get_rec(&m_cur));
 	}
 
+	buf_block_t* current_block() const { return m_cur.block; }
+
 	/**
 	@return true if cursor is at the end */
 	bool	end() UNIV_NOTHROW
@@ -288,22 +293,47 @@ public:
 	@return true on success */
 	bool remove(
 		const dict_index_t*	index,
-		page_zip_des_t*		page_zip,
 		rec_offs*		offsets) UNIV_NOTHROW
 	{
+		ut_ad(page_is_leaf(m_cur.block->frame));
 		/* We can't end up with an empty page unless it is root. */
 		if (page_get_n_recs(m_cur.block->frame) <= 1) {
 			return(false);
 		}
 
-		return(page_delete_rec(index, &m_cur, page_zip, offsets));
+		if (!rec_offs_any_extern(offsets)
+		    && m_cur.block->page.id().page_no() != index->page
+		    && ((page_get_data_size(m_cur.block->frame)
+			 - rec_offs_size(offsets)
+			 < BTR_CUR_PAGE_COMPRESS_LIMIT(index))
+			|| !page_has_siblings(m_cur.block->frame)
+			|| (page_get_n_recs(m_cur.block->frame) < 2))) {
+			return false;
+		}
+
+#ifdef UNIV_ZIP_DEBUG
+		page_zip_des_t* page_zip = buf_block_get_page_zip(m_cur.block);
+		ut_a(!page_zip || page_zip_validate(
+			     page_zip, m_cur.block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+		page_cur_delete_rec(&m_cur, index, offsets, &m_mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(
+			     page_zip, m_cur.block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+		return true;
 	}
 
 private:
 	page_cur_t	m_cur;
+public:
+	mtr_t		m_mtr;
 };
 
-/** Class that purges delete marked reocords from indexes, both secondary
+/** Class that purges delete marked records from indexes, both secondary
 and cluster. It does a pessimistic delete. This should only be done if we
 couldn't purge the delete marked reocrds during Phase I. */
 class IndexPurge {
@@ -521,7 +551,7 @@ protected:
 	/** Check if the page is marked as free in the extent descriptor.
 	@param page_no page number to check in the extent descriptor.
 	@return true if the page is marked as free */
-	bool is_free(ulint page_no) const UNIV_NOTHROW
+	bool is_free(uint32_t page_no) const UNIV_NOTHROW
 	{
 		ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no)
 		     == m_xdes_page_no);
@@ -530,7 +560,7 @@ protected:
 			const xdes_t*	xdesc = xdes(page_no, m_xdes);
 			ulint		pos = page_no % FSP_EXTENT_SIZE;
 
-			return(xdes_get_bit(xdesc, XDES_FREE_BIT, pos));
+			return xdes_is_free(xdesc, pos);
 		}
 
 		/* If the current xdes was free, the page must be free. */
@@ -700,7 +730,7 @@ dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW
 	const page_t*	page = get_frame(block);
 
 	m_index.m_id = btr_page_get_index_id(page);
-	m_index.m_page_no = block->page.id.page_no();
+	m_index.m_page_no = block->page.id().page_no();
 
 	/* Check that the tablespace flags match the table flags. */
 	ulint expected = dict_tf_to_fsp_flags(m_table->flags);
@@ -825,14 +855,11 @@ public:
 		AbstractCallback(trx, space_id),
 		m_cfg(cfg),
 		m_index(cfg->m_indexes),
-		m_current_lsn(log_get_lsn()),
-		m_page_zip_ptr(0),
 		m_rec_iter(),
 		m_offsets_(), m_offsets(m_offsets_),
 		m_heap(0),
 		m_cluster_index(dict_table_get_first_index(cfg->m_table))
 	{
-		ut_ad(m_current_lsn);
 		rec_offs_init(m_offsets_);
 	}
 
@@ -859,9 +886,8 @@ private:
 	@param block block read from file
 	@param page_type type of the page
 	@retval DB_SUCCESS or error code */
-	dberr_t update_page(
-		buf_block_t*	block,
-		ulint&		page_type) UNIV_NOTHROW;
+	dberr_t update_page(buf_block_t* block, uint16_t& page_type)
+		UNIV_NOTHROW;
 
 	/** Update the space, index id, trx id.
 	@param block block to convert
@@ -942,12 +968,6 @@ private:
 	/** Current index whose pages are being imported */
 	row_index_t*		m_index;
 
-	/** Current system LSN */
-	lsn_t			m_current_lsn;
-
-	/** Alias for m_page_zip, only set for compressed pages. */
-	page_zip_des_t*		m_page_zip_ptr;
-
 	/** Iterator over records in a block */
 	RecIterator		m_rec_iter;
 
@@ -1402,7 +1422,7 @@ row_import::set_root_by_name() UNIV_NOTHROW
 		/* We've already checked that it exists. */
 		ut_a(index != 0);
 
-		index->page = cfg_index->m_page_no;
+		index->page = static_cast<uint32_t>(cfg_index->m_page_no);
 	}
 }
 
@@ -1461,9 +1481,8 @@ row_import::set_root_by_heuristic() UNIV_NOTHROW
 
 			cfg_index[i].m_srv_index = index;
 
-			index->page = cfg_index[i].m_page_no;
-
-			++i;
+			index->page = static_cast<uint32_t>(
+				cfg_index[i++].m_page_no);
 		}
 	}
 
@@ -1578,7 +1597,7 @@ IndexPurge::next() UNIV_NOTHROW
 			return status that will be checked in all callers! */
 			switch (next_page) {
 			default:
-				if (next_page != block->page.id.page_no()) {
+				if (next_page != block->page.id().page_no()) {
 					break;
 				}
 				/* MDEV-20931 FIXME: Check that
@@ -1594,8 +1613,7 @@ IndexPurge::next() UNIV_NOTHROW
 
 			dict_index_t* index = m_pcur.btr_cur.index;
 			buf_block_t* next_block = btr_block_get(
-				page_id_t(block->page.id.space(), next_page),
-				block->zip_size(), BTR_MODIFY_LEAF, index,
+				*index, next_page, BTR_MODIFY_LEAF, false,
 				&m_mtr);
 
 			if (UNIV_UNLIKELY(!next_block
@@ -1609,7 +1627,7 @@ IndexPurge::next() UNIV_NOTHROW
 					  != page_is_comp(block->frame)
 					  || btr_page_get_prev(
 						  next_block->frame)
-					  != block->page.id.page_no())) {
+					  != block->page.id().page_no())) {
 				return DB_CORRUPTION;
 			}
 
@@ -1704,9 +1722,10 @@ PageConverter::adjust_cluster_index_blob_column(
 
 	mach_write_to_4(field, get_space_id());
 
-	if (m_page_zip_ptr) {
+	if (UNIV_LIKELY_NULL(m_rec_iter.current_block()->page.zip.data)) {
 		page_zip_write_blob_ptr(
-			m_page_zip_ptr, rec, m_cluster_index, offsets, i, 0);
+			m_rec_iter.current_block(), rec, m_cluster_index,
+			offsets, i, &m_rec_iter.m_mtr);
 	}
 
 	return(DB_SUCCESS);
@@ -1777,7 +1796,7 @@ inline bool PageConverter::purge() UNIV_NOTHROW
 	const dict_index_t*	index = m_index->m_srv_index;
 
 	/* We can't have a page that is empty and not root. */
-	if (m_rec_iter.remove(index, m_page_zip_ptr, m_offsets)) {
+	if (m_rec_iter.remove(index, m_offsets)) {
 
 		++m_index->m_stats.m_n_purged;
 
@@ -1808,11 +1827,13 @@ PageConverter::adjust_cluster_record(
 		record. */
 		ulint	trx_id_pos = m_cluster_index->n_uniq
 			? m_cluster_index->n_uniq : 1;
-		if (m_page_zip_ptr) {
+		if (UNIV_LIKELY_NULL(m_rec_iter.current_block()
+				     ->page.zip.data)) {
 			page_zip_write_trx_id_and_roll_ptr(
-				m_page_zip_ptr, rec, m_offsets, trx_id_pos,
+				m_rec_iter.current_block(),
+				rec, m_offsets, trx_id_pos,
 				0, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS,
-				NULL);
+				&m_rec_iter.m_mtr);
 		} else {
 			ulint	len;
 			byte*	ptr = rec_get_nth_field(
@@ -1894,42 +1915,39 @@ dberr_t
 PageConverter::update_index_page(
 	buf_block_t*	block) UNIV_NOTHROW
 {
-	index_id_t	id;
-	buf_frame_t*	page = block->frame;
+	const page_id_t page_id(block->page.id());
 
-	if (is_free(block->page.id.page_no())) {
+	if (is_free(page_id.page_no())) {
 		return(DB_SUCCESS);
-	} else if ((id = btr_page_get_index_id(page)) != m_index->m_id) {
-		row_index_t*	index = find_index(id);
+	}
+
+	buf_frame_t* page = block->frame;
+	const index_id_t id = btr_page_get_index_id(page);
+
+	if (id != m_index->m_id) {
+		row_index_t* index = find_index(id);
 
 		if (UNIV_UNLIKELY(!index)) {
-			if (m_cfg->m_missing) {
-				return DB_SUCCESS;
+			if (!m_cfg->m_missing) {
+				ib::warn() << "Unknown index id " << id
+					   << " on page " << page_id.page_no();
 			}
-
-			ib::error() << "Page for tablespace " << m_space
-				<< " is index page with id " << id
-				<< " but that index is not found from"
-				<< " configuration file. Current index name "
-				<< m_index->m_name << " and id " <<  m_index->m_id;
-			m_index = 0;
-			return(DB_CORRUPTION);
+			return DB_SUCCESS;
 		}
 
-		/* Update current index */
 		m_index = index;
 	}
 
 	/* If the .cfg file is missing and there is an index mismatch
 	then ignore the error. */
-	if (m_cfg->m_missing && (m_index == 0 || m_index->m_srv_index == 0)) {
+	if (m_cfg->m_missing && !m_index->m_srv_index) {
 		return(DB_SUCCESS);
 	}
 
-	if (m_index && block->page.id.page_no() == m_index->m_page_no) {
+	if (m_index && page_id.page_no() == m_index->m_page_no) {
 		byte *b = FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE
 			+ page;
-		mach_write_to_4(b, block->page.id.space());
+		mach_write_to_4(b, page_id.space());
 
 		memcpy(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + FSEG_HDR_SPACE
 		       + page, b, 4);
@@ -1944,31 +1962,47 @@ PageConverter::update_index_page(
 	}
 
 #ifdef UNIV_ZIP_DEBUG
-	ut_a(!is_compressed_table()
-	     || page_zip_validate(m_page_zip_ptr, page, m_index->m_srv_index));
+	ut_a(!block->page.zip.data || page_zip_validate(&block->page.zip, page,
+							m_index->m_srv_index));
 #endif /* UNIV_ZIP_DEBUG */
 
 	/* This has to be written to uncompressed index header. Set it to
 	the current index id. */
-	btr_page_set_index_id(
-		page, m_page_zip_ptr, m_index->m_srv_index->id, 0);
-
-	if (dict_index_is_clust(m_index->m_srv_index)) {
-		dict_index_t* index = const_cast<dict_index_t*>(
-			m_index->m_srv_index);
-		if (block->page.id.page_no() != index->page) {
-			/* Clear PAGE_MAX_TRX_ID so that it can be
-			used for other purposes in the future. IMPORT
-			in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1
-			would set the field to the transaction ID even
-			on clustered index pages. */
-			page_set_max_trx_id(block, m_page_zip_ptr, 0, NULL);
+	mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID),
+			m_index->m_srv_index->id);
+	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+		memcpy(&block->page.zip.data[PAGE_HEADER + PAGE_INDEX_ID],
+		       &block->frame[PAGE_HEADER + PAGE_INDEX_ID], 8);
+	}
+
+	if (m_index->m_srv_index->is_clust()) {
+		if (page_id.page_no() != m_index->m_srv_index->page) {
+			goto clear_page_max_trx_id;
+		}
+	} else if (page_is_leaf(page)) {
+		/* Set PAGE_MAX_TRX_ID on secondary index leaf pages. */
+		mach_write_to_8(&block->frame[PAGE_HEADER + PAGE_MAX_TRX_ID],
+				m_trx->id);
+		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+			memcpy_aligned<8>(&block->page.zip.data
+					  [PAGE_HEADER + PAGE_MAX_TRX_ID],
+					  &block->frame
+					  [PAGE_HEADER + PAGE_MAX_TRX_ID], 8);
 		}
 	} else {
-		/* Set PAGE_MAX_TRX_ID on secondary index leaf pages,
-		and clear it on non-leaf pages. */
-		page_set_max_trx_id(block, m_page_zip_ptr,
-				    page_is_leaf(page) ? m_trx->id : 0, NULL);
+clear_page_max_trx_id:
+		/* Clear PAGE_MAX_TRX_ID so that it can be
+		used for other purposes in the future. IMPORT
+		in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1
+		would set the field to the transaction ID even
+		on clustered index pages. */
+		memset_aligned<8>(&block->frame[PAGE_HEADER + PAGE_MAX_TRX_ID],
+				  0, 8);
+		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+			memset_aligned<8>(&block->page.zip.data
+					  [PAGE_HEADER + PAGE_MAX_TRX_ID],
+					  0, 8);
+		}
 	}
 
 	if (page_is_empty(page)) {
@@ -1990,38 +2024,25 @@ PageConverter::update_index_page(
 /** Validate the space flags and update tablespace header page.
 @param block block read from file, not from the buffer pool.
 @retval DB_SUCCESS or error code */
-inline
-dberr_t
-PageConverter::update_header(
-	buf_block_t*	block) UNIV_NOTHROW
+inline dberr_t PageConverter::update_header(buf_block_t* block) UNIV_NOTHROW
 {
-	/* Check for valid header */
-	switch (fsp_header_get_space_id(get_frame(block))) {
-	case 0:
-		return(DB_CORRUPTION);
-	case ULINT_UNDEFINED:
-		ib::warn() << "Space id check in the header failed: ignored";
-	}
-
-	mach_write_to_8(
-		get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
-		m_current_lsn);
-
-	/* Write back the adjusted flags. */
-	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
-			+ get_frame(block), m_space_flags);
+  byte *frame= get_frame(block);
+  if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + frame,
+                        FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, 4))
+    ib::warn() << "Space id check in the header failed: ignored";
+  else if (!mach_read_from_4(FIL_PAGE_SPACE_ID + frame))
+    return DB_CORRUPTION;
 
-	/* Write space_id to the tablespace header, page 0. */
-	mach_write_to_4(
-		get_frame(block) + FSP_HEADER_OFFSET + FSP_SPACE_ID,
-		get_space_id());
+  memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
 
-	/* This is on every page in the tablespace. */
-	mach_write_to_4(
-		get_frame(block) + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
-		get_space_id());
+  /* Write space_id to the tablespace header, page 0. */
+  mach_write_to_4(FIL_PAGE_SPACE_ID + frame, get_space_id());
+  memcpy_aligned<2>(FSP_HEADER_OFFSET + FSP_SPACE_ID + frame,
+                    FIL_PAGE_SPACE_ID + frame, 4);
+  /* Write back the adjusted flags. */
+  mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + frame, m_space_flags);
 
-	return(DB_SUCCESS);
+  return DB_SUCCESS;
 }
 
 /** Update the page, set the space id, max trx id and index id.
@@ -2029,23 +2050,16 @@ PageConverter::update_header(
 @retval DB_SUCCESS or error code */
 inline
 dberr_t
-PageConverter::update_page(
-	buf_block_t*	block,
-	ulint&		page_type) UNIV_NOTHROW
+PageConverter::update_page(buf_block_t* block, uint16_t& page_type)
+	UNIV_NOTHROW
 {
 	dberr_t		err = DB_SUCCESS;
 
 	ut_ad(!block->page.zip.data == !is_compressed_table());
 
-	if (block->page.zip.data) {
-		m_page_zip_ptr = &block->page.zip;
-	} else {
-		ut_ad(!m_page_zip_ptr);
-	}
-
 	switch (page_type = fil_page_get_type(get_frame(block))) {
 	case FIL_PAGE_TYPE_FSP_HDR:
-		ut_a(block->page.id.page_no() == 0);
+		ut_a(block->page.id().page_no() == 0);
 		/* Work directly on the uncompressed page headers. */
 		return(update_header(block));
 
@@ -2074,7 +2088,7 @@ PageConverter::update_page(
 
 	case FIL_PAGE_TYPE_XDES:
 		err = set_current_xdes(
-			block->page.id.page_no(), get_frame(block));
+			block->page.id().page_no(), get_frame(block));
 		/* fall through */
 	case FIL_PAGE_INODE:
 	case FIL_PAGE_TYPE_TRX_SYS:
@@ -2108,85 +2122,38 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
 	/* If we already had an old page with matching number
 	in the buffer pool, evict it now, because
 	we no longer evict the pages on DISCARD TABLESPACE. */
-	buf_page_get_gen(block->page.id, get_zip_size(),
+	buf_page_get_gen(block->page.id(), get_zip_size(),
 			 RW_NO_LATCH, NULL, BUF_EVICT_IF_IN_POOL,
 			 __FILE__, __LINE__, NULL, NULL);
 
-	ulint		page_type;
+	uint16_t page_type;
 
 	if (dberr_t err = update_page(block, page_type)) {
 		return err;
 	}
 
 	const bool full_crc32 = fil_space_t::full_crc32(get_space_flags());
+	byte* frame = get_frame(block);
+	memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8);
 
 	if (!block->page.zip.data) {
 		buf_flush_init_for_writing(
-			NULL, block->frame, NULL, m_current_lsn, full_crc32);
+			NULL, block->frame, NULL, full_crc32);
 	} else if (fil_page_type_is_index(page_type)) {
 		buf_flush_init_for_writing(
 			NULL, block->page.zip.data, &block->page.zip,
-			m_current_lsn, full_crc32);
+			full_crc32);
 	} else {
 		/* Calculate and update the checksum of non-index
 		pages for ROW_FORMAT=COMPRESSED tables. */
 		buf_flush_update_zip_checksum(
-			block->page.zip.data, block->zip_size(),
-			m_current_lsn);
+			block->page.zip.data, block->zip_size());
 	}
 
 	return DB_SUCCESS;
 }
 
 /*****************************************************************//**
-Clean up after import tablespace failure, this function will acquire
-the dictionary latches on behalf of the transaction if the transaction
-hasn't already acquired them. */
-static	MY_ATTRIBUTE((nonnull))
-void
-row_import_discard_changes(
-/*=======================*/
-	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt from handler */
-	trx_t*		trx,		/*!< in/out: transaction for import */
-	dberr_t		err)		/*!< in: error code */
-{
-	dict_table_t*	table = prebuilt->table;
-
-	ut_a(err != DB_SUCCESS);
-
-	prebuilt->trx->error_info = NULL;
-
-	ib::info() << "Discarding tablespace of table "
-		<< prebuilt->table->name
-		<< ": " << err;
-
-	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
-		ut_a(trx->dict_operation_lock_mode == 0);
-		row_mysql_lock_data_dictionary(trx);
-	}
-
-	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
-
-	/* Since we update the index root page numbers on disk after
-	we've done a successful import. The table will not be loadable.
-	However, we need to ensure that the in memory root page numbers
-	are reset to "NULL". */
-
-	for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
-		index != 0;
-		index = UT_LIST_GET_NEXT(indexes, index)) {
-
-		index->page = FIL_NULL;
-	}
-
-	table->file_unreadable = true;
-	if (table->space) {
-		fil_close_tablespace(trx, table->space_id);
-		table->space = NULL;
-	}
-}
-
-/*****************************************************************//**
 Clean up after import tablespace. */
 static	MY_ATTRIBUTE((nonnull, warn_unused_result))
 dberr_t
@@ -2199,7 +2166,27 @@ row_import_cleanup(
 	ut_a(prebuilt->trx != trx);
 
 	if (err != DB_SUCCESS) {
-		row_import_discard_changes(prebuilt, trx, err);
+		dict_table_t* table = prebuilt->table;
+		table->file_unreadable = true;
+		if (table->space) {
+			fil_close_tablespace(table->space_id);
+			table->space = NULL;
+		}
+
+		prebuilt->trx->error_info = NULL;
+
+		ib::info() << "Discarding tablespace of table "
+			   << table->name << ": " << err;
+
+		if (!trx->dict_operation_lock_mode) {
+			row_mysql_lock_data_dictionary(trx);
+		}
+
+		for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+		     index;
+		     index = UT_LIST_GET_NEXT(indexes, index)) {
+			index->page = FIL_NULL;
+		}
 	}
 
 	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
@@ -2216,8 +2203,6 @@ row_import_cleanup(
 
 	DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
 
-	log_make_checkpoint();
-
 	return(err);
 }
 
@@ -2442,7 +2427,7 @@ row_import_cfg_read_string(
 			break;
 		} else if (ch != 0) {
 			if (len < max_len) {
-				ptr[len++] = ch;
+				ptr[len++] = static_cast<byte>(ch);
 			} else {
 				break;
 			}
@@ -2508,10 +2493,10 @@ row_import_cfg_read_index_fields(
 
 		new (field) dict_field_t();
 
-		field->prefix_len = mach_read_from_4(ptr);
+		field->prefix_len = mach_read_from_4(ptr) & ((1U << 12) - 1);
 		ptr += sizeof(ib_uint32_t);
 
-		field->fixed_len = mach_read_from_4(ptr);
+		field->fixed_len = mach_read_from_4(ptr) & ((1U << 10) - 1);
 		ptr += sizeof(ib_uint32_t);
 
 		/* Include the NUL byte in the length. */
@@ -2816,24 +2801,24 @@ row_import_read_columns(
 		col->prtype = mach_read_from_4(ptr);
 		ptr += sizeof(ib_uint32_t);
 
-		col->mtype = mach_read_from_4(ptr);
+		col->mtype = static_cast<byte>(mach_read_from_4(ptr));
 		ptr += sizeof(ib_uint32_t);
 
-		col->len = mach_read_from_4(ptr);
+		col->len = static_cast<uint16_t>(mach_read_from_4(ptr));
 		ptr += sizeof(ib_uint32_t);
 
-		ulint mbminmaxlen = mach_read_from_4(ptr);
-		col->mbmaxlen = mbminmaxlen / 5;
-		col->mbminlen = mbminmaxlen % 5;
+		uint32_t mbminmaxlen = mach_read_from_4(ptr);
+		col->mbmaxlen = (mbminmaxlen / 5) & 7;
+		col->mbminlen = (mbminmaxlen % 5) & 7;
 		ptr += sizeof(ib_uint32_t);
 
-		col->ind = mach_read_from_4(ptr);
+		col->ind = mach_read_from_4(ptr) & dict_index_t::MAX_N_FIELDS;
 		ptr += sizeof(ib_uint32_t);
 
-		col->ord_part = mach_read_from_4(ptr);
+		col->ord_part = mach_read_from_4(ptr) & 1;
 		ptr += sizeof(ib_uint32_t);
 
-		col->max_prefix = mach_read_from_4(ptr);
+		col->max_prefix = mach_read_from_4(ptr) & ((1U << 12) - 1);
 		ptr += sizeof(ib_uint32_t);
 
 		/* Read in the column name as [len, byte array]. The len
@@ -3118,24 +3103,25 @@ static dberr_t decrypt_decompress(fil_space_crypt_t *space_crypt,
                                        page.size(), space_flags, data))
       return err;
   }
-  else if (fil_page_is_compressed_encrypted(data))
-    return DB_CORRUPTION;
-
-  const bool is_full_crc32_compressed=
-      fil_space_t::is_full_crc32_compressed(space_flags);
 
-  const bool page_actually_compressed=
-      (is_full_crc32_compressed &&
-       buf_page_is_compressed(data, space_flags)) ||
-      fil_page_is_compressed_encrypted(data) || fil_page_is_compressed(data);
+  bool page_compressed= false;
 
-  if (page_actually_compressed)
+  if (fil_space_t::full_crc32(space_flags) &&
+      fil_space_t::is_compressed(space_flags))
+    page_compressed= buf_page_is_compressed(data, space_flags);
+  else
   {
-    if (!is_full_crc32_compressed && !fil_space_t::is_compressed(space_flags))
-      return DB_CORRUPTION;
+    switch (fil_page_get_type(data)) {
+    case FIL_PAGE_PAGE_COMPRESSED:
+    case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+      page_compressed= true;
+    }
+  }
 
+  if (page_compressed)
+  {
     auto compress_length=
-        fil_page_decompress(page_compress_buf, data, space_flags);
+      fil_page_decompress(page_compress_buf, data, space_flags);
     ut_ad(compress_length != srv_page_size);
 
     if (compress_length == 0)
@@ -3196,7 +3182,7 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
       static_cast<byte *>(aligned_malloc(srv_page_size, srv_page_size)),
       &aligned_free);
 
-  if (dberr_t err= os_file_read_no_error_handling(IORequest(IORequest::READ),
+  if (dberr_t err= os_file_read_no_error_handling(IORequestReadPartial,
                                                   file, first_page.get(), 0,
                                                   srv_page_size, nullptr))
     return err;
@@ -3211,7 +3197,7 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
       ib::error() << "Invalid FSP_SPACE_FLAGS=" << ib::hex(space_flags);
       return DB_CORRUPTION;
     }
-    space_flags= cflags;
+    space_flags= static_cast<decltype(space_flags)>(cflags);
   }
 
   if (!cfg.m_missing)
@@ -3237,16 +3223,16 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
       &aligned_free);
 
   if (dberr_t err= os_file_read_no_error_handling(
-          IORequest(IORequest::READ), file, page.get(), 3 * physical_size,
+          IORequestReadPartial, file, page.get(), 3 * physical_size,
           physical_size, nullptr))
     return err;
 
   std::unique_ptr<byte[]> page_compress_buf(new byte[get_buf_size()]);
 
-  if (dberr_t err=
-          decrypt_decompress(space_crypt, space_flags,
-                             {page.get(), static_cast<size_t>(physical_size)},
-                             space_id, page_compress_buf.get()))
+  if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
+                                      {page.get(), static_cast<size_t>
+                                       (physical_size)},
+                                      space_id, page_compress_buf.get()))
     return err;
 
   if (table->supports_instant())
@@ -3298,15 +3284,17 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
 
       uint64_t child_page_no= btr_node_ptr_get_child_page_no(rec, offsets);
 
-      if (dberr_t err= os_file_read_no_error_handling(
-              IORequest(IORequest::READ), file, page.get(),
-              child_page_no * physical_size, physical_size, nullptr))
+      if (dberr_t err=
+          os_file_read_no_error_handling(IORequestReadPartial, file,
+                                         page.get(),
+                                         child_page_no * physical_size,
+                                         physical_size, nullptr))
         return err;
 
-      if (dberr_t err= decrypt_decompress(
-              space_crypt, space_flags,
-              {page.get(), static_cast<size_t>(physical_size)}, space_id,
-              page_compress_buf.get()))
+      if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
+                                          {page.get(), static_cast<size_t>
+                                           (physical_size)}, space_id,
+                                          page_compress_buf.get()))
         return err;
     }
 
@@ -3369,20 +3357,23 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
       if (!len || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) != FIL_PAGE_DATA)
         goto incompatible;
 
-      std::unique_ptr<byte[], decltype(&aligned_free)> second_page(
-          static_cast<byte *>(aligned_malloc(physical_size, physical_size)),
-          &aligned_free);
-
-      if (dberr_t err= os_file_read_no_error_handling(
-              IORequest(IORequest::READ), file, second_page.get(),
-              mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO) * physical_size,
-              srv_page_size, nullptr))
+      std::unique_ptr<byte[], decltype(&aligned_free)>
+        second_page(static_cast<byte*>(aligned_malloc(physical_size,
+                                                      physical_size)),
+                    &aligned_free);
+
+      if (dberr_t err=
+          os_file_read_no_error_handling(IORequestReadPartial, file,
+                                         second_page.get(), physical_size *
+                                         mach_read_from_4(ptr +
+                                                          BTR_EXTERN_PAGE_NO),
+                                         srv_page_size, nullptr))
         return err;
 
-      if (dberr_t err= decrypt_decompress(
-              space_crypt, space_flags,
-              {second_page.get(), static_cast<size_t>(physical_size)},
-              space_id, page_compress_buf.get()))
+      if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
+                                          {second_page.get(),
+                                           static_cast<size_t>(physical_size)},
+                                          space_id, page_compress_buf.get()))
         return err;
 
       if (fil_page_get_type(second_page.get()) != FIL_PAGE_TYPE_BLOB ||
@@ -3715,7 +3706,6 @@ tablespace involved. It does help to save the disk space when
 punch hole is enabled
 @param iter     Tablespace iterator
 @param full_crc32    whether the file is in the full_crc32 format
-@param write_request Request to write into the file
 @param offset   offset of the file to be written
 @param writeptr buffer to be written
 @param n_bytes  number of bytes to be written
@@ -3725,7 +3715,6 @@ punch hole is enabled
 static
 dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter,
                                    bool full_crc32,
-                                   const IORequest &write_request,
                                    os_offset_t offset,
                                    const byte *writeptr,
                                    ulint n_bytes,
@@ -3761,7 +3750,7 @@ dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter,
       }
     }
 
-    if (dberr_t err= os_file_write(write_request, iter.filepath, iter.file,
+    if (dberr_t err= os_file_write(IORequestWrite, iter.filepath, iter.file,
                                    writeptr + j, offset + j, n_write_bytes))
       return err;
   }
@@ -3790,23 +3779,17 @@ dberr_t FetchIndexRootPages::run(const fil_iterator_t& iter,
   if (block->page.zip.data)
     block->page.zip.data= readptr;
 
-  IORequest read_request(IORequest::READ);
-  read_request.disable_partial_io_warnings();
-  ulint page_no= 0;
   bool page_compressed= false;
 
   dberr_t err= os_file_read_no_error_handling(
-    read_request, iter.file, readptr, 3 * size, size, 0);
+    IORequestReadPartial, iter.file, readptr, 3 * size, size, 0);
   if (err != DB_SUCCESS)
   {
     ib::error() << iter.filepath << ": os_file_read() failed";
     goto func_exit;
   }
 
-  block->page.id.set_page_no(3);
-  page_no= page_get_page_no(readptr);
-
-  if (page_no != 3)
+  if (page_get_page_no(readptr) != 3)
   {
 page_corrupted:
     ib::warn() << filename() << ": Page 3 at offset "
@@ -3815,14 +3798,19 @@ page_corrupted:
     goto func_exit;
   }
 
-  page_compressed=
-    (full_crc32 && fil_space_t::is_compressed(m_space_flags) &&
-     buf_page_is_compressed(readptr, m_space_flags)) ||
-    (fil_page_is_compressed_encrypted(readptr) ||
-     fil_page_is_compressed(readptr));
-
-  if (page_compressed && block->page.zip.data)
-    goto page_corrupted;
+  block->page.id_.set_page_no(3);
+  if (full_crc32 && fil_space_t::is_compressed(m_space_flags))
+    page_compressed= buf_page_is_compressed(readptr, m_space_flags);
+  else
+  {
+    switch (fil_page_get_type(readptr)) {
+    case FIL_PAGE_PAGE_COMPRESSED:
+    case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+      if (block->page.zip.data)
+        goto page_corrupted;
+      page_compressed= true;
+    }
+  }
 
   if (encrypted)
   {
@@ -3887,7 +3875,6 @@ static dberr_t fil_iterate(
 	dberr_t		err = DB_SUCCESS;
 	bool		page_compressed = false;
 	bool		punch_hole = true;
-	const IORequest	write_request(IORequest::WRITE);
 
 	for (offset = iter.start; offset < iter.end; offset += n_bytes) {
 		if (callback.is_interrupted()) {
@@ -3919,11 +3906,9 @@ static dberr_t fil_iterate(
 			? iter.crypt_io_buffer : io_buffer;
 		byte* const writeptr = readptr;
 
-		IORequest	read_request(IORequest::READ);
-		read_request.disable_partial_io_warnings();
-
 		err = os_file_read_no_error_handling(
-			read_request, iter.file, readptr, offset, n_bytes, 0);
+			IORequestReadPartial,
+			iter.file, readptr, offset, n_bytes, 0);
 		if (err != DB_SUCCESS) {
 			ib::error() << iter.filepath
 				    << ": os_file_read() failed";
@@ -3933,14 +3918,15 @@ static dberr_t fil_iterate(
 		bool		updated = false;
 		os_offset_t	page_off = offset;
 		ulint		n_pages_read = n_bytes / size;
-		block->page.id.set_page_no(ulint(page_off / size));
+		/* This block is not attached to buf_pool */
+		block->page.id_.set_page_no(uint32_t(page_off / size));
 
 		for (ulint i = 0; i < n_pages_read;
-		     block->page.id.set_page_no(block->page.id.page_no() + 1),
+		     ++block->page.id_,
 		     ++i, page_off += size, block->frame += size) {
 			byte*	src = readptr + i * size;
 			const ulint page_no = page_get_page_no(src);
-			if (!page_no && block->page.id.page_no()) {
+			if (!page_no && block->page.id().page_no()) {
 				if (!buf_is_zeroes(span<const byte>(src,
 								    size))) {
 					goto page_corrupted;
@@ -3950,7 +3936,7 @@ static dberr_t fil_iterate(
 				continue;
 			}
 
-			if (page_no != block->page.id.page_no()) {
+			if (page_no != block->page.id().page_no()) {
 page_corrupted:
 				ib::warn() << callback.filename()
 					   << ": Page " << (offset / size)
@@ -3960,19 +3946,20 @@ page_corrupted:
 				goto func_exit;
 			}
 
-			if (block->page.id.page_no() == 0) {
+			if (block->page.id().page_no() == 0) {
 				actual_space_id = mach_read_from_4(
 					src + FIL_PAGE_SPACE_ID);
 			}
 
+			const uint16_t type = fil_page_get_type(src);
 			page_compressed =
 				(full_crc32
 				 && fil_space_t::is_compressed(
 					callback.get_space_flags())
 				 && buf_page_is_compressed(
 					src, callback.get_space_flags()))
-				|| (fil_page_is_compressed_encrypted(src)
-				    || fil_page_is_compressed(src));
+				|| type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED
+				|| type == FIL_PAGE_PAGE_COMPRESSED;
 
 			if (page_compressed && block->page.zip.data) {
 				goto page_corrupted;
@@ -3986,7 +3973,7 @@ page_corrupted:
 
 			if (!encrypted) {
 			} else if (!key_version) {
-				if (block->page.id.page_no() == 0
+				if (block->page.id().page_no() == 0
 				    && block->page.zip.data) {
 					block->page.zip.data = src;
 					frame_changed = true;
@@ -4044,7 +4031,7 @@ page_corrupted:
 			if ((err = callback(block)) != DB_SUCCESS) {
 				goto func_exit;
 			} else if (!updated) {
-				updated = buf_block_get_state(block)
+				updated = block->page.state()
 					== BUF_BLOCK_FILE_PAGE;
 			}
 
@@ -4079,7 +4066,7 @@ page_corrupted:
 			/* When tablespace is encrypted or compressed its
 			first page (i.e. page 0) is not encrypted or
 			compressed and there is no need to copy frame. */
-			if (encrypted && block->page.id.page_no() != 0) {
+			if (encrypted && block->page.id().page_no() != 0) {
 				byte *local_frame = callback.get_frame(block);
 				ut_ad((writeptr + (i * size)) != local_frame);
 				memcpy((writeptr + (i * size)), local_frame, size);
@@ -4116,9 +4103,8 @@ page_corrupted:
 
 				byte* tmp = fil_encrypt_buf(
 					iter.crypt_data,
-					block->page.id.space(),
-					block->page.id.page_no(),
-					mach_read_from_8(src + FIL_PAGE_LSN),
+					block->page.id().space(),
+					block->page.id().page_no(),
 					src, block->zip_size(), dest,
 					full_crc32);
 
@@ -4154,8 +4140,8 @@ page_corrupted:
 
 		if (page_compressed && punch_hole) {
 			err = fil_import_compress_fwrite(
-				iter, full_crc32, write_request, offset,
-				writeptr, n_bytes, !updated);
+				iter, full_crc32, offset, writeptr, n_bytes,
+				!updated);
 
 			if (err != DB_SUCCESS) {
 				punch_hole = false;
@@ -4164,11 +4150,11 @@ page_corrupted:
 				}
 			}
 		} else if (updated) {
-			/* A page was updated in the set, write back to disk. */
 normal_write:
-			err = os_file_write(
-				write_request, iter.filepath, iter.file,
-				writeptr, offset, n_bytes);
+			/* A page was updated in the set, write it back. */
+			err = os_file_write(IORequestWrite,
+					    iter.filepath, iter.file,
+					    writeptr, offset, n_bytes);
 
 			if (err != DB_SUCCESS) {
 				goto func_exit;
@@ -4247,34 +4233,27 @@ fil_tablespace_iterate(
 
 	/* Allocate a page to read in the tablespace header, so that we
 	can determine the page size and zip_size (if it is compressed).
-	We allocate an extra page in case it is a compressed table. One
-	page is to ensure alignement. */
+	We allocate an extra page in case it is a compressed table. */
 
-	void*	page_ptr = ut_malloc_nokey(3U << srv_page_size_shift);
-	byte*	page = static_cast<byte*>(ut_align(page_ptr, srv_page_size));
+	byte*	page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
+							 srv_page_size));
 
 	buf_block_t* block = reinterpret_cast<buf_block_t*>
 		(ut_zalloc_nokey(sizeof *block));
 	block->frame = page;
-	block->page.id = page_id_t(0, 0);
-	block->page.io_fix = BUF_IO_NONE;
-	block->page.buf_fix_count = 1;
-	block->page.state = BUF_BLOCK_FILE_PAGE;
+        block->page.init(BUF_BLOCK_FILE_PAGE, page_id_t(~0ULL), 1);
 
 	/* Read the first page and determine the page and zip size. */
 
-	IORequest       request(IORequest::READ);
-	request.disable_partial_io_warnings();
-
-	err = os_file_read_no_error_handling(request, file, page, 0,
-					     srv_page_size, 0);
+	err = os_file_read_no_error_handling(IORequestReadPartial,
+					     file, page, 0, srv_page_size, 0);
 
 	if (err == DB_SUCCESS) {
 		err = callback.init(file_size, block);
 	}
 
 	if (err == DB_SUCCESS) {
-		block->page.id = page_id_t(callback.get_space_id(), 0);
+		block->page.id_ = page_id_t(callback.get_space_id(), 0);
 		if (ulint zip_size = callback.get_zip_size()) {
 			page_zip_set_size(&block->page.zip, zip_size);
 			/* ROW_FORMAT=COMPRESSED is not optimised for block IO
@@ -4303,20 +4282,16 @@ fil_tablespace_iterate(
 		iter.n_io_buffers = n_io_buffers;
 
 		/* Add an extra page for compressed page scratch area. */
-		void*	io_buffer = ut_malloc_nokey(
-			(2 + iter.n_io_buffers) << srv_page_size_shift);
-
 		iter.io_buffer = static_cast<byte*>(
-			ut_align(io_buffer, srv_page_size));
+			aligned_malloc((1 + iter.n_io_buffers)
+				       << srv_page_size_shift, srv_page_size));
 
-		void* crypt_io_buffer = NULL;
-		if (iter.crypt_data) {
-			crypt_io_buffer = ut_malloc_nokey(
-				(2 + iter.n_io_buffers)
-				<< srv_page_size_shift);
-			iter.crypt_io_buffer = static_cast<byte*>(
-				ut_align(crypt_io_buffer, srv_page_size));
-		}
+		iter.crypt_io_buffer = iter.crypt_data
+			? static_cast<byte*>(
+				aligned_malloc((1 + iter.n_io_buffers)
+					       << srv_page_size_shift,
+					       srv_page_size))
+			: NULL;
 
 		if (block->page.zip.ssize) {
 			ut_ad(iter.n_io_buffers == 1);
@@ -4330,8 +4305,8 @@ fil_tablespace_iterate(
 			fil_space_destroy_crypt_data(&iter.crypt_data);
 		}
 
-		ut_free(crypt_io_buffer);
-		ut_free(io_buffer);
+		aligned_free(iter.crypt_io_buffer);
+		aligned_free(iter.io_buffer);
 	}
 
 	if (err == DB_SUCCESS) {
@@ -4347,7 +4322,7 @@ fil_tablespace_iterate(
 
 	os_file_close(file);
 
-	ut_free(page_ptr);
+	aligned_free(page);
 	ut_free(filepath);
 	ut_free(block);
 
@@ -4368,7 +4343,6 @@ row_import_for_mysql(
 	trx_t*		trx;
 	ib_uint64_t	autoinc = 0;
 	char*		filepath = NULL;
-	ulint		space_flags MY_ATTRIBUTE((unused));
 
 	/* The caller assured that this is not read_only_mode and that no
 	temorary tablespace is being imported. */
@@ -4376,7 +4350,7 @@ row_import_for_mysql(
 	ut_ad(!table->is_temporary());
 
 	ut_ad(table->space_id);
-	ut_ad(table->space_id < SRV_LOG_SPACE_FIRST_ID);
+	ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND);
 	ut_ad(prebuilt->trx);
 	ut_ad(!table->is_readable());
 
@@ -4511,9 +4485,6 @@ row_import_for_mysql(
 				}
 			}
 		}
-
-		space_flags = fetchIndexRootPages.get_space_flags();
-
 	} else {
 		rw_lock_s_unlock(&dict_sys.latch);
 	}
@@ -4706,17 +4677,16 @@ row_import_for_mysql(
 	/* Ensure that all pages dirtied during the IMPORT make it to disk.
 	The only dirty pages generated should be from the pessimistic purge
 	of delete marked records that couldn't be purged in Phase I. */
-
-	{
-		FlushObserver observer(prebuilt->table->space, trx, NULL);
-		buf_LRU_flush_or_remove_pages(prebuilt->table->space_id,
-					      &observer);
-
-		if (observer.is_interrupted()) {
-			ib::info() << "Phase III - Flush interrupted";
-			return(row_import_error(prebuilt, trx,
-						DB_INTERRUPTED));
+	while (buf_flush_list_space(prebuilt->table->space));
+
+	for (ulint count = 0; prebuilt->table->space->referenced(); count++) {
+		/* Issue a warning every 10.24 seconds, starting after
+		2.56 seconds */
+		if ((count & 511) == 128) {
+			ib::warn() << "Waiting for flush to complete on "
+				   << prebuilt->table->name;
 		}
+		os_thread_sleep(20000);
 	}
 
 	ib::info() << "Phase IV - Flush complete";
@@ -4741,7 +4711,7 @@ row_import_for_mysql(
 	}
 
 	table->file_unreadable = false;
-	table->flags2 &= ~DICT_TF2_DISCARDED;
+	table->flags2 &= ~DICT_TF2_DISCARDED & ((1U << DICT_TF2_BITS) - 1);
 
 	/* Set autoinc value read from .cfg file, if one was specified.
 	Otherwise, keep the PAGE_ROOT_AUTO_INC as is. */
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index a1f5e55d8f4..8b89f2adf56 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -239,7 +239,7 @@ row_ins_sec_index_entry_by_modify(
 		}
 	} else {
 		ut_a(mode == BTR_MODIFY_TREE);
-		if (buf_LRU_buf_pool_running_out()) {
+		if (buf_pool.running_out()) {
 
 			return(DB_LOCK_TABLE_FULL);
 		}
@@ -329,10 +329,8 @@ row_ins_clust_index_entry_by_modify(
 			break;
 		}
 	} else {
-		if (buf_LRU_buf_pool_running_out()) {
-
-			return(DB_LOCK_TABLE_FULL);
-
+		if (buf_pool.running_out()) {
+			return DB_LOCK_TABLE_FULL;
 		}
 
 		big_rec_t*	big_rec	= NULL;
@@ -438,7 +436,7 @@ row_ins_cascade_calc_update_vec(
 	ulint		i;
 	ulint		j;
 	bool		doc_id_updated = false;
-	ulint		doc_id_pos = 0;
+	unsigned	doc_id_pos = 0;
 	doc_id_t	new_doc_id = FTS_NULL_DOC_ID;
 	ulint		prefix_col;
 
@@ -495,10 +493,10 @@ row_ins_cascade_calc_update_vec(
 
 				ufield = update->fields + n_fields_updated;
 
-				ufield->field_no
-					= dict_table_get_nth_col_pos(
+				ufield->field_no = static_cast<uint16_t>(
+					dict_table_get_nth_col_pos(
 						table, dict_col_get_no(col),
-						&prefix_col);
+						&prefix_col));
 
 				ufield->orig_len = 0;
 				ufield->exp = NULL;
@@ -913,7 +911,7 @@ row_ins_foreign_fill_virtual(
 			       | DICT_FOREIGN_ON_UPDATE_SET_NULL
 			       | DICT_FOREIGN_ON_UPDATE_CASCADE));
 
-	for (ulint i = 0; i < n_v_fld; i++) {
+	for (uint16_t i = 0; i < n_v_fld; i++) {
 
 		dict_v_col_t*     col = dict_table_get_nth_v_col(
 				index->table, i);
@@ -1222,8 +1220,9 @@ row_ins_foreign_check_on_constraint(
 						index, i);
 			ulint		prefix_col;
 
-			ufield->field_no = dict_table_get_nth_col_pos(
-				table, col_no, &prefix_col);
+			ufield->field_no = static_cast<uint16_t>(
+				dict_table_get_nth_col_pos(
+					table, col_no, &prefix_col));
 			dict_col_t*	col = dict_table_get_nth_col(
 				table, col_no);
 			dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
@@ -1388,7 +1387,7 @@ static
 dberr_t
 row_ins_set_shared_rec_lock(
 /*========================*/
-	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP type lock */
 	const buf_block_t*	block,	/*!< in: buffer block of rec */
 	const rec_t*		rec,	/*!< in: record */
@@ -1419,7 +1418,7 @@ static
 dberr_t
 row_ins_set_exclusive_rec_lock(
 /*===========================*/
-	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP type lock */
 	const buf_block_t*	block,	/*!< in: buffer block of rec */
 	const rec_t*		rec,	/*!< in: record */
@@ -2744,8 +2743,7 @@ do_insert:
 				entry, &insert_rec, &big_rec,
 				n_ext, thr, &mtr);
 		} else {
-			if (buf_LRU_buf_pool_running_out()) {
-
+			if (buf_pool.running_out()) {
 				err = DB_LOCK_TABLE_FULL;
 				goto err_exit;
 			}
@@ -3103,8 +3101,7 @@ row_ins_sec_index_entry_low(
 			}
 		} else {
 			ut_ad(mode == BTR_MODIFY_TREE);
-			if (buf_LRU_buf_pool_running_out()) {
-
+			if (buf_pool.running_out()) {
 				err = DB_LOCK_TABLE_FULL;
 				goto func_exit;
 			}
@@ -3427,7 +3424,6 @@ row_ins_index_entry_set_vals(
 				field->len = UNIV_SQL_NULL;
 				field->type.prtype = DATA_BINARY_TYPE;
 			} else {
-				ut_ad(col->len <= sizeof field_ref_zero);
 				ut_ad(ind_field->fixed_len <= col->len);
 				dfield_set_data(field, field_ref_zero,
 						ind_field->fixed_len);
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
index c0396c33cc4..336b5a27cc2 100644
--- a/storage/innobase/row/row0log.cc
+++ b/storage/innobase/row/row0log.cc
@@ -211,11 +211,13 @@ struct row_log_t {
 	row_log_buf_t	tail;	/*!< writer context;
 				protected by mutex and index->lock S-latch,
 				or by index->lock X-latch only */
+	size_t		crypt_tail_size; /*!< size of crypt_tail */
 	byte*		crypt_tail; /*!< writer context;
 				temporary buffer used in encryption,
 				decryption or NULL*/
 	row_log_buf_t	head;	/*!< reader context; protected by MDL only;
 				modifiable by row_log_apply_ops() */
+	size_t		crypt_head_size; /*!< size of crypt_head */
 	byte*		crypt_head; /*!< reader context;
 				temporary buffer used in encryption,
 				decryption or NULL */
@@ -314,8 +316,7 @@ row_log_block_free(
 	DBUG_ENTER("row_log_block_free");
 	if (log_buf.block != NULL) {
 		ut_allocator<byte>(mem_key_row_log_buf).deallocate_large(
-			log_buf.block, &log_buf.block_pfx,
-			log_buf.size);
+			log_buf.block, &log_buf.block_pfx);
 		log_buf.block = NULL;
 	}
 	DBUG_VOID_RETURN;
@@ -409,7 +410,6 @@ row_log_online_op(
 		const os_offset_t	byte_offset
 			= (os_offset_t) log->tail.blocks
 			* srv_sort_buf_size;
-		IORequest		request(IORequest::WRITE);
 		byte*			buf = log->tail.block;
 
 		if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
@@ -447,7 +447,7 @@ row_log_online_op(
 
 		log->tail.blocks++;
 		if (os_file_write(
-			    request,
+			    IORequestWrite,
 			    "(modification log)",
 			    log->fd,
 			    buf, byte_offset, srv_sort_buf_size)
@@ -548,7 +548,6 @@ row_log_table_close_func(
 		const os_offset_t	byte_offset
 			= (os_offset_t) log->tail.blocks
 			* srv_sort_buf_size;
-		IORequest		request(IORequest::WRITE);
 		byte*			buf = log->tail.block;
 
 		if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
@@ -586,7 +585,7 @@ row_log_table_close_func(
 
 		log->tail.blocks++;
 		if (os_file_write(
-			    request,
+			    IORequestWrite,
 			    "(modification log)",
 			    log->fd,
 			    buf, byte_offset, srv_sort_buf_size)
@@ -971,7 +970,7 @@ row_log_table_low(
 		ut_ad(page_get_page_no(page_align(rec)) == index->page);
 		break;
 	default:
-		ut_ad(!"wrong page type");
+		ut_ad("wrong page type" == 0);
 	}
 #endif /* UNIV_DEBUG */
 	ut_ad(!rec_is_metadata(rec, *index));
@@ -2069,9 +2068,8 @@ row_log_table_apply_update(
 
 	ut_ad(dtuple_get_n_fields_cmp(old_pk)
 	      == dict_index_get_n_unique(index));
-	ut_ad(dtuple_get_n_fields(old_pk)
-	      == dict_index_get_n_unique(index)
-	      + (log->same_pk ? 0 : 2));
+	ut_ad(dtuple_get_n_fields(old_pk) - (log->same_pk ? 0 : 2)
+	      == dict_index_get_n_unique(index));
 
 	row = row_log_table_apply_convert_mrec(
 		mrec, dup->index, offsets, log, heap, &error);
@@ -2877,11 +2875,10 @@ all_done:
 			goto func_exit;
 		}
 
-		IORequest		request(IORequest::READ);
 		byte*			buf = index->online_log->head.block;
 
 		if (os_file_read_no_error_handling(
-			    request, index->online_log->fd,
+			    IORequestRead, index->online_log->fd,
 			    buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) {
 			ib::error()
 				<< "Unable to read temporary file"
@@ -3245,9 +3242,11 @@ row_log_allocate(
 	dict_index_set_online_status(index, ONLINE_INDEX_CREATION);
 
 	if (log_tmp_is_encrypted()) {
-		ulint size = srv_sort_buf_size;
-		log->crypt_head = static_cast<byte *>(os_mem_alloc_large(&size));
-		log->crypt_tail = static_cast<byte *>(os_mem_alloc_large(&size));
+		log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size;
+		log->crypt_head = static_cast<byte *>(
+			my_large_malloc(&log->crypt_head_size, MYF(MY_WME)));
+		log->crypt_tail = static_cast<byte *>(
+			my_large_malloc(&log->crypt_tail_size, MYF(MY_WME)));
 
 		if (!log->crypt_head || !log->crypt_tail) {
 			row_log_free(log);
@@ -3280,11 +3279,11 @@ row_log_free(
 	row_merge_file_destroy_low(log->fd);
 
 	if (log->crypt_head) {
-		os_mem_free_large(log->crypt_head, srv_sort_buf_size);
+		my_large_free(log->crypt_head, log->crypt_head_size);
 	}
 
 	if (log->crypt_tail) {
-		os_mem_free_large(log->crypt_tail, srv_sort_buf_size);
+		my_large_free(log->crypt_tail, log->crypt_tail_size);
 	}
 
 	mutex_free(&log->mutex);
@@ -3767,8 +3766,6 @@ all_done:
 		os_offset_t	ofs = static_cast<os_offset_t>(
 			index->online_log->head.blocks)
 			* srv_sort_buf_size;
-		IORequest	request(IORequest::READ);
-
 		ut_ad(has_index_lock);
 		has_index_lock = false;
 		rw_lock_x_unlock(dict_index_get_lock(index));
@@ -3783,7 +3780,7 @@ all_done:
 		byte*	buf = index->online_log->head.block;
 
 		if (os_file_read_no_error_handling(
-			    request, index->online_log->fd,
+			    IORequestRead, index->online_log->fd,
 			    buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) {
 			ib::error()
 				<< "Unable to read temporary file"
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 47e30bea6a5..41a4a0b82d3 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -67,14 +67,10 @@ public:
 	/** constructor
 	@param[in]	heap	memory heap
 	@param[in]	index	index to be created */
-	index_tuple_info_t(
-		mem_heap_t*	heap,
-		dict_index_t*	index) UNIV_NOTHROW
-	{
-		m_heap = heap;
-		m_index = index;
-		m_dtuple_vec = UT_NEW_NOKEY(idx_tuple_vec());
-	}
+	index_tuple_info_t(mem_heap_t* heap, dict_index_t* index) :
+		m_dtuple_vec(UT_NEW_NOKEY(idx_tuple_vec())),
+		m_index(index), m_heap(heap)
+	{ ut_ad(index->is_spatial()); }
 
 	/** destructor */
 	~index_tuple_info_t()
@@ -110,13 +106,11 @@ public:
 	@param[in]	trx_id		transaction id
 	@param[in,out]	row_heap	memory heap
 	@param[in]	pcur		cluster index scanning cursor
+	@param[in,out]	mtr_started	whether scan_mtr is active
 	@param[in,out]	scan_mtr	mini-transaction for pcur
 	@return DB_SUCCESS if successful, else error number */
-	inline dberr_t insert(
-		trx_id_t		trx_id,
-		mem_heap_t*		row_heap,
-		btr_pcur_t*		pcur,
-		mtr_t*			scan_mtr)
+	dberr_t insert(trx_id_t trx_id, mem_heap_t* row_heap, btr_pcur_t* pcur,
+		       bool& mtr_started, mtr_t* scan_mtr) const
 	{
 		big_rec_t*      big_rec;
 		rec_t*          rec;
@@ -131,11 +125,10 @@ public:
 				       | BTR_NO_LOCKING_FLAG
 				       | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG;
 
-		ut_ad(dict_index_is_spatial(m_index));
+		ut_ad(mtr_started == scan_mtr->is_active());
 
 		DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush",
-			log_sys.check_flush_or_checkpoint = true;
-		);
+				log_sys.set_check_flush_or_checkpoint(););
 
 		for (idx_tuple_vec::iterator it = m_dtuple_vec->begin();
 		     it != m_dtuple_vec->end();
@@ -143,11 +136,12 @@ public:
 			dtuple = *it;
 			ut_ad(dtuple);
 
-			if (log_sys.check_flush_or_checkpoint) {
-				if (scan_mtr->is_active()) {
+			if (log_sys.check_flush_or_checkpoint()) {
+				if (mtr_started) {
 					btr_pcur_move_to_prev_on_page(pcur);
 					btr_pcur_store_position(pcur, scan_mtr);
 					scan_mtr->commit();
+					mtr_started = false;
 				}
 
 				log_free_check();
@@ -249,13 +243,13 @@ private:
 		idx_tuple_vec;
 
 	/** vector used to cache index rows made from cluster index scan */
-	idx_tuple_vec*		m_dtuple_vec;
+	idx_tuple_vec* const	m_dtuple_vec;
 
 	/** the index being built */
-	dict_index_t*		m_index;
+	dict_index_t* const	m_index;
 
 	/** memory heap for creating index tuples */
-	mem_heap_t*		m_heap;
+	mem_heap_t* const	m_heap;
 };
 
 /* Maximum pending doc memory limit in bytes for a fts tokenization thread */
@@ -372,8 +366,7 @@ row_merge_buf_create(
 	mem_heap_t*		heap;
 
 	max_tuples = srv_sort_buf_size
-		/ ut_max(static_cast<ulint>(1),
-			 dict_index_get_min_size(index));
+		/ std::max<ulint>(1, dict_index_get_min_size(index));
 
 	buf_size = (sizeof *buf);
 
@@ -663,7 +656,8 @@ error:
 				doc_item->field = field;
 				doc_item->doc_id = *doc_id;
 
-				bucket = *doc_id % fts_sort_pll_degree;
+				bucket = static_cast<ulint>(
+					*doc_id % fts_sort_pll_degree);
 
 				/* Add doc item to fts_doc_list */
 				mutex_enter(&psort_info[bucket].mutex);
@@ -1086,9 +1080,8 @@ row_merge_read(
 	DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs);
 	DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE););
 
-	IORequest	request(IORequest::READ);
 	const bool	success = DB_SUCCESS == os_file_read_no_error_handling(
-		request, fd, buf, ofs, srv_sort_buf_size, 0);
+		IORequestRead, fd, buf, ofs, srv_sort_buf_size, 0);
 
 	/* If encryption is enabled decrypt buffer */
 	if (success && log_tmp_is_encrypted()) {
@@ -1150,9 +1143,8 @@ row_merge_write(
 		out_buf = crypt_buf;
 	}
 
-	IORequest	request(IORequest::WRITE);
 	const bool	success = DB_SUCCESS == os_file_write(
-		request, "(merge)", fd, out_buf, ofs, buf_len);
+		IORequestWrite, "(merge)", fd, out_buf, ofs, buf_len);
 
 #ifdef POSIX_FADV_DONTNEED
 	/* The block will be needed on the next merge pass,
@@ -1569,10 +1561,11 @@ row_mtuple_cmp(
 @param[in]	trx_id		transaction id
 @param[in]	sp_tuples	cached spatial rows
 @param[in]	num_spatial	number of spatial indexes
-@param[in,out]	row_heap	heap for insert
+@param[in,out]	heap		heap for insert
 @param[in,out]	sp_heap		heap for tuples
 @param[in,out]	pcur		cluster index cursor
-@param[in,out]	mtr		mini transaction
+@param[in,out]	started		whether mtr is active
+@param[in,out]	mtr		mini-transaction
 @return DB_SUCCESS or error number */
 static
 dberr_t
@@ -1580,30 +1573,21 @@ row_merge_spatial_rows(
 	trx_id_t		trx_id,
 	index_tuple_info_t**	sp_tuples,
 	ulint			num_spatial,
-	mem_heap_t*		row_heap,
+	mem_heap_t*		heap,
 	mem_heap_t*		sp_heap,
 	btr_pcur_t*		pcur,
+	bool&			started,
 	mtr_t*			mtr)
 {
-	dberr_t			err = DB_SUCCESS;
-
-	if (sp_tuples == NULL) {
-		return(DB_SUCCESS);
-	}
-
-	ut_ad(sp_heap != NULL);
-
-	for (ulint j = 0; j < num_spatial; j++) {
-		err = sp_tuples[j]->insert(trx_id, row_heap, pcur, mtr);
-
-		if (err != DB_SUCCESS) {
-			return(err);
-		}
-	}
+  if (!sp_tuples)
+    return DB_SUCCESS;
 
-	mem_heap_empty(sp_heap);
+  for (ulint j= 0; j < num_spatial; j++)
+    if (dberr_t err= sp_tuples[j]->insert(trx_id, heap, pcur, started, mtr))
+      return err;
 
-	return(err);
+  mem_heap_empty(sp_heap);
+  return DB_SUCCESS;
 }
 
 /** Check if the geometry field is valid.
@@ -1690,7 +1674,7 @@ row_merge_read_clustered_index(
 	ib_sequence_t&		sequence,
 	row_merge_block_t*	block,
 	bool			skip_pk_sort,
-	pfs_os_file_t*			tmpfd,
+	pfs_os_file_t*		tmpfd,
 	ut_stage_alter_t*	stage,
 	double 			pct_cost,
 	row_merge_block_t*	crypt_block,
@@ -1706,6 +1690,7 @@ row_merge_read_clustered_index(
 	btr_pcur_t		pcur;		/* Cursor on the clustered
 						index */
 	mtr_t			mtr;		/* Mini transaction */
+	bool			mtr_started = false;
 	dberr_t			err = DB_SUCCESS;/* Return code */
 	ulint			n_nonnull = 0;	/* number of columns
 						changed to NOT NULL */
@@ -1828,14 +1813,15 @@ row_merge_read_clustered_index(
 		ut_ad(count == num_spatial);
 	}
 
-	mtr_start(&mtr);
+	mtr.start();
+	mtr_started = true;
 
 	/* Find the clustered index and create a persistent cursor
 	based on that. */
 
 	clust_index = dict_table_get_first_index(old_table);
-	const ulint old_trx_id_col = DATA_TRX_ID - DATA_N_SYS_COLS
-		+ ulint(old_table->n_cols);
+	const ulint old_trx_id_col = ulint(old_table->n_cols)
+		- (DATA_N_SYS_COLS - DATA_TRX_ID);
 	ut_ad(old_table->cols[old_trx_id_col].mtype == DATA_SYS);
 	ut_ad(old_table->cols[old_trx_id_col].prtype
 	      == (DATA_TRX_ID | DATA_NOT_NULL));
@@ -1847,6 +1833,7 @@ row_merge_read_clustered_index(
 
 	btr_pcur_open_at_index_side(
 		true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+	mtr_started = true;
 	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
 	if (rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index)) {
 		ut_ad(btr_pcur_is_on_user_rec(&pcur));
@@ -1953,13 +1940,13 @@ row_merge_read_clustered_index(
 			/* Insert the cached spatial index rows. */
 			err = row_merge_spatial_rows(
 				trx->id, sp_tuples, num_spatial,
-				row_heap, sp_heap, &pcur, &mtr);
+				row_heap, sp_heap, &pcur, mtr_started, &mtr);
 
 			if (err != DB_SUCCESS) {
 				goto func_exit;
 			}
 
-			if (!mtr.is_active()) {
+			if (!mtr_started) {
 				goto scan_next;
 			}
 
@@ -1978,16 +1965,20 @@ row_merge_read_clustered_index(
 				this is the only page in the index tree. */
 				ut_ad(btr_pcur_is_on_user_rec(&pcur)
 				      || btr_pcur_get_block(
-					      &pcur)->page.id.page_no()
+					      &pcur)->page.id().page_no()
 				      == clust_index->page);
 
 				btr_pcur_store_position(&pcur, &mtr);
-				mtr_commit(&mtr);
+				mtr.commit();
+				mtr_started = false;
 
 				/* Give the waiters a chance to proceed. */
 				os_thread_yield();
 scan_next:
-				mtr_start(&mtr);
+				ut_ad(!mtr_started);
+				ut_ad(!mtr.is_active());
+				mtr.start();
+				mtr_started = true;
 				/* Restore position on the record, or its
 				predecessor if the record was purged
 				meanwhile. */
@@ -1999,7 +1990,8 @@ scan_next:
 					    &pcur, &mtr)) {
 end_of_index:
 					row = NULL;
-					mtr_commit(&mtr);
+					mtr.commit();
+					mtr_started = false;
 					mem_heap_free(row_heap);
 					row_heap = NULL;
 					ut_free(nonnull);
@@ -2007,23 +1999,16 @@ end_of_index:
 					goto write_buffers;
 				}
 			} else {
-				ulint		next_page_no;
-				buf_block_t*	block;
-
-				next_page_no = btr_page_get_next(
+				uint32_t next_page_no = btr_page_get_next(
 					page_cur_get_page(cur));
 
 				if (next_page_no == FIL_NULL) {
 					goto end_of_index;
 				}
 
-				block = page_cur_get_block(cur);
-				block = btr_block_get(
-					page_id_t(block->page.id.space(),
-						  next_page_no),
-					block->zip_size(),
-					BTR_SEARCH_LEAF,
-					clust_index, &mtr);
+				buf_block_t* block = btr_block_get(
+					*clust_index, next_page_no,
+					RW_S_LATCH, false, &mtr);
 
 				btr_leaf_page_release(page_cur_get_block(cur),
 						      BTR_SEARCH_LEAF, &mtr);
@@ -2359,15 +2344,6 @@ write_buffers:
 					conv_heap, &err,
 					&v_heap, eval_table, trx)))) {
 
-				/* Set the page flush observer for the
-				transaction when buffering the very first
-				record for a non-redo-logged operation. */
-				if (file->n_rec == 0 && i == 0
-				    && innodb_log_optimize_ddl) {
-					trx->set_flush_observer(
-						new_table->space, stage);
-				}
-
 				/* If we are creating FTS index,
 				a single row can generate more
 				records for tokenized word */
@@ -2470,7 +2446,8 @@ write_buffers:
 							trx->id, sp_tuples,
 							num_spatial,
 							row_heap, sp_heap,
-							&pcur, &mtr);
+							&pcur, mtr_started,
+							&mtr);
 
 						if (err != DB_SUCCESS) {
 							goto func_exit;
@@ -2478,20 +2455,21 @@ write_buffers:
 
 						/* We are not at the end of
 						the scan yet. We must
-						mtr_commit() in order to be
+						mtr.commit() in order to be
 						able to call log_free_check()
 						in row_merge_insert_index_tuples().
-						Due to mtr_commit(), the
+						Due to mtr.commit(), the
 						current row will be invalid, and
 						we must reread it on the next
 						loop iteration. */
-						if (mtr.is_active()) {
+						if (mtr_started) {
 							btr_pcur_move_to_prev_on_page(
 								&pcur);
 							btr_pcur_store_position(
 								&pcur, &mtr);
 
 							mtr.commit();
+							mtr_started = false;
 						}
 					}
 
@@ -2506,8 +2484,7 @@ write_buffers:
 					if (clust_btr_bulk == NULL) {
 						clust_btr_bulk = UT_NEW_NOKEY(
 							BtrBulk(index[i],
-								trx,
-								trx->get_flush_observer()));
+								trx));
 					} else {
 						clust_btr_bulk->latch();
 					}
@@ -2547,7 +2524,8 @@ write_buffers:
 						next record (the one which we
 						had to ignore due to the buffer
 						overflow). */
-						mtr_start(&mtr);
+						mtr.start();
+						mtr_started = true;
 						btr_pcur_restore_position(
 							BTR_SEARCH_LEAF, &pcur,
 							&mtr);
@@ -2621,9 +2599,7 @@ write_buffers:
 						trx->error_key_num = i;
 						goto all_done;);
 
-					BtrBulk	btr_bulk(
-						index[i], trx,
-						trx->get_flush_observer());
+					BtrBulk	btr_bulk(index[i], trx);
 
 					err = row_merge_insert_index_tuples(
 						index[i], old_table,
@@ -2728,15 +2704,17 @@ write_buffers:
 			/* Update progress for each 1000 rows */
 			curr_progress = (read_rows >= table_total_rows) ?
 					pct_cost :
-				((pct_cost * read_rows) / table_total_rows);
+				pct_cost * static_cast<double>(read_rows)
+				/ static_cast<double>(table_total_rows);
 			/* presenting 10.12% as 1012 integer */
 			onlineddl_pct_progress = (ulint) (curr_progress * 100);
 		}
 	}
 
 func_exit:
-	if (mtr.is_active()) {
-		mtr_commit(&mtr);
+	ut_ad(mtr_started == mtr.is_active());
+	if (mtr_started) {
+		mtr.commit();
 	}
 	if (row_heap) {
 		mem_heap_free(row_heap);
@@ -2769,10 +2747,6 @@ all_done:
 	DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
 #endif
 	if (fts_pll_sort) {
-		bool	all_exit = false;
-		ulint	trial_count = 0;
-		const ulint max_trial_count = 10000;
-
 wait_again:
                 /* Check if error occurs in child thread */
 		for (ulint j = 0; j < fts_sort_pll_degree; j++) {
@@ -2805,27 +2779,9 @@ wait_again:
 			}
 		}
 
-		/* Now all children should complete, wait a bit until
-		they all finish setting the event, before we free everything.
-		This has a 10 second timeout */
-		do {
-			all_exit = true;
-
-			for (ulint j = 0; j < fts_sort_pll_degree; j++) {
-				if (psort_info[j].child_status
-				    != FTS_CHILD_EXITING) {
-					all_exit = false;
-					os_thread_sleep(1000);
-					break;
-				}
-			}
-			trial_count++;
-		} while (!all_exit && trial_count < max_trial_count);
-
-		if (!all_exit) {
-			ib::fatal() << "Not all child sort threads exited"
-				" when creating FTS index '"
-				<< fts_sort_idx->name << "'";
+		for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+			psort_info[j].task->wait();
+			delete psort_info[j].task;
 		}
 	}
 
@@ -3338,7 +3294,7 @@ row_merge_sort(
 	num_runs = file->offset;
 
 	if (stage != NULL) {
-		stage->begin_phase_sort(log2(num_runs));
+		stage->begin_phase_sort(log2(double(num_runs)));
 	}
 
 	/* If num_runs are less than 1, nothing to merge */
@@ -3395,7 +3351,8 @@ row_merge_sort(
 			merge_count++;
 			curr_progress = (merge_count >= total_merge_sort_count) ?
 				pct_cost :
-				((pct_cost * merge_count) / total_merge_sort_count);
+				pct_cost * static_cast<double>(merge_count)
+				/ static_cast<double>(total_merge_sort_count);
 			/* presenting 10.12% as 1012 integer */;
 			onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100);
 		}
@@ -3693,7 +3650,8 @@ row_merge_insert_index_tuples(
 			curr_progress = (inserted_rows >= table_total_rows ||
 				table_total_rows <= 0) ?
 				pct_cost :
-				((pct_cost * inserted_rows) / table_total_rows);
+				pct_cost * static_cast<double>(inserted_rows)
+				/ static_cast<double>(table_total_rows);
 
 			/* presenting 10.12% as 1012 integer */;
 			onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100);
@@ -4097,15 +4055,15 @@ pfs_os_file_t
 row_merge_file_create_low(
 	const char*	path)
 {
+	if (!path) {
+		path = mysql_tmpdir;
+	}
 #ifdef UNIV_PFS_IO
 	/* This temp file open does not go through normal
 	file APIs, add instrumentation to register with
 	performance schema */
 	struct PSI_file_locker*	locker;
 	PSI_file_locker_state	state;
-	if (!path) {
-		path = mysql_tmpdir;
-	}
 	static const char label[] = "/Innodb Merge Temp File";
 	char* name = static_cast<char*>(
 		ut_malloc_nokey(strlen(path) + sizeof label));
@@ -4117,7 +4075,13 @@ row_merge_file_create_low(
 		PSI_FILE_CREATE, path ? name : label, __FILE__, __LINE__);
 
 #endif
-	pfs_os_file_t fd = innobase_mysql_tmpfile(path);
+	DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN);
+	char filename[FN_REFLEN];
+	File f = create_temp_file(filename, path, "ib",
+				  O_BINARY | O_SEQUENTIAL,
+				  MYF(MY_WME | MY_TEMPORARY));
+	pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f);
+
 #ifdef UNIV_PFS_IO
 	register_pfs_file_open_end(locker, fd, 
 		(fd == OS_FILE_CLOSED)?NULL:&fd);
@@ -4162,7 +4126,9 @@ row_merge_file_destroy_low(
 	const pfs_os_file_t& fd)	/*!< in: merge file descriptor */
 {
 	if (fd != OS_FILE_CLOSED) {
-		os_file_close(fd);
+		int res = mysql_file_close(IF_WIN(my_win_handle2File((os_file_t)fd), fd),
+					   MYF(MY_WME));
+		ut_a(res != -1);
 	}
 }
 /*********************************************************************//**
@@ -4283,138 +4249,6 @@ row_merge_rename_index_to_drop(
 	return(err);
 }
 
-/*********************************************************************//**
-Provide a new pathname for a table that is being renamed if it belongs to
-a file-per-table tablespace.  The caller is responsible for freeing the
-memory allocated for the return value.
-@return new pathname of tablespace file, or NULL if space = 0 */
-static
-char*
-row_make_new_pathname(
-/*==================*/
-	dict_table_t*	table,		/*!< in: table to be renamed */
-	const char*	new_name)	/*!< in: new name */
-{
-	ut_ad(!is_system_tablespace(table->space_id));
-	return os_file_make_new_pathname(table->space->chain.start->name,
-					 new_name);
-}
-
-/*********************************************************************//**
-Rename the tables in the data dictionary.  The data dictionary must
-have been locked exclusively by the caller, because the transaction
-will not be committed.
-@return error code or DB_SUCCESS */
-dberr_t
-row_merge_rename_tables_dict(
-/*=========================*/
-	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
-					tmp_name */
-	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
-					old_table->name */
-	const char*	tmp_name,	/*!< in: new name for old_table */
-	trx_t*		trx)		/*!< in/out: dictionary transaction */
-{
-	dberr_t		err	= DB_ERROR;
-	pars_info_t*	info;
-
-	ut_ad(!srv_read_only_mode);
-	ut_ad(old_table != new_table);
-	ut_d(dict_sys.assert_locked());
-	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
-	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE
-	      || trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
-
-	trx->op_info = "renaming tables";
-
-	/* We use the private SQL parser of Innobase to generate the query
-	graphs needed in updating the dictionary data in system tables. */
-
-	info = pars_info_create();
-
-	pars_info_add_str_literal(info, "new_name", new_table->name.m_name);
-	pars_info_add_str_literal(info, "old_name", old_table->name.m_name);
-	pars_info_add_str_literal(info, "tmp_name", tmp_name);
-
-	err = que_eval_sql(info,
-			   "PROCEDURE RENAME_TABLES () IS\n"
-			   "BEGIN\n"
-			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
-			   " WHERE NAME = :old_name;\n"
-			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
-			   " WHERE NAME = :new_name;\n"
-			   "END;\n", FALSE, trx);
-
-	/* Update SYS_TABLESPACES and SYS_DATAFILES if the old table being
-	renamed is a single-table tablespace, which must be implicitly
-	renamed along with the table. */
-	if (err == DB_SUCCESS
-	    && old_table->space_id) {
-		/* Make pathname to update SYS_DATAFILES. */
-		char* tmp_path = row_make_new_pathname(old_table, tmp_name);
-
-		info = pars_info_create();
-
-		pars_info_add_str_literal(info, "tmp_name", tmp_name);
-		pars_info_add_str_literal(info, "tmp_path", tmp_path);
-		pars_info_add_int4_literal(info, "old_space",
-					   old_table->space_id);
-
-		err = que_eval_sql(info,
-				   "PROCEDURE RENAME_OLD_SPACE () IS\n"
-				   "BEGIN\n"
-				   "UPDATE SYS_TABLESPACES"
-				   " SET NAME = :tmp_name\n"
-				   " WHERE SPACE = :old_space;\n"
-				   "UPDATE SYS_DATAFILES"
-				   " SET PATH = :tmp_path\n"
-				   " WHERE SPACE = :old_space;\n"
-				   "END;\n", FALSE, trx);
-
-		ut_free(tmp_path);
-	}
-
-	/* Update SYS_TABLESPACES and SYS_DATAFILES if the new table being
-	renamed is a single-table tablespace, which must be implicitly
-	renamed along with the table. */
-	if (err == DB_SUCCESS
-	    && dict_table_is_file_per_table(new_table)) {
-		/* Make pathname to update SYS_DATAFILES. */
-		char* old_path = row_make_new_pathname(
-			new_table, old_table->name.m_name);
-
-		info = pars_info_create();
-
-		pars_info_add_str_literal(info, "old_name",
-					  old_table->name.m_name);
-		pars_info_add_str_literal(info, "old_path", old_path);
-		pars_info_add_int4_literal(info, "new_space",
-					   new_table->space_id);
-
-		err = que_eval_sql(info,
-				   "PROCEDURE RENAME_NEW_SPACE () IS\n"
-				   "BEGIN\n"
-				   "UPDATE SYS_TABLESPACES"
-				   " SET NAME = :old_name\n"
-				   " WHERE SPACE = :new_space;\n"
-				   "UPDATE SYS_DATAFILES"
-				   " SET PATH = :old_path\n"
-				   " WHERE SPACE = :new_space;\n"
-				   "END;\n", FALSE, trx);
-
-		ut_free(old_path);
-	}
-
-	if (err == DB_SUCCESS && (new_table->flags2 & DICT_TF2_DISCARDED)) {
-		err = row_import_update_discarded_flag(
-			trx, new_table->id, true);
-	}
-
-	trx->op_info = "";
-
-	return(err);
-}
-
 /** Create the index and load in to the dictionary.
 @param[in,out]	table		the index is on this table
 @param[in]	index_def	the index definition
@@ -4518,26 +4352,6 @@ row_merge_drop_table(
 			trx, SQLCOM_DROP_TABLE, false, false));
 }
 
-/** Write an MLOG_INDEX_LOAD record to indicate in the redo-log
-that redo-logging of individual index pages was disabled, and
-the flushing of such pages to the data files was completed.
-@param[in]	index	an index tree on which redo logging was disabled */
-void row_merge_write_redo(const dict_index_t* index)
-{
-	ut_ad(!index->table->is_temporary());
-	ut_ad(!(index->type & (DICT_SPATIAL | DICT_FTS)));
-
-	mtr_t mtr;
-	mtr.start();
-	byte* log_ptr = mlog_open(&mtr, 11 + 8);
-	log_ptr = mlog_write_initial_log_record_low(
-		MLOG_INDEX_LOAD,
-		index->table->space_id, index->page, log_ptr, &mtr);
-	mach_write_to_8(log_ptr, index->id);
-	mlog_close(&mtr, log_ptr + 8);
-	mtr.commit();
-}
-
 /** Build indexes on a table by reading a clustered index, creating a temporary
 file containing index entries, merge sorting these index entries and inserting
 sorted index entries to indexes.
@@ -4600,7 +4414,6 @@ row_merge_build_indexes(
 	dict_index_t*		fts_sort_idx = NULL;
 	fts_psort_t*		psort_info = NULL;
 	fts_psort_t*		merge_info = NULL;
-	int64_t			sig_count = 0;
 	bool			fts_psort_initiated = false;
 
 	double total_static_cost = 0;
@@ -4669,8 +4482,10 @@ row_merge_build_indexes(
 		merge_files[i].n_rec = 0;
 	}
 
-	total_static_cost = COST_BUILD_INDEX_STATIC * n_indexes + COST_READ_CLUSTERED_INDEX;
-	total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC * n_indexes;
+	total_static_cost = COST_BUILD_INDEX_STATIC
+		* static_cast<double>(n_indexes) + COST_READ_CLUSTERED_INDEX;
+	total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC
+		* static_cast<double>(n_indexes);
 	for (i = 0; i < n_indexes; i++) {
 		if (indexes[i]->type & DICT_FTS) {
 			ibool	opt_doc_id_size = FALSE;
@@ -4767,65 +4582,14 @@ row_merge_build_indexes(
 		}
 
 		if (indexes[i]->type & DICT_FTS) {
-			os_event_t	fts_parallel_merge_event;
 
 			sort_idx = fts_sort_idx;
 
-			fts_parallel_merge_event
-				= merge_info[0].psort_common->merge_event;
-
 			if (FTS_PLL_MERGE) {
-				ulint	trial_count = 0;
-				bool	all_exit = false;
-
-				os_event_reset(fts_parallel_merge_event);
 				row_fts_start_parallel_merge(merge_info);
-wait_again:
-				os_event_wait_time_low(
-					fts_parallel_merge_event, 1000000,
-					sig_count);
-
 				for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
-					if (merge_info[j].child_status
-					    != FTS_CHILD_COMPLETE
-					    && merge_info[j].child_status
-					    != FTS_CHILD_EXITING) {
-						sig_count = os_event_reset(
-						fts_parallel_merge_event);
-
-						goto wait_again;
-					}
-				}
-
-				/* Now all children should complete, wait
-				a bit until they all finish using event */
-				while (!all_exit && trial_count < 10000) {
-					all_exit = true;
-
-					for (j = 0; j < FTS_NUM_AUX_INDEX;
-					     j++) {
-						if (merge_info[j].child_status
-						    != FTS_CHILD_EXITING) {
-							all_exit = false;
-							os_thread_sleep(1000);
-							break;
-						}
-					}
-					trial_count++;
-				}
-
-				if (!all_exit) {
-					ib::error() << "Not all child merge"
-						" threads exited when creating"
-						" FTS index '"
-						<< indexes[i]->name << "'";
-				} else {
-					for (j = 0; j < FTS_NUM_AUX_INDEX;
-					     j++) {
-
-						os_thread_join(merge_info[j]
-							       .thread_hdl);
-					}
+					merge_info[j].task->wait();
+					delete merge_info[j].task;
 				}
 			} else {
 				/* This cannot report duplicates; an
@@ -4844,9 +4608,10 @@ wait_again:
 				sort_idx, table, col_map, 0};
 
 			pct_cost = (COST_BUILD_INDEX_STATIC +
-				(total_dynamic_cost * merge_files[k].offset /
-					total_index_blocks)) /
-				(total_static_cost + total_dynamic_cost)
+				    (total_dynamic_cost
+				     * static_cast<double>(merge_files[k].offset)
+				     / static_cast<double>(total_index_blocks)))
+				/ (total_static_cost + total_dynamic_cost)
 				* PCT_COST_MERGESORT_INDEX * 100;
 			char*	bufend = innobase_convert_name(
 				buf, sizeof buf,
@@ -4886,14 +4651,17 @@ wait_again:
 			}
 
 			if (error == DB_SUCCESS) {
-				BtrBulk	btr_bulk(sort_idx, trx,
-						 trx->get_flush_observer());
+				BtrBulk	btr_bulk(sort_idx, trx);
 
 				pct_cost = (COST_BUILD_INDEX_STATIC +
-					(total_dynamic_cost * merge_files[k].offset /
-						total_index_blocks)) /
-					(total_static_cost + total_dynamic_cost) *
-					PCT_COST_INSERT_INDEX * 100;
+					    (total_dynamic_cost
+					     * static_cast<double>(
+						     merge_files[k].offset)
+					     / static_cast<double>(
+						     total_index_blocks)))
+					/ (total_static_cost
+					   + total_dynamic_cost)
+					* PCT_COST_INSERT_INDEX * 100;
 
 				if (global_system_variables.log_warnings > 2) {
 					sql_print_information(
@@ -4933,21 +4701,10 @@ wait_again:
 		if (indexes[i]->type & DICT_FTS) {
 			row_fts_psort_info_destroy(psort_info, merge_info);
 			fts_psort_initiated = false;
-		} else if (dict_index_is_spatial(indexes[i])) {
-			/* We never disable redo logging for
-			creating SPATIAL INDEX. Avoid writing any
-			unnecessary MLOG_INDEX_LOAD record. */
 		} else if (old_table != new_table) {
 			ut_ad(!sort_idx->online_log);
 			ut_ad(sort_idx->online_status
 			      == ONLINE_INDEX_COMPLETE);
-		} else if (FlushObserver* flush_observer =
-			   trx->get_flush_observer()) {
-			if (error != DB_SUCCESS) {
-				flush_observer->interrupted();
-			}
-			flush_observer->flush();
-			row_merge_write_redo(indexes[i]);
 		}
 
 		if (old_table != new_table
@@ -5003,11 +4760,10 @@ func_exit:
 
 	ut_free(merge_files);
 
-	alloc.deallocate_large(block, &block_pfx, block_size);
+	alloc.deallocate_large(block, &block_pfx);
 
 	if (crypt_block) {
-		alloc.deallocate_large(crypt_block, &crypt_pfx,
-				       block_size);
+		alloc.deallocate_large(crypt_block, &crypt_pfx);
 	}
 
 	DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);
@@ -5050,37 +4806,5 @@ func_exit:
 	}
 
 	DBUG_EXECUTE_IF("ib_index_crash_after_bulk_load", DBUG_SUICIDE(););
-
-	if (FlushObserver* flush_observer = trx->get_flush_observer()) {
-
-		DBUG_EXECUTE_IF("ib_index_build_fail_before_flush",
-			error = DB_INTERRUPTED;
-		);
-
-		if (error != DB_SUCCESS) {
-			flush_observer->interrupted();
-		}
-
-		flush_observer->flush();
-
-		if (old_table != new_table) {
-			for (const dict_index_t* index
-				     = dict_table_get_first_index(new_table);
-			     index != NULL;
-			     index = dict_table_get_next_index(index)) {
-				if (!(index->type
-				      & (DICT_FTS | DICT_SPATIAL))) {
-					row_merge_write_redo(index);
-				}
-			}
-		}
-
-		trx->remove_flush_observer();
-
-		if (trx_is_interrupted(trx)) {
-			error = DB_INTERRUPTED;
-		}
-	}
-
 	DBUG_RETURN(error);
 }
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 051b9b89cd5..360a5e35fa4 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -54,7 +54,6 @@ Created 9/17/2000 Heikki Tuuri
 #include "rem0cmp.h"
 #include "row0import.h"
 #include "row0ins.h"
-#include "row0merge.h"
 #include "row0row.h"
 #include "row0sel.h"
 #include "row0upd.h"
@@ -712,7 +711,7 @@ handle_new_error:
 			/* Roll back the latest, possibly incomplete insertion
 			or update */
 
-			trx_rollback_to_savepoint(trx, savept);
+			trx->rollback(savept);
 		}
 		/* MySQL will roll back the latest SQL statement */
 		break;
@@ -735,7 +734,7 @@ handle_new_error:
 		/* Roll back the whole transaction; this resolution was added
 		to version 3.23.43 */
 
-		trx_rollback_to_savepoint(trx, NULL);
+		trx->rollback();
 		break;
 
 	case DB_MUST_GET_MORE_FILE_SPACE:
@@ -1166,7 +1165,7 @@ row_lock_table_autoinc_for_mysql(
 
 	thr = que_fork_get_first_thr(prebuilt->ins_graph);
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 run_again:
 	thr->run_node = node;
@@ -1195,7 +1194,7 @@ run_again:
 		return(err);
 	}
 
-	que_thr_stop_for_mysql_no_error(thr, trx);
+	thr->stop_no_error();
 
 	trx->op_info = "";
 
@@ -1225,7 +1224,7 @@ row_lock_table(row_prebuilt_t* prebuilt)
 
 	thr = que_fork_get_first_thr(prebuilt->sel_graph);
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 run_again:
 	thr->run_node = thr;
@@ -1257,7 +1256,7 @@ run_again:
 		return(err);
 	}
 
-	que_thr_stop_for_mysql_no_error(thr, trx);
+	thr->stop_no_error();
 
 	trx->op_info = "";
 
@@ -1392,7 +1391,7 @@ row_insert_for_mysql(
 		node->state = INS_NODE_ALLOC_ROW_ID;
 	}
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 run_again:
 	thr->run_node = node;
@@ -1477,7 +1476,7 @@ error_exit:
 		}
 	}
 
-	que_thr_stop_for_mysql_no_error(thr, trx);
+	thr->stop_no_error();
 
 	if (table->is_system_db) {
 		srv_stats.n_system_rows_inserted.inc(size_t(trx->id));
@@ -1789,7 +1788,7 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
 
 	ut_ad(!prebuilt->sql_stat_start);
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 	ut_ad(!prebuilt->versioned_write || node->table->versioned());
 
@@ -1834,13 +1833,13 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
 		}
 	}
 
-	que_thr_stop_for_mysql_no_error(thr, trx);
+	thr->stop_no_error();
 
 	if (dict_table_has_fts_index(table)
 	    && trx->fts_next_doc_id != UINT64_UNDEFINED) {
 		err = row_fts_update_or_delete(prebuilt);
 		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-			ut_ad(!"unexpected error");
+			ut_ad("unexpected error" == 0);
 			goto error;
 		}
 	}
@@ -1898,8 +1897,8 @@ error:
 	DBUG_RETURN(err);
 }
 
-/** This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
-session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
 Before calling this function row_search_for_mysql() must have
 initialized prebuilt->new_rec_locks to store the information which new
 record locks really were set. This function removes a newly set
@@ -1922,17 +1921,8 @@ row_unlock_for_mysql(
 
 	ut_ad(prebuilt != NULL);
 	ut_ad(trx != NULL);
+	ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED);
 
-	if (UNIV_UNLIKELY
-	    (!srv_locks_unsafe_for_binlog
-	     && trx->isolation_level > TRX_ISO_READ_COMMITTED)) {
-
-		ib::error() << "Calling row_unlock_for_mysql though"
-			" innodb_locks_unsafe_for_binlog is FALSE and this"
-			" session is not using READ COMMITTED isolation"
-			" level.";
-		return;
-	}
 	if (dict_index_is_spatial(prebuilt->index)) {
 		return;
 	}
@@ -2392,7 +2382,7 @@ row_create_table_for_mysql(
 		break;
 	case DB_OUT_OF_FILE_SPACE:
 		trx->error_state = DB_SUCCESS;
-		trx_rollback_to_savepoint(trx, NULL);
+		trx->rollback();
 
 		ib::warn() << "Cannot create table "
 			<< table->name
@@ -2423,7 +2413,7 @@ row_create_table_for_mysql(
 	case DB_TABLESPACE_EXISTS:
 	default:
 		trx->error_state = DB_SUCCESS;
-		trx_rollback_to_savepoint(trx, NULL);
+		trx->rollback();
 		dict_mem_table_free(table);
 		break;
 	}
@@ -2526,8 +2516,8 @@ row_create_index_for_mysql(
 		ut_ad((index == NULL) == (err != DB_SUCCESS));
 		if (UNIV_LIKELY(err == DB_SUCCESS)) {
 			ut_ad(!index->is_instant());
-			index->n_core_null_bytes = UT_BITS_IN_BYTES(
-				unsigned(index->n_nullable));
+			index->n_core_null_bytes = static_cast<uint8_t>(
+				UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
 
 			err = dict_create_index_tree_in_mem(index, trx);
 #ifdef BTR_CUR_HASH_ADAPT
@@ -2735,7 +2725,10 @@ row_mysql_drop_garbage_tables()
 		table_name = mem_heap_strdupl(
 			heap,
 			reinterpret_cast<const char*>(field), len);
-		if (strstr(table_name, "/" TEMP_FILE_PREFIX "-")) {
+		if (strstr(table_name, "/" TEMP_FILE_PREFIX "-") &&
+                    !strstr(table_name, "/" TEMP_FILE_PREFIX "-backup-") &&
+                    !strstr(table_name, "/" TEMP_FILE_PREFIX "-exchange-"))
+                {
 			btr_pcur_store_position(&pcur, &mtr);
 			btr_pcur_commit_specify_mtr(&pcur, &mtr);
 
@@ -2947,13 +2940,13 @@ row_discard_tablespace_end(
 	}
 
 	DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
-			log_write_up_to(LSN_MAX, true);
+			log_buffer_flush_to_disk();
 			DBUG_SUICIDE(););
 
 	trx_commit_for_mysql(trx);
 
 	DBUG_EXECUTE_IF("ib_discard_after_commit_crash",
-			log_write_up_to(LSN_MAX, true);
+			log_buffer_flush_to_disk();
 			DBUG_SUICIDE(););
 
 	row_mysql_unlock_data_dictionary(trx);
@@ -3173,7 +3166,7 @@ row_mysql_lock_table(
 	thr = que_fork_get_first_thr(
 		static_cast<que_fork_t*>(que_node_get_parent(thr)));
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 run_again:
 	thr->run_node = thr;
@@ -3184,7 +3177,7 @@ run_again:
 	trx->error_state = err;
 
 	if (err == DB_SUCCESS) {
-		que_thr_stop_for_mysql_no_error(thr, trx);
+		thr->stop_no_error();
 	} else {
 		que_thr_stop_for_mysql(thr);
 
@@ -3339,6 +3332,8 @@ row_drop_table_for_mysql(
 		DBUG_RETURN(DB_TABLE_NOT_FOUND);
 	}
 
+	std::vector<pfs_os_file_t> detached_handles;
+
 	const bool is_temp_name = strstr(table->name.m_name,
 					 "/" TEMP_FILE_PREFIX);
 
@@ -3373,19 +3368,6 @@ row_drop_table_for_mysql(
 		table records yet. Thus it is safe to release and
 		reacquire the data dictionary latches. */
 		if (table->fts) {
-			ut_ad(!table->fts->add_wq);
-			ut_ad(lock_trx_has_sys_table_locks(trx) == 0);
-
-			for (;;) {
-				bool retry = false;
-				if (dict_fts_index_syncing(table)) {
-					retry = true;
-				}
-				if (!retry) {
-					break;
-				}
-				DICT_BG_YIELD(trx);
-			}
 			row_mysql_unlock_data_dictionary(trx);
 			fts_optimize_remove_table(table);
 			row_mysql_lock_data_dictionary(trx);
@@ -3406,9 +3388,8 @@ row_drop_table_for_mysql(
 
 		dict_stats_recalc_pool_del(table);
 		dict_stats_defrag_pool_del(table, NULL);
-		if (btr_defragment_thread_active) {
-			/* During fts_drop_orphaned_tables() in
-			recv_recovery_rollback_active() the
+		if (btr_defragment_active) {
+			/* During fts_drop_orphaned_tables() the
 			btr_defragment_mutex has not yet been
 			initialized by btr_defragment_init(). */
 			btr_defragment_remove_table(table);
@@ -3488,7 +3469,15 @@ row_drop_table_for_mysql(
 
 	if (table->n_foreign_key_checks_running > 0) {
 defer:
-		if (!is_temp_name) {
+		/* Rename #sql-backup to #sql-ib if the table still has an
+		open reference count while it is being dropped. This can
+		happen when the purge thread is waiting for dict_sys.mutex
+		so that it could close the table, but DROP TABLE has
+		acquired dict_sys.mutex.
+		In the future this should use 'tmp_file_prefix'!
+		*/
+		if (!is_temp_name
+		    || strstr(table->name.m_name, "/#sql-backup-")) {
 			heap = mem_heap_create(FN_REFLEN);
 			const char* tmp_name
 				= dict_mem_create_temporary_tablename(
@@ -3727,7 +3716,8 @@ do_drop:
 		ut_ad(!filepath);
 
 		if (space->id != TRX_SYS_SPACE) {
-			err = fil_delete_tablespace(space->id);
+			err = fil_delete_tablespace(space->id, false,
+						    &detached_handles);
 		}
 		break;
 
@@ -3754,7 +3744,7 @@ do_drop:
 			<< ut_get_name(trx, tablename) << ".";
 
 		trx->error_state = DB_SUCCESS;
-		trx_rollback_to_savepoint(trx, NULL);
+		trx->rollback();
 		trx->error_state = DB_SUCCESS;
 
 		/* Mark all indexes available in the data dictionary
@@ -3807,9 +3797,12 @@ funct_exit_all_freed:
 		row_mysql_unlock_data_dictionary(trx);
 	}
 
-	trx->op_info = "";
+	for (const auto& handle : detached_handles) {
+		ut_ad(handle != OS_FILE_CLOSED);
+		os_file_close(handle);
+	}
 
-	srv_wake_master_thread();
+	trx->op_info = "";
 
 	DBUG_RETURN(err);
 }
@@ -4130,7 +4123,6 @@ row_rename_table_for_mysql(
 					FOREIGN KEY constraints */
 {
 	dict_table_t*	table			= NULL;
-	ibool		dict_locked		= FALSE;
 	dberr_t		err			= DB_ERROR;
 	mem_heap_t*	heap			= NULL;
 	const char**	constraints_to_drop	= NULL;
@@ -4144,6 +4136,8 @@ row_rename_table_for_mysql(
 	ut_a(old_name != NULL);
 	ut_a(new_name != NULL);
 	ut_ad(trx->state == TRX_STATE_ACTIVE);
+	const bool dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
+	ut_ad(!commit || dict_locked);
 
 	if (high_level_read_only) {
 		return(DB_READ_ONLY);
@@ -4154,8 +4148,6 @@ row_rename_table_for_mysql(
 	old_is_tmp = dict_table_t::is_temporary_name(old_name);
 	new_is_tmp = dict_table_t::is_temporary_name(new_name);
 
-	dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
-
 	table = dict_table_open_on_name(old_name, dict_locked, FALSE,
 					DICT_ERR_IGNORE_FK_NOKEY);
 
@@ -4259,6 +4251,10 @@ row_rename_table_for_mysql(
 	}
 
 	if (!table->is_temporary()) {
+		if (commit) {
+			dict_stats_wait_bg_to_stop_using_table(table, trx);
+		}
+
 		err = trx_undo_report_rename(trx, table);
 
 		if (err != DB_SUCCESS) {
@@ -4283,6 +4279,7 @@ row_rename_table_for_mysql(
 			   "END;\n"
 			   , FALSE, trx);
 
+	/* Assume the caller guarantees destination name doesn't exist. */
 	ut_ad(err != DB_DUPLICATE_KEY);
 
 	/* SYS_TABLESPACES and SYS_DATAFILES need to be updated if
@@ -4318,7 +4315,7 @@ row_rename_table_for_mysql(
 		ut_free(new_path);
 	}
 	if (err != DB_SUCCESS) {
-		goto end;
+		goto err_exit;
 	}
 
 	if (!new_is_tmp) {
@@ -4462,8 +4459,8 @@ row_rename_table_for_mysql(
 		}
 	}
 
-end:
 	if (err != DB_SUCCESS) {
+err_exit:
 		if (err == DB_DUPLICATE_KEY) {
 			ib::error() << "Possible reasons:";
 			ib::error() << "(1) Table rename would cause two"
@@ -4492,7 +4489,7 @@ end:
 				" succeed.";
 		}
 		trx->error_state = DB_SUCCESS;
-		trx_rollback_to_savepoint(trx, NULL);
+		trx->rollback();
 		trx->error_state = DB_SUCCESS;
 	} else {
 		/* The following call will also rename the .ibd data file if
@@ -4502,7 +4499,7 @@ end:
 			table, new_name, !new_is_tmp);
 		if (err != DB_SUCCESS) {
 			trx->error_state = DB_SUCCESS;
-			trx_rollback_to_savepoint(trx, NULL);
+			trx->rollback();
 			trx->error_state = DB_SUCCESS;
 			goto funct_exit;
 		}
@@ -4515,13 +4512,16 @@ end:
 		}
 
 		/* We only want to switch off some of the type checking in
-		an ALTER TABLE...ALGORITHM=COPY, not in a RENAME. */
+		an ALTER TABLE, not in a RENAME. */
 		dict_names_t	fk_tables;
 
 		err = dict_load_foreigns(
-			new_name, NULL,
-			false, !old_is_tmp || trx->check_foreigns,
-			DICT_ERR_IGNORE_NONE, fk_tables);
+			new_name, NULL, false,
+			!old_is_tmp || trx->check_foreigns,
+			use_fk
+			? DICT_ERR_IGNORE_NONE
+			: DICT_ERR_IGNORE_FK_NOKEY,
+			fk_tables);
 
 		if (err != DB_SUCCESS) {
 
@@ -4548,10 +4548,8 @@ end:
 					" with the new table definition.";
 			}
 
-			ut_a(DB_SUCCESS == dict_table_rename_in_cache(
-				table, old_name, FALSE));
 			trx->error_state = DB_SUCCESS;
-			trx_rollback_to_savepoint(trx, NULL);
+			trx->rollback();
 			trx->error_state = DB_SUCCESS;
 		}
 
@@ -4563,7 +4561,7 @@ end:
 			ut_a(DB_SUCCESS == dict_table_rename_in_cache(
 				table, old_name, FALSE));
 			trx->error_state = DB_SUCCESS;
-			trx_rollback_to_savepoint(trx, NULL);
+			trx->rollback();
 			trx->error_state = DB_SUCCESS;
 			goto funct_exit;
 		}
@@ -4615,6 +4613,9 @@ funct_exit:
 	}
 
 	if (table != NULL) {
+		if (commit && !table->is_temporary()) {
+			table->stats_bg_flag &= byte(~BG_STAT_SHOULD_QUIT);
+		}
 		dict_table_close(table, dict_locked, FALSE);
 	}
 
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
index d61dc2c2280..ff19ed9830e 100644
--- a/storage/innobase/row/row0purge.cc
+++ b/storage/innobase/row/row0purge.cc
@@ -46,7 +46,6 @@ Created 3/14/1997 Heikki Tuuri
 #include "handler.h"
 #include "ha_innodb.h"
 #include "fil0fil.h"
-#include "debug_sync.h"
 
 /*************************************************************************
 IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -105,15 +104,13 @@ row_purge_remove_clust_if_poss_low(
 	purge_node_t*	node,	/*!< in/out: row purge node */
 	ulint		mode)	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
 {
-	ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S)
-	      || node->vcol_info.is_used());
-
 	dict_index_t* index = dict_table_get_first_index(node->table);
 
 	log_free_check();
 
 	mtr_t mtr;
 	mtr.start();
+	index->set_modified(mtr);
 
 	if (!row_purge_reposition_pcur(mode, node, &mtr)) {
 		/* The record was already removed. */
@@ -121,9 +118,6 @@ row_purge_remove_clust_if_poss_low(
 		return true;
 	}
 
-	ut_d(const bool was_instant = !!index->table->instant);
-	index->set_modified(mtr);
-
 	rec_t* rec = btr_pcur_get_rec(&node->pcur);
 	rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
 	rec_offs_init(offsets_);
@@ -164,10 +158,6 @@ row_purge_remove_clust_if_poss_low(
 		}
 	}
 
-	/* Prove that dict_index_t::clear_instant_alter() was
-	not called with index->table->instant != NULL. */
-	ut_ad(!was_instant || index->table->instant);
-
 func_exit:
 	if (heap) {
 		mem_heap_free(heap);
@@ -213,54 +203,6 @@ row_purge_remove_clust_if_poss(
 	return(false);
 }
 
-/** Tries to store secondary index cursor before openin mysql table for
-virtual index condition computation.
-@param[in,out]	node		row purge node
-@param[in]	index		secondary index
-@param[in,out]	sec_pcur	secondary index cursor
-@param[in,out]	sec_mtr		mini-transaction which holds
-				secondary index entry */
-static void row_purge_store_vsec_cur(
-        purge_node_t*   node,
-        dict_index_t*   index,
-        btr_pcur_t*     sec_pcur,
-        mtr_t*          sec_mtr)
-{
-	row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, sec_mtr);
-
-	if (!node->found_clust) {
-		return;
-	}
-
-	node->vcol_info.set_requested();
-
-	btr_pcur_store_position(sec_pcur, sec_mtr);
-
-	btr_pcurs_commit_specify_mtr(&node->pcur, sec_pcur, sec_mtr);
-}
-
-/** Tries to restore secondary index cursor after opening the mysql table
-@param[in,out]	node	row purge node
-@param[in]	index	secondary index
-@param[in,out]	sec_mtr	mini-transaction which holds secondary index entry
-@param[in]	is_tree	true=pessimistic purge,
-			false=optimistic (leaf-page only)
-@return false in case of restore failure. */
-static bool row_purge_restore_vsec_cur(
-	purge_node_t*	node,
-	dict_index_t*	index,
-	btr_pcur_t*	sec_pcur,
-	mtr_t*		sec_mtr,
-	bool		is_tree)
-{
-	sec_mtr->start();
-	index->set_modified(*sec_mtr);
-
-	return btr_pcur_restore_position(
-		is_tree ? BTR_PURGE_TREE : BTR_PURGE_LEAF,
-		sec_pcur, sec_mtr) == btr_pcur_t::SAME_ALL;
-}
-
 /** Determines if it is possible to remove a secondary index entry.
 Removal is possible if the secondary index entry does not refer to any
 not delete marked version of a clustered index record where DB_TRX_ID
@@ -300,53 +242,13 @@ row_purge_poss_sec(
 
 	ut_ad(!dict_index_is_clust(index));
 
-	const bool	store_cur = sec_mtr && !node->vcol_info.is_used()
-		&& dict_index_has_virtual(index);
-
-	if (store_cur) {
-		row_purge_store_vsec_cur(node, index, sec_pcur, sec_mtr);
-		ut_ad(sec_mtr->has_committed()
-		      == node->vcol_info.is_requested());
-
-		/* The PRIMARY KEY value was not found in the clustered
-		index. The secondary index record found. We can purge
-		the secondary index record. */
-		if (!node->vcol_info.is_requested()) {
-			ut_ad(!node->found_clust);
-			return true;
-		}
-	}
-
-retry_purge_sec:
 	mtr_start(&mtr);
 
 	can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr)
 		|| !row_vers_old_has_index_entry(true,
 						 btr_pcur_get_rec(&node->pcur),
 						 &mtr, index, entry,
-						 node->roll_ptr, node->trx_id,
-						 &node->vcol_info);
-
-	if (node->vcol_info.is_first_fetch()) {
-		ut_ad(store_cur);
-
-		const TABLE* t= node->vcol_info.table();
-		DBUG_LOG("purge", "retry " << t
-			 << (is_tree ? " tree" : " leaf")
-			 << index->name << "," << index->table->name
-			 << ": " << rec_printer(entry).str());
-
-		ut_ad(mtr.has_committed());
-
-		if (t) {
-			node->vcol_info.set_used();
-			goto retry_purge_sec;
-		}
-
-		node->table = NULL;
-		sec_pcur = NULL;
-		return false;
-	}
+						 node->roll_ptr, node->trx_id);
 
 	/* Persistent cursor is closed if reposition fails. */
 	if (node->found_clust) {
@@ -357,18 +259,6 @@ retry_purge_sec:
 
 	ut_ad(mtr.has_committed());
 
-	/* If the virtual column info is not used then reset the virtual column
-	info. */
-	if (node->vcol_info.is_requested()
-	    && !node->vcol_info.is_used()) {
-		node->vcol_info.reset();
-	}
-
-	if (store_cur && !row_purge_restore_vsec_cur(
-		    node, index, sec_pcur, sec_mtr, is_tree)) {
-		return false;
-	}
-
 	return can_delete;
 }
 
@@ -485,13 +375,6 @@ row_purge_remove_sec_if_poss_tree(
 		}
 	}
 
-	if (node->vcol_op_failed()) {
-		ut_ad(mtr.has_committed());
-		ut_ad(!pcur.old_rec_buf);
-		ut_ad(pcur.pos_state == BTR_PCUR_NOT_POSITIONED);
-		return false;
-	}
-
 func_exit:
 	btr_pcur_close(&pcur); // FIXME: need this?
 func_exit_no_pcur:
@@ -607,26 +490,20 @@ row_purge_remove_sec_if_poss_leaf(
 				goto func_exit_no_pcur;
 			}
 
-			if (dict_index_is_spatial(index)) {
-				const page_t*   page;
-				const trx_t*	trx = NULL;
-
-				if (btr_cur->rtr_info != NULL
-				    && btr_cur->rtr_info->thr != NULL) {
-					trx = thr_get_trx(
-						btr_cur->rtr_info->thr);
-				}
-
-				page = btr_cur_get_page(btr_cur);
-
-				if (!lock_test_prdt_page_lock(
-					    trx,
-					    page_get_space_id(page),
-					    page_get_page_no(page))
-				    && page_get_n_recs(page) < 2
-				    && btr_cur_get_block(btr_cur)
-				    ->page.id.page_no() !=
-				    dict_index_get_page(index)) {
+			if (index->is_spatial()) {
+				const buf_block_t* block = btr_cur_get_block(
+					btr_cur);
+
+				if (block->page.id().page_no()
+				    != index->page
+				    && page_get_n_recs(block->frame) < 2
+				    && !lock_test_prdt_page_lock(
+					    btr_cur->rtr_info
+					    && btr_cur->rtr_info->thr
+					    ? thr_get_trx(
+						    btr_cur->rtr_info->thr)
+					    : nullptr,
+					    block->page.id())) {
 					/* this is the last record on page,
 					and it has a "page" lock on it,
 					which mean search is still depending
@@ -634,8 +511,7 @@ row_purge_remove_sec_if_poss_leaf(
 					DBUG_LOG("purge",
 						 "skip purging last"
 						 " record on page "
-						 << btr_cur_get_block(btr_cur)
-						 ->page.id);
+						 << block->page.id());
 
 					btr_pcur_close(&pcur);
 					mtr.commit();
@@ -650,11 +526,6 @@ row_purge_remove_sec_if_poss_leaf(
 			}
 		}
 
-		if (node->vcol_op_failed()) {
-			btr_pcur_close(&pcur);
-			return false;
-		}
-
 		/* (The index entry is still needed,
 		or the deletion succeeded) */
 		/* fall through */
@@ -701,10 +572,6 @@ row_purge_remove_sec_if_poss(
 		return;
 	}
 retry:
-	if (node->vcol_op_failed()) {
-		return;
-	}
-
 	success = row_purge_remove_sec_if_poss_tree(node, index, entry);
 	/* The delete operation may fail if we have little
 	file space left: TODO: easiest to crash the database
@@ -771,12 +638,6 @@ row_purge_del_mark(
 				node->row, NULL, node->index,
 				heap, ROW_BUILD_FOR_PURGE);
 			row_purge_remove_sec_if_poss(node, node->index, entry);
-
-			if (node->vcol_op_failed()) {
-				mem_heap_free(heap);
-				return false;
-			}
-
 			mem_heap_empty(heap);
 		}
 
@@ -794,8 +655,6 @@ whose old history can no longer be observed.
 @param[in,out]	mtr	mini-transaction (will be started and committed) */
 static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
 {
-	ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S)
-	      || node->vcol_info.is_used());
 	/* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */
 	mtr->start();
 
@@ -836,11 +695,10 @@ static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
 						    rec, index, offsets)));
 
 			index->set_modified(*mtr);
-			if (page_zip_des_t* page_zip
-			    = buf_block_get_page_zip(
-				    btr_pcur_get_block(&node->pcur))) {
+			buf_block_t* block = btr_pcur_get_block(&node->pcur);
+			if (UNIV_LIKELY_NULL(block->page.zip.data)) {
 				page_zip_write_trx_id_and_roll_ptr(
-					page_zip, rec, offsets, trx_id_pos,
+					block, rec, offsets, trx_id_pos,
 					0, 1ULL << ROLL_PTR_INSERT_FLAG_POS,
 					mtr);
 			} else {
@@ -848,8 +706,14 @@ static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
 				byte*	ptr = rec_get_nth_field(
 					rec, offsets, trx_id_pos, &len);
 				ut_ad(len == DATA_TRX_ID_LEN);
-				mlog_write_string(ptr, reset_trx_id,
-						  sizeof reset_trx_id, mtr);
+				size_t offs = page_offset(ptr);
+				mtr->memset(block, offs, DATA_TRX_ID_LEN, 0);
+				offs += DATA_TRX_ID_LEN;
+				mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+							       block->frame
+							       + offs, 0x80U);
+				mtr->memset(block, offs + 1,
+					    DATA_ROLL_PTR_LEN - 1, 0);
 			}
 		}
 	}
@@ -872,8 +736,6 @@ row_purge_upd_exist_or_extern_func(
 {
 	mem_heap_t*	heap;
 
-	ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S)
-	      || node->vcol_info.is_used());
 	ut_ad(!node->table->skip_alter_undo);
 
 	if (node->rec_type == TRX_UNDO_UPD_DEL_REC
@@ -901,11 +763,6 @@ row_purge_upd_exist_or_extern_func(
 				heap, ROW_BUILD_FOR_PURGE);
 			row_purge_remove_sec_if_poss(node, node->index, entry);
 
-			if (node->vcol_op_failed()) {
-				ut_ad(!node->table);
-				mem_heap_free(heap);
-				return;
-			}
 			ut_ad(node->table);
 
 			mem_heap_empty(heap);
@@ -928,12 +785,11 @@ skip_secondaries:
 		if (dfield_is_ext(&ufield->new_val)) {
 			trx_rseg_t*	rseg;
 			buf_block_t*	block;
-			ulint		internal_offset;
 			byte*		data_field;
-			ibool		is_insert;
+			bool		is_insert;
 			ulint		rseg_id;
-			ulint		page_no;
-			ulint		offset;
+			uint32_t	page_no;
+			uint16_t	offset;
 
 			/* We use the fact that new_val points to
 			undo_rec and get thus the offset of
@@ -941,7 +797,7 @@ skip_secondaries:
 			can calculate from node->roll_ptr the file
 			address of the new_val data */
 
-			internal_offset = ulint(
+			const uint16_t internal_offset = uint16_t(
 				static_cast<const byte*>
 				(dfield_get_data(&ufield->new_val))
 				- undo_rec);
@@ -993,7 +849,7 @@ skip_secondaries:
 				index,
 				data_field + dfield_get_len(&ufield->new_val)
 				- BTR_EXTERN_FIELD_REF_SIZE,
-				NULL, NULL, NULL, 0, false, &mtr);
+				NULL, NULL, block, 0, false, &mtr);
 			mtr.commit();
 		}
 	}
@@ -1009,25 +865,27 @@ skip_secondaries:
 	row_purge_upd_exist_or_extern_func(node,undo_rec)
 #endif /* UNIV_DEBUG */
 
-/***********************************************************//**
-Parses the row reference and other info in a modify undo log record.
+/** Parses the row reference and other info in a modify undo log record.
+@param[in,out]	node		row purge node
+@param[in]	undo_rec	record to purge
+@param[in]	thr		query thread
+@param[out]	updated_extern	true if an externally stored field was
+				updated
 @return true if purge operation required */
 static
 bool
 row_purge_parse_undo_rec(
-/*=====================*/
-	purge_node_t*		node,		/*!< in: row undo node */
-	trx_undo_rec_t*		undo_rec,	/*!< in: record to purge */
-	bool*			updated_extern, /*!< out: true if an externally
-						stored field was updated */
-	que_thr_t*		thr)		/*!< in: query thread */
+	purge_node_t*		node,
+	trx_undo_rec_t*		undo_rec,
+	que_thr_t*		thr,
+	bool*			updated_extern)
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
 	undo_no_t	undo_no;
 	table_id_t	table_id;
 	roll_ptr_t	roll_ptr;
-	ulint		info_bits;
+	byte		info_bits;
 	ulint		type;
 
 	ut_ad(node != NULL);
@@ -1052,7 +910,7 @@ row_purge_parse_undo_rec(
 		break;
 	default:
 #ifdef UNIV_DEBUG
-		ut_ad(!"unknown undo log record type");
+		ut_ad("unknown undo log record type" == 0);
 		return false;
 	case TRX_UNDO_UPD_DEL_REC:
 	case TRX_UNDO_UPD_EXIST_REC:
@@ -1067,28 +925,27 @@ row_purge_parse_undo_rec(
 		return false;
 	}
 
-	/* Prevent DROP TABLE etc. from running when we are doing the purge
-	for this row */
+	trx_id_t trx_id = TRX_ID_MAX;
 
-try_again:
-	rw_lock_s_lock_inline(&dict_sys.latch, 0, __FILE__, __LINE__);
+	if (node->retain_mdl(table_id)) {
+		ut_ad(node->table != NULL);
+		goto already_locked;
+	}
 
+try_again:
 	node->table = dict_table_open_on_id(
-		table_id, FALSE, DICT_TABLE_OP_NORMAL);
+		table_id, false, DICT_TABLE_OP_NORMAL, node->purge_thd,
+		&node->mdl_ticket);
 
-	trx_id_t trx_id = TRX_ID_MAX;
-
-	if (node->table == NULL) {
-		/* The table has been dropped: no need to do purge */
+	if (node->table == NULL || node->table->name.is_temporary()) {
+		/* The table has been dropped: there is nothing to purge, and
+		the MDL was already released as part of the open process. */
 		goto err_exit;
 	}
 
+already_locked:
 	ut_ad(!node->table->is_temporary());
 
-	if (!fil_table_accessible(node->table)) {
-		goto inaccessible;
-	}
-
 	switch (type) {
 	case TRX_UNDO_INSERT_METADATA:
 	case TRX_UNDO_INSERT_REC:
@@ -1101,19 +958,13 @@ try_again:
 		/* Need server fully up for virtual column computation */
 		if (!mysqld_server_started) {
 
-			dict_table_close(node->table, FALSE, FALSE);
-			rw_lock_s_unlock(&dict_sys.latch);
-			if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+			node->close_table();
+			if (srv_shutdown_state > SRV_SHUTDOWN_NONE) {
 				return(false);
 			}
 			os_thread_sleep(1000000);
 			goto try_again;
 		}
-
-		node->vcol_info.set_requested();
-		node->vcol_info.set_used();
-		node->vcol_info.set_table(innobase_init_vc_templ(node->table));
-		node->vcol_info.set_used();
 	}
 
 	clust_index = dict_table_get_first_index(node->table);
@@ -1122,21 +973,20 @@ try_again:
 		/* The table was corrupt in the data dictionary.
 		dict_set_corrupted() works on an index, and
 		we do not have an index to call it with. */
-inaccessible:
 		DBUG_ASSERT(table_id == node->table->id);
 		trx_id = node->table->def_trx_id;
 		if (!trx_id) {
 			trx_id = TRX_ID_MAX;
 		}
 
-		dict_table_close(node->table, FALSE, FALSE);
-		node->table = NULL;
 err_exit:
-		rw_lock_s_unlock(&dict_sys.latch);
+		node->close_table();
 		node->skip(table_id, trx_id);
 		return(false);
 	}
 
+	node->last_table_id = table_id;
+
 	if (type == TRX_UNDO_INSERT_METADATA) {
 		node->ref = &trx_undo_metadata;
 		return(true);
@@ -1169,20 +1019,21 @@ err_exit:
 	return(true);
 }
 
-/***********************************************************//**
-Purges the parsed record.
+/** Purges the parsed record.
+@param[in]	node		row purge node
+@param[in]	undo_rec	record to purge
+@param[in]	thr		query thread
+@param[in]	updated_extern	whether external columns were updated
 @return true if purged, false if skipped */
 static MY_ATTRIBUTE((nonnull, warn_unused_result))
 bool
 row_purge_record_func(
-/*==================*/
-	purge_node_t*	node,		/*!< in: row purge node */
-	trx_undo_rec_t*	undo_rec,	/*!< in: record to purge */
+	purge_node_t*	node,
+	trx_undo_rec_t*	undo_rec,
 #if defined UNIV_DEBUG || defined WITH_WSREP
-	const que_thr_t*thr,		/*!< in: query thread */
+	const que_thr_t*thr,
 #endif /* UNIV_DEBUG || WITH_WSREP */
-	bool		updated_extern)	/*!< in: whether external columns
-					were updated */
+	bool		updated_extern)
 {
 	dict_index_t*	clust_index;
 	bool		purged		= true;
@@ -1229,11 +1080,6 @@ row_purge_record_func(
 		node->found_clust = FALSE;
 	}
 
-	if (node->table != NULL) {
-		dict_table_close(node->table, FALSE, FALSE);
-		node->table = NULL;
-	}
-
 	return(purged);
 }
 
@@ -1261,20 +1107,13 @@ row_purge(
 		bool	updated_extern;
 
 		while (row_purge_parse_undo_rec(
-			       node, undo_rec, &updated_extern, thr)) {
+			       node, undo_rec, thr, &updated_extern)) {
 
 			bool purged = row_purge_record(
 				node, undo_rec, thr, updated_extern);
 
-			if (!node->vcol_info.is_used()) {
-				rw_lock_s_unlock(&dict_sys.latch);
-			}
-
-			ut_ad(!rw_lock_own(&dict_sys.latch, RW_LOCK_S));
-
 			if (purged
-			    || srv_shutdown_state > SRV_SHUTDOWN_INITIATED
-			    || node->vcol_op_failed()) {
+			    || srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
 				return;
 			}
 
@@ -1314,47 +1153,22 @@ row_purge_step(
 
 	node->start();
 
-#ifdef UNIV_DEBUG
-	srv_slot_t *slot = thr->thread_slot;
-	ut_ad(slot);
-
-	rw_lock_x_lock(&slot->debug_sync_lock);
-	while (UT_LIST_GET_LEN(slot->debug_sync)) {
-		srv_slot_t::debug_sync_t *sync =
-					UT_LIST_GET_FIRST(slot->debug_sync);
-		bool result = debug_sync_set_action(current_thd,
-						    sync->str,
-						    strlen(sync->str));
-		ut_a(!result);
-
-		UT_LIST_REMOVE(slot->debug_sync, sync);
-		ut_free(sync);
-	}
-	rw_lock_x_unlock(&slot->debug_sync_lock);
-#endif
-
-	if (!(node->undo_recs == NULL || ib_vector_is_empty(node->undo_recs))) {
-		trx_purge_rec_t*purge_rec;
-
-		purge_rec = static_cast<trx_purge_rec_t*>(
-			ib_vector_pop(node->undo_recs));
+	if (!node->undo_recs.empty()) {
+		trx_purge_rec_t purge_rec = node->undo_recs.front();
+		node->undo_recs.pop();
+		node->roll_ptr = purge_rec.roll_ptr;
 
-		node->roll_ptr = purge_rec->roll_ptr;
+		row_purge(node, purge_rec.undo_rec, thr);
 
-		row_purge(node, purge_rec->undo_rec, thr);
-
-		if (ib_vector_is_empty(node->undo_recs)) {
+		if (node->undo_recs.empty()) {
 			row_purge_end(thr);
 		} else {
 			thr->run_node = node;
-			node->vcol_info.reset();
 		}
 	} else {
 		row_purge_end(thr);
 	}
 
-	innobase_reset_background_thd(thr_get_trx(thr)->mysql_thd);
-
 	return(thr);
 }
 
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
index 41240a73365..f106cc8a39d 100644
--- a/storage/innobase/row/row0quiesce.cc
+++ b/storage/innobase/row/row0quiesce.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -525,29 +525,30 @@ row_quiesce_table_start(
 	}
 
 	for (ulint count = 0;
-	     ibuf_merge_space(table->space_id) != 0
-	     && !trx_is_interrupted(trx);
+	     ibuf_merge_space(table->space_id);
 	     ++count) {
+		if (trx_is_interrupted(trx)) {
+			goto aborted;
+		}
 		if (!(count % 20)) {
 			ib::info() << "Merging change buffer entries for "
 				<< table->name;
 		}
 	}
 
-	if (!trx_is_interrupted(trx)) {
-		{
-			FlushObserver observer(table->space, trx, NULL);
-			buf_LRU_flush_or_remove_pages(table->space_id,
-						      &observer);
-		}
-
+	while (buf_flush_list_space(table->space)) {
 		if (trx_is_interrupted(trx)) {
+			goto aborted;
+		}
+	}
 
-			ib::warn() << "Quiesce aborted!";
-
-		} else if (row_quiesce_write_cfg(table, trx->mysql_thd)
-			   != DB_SUCCESS) {
+	if (!trx_is_interrupted(trx)) {
+		/* Ensure that all asynchronous IO is completed. */
+		os_aio_wait_until_no_pending_writes();
+		table->space->flush<false>();
 
+		if (row_quiesce_write_cfg(table, trx->mysql_thd)
+		    != DB_SUCCESS) {
 			ib::warn() << "There was an error writing to the"
 				" meta data file";
 		} else {
@@ -555,6 +556,7 @@ row_quiesce_table_start(
 				<< " flushed to disk";
 		}
 	} else {
+aborted:
 		ib::warn() << "Quiesce aborted!";
 	}
 
diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc
index f0e5385be85..7e70341a20e 100644
--- a/storage/innobase/row/row0row.cc
+++ b/storage/innobase/row/row0row.cc
@@ -437,7 +437,6 @@ row_build_low(
 	ut_ad(rec != NULL);
 	ut_ad(heap != NULL);
 	ut_ad(dict_index_is_clust(index));
-	ut_ad(!mutex_own(&trx_sys.mutex));
 	ut_ad(!col_map || col_table);
 
 	if (!offsets) {
@@ -768,7 +767,7 @@ row_rec_to_index_entry_impl(
 	      (missing merge_threshold column) is acceptable. */
 	      || (!index->table->is_temporary()
 		  && index->table->id == DICT_INDEXES_ID
-		  && rec_len == dict_index_get_n_fields(index) - 1));
+		  && rec_len + 1 == dict_index_get_n_fields(index)));
 
 	ulint i;
 	for (i = 0; i < (mblob ? index->first_user_field() : rec_len);
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 781769dbf04..1dae2edbc01 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -120,7 +120,7 @@ row_sel_sec_rec_is_for_blob(
 		    field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) {
 		/* The externally stored field was not written yet.
 		This record should only be seen by
-		recv_recovery_rollback_active() or any
+		trx_rollback_recovered() or any
 		TRX_ISO_READ_UNCOMMITTED transactions. */
 		return false;
 	}
@@ -599,7 +599,7 @@ row_sel_fetch_columns(
 				externally stored field was not
 				written yet. This record
 				should only be seen by
-				recv_recovery_rollback_active() or any
+				trx_rollback_recovered() or any
 				TRX_ISO_READ_UNCOMMITTED
 				transactions. The InnoDB SQL parser
 				(the sole caller of this function)
@@ -1022,27 +1022,17 @@ row_sel_get_clust_rec(
 
 	if (!node->read_view) {
 		/* Try to place a lock on the index record */
-		ulint	lock_type;
-		trx_t*	trx;
-
-		trx = thr_get_trx(thr);
+		trx_t* trx = thr_get_trx(thr);
 
-		/* If innodb_locks_unsafe_for_binlog option is used
-		or this session is using READ COMMITTED or lower isolation level
+		/* At READ UNCOMMITTED or READ COMMITTED isolation level
 		we lock only the record, i.e., next-key locking is
 		not used. */
-		if (srv_locks_unsafe_for_binlog
-		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
-			lock_type = LOCK_REC_NOT_GAP;
-		} else {
-			lock_type = LOCK_ORDINARY;
-		}
-
 		err = lock_clust_rec_read_check_and_lock(
 			0, btr_pcur_get_block(&plan->clust_pcur),
 			clust_rec, index, offsets,
-			static_cast<lock_mode>(node->row_lock_mode),
-			lock_type,
+			node->row_lock_mode,
+			trx->isolation_level <= TRX_ISO_READ_COMMITTED
+			? LOCK_REC_NOT_GAP : LOCK_ORDINARY,
 			thr);
 
 		switch (err) {
@@ -1135,8 +1125,8 @@ sel_set_rtr_rec_lock(
 	const rec_t*		first_rec,/*!< in: record */
 	dict_index_t*		index,	/*!< in: index */
 	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec, index) */
-	ulint			mode,	/*!< in: lock mode */
-	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		mode,	/*!< in: lock mode */
+	unsigned		type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOC_REC_NOT_GAP */
 	que_thr_t*		thr,	/*!< in: query thread */
 	mtr_t*			mtr)	/*!< in: mtr */
@@ -1193,7 +1183,7 @@ re_scan:
 
 			/* MDEV-14059 FIXME: why re-latch the block?
 			pcur is already positioned on it! */
-			ulint		page_no = page_get_page_no(
+			uint32_t page_no = page_get_page_no(
 				btr_pcur_get_page(pcur));
 
 			cur_block = buf_page_get_gen(
@@ -1302,8 +1292,8 @@ sel_set_rec_lock(
 	const rec_t*		rec,	/*!< in: record */
 	dict_index_t*		index,	/*!< in: index */
 	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec, index) */
-	ulint			mode,	/*!< in: lock mode */
-	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+	unsigned		mode,	/*!< in: lock mode */
+	unsigned		type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOC_REC_NOT_GAP */
 	que_thr_t*		thr,	/*!< in: query thread */
 	mtr_t*			mtr)	/*!< in: mtr */
@@ -1316,11 +1306,9 @@ sel_set_rec_lock(
 
 	trx = thr_get_trx(thr);
 
-	if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
-		if (buf_LRU_buf_pool_running_out()) {
-
-			return(DB_LOCK_TABLE_FULL);
-		}
+	if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000
+	    && buf_pool.running_out()) {
+		return DB_LOCK_TABLE_FULL;
 	}
 
 	if (dict_index_is_clust(index)) {
@@ -1781,7 +1769,7 @@ rec_loop:
 
 		if (!consistent_read) {
 			rec_t*	next_rec = page_rec_get_next(rec);
-			ulint	lock_type;
+			unsigned lock_type;
 			trx_t*	trx;
 
 			trx = thr_get_trx(thr);
@@ -1790,16 +1778,11 @@ rec_loop:
 						  index->n_core_fields,
 						  ULINT_UNDEFINED, &heap);
 
-			/* If innodb_locks_unsafe_for_binlog option is used
-			or this session is using READ COMMITTED or lower isolation
-			level, we lock only the record, i.e., next-key
-			locking is not used. */
-			if (srv_locks_unsafe_for_binlog
-			    || trx->isolation_level
-			    <= TRX_ISO_READ_COMMITTED) {
-
+			/* At READ UNCOMMITTED or READ COMMITTED
+			isolation level, we lock only the record,
+			i.e., next-key locking is not used. */
+			if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
 				if (page_rec_is_supremum(next_rec)) {
-
 					goto skip_lock;
 				}
 
@@ -1849,7 +1832,7 @@ skip_lock:
 
 	if (!consistent_read) {
 		/* Try to place a lock on the index record */
-		ulint	lock_type;
+		unsigned lock_type;
 		trx_t*	trx;
 
 		offsets = rec_get_offsets(rec, index, offsets,
@@ -1858,12 +1841,10 @@ skip_lock:
 
 		trx = thr_get_trx(thr);
 
-		/* If innodb_locks_unsafe_for_binlog option is used
-		or this session is using READ COMMITTED or lower isolation level,
+		/* At READ UNCOMMITTED or READ COMMITTED isolation level,
 		we lock only the record, i.e., next-key locking is
 		not used. */
-		if (srv_locks_unsafe_for_binlog
-		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED
+		if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
 		    || dict_index_is_spatial(index)) {
 
 			if (page_rec_is_supremum(rec)) {
@@ -2796,7 +2777,7 @@ row_sel_field_store_in_mysql_format_func(
 		}
 
 		/* Copy the actual data */
-		ut_memcpy(dest, data, len);
+		memcpy(dest, data, len);
 
 		/* Pad with trailing spaces. */
 
@@ -2988,7 +2969,7 @@ row_sel_store_mysql_field(
 		if (UNIV_UNLIKELY(!data)) {
 			/* The externally stored field was not written
 			yet. This record should only be seen by
-			recv_recovery_rollback_active() or any
+			trx_rollback_recovered() or any
 			TRX_ISO_READ_UNCOMMITTED transactions. */
 
 			if (heap != prebuilt->blob_heap) {
@@ -3022,8 +3003,15 @@ row_sel_store_mysql_field(
 			MEM_CHECK_DEFINED(prebuilt->default_rec
 					  + templ->mysql_col_offset,
 					  templ->mysql_col_len);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
 			mysql_rec[templ->mysql_null_byte_offset]
 				|= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 			memcpy(mysql_rec + templ->mysql_col_offset,
 			       (const byte*) prebuilt->default_rec
 			       + templ->mysql_col_offset,
@@ -3066,7 +3054,7 @@ row_sel_store_mysql_field(
 		/* It is a nullable column with a non-NULL
 		value */
 		mysql_rec[templ->mysql_null_byte_offset]
-			&= ~(byte) templ->mysql_null_bit_mask;
+			&= static_cast<byte>(~templ->mysql_null_bit_mask);
 	}
 
 	DBUG_RETURN(TRUE);
@@ -3118,8 +3106,15 @@ static bool row_sel_store_mysql_rec(
 			    || !prebuilt->read_just_key) {
 				/* Initialize the NULL bit. */
 				if (templ->mysql_null_bit_mask) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
 					mysql_rec[templ->mysql_null_byte_offset]
 						|= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 				}
 				continue;
 			}
@@ -3139,8 +3134,15 @@ static bool row_sel_store_mysql_rec(
 			}
 
 			if (dfield->len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
 				mysql_rec[templ->mysql_null_byte_offset]
 				|= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 				memcpy(mysql_rec
 				+ templ->mysql_col_offset,
 				(const byte*) prebuilt->default_rec
@@ -3153,8 +3155,9 @@ static bool row_sel_store_mysql_rec(
 				(const byte*)dfield->data, dfield->len);
 				if (templ->mysql_null_bit_mask) {
 					mysql_rec[
-					templ->mysql_null_byte_offset]
-					&= ~(byte) templ->mysql_null_bit_mask;
+						templ->mysql_null_byte_offset]
+						&= static_cast<byte>
+						(~templ->mysql_null_bit_mask);
 				}
 			}
 
@@ -3386,7 +3389,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
 			same as btr_pcur_get_block(prebuilt->pcur),
 			and is it not unsafe to use RW_NO_LATCH here? */
 			buf_block_t*	block = buf_page_get_gen(
-				btr_pcur_get_block(prebuilt->pcur)->page.id,
+				btr_pcur_get_block(prebuilt->pcur)->page.id(),
 				btr_pcur_get_block(prebuilt->pcur)->zip_size(),
 				RW_NO_LATCH, NULL, BUF_GET,
 				__FILE__, __LINE__, mtr, &err);
@@ -3454,7 +3457,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
 		err = lock_clust_rec_read_check_and_lock(
 			0, btr_pcur_get_block(prebuilt->clust_pcur),
 			clust_rec, clust_index, *offsets,
-			static_cast<lock_mode>(prebuilt->select_lock_type),
+			prebuilt->select_lock_type,
 			LOCK_REC_NOT_GAP,
 			thr);
 
@@ -3481,14 +3484,11 @@ Row_sel_get_clust_rec_for_mysql::operator()(
 			const buf_page_t& bpage = btr_pcur_get_block(
 				prebuilt->clust_pcur)->page;
 
-			lsn_t lsn = bpage.newest_modification;
-			if (!lsn) {
-				lsn = mach_read_from_8(
-					page_align(clust_rec) + FIL_PAGE_LSN);
-			}
+			const lsn_t lsn = mach_read_from_8(
+				page_align(clust_rec) + FIL_PAGE_LSN);
 
 			if (lsn != cached_lsn
-			    || bpage.id != cached_page_id
+			    || bpage.id() != cached_page_id
 			    || clust_rec != cached_clust_rec) {
 				/* The following call returns 'offsets' associated with
 				'old_vers' */
@@ -3502,7 +3502,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
 					goto err_exit;
 				}
 				cached_lsn = lsn;
-				cached_page_id = bpage.id;
+				cached_page_id = bpage.id();
 				cached_clust_rec = clust_rec;
 				cached_old_vers = old_vers;
 			} else {
@@ -3588,17 +3588,14 @@ record with the same ordering prefix in in the B-tree index
 @param[in] latch_mode latch mode wished in restoration
 @param[in] pcur cursor whose position has been stored
 @param[in] moves_up true if the cursor moves up in the index
-@param[in] mtr mtr; CAUTION: may commit mtr temporarily!
-@param[in] select_lock_type select lock type: LOCK_NONE, LOCK_S, or LOCK_X
+@param[in,out] mtr mtr; CAUTION: may commit mtr temporarily!
 @return true if we may need to process the record the cursor is now
 positioned on (i.e. we should not go to the next record yet) */
 static bool sel_restore_position_for_mysql(bool *same_user_rec,
                                            ulint latch_mode, btr_pcur_t *pcur,
-                                           bool moves_up, mtr_t *mtr,
-                                           ulint select_lock_type)
+                                           bool moves_up, mtr_t *mtr)
 {
-	btr_pcur_t::restore_status status = btr_pcur_restore_position(
-	    latch_mode, pcur, mtr);
+	auto status = btr_pcur_restore_position(latch_mode, pcur, mtr);
 
 	*same_user_rec = status == btr_pcur_t::SAME_ALL;
 
@@ -3619,8 +3616,7 @@ static bool sel_restore_position_for_mysql(bool *same_user_rec,
 	switch (pcur->rel_pos) {
 	case BTR_PCUR_ON:
 		if (!*same_user_rec && moves_up) {
-			if (status == btr_pcur_t::SAME_UNIQ
-			    && select_lock_type != LOCK_NONE)
+			if (status == btr_pcur_t::SAME_UNIQ)
 			  return true;
 next:
 			if (btr_pcur_move_to_next(pcur, mtr)
@@ -3711,7 +3707,7 @@ row_sel_copy_cached_field_for_mysql(
 		len = templ->mysql_col_len;
 	}
 
-	ut_memcpy(buf, cache, len);
+	memcpy(buf, cache, len);
 }
 
 /** Copy used fields from cached row.
@@ -3741,10 +3737,17 @@ row_sel_copy_cached_fields_for_mysql(
 		/* Copy NULL bit of the current field from cached_rec
 		to buf */
 		if (templ->mysql_null_bit_mask) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
 			buf[templ->mysql_null_byte_offset]
 				^= (buf[templ->mysql_null_byte_offset]
 				    ^ cached_rec[templ->mysql_null_byte_offset])
 				& (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
 		}
 	}
 }
@@ -3778,7 +3781,7 @@ row_sel_dequeue_cached_row_for_mysql(
 		MEM_UNDEFINED(buf, prebuilt->mysql_prefix_len);
 
 		/* First copy the NULL bits. */
-		ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
+		memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
 		/* Then copy the requested fields. */
 
 		for (i = 0; i < prebuilt->n_template; i++) {
@@ -3795,7 +3798,7 @@ row_sel_dequeue_cached_row_for_mysql(
 				buf, cached_rec, templ);
 		}
 	} else {
-		ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
+		memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
 	}
 
 	prebuilt->n_fetch_cached--;
@@ -3878,9 +3881,8 @@ row_sel_enqueue_cache_row_for_mysql(
 	next fetch cache slot. */
 
 	if (prebuilt->pk_filter || prebuilt->idx_cond) {
-		byte*	dest = row_sel_fetch_last_buf(prebuilt);
-
-		ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
+		memcpy(row_sel_fetch_last_buf(prebuilt), mysql_rec,
+		       prebuilt->mysql_row_len);
 	}
 
 	++prebuilt->n_fetch_cached;
@@ -3912,7 +3914,7 @@ row_sel_try_search_shortcut_for_mysql(
 	ut_ad(dict_index_is_clust(index));
 	ut_ad(!prebuilt->templ_contains_blob);
 
-	rw_lock_t* ahi_latch = btr_get_search_latch(index);
+	rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
 	rw_lock_s_lock(ahi_latch);
 	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
 				   BTR_SEARCH_LEAF, pcur, ahi_latch, mtr);
@@ -4162,7 +4164,7 @@ rec_field_len_in_chars(
 		return SIZE_T_MAX;
 	}
 
-	return(cs->cset->numchars(cs, rec_field, rec_field + rec_field_len));
+	return cs->numchars(rec_field, rec_field + rec_field_len);
 }
 
 /** Avoid the clustered index lookup if all the following conditions
@@ -4297,29 +4299,18 @@ row_search_mvcc(
 	const rec_t*	result_rec = NULL;
 	const rec_t*	clust_rec;
 	Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql;
-	dberr_t		err				= DB_SUCCESS;
 	ibool		unique_search			= FALSE;
-	ibool		mtr_has_extra_clust_latch	= FALSE;
+	ulint		mtr_extra_clust_savepoint	= 0;
 	bool		moves_up			= false;
-	ibool		set_also_gap_locks		= TRUE;
-	/* if the query is a plain locking SELECT, and the isolation level
-	is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
-	ibool		did_semi_consistent_read	= FALSE;
 	/* if the returned record was locked and we did a semi-consistent
 	read (fetch the newest committed version), then this is set to
 	TRUE */
 	ulint		next_offs;
 	bool		same_user_rec;
-	mtr_t		mtr;
-	mem_heap_t*	heap				= NULL;
-	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
-	rec_offs*	offsets				= offsets_;
 	ibool		table_lock_waited		= FALSE;
 	byte*		next_buf			= 0;
 	bool		spatial_search			= false;
 
-	rec_offs_init(offsets_);
-
 	ut_ad(index && pcur && search_tuple);
 	ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
 	ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
@@ -4352,8 +4343,8 @@ row_search_mvcc(
 	bool    need_vrow = dict_index_has_virtual(prebuilt->index)
 		&& prebuilt->read_just_key;
 
-	/* Reset the new record lock info if srv_locks_unsafe_for_binlog
-	is set or session is using a READ COMMITTED isolation level. Then
+	/* Reset the new record lock info if READ UNCOMMITTED or
+	READ COMMITTED isolation level is used. Then
 	we are able to remove the record locks set here on an individual
 	row. */
 	prebuilt->new_rec_locks = 0;
@@ -4396,20 +4387,18 @@ row_search_mvcc(
 			row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
 
 			prebuilt->n_rows_fetched++;
-
-			err = DB_SUCCESS;
-			goto func_exit;
+			trx->op_info = "";
+			DBUG_RETURN(DB_SUCCESS);
 		}
 
 		if (prebuilt->fetch_cache_first > 0
 		    && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
-
+early_not_found:
 			/* The previous returned row was popped from the fetch
 			cache, but the cache was not full at the time of the
 			popping: no more rows can exist in the result set */
-
-			err = DB_RECORD_NOT_FOUND;
-			goto func_exit;
+			trx->op_info = "";
+			DBUG_RETURN(DB_RECORD_NOT_FOUND);
 		}
 
 		prebuilt->n_rows_fetched++;
@@ -4453,22 +4442,28 @@ row_search_mvcc(
 
 		if (UNIV_UNLIKELY(direction != 0
 				  && !prebuilt->used_in_HANDLER)) {
-
-			err = DB_RECORD_NOT_FOUND;
-			goto func_exit;
+			goto early_not_found;
 		}
 	}
 
 	/* We don't support sequencial scan for Rtree index, because it
 	is no meaning to do so. */
-	if (dict_index_is_spatial(index)
-		&& !RTREE_SEARCH_MODE(mode)) {
-		err = DB_END_OF_INDEX;
-		goto func_exit;
+	if (dict_index_is_spatial(index) && !RTREE_SEARCH_MODE(mode)) {
+		trx->op_info = "";
+		DBUG_RETURN(DB_END_OF_INDEX);
 	}
 
+	/* if the returned record was locked and we did a semi-consistent
+	read (fetch the newest committed version), then this is set to true */
+	bool did_semi_consistent_read = false;
+	mtr_t mtr;
 	mtr.start();
 
+	mem_heap_t*	heap				= NULL;
+	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs*	offsets				= offsets_;
+	rec_offs_init(offsets_);
+
 #ifdef BTR_CUR_HASH_ADAPT
 	/*-------------------------------------------------------------*/
 	/* PHASE 2: Try fast adaptive hash index search if possible */
@@ -4498,6 +4493,7 @@ row_search_mvcc(
 			let us try a search shortcut through the hash
 			index. */
 
+			dberr_t err = DB_SUCCESS;
 			switch (row_sel_try_search_shortcut_for_mysql(
 					&rec, prebuilt, &offsets, &heap,
 					&mtr)) {
@@ -4514,16 +4510,20 @@ row_search_mvcc(
 							buf, prebuilt,
 							rec, offsets)) {
 					case CHECK_ABORTED_BY_USER:
-						err = DB_INTERRUPTED;
-						mtr.commit();
-						goto func_exit;
+						goto aborted;
 					case CHECK_NEG:
 					case CHECK_OUT_OF_RANGE:
                                         case CHECK_ERROR:
-						goto shortcut_mismatch;
+						err = DB_RECORD_NOT_FOUND;
+						goto shortcut_done;
 					case CHECK_POS:
-						goto shortcut_match;
+						goto shortcut_done;
 					}
+
+					ut_ad("incorrect code" == 0);
+aborted:
+					err = DB_INTERRUPTED;
+					goto shortcut_done;
 				}
 
 				if (!row_sel_store_mysql_rec(
@@ -4545,20 +4545,22 @@ row_search_mvcc(
 					break;
 				}
 
-			shortcut_match:
-				mtr.commit();
-				/* NOTE that we do NOT store the cursor
-				position */
-				err = DB_SUCCESS;
-				goto func_exit;
+				goto shortcut_done;
 
 			case SEL_EXHAUSTED:
-			shortcut_mismatch:
+				err = DB_RECORD_NOT_FOUND;
+			shortcut_done:
 				mtr.commit();
+
 				/* NOTE that we do NOT store the cursor
 				position */
-				err = DB_RECORD_NOT_FOUND;
-				goto func_exit;
+				trx->op_info = "";
+				ut_ad(!sync_check_iterate(sync_check()));
+				ut_ad(!did_semi_consistent_read);
+				if (UNIV_LIKELY_NULL(heap)) {
+					mem_heap_free(heap);
+				}
+				DBUG_RETURN(err);
 
 			case SEL_RETRY:
 				break;
@@ -4596,22 +4598,15 @@ row_search_mvcc(
 	      || prebuilt->table->no_rollback()
 	      || srv_read_only_mode);
 
-	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
-	    && prebuilt->select_lock_type != LOCK_NONE
-	    && trx->mysql_thd != NULL
-	    && thd_is_select(trx->mysql_thd)) {
-		/* It is a plain locking SELECT and the isolation
-		level is low: do not lock gaps */
-
-		set_also_gap_locks = FALSE;
-	}
+	/* Do not lock gaps at READ UNCOMMITTED or READ COMMITTED
+	isolation level */
+	const bool set_also_gap_locks =
+		prebuilt->select_lock_type != LOCK_NONE
+		&& trx->isolation_level > TRX_ISO_READ_COMMITTED
 #ifdef WITH_WSREP
-	else if (wsrep_thd_skip_locking(trx->mysql_thd)) {
-		ut_ad(!strcmp(wsrep_get_sr_table_name(),
-			      prebuilt->table->name.m_name));
-		set_also_gap_locks = FALSE;
-	}
+		&& !wsrep_thd_skip_locking(trx->mysql_thd)
 #endif /* WITH_WSREP */
+		;
 
 	/* Note that if the search mode was GE or G, then the cursor
 	naturally moves upward (in fetch next) in alphabetical order,
@@ -4628,10 +4623,12 @@ row_search_mvcc(
 
 	thr = que_fork_get_first_thr(prebuilt->sel_graph);
 
-	que_thr_move_to_run_state_for_mysql(thr, trx);
+	thr->start_running();
 
 	clust_index = dict_table_get_first_index(prebuilt->table);
 
+	dberr_t err = DB_SUCCESS;
+
 	/* Do some start-of-statement preparations */
 
 	if (prebuilt->table->no_rollback()) {
@@ -4673,7 +4670,7 @@ wait_table_again:
 
 		bool	need_to_process = sel_restore_position_for_mysql(
 			&same_user_rec, BTR_SEARCH_LEAF,
-			pcur, moves_up, &mtr, prebuilt->select_lock_type);
+			pcur, moves_up, &mtr);
 
 		if (UNIV_UNLIKELY(need_to_process)) {
 			if (UNIV_UNLIKELY(prebuilt->row_read_type
@@ -4700,18 +4697,9 @@ wait_table_again:
 		pcur->btr_cur.thr = thr;
 
 		if (dict_index_is_spatial(index)) {
-			bool	need_pred_lock;
-
-			need_pred_lock = (set_also_gap_locks
-					  && !(srv_locks_unsafe_for_binlog
-					      || trx->isolation_level
-						 <= TRX_ISO_READ_COMMITTED)
-					  && prebuilt->select_lock_type
-						 != LOCK_NONE);
-
 			if (!prebuilt->rtr_info) {
 				prebuilt->rtr_info = rtr_create_rtr_info(
-					need_pred_lock, true,
+					set_also_gap_locks, true,
 					btr_pcur_get_btr_cur(pcur), index);
 				prebuilt->rtr_info->search_tuple = search_tuple;
 				prebuilt->rtr_info->search_mode = mode;
@@ -4720,7 +4708,7 @@ wait_table_again:
 			} else {
 				rtr_info_reinit_in_cursor(
 					btr_pcur_get_btr_cur(pcur),
-					index, need_pred_lock);
+					index, set_also_gap_locks);
 				prebuilt->rtr_info->search_tuple = search_tuple;
 				prebuilt->rtr_info->search_mode = mode;
 			}
@@ -4741,11 +4729,8 @@ wait_table_again:
 		ut_ad(page_rec_is_leaf(rec));
 
 		if (!moves_up
-		    && !page_rec_is_supremum(rec)
 		    && set_also_gap_locks
-		    && !(srv_locks_unsafe_for_binlog
-			 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
-		    && prebuilt->select_lock_type != LOCK_NONE
+		    && !page_rec_is_supremum(rec)
 		    && !dict_index_is_spatial(index)) {
 
 			/* Try to place a gap lock on the next index record
@@ -4825,16 +4810,13 @@ rec_loop:
 	if (page_rec_is_supremum(rec)) {
 
 		if (set_also_gap_locks
-		    && !(srv_locks_unsafe_for_binlog
-			 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
-		    && prebuilt->select_lock_type != LOCK_NONE
 		    && !dict_index_is_spatial(index)) {
 
 			/* Try to place a lock on the index record */
 
-			/* If innodb_locks_unsafe_for_binlog option is used
-			or this session is using a READ COMMITTED or lower isolation
-			level we do not lock gaps. Supremum record is really
+			/* If the transaction isolation level is
+			READ UNCOMMITTED or READ COMMITTED,
+			we do not lock gaps. Supremum record is really
 			a gap and therefore we do not set locks there. */
 
 			offsets = rec_get_offsets(rec, index, offsets,
@@ -4899,14 +4881,13 @@ wrong_offs:
 			ib::error() << "Rec address "
 				<< static_cast<const void*>(rec)
 				<< ", buf block fix count "
-				<< btr_cur_get_block(
-					btr_pcur_get_btr_cur(pcur))->page
-					.buf_fix_count;
+				<< btr_pcur_get_block(pcur)->page
+				.buf_fix_count();
 
 			ib::error() << "Index corruption: rec offs "
 				<< page_offset(rec) << " next offs "
-				<< next_offs << ", page no "
-				<< page_get_page_no(page_align(rec))
+				<< next_offs
+				<< btr_pcur_get_block(pcur)->page.id()
 				<< ", index " << index->name
 				<< " of table " << index->table->name
 				<< ". Run CHECK TABLE. You may need to"
@@ -4922,8 +4903,8 @@ wrong_offs:
 
 			ib::info() << "Index corruption: rec offs "
 				<< page_offset(rec) << " next offs "
-				<< next_offs << ", page no "
-				<< page_get_page_no(page_align(rec))
+				<< next_offs
+				<< btr_pcur_get_block(pcur)->page.id()
 				<< ", index " << index->name
 				<< " of table " << index->table->name
 				<< ". We try to skip the rest of the page.";
@@ -4950,8 +4931,8 @@ wrong_offs:
 
 			ib::error() << "Index corruption: rec offs "
 				<< page_offset(rec) << " next offs "
-				<< next_offs << ", page no "
-				<< page_get_page_no(page_align(rec))
+				<< next_offs
+				<< btr_pcur_get_block(pcur)->page.id()
 				<< ", index " << index->name
 				<< " of table " << index->table->name
 				<< ". We try to skip the record.";
@@ -4974,17 +4955,7 @@ wrong_offs:
 		if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
 
 			if (set_also_gap_locks
-			    && !(srv_locks_unsafe_for_binlog
-				 || trx->isolation_level
-				 <= TRX_ISO_READ_COMMITTED)
-			    && prebuilt->select_lock_type != LOCK_NONE
 			    && !dict_index_is_spatial(index)) {
-
-				/* Try to place a gap lock on the index
-				record only if innodb_locks_unsafe_for_binlog
-				option is not set or this session is not
-				using a READ COMMITTED or lower isolation level. */
-
 				err = sel_set_rec_lock(
 					pcur,
 					rec, index, offsets,
@@ -5019,17 +4990,7 @@ wrong_offs:
 		if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
 
 			if (set_also_gap_locks
-			    && !(srv_locks_unsafe_for_binlog
-				 || trx->isolation_level
-				 <= TRX_ISO_READ_COMMITTED)
-			    && prebuilt->select_lock_type != LOCK_NONE
 			    && !dict_index_is_spatial(index)) {
-
-				/* Try to place a gap lock on the index
-				record only if innodb_locks_unsafe_for_binlog
-				option is not set or this session is not
-				using a READ COMMITTED or lower isolation level. */
-
 				err = sel_set_rec_lock(
 					pcur,
 					rec, index, offsets,
@@ -5069,15 +5030,9 @@ wrong_offs:
 		is a non-delete marked record, then it is enough to lock its
 		existence with LOCK_REC_NOT_GAP. */
 
-		/* If innodb_locks_unsafe_for_binlog option is used
-		or this session is using a READ COMMITTED isolation
-		level we lock only the record, i.e., next-key locking is
-		not used. */
-
-		ulint	lock_type;
+		unsigned lock_type;
 
-		if (srv_locks_unsafe_for_binlog
-		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+		if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
 			/* At READ COMMITTED or READ UNCOMMITTED
 			isolation levels, do not lock committed
 			delete-marked records. */
@@ -5120,8 +5075,16 @@ wrong_offs:
 			goto no_gap_lock;
 		}
 
-		if (!set_also_gap_locks
-		    || (unique_search && !rec_get_deleted_flag(rec, comp))
+#ifdef WITH_WSREP
+		if (UNIV_UNLIKELY(!set_also_gap_locks)) {
+			ut_ad(wsrep_thd_skip_locking(trx->mysql_thd));
+			goto no_gap_lock;
+		}
+#else /* WITH_WSREP */
+		ut_ad(set_also_gap_locks);
+#endif /* WITH_WSREP */
+
+		if ((unique_search && !rec_get_deleted_flag(rec, comp))
 		    || dict_index_is_spatial(index)) {
 
 			goto no_gap_lock;
@@ -5158,9 +5121,7 @@ no_gap_lock:
 		switch (err) {
 			const rec_t*	old_vers;
 		case DB_SUCCESS_LOCKED_REC:
-			if (srv_locks_unsafe_for_binlog
-			    || trx->isolation_level
-			    <= TRX_ISO_READ_COMMITTED) {
+			if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
 				/* Note that a record of
 				prebuilt->index was locked. */
 				prebuilt->new_rec_locks = 1;
@@ -5224,7 +5185,7 @@ no_gap_lock:
 				goto next_rec;
 			}
 
-			did_semi_consistent_read = TRUE;
+			did_semi_consistent_read = true;
 			rec = old_vers;
 			break;
 		case DB_RECORD_NOT_FOUND:
@@ -5391,7 +5352,7 @@ requires_clust_rec:
 		/* It was a non-clustered index and we must fetch also the
 		clustered index record */
 
-		mtr_has_extra_clust_latch = TRUE;
+		mtr_extra_clust_savepoint = mtr.get_savepoint();
 
 		ut_ad(!vrow);
 		/* The following call returns 'offsets' associated with
@@ -5415,9 +5376,7 @@ requires_clust_rec:
 			break;
 		case DB_SUCCESS_LOCKED_REC:
 			ut_a(clust_rec != NULL);
-			if (srv_locks_unsafe_for_binlog
-			     || trx->isolation_level
-			    <= TRX_ISO_READ_COMMITTED) {
+			if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
 				/* Note that the clustered index record
 				was locked. */
 				prebuilt->new_rec_locks = 2;
@@ -5433,8 +5392,7 @@ requires_clust_rec:
 
 			/* The record is delete marked: we can skip it */
 
-			if ((srv_locks_unsafe_for_binlog
-			     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+			if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
 			    && prebuilt->select_lock_type != LOCK_NONE) {
 
 				/* No need to keep a lock on a delete-marked
@@ -5657,7 +5615,7 @@ next_rec:
 			  == ROW_READ_DID_SEMI_CONSISTENT)) {
 		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
 	}
-	did_semi_consistent_read = FALSE;
+	did_semi_consistent_read = false;
 	prebuilt->new_rec_locks = 0;
 	vrow = NULL;
 
@@ -5682,27 +5640,15 @@ next_rec:
 		/* No need to do store restore for R-tree */
 		mtr.commit();
 		mtr.start();
-		mtr_has_extra_clust_latch = FALSE;
-	} else if (mtr_has_extra_clust_latch) {
-		/* If we have extra cluster latch, we must commit
-		mtr if we are moving to the next non-clustered
+		mtr_extra_clust_savepoint = 0;
+	} else if (mtr_extra_clust_savepoint) {
+		/* We must release any clustered index latches
+		if we are moving to the next non-clustered
 		index record, because we could break the latching
 		order if we would access a different clustered
 		index page right away without releasing the previous. */
-
-		btr_pcur_store_position(pcur, &mtr);
-		mtr.commit();
-		mtr_has_extra_clust_latch = FALSE;
-
-		mtr.start();
-
-		if (sel_restore_position_for_mysql(&same_user_rec,
-						   BTR_SEARCH_LEAF,
-						   pcur, moves_up, &mtr,
-						   prebuilt->select_lock_type)
-		    ) {
-			goto rec_loop;
-		}
+		mtr.rollback_to_savepoint(mtr_extra_clust_savepoint);
+		mtr_extra_clust_savepoint = 0;
 	}
 
 	if (moves_up) {
@@ -5758,11 +5704,11 @@ page_read_error:
 			  == ROW_READ_DID_SEMI_CONSISTENT)) {
 		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
 	}
-	did_semi_consistent_read = FALSE;
+	did_semi_consistent_read = false;
 
 lock_table_wait:
 	mtr.commit();
-	mtr_has_extra_clust_latch = FALSE;
+	mtr_extra_clust_savepoint = 0;
 
 	trx->error_state = err;
 
@@ -5791,11 +5737,10 @@ lock_table_wait:
 		if (!dict_index_is_spatial(index)) {
 			sel_restore_position_for_mysql(
 				&same_user_rec, BTR_SEARCH_LEAF, pcur,
-				moves_up, &mtr, prebuilt->select_lock_type);
+				moves_up, &mtr);
 		}
 
-		if ((srv_locks_unsafe_for_binlog
-		     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+		if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
 		    && !same_user_rec) {
 
 			/* Since we were not able to restore the cursor
@@ -5831,10 +5776,10 @@ normal_return:
 	{
 		/* handler_index_cond_check() may pull TR_table search
 		   which initates another row_search_mvcc(). */
-		ulint n_active_thrs= trx->lock.n_active_thrs;
-		trx->lock.n_active_thrs= 1;
-		que_thr_stop_for_mysql_no_error(thr, trx);
-		trx->lock.n_active_thrs= n_active_thrs - 1;
+		ut_d(ulint n_active_thrs= trx->lock.n_active_thrs);
+		ut_d(trx->lock.n_active_thrs= 1);
+		thr->stop_no_error();
+		ut_d(trx->lock.n_active_thrs= n_active_thrs - 1);
 	}
 
 	mtr.commit();
@@ -5873,7 +5818,7 @@ normal_return:
 
 func_exit:
 	trx->op_info = "";
-	if (heap != NULL) {
+	if (UNIV_LIKELY_NULL(heap)) {
 		mem_heap_free(heap);
 	}
 
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
index 6e3fb31808d..82c880a5920 100644
--- a/storage/innobase/row/row0uins.cc
+++ b/storage/innobase/row/row0uins.cc
@@ -100,13 +100,12 @@ row_undo_ins_remove_clust_rec(
 	We must log the removal, so that the row will be correctly
 	purged. However, we can log the removal out of sync with the
 	B-tree modification. */
-
 	ut_a(btr_pcur_restore_position(
-		online
-		? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
-		: (node->rec_type == TRX_UNDO_INSERT_METADATA)
-		? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &node->pcur, &mtr)
-	    == btr_pcur_t::SAME_ALL);
+	      online ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+	      : (node->rec_type == TRX_UNDO_INSERT_METADATA)
+		  ? BTR_MODIFY_TREE
+		  : BTR_MODIFY_LEAF,
+	      &node->pcur, &mtr) == btr_pcur_t::SAME_ALL);
 
 	rec_t* rec = btr_pcur_get_rec(&node->pcur);
 
@@ -132,14 +131,12 @@ row_undo_ins_remove_clust_rec(
 			      == RW_X_LATCH);
 			ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
 
-			dict_drop_index_tree(rec, &node->pcur, node->trx,
-					     &mtr);
+			dict_drop_index_tree(&node->pcur, node->trx, &mtr);
 			mtr.commit();
 
 			mtr.start();
-			ut_a(btr_pcur_restore_position(
-				BTR_MODIFY_LEAF, &node->pcur, &mtr)
-			    == btr_pcur_t::SAME_ALL);
+			ut_a(btr_pcur_restore_position(BTR_MODIFY_LEAF,
+			      &node->pcur, &mtr)== btr_pcur_t::SAME_ALL);
 			break;
 		case DICT_COLUMNS_ID:
 			/* This is rolling back an INSERT into SYS_COLUMNS.
@@ -206,28 +203,7 @@ func_exit:
 	if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) {
 		/* When rolling back the very first instant ADD COLUMN
 		operation, reset the root page to the basic state. */
-		ut_ad(!index->table->is_temporary());
-		if (page_t* root = btr_root_get(index, &mtr)) {
-			byte* page_type = root + FIL_PAGE_TYPE;
-			ut_ad(mach_read_from_2(page_type)
-			      == FIL_PAGE_TYPE_INSTANT
-			      || mach_read_from_2(page_type)
-			      == FIL_PAGE_INDEX);
-			mlog_write_ulint(page_type, FIL_PAGE_INDEX,
-					 MLOG_2BYTES, &mtr);
-			byte* instant = PAGE_INSTANT + PAGE_HEADER + root;
-			mlog_write_ulint(instant,
-					 page_ptr_get_direction(instant + 1),
-					 MLOG_2BYTES, &mtr);
-			rec_t* infimum = page_get_infimum_rec(root);
-			rec_t* supremum = page_get_supremum_rec(root);
-			static const byte str[8 + 8] = "supremuminfimum";
-			if (memcmp(infimum, str + 8, 8)
-			    || memcmp(supremum, str, 8)) {
-				mlog_write_string(infimum, str + 8, 8, &mtr);
-				mlog_write_string(supremum, str, 8, &mtr);
-			}
-		}
+		btr_reset_instant(*index, true, &mtr);
 	}
 
 	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
@@ -400,7 +376,7 @@ static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
 
 	switch (node->rec_type) {
 	default:
-		ut_ad(!"wrong undo record type");
+		ut_ad("wrong undo record type" == 0);
 		goto close_table;
 	case TRX_UNDO_INSERT_METADATA:
 	case TRX_UNDO_INSERT_REC:
@@ -421,7 +397,7 @@ static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
 		goto close_table;
 	}
 
-	if (UNIV_UNLIKELY(!fil_table_accessible(node->table))) {
+	if (UNIV_UNLIKELY(!node->table->is_accessible())) {
 close_table:
 		/* Normally, tables should not disappear or become
 		unaccessible during ROLLBACK, because they should be
@@ -564,7 +540,7 @@ row_undo_ins(
 
 	switch (node->rec_type) {
 	default:
-		ut_ad(!"wrong undo record type");
+		ut_ad("wrong undo record type" == 0);
 		/* fall through */
 	case TRX_UNDO_INSERT_REC:
 		/* Skip the clustered index (the first index) */
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
index 9f08c92a4e5..bea2baa3cd6 100644
--- a/storage/innobase/row/row0umod.cc
+++ b/storage/innobase/row/row0umod.cc
@@ -99,7 +99,7 @@ row_undo_mod_clust_low(
 	pcur = &node->pcur;
 	btr_cur = btr_pcur_get_btr_cur(pcur);
 
-	ut_d(btr_pcur_t::restore_status pcur_restore_result =)
+	ut_d(auto pcur_restore_result =)
 	btr_pcur_restore_position(mode, pcur, mtr);
 
 	ut_ad(pcur_restore_result == btr_pcur_t::SAME_ALL);
@@ -144,36 +144,12 @@ row_undo_mod_clust_low(
 
 		ut_a(!dummy_big_rec);
 
-		static const byte
-			INFIMUM[8] = {'i','n','f','i','m','u','m',0},
-			SUPREMUM[8] = {'s','u','p','r','e','m','u','m'};
-
 		if (err == DB_SUCCESS
 		    && node->ref == &trx_undo_metadata
 		    && btr_cur_get_index(btr_cur)->table->instant
 		    && node->update->info_bits == REC_INFO_METADATA_ADD) {
-			if (page_t* root = btr_root_get(
-				    btr_cur_get_index(btr_cur), mtr)) {
-				byte* infimum;
-				byte *supremum;
-				if (page_is_comp(root)) {
-					infimum = PAGE_NEW_INFIMUM + root;
-					supremum = PAGE_NEW_SUPREMUM + root;
-				} else {
-					infimum = PAGE_OLD_INFIMUM + root;
-					supremum = PAGE_OLD_SUPREMUM + root;
-				}
-
-				ut_ad(!memcmp(infimum, INFIMUM, 8)
-				      == !memcmp(supremum, SUPREMUM, 8));
-
-				if (memcmp(infimum, INFIMUM, 8)) {
-					mlog_write_string(infimum, INFIMUM,
-							  8, mtr);
-					mlog_write_string(supremum, SUPREMUM,
-							  8, mtr);
-				}
-			}
+			btr_reset_instant(*btr_cur_get_index(btr_cur), false,
+					  mtr);
 		}
 	}
 
@@ -237,8 +213,7 @@ static bool row_undo_mod_must_purge(undo_node_t* node, mtr_t* mtr)
 
 	mtr->s_lock(&purge_sys.latch, __FILE__, __LINE__);
 
-	if (!purge_sys.view.changes_visible(node->new_trx_id,
-					    node->table->name)) {
+	if (!purge_sys.changes_visible(node->new_trx_id, node->table->name)) {
 		return false;
 	}
 
@@ -446,8 +421,8 @@ row_undo_mod_clust(
 		}
 		rec_t* rec = btr_pcur_get_rec(pcur);
 		mtr.s_lock(&purge_sys.latch, __FILE__, __LINE__);
-		if (!purge_sys.view.changes_visible(node->new_trx_id,
-						   node->table->name)) {
+		if (!purge_sys.changes_visible(node->new_trx_id,
+					       node->table->name)) {
 			goto mtr_commit_exit;
 		}
 
@@ -474,7 +449,7 @@ row_undo_mod_clust(
 #endif
 		} else if (rec_is_metadata(rec, *index)) {
 			ut_ad(!buf_block_get_page_zip(btr_pcur_get_block(
-							      &node->pcur)));
+							      pcur)));
 			for (unsigned i = index->first_user_field(); i--; ) {
 				trx_id_offset += index->fields[i].fixed_len;
 			}
@@ -495,16 +470,21 @@ row_undo_mod_clust(
 				      rec, dict_table_is_comp(node->table))
 			      || rec_is_alter_metadata(rec, *index));
 			index->set_modified(mtr);
-			if (page_zip_des_t* page_zip = buf_block_get_page_zip(
-				    btr_pcur_get_block(&node->pcur))) {
+			buf_block_t* block = btr_pcur_get_block(pcur);
+			if (UNIV_LIKELY_NULL(block->page.zip.data)) {
 				page_zip_write_trx_id_and_roll_ptr(
-					page_zip, rec, offsets, trx_id_pos,
+					block, rec, offsets, trx_id_pos,
 					0, 1ULL << ROLL_PTR_INSERT_FLAG_POS,
 					&mtr);
 			} else {
-				mlog_write_string(rec + trx_id_offset,
-						  reset_trx_id,
-						  sizeof reset_trx_id, &mtr);
+				size_t offs = page_offset(rec + trx_id_offset);
+				mtr.memset(block, offs, DATA_TRX_ID_LEN, 0);
+				offs += DATA_TRX_ID_LEN;
+				mtr.write<1,mtr_t::MAYBE_NOP>(*block,
+							      block->frame
+							      + offs, 0x80U);
+				mtr.memset(block, offs + 1,
+					   DATA_ROLL_PTR_LEN - 1, 0);
 			}
 		}
 	} else {
@@ -618,9 +598,8 @@ row_undo_mod_del_mark_or_remove_sec_low(
 	    || row_vers_old_has_index_entry(
 		    false, btr_pcur_get_rec(&node->pcur),
 		    &mtr_vers, index, entry, 0, 0)) {
-		err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
-						   btr_cur, TRUE, thr, &mtr);
-		ut_ad(err == DB_SUCCESS);
+		btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur),
+					  btr_cur_get_rec(btr_cur), &mtr);
 	} else {
 		/* Remove the index record */
 
@@ -855,11 +834,8 @@ try_again:
 
 		break;
 	case ROW_FOUND:
-		err = btr_cur_del_mark_set_sec_rec(
-			BTR_NO_LOCKING_FLAG,
-			btr_cur, FALSE, thr, &mtr);
-
-		ut_a(err == DB_SUCCESS);
+		btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur),
+					   btr_cur_get_rec(btr_cur), &mtr);
 		heap = mem_heap_create(
 			sizeof(upd_t)
 			+ dtuple_get_n_fields(entry) * sizeof(upd_field_t));
@@ -1226,7 +1202,7 @@ static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked)
 	table_id_t	table_id;
 	trx_id_t	trx_id;
 	roll_ptr_t	roll_ptr;
-	ulint		info_bits;
+	byte		info_bits;
 	ulint		type;
 	ulint		cmpl_info;
 	bool		dummy_extern;
@@ -1257,7 +1233,7 @@ static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked)
 
 	ut_ad(!node->table->skip_alter_undo);
 
-	if (UNIV_UNLIKELY(!fil_table_accessible(node->table))) {
+	if (UNIV_UNLIKELY(!node->table->is_accessible())) {
 close_table:
 		/* Normally, tables should not disappear or become
 		unaccessible during ROLLBACK, because they should be
@@ -1291,7 +1267,7 @@ close_table:
 	if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) {
 		if ((node->update->info_bits & ~REC_INFO_DELETED_FLAG)
 		    != REC_INFO_MIN_REC_FLAG) {
-			ut_ad(!"wrong info_bits in undo log record");
+			ut_ad("wrong info_bits in undo log record" == 0);
 			goto close_table;
 		}
 		/* This must be an undo log record for a subsequent
diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc
index a226bf443f9..3ac8e434f35 100644
--- a/storage/innobase/row/row0undo.cc
+++ b/storage/innobase/row/row0undo.cc
@@ -326,37 +326,30 @@ static bool row_undo_rec_get(undo_node_t* node)
 	mtr_t	mtr;
 	mtr.start();
 
-	page_t*	undo_page = trx_undo_page_get_s_latched(
+	buf_block_t* undo_page = trx_undo_page_get_s_latched(
 		page_id_t(undo->rseg->space->id, undo->top_page_no), &mtr);
 
-	ulint	offset = undo->top_offset;
-
-	trx_undo_rec_t*	prev_rec = trx_undo_get_prev_rec(
-		undo_page + offset, undo->hdr_page_no, undo->hdr_offset,
-		true, &mtr);
-
-	if (prev_rec == NULL) {
-		undo->top_undo_no = IB_ID_MAX;
-		ut_ad(undo->empty());
-	} else {
-		page_t*	prev_rec_page = page_align(prev_rec);
-
-		if (prev_rec_page != undo_page) {
+	uint16_t offset = undo->top_offset;
 
+	buf_block_t* prev_page = undo_page;
+	if (trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec(
+		    prev_page, offset, undo->hdr_page_no, undo->hdr_offset,
+		    true, &mtr)) {
+		if (prev_page != undo_page) {
 			trx->pages_undone++;
 		}
 
-		undo->top_page_no = page_get_page_no(prev_rec_page);
-		undo->top_offset  = ulint(prev_rec - prev_rec_page);
+		undo->top_page_no = prev_page->page.id().page_no();
+		undo->top_offset  = page_offset(prev_rec);
 		undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
 		ut_ad(!undo->empty());
+	} else {
+		undo->top_undo_no = IB_ID_MAX;
+		ut_ad(undo->empty());
 	}
 
-	{
-		const trx_undo_rec_t* undo_rec = undo_page + offset;
-		node->undo_rec = trx_undo_rec_copy(undo_rec, node->heap);
-	}
-
+	node->undo_rec = trx_undo_rec_copy(undo_page->frame + offset,
+					   node->heap);
 	mtr.commit();
 
 	switch (trx_undo_rec_get_type(node->undo_rec)) {
@@ -431,7 +424,7 @@ row_undo(
 		err = row_undo_mod(node, thr);
 		break;
 	default:
-		ut_ad(!"wrong state");
+		ut_ad("wrong state" == 0);
 		err = DB_CORRUPTION;
 	}
 
@@ -461,13 +454,7 @@ row_undo_step(
 {
 	dberr_t		err;
 	undo_node_t*	node;
-	trx_t*		trx;
-
-	ut_ad(thr);
-
-	srv_inc_activity_count();
-
-	trx = thr_get_trx(thr);
+	trx_t*		trx = thr_get_trx(thr);
 
 	node = static_cast<undo_node_t*>(thr->run_node);
 
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
index 193a728e502..3c63e8ab2c8 100644
--- a/storage/innobase/row/row0upd.cc
+++ b/storage/innobase/row/row0upd.cc
@@ -465,36 +465,6 @@ upd_node_create(
 	return(node);
 }
 
-/*********************************************************************//**
-Updates the trx id and roll ptr field in a clustered index record in database
-recovery. */
-void
-row_upd_rec_sys_fields_in_recovery(
-/*===============================*/
-	rec_t*		rec,	/*!< in/out: record */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		pos,	/*!< in: TRX_ID position in rec */
-	trx_id_t	trx_id,	/*!< in: transaction id */
-	roll_ptr_t	roll_ptr)/*!< in: roll ptr of the undo log record */
-{
-	ut_ad(rec_offs_validate(rec, NULL, offsets));
-
-	if (page_zip) {
-		page_zip_write_trx_id_and_roll_ptr(
-			page_zip, rec, offsets, pos, trx_id, roll_ptr);
-	} else {
-		byte*	field;
-		ulint	len;
-
-		field = rec_get_nth_field(rec, offsets, pos, &len);
-		ut_ad(len == DATA_TRX_ID_LEN);
-		compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
-		trx_write_trx_id(field, trx_id);
-		trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr);
-	}
-}
-
 /***********************************************************//**
 Returns TRUE if row update changes size of some field in index or if some
 field to be updated is stored externally in rec or update.
@@ -536,11 +506,6 @@ row_upd_changes_field_size_or_external(
 		ut_ad(new_len != UNIV_SQL_DEFAULT);
 
 		if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
-			/* A bug fixed on Dec 31st, 2004: we looked at the
-			SQL NULL size from the wrong field! We may backport
-			this fix also to 4.0. The merge to 5.0 will be made
-			manually immediately after we commit this to 4.1. */
-
 			new_len = dict_col_get_sql_null_size(
 				dict_index_get_nth_col(index,
 						       upd_field->field_no),
@@ -618,277 +583,6 @@ row_upd_changes_disowned_external(
 	return(false);
 }
 
-/***********************************************************//**
-Replaces the new column values stored in the update vector to the
-record given. No field size changes are allowed. This function is
-usually invoked on a clustered index. The only use case for a
-secondary index is row_ins_sec_index_entry_by_modify() or its
-counterpart in ibuf_insert_to_index_page(). */
-void
-row_upd_rec_in_place(
-/*=================*/
-	rec_t*		rec,	/*!< in/out: record where replaced */
-	dict_index_t*	index,	/*!< in: the index the record belongs to */
-	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
-	const upd_t*	update,	/*!< in: update vector */
-	page_zip_des_t*	page_zip)/*!< in: compressed page with enough space
-				available, or NULL */
-{
-	const upd_field_t*	upd_field;
-	const dfield_t*		new_val;
-	ulint			n_fields;
-	ulint			i;
-
-	ut_ad(rec_offs_validate(rec, index, offsets));
-	ut_ad(!index->table->skip_alter_undo);
-
-	if (rec_offs_comp(offsets)) {
-#ifdef UNIV_DEBUG
-		switch (rec_get_status(rec)) {
-		case REC_STATUS_ORDINARY:
-			break;
-		case REC_STATUS_INSTANT:
-			ut_ad(index->is_instant());
-			break;
-		case REC_STATUS_NODE_PTR:
-			if (index->is_dummy
-			    && fil_page_get_type(page_align(rec))
-			    == FIL_PAGE_RTREE) {
-				/* The function rtr_update_mbr_field_in_place()
-				is generating MLOG_COMP_REC_UPDATE_IN_PLACE
-				and MLOG_REC_UPDATE_IN_PLACE records for
-				node pointer pages. */
-				break;
-			}
-			/* fall through */
-		case REC_STATUS_INFIMUM:
-		case REC_STATUS_SUPREMUM:
-			ut_ad(!"wrong record status in update");
-		}
-#endif /* UNIV_DEBUG */
-
-		rec_set_info_bits_new(rec, update->info_bits);
-	} else {
-		rec_set_info_bits_old(rec, update->info_bits);
-	}
-
-	n_fields = upd_get_n_fields(update);
-
-	for (i = 0; i < n_fields; i++) {
-		upd_field = upd_get_nth_field(update, i);
-
-		/* No need to update virtual columns for non-virtual index */
-		if (upd_fld_is_virtual_col(upd_field)
-		    && !dict_index_has_virtual(index)) {
-			continue;
-		}
-
-		new_val = &(upd_field->new_val);
-		ut_ad(!dfield_is_ext(new_val) ==
-		      !rec_offs_nth_extern(offsets, upd_field->field_no));
-
-		rec_set_nth_field(rec, offsets, upd_field->field_no,
-				  dfield_get_data(new_val),
-				  dfield_get_len(new_val));
-	}
-
-	if (page_zip) {
-		page_zip_write_rec(page_zip, rec, index, offsets, 0);
-	}
-}
-
-/*********************************************************************//**
-Parses the log data of system field values.
-@return log data end or NULL */
-byte*
-row_upd_parse_sys_vals(
-/*===================*/
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	ulint*		pos,	/*!< out: TRX_ID position in record */
-	trx_id_t*	trx_id,	/*!< out: trx id */
-	roll_ptr_t*	roll_ptr)/*!< out: roll ptr */
-{
-	*pos = mach_parse_compressed(&ptr, end_ptr);
-
-	if (ptr == NULL) {
-
-		return(NULL);
-	}
-
-	if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
-
-		return(NULL);
-	}
-
-	*roll_ptr = trx_read_roll_ptr(ptr);
-	ptr += DATA_ROLL_PTR_LEN;
-
-	*trx_id = mach_u64_parse_compressed(&ptr, end_ptr);
-
-	return(const_cast<byte*>(ptr));
-}
-
-/***********************************************************//**
-Writes to the redo log the new values of the fields occurring in the index. */
-void
-row_upd_index_write_log(
-/*====================*/
-	const upd_t*	update,	/*!< in: update vector */
-	byte*		log_ptr,/*!< in: pointer to mlog buffer: must
-				contain at least MLOG_BUF_MARGIN bytes
-				of free space; the buffer is closed
-				within this function */
-	mtr_t*		mtr)	/*!< in: mtr into whose log to write */
-{
-	const upd_field_t*	upd_field;
-	const dfield_t*		new_val;
-	ulint			len;
-	ulint			n_fields;
-	byte*			buf_end;
-	ulint			i;
-
-	n_fields = upd_get_n_fields(update);
-
-	buf_end = log_ptr + MLOG_BUF_MARGIN;
-
-	mach_write_to_1(log_ptr, update->info_bits);
-	log_ptr++;
-	log_ptr += mach_write_compressed(log_ptr, n_fields);
-
-	for (i = 0; i < n_fields; i++) {
-		compile_time_assert(MLOG_BUF_MARGIN > 30);
-
-		if (log_ptr + 30 > buf_end) {
-			mlog_close(mtr, log_ptr);
-
-			log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
-			buf_end = log_ptr + MLOG_BUF_MARGIN;
-		}
-
-		upd_field = upd_get_nth_field(update, i);
-
-		new_val = &(upd_field->new_val);
-
-		len = dfield_get_len(new_val);
-
-		/* If this is a virtual column, mark it using special
-		field_no */
-		ulint	field_no = upd_fld_is_virtual_col(upd_field)
-			? REC_MAX_N_FIELDS + unsigned(upd_field->field_no)
-			: unsigned(upd_field->field_no);
-
-		log_ptr += mach_write_compressed(log_ptr, field_no);
-		log_ptr += mach_write_compressed(log_ptr, len);
-
-		if (len != UNIV_SQL_NULL) {
-			if (log_ptr + len < buf_end) {
-				memcpy(log_ptr, dfield_get_data(new_val), len);
-
-				log_ptr += len;
-			} else {
-				mlog_close(mtr, log_ptr);
-
-				mlog_catenate_string(
-					mtr,
-					static_cast<const byte*>(
-						dfield_get_data(new_val)),
-					len);
-
-				log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
-				buf_end = log_ptr + MLOG_BUF_MARGIN;
-			}
-		}
-	}
-
-	mlog_close(mtr, log_ptr);
-}
-
-/*********************************************************************//**
-Parses the log data written by row_upd_index_write_log.
-@return log data end or NULL */
-byte*
-row_upd_index_parse(
-/*================*/
-	const byte*	ptr,	/*!< in: buffer */
-	const byte*	end_ptr,/*!< in: buffer end */
-	mem_heap_t*	heap,	/*!< in: memory heap where update vector is
-				built */
-	upd_t**		update_out)/*!< out: update vector */
-{
-	upd_t*		update;
-	upd_field_t*	upd_field;
-	dfield_t*	new_val;
-	ulint		len;
-	ulint		n_fields;
-	ulint		info_bits;
-	ulint		i;
-
-	if (end_ptr < ptr + 1) {
-
-		return(NULL);
-	}
-
-	info_bits = mach_read_from_1(ptr);
-	ptr++;
-	n_fields = mach_parse_compressed(&ptr, end_ptr);
-
-	if (ptr == NULL) {
-
-		return(NULL);
-	}
-
-	update = upd_create(n_fields, heap);
-	update->info_bits = info_bits;
-
-	for (i = 0; i < n_fields; i++) {
-		ulint	field_no;
-		upd_field = upd_get_nth_field(update, i);
-		new_val = &(upd_field->new_val);
-
-		field_no = mach_parse_compressed(&ptr, end_ptr);
-
-		if (ptr == NULL) {
-
-			return(NULL);
-		}
-
-		/* Check if this is a virtual column, mark the prtype
-		if that is the case */
-		if (field_no >= REC_MAX_N_FIELDS) {
-			new_val->type.prtype |= DATA_VIRTUAL;
-			field_no -= REC_MAX_N_FIELDS;
-		}
-
-		upd_field->field_no = field_no;
-
-		len = mach_parse_compressed(&ptr, end_ptr);
-
-		if (ptr == NULL) {
-
-			return(NULL);
-		}
-
-		if (len != UNIV_SQL_NULL) {
-
-			if (end_ptr < ptr + len) {
-
-				return(NULL);
-			}
-
-			dfield_set_data(new_val,
-					mem_heap_dup(heap, ptr, len), len);
-			ptr += len;
-		} else {
-			dfield_set_null(new_val);
-		}
-	}
-
-	*update_out = update;
-
-	return(const_cast<byte*>(ptr));
-}
-
 /***************************************************************//**
 Builds an update vector from those fields which in a secondary index entry
 differ from a record that has the equal ordering fields. NOTE: we compare
@@ -909,7 +603,6 @@ row_upd_build_sec_rec_difference_binary(
 	ulint		len;
 	upd_t*		update;
 	ulint		n_diff;
-	ulint		i;
 
 	/* This function is used only for a secondary index */
 	ut_a(!dict_index_is_clust(index));
@@ -923,7 +616,7 @@ row_upd_build_sec_rec_difference_binary(
 
 	n_diff = 0;
 
-	for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+	for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) {
 
 		data = rec_get_nth_field(rec, offsets, i, &len);
 
@@ -1012,7 +705,7 @@ row_upd_build_difference_binary(
 		ut_ad(rec_offs_validate(rec, index, offsets));
 	}
 
-	for (ulint i = 0; i < entry->n_fields; i++) {
+	for (uint16_t i = 0; i < entry->n_fields; i++) {
 		const byte* data = rec_get_nth_cfield(rec, index, offsets, i,
 						      &len);
 		const dfield_t* dfield = dtuple_get_nth_field(entry, i);
@@ -1033,7 +726,8 @@ row_upd_build_difference_binary(
 		}
 	}
 
-	for (ulint i = entry->n_fields; i < index->n_fields; i++) {
+	for (uint16_t i = static_cast<uint16_t>(entry->n_fields);
+	     i < index->n_fields; i++) {
 		upd_field_t* uf = upd_get_nth_field(update, n_diff++);
 		const dict_col_t* col = dict_index_get_nth_col(index, i);
 		/* upd_create() zero-initialized uf */
@@ -1062,7 +756,7 @@ row_upd_build_difference_binary(
 		ib_vcol_row vc(NULL);
 		uchar *record = vc.record(thd, index, &mysql_table);
 
-		for (ulint i = 0; i < n_v_fld; i++) {
+		for (uint16_t i = 0; i < n_v_fld; i++) {
 			const dict_v_col_t*     col
                                 = dict_table_get_nth_v_col(index->table, i);
 
@@ -1323,7 +1017,7 @@ row_upd_index_replace_new_col_vals_index_pos(
 
 	dtuple_set_info_bits(entry, update->info_bits);
 
-	for (unsigned i = index->n_fields; i--; ) {
+	for (uint16_t i = index->n_fields; i--; ) {
 		const dict_field_t*	field;
 		const dict_col_t*	col;
 		const upd_field_t*	uf;
@@ -1381,7 +1075,8 @@ row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
      uf= upd_get_field_by_field_no(update, vcol->v_pos, true);
    }
    else
-     uf= upd_get_field_by_field_no(update, dict_col_get_clust_pos(col, &index),
+     uf= upd_get_field_by_field_no(update, static_cast<uint16_t>
+                                   (dict_col_get_clust_pos(col, &index)),
                                    false);
 
    if (!uf)
@@ -1494,15 +1189,12 @@ row_upd_replace_vcol(
 		ptr += 2;
 
 		while (ptr != end_ptr) {
-			const byte*             field;
-			ulint                   field_no;
-			ulint                   len;
-			ulint                   orig_len;
-			bool			is_v;
+			const byte* field;
+			uint32_t field_no, len, orig_len;
 
 			field_no = mach_read_next_compressed(&ptr);
 
-			is_v = (field_no >= REC_MAX_N_FIELDS);
+			const bool is_v = (field_no >= REC_MAX_N_FIELDS);
 
 			if (is_v) {
 				ptr = trx_undo_read_v_idx(
@@ -1514,7 +1206,7 @@ row_upd_replace_vcol(
 			ptr = trx_undo_rec_get_col_val(
 				ptr, &field, &len, &orig_len);
 
-			if (field_no == ULINT_UNDEFINED) {
+			if (field_no == FIL_NULL) {
 				ut_ad(is_v);
 				continue;
 			}
@@ -1686,8 +1378,9 @@ row_upd_changes_ord_field_binary_func(
 				update, vcol->v_pos, true);
 		} else {
 			upd_field = upd_get_field_by_field_no(
-				update,
-				dict_col_get_clust_pos(col, clust_index),
+				update, static_cast<uint16_t>(
+					dict_col_get_clust_pos(
+						col, clust_index)),
 				false);
 		}
 
@@ -1850,7 +1543,7 @@ row_upd_changes_ord_field_binary_func(
 					/* The externally stored field
 					was not written yet. This
 					record should only be seen by
-					recv_recovery_rollback_active(),
+					trx_rollback_recovered()
 					when the server had crashed before
 					storing the field. */
 					ut_ad(thr->graph->trx->is_recovered);
@@ -2429,11 +2122,17 @@ row_upd_sec_index_entry(
 		row_ins_sec_index_entry() below */
 		if (!rec_get_deleted_flag(
 			    rec, dict_table_is_comp(index->table))) {
-			err = btr_cur_del_mark_set_sec_rec(
-				flags, btr_cur, TRUE, thr, &mtr);
+			err = lock_sec_rec_modify_check_and_lock(
+				flags,
+				btr_cur_get_block(btr_cur),
+				btr_cur_get_rec(btr_cur), index, thr, &mtr);
 			if (err != DB_SUCCESS) {
 				break;
 			}
+
+			btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur),
+						  btr_cur_get_rec(btr_cur),
+						  &mtr);
 #ifdef WITH_WSREP
 			if (!referenced && foreign
 			    && wsrep_must_process_fk(node, trx)
@@ -2594,14 +2293,13 @@ row_upd_clust_rec_by_insert_inherit_func(
 	const upd_t*	update)	/*!< in: update vector */
 {
 	bool	inherit	= false;
-	ulint	i;
 
 	ut_ad(!rec == !offsets);
 	ut_ad(!rec == !index);
 	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
 	ut_ad(!rec || rec_offs_any_extern(offsets));
 
-	for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+	for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) {
 		dfield_t*	dfield	= dtuple_get_nth_field(entry, i);
 		byte*		data;
 		ulint		len;
@@ -2652,7 +2350,7 @@ row_upd_clust_rec_by_insert_inherit_func(
 		a lock wait and we already had disowned the BLOB. */
 		ut_a(rec == NULL
 		     || !(data[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
-		data[BTR_EXTERN_LEN] &= ~BTR_EXTERN_OWNER_FLAG;
+		data[BTR_EXTERN_LEN] &= byte(~BTR_EXTERN_OWNER_FLAG);
 		data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG;
 		/* The BTR_EXTERN_INHERITED_FLAG only matters in
 		rollback of a fresh insert. Purge will always free
@@ -2772,7 +2470,7 @@ row_upd_clust_rec_by_insert(
 				insert fails, then this disown will be undone
 				when the operation is rolled back. */
 				btr_cur_disown_inherited_fields(
-					btr_cur_get_page_zip(btr_cur),
+					btr_cur_get_block(btr_cur),
 					rec, index, offsets, node->update,
 					mtr);
 			}
@@ -2893,7 +2591,7 @@ row_upd_clust_rec(
 		goto success;
 	}
 
-	if (buf_LRU_buf_pool_running_out()) {
+	if (buf_pool.running_out()) {
 		err = DB_LOCK_TABLE_FULL;
 		goto func_exit;
 	}
@@ -3138,8 +2836,7 @@ row_upd_clust_step(
 
 		ut_ad(!dict_index_is_online_ddl(index));
 
-		dict_drop_index_tree(
-			btr_pcur_get_rec(pcur), pcur, trx, &mtr);
+		dict_drop_index_tree(pcur, trx, &mtr);
 
 		mtr.commit();
 
@@ -3149,7 +2846,10 @@ row_upd_clust_step(
 		if (btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr) !=
 		    btr_pcur_t::SAME_ALL) {
 			err = DB_ERROR;
-			goto exit_func;
+
+			mtr.commit();
+
+			return(err);
 		}
 	}
 
@@ -3271,7 +2971,7 @@ row_upd(
 	ut_ad(!thr_get_trx(thr)->in_rollback);
 
 	DBUG_PRINT("row_upd", ("table: %s", node->table->name.m_name));
-	DBUG_PRINT("row_upd", ("info bits in update vector: 0x" ULINTPFx,
+	DBUG_PRINT("row_upd", ("info bits in update vector: 0x%x",
 			       node->update ? node->update->info_bits: 0));
 	DBUG_PRINT("row_upd", ("foreign_id: %s",
 			       node->foreign ? node->foreign->id: "NULL"));
@@ -3478,48 +3178,42 @@ make_versioned_delete().
 @param[in]	vers_sys_idx	table->row_start or table->row_end */
 void upd_node_t::vers_update_fields(const trx_t *trx, ulint idx)
 {
-	ut_ad(in_mysql_interface); // otherwise needs to recalculate
-				   // node->cmpl_info
-	ut_ad(idx == table->vers_start || idx == table->vers_end);
-
-	dict_index_t* clust_index = dict_table_get_first_index(table);
-        const dict_col_t *col= dict_table_get_nth_col(table, idx);
-        ulint field_no= dict_col_get_clust_pos(col, clust_index);
-        upd_field_t *ufield;
-
-        for (ulint i= 0; i < update->n_fields; ++i)
-        {
-          if (update->fields[i].field_no == field_no)
-          {
-            ufield= &update->fields[i];
-            goto skip_append;
-          }
-        }
-
-        /* row_create_update_node_for_mysql() pre-allocated this much.
-	   At least one PK column always remains unchanged. */
-	ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols));
-
-	update->n_fields++;
-        ufield= upd_get_nth_field(update, update->n_fields - 1);
-        upd_field_set_field_no(ufield, field_no, clust_index);
+  ut_ad(in_mysql_interface); // otherwise needs to recalculate node->cmpl_info
+  ut_ad(idx == table->vers_start || idx == table->vers_end);
+
+  dict_index_t *clust_index= dict_table_get_first_index(table);
+  const dict_col_t *col= dict_table_get_nth_col(table, idx);
+  ulint field_no= dict_col_get_clust_pos(col, clust_index);
+  upd_field_t *ufield;
+
+  for (ulint i= 0; i < update->n_fields; ++i)
+  {
+    if (update->fields[i].field_no == field_no)
+    {
+      ufield= &update->fields[i];
+      goto skip_append;
+    }
+  }
+
+  /* row_create_update_node_for_mysql() pre-allocated this much.
+  At least one PK column always remains unchanged. */
+  ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols));
+
+  update->n_fields++;
+  ufield= upd_get_nth_field(update, update->n_fields - 1);
+  upd_field_set_field_no(ufield, static_cast<uint16_t>(field_no), clust_index);
 
 skip_append:
   char *where= reinterpret_cast<char *>(update->vers_sys_value);
   if (col->vers_native())
-  {
     mach_write_to_8(where, trx->id);
-  }
   else
-  {
     thd_get_query_start_data(trx->mysql_thd, where);
-  }
 
   dfield_set_data(&ufield->new_val, update->vers_sys_value, col->len);
 
   for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++)
   {
-
     const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no);
     if (!v_col->m_col.ord_part)
       continue;
diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc
index d1ff7bc540e..f354424cb36 100644
--- a/storage/innobase/row/row0vers.cc
+++ b/storage/innobase/row/row0vers.cc
@@ -398,7 +398,6 @@ row_vers_impl_x_locked(
 	dict_index_t*	clust_index;
 
 	ut_ad(!lock_mutex_own());
-	ut_ad(!mutex_own(&trx_sys.mutex));
 
 	mtr_start(&mtr);
 
@@ -443,18 +442,14 @@ row_vers_impl_x_locked(
 @param[in,out]	row		the cluster index row in dtuple form
 @param[in]	clust_index	clustered index
 @param[in]	index		the secondary index
-@param[in]	heap		heap used to build virtual dtuple
-@param[in,out]	vcol_info	virtual column information.
-@return		true in case of success
-		false if virtual column computation fails */
+@param[in]	heap		heap used to build virtual dtuple. */
 static
 bool
 row_vers_build_clust_v_col(
 	dtuple_t*		row,
 	dict_index_t*		clust_index,
 	dict_index_t*		index,
-	mem_heap_t*		heap,
-	purge_vcol_info_t*	vcol_info)
+	mem_heap_t*		heap)
 {
 	THD*		thd= current_thd;
 	TABLE*		maria_table= 0;
@@ -462,42 +457,32 @@ row_vers_build_clust_v_col(
 	ut_ad(dict_index_has_virtual(index));
 	ut_ad(index->table == clust_index->table);
 
-	if (vcol_info != NULL) {
-		vcol_info->set_used();
-		maria_table = vcol_info->table();
-	}
 	DEBUG_SYNC(current_thd, "ib_clust_v_col_before_row_allocated");
 
-	ib_vcol_row vc(NULL);
+	ib_vcol_row vc(nullptr);
 	byte *record = vc.record(thd, index, &maria_table);
 
-	if (vcol_info && !vcol_info->table()) {
-		vcol_info->set_table(maria_table);
-		// wait for second fetch
-		return true;
-	}
+	ut_ad(maria_table);
 
 	for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
-		const dict_field_t* ind_field = dict_index_get_nth_field(
-				index, i);
+		const dict_col_t* c = dict_index_get_nth_col(index, i);
 
-		if (ind_field->col->is_virtual()) {
-			const dict_v_col_t*       col;
-
-			col = reinterpret_cast<const dict_v_col_t*>(
-				ind_field->col);
+		if (c->is_virtual()) {
+			const dict_v_col_t* col
+				= reinterpret_cast<const dict_v_col_t*>(c);
 
 			dfield_t *vfield = innobase_get_computed_value(
 				row, col, clust_index, &vc.heap,
 				heap, NULL, thd, maria_table, record, NULL,
 				NULL);
-			if (vfield == NULL) {
+			if (!vfield) {
 				innobase_report_computed_value_failed(row);
 				ut_ad(0);
 				return false;
 			}
 		}
 	}
+
 	return true;
 }
 
@@ -580,9 +565,8 @@ row_vers_build_cur_vrow_low(
 		all_filled = true;
 
 		for (i = 0; i < entry_len; i++) {
-			const dict_field_t*	ind_field
-				 = dict_index_get_nth_field(index, i);
-			const dict_col_t*	col = ind_field->col;
+			const dict_col_t* col
+				= dict_index_get_nth_col(index, i);
 
 			if (!col->is_virtual()) {
 				continue;
@@ -796,7 +780,6 @@ func_exit:
 @param[in,out]	heap		heap memory
 @param[in,out]	v_heap		heap memory to keep virtual colum dtuple
 @param[in]	mtr		mtr holding the latch on rec
-@param[in,out]	vcol_info	virtual column information for purge thread
 @return dtuple contains virtual column data */
 static
 dtuple_t*
@@ -810,8 +793,7 @@ row_vers_build_cur_vrow(
 	trx_id_t		trx_id,
 	mem_heap_t*		heap,
 	mem_heap_t*		v_heap,
-	mtr_t*			mtr,
-	purge_vcol_info_t*	vcol_info)
+	mtr_t*			mtr)
 {
 	dtuple_t* cur_vrow = NULL;
 
@@ -831,18 +813,9 @@ row_vers_build_cur_vrow(
 					  rec, *clust_offsets,
 					  NULL, NULL, NULL, NULL, heap);
 
-		if (vcol_info && !vcol_info->is_used()) {
-			mtr->commit();
-		}
-
-		bool res = row_vers_build_clust_v_col(
-			row, clust_index, index, heap, vcol_info);
-		if (!res) {
-			return NULL;
-		}
-
-		if (vcol_info != NULL && vcol_info->is_first_fetch()) {
-			return NULL;
+		if (!row_vers_build_clust_v_col(row, clust_index, index,
+						heap)) {
+			return nullptr;
 		}
 
 		cur_vrow = dtuple_copy(row, v_heap);
@@ -876,7 +849,6 @@ this case we return TRUE.
 @param[in]	ientry		secondary index entry
 @param[in]	roll_ptr	roll_ptr for the purge record
 @param[in]	trx_id		transaction ID on the purging record
-@param[in,out]	vcol_info	virtual column information for purge thread.
 @return TRUE if earlier version should have */
 bool
 row_vers_old_has_index_entry(
@@ -886,8 +858,7 @@ row_vers_old_has_index_entry(
 	dict_index_t*		index,
 	const dtuple_t*		ientry,
 	roll_ptr_t		roll_ptr,
-	trx_id_t		trx_id,
-	purge_vcol_info_t*	vcol_info)
+	trx_id_t		trx_id)
 {
 	const rec_t*	version;
 	rec_t*		prev_version;
@@ -902,11 +873,8 @@ row_vers_old_has_index_entry(
 	mem_heap_t*	v_heap = NULL;
 	dtuple_t*	cur_vrow = NULL;
 
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
-	ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S));
-	ut_ad(also_curr || !vcol_info);
-
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	clust_index = dict_table_get_first_index(index->table);
 
 	comp = page_rec_is_comp(rec);
@@ -958,19 +926,8 @@ row_vers_old_has_index_entry(
 			if (trx_undo_roll_ptr_is_insert(t_roll_ptr)
 			    || dbug_v_purge) {
 
-				if (vcol_info && !vcol_info->is_used()) {
-					mtr->commit();
-				}
-
-				bool res = row_vers_build_clust_v_col(
-					row, clust_index, index, heap,
-					vcol_info);
-
-				if (!res) {
-					goto unsafe_to_purge;
-				}
-
-				if (vcol_info && vcol_info->is_first_fetch()) {
+				if (!row_vers_build_clust_v_col(
+					    row, clust_index, index, heap)) {
 					goto unsafe_to_purge;
 				}
 
@@ -1049,11 +1006,7 @@ unsafe_to_purge:
 
 		cur_vrow = row_vers_build_cur_vrow(
 			also_curr, rec, clust_index, &clust_offsets,
-			index, roll_ptr, trx_id, heap, v_heap, mtr, vcol_info);
-
-		if (vcol_info && vcol_info->is_first_fetch()) {
-			goto unsafe_to_purge;
-		}
+			index, roll_ptr, trx_id, heap, v_heap, mtr);
 	}
 
 	version = rec;
@@ -1178,9 +1131,9 @@ row_vers_build_for_consistent_read(
 	byte*		buf;
 	dberr_t		err;
 
-	ut_ad(dict_index_is_clust(index));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(index->is_primary());
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S));
 
 	ut_ad(rec_offs_validate(rec, index, *offsets));
@@ -1291,9 +1244,9 @@ row_vers_build_for_semi_consistent_read(
 	byte*		buf;
 	trx_id_t	rec_trx_id	= 0;
 
-	ut_ad(dict_index_is_clust(index));
-	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
-					     | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(index->is_primary());
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
 	ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S));
 
 	ut_ad(rec_offs_validate(rec, index, *offsets));
diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc
deleted file mode 100644
index ed02fc5c396..00000000000
--- a/storage/innobase/srv/srv0conc.cc
+++ /dev/null
@@ -1,332 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
-
-Portions of this file contain modifications contributed and copyrighted by
-Google, Inc. Those modifications are gratefully acknowledged and are described
-briefly in the InnoDB documentation. The contributions by Google are
-incorporated with their permission, and subject to the conditions contained in
-the file COPYING.Google.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file srv/srv0conc.cc
-
-InnoDB concurrency manager
-
-Created 2011/04/18 Sunny Bains
-*******************************************************/
-
-#include "srv0srv.h"
-#include "trx0trx.h"
-#include "row0mysql.h"
-#include "dict0dict.h"
-#include <mysql/service_thd_wait.h>
-#include <mysql/service_wsrep.h>
-#include "wsrep.h"
-#include "log.h"
-
-/** Number of times a thread is allowed to enter InnoDB within the same
-SQL query after it has once got the ticket. */
-ulong	srv_n_free_tickets_to_enter = 500;
-
-/** Maximum sleep delay (in micro-seconds), value of 0 disables it. */
-ulong	srv_adaptive_max_sleep_delay = 150000;
-
-ulong	srv_thread_sleep_delay	= 10000;
-
-
-/** We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. srv_start() sets the value. */
-ulint	srv_max_n_threads;
-
-/** The following controls how many threads we let inside InnoDB concurrently:
-threads waiting for locks are not counted into the number because otherwise
-we could get a deadlock. Value of 0 will disable the concurrency check. */
-
-ulong	srv_thread_concurrency	= 0;
-
-/** Variables tracking the active and waiting threads. */
-struct srv_conc_t {
-	/** Number of transactions that have declared_to_be_inside_innodb */
-	MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<ulint> n_active;
-
-	/** Number of OS threads waiting in the FIFO for permission to
-	enter InnoDB */
-	MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<ulint> n_waiting;
-};
-
-/* Control variables for tracking concurrency. */
-static srv_conc_t	srv_conc;
-
-/*********************************************************************//**
-Note that a user thread is entering InnoDB. */
-static
-void
-srv_enter_innodb_with_tickets(
-/*==========================*/
-	trx_t*	trx)			/*!< in/out: transaction that wants
-					to enter InnoDB */
-{
-	trx->declared_to_be_inside_innodb = TRUE;
-	trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
-}
-
-/*********************************************************************//**
-Handle the scheduling of a user thread that wants to enter InnoDB.  Setting
-srv_adaptive_max_sleep_delay > 0 switches the adaptive sleep calibration to
-ON. When set, we want to wait in the queue for as little time as possible.
-However, very short waits will result in a lot of context switches and that
-is also not desirable. When threads need to sleep multiple times we increment
-os_thread_sleep_delay by one. When we see threads getting a slot without
-waiting and there are no other threads waiting in the queue, we try and reduce
-the wait as much as we can. Currently we reduce it by half each time. If the
-thread only had to wait for one turn before it was able to enter InnoDB we
-decrement it by one. This is to try and keep the sleep time stable around the
-"optimum" sleep time. */
-static
-void
-srv_conc_enter_innodb_with_atomics(
-/*===============================*/
-	trx_t*	trx)			/*!< in/out: transaction that wants
-					to enter InnoDB */
-{
-	ulint	n_sleeps = 0;
-	ibool	notified_mysql = FALSE;
-
-	ut_a(!trx->declared_to_be_inside_innodb);
-
-	for (;;) {
-		ulint	sleep_in_us;
-#ifdef WITH_WSREP
-		/* We need to take `thd->LOCK_thd_data` to check WSREP thread state */
-		if (trx->is_wsrep()) {
-			wsrep_thd_LOCK(trx->mysql_thd);
-
-			if (wsrep_thd_is_aborting(trx->mysql_thd)) {
-				WSREP_DEBUG("srv_conc_enter due to MUST_ABORT for"
-					    TRX_ID_FMT, trx->id);
-			}
-			wsrep_thd_UNLOCK(trx->mysql_thd);
-			srv_conc_force_enter_innodb(trx);
-			return;
-		}
-#endif /* WITH_WSREP */
-
-		if (srv_thread_concurrency == 0) {
-			if (notified_mysql) {
-				srv_conc.n_waiting--;
-				thd_wait_end(trx->mysql_thd);
-			}
-
-			return;
-		}
-
-		if (srv_conc.n_active < srv_thread_concurrency) {
-
-			/* Check if there are any free tickets. */
-			if (srv_conc.n_active++ < srv_thread_concurrency) {
-
-				srv_enter_innodb_with_tickets(trx);
-
-				if (notified_mysql) {
-					srv_conc.n_waiting--;
-					thd_wait_end(trx->mysql_thd);
-				}
-
-				if (srv_adaptive_max_sleep_delay > 0) {
-					if (srv_thread_sleep_delay > 20
-					    && n_sleeps == 1) {
-
-						--srv_thread_sleep_delay;
-					}
-
-					if (srv_conc.n_waiting == 0) {
-						srv_thread_sleep_delay >>= 1;
-					}
-				}
-
-				return;
-			}
-
-			/* Since there were no free seats, we relinquish
-			the overbooked ticket. */
-
-			srv_conc.n_active--;
-		}
-
-		if (!notified_mysql) {
-			srv_conc.n_waiting++;
-
-			thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
-
-			notified_mysql = TRUE;
-		}
-
-		DEBUG_SYNC_C("user_thread_waiting");
-		trx->op_info = "sleeping before entering InnoDB";
-
-		sleep_in_us = srv_thread_sleep_delay;
-
-		/* Guard against overflow when adaptive sleep delay is on. */
-
-		if (srv_adaptive_max_sleep_delay > 0
-		    && sleep_in_us > srv_adaptive_max_sleep_delay) {
-
-			sleep_in_us = srv_adaptive_max_sleep_delay;
-			srv_thread_sleep_delay = static_cast<ulong>(sleep_in_us);
-		}
-
-		os_thread_sleep(sleep_in_us);
-
-		trx->op_info = "";
-
-		++n_sleeps;
-
-		if (srv_adaptive_max_sleep_delay > 0 && n_sleeps > 1) {
-			++srv_thread_sleep_delay;
-		}
-	}
-}
-
-/*********************************************************************//**
-Note that a user thread is leaving InnoDB code. */
-static
-void
-srv_conc_exit_innodb_with_atomics(
-/*==============================*/
-	trx_t*	trx)		/*!< in/out: transaction */
-{
-	trx->n_tickets_to_enter_innodb = 0;
-	trx->declared_to_be_inside_innodb = FALSE;
-
-	srv_conc.n_active--;
-}
-
-/*********************************************************************//**
-Puts an OS thread to wait if there are too many concurrent threads
-(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue.
-@param[in,out]	prebuilt	row prebuilt handler */
-void
-srv_conc_enter_innodb(
-	row_prebuilt_t*	prebuilt)
-{
-	trx_t*	trx	= prebuilt->trx;
-
-	ut_ad(!sync_check_iterate(sync_check()));
-
-	srv_conc_enter_innodb_with_atomics(trx);
-}
-
-/*********************************************************************//**
-This lets a thread enter InnoDB regardless of the number of threads inside
-InnoDB. This must be called when a thread ends a lock wait. */
-void
-srv_conc_force_enter_innodb(
-/*========================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	ut_ad(!sync_check_iterate(sync_check()));
-
-	if (!srv_thread_concurrency) {
-
-		return;
-	}
-
-	srv_conc.n_active++;
-
-	trx->n_tickets_to_enter_innodb = 1;
-	trx->declared_to_be_inside_innodb = TRUE;
-}
-
-/*********************************************************************//**
-This must be called when a thread exits InnoDB in a lock wait or at the
-end of an SQL statement. */
-void
-srv_conc_force_exit_innodb(
-/*=======================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	if ((trx->mysql_thd != NULL
-	     && thd_is_replication_slave_thread(trx->mysql_thd))
-	    || trx->declared_to_be_inside_innodb == FALSE) {
-
-		return;
-	}
-
-	srv_conc_exit_innodb_with_atomics(trx);
-
-	ut_ad(!sync_check_iterate(sync_check()));
-}
-
-/*********************************************************************//**
-Get the count of threads waiting inside InnoDB. */
-ulint
-srv_conc_get_waiting_threads(void)
-/*==============================*/
-{
-	return(srv_conc.n_waiting);
-}
-
-/*********************************************************************//**
-Get the count of threads active inside InnoDB. */
-ulint
-srv_conc_get_active_threads(void)
-/*==============================*/
-{
-	return(srv_conc.n_active);
-}
-
-#ifdef WITH_WSREP
-UNIV_INTERN
-void
-wsrep_srv_conc_cancel_wait(
-/*=======================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-#ifdef HAVE_ATOMIC_BUILTINS
-	/* aborting transactions will enter innodb by force in
-	   srv_conc_enter_innodb_with_atomics(). No need to cancel here,
-	   thr will wake up after os_sleep and let to enter innodb
-	*/
-	if (UNIV_UNLIKELY(wsrep_debug)) {
-		ib::info() << "WSREP: conc slot cancel, no atomics";
-	}
-#else
-	// JAN: TODO: MySQL 5.7
-	//os_fast_mutex_lock(&srv_conc_mutex);
-	if (trx->wsrep_event) {
-		if (UNIV_UNLIKELY(wsrep_debug)) {
-			ib::info() << "WSREP: conc slot cancel";
-		}
-		os_event_set(trx->wsrep_event);
-	}
-	//os_fast_mutex_unlock(&srv_conc_mutex);
-#endif
-}
-#endif /* WITH_WSREP */
-
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index e64d3b9f426..7a9974d4842 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -2,7 +2,7 @@
 
 Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2018, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -381,56 +381,16 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_NONE,
 	 MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_BY_AGE},
 
-	{"buffer_flush_adaptive_avg_time_slot", "buffer",
-	 "Avg time (ms) spent for adaptive flushing recently per slot.",
+	{"buffer_flush_adaptive_avg_time", "buffer",
+	 "Avg time (ms) spent for adaptive flushing recently.",
 	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT},
-
-	{"buffer_LRU_batch_flush_avg_time_slot", "buffer",
-	 "Avg time (ms) spent for LRU batch flushing recently per slot.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT},
-
-	{"buffer_flush_adaptive_avg_time_thread", "buffer",
-	 "Avg time (ms) spent for adaptive flushing recently per thread.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD},
-
-	{"buffer_LRU_batch_flush_avg_time_thread", "buffer",
-	 "Avg time (ms) spent for LRU batch flushing recently per thread.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD},
-
-	{"buffer_flush_adaptive_avg_time_est", "buffer",
-	 "Estimated time (ms) spent for adaptive flushing recently.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST},
-
-	{"buffer_LRU_batch_flush_avg_time_est", "buffer",
-	 "Estimated time (ms) spent for LRU batch flushing recently.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST},
-
-	{"buffer_flush_avg_time", "buffer",
-	 "Avg time (ms) spent for flushing recently.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_TIME},
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME},
 
 	{"buffer_flush_adaptive_avg_pass", "buffer",
 	 "Number of adaptive flushes passed during the recent Avg period.",
 	 MONITOR_NONE,
 	 MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_PASS},
 
-	{"buffer_LRU_batch_flush_avg_pass", "buffer",
-	 "Number of LRU batch flushes passed during the recent Avg period.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_FLUSH_AVG_PASS},
-
-	{"buffer_flush_avg_pass", "buffer",
-	 "Number of flushes passed during the recent Avg period.",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PASS},
-
 	{"buffer_LRU_get_free_loops", "buffer",
 	 "Total loops in LRU get free.",
 	 MONITOR_NONE,
@@ -562,23 +522,6 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
 	 MONITOR_LRU_BATCH_EVICT_PAGES},
 
-	/* Cumulative counter for single page LRU scans */
-	{"buffer_LRU_single_flush_scanned", "buffer",
-	 "Total pages scanned as part of single page LRU flush",
-	 MONITOR_SET_OWNER,
-	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
-	 MONITOR_LRU_SINGLE_FLUSH_SCANNED},
-
-	{"buffer_LRU_single_flush_num_scan", "buffer",
-	 "Number of times single page LRU flush is called",
-	 MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
-	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL},
-
-	{"buffer_LRU_single_flush_scanned_per_call", "buffer",
-	 "Page scanned per single LRU flush",
-	 MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
-	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL},
-
 	{"buffer_LRU_single_flush_failure_count", "Buffer",
 	 "Number of times attempt to flush a single page from LRU failed",
 	 MONITOR_NONE,
@@ -889,7 +832,8 @@ static monitor_info_t	innodb_counter_info[] =
 
 	{"log_lsn_checkpoint_age", "recovery",
 	 "Current LSN value minus LSN at last checkpoint",
-	 MONITOR_NONE,
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
 	 MONITOR_DEFAULT_START, MONITOR_LSN_CHECKPOINT_AGE},
 
 	{"log_lsn_buf_pool_oldest", "recovery",
@@ -904,12 +848,6 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
 	 MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC},
 
-	{"log_max_modified_age_sync", "recovery",
-	 "Maximum LSN difference; when exceeded, start synchronous preflush",
-	 static_cast<monitor_type_t>(
-	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
-	 MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_SYNC},
-
 	{"log_pending_log_flushes", "recovery", "Pending log flushes",
 	 static_cast<monitor_type_t>(
 	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
@@ -1188,26 +1126,11 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_NONE,
 	 MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND},
 
-	{"innodb_ibuf_merge_usec", "server",
-	 "Time (in microseconds) spent to process change buffer merge",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_SRV_IBUF_MERGE_MICROSECOND},
-
 	{"innodb_log_flush_usec", "server",
 	 "Time (in microseconds) spent to flush log records",
 	 MONITOR_NONE,
 	 MONITOR_DEFAULT_START, MONITOR_SRV_LOG_FLUSH_MICROSECOND},
 
-	{"innodb_mem_validate_usec", "server",
-	 "Time (in microseconds) spent to do memory validation",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_SRV_MEM_VALIDATE_MICROSECOND},
-
-	{"innodb_master_purge_usec", "server",
-	 "Time (in microseconds) spent by master thread to purge records",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_SRV_PURGE_MICROSECOND},
-
 	{"innodb_dict_lru_usec", "server",
 	 "Time (in microseconds) spent to process DICT LRU list",
 	 MONITOR_NONE,
@@ -1223,11 +1146,6 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_NONE,
 	 MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE},
 
-	{"innodb_checkpoint_usec", "server",
-	 "Time (in microseconds) spent by master thread to do checkpoint",
-	 MONITOR_NONE,
-	 MONITOR_DEFAULT_START, MONITOR_SRV_CHECKPOINT_MICROSECOND},
-
 	{"innodb_dblwr_writes", "server",
 	 "Number of doublewrite operations that have been performed"
 	 " (innodb_dblwr_writes)",
@@ -1482,7 +1400,8 @@ srv_mon_set_module_control(
 	ibool	set_current_module = FALSE;
 
 	ut_a(module_id <= NUM_MONITOR);
-	ut_a(UT_ARR_SIZE(innodb_counter_info) == NUM_MONITOR);
+	compile_time_assert(array_elements(innodb_counter_info)
+			    == NUM_MONITOR);
 
 	/* The module_id must be an ID of MONITOR_MODULE type */
 	ut_a(innodb_counter_info[module_id].monitor_type & MONITOR_MODULE);
@@ -1618,11 +1537,6 @@ srv_mon_process_existing_counter(
 	mon_type_t		value;
 	monitor_info_t*		monitor_info;
 	ibool			update_min = FALSE;
-	buf_pool_stat_t		stat;
-	buf_pools_list_size_t	buf_pools_list_size;
-	ulint			LRU_len;
-	ulint			free_len;
-	ulint			flush_list_len;
 
 	monitor_info = srv_mon_get_info(monitor_id);
 
@@ -1640,8 +1554,7 @@ srv_mon_process_existing_counter(
 	/* innodb_buffer_pool_read_requests, the number of logical
 	read requests */
 	case MONITOR_OVLD_BUF_POOL_READ_REQUESTS:
-		buf_get_total_stat(&stat);
-		value = stat.n_page_gets;
+		value = buf_pool.stat.n_page_gets;
 		break;
 
 	/* innodb_buffer_pool_write_requests, the number of
@@ -1652,73 +1565,66 @@ srv_mon_process_existing_counter(
 
 	/* innodb_buffer_pool_wait_free */
 	case MONITOR_OVLD_BUF_POOL_WAIT_FREE:
-		value = srv_stats.buf_pool_wait_free;
+		value = buf_pool.stat.LRU_waits;
 		break;
 
 	/* innodb_buffer_pool_read_ahead */
 	case MONITOR_OVLD_BUF_POOL_READ_AHEAD:
-		buf_get_total_stat(&stat);
-		value = stat.n_ra_pages_read;
+		value = buf_pool.stat.n_ra_pages_read;
 		break;
 
 	/* innodb_buffer_pool_read_ahead_evicted */
 	case MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED:
-		buf_get_total_stat(&stat);
-		value = stat.n_ra_pages_evicted;
+		value = buf_pool.stat.n_ra_pages_evicted;
 		break;
 
 	/* innodb_buffer_pool_pages_total */
 	case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL:
-		value = buf_pool_get_n_pages();
+		value = buf_pool.get_n_pages();
 		break;
 
 	/* innodb_buffer_pool_pages_misc */
 	case MONITOR_OVLD_BUF_POOL_PAGE_MISC:
-		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-		value = buf_pool_get_n_pages() - LRU_len - free_len;
+		value = buf_pool.get_n_pages()
+			- UT_LIST_GET_LEN(buf_pool.LRU)
+			- UT_LIST_GET_LEN(buf_pool.free);
 		break;
 
 	/* innodb_buffer_pool_pages_data */
 	case MONITOR_OVLD_BUF_POOL_PAGES_DATA:
-		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-		value = LRU_len;
+		value = UT_LIST_GET_LEN(buf_pool.LRU);
 		break;
 
 	/* innodb_buffer_pool_bytes_data */
 	case MONITOR_OVLD_BUF_POOL_BYTES_DATA:
-		buf_get_total_list_size_in_bytes(&buf_pools_list_size);
-		value = buf_pools_list_size.LRU_bytes
-			+ buf_pools_list_size.unzip_LRU_bytes;
+		value = buf_pool.stat.LRU_bytes
+			+ (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+			   << srv_page_size_shift);
 		break;
 
 	/* innodb_buffer_pool_pages_dirty */
 	case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY:
-		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-		value = flush_list_len;
+		value = UT_LIST_GET_LEN(buf_pool.flush_list);
 		break;
 
 	/* innodb_buffer_pool_bytes_dirty */
 	case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY:
-		buf_get_total_list_size_in_bytes(&buf_pools_list_size);
-		value = buf_pools_list_size.flush_list_bytes;
+		value = buf_pool.stat.flush_list_bytes;
 		break;
 
 	/* innodb_buffer_pool_pages_free */
 	case MONITOR_OVLD_BUF_POOL_PAGES_FREE:
-		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-		value = free_len;
+		value = UT_LIST_GET_LEN(buf_pool.free);
 		break;
 
 	/* innodb_pages_created, the number of pages created */
 	case MONITOR_OVLD_PAGE_CREATED:
-		buf_get_total_stat(&stat);
-		value = stat.n_pages_created;
+		value = buf_pool.stat.n_pages_created;
 		break;
 
 	/* innodb_pages_written, the number of page written */
 	case MONITOR_OVLD_PAGES_WRITTEN:
-		buf_get_total_stat(&stat);
-		value = stat.n_pages_written;
+		value = buf_pool.stat.n_pages_written;
 		break;
 
 	/* innodb_index_pages_written, the number of index pages written */
@@ -1733,8 +1639,7 @@ srv_mon_process_existing_counter(
 
 	/* innodb_pages_read */
 	case MONITOR_OVLD_PAGES_READ:
-		buf_get_total_stat(&stat);
-		value = stat.n_pages_read;
+		value = buf_pool.stat.n_pages_read;
 		break;
 
 	/* Number of times secondary index lookup triggered cluster lookup */
@@ -1779,12 +1684,12 @@ srv_mon_process_existing_counter(
 
 	/* innodb_os_log_fsyncs */
 	case MONITOR_OVLD_OS_LOG_FSYNC:
-		value = fil_n_log_flushes;
+		value = log_sys.get_flushes();
 		break;
 
 	/* innodb_os_log_pending_fsyncs */
 	case MONITOR_OVLD_OS_LOG_PENDING_FSYNC:
-		value = fil_n_pending_log_flushes;
+		value = log_sys.get_pending_flushes();
 		update_min = TRUE;
 		break;
 
@@ -1815,12 +1720,16 @@ srv_mon_process_existing_counter(
 
 	/* innodb_dblwr_writes */
 	case MONITOR_OVLD_SRV_DBLWR_WRITES:
-		value = srv_stats.dblwr_writes;
+		buf_dblwr.lock();
+		value = buf_dblwr.batches();
+		buf_dblwr.unlock();
 		break;
 
 	/* innodb_dblwr_pages_written */
 	case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN:
-		value = srv_stats.dblwr_pages_written;
+		buf_dblwr.lock();
+		value = buf_dblwr.written();
+		buf_dblwr.unlock();
 		break;
 
 	/* innodb_page_size */
@@ -1951,35 +1860,35 @@ srv_mon_process_existing_counter(
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGE_INSERT:
-		value = ibuf->n_merged_ops[IBUF_OP_INSERT];
+		value = ibuf.n_merged_ops[IBUF_OP_INSERT];
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGE_DELETE:
-		value = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK];
+		value = ibuf.n_merged_ops[IBUF_OP_DELETE_MARK];
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGE_PURGE:
-		value = ibuf->n_merged_ops[IBUF_OP_DELETE];
+		value = ibuf.n_merged_ops[IBUF_OP_DELETE];
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT:
-		value = ibuf->n_discarded_ops[IBUF_OP_INSERT];
+		value = ibuf.n_discarded_ops[IBUF_OP_INSERT];
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE:
-		value = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK];
+		value = ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK];
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE:
-		value = ibuf->n_discarded_ops[IBUF_OP_DELETE];
+		value = ibuf.n_discarded_ops[IBUF_OP_DELETE];
 		break;
 
 	case MONITOR_OVLD_IBUF_MERGES:
-		value = ibuf->n_merges;
+		value = ibuf.n_merges;
 		break;
 
 	case MONITOR_OVLD_IBUF_SIZE:
-		value = ibuf->size;
+		value = ibuf.size;
 		break;
 
 	case MONITOR_OVLD_SERVER_ACTIVITY:
@@ -1987,34 +1896,39 @@ srv_mon_process_existing_counter(
 		break;
 
 	case MONITOR_OVLD_LSN_FLUSHDISK:
-		value = (mon_type_t) log_sys.flushed_to_disk_lsn;
+		value = log_sys.get_flushed_lsn();
 		break;
 
 	case MONITOR_OVLD_LSN_CURRENT:
-		value = (mon_type_t) log_sys.lsn;
+		value = log_sys.get_lsn();
 		break;
 
 	case MONITOR_PENDING_LOG_FLUSH:
-		mutex_enter(&log_sys.mutex);
-		value = static_cast<mon_type_t>(log_sys.n_pending_flushes);
-		mutex_exit(&log_sys.mutex);
+		value = static_cast<mon_type_t>(log_sys.pending_flushes);
+
 		break;
 
 	case MONITOR_PENDING_CHECKPOINT_WRITE:
-		mutex_enter(&log_sys.mutex);
-		value = static_cast<mon_type_t>(
-		    log_sys.n_pending_checkpoint_writes);
-		mutex_exit(&log_sys.mutex);
+		value = log_sys.checkpoint_pending;
 		break;
 
 	case MONITOR_LOG_IO:
-		mutex_enter(&log_sys.mutex);
+		mysql_mutex_lock(&log_sys.mutex);
 		value = static_cast<mon_type_t>(log_sys.n_log_ios);
-		mutex_exit(&log_sys.mutex);
+		mysql_mutex_unlock(&log_sys.mutex);
+		break;
+
+	case MONITOR_LSN_CHECKPOINT_AGE:
+		mysql_mutex_lock(&log_sys.mutex);
+		value = static_cast<mon_type_t>(log_sys.get_lsn()
+						- log_sys.last_checkpoint_lsn);
+		mysql_mutex_unlock(&log_sys.mutex);
 		break;
 
 	case MONITOR_OVLD_BUF_OLDEST_LSN:
-		value = (mon_type_t) buf_pool_get_oldest_modification();
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		value = (mon_type_t) buf_pool.get_oldest_modification(0);
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 		break;
 
 	case MONITOR_OVLD_LSN_CHECKPOINT:
@@ -2025,10 +1939,6 @@ srv_mon_process_existing_counter(
 		value = log_sys.max_modified_age_async;
 		break;
 
-	case MONITOR_OVLD_MAX_AGE_SYNC:
-		value = log_sys.max_modified_age_sync;
-		break;
-
 #ifdef BTR_CUR_HASH_ADAPT
 	case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH:
 		value = btr_cur_n_sea;
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index b8423fb0f4d..a308839ce4f 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -43,8 +43,8 @@ Created 10/8/1995 Heikki Tuuri
 // JAN: TODO: MySQL 5.7 missing header
 //#include "my_thread.h"
 //
-// #include "mysql/psi/mysql_stage.h"
-// #include "mysql/psi/psi.h"
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
 
 #include "btr0sea.h"
 #include "buf0flu.h"
@@ -55,7 +55,6 @@ Created 10/8/1995 Heikki Tuuri
 #include "lock0lock.h"
 #include "log0recv.h"
 #include "mem0mem.h"
-#include "os0proc.h"
 #include "pars0pars.h"
 #include "que0que.h"
 #include "row0mysql.h"
@@ -72,7 +71,8 @@ Created 10/8/1995 Heikki Tuuri
 #include "fil0fil.h"
 #include "fil0crypt.h"
 #include "fil0pagecompress.h"
-#include "btr0scrub.h"
+#include "trx0types.h"
+#include <list>
 
 #include <my_service_manager.h>
 /* The following is the maximum allowed duration of a lock wait. */
@@ -82,14 +82,6 @@ UNIV_INTERN ulong	srv_fatal_semaphore_wait_threshold =  DEFAULT_SRV_FATAL_SEMAPH
 in microseconds, in order to reduce the lagging of the purge thread. */
 ulint	srv_dml_needed_delay;
 
-bool	srv_monitor_active;
-bool	srv_error_monitor_active;
-bool	srv_buf_dump_thread_active;
-bool	srv_dict_stats_thread_active;
-bool	srv_buf_resize_thread_active;
-
-my_bool	srv_scrub_log;
-
 const char*	srv_main_thread_op_info = "";
 
 /** Prefix used by MySQL to indicate pre-5.1 table name encoding */
@@ -116,9 +108,6 @@ segment). It is quite possible that some of the tablespaces doesn't host
 any of the rollback-segment based on configuration used. */
 ulint	srv_undo_tablespaces_active;
 
-/* The number of rollback segments to use */
-ulong	srv_undo_logs;
-
 /** Rate at which UNDO records should be purged. */
 ulong	srv_purge_rseg_truncate_frequency;
 
@@ -132,10 +121,6 @@ my_bool	srv_undo_log_truncate;
 /** Maximum size of undo tablespace. */
 unsigned long long	srv_max_undo_log_size;
 
-/** Default undo tablespace size in UNIV_PAGEs count (10MB). */
-const ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES =
-	((1024 * 1024) * 10) / UNIV_PAGE_SIZE_DEF;
-
 /** Set if InnoDB must operate in read-only mode. We don't do any
 recovery and open all tables in RO mode instead of RW mode. We don't
 sync the max trx id to disk either. */
@@ -147,9 +132,6 @@ my_bool	srv_file_per_table;
 is greater than SRV_FORCE_NO_TRX_UNDO. */
 my_bool	high_level_read_only;
 
-/** Place locks to records only i.e. do not use next-key locking except
-on duplicate key checking and foreign key checking */
-ibool	srv_locks_unsafe_for_binlog;
 /** Sort buffer size in index creation */
 ulong	srv_sort_buf_size;
 /** Maximum modification log file size for online index creation */
@@ -176,7 +158,6 @@ static os_event_t	srv_master_thread_disabled_event;
 /*------------------------- LOG FILES ------------------------ */
 char*	srv_log_group_home_dir;
 
-ulong	srv_n_log_files;
 /** The InnoDB redo log file size, or 0 when changing the redo log format
 at startup (while disallowing writes to the redo log). */
 ulonglong	srv_log_file_size;
@@ -200,9 +181,12 @@ my_bool	srv_adaptive_flushing;
 /** innodb_flush_sync; whether to ignore io_capacity at log checkpoints */
 my_bool	srv_flush_sync;
 
+/** Common thread pool */
+tpool::thread_pool* srv_thread_pool;
+
 /** Maximum number of times allowed to conditionally acquire
 mutex before switching to blocking wait on the mutex */
-#define MAX_MUTEX_NOWAIT	20
+#define MAX_MUTEX_NOWAIT	2
 
 /** Check whether the number of failed nonblocking mutex
 acquisition attempts exceeds maximum allowed value. If so,
@@ -215,13 +199,6 @@ ulint	srv_buf_pool_size;
 /** Requested buffer pool chunk size. Each buffer pool instance consists
 of one or more chunks. */
 ulong	srv_buf_pool_chunk_unit;
-/** innodb_buffer_pool_instances (0 is interpreted as 1) */
-ulong	srv_buf_pool_instances;
-/** Default value of innodb_buffer_pool_instances */
-const ulong	srv_buf_pool_instances_default = 0;
-/** innodb_page_hash_locks (a debug-only parameter);
-number of locks to protect buf_pool->page_hash */
-ulong	srv_n_page_hash_locks = 16;
 /** innodb_lru_scan_depth; number of blocks scanned in LRU flush batch */
 ulong	srv_LRU_scan_depth;
 /** innodb_flush_neighbors; whether or not to flush neighbors of a block */
@@ -242,9 +219,9 @@ ulong srv_buf_pool_load_pages_abort = LONG_MAX;
 ulint	srv_lock_table_size	= ULINT_MAX;
 
 /** innodb_read_io_threads */
-ulong	srv_n_read_io_threads;
+uint	srv_n_read_io_threads;
 /** innodb_write_io_threads */
-ulong	srv_n_write_io_threads;
+uint	srv_n_write_io_threads;
 
 /** innodb_random_read_ahead */
 my_bool	srv_random_read_ahead;
@@ -268,9 +245,6 @@ ulong	srv_io_capacity;
 /** innodb_io_capacity_max */
 ulong	srv_max_io_capacity;
 
-/** innodb_page_cleaners; the number of page cleaner threads */
-ulong	srv_n_page_cleaners;
-
 /* The InnoDB main thread tries to keep the ratio of modified pages
 in the buffer pool to all database pages in the buffer pool smaller than
 the following number. But it is not guaranteed that the value stays below
@@ -289,8 +263,8 @@ double	srv_adaptive_flushing_lwm;
 adaptive flushing is averaged */
 ulong	srv_flushing_avg_loops;
 
-/** innodb_purge_threads; the number of purge threads to use */
-ulong	srv_n_purge_threads;
+/** innodb_purge_threads; the number of purge tasks to use */
+uint srv_n_purge_threads;
 
 /** innodb_purge_batch_size, in pages */
 ulong	srv_purge_batch_size;
@@ -319,10 +293,10 @@ my_bool	srv_print_all_deadlocks;
 INFORMATION_SCHEMA.innodb_cmp_per_index */
 my_bool	srv_cmp_per_index_enabled;
 
-/** innodb_fast_shutdown; if 1 then we do not run purge and insert buffer
-merge to completion before shutdown. If it is set to 2, do not even flush the
-buffer pool to data files at the shutdown: we effectively 'crash'
-InnoDB (but lose no committed transactions). */
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transactions (to be done on restart). */
 uint	srv_fast_shutdown;
 
 /** copy of innodb_status_file; generate a innodb_status.<pid> file */
@@ -359,14 +333,6 @@ my_bool	srv_stats_sample_traditional;
 
 my_bool	srv_use_doublewrite_buf;
 
-/** innodb_doublewrite_batch_size (a debug parameter) specifies the
-number of pages to use in LRU and flush_list batch flushing.
-The rest of the doublewrite buffer is used for single-page flushing. */
-ulong	srv_doublewrite_batch_size = 120;
-
-/** innodb_replication_delay */
-ulong	srv_replication_delay;
-
 /** innodb_sync_spin_loops */
 ulong	srv_n_spin_wait_rounds;
 /** innodb_spin_wait_delay */
@@ -415,11 +381,7 @@ my_bool	srv_force_primary_key;
 /** Key version to encrypt the temporary tablespace */
 my_bool innodb_encrypt_temporary_tables;
 
-/* Array of English strings describing the current state of an
-i/o handler thread */
-
-const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
-const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+my_bool srv_immediate_scrub_data_uncompressed;
 
 static time_t	srv_last_monitor_time;
 
@@ -443,16 +405,16 @@ FILE*	srv_misc_tmpfile;
 static ulint	srv_main_thread_process_no;
 static ulint	srv_main_thread_id;
 
-/* The following counts are used by the srv_master_thread. */
+/* The following counts are used by the srv_master_callback. */
 
 /** Iterations of the loop bounded by 'srv_active' label. */
-static ulint		srv_main_active_loops;
+ulint		srv_main_active_loops;
 /** Iterations of the loop bounded by the 'srv_idle' label. */
-static ulint		srv_main_idle_loops;
+ulint		srv_main_idle_loops;
 /** Iterations of the loop bounded by the 'srv_shutdown' label. */
 static ulint		srv_main_shutdown_loops;
 /** Log writes involving flush. */
-static ulint		srv_log_writes_and_flush;
+ulint		srv_log_writes_and_flush;
 
 /* This is only ever touched by the master thread. It records the
 time when the last flush of log file has happened. The master
@@ -469,24 +431,11 @@ current_time % 60 == 0 and no tasks will be performed when
 current_time % 5 != 0. */
 
 # define	SRV_MASTER_CHECKPOINT_INTERVAL		(7)
-#ifdef MEM_PERIODIC_CHECK
-# define	SRV_MASTER_MEM_VALIDATE_INTERVAL	(13)
-#endif /* MEM_PERIODIC_CHECK */
 # define	SRV_MASTER_DICT_LRU_INTERVAL		(47)
 
 /** Buffer pool dump status frequence in percentages */
 UNIV_INTERN ulong srv_buf_dump_status_frequency;
 
-/** Acquire the system_mutex. */
-#define srv_sys_mutex_enter() do {			\
-	mutex_enter(&srv_sys.mutex);			\
-} while (0)
-
-/** Release the system mutex. */
-#define srv_sys_mutex_exit() do {			\
-	mutex_exit(&srv_sys.mutex);			\
-} while (0)
-
 /*
 	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
 	=========================================
@@ -566,25 +515,6 @@ struct srv_sys_t{
 	UT_LIST_BASE_NODE_T(que_thr_t)
 			tasks;			/*!< task queue */
 
-	ib_mutex_t	mutex;			/*!< variable protecting the
-						fields below. */
-	ulint		n_sys_threads;		/*!< size of the sys_threads
-						array */
-
-	srv_slot_t
-	sys_threads[srv_max_purge_threads + 1];	/*!< server thread table;
-						os_event_set() and
-						os_event_reset() on
-						sys_threads[]->event are
-						covered by srv_sys_t::mutex */
-
-	Atomic_counter<ulint>
-			n_threads_active[SRV_MASTER + 1];
-						/*!< number of threads active
-						in a thread class; protected
-						by both std::atomic and
-						mutex */
-
 	srv_stats_t::ulint_ctr_1_t
 			activity_count;		/*!< For tracking server
 						activity */
@@ -592,26 +522,24 @@ struct srv_sys_t{
 
 static srv_sys_t	srv_sys;
 
-/** @return whether the purge coordinator thread is active */
-bool purge_sys_t::running()
+/*
+  Structure shared by timer and coordinator_callback.
+  No protection necessary since timer and task never run
+  in parallel (being in the same task group of size 1).
+*/
+struct purge_coordinator_state
 {
-  return srv_sys.n_threads_active[SRV_PURGE];
-}
-
-/** Event to signal srv_monitor_thread. Not protected by a mutex.
-Set after setting srv_print_innodb_monitor. */
-os_event_t	srv_monitor_event;
+  /** Snapshot of the last history length before the purge call. */
+  uint32 m_history_length;
+  Atomic_counter<int> m_running;
+  purge_coordinator_state() : m_history_length(), m_running(0) {}
+};
 
-/** Event to signal the shutdown of srv_error_monitor_thread.
-Not protected by a mutex. */
-os_event_t	srv_error_event;
+static purge_coordinator_state purge_state;
 
-/** Event for waking up buf_dump_thread. Not protected by a mutex.
-Set on shutdown or by buf_dump_start() or buf_load_start(). */
-os_event_t	srv_buf_dump_event;
+/** threadpool timer for srv_monitor_task() */
+std::unique_ptr<tpool::timer> srv_monitor_timer;
 
-/** Event to signal the buffer pool resize thread */
-os_event_t	srv_buf_resize_event;
 
 /** The buffer pool dump/load file name */
 char*	srv_buf_dump_filename;
@@ -621,28 +549,13 @@ and/or load it during startup. */
 char	srv_buffer_pool_dump_at_shutdown = TRUE;
 char	srv_buffer_pool_load_at_startup = TRUE;
 
-/** Slot index in the srv_sys.sys_threads array for the master thread. */
-#define SRV_MASTER_SLOT 0
-
-/** Slot index in the srv_sys.sys_threads array for the purge thread. */
-#define SRV_PURGE_SLOT 1
-
-/** Slot index in the srv_sys.sys_threads array from which purge workers start.
-  */
-#define SRV_WORKER_SLOTS_START 2
-
 #ifdef HAVE_PSI_STAGE_INTERFACE
 /** Performance schema stage event for monitoring ALTER TABLE progress
-everything after flush log_make_checkpoint(). */
+in ha_innobase::commit_inplace_alter_table(). */
 PSI_stage_info	srv_stage_alter_table_end
 	= {0, "alter table (end)", PSI_FLAG_STAGE_PROGRESS};
 
 /** Performance schema stage event for monitoring ALTER TABLE progress
-log_make_checkpoint(). */
-PSI_stage_info	srv_stage_alter_table_flush
-	= {0, "alter table (flush)", PSI_FLAG_STAGE_PROGRESS};
-
-/** Performance schema stage event for monitoring ALTER TABLE progress
 row_merge_insert_index_tuples(). */
 PSI_stage_info	srv_stage_alter_table_insert
 	= {0, "alter table (insert)", PSI_FLAG_STAGE_PROGRESS};
@@ -689,344 +602,63 @@ srv_print_master_thread_info(
 		srv_log_writes_and_flush);
 }
 
-/*********************************************************************//**
-Sets the info describing an i/o thread current state. */
-void
-srv_set_io_thread_op_info(
-/*======================*/
-	ulint		i,	/*!< in: the 'segment' of the i/o thread */
-	const char*	str)	/*!< in: constant char string describing the
-				state */
+static void thread_pool_thread_init()
 {
-	ut_a(i < SRV_MAX_N_IO_THREADS);
-
-	srv_io_thread_op_info[i] = str;
+	my_thread_init();
+	pfs_register_thread(thread_pool_thread_key);
 }
-
-/*********************************************************************//**
-Resets the info describing an i/o thread current state. */
-void
-srv_reset_io_thread_op_info()
-/*=========================*/
+static void thread_pool_thread_end()
 {
-	for (ulint i = 0; i < UT_ARR_SIZE(srv_io_thread_op_info); ++i) {
-		srv_io_thread_op_info[i] = "not started yet";
-	}
+	pfs_delete_thread();
+	my_thread_end();
 }
 
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Validates the type of a thread table slot.
-@return TRUE if ok */
-static
-ibool
-srv_thread_type_validate(
-/*=====================*/
-	srv_thread_type	type)	/*!< in: thread type */
-{
-	switch (type) {
-	case SRV_NONE:
-		break;
-	case SRV_WORKER:
-	case SRV_PURGE:
-	case SRV_MASTER:
-		return(TRUE);
-	}
-	ut_error;
-	return(FALSE);
-}
-#endif /* UNIV_DEBUG */
 
-/*********************************************************************//**
-Gets the type of a thread table slot.
-@return thread type */
-static
-srv_thread_type
-srv_slot_get_type(
-/*==============*/
-	const srv_slot_t*	slot)	/*!< in: thread slot */
+#ifndef DBUG_OFF
+static void dbug_after_task_callback()
 {
-	srv_thread_type	type = slot->type;
-	ut_ad(srv_thread_type_validate(type));
-	return(type);
-}
-
-/*********************************************************************//**
-Reserves a slot in the thread table for the current thread.
-@return reserved slot */
-static
-srv_slot_t*
-srv_reserve_slot(
-/*=============*/
-	srv_thread_type	type)	/*!< in: type of the thread */
-{
-	srv_slot_t*	slot = 0;
-
-	srv_sys_mutex_enter();
-
-	ut_ad(srv_thread_type_validate(type));
-
-	switch (type) {
-	case SRV_MASTER:
-		slot = &srv_sys.sys_threads[SRV_MASTER_SLOT];
-		break;
-
-	case SRV_PURGE:
-		slot = &srv_sys.sys_threads[SRV_PURGE_SLOT];
-		break;
-
-	case SRV_WORKER:
-		/* Find an empty slot, skip the master and purge slots. */
-		for (slot = &srv_sys.sys_threads[SRV_WORKER_SLOTS_START];
-		     slot->in_use;
-		     ++slot) {
-
-			ut_a(slot < &srv_sys.sys_threads[
-				     srv_sys.n_sys_threads]);
-		}
-		break;
-
-	case SRV_NONE:
-		ut_error;
-	}
-
-	ut_a(!slot->in_use);
-
-	slot->in_use = TRUE;
-	slot->suspended = FALSE;
-	slot->type = type;
-
-	ut_ad(srv_slot_get_type(slot) == type);
-
-	srv_sys.n_threads_active[type]++;
-
-	srv_sys_mutex_exit();
-
-	return(slot);
-}
-
-/*********************************************************************//**
-Suspends the calling thread to wait for the event in its thread slot.
-@return the current signal count of the event. */
-static
-int64_t
-srv_suspend_thread_low(
-/*===================*/
-	srv_slot_t*	slot)	/*!< in/out: thread slot */
-{
-	ut_ad(!srv_read_only_mode);
-	ut_ad(mutex_own(&srv_sys.mutex));
-
-	ut_ad(slot->in_use);
-
-	srv_thread_type	type = srv_slot_get_type(slot);
-
-	switch (type) {
-	case SRV_NONE:
-		ut_error;
-
-	case SRV_MASTER:
-		/* We have only one master thread and it
-		should be the first entry always. */
-		ut_a(srv_sys.n_threads_active[type] == 1);
-		break;
-
-	case SRV_PURGE:
-		/* We have only one purge coordinator thread
-		and it should be the second entry always. */
-		ut_a(srv_sys.n_threads_active[type] == 1);
-		break;
-
-	case SRV_WORKER:
-		ut_a(srv_n_purge_threads > 1);
-		break;
-	}
-
-	ut_a(!slot->suspended);
-	slot->suspended = TRUE;
-
-	if (srv_sys.n_threads_active[type]-- == 0) {
-		ut_error;
-	}
-
-	return(os_event_reset(slot->event));
+  ut_ad(!sync_check_iterate(sync_check()));
 }
+#endif
 
-/*********************************************************************//**
-Suspends the calling thread to wait for the event in its thread slot.
-@return the current signal count of the event. */
-static
-int64_t
-srv_suspend_thread(
-/*===============*/
-	srv_slot_t*	slot)	/*!< in/out: thread slot */
+void srv_thread_pool_init()
 {
-	srv_sys_mutex_enter();
-
-	int64_t		sig_count = srv_suspend_thread_low(slot);
+  DBUG_ASSERT(!srv_thread_pool);
 
-	srv_sys_mutex_exit();
-
-	return(sig_count);
+#if defined (_WIN32)
+  srv_thread_pool= tpool::create_thread_pool_win();
+#else
+  srv_thread_pool= tpool::create_thread_pool_generic();
+#endif
+  srv_thread_pool->set_thread_callbacks(thread_pool_thread_init,
+                                        thread_pool_thread_end);
+#ifndef DBUG_OFF
+  tpool::set_after_task_callback(dbug_after_task_callback);
+#endif
 }
 
-/** Resume the calling thread.
-@param[in,out]	slot		thread slot
-@param[in]	sig_count	signal count (if wait)
-@param[in]	wait		whether to wait for the event
-@param[in]	timeout_usec	timeout in microseconds (0=infinite)
-@return	whether the wait timed out */
-static
-bool
-srv_resume_thread(srv_slot_t* slot, int64_t sig_count = 0, bool wait = true,
-		  ulint timeout_usec = 0)
-{
-	bool	timeout;
-
-	ut_ad(!srv_read_only_mode);
-	ut_ad(slot->in_use);
-	ut_ad(slot->suspended);
-
-	if (!wait) {
-		timeout = false;
-	} else if (timeout_usec) {
-		timeout = OS_SYNC_TIME_EXCEEDED == os_event_wait_time_low(
-			slot->event, timeout_usec, sig_count);
-	} else {
-		timeout = false;
-		os_event_wait_low(slot->event, sig_count);
-	}
-
-	srv_sys_mutex_enter();
-	ut_ad(slot->in_use);
-	ut_ad(slot->suspended);
 
-	slot->suspended = FALSE;
-	srv_sys.n_threads_active[slot->type]++;
-	srv_sys_mutex_exit();
-	return(timeout);
-}
-
-/** Ensure that a given number of threads of the type given are running
-(or are already terminated).
-@param[in]	type	thread type
-@param[in]	n	number of threads that have to run */
-void
-srv_release_threads(enum srv_thread_type type, ulint n)
+void srv_thread_pool_end()
 {
-	ulint	running;
-
-	ut_ad(srv_thread_type_validate(type));
-	ut_ad(n > 0);
-
-	do {
-		running = 0;
-
-		srv_sys_mutex_enter();
-
-		for (ulint i = 0; i < srv_sys.n_sys_threads; i++) {
-			srv_slot_t*	slot = &srv_sys.sys_threads[i];
-
-			if (!slot->in_use || srv_slot_get_type(slot) != type) {
-				continue;
-			} else if (!slot->suspended) {
-				if (++running >= n) {
-					break;
-				}
-				continue;
-			}
-
-			switch (type) {
-			case SRV_NONE:
-				ut_error;
-
-			case SRV_MASTER:
-				/* We have only one master thread and it
-				should be the first entry always. */
-				ut_a(n == 1);
-				ut_a(i == SRV_MASTER_SLOT);
-				ut_a(srv_sys.n_threads_active[type] == 0);
-				break;
-
-			case SRV_PURGE:
-				/* We have only one purge coordinator thread
-				and it should be the second entry always. */
-				ut_a(n == 1);
-				ut_a(i == SRV_PURGE_SLOT);
-				ut_a(srv_n_purge_threads > 0);
-				ut_a(srv_sys.n_threads_active[type] == 0);
-				break;
-
-			case SRV_WORKER:
-				ut_a(srv_n_purge_threads > 1);
-				ut_a(srv_sys.n_threads_active[type]
-				     < srv_n_purge_threads - 1);
-				break;
-			}
-
-			os_event_set(slot->event);
-		}
-
-		srv_sys_mutex_exit();
-	} while (running && running < n);
+  ut_ad(!srv_master_timer);
+  delete srv_thread_pool;
+  srv_thread_pool= nullptr;
 }
 
-/*********************************************************************//**
-Release a thread's slot. */
-static
-void
-srv_free_slot(
-/*==========*/
-	srv_slot_t*	slot)	/*!< in/out: thread slot */
-{
-	srv_sys_mutex_enter();
-
-	/* Mark the thread as inactive. */
-	srv_suspend_thread_low(slot);
-	/* Free the slot for reuse. */
-	ut_ad(slot->in_use);
-	slot->in_use = FALSE;
-
-	srv_sys_mutex_exit();
-}
+static bool need_srv_free;
 
 /** Initialize the server. */
-static
-void
-srv_init()
+static void srv_init()
 {
 	mutex_create(LATCH_ID_SRV_INNODB_MONITOR, &srv_innodb_monitor_mutex);
 
-	srv_sys.n_sys_threads = srv_read_only_mode
-		? 0
-		: srv_n_purge_threads + 1/* purge coordinator */;
-
 	if (!srv_read_only_mode) {
-		mutex_create(LATCH_ID_SRV_SYS, &srv_sys.mutex);
-
 		mutex_create(LATCH_ID_SRV_SYS_TASKS, &srv_sys.tasks_mutex);
 
-		for (ulint i = 0; i < srv_sys.n_sys_threads; ++i) {
-			srv_slot_t*	slot = &srv_sys.sys_threads[i];
-
-			slot->event = os_event_create(0);
-
-			ut_a(slot->event);
-		}
-
-		srv_error_event = os_event_create(0);
-
-		srv_monitor_event = os_event_create(0);
-
-		srv_buf_dump_event = os_event_create(0);
-
-		buf_flush_event = os_event_create("buf_flush_event");
-
 		UT_LIST_INIT(srv_sys.tasks, &que_thr_t::queue);
 	}
 
-	srv_buf_resize_event = os_event_create(0);
-
+	need_srv_free = true;
 	ut_d(srv_master_thread_disabled_event = os_event_create(0));
 
 	/* page_zip_stat_per_index_mutex is acquired from:
@@ -1039,14 +671,8 @@ srv_init()
 	mutex_create(LATCH_ID_PAGE_ZIP_STAT_PER_INDEX,
 		     &page_zip_stat_per_index_mutex);
 
-	/* Create dummy indexes for infimum and supremum records */
-
-	dict_ind_init();
-
 	/* Initialize some INFORMATION SCHEMA internal structures */
 	trx_i_s_cache_init(trx_i_s_cache);
-
-	ut_crc32_init();
 }
 
 /*********************************************************************//**
@@ -1055,7 +681,7 @@ void
 srv_free(void)
 /*==========*/
 {
-	if (!srv_buf_resize_event) {
+	if (!need_srv_free) {
 		return;
 	}
 
@@ -1063,26 +689,13 @@ srv_free(void)
 	mutex_free(&page_zip_stat_per_index_mutex);
 
 	if (!srv_read_only_mode) {
-		mutex_free(&srv_sys.mutex);
 		mutex_free(&srv_sys.tasks_mutex);
-
-		for (ulint i = 0; i < srv_sys.n_sys_threads; ++i) {
-			os_event_destroy(srv_sys.sys_threads[i].event);
-		}
-
-		os_event_destroy(srv_error_event);
-		os_event_destroy(srv_monitor_event);
-		os_event_destroy(srv_buf_dump_event);
-		os_event_destroy(buf_flush_event);
 	}
 
-	os_event_destroy(srv_buf_resize_event);
-
 	ut_d(os_event_destroy(srv_master_thread_disabled_event));
 
-	dict_ind_free();
-
 	trx_i_s_cache_free(trx_i_s_cache);
+	srv_thread_pool_end();
 }
 
 /*********************************************************************//**
@@ -1091,8 +704,8 @@ void
 srv_boot(void)
 /*==========*/
 {
+	srv_thread_pool_init();
 	sync_check_init();
-	recv_sys_var_init();
 	trx_pool_init();
 	row_mysql_init();
 	srv_init();
@@ -1100,16 +713,11 @@ srv_boot(void)
 
 /******************************************************************//**
 Refreshes the values used to calculate per-second averages. */
-static
-void
-srv_refresh_innodb_monitor_stats(void)
-/*==================================*/
+static void srv_refresh_innodb_monitor_stats(time_t current_time)
 {
 	mutex_enter(&srv_innodb_monitor_mutex);
 
-	time_t current_time = time(NULL);
-
-	if (difftime(current_time, srv_last_monitor_time) <= 60) {
+	if (difftime(current_time, srv_last_monitor_time) < 60) {
 		/* We referesh InnoDB Monitor values so that averages are
 		printed from at most 60 last seconds */
 		mutex_exit(&srv_innodb_monitor_mutex);
@@ -1127,7 +735,7 @@ srv_refresh_innodb_monitor_stats(void)
 
 	log_refresh_stats();
 
-	buf_refresh_io_stats_all();
+	buf_refresh_io_stats();
 
 	srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
 	srv_n_rows_updated_old = srv_stats.n_rows_updated;
@@ -1253,44 +861,28 @@ srv_printf_innodb_monitor(
 	ibuf_print(file);
 
 #ifdef BTR_CUR_HASH_ADAPT
-	btr_search_x_lock_all();
 	for (ulint i = 0; i < btr_ahi_parts && btr_search_enabled; ++i) {
-		const hash_table_t* table = btr_search_sys->hash_tables[i];
-
-		ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-		/* this is only used for buf_pool->page_hash */
-		ut_ad(!table->heaps);
-		/* this is used for the adaptive hash index */
-		ut_ad(table->heap);
-
-		const mem_heap_t* heap = table->heap;
-		/* The heap may change during the following call,
-		so the data displayed may be garbage. We intentionally
-		avoid acquiring btr_search_latches[] so that the
-		diagnostic output will not stop here even in case another
-		thread hangs while holding btr_search_latches[].
-
-		This should be safe from crashes, because
-		table->heap will be pointing to the same object
-		for the full lifetime of the server. Even during
-		btr_search_disable() the heap will stay valid. */
+		const auto part= &btr_search_sys.parts[i];
+		rw_lock_s_lock(&part->latch);
+		ut_ad(part->heap->type == MEM_HEAP_FOR_BTR_SEARCH);
 		fprintf(file, "Hash table size " ULINTPF
 			", node heap has " ULINTPF " buffer(s)\n",
-			table->n_cells, heap->base.count - !heap->free_block);
+			part->table.n_cells,
+			part->heap->base.count - !part->heap->free_block);
+		rw_lock_s_unlock(&part->latch);
 	}
-	btr_search_x_unlock_all();
 
 	fprintf(file,
 		"%.2f hash searches/s, %.2f non-hash searches/s\n",
-		(btr_cur_n_sea - btr_cur_n_sea_old)
+		static_cast<double>(btr_cur_n_sea - btr_cur_n_sea_old)
 		/ time_elapsed,
-		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+		static_cast<double>(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
 		/ time_elapsed);
 	btr_cur_n_sea_old = btr_cur_n_sea;
 #else /* BTR_CUR_HASH_ADAPT */
 	fprintf(file,
 		"%.2f non-hash searches/s\n",
-		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+		static_cast<double>(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
 		/ time_elapsed);
 #endif /* BTR_CUR_HASH_ADAPT */
 	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
@@ -1307,19 +899,13 @@ srv_printf_innodb_monitor(
 		"Total large memory allocated " ULINTPF "\n"
 		"Dictionary memory allocated " ULINTPF "\n",
 		ulint{os_total_large_mem_allocated},
-		dict_sys_get_size());
+		dict_sys.rough_size());
 
 	buf_print_io(file);
 
 	fputs("--------------\n"
 	      "ROW OPERATIONS\n"
 	      "--------------\n", file);
-	fprintf(file,
-		ULINTPF " queries inside InnoDB, "
-		ULINTPF " queries in queue\n",
-		srv_conc_get_active_threads(),
-		srv_conc_get_waiting_threads());
-
 	fprintf(file, ULINTPF " read views open inside InnoDB\n",
 		trx_sys.view_count());
 
@@ -1349,13 +935,17 @@ srv_printf_innodb_monitor(
 	fprintf(file,
 		"%.2f inserts/s, %.2f updates/s,"
 		" %.2f deletes/s, %.2f reads/s\n",
-		((ulint) srv_stats.n_rows_inserted - srv_n_rows_inserted_old)
+		static_cast<double>(srv_stats.n_rows_inserted
+				    - srv_n_rows_inserted_old)
 		/ time_elapsed,
-		((ulint) srv_stats.n_rows_updated - srv_n_rows_updated_old)
+		static_cast<double>(srv_stats.n_rows_updated
+				    - srv_n_rows_updated_old)
 		/ time_elapsed,
-		((ulint) srv_stats.n_rows_deleted - srv_n_rows_deleted_old)
+		static_cast<double>(srv_stats.n_rows_deleted
+				    - srv_n_rows_deleted_old)
 		/ time_elapsed,
-		((ulint) srv_stats.n_rows_read - srv_n_rows_read_old)
+		static_cast<double>(srv_stats.n_rows_read
+				    - srv_n_rows_read_old)
 		/ time_elapsed);
 	fprintf(file,
 		"Number of system rows inserted " ULINTPF
@@ -1368,14 +958,18 @@ srv_printf_innodb_monitor(
 	fprintf(file,
 		"%.2f inserts/s, %.2f updates/s,"
 		" %.2f deletes/s, %.2f reads/s\n",
-		((ulint) srv_stats.n_system_rows_inserted
-		 - srv_n_system_rows_inserted_old) / time_elapsed,
-		((ulint) srv_stats.n_system_rows_updated
-		 - srv_n_system_rows_updated_old) / time_elapsed,
-		((ulint) srv_stats.n_system_rows_deleted
-		 - srv_n_system_rows_deleted_old) / time_elapsed,
-		((ulint) srv_stats.n_system_rows_read
-		 - srv_n_system_rows_read_old) / time_elapsed);
+		static_cast<double>(srv_stats.n_system_rows_inserted
+				    - srv_n_system_rows_inserted_old)
+		/ time_elapsed,
+		static_cast<double>(srv_stats.n_system_rows_updated
+				    - srv_n_system_rows_updated_old)
+		/ time_elapsed,
+		static_cast<double>(srv_stats.n_system_rows_deleted
+				    - srv_n_system_rows_deleted_old)
+		/ time_elapsed,
+		static_cast<double>(srv_stats.n_system_rows_read
+				    - srv_n_system_rows_read_old)
+		/ time_elapsed);
 	srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
 	srv_n_rows_updated_old = srv_stats.n_rows_updated;
 	srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
@@ -1400,22 +994,30 @@ void
 srv_export_innodb_status(void)
 /*==========================*/
 {
-	buf_pool_stat_t		stat;
-	buf_pools_list_size_t	buf_pools_list_size;
-	ulint			LRU_len;
-	ulint			free_len;
-	ulint			flush_list_len;
 	fil_crypt_stat_t	crypt_stat;
-	btr_scrub_stat_t	scrub_stat;
 
-	buf_get_total_stat(&stat);
-	buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-	buf_get_total_list_size_in_bytes(&buf_pools_list_size);
 	if (!srv_read_only_mode) {
 		fil_crypt_total_stat(&crypt_stat);
-		btr_scrub_total_stat(&scrub_stat);
 	}
 
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint mem_adaptive_hash = 0;
+	for (ulong i = 0; i < btr_ahi_parts; i++) {
+		const auto part= &btr_search_sys.parts[i];
+		rw_lock_s_lock(&part->latch);
+		if (part->heap) {
+			ut_ad(part->heap->type == MEM_HEAP_FOR_BTR_SEARCH);
+
+			mem_adaptive_hash += mem_heap_get_size(part->heap)
+				+ part->table.n_cells * sizeof(hash_cell_t);
+		}
+		rw_lock_s_unlock(&part->latch);
+	}
+	export_vars.innodb_mem_adaptive_hash = mem_adaptive_hash;
+#endif
+
+	export_vars.innodb_mem_dictionary = dict_sys.rough_size();
+
 	mutex_enter(&srv_innodb_monitor_mutex);
 
 	export_vars.innodb_data_pending_reads =
@@ -1425,7 +1027,7 @@ srv_export_innodb_status(void)
 		ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
 
 	export_vars.innodb_data_pending_fsyncs =
-		fil_n_pending_log_flushes
+		log_sys.get_pending_flushes()
 		+ fil_n_pending_tablespace_flushes;
 
 	export_vars.innodb_data_fsyncs = os_n_fsyncs;
@@ -1436,67 +1038,81 @@ srv_export_innodb_status(void)
 
 	export_vars.innodb_data_writes = os_n_file_writes;
 
-	export_vars.innodb_data_written = srv_stats.data_written;
+	ulint dblwr = 0;
 
-	export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
+	if (buf_dblwr.is_initialised()) {
+		buf_dblwr.lock();
+		dblwr = buf_dblwr.submitted();
+		export_vars.innodb_dblwr_pages_written = buf_dblwr.written();
+		export_vars.innodb_dblwr_writes = buf_dblwr.batches();
+		buf_dblwr.unlock();
+	}
 
-	export_vars.innodb_buffer_pool_write_requests =
-		srv_stats.buf_pool_write_requests;
+	export_vars.innodb_data_written = srv_stats.data_written + dblwr;
 
-	export_vars.innodb_buffer_pool_wait_free =
-		srv_stats.buf_pool_wait_free;
+	export_vars.innodb_buffer_pool_read_requests
+		= buf_pool.stat.n_page_gets;
 
-	export_vars.innodb_buffer_pool_pages_flushed =
-		srv_stats.buf_pool_flushed;
+	export_vars.innodb_buffer_pool_write_requests =
+		srv_stats.buf_pool_write_requests;
 
 	export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
 
 	export_vars.innodb_buffer_pool_read_ahead_rnd =
-		stat.n_ra_pages_read_rnd;
+		buf_pool.stat.n_ra_pages_read_rnd;
 
 	export_vars.innodb_buffer_pool_read_ahead =
-		stat.n_ra_pages_read;
+		buf_pool.stat.n_ra_pages_read;
 
 	export_vars.innodb_buffer_pool_read_ahead_evicted =
-		stat.n_ra_pages_evicted;
+		buf_pool.stat.n_ra_pages_evicted;
 
-	export_vars.innodb_buffer_pool_pages_data = LRU_len;
+	export_vars.innodb_buffer_pool_pages_data =
+		UT_LIST_GET_LEN(buf_pool.LRU);
 
 	export_vars.innodb_buffer_pool_bytes_data =
-		buf_pools_list_size.LRU_bytes
-		+ buf_pools_list_size.unzip_LRU_bytes;
+		buf_pool.stat.LRU_bytes
+		+ (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+		   << srv_page_size_shift);
 
-	export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
+	export_vars.innodb_buffer_pool_pages_dirty =
+		UT_LIST_GET_LEN(buf_pool.flush_list);
+
+	export_vars.innodb_buffer_pool_pages_made_young
+		= buf_pool.stat.n_pages_made_young;
+	export_vars.innodb_buffer_pool_pages_made_not_young
+		= buf_pool.stat.n_pages_not_made_young;
+
+	export_vars.innodb_buffer_pool_pages_old = buf_pool.LRU_old_len;
 
 	export_vars.innodb_buffer_pool_bytes_dirty =
-		buf_pools_list_size.flush_list_bytes;
+		buf_pool.stat.flush_list_bytes;
 
-	export_vars.innodb_buffer_pool_pages_free = free_len;
+	export_vars.innodb_buffer_pool_pages_free =
+		UT_LIST_GET_LEN(buf_pool.free);
 
 #ifdef UNIV_DEBUG
 	export_vars.innodb_buffer_pool_pages_latched =
 		buf_get_latched_pages_number();
 #endif /* UNIV_DEBUG */
-	export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();
+	export_vars.innodb_buffer_pool_pages_total = buf_pool.get_n_pages();
 
 	export_vars.innodb_buffer_pool_pages_misc =
-		buf_pool_get_n_pages() - LRU_len - free_len;
+		buf_pool.get_n_pages()
+		- UT_LIST_GET_LEN(buf_pool.LRU)
+		- UT_LIST_GET_LEN(buf_pool.free);
 
-#ifdef HAVE_ATOMIC_BUILTINS
-	export_vars.innodb_have_atomic_builtins = 1;
-#else
-	export_vars.innodb_have_atomic_builtins = 0;
-#endif
-
-	export_vars.innodb_page_size = srv_page_size;
+	export_vars.innodb_max_trx_id = trx_sys.get_max_trx_id();
+	export_vars.innodb_history_list_length = trx_sys.rseg_history_len;
 
 	export_vars.innodb_log_waits = srv_stats.log_waits;
 
 	export_vars.innodb_os_log_written = srv_stats.os_log_written;
 
-	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
+	export_vars.innodb_os_log_fsyncs = log_sys.get_flushes();
 
-	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
+	export_vars.innodb_os_log_pending_fsyncs
+		= log_sys.get_pending_flushes();
 
 	export_vars.innodb_os_log_pending_writes =
 		srv_stats.os_log_pending_writes;
@@ -1505,17 +1121,6 @@ srv_export_innodb_status(void)
 
 	export_vars.innodb_log_writes = srv_stats.log_writes;
 
-	export_vars.innodb_dblwr_pages_written =
-		srv_stats.dblwr_pages_written;
-
-	export_vars.innodb_dblwr_writes = srv_stats.dblwr_writes;
-
-	export_vars.innodb_pages_created = stat.n_pages_created;
-
-	export_vars.innodb_pages_read = stat.n_pages_read;
-
-	export_vars.innodb_pages_written = stat.n_pages_written;
-
 	export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count;
 
 	export_vars.innodb_row_lock_current_waits =
@@ -1555,12 +1160,9 @@ srv_export_innodb_status(void)
 	export_vars.innodb_system_rows_deleted =
 		srv_stats.n_system_rows_deleted;
 
-	export_vars.innodb_num_open_files = fil_system.n_open;
-
 	export_vars.innodb_truncated_status_writes =
 		srv_truncated_status_writes;
 
-	export_vars.innodb_available_undo_logs = srv_available_undo_logs;
 	export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved;
 	export_vars.innodb_index_pages_written = srv_stats.index_pages_written;
 	export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written;
@@ -1596,105 +1198,79 @@ srv_export_innodb_status(void)
 		srv_stats.n_sec_rec_cluster_reads_avoided;
 
 	if (!srv_read_only_mode) {
-	export_vars.innodb_encryption_rotation_pages_read_from_cache =
-		crypt_stat.pages_read_from_cache;
-	export_vars.innodb_encryption_rotation_pages_read_from_disk =
-		crypt_stat.pages_read_from_disk;
-	export_vars.innodb_encryption_rotation_pages_modified =
-		crypt_stat.pages_modified;
-	export_vars.innodb_encryption_rotation_pages_flushed =
-		crypt_stat.pages_flushed;
-	export_vars.innodb_encryption_rotation_estimated_iops =
-		crypt_stat.estimated_iops;
-	export_vars.innodb_encryption_key_requests =
-		srv_stats.n_key_requests;
-	export_vars.innodb_key_rotation_list_length =
-		srv_stats.key_rotation_list_length;
-
-	export_vars.innodb_scrub_page_reorganizations =
-		scrub_stat.page_reorganizations;
-	export_vars.innodb_scrub_page_splits =
-		scrub_stat.page_splits;
-	export_vars.innodb_scrub_page_split_failures_underflow =
-		scrub_stat.page_split_failures_underflow;
-	export_vars.innodb_scrub_page_split_failures_out_of_filespace =
-		scrub_stat.page_split_failures_out_of_filespace;
-	export_vars.innodb_scrub_page_split_failures_missing_index =
-		scrub_stat.page_split_failures_missing_index;
-	export_vars.innodb_scrub_page_split_failures_unknown =
-		scrub_stat.page_split_failures_unknown;
-	export_vars.innodb_scrub_log = srv_stats.n_log_scrubs;
+		export_vars.innodb_encryption_rotation_pages_read_from_cache =
+			crypt_stat.pages_read_from_cache;
+		export_vars.innodb_encryption_rotation_pages_read_from_disk =
+			crypt_stat.pages_read_from_disk;
+		export_vars.innodb_encryption_rotation_pages_modified =
+			crypt_stat.pages_modified;
+		export_vars.innodb_encryption_rotation_pages_flushed =
+			crypt_stat.pages_flushed;
+		export_vars.innodb_encryption_rotation_estimated_iops =
+			crypt_stat.estimated_iops;
+		export_vars.innodb_encryption_key_requests =
+			srv_stats.n_key_requests;
+		export_vars.innodb_key_rotation_list_length =
+			srv_stats.key_rotation_list_length;
 	}
 
 	mutex_exit(&srv_innodb_monitor_mutex);
+
+	mysql_mutex_lock(&log_sys.mutex);
+	export_vars.innodb_lsn_current = log_sys.get_lsn();
+	export_vars.innodb_lsn_flushed = log_sys.get_flushed_lsn();
+	export_vars.innodb_lsn_last_checkpoint = log_sys.last_checkpoint_lsn;
+	export_vars.innodb_checkpoint_max_age = static_cast<ulint>(
+		log_sys.max_checkpoint_age);
+	mysql_mutex_unlock(&log_sys.mutex);
+
+	export_vars.innodb_checkpoint_age = static_cast<ulint>(
+		export_vars.innodb_lsn_current
+		- export_vars.innodb_lsn_last_checkpoint);
 }
 
-/*********************************************************************//**
-A thread which prints the info output by various InnoDB monitors.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(srv_monitor_thread)(void*)
+struct srv_monitor_state_t
 {
-	int64_t		sig_count;
-	double		time_elapsed;
-	time_t		current_time;
-	time_t		last_monitor_time;
-	ulint		mutex_skipped;
-	ibool		last_srv_print_monitor;
-
-	ut_ad(!srv_read_only_mode);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Lock timeout thread starts, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_monitor_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-	current_time = time(NULL);
-	srv_last_monitor_time = current_time;
-	last_monitor_time = current_time;
-	mutex_skipped = 0;
-	last_srv_print_monitor = srv_print_innodb_monitor;
-loop:
-	/* Wake up every 5 seconds to see if we need to print
-	monitor information or if signalled at shutdown. */
-
-	sig_count = os_event_reset(srv_monitor_event);
+  time_t last_monitor_time;
+  ulint mutex_skipped;
+  bool last_srv_print_monitor;
+  srv_monitor_state_t() : mutex_skipped(0), last_srv_print_monitor(false)
+  {
+    srv_last_monitor_time = time(NULL);
+    last_monitor_time= srv_last_monitor_time;
+  }
+};
 
-	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
+static srv_monitor_state_t monitor_state;
 
-	current_time = time(NULL);
-
-	time_elapsed = difftime(current_time, last_monitor_time);
+/** A task which prints the info output by various InnoDB monitors.*/
+static void srv_monitor()
+{
+	time_t current_time = time(NULL);
 
-	if (time_elapsed > 15) {
-		last_monitor_time = current_time;
+	if (difftime(current_time, monitor_state.last_monitor_time) >= 15) {
+		monitor_state.last_monitor_time = current_time;
 
 		if (srv_print_innodb_monitor) {
 			/* Reset mutex_skipped counter everytime
 			srv_print_innodb_monitor changes. This is to
 			ensure we will not be blocked by lock_sys.mutex
-			for short duration information printing,
-			such as requested by sync_array_print_long_waits() */
-			if (!last_srv_print_monitor) {
-				mutex_skipped = 0;
-				last_srv_print_monitor = TRUE;
+			for short duration information printing */
+			if (!monitor_state.last_srv_print_monitor) {
+				monitor_state.mutex_skipped = 0;
+				monitor_state.last_srv_print_monitor = true;
 			}
 
 			if (!srv_printf_innodb_monitor(stderr,
-						MUTEX_NOWAIT(mutex_skipped),
+						MUTEX_NOWAIT(monitor_state.mutex_skipped),
 						NULL, NULL)) {
-				mutex_skipped++;
+				monitor_state.mutex_skipped++;
 			} else {
 				/* Reset the counter */
-				mutex_skipped = 0;
+				monitor_state.mutex_skipped = 0;
 			}
 		} else {
-			last_srv_print_monitor = FALSE;
+			monitor_state.last_monitor_time = 0;
 		}
 
 
@@ -1705,11 +1281,11 @@ loop:
 			mutex_enter(&srv_monitor_file_mutex);
 			rewind(srv_monitor_file);
 			if (!srv_printf_innodb_monitor(srv_monitor_file,
-						MUTEX_NOWAIT(mutex_skipped),
+						MUTEX_NOWAIT(monitor_state.mutex_skipped),
 						NULL, NULL)) {
-				mutex_skipped++;
+				monitor_state.mutex_skipped++;
 			} else {
-				mutex_skipped = 0;
+				monitor_state.mutex_skipped = 0;
 			}
 
 			os_file_set_eof(srv_monitor_file);
@@ -1717,78 +1293,33 @@ loop:
 		}
 	}
 
-	srv_refresh_innodb_monitor_stats();
-
-	if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
-		goto exit_func;
-	}
-
-	if (srv_print_innodb_monitor
-	    || srv_print_innodb_lock_monitor) {
-		goto loop;
-	}
-
-	goto loop;
-
-exit_func:
-	srv_monitor_active = false;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
+	srv_refresh_innodb_monitor_stats(current_time);
 }
 
 /*********************************************************************//**
-A thread which prints warnings about semaphore waits which have lasted
+A task which prints warnings about semaphore waits which have lasted
 too long. These can be used to track bugs which cause hangs.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(srv_error_monitor_thread)(void*)
+*/
+void srv_monitor_task(void*)
 {
 	/* number of successive fatal timeouts observed */
-	ulint		fatal_cnt	= 0;
-	lsn_t		old_lsn;
-	lsn_t		new_lsn;
-	int64_t		sig_count;
+	static ulint		fatal_cnt;
+	static lsn_t		old_lsn = recv_sys.recovered_lsn;
 	/* longest waiting thread for a semaphore */
-	os_thread_id_t	waiter		= os_thread_get_curr_id();
-	os_thread_id_t	old_waiter	= waiter;
+	os_thread_id_t	waiter;
+	static os_thread_id_t	old_waiter = os_thread_get_curr_id();
 	/* the semaphore that is being waited for */
 	const void*	sema		= NULL;
-	const void*	old_sema	= NULL;
+	static const void*	old_sema	= NULL;
 
 	ut_ad(!srv_read_only_mode);
 
-	old_lsn = srv_start_lsn;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Error monitor thread starts, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_error_monitor_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-loop:
 	/* Try to track a strange bug reported by Harald Fuchs and others,
 	where the lsn seems to decrease at times */
 
-	if (log_peek_lsn(&new_lsn)) {
-		if (new_lsn < old_lsn) {
-		ib::error() << "Old log sequence number " << old_lsn << " was"
-			<< " greater than the new log sequence number "
-			<< new_lsn << ". Please submit a bug report to"
-			" https://jira.mariadb.org/";
-			ut_ad(0);
-		}
-
-		old_lsn = new_lsn;
-	}
+	lsn_t new_lsn = log_sys.get_lsn();
+	ut_a(new_lsn >= old_lsn);
+	old_lsn = new_lsn;
 
 	/* Update the statistics collected for deciding LRU
 	eviction policy. */
@@ -1796,8 +1327,7 @@ loop:
 
 	if (sync_array_print_long_waits(&waiter, &sema)
 	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
-		fatal_cnt++;
-		if (fatal_cnt > 10) {
+		if (fatal_cnt++) {
 			ib::fatal() << "Semaphore wait has lasted > "
 				<< srv_fatal_semaphore_wait_threshold
 				<< " seconds. We intentionally crash the"
@@ -1809,28 +1339,7 @@ loop:
 		old_sema = sema;
 	}
 
-	/* Flush stderr so that a database user gets the output
-	to possible MySQL error file */
-
-	fflush(stderr);
-
-	sig_count = os_event_reset(srv_error_event);
-
-	os_event_wait_time_low(srv_error_event, 1000000, sig_count);
-
-	if (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
-
-		goto loop;
-	}
-
-	srv_error_monitor_active = false;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
+	srv_monitor();
 }
 
 /******************************************************************//**
@@ -1842,92 +1351,108 @@ srv_inc_activity_count(void)
 	srv_sys.activity_count.inc();
 }
 
-/**********************************************************************//**
-Check whether any background thread is active. If so return the thread
-type.
-@return SRV_NONE if all are suspended or have exited, thread
-type if any are still active. */
-srv_thread_type
-srv_get_active_thread_type(void)
-/*============================*/
+#ifdef UNIV_DEBUG
+/** @return whether purge or master task is active */
+bool srv_any_background_activity()
 {
-	srv_thread_type ret = SRV_NONE;
-
-	if (srv_read_only_mode) {
-		return(SRV_NONE);
-	}
-
-	srv_sys_mutex_enter();
-
-	for (ulint i = SRV_WORKER; i <= SRV_MASTER; ++i) {
-		if (srv_sys.n_threads_active[i] != 0) {
-			ret = static_cast<srv_thread_type>(i);
-			break;
-		}
-	}
+  if (purge_sys.enabled() || srv_master_timer.get())
+  {
+    ut_ad(!srv_read_only_mode);
+    return true;
+  }
+  return false;
+}
+#endif /* UNIV_DEBUG */
 
-	srv_sys_mutex_exit();
+static void purge_worker_callback(void*);
+static void purge_coordinator_callback(void*);
+static void purge_coordinator_timer_callback(void*);
 
-	if (ret == SRV_NONE && purge_sys.enabled()) {
-		ret = SRV_PURGE;
-	}
+static tpool::task_group purge_task_group;
+tpool::waitable_task purge_worker_task(purge_worker_callback, nullptr,
+                                       &purge_task_group);
+static tpool::task_group purge_coordinator_task_group(1);
+static tpool::waitable_task purge_coordinator_task
+  (purge_coordinator_callback, nullptr, &purge_coordinator_task_group);
 
-	return(ret);
-}
+static tpool::timer *purge_coordinator_timer;
 
-/** Wake up the InnoDB master thread if it was suspended (not sleeping). */
+/** Wake up the purge threads if there is work to do. */
 void
-srv_active_wake_master_thread_low()
+srv_wake_purge_thread_if_not_active()
 {
 	ut_ad(!srv_read_only_mode);
-	ut_ad(!mutex_own(&srv_sys.mutex));
-
-	srv_inc_activity_count();
-
-	if (srv_sys.n_threads_active[SRV_MASTER] == 0) {
-		srv_slot_t*	slot;
 
-		srv_sys_mutex_enter();
-
-		slot = &srv_sys.sys_threads[SRV_MASTER_SLOT];
-
-		/* Only if the master thread has been started. */
-
-		if (slot->in_use) {
-			ut_a(srv_slot_get_type(slot) == SRV_MASTER);
-			os_event_set(slot->event);
+	if (purge_sys.enabled() && !purge_sys.paused()
+	    && trx_sys.rseg_history_len) {
+		if(++purge_state.m_running == 1) {
+			srv_thread_pool->submit_task(&purge_coordinator_task);
 		}
-
-		srv_sys_mutex_exit();
 	}
 }
 
-/** Wake up the purge threads if there is work to do. */
-void
-srv_wake_purge_thread_if_not_active()
+/** @return whether the purge tasks are active */
+bool purge_sys_t::running() const
 {
-	ut_ad(!srv_read_only_mode);
-	ut_ad(!mutex_own(&srv_sys.mutex));
+  return purge_coordinator_task.is_running();
+}
 
-	if (purge_sys.enabled() && !purge_sys.paused()
-	    && !srv_sys.n_threads_active[SRV_PURGE]
-	    && trx_sys.rseg_history_len) {
+/** Stop purge during FLUSH TABLES FOR EXPORT */
+void purge_sys_t::stop()
+{
+  rw_lock_x_lock(&latch);
 
-		srv_release_threads(SRV_PURGE, 1);
-	}
+  if (!enabled())
+  {
+    /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
+    ut_ad(!srv_undo_sources);
+    rw_lock_x_unlock(&latch);
+    return;
+  }
+
+  ut_ad(srv_n_purge_threads > 0);
+
+  const auto paused= m_paused++;
+
+  rw_lock_x_unlock(&latch);
+
+  if (!paused)
+  {
+    ib::info() << "Stopping purge";
+    MONITOR_ATOMIC_INC(MONITOR_PURGE_STOP_COUNT);
+    purge_coordinator_task.disable();
+  }
 }
 
-/** Wake up the master thread if it is suspended or being suspended. */
-void
-srv_wake_master_thread()
+/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+void purge_sys_t::resume()
 {
-	srv_inc_activity_count();
-	srv_release_threads(SRV_MASTER, 1);
+   if (!enabled())
+   {
+     /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
+     ut_ad(!srv_undo_sources);
+     return;
+   }
+   ut_ad(!srv_read_only_mode);
+   ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+   ut_ad(!sync_check_iterate(sync_check()));
+   purge_coordinator_task.enable();
+   rw_lock_x_lock(&latch);
+   int32_t paused= m_paused--;
+   ut_a(paused);
+
+   if (paused == 1)
+   {
+     ib::info() << "Resuming purge";
+     purge_state.m_running = 0;
+     srv_wake_purge_thread_if_not_active();
+     MONITOR_ATOMIC_INC(MONITOR_PURGE_RESUME_COUNT);
+   }
+   rw_lock_x_unlock(&latch);
 }
 
 /*******************************************************************//**
-Get current server activity count. We don't hold srv_sys::mutex while
-reading this value as it is only used in heuristics.
+Get current server activity count.
 @return activity count. */
 ulint
 srv_get_activity_count(void)
@@ -1936,15 +1461,19 @@ srv_get_activity_count(void)
 	return(srv_sys.activity_count);
 }
 
-/*******************************************************************//**
-Check if there has been any activity.
-@return FALSE if no change in activity counter. */
-ibool
-srv_check_activity(
-/*===============*/
-	ulint		old_activity_count)	/*!< in: old activity count */
+/** Check if srv_inc_activity_count() has been called.
+@param activity_count   copy of srv_sys.activity_count
+@return whether the activity_count had changed */
+static bool srv_check_activity(ulint *activity_count)
 {
-	return(srv_sys.activity_count != old_activity_count);
+  ulint new_activity_count= srv_sys.activity_count;
+  if (new_activity_count != *activity_count)
+  {
+    *activity_count= new_activity_count;
+    return true;
+  }
+
+  return false;
 }
 
 /********************************************************************//**
@@ -1962,7 +1491,7 @@ srv_sync_log_buffer_in_background(void)
 	srv_main_thread_op_info = "flushing log";
 	if (difftime(current_time, srv_last_log_flush_time)
 	    >= srv_flush_log_at_timeout) {
-		log_buffer_sync_in_background(true);
+		log_buffer_flush_to_disk();
 		srv_last_log_flush_time = current_time;
 		srv_log_writes_and_flush++;
 	}
@@ -2112,13 +1641,6 @@ srv_master_do_active_tasks(void)
 	srv_main_thread_op_info = "checking free log space";
 	log_free_check();
 
-	/* Do an ibuf merge */
-	srv_main_thread_op_info = "doing insert buffer merge";
-	counter_time = microsecond_interval_timer();
-	ibuf_merge_in_background(false);
-	MONITOR_INC_TIME_IN_MICRO_SECS(
-		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
-
 	/* Flush logs if needed */
 	srv_main_thread_op_info = "flushing log";
 	srv_sync_log_buffer_in_background();
@@ -2142,28 +1664,6 @@ srv_master_do_active_tasks(void)
 		MONITOR_INC_TIME_IN_MICRO_SECS(
 			MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
 	}
-
-	/* The periodic log_checkpoint() call here makes it harder to
-	reproduce bugs in crash recovery or mariabackup --prepare, or
-	in code that writes the redo log records. Omitting the call
-	here should not affect correctness, because log_free_check()
-	should still be invoking checkpoints when needed. In a
-	production server, those calls could cause "furious flushing"
-	and stall the server. Normally we want to perform checkpoints
-	early and often to avoid those situations. */
-	DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", return;);
-
-	if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
-		return;
-	}
-
-	/* Make a new checkpoint */
-	if (cur_time % SRV_MASTER_CHECKPOINT_INTERVAL == 0) {
-		srv_main_thread_op_info = "making checkpoint";
-		log_checkpoint(true);
-		MONITOR_INC_TIME_IN_MICRO_SECS(
-			MONITOR_SRV_CHECKPOINT_MICROSECOND, counter_time);
-	}
 }
 
 /*********************************************************************//**
@@ -2205,13 +1705,6 @@ srv_master_do_idle_tasks(void)
 	srv_main_thread_op_info = "checking free log space";
 	log_free_check();
 
-	/* Do an ibuf merge */
-	counter_time = microsecond_interval_timer();
-	srv_main_thread_op_info = "doing insert buffer merge";
-	ibuf_merge_in_background(true);
-	MONITOR_INC_TIME_IN_MICRO_SECS(
-		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
-
 	if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
 		return;
 	}
@@ -2229,37 +1722,12 @@ srv_master_do_idle_tasks(void)
 	srv_sync_log_buffer_in_background();
 	MONITOR_INC_TIME_IN_MICRO_SECS(
 		MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
-
-	/* The periodic log_checkpoint() call here makes it harder to
-	reproduce bugs in crash recovery or mariabackup --prepare, or
-	in code that writes the redo log records. Omitting the call
-	here should not affect correctness, because log_free_check()
-	should still be invoking checkpoints when needed. In a
-	production server, those calls could cause "furious flushing"
-	and stall the server. Normally we want to perform checkpoints
-	early and often to avoid those situations. */
-	DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", return;);
-
-	if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
-		return;
-	}
-
-	/* Make a new checkpoint */
-	srv_main_thread_op_info = "making checkpoint";
-	log_checkpoint(true);
-	MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_CHECKPOINT_MICROSECOND,
-				       counter_time);
-
-	/* This is a workaround to avoid the InnoDB hang when OS datetime
-	changed backwards.*/
-	os_event_set(buf_flush_event);
 }
 
-/** Perform shutdown tasks.
-@param[in]	ibuf_merge	whether to complete the change buffer merge */
-static
-void
-srv_shutdown(bool ibuf_merge)
+/**
+Complete the shutdown tasks such as background DROP TABLE,
+and optionally change buffer merge (on innodb_fast_shutdown=0). */
+void srv_shutdown(bool ibuf_merge)
 {
 	ulint		n_bytes_merged	= 0;
 	ulint		n_tables_to_drop;
@@ -2279,7 +1747,7 @@ srv_shutdown(bool ibuf_merge)
 			srv_main_thread_op_info = "checking free log space";
 			log_free_check();
 			srv_main_thread_op_info = "doing insert buffer merge";
-			n_bytes_merged = ibuf_merge_in_background(true);
+			n_bytes_merged = ibuf_merge_all();
 
 			/* Flush logs if needed */
 			srv_sync_log_buffer_in_background();
@@ -2293,96 +1761,21 @@ srv_shutdown(bool ibuf_merge)
 	} while (n_bytes_merged || n_tables_to_drop);
 }
 
-/*********************************************************************//**
-Puts master thread to sleep. At this point we are using polling to
-service various activities. Master thread sleeps for one second before
-checking the state of the server again */
-static
-void
-srv_master_sleep(void)
-/*==================*/
-{
-	srv_main_thread_op_info = "sleeping";
-	os_thread_sleep(1000000);
-	srv_main_thread_op_info = "";
-}
-
-/*********************************************************************//**
-The master thread controlling the server.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(srv_master_thread)(
-/*==============================*/
-	void*	arg MY_ATTRIBUTE((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
+/** The periodic master task controlling the server. */
+void srv_master_callback(void*)
 {
-	my_thread_init();
-	DBUG_ENTER("srv_master_thread");
-
-	srv_slot_t*	slot;
-	ulint		old_activity_count = srv_get_activity_count();
-
-	ut_ad(!srv_read_only_mode);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Master thread starts, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_master_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-	srv_main_thread_process_no = os_proc_get_number();
-	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
-
-	slot = srv_reserve_slot(SRV_MASTER);
-	ut_a(slot == srv_sys.sys_threads);
+	static ulint old_activity_count;
 
-loop:
-	while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
-		srv_master_sleep();
+	ut_a(srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
 
-		MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
-
-		if (srv_check_activity(old_activity_count)) {
-			old_activity_count = srv_get_activity_count();
-			srv_master_do_active_tasks();
-		} else {
-			srv_master_do_idle_tasks();
-		}
-	}
-
-	switch (srv_shutdown_state) {
-	case SRV_SHUTDOWN_NONE:
-	case SRV_SHUTDOWN_INITIATED:
-		break;
-	case SRV_SHUTDOWN_FLUSH_PHASE:
-	case SRV_SHUTDOWN_LAST_PHASE:
-		ut_ad(0);
-		/* fall through */
-	case SRV_SHUTDOWN_EXIT_THREADS:
-		/* srv_init_abort() must have been invoked */
-	case SRV_SHUTDOWN_CLEANUP:
-		if (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP
-		    && srv_fast_shutdown < 2) {
-			srv_shutdown(srv_fast_shutdown == 0);
-		}
-		srv_suspend_thread(slot);
-		my_thread_end();
-		os_thread_exit();
+	srv_main_thread_op_info = "";
+	MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
+	if (srv_check_activity(&old_activity_count)) {
+		srv_master_do_active_tasks();
+	} else {
+		srv_master_do_idle_tasks();
 	}
-
-	srv_main_thread_op_info = "suspending";
-
-	srv_suspend_thread(slot);
-
-	srv_main_thread_op_info = "waiting for server activity";
-
-	srv_resume_thread(slot);
-	goto loop;
+	srv_main_thread_op_info = "sleeping";
 }
 
 /** @return whether purge should exit due to shutdown */
@@ -2421,7 +1814,7 @@ static bool srv_purge_should_exit()
 Fetch and execute a task from the work queue.
 @param [in,out]	slot	purge worker thread slot
 @return true if a task was executed */
-static bool srv_task_execute(ut_d(srv_slot_t *slot))
+static bool srv_task_execute()
 {
 	ut_ad(!srv_read_only_mode);
 	ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
@@ -2432,9 +1825,7 @@ static bool srv_task_execute(ut_d(srv_slot_t *slot))
 		ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE);
 		UT_LIST_REMOVE(srv_sys.tasks, thr);
 		mutex_exit(&srv_sys.tasks_mutex);
-		ut_d(thr->thread_slot = slot);
 		que_run_threads(thr);
-	        purge_sys.n_tasks.fetch_sub(1, std::memory_order_release);
 		return true;
 	}
 
@@ -2443,88 +1834,23 @@ static bool srv_task_execute(ut_d(srv_slot_t *slot))
 	return false;
 }
 
-/*********************************************************************//**
-Worker thread that reads tasks from the work queue and executes them.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(srv_worker_thread)(
-/*==============================*/
-	void*	arg MY_ATTRIBUTE((unused)))	/*!< in: a dummy parameter
-						required by os_thread_create */
-{
-	my_thread_init();
-
-	srv_slot_t*	slot;
-
-	ut_ad(!srv_read_only_mode);
-	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
-	my_thread_init();
-	THD*		thd = innobase_create_background_thd("InnoDB purge worker");
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Worker thread starting, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	slot = srv_reserve_slot(SRV_WORKER);
-
-#ifdef UNIV_DEBUG
-	UT_LIST_INIT(slot->debug_sync,
-		     &srv_slot_t::debug_sync_t::debug_sync_list);
-	rw_lock_create(PFS_NOT_INSTRUMENTED, &slot->debug_sync_lock,
-		       SYNC_NO_ORDER_CHECK);
-#endif
-
-	ut_a(srv_n_purge_threads > 1);
-	ut_a(ulong(srv_sys.n_threads_active[SRV_WORKER])
-	     < srv_n_purge_threads);
+static void purge_create_background_thds(int );
 
-	/* We need to ensure that the worker threads exit after the
-	purge coordinator thread. Otherwise the purge coordinator can
-	end up waiting forever in trx_purge_wait_for_workers_to_complete() */
-
-	do {
-		srv_suspend_thread(slot);
-		srv_resume_thread(slot);
-
-		if (srv_task_execute(ut_d(slot))) {
-
-			/* If there are tasks in the queue, wakeup
-			the purge coordinator thread. */
-
-			srv_wake_purge_thread_if_not_active();
-		}
-	} while (purge_sys.enabled());
-
-	ut_d(rw_lock_free(&slot->debug_sync_lock));
-
-	srv_free_slot(slot);
-
-	ut_ad(!purge_sys.enabled());
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Purge worker thread exiting, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	innobase_destroy_background_thd(thd);
-	my_thread_end();
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+std::mutex purge_thread_count_mtx;
+void srv_update_purge_thread_count(uint n)
+{
+	std::lock_guard<std::mutex> lk(purge_thread_count_mtx);
+	purge_create_background_thds(n);
+	srv_n_purge_threads = n;
+	srv_purge_thread_count_changed = 1;
 }
 
+Atomic_counter<int> srv_purge_thread_count_changed;
+
 /** Do the actual purge operation.
 @param[in,out]	n_total_purged	total number of purged pages
 @return length of history list before the last purge batch. */
-static uint32_t srv_do_purge(ulint* n_total_purged
-#ifdef UNIV_DEBUG
-			     , srv_slot_t* slot /*!< purge coordinator */
-#endif
-			     )
+static uint32_t srv_do_purge(ulint* n_total_purged)
 {
 	ulint		n_pages_purged;
 
@@ -2532,7 +1858,7 @@ static uint32_t srv_do_purge(ulint* n_total_purged
 	static ulint	n_use_threads = 0;
 	static uint32_t	rseg_history_len = 0;
 	ulint		old_activity_count = srv_get_activity_count();
-	const ulint	n_threads = srv_n_purge_threads;
+	static ulint	n_threads = srv_n_purge_threads;
 
 	ut_a(n_threads > 0);
 	ut_ad(!srv_read_only_mode);
@@ -2548,7 +1874,20 @@ static uint32_t srv_do_purge(ulint* n_total_purged
 	}
 
 	do {
-		if (trx_sys.rseg_history_len > rseg_history_len
+		if (UNIV_UNLIKELY(srv_purge_thread_count_changed)) {
+			/* Read the fresh value of srv_n_purge_threads, reset
+			the changed flag. Both variables are protected by
+			purge_thread_count_mtx.
+
+			This code does not run concurrently, it is executed
+			by a single purge_coordinator thread, and no races
+			involving srv_purge_thread_count_changed are possible.
+			*/
+
+			std::lock_guard<std::mutex> lk(purge_thread_count_mtx);
+			n_threads = n_use_threads = srv_n_purge_threads;
+			srv_purge_thread_count_changed = 0;
+		} else if (trx_sys.rseg_history_len > rseg_history_len
 		    || (srv_max_purge_lag > 0
 			&& rseg_history_len > srv_max_purge_lag)) {
 
@@ -2559,15 +1898,13 @@ static uint32_t srv_do_purge(ulint* n_total_purged
 				++n_use_threads;
 			}
 
-		} else if (srv_check_activity(old_activity_count)
+		} else if (srv_check_activity(&old_activity_count)
 			   && n_use_threads > 1) {
 
 			/* History length same or smaller since last snapshot,
 			use fewer threads. */
 
 			--n_use_threads;
-
-			old_activity_count = srv_get_activity_count();
 		}
 
 		/* Ensure that the purge threads are less than what
@@ -2586,11 +1923,7 @@ static uint32_t srv_do_purge(ulint* n_total_purged
 			!(++count % srv_purge_rseg_truncate_frequency)
 			|| purge_sys.truncate.current
 			|| (srv_shutdown_state != SRV_SHUTDOWN_NONE
-			    && srv_fast_shutdown == 0)
-#ifdef UNIV_DEBUG
-			, slot
-#endif
-					   );
+			    && srv_fast_shutdown == 0));
 
 		*n_total_purged += n_pages_purged;
 	} while (n_pages_purged > 0 && !purge_sys.paused()
@@ -2598,176 +1931,149 @@ static uint32_t srv_do_purge(ulint* n_total_purged
 
 	return(rseg_history_len);
 }
-#ifndef UNIV_DEBUG
-# define srv_do_purge(n_total_purged, slot) srv_do_purge(n_total_purged)
-#endif
 
-/*********************************************************************//**
-Suspend the purge coordinator thread. */
-static
-void
-srv_purge_coordinator_suspend(
-/*==========================*/
-	srv_slot_t*	slot,			/*!< in/out: Purge coordinator
-						thread slot */
-	uint32_t	rseg_history_len)	/*!< in: history list length
-						before last purge */
-{
-	ut_ad(!srv_read_only_mode);
-	ut_a(slot->type == SRV_PURGE);
 
-	bool		stop = false;
+static std::list<THD*> purge_thds;
+static std::mutex purge_thd_mutex;
+extern void* thd_attach_thd(THD*);
+extern void thd_detach_thd(void *);
+static int n_purge_thds;
 
-	/** Maximum wait time on the purge event, in micro-seconds. */
-	static const ulint SRV_PURGE_MAX_TIMEOUT = 10000;
-
-	int64_t		sig_count = srv_suspend_thread(slot);
-
-	do {
-		/* We don't wait right away on the the non-timed wait because
-		we want to signal the thread that wants to suspend purge. */
-		const bool wait = stop
-			|| rseg_history_len <= trx_sys.rseg_history_len;
-		const bool timeout = srv_resume_thread(
-			slot, sig_count, wait,
-			stop ? 0 : SRV_PURGE_MAX_TIMEOUT);
-
-		sig_count = srv_suspend_thread(slot);
-
-		rw_lock_x_lock(&purge_sys.latch);
-
-		stop = srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
-			&& purge_sys.paused();
-
-		if (!stop) {
-			if (timeout
-			    && rseg_history_len < 5000
-			    && rseg_history_len == trx_sys.rseg_history_len) {
-				/* No new records were added since the
-				wait started. Simply wait for new
-				records. The magic number 5000 is an
-				approximation for the case where we
-				have cached UNDO log records which
-				prevent truncate of the UNDO
-				segments. */
-				stop = true;
-			}
-		} else {
-			/* Signal that we are suspended. */
-			os_event_set(purge_sys.event);
-		}
-
-		rw_lock_x_unlock(&purge_sys.latch);
-	} while (stop && srv_undo_sources);
-
-	srv_resume_thread(slot, 0, false);
+/* Ensure that we have at least n background THDs for purge; the caller's current_thd is restored on return. */
+static void purge_create_background_thds(int n)
+{
+	THD *thd= current_thd; /* saved so that it can be restored below */
+	std::unique_lock<std::mutex> lk(purge_thd_mutex);
+	while (n_purge_thds < n)
+	{
+		purge_thds.push_back(innobase_create_background_thd("InnoDB purge worker"));
+		n_purge_thds++;
+	}
+	set_current_thd(thd); /* presumably creating a THD changes current_thd -- confirm */
 }
 
-/*********************************************************************//**
-Purge coordinator thread that schedules the purge tasks.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(srv_purge_coordinator_thread)(
-/*=========================================*/
-	void*	arg MY_ATTRIBUTE((unused)))	/*!< in: a dummy parameter
-						required by os_thread_create */
+static THD *acquire_thd(void **ctx)
 {
-	my_thread_init();
-	THD*		thd = innobase_create_background_thd("InnoDB purge coordinator");
-	srv_slot_t*	slot;
-	ulint           n_total_purged = ULINT_UNDEFINED;
-
-	ut_ad(!srv_read_only_mode);
-	ut_a(srv_n_purge_threads >= 1);
-	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
-
-	purge_sys.coordinator_startup();
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_purge_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Purge coordinator thread created, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	slot = srv_reserve_slot(SRV_PURGE);
-
-#ifdef UNIV_DEBUG
-	UT_LIST_INIT(slot->debug_sync,
-		     &srv_slot_t::debug_sync_t::debug_sync_list);
-	rw_lock_create(PFS_NOT_INSTRUMENTED, &slot->debug_sync_lock,
-		       SYNC_NO_ORDER_CHECK);
-#endif
-	uint32_t rseg_history_len = trx_sys.rseg_history_len;
-
-	do {
-		/* If there are no records to purge or the last
-		purge didn't purge any records then wait for activity. */
-
-		if (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
-		    && srv_undo_sources
-		    && (n_total_purged == 0 || purge_sys.paused())) {
-
-			srv_purge_coordinator_suspend(slot, rseg_history_len);
-		}
-
-		ut_ad(!slot->suspended);
+	std::unique_lock<std::mutex> lk(purge_thd_mutex);
+	ut_a(!purge_thds.empty());
+	THD* thd = purge_thds.front();
+	purge_thds.pop_front();
+	lk.unlock();
+
+	/* Set the current THD, and thd->mysys_var as well,
+	because something in the server might use it. */
+	*ctx = thd_attach_thd(thd);
+	return thd;
+}
 
-		if (srv_purge_should_exit()) {
-			break;
-		}
+static void release_thd(THD *thd, void *ctx)
+{
+	thd_detach_thd(ctx); /* ctx is what acquire_thd() got from thd_attach_thd() */
+	std::unique_lock<std::mutex> lk(purge_thd_mutex);
+	purge_thds.push_back(thd); /* make the THD available to acquire_thd() again */
+	lk.unlock();
+	set_current_thd(0); /* leave this thread without a current THD */
+}
 
-		n_total_purged = 0;
 
-		rseg_history_len = srv_do_purge(&n_total_purged, slot);
-	} while (!srv_purge_should_exit());
+/*
+  Timer callback, used when the purge coordinator has decided to delay
+  the processing of purge records. Wakes purge up unless there is no work.
+*/
+static void purge_coordinator_timer_callback(void *)
+{
+  if (!purge_sys.enabled() || purge_sys.paused() ||
+      purge_state.m_running || !trx_sys.rseg_history_len)
+    return;
+
+  if (purge_state.m_history_length < 5000 &&
+      purge_state.m_history_length == trx_sys.rseg_history_len)
+    /* No new records were added since the wait started.
+    Simply wait for new records. The magic number 5000 is an
+    approximation for the case where we have cached UNDO
+    log records which prevent truncation of the UNDO segments. */
+    return;
+  srv_wake_purge_thread_if_not_active();
+}
 
-	/* The task queue should always be empty, independent of fast
-	shutdown state. */
-	ut_a(srv_get_task_queue_length() == 0);
+static void purge_worker_callback(void*)
+{
+  ut_ad(!current_thd);
+  ut_ad(!srv_read_only_mode);
+  ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+  void *ctx;
+  THD *thd= acquire_thd(&ctx); /* borrow a THD from purge_thds */
+  while (srv_task_execute()) /* keep going until it returns false */
+    ut_ad(purge_sys.running());
+  release_thd(thd,ctx); /* hand the THD back to purge_thds */
+}
 
-	ut_d(rw_lock_free(&slot->debug_sync_lock));
+static void purge_coordinator_callback_low()
+{
+  ulint n_total_purged= ULINT_UNDEFINED;
+  purge_state.m_history_length= 0;
 
-	srv_free_slot(slot);
+  if (!purge_sys.enabled() || purge_sys.paused())
+    return;
+  do
+  {
+    n_total_purged = 0;
+    int sigcount= purge_state.m_running;
 
-	/* Note that we are shutting down. */
-	rw_lock_x_lock(&purge_sys.latch);
-	purge_sys.coordinator_shutdown();
-	/* Ensure that the wait in purge_sys_t::stop() will terminate. */
-	os_event_set(purge_sys.event);
+    purge_state.m_history_length= srv_do_purge(&n_total_purged);
 
-	rw_lock_x_unlock(&purge_sys.latch);
+    /* Check if purge was woken by srv_wake_purge_thread_if_not_active() */
 
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Purge coordinator exiting, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif /* UNIV_DEBUG_THREAD_CREATION */
+    bool woken_during_purge= purge_state.m_running > sigcount;
 
-	/* Ensure that all the worker threads quit. */
-	if (ulint n_workers = srv_n_purge_threads - 1) {
-		const srv_slot_t* slot;
-		const srv_slot_t* const end = &srv_sys.sys_threads[
-			srv_sys.n_sys_threads];
+    /* If the last purge batch processed no pages and there is
+    still work to do, delay the next batch by 10ms, unless
+    someone added work and woke us up in the meantime. */
+    if (n_total_purged == 0)
+    {
+      if (trx_sys.rseg_history_len == 0)
+        return;
+      if (!woken_during_purge)
+      {
+        /* Delay next purge round*/
+        purge_coordinator_timer->set_time(10, 0);
+        return;
+      }
+    }
+  }
+  while ((purge_sys.enabled() && !purge_sys.paused()) ||
+         !srv_purge_should_exit());
+}
 
-		do {
-			srv_release_threads(SRV_WORKER, n_workers);
-			srv_sys_mutex_enter();
-			for (slot = &srv_sys.sys_threads[2];
-			     !slot++->in_use && slot < end; );
-			srv_sys_mutex_exit();
-		} while (slot < end);
-	}
+static void purge_coordinator_callback(void*)
+{
+  void *ctx;
+  THD *thd= acquire_thd(&ctx); /* borrow a THD from purge_thds */
+  purge_coordinator_callback_low();
+  release_thd(thd,ctx);
+  purge_state.m_running= 0; /* the timer callback does nothing while this is nonzero */
+}
 
-	innobase_destroy_background_thd(thd);
-	my_thread_end();
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit();
+void srv_init_purge_tasks()
+{
+  purge_create_background_thds(srv_n_purge_threads); /* at least this many THDs */
+  purge_coordinator_timer= srv_thread_pool->create_timer
+    (purge_coordinator_timer_callback, nullptr); /* armed later via set_time() */
+}
 
-	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+static void srv_shutdown_purge_tasks()
+{
+  purge_coordinator_task.wait(); /* presumably blocks until the task is idle -- confirm */
+  delete purge_coordinator_timer;
+  purge_coordinator_timer= nullptr;
+  purge_worker_task.wait();
+  std::unique_lock<std::mutex> lk(purge_thd_mutex);
+  while (!purge_thds.empty()) /* destroy every pooled background THD */
+  {
+    innobase_destroy_background_thd(purge_thds.front());
+    purge_thds.pop_front();
+  }
+  n_purge_thds= 0;
 }
 
 /**********************************************************************//**
@@ -2784,16 +2090,11 @@ srv_que_task_enqueue_low(
 	UT_LIST_ADD_LAST(srv_sys.tasks, thr);
 
 	mutex_exit(&srv_sys.tasks_mutex);
-
-	srv_release_threads(SRV_WORKER, 1);
 }
 
-/**********************************************************************//**
-Get count of tasks in the queue.
-@return number of tasks in queue */
-ulint
-srv_get_task_queue_length(void)
-/*===========================*/
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length()
 {
 	ulint	n_tasks;
 
@@ -2807,66 +2108,20 @@ srv_get_task_queue_length(void)
 
 	return(n_tasks);
 }
-
-/** Wake up the purge threads. */
-void
-srv_purge_wakeup()
-{
-	ut_ad(!srv_read_only_mode);
-	ut_ad(!sync_check_iterate(sync_check()));
-
-	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
-		return;
-	}
-
-	do {
-		srv_release_threads(SRV_PURGE, 1);
-
-		if (srv_n_purge_threads > 1) {
-			ulint	n_workers = srv_n_purge_threads - 1;
-
-			srv_release_threads(SRV_WORKER, n_workers);
-		}
-	} while (!srv_running.load(std::memory_order_relaxed)
-		 && (srv_sys.n_threads_active[SRV_WORKER]
-		     || srv_sys.n_threads_active[SRV_PURGE]));
-}
+#endif
 
 /** Shut down the purge threads. */
 void srv_purge_shutdown()
 {
-	do {
-		ut_ad(!srv_undo_sources);
-		srv_purge_wakeup();
-	} while (srv_sys.sys_threads[SRV_PURGE_SLOT].in_use);
-}
-
-#ifdef UNIV_DEBUG
-static ulint get_first_slot(srv_thread_type type)
-{
-	switch (type) {
-	case SRV_MASTER:
-		return SRV_MASTER_SLOT;
-	case SRV_PURGE:
-		return SRV_PURGE_SLOT;
-	case SRV_WORKER:
-		/* Find an empty slot, skip the master and purge slots. */
-		return SRV_WORKER_SLOTS_START;
-	default:
-		ut_error;
-	}
-}
-
-void srv_for_each_thread(srv_thread_type type,
-			 srv_slot_callback_t callback,
-			 const void *arg)
-{
-	for (ulint slot_idx= get_first_slot(type);
-	     slot_idx < srv_sys.n_sys_threads
-		     && srv_sys.sys_threads[slot_idx].in_use
-		     && srv_sys.sys_threads[slot_idx].type == type;
-	     slot_idx++) {
-		callback(&srv_sys.sys_threads[slot_idx], arg);
+	if (purge_sys.enabled()) {
+		if (!srv_fast_shutdown && !opt_bootstrap)
+			srv_update_purge_thread_count(innodb_purge_threads_MAX);
+		while(!srv_purge_should_exit()) {
+			ut_a(!purge_sys.paused());
+			srv_wake_purge_thread_if_not_active();
+			os_thread_sleep(1000);
+		}
+		purge_sys.coordinator_shutdown();
+		srv_shutdown_purge_tasks();
 	}
 }
-#endif
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 7ba1b55772a..bf59da1cd16 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -52,6 +52,7 @@ Created 2/16/1996 Heikki Tuuri
 #include "data0type.h"
 #include "dict0dict.h"
 #include "buf0buf.h"
+#include "buf0dblwr.h"
 #include "buf0dump.h"
 #include "os0file.h"
 #include "os0thread.h"
@@ -75,7 +76,6 @@ Created 2/16/1996 Heikki Tuuri
 #include "btr0defragment.h"
 #include "mysql/service_wsrep.h" /* wsrep_recovery */
 #include "trx0rseg.h"
-#include "os0proc.h"
 #include "buf0flu.h"
 #include "buf0rea.h"
 #include "dict0boot.h"
@@ -99,10 +99,11 @@ Created 2/16/1996 Heikki Tuuri
 #include "os0event.h"
 #include "zlib.h"
 #include "ut0crc32.h"
-#include "btr0scrub.h"
 
-/** Log sequence number immediately after startup */
-lsn_t	srv_start_lsn;
+/** We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. srv_start() sets the value. */
+ulint srv_max_n_threads;
+
 /** Log sequence number at shutdown */
 lsn_t	srv_shutdown_lsn;
 
@@ -110,7 +111,7 @@ lsn_t	srv_shutdown_lsn;
 ibool	srv_start_raw_disk_in_use;
 
 /** Number of IO threads to use */
-ulint	srv_n_file_io_threads;
+uint	srv_n_file_io_threads;
 
 /** UNDO tablespaces starts with space id. */
 ulint	srv_undo_space_id_start;
@@ -135,77 +136,27 @@ UNIV_INTERN bool	srv_undo_sources;
 #ifdef UNIV_DEBUG
 /** InnoDB system tablespace to set during recovery */
 UNIV_INTERN uint	srv_sys_space_size_debug;
-/** whether redo log files have been created at startup */
-UNIV_INTERN bool	srv_log_files_created;
+/** whether the redo log file has been created at startup */
+UNIV_INTERN bool	srv_log_file_created;
 #endif /* UNIV_DEBUG */
 
-/** Bit flags for tracking background thread creation. They are used to
-determine which threads need to be stopped if we need to abort during
-the initialisation step. */
-enum srv_start_state_t {
-	/** No thread started */
-	SRV_START_STATE_NONE = 0,		/*!< No thread started */
-	/** lock_wait_timeout_thread started */
-	SRV_START_STATE_LOCK_SYS = 1,		/*!< Started lock-timeout
-						thread. */
-	/** buf_flush_page_cleaner_coordinator,
-	buf_flush_page_cleaner_worker started */
-	SRV_START_STATE_IO = 2,
-	/** srv_error_monitor_thread, srv_monitor_thread started */
-	SRV_START_STATE_MONITOR = 4,
-	/** srv_master_thread started */
-	SRV_START_STATE_MASTER = 8,
-	/** srv_purge_coordinator_thread, srv_worker_thread started */
-	SRV_START_STATE_PURGE = 16,
-	/** fil_crypt_thread, btr_defragment_thread started
-	(all background threads that can generate redo log but not undo log */
-	SRV_START_STATE_REDO = 32
-};
-
-/** Track server thrd starting phases */
-static ulint	srv_start_state;
+/** whether some background threads that create redo log have been started */
+static bool srv_started_redo;
 
 /** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
 SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
 enum srv_shutdown_t	srv_shutdown_state = SRV_SHUTDOWN_NONE;
 
-/** Files comprising the system tablespace */
-pfs_os_file_t	files[1000];
-
-/** io_handler_thread parameters for thread identification */
-static ulint		n[SRV_MAX_N_IO_THREADS + 6];
-/** io_handler_thread identifiers, 32 is the maximum number of purge threads  */
-/** 6 is the ? */
-#define	START_OLD_THREAD_CNT	(SRV_MAX_N_IO_THREADS + 6 + 32)
-static os_thread_id_t	thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32];
-
-/** Thead handles */
-static os_thread_t	thread_handles[SRV_MAX_N_IO_THREADS + 6 + 32];
-static os_thread_t	buf_dump_thread_handle;
-static os_thread_t	dict_stats_thread_handle;
-/** Status variables, is thread started ?*/
-static bool		thread_started[SRV_MAX_N_IO_THREADS + 6 + 32] = {false};
 /** Name of srv_monitor_file */
 static char*	srv_monitor_file_name;
+std::unique_ptr<tpool::timer> srv_master_timer;
 
 /** */
 #define SRV_MAX_N_PENDING_SYNC_IOS	100
 
 #ifdef UNIV_PFS_THREAD
 /* Keys to register InnoDB threads with performance schema */
-mysql_pfs_key_t	buf_dump_thread_key;
-mysql_pfs_key_t	dict_stats_thread_key;
-mysql_pfs_key_t	io_handler_thread_key;
-mysql_pfs_key_t	io_ibuf_thread_key;
-mysql_pfs_key_t	io_log_thread_key;
-mysql_pfs_key_t	io_read_thread_key;
-mysql_pfs_key_t	io_write_thread_key;
-mysql_pfs_key_t	srv_error_monitor_thread_key;
-mysql_pfs_key_t	srv_lock_timeout_thread_key;
-mysql_pfs_key_t	srv_master_thread_key;
-mysql_pfs_key_t	srv_monitor_thread_key;
-mysql_pfs_key_t	srv_purge_thread_key;
-mysql_pfs_key_t	srv_worker_thread_key;
+mysql_pfs_key_t	thread_pool_thread_key;
 #endif /* UNIV_PFS_THREAD */
 
 #ifdef HAVE_PSI_STAGE_INTERFACE
@@ -214,7 +165,6 @@ performance schema. */
 static PSI_stage_info*	srv_stages[] =
 {
 	&srv_stage_alter_table_end,
-	&srv_stage_alter_table_flush,
 	&srv_stage_alter_table_insert,
 	&srv_stage_alter_table_log_index,
 	&srv_stage_alter_table_log_table,
@@ -275,147 +225,20 @@ srv_file_check_mode(
 	return(true);
 }
 
-/********************************************************************//**
-I/o-handler thread function.
-@return OS_THREAD_DUMMY_RETURN */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(io_handler_thread)(
-/*==============================*/
-	void*	arg)	/*!< in: pointer to the number of the segment in
-			the aio array */
-{
-	ulint	segment;
-
-	segment = *((ulint*) arg);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	ib::info() << "Io handler thread " << segment << " starts, id "
-		<< os_thread_pf(os_thread_get_curr_id());
-#endif
-
-	/* For read only mode, we don't need ibuf and log I/O thread.
-	Please see srv_start() */
-	ulint   start = (srv_read_only_mode) ? 0 : 2;
-
-	if (segment < start) {
-		if (segment == 0) {
-			pfs_register_thread(io_ibuf_thread_key);
-		} else {
-			ut_ad(segment == 1);
-			pfs_register_thread(io_log_thread_key);
-		}
-	} else if (segment >= start
-		   && segment < (start + srv_n_read_io_threads)) {
-			pfs_register_thread(io_read_thread_key);
-
-	} else if (segment >= (start + srv_n_read_io_threads)
-		   && segment < (start + srv_n_read_io_threads
-				 + srv_n_write_io_threads)) {
-		pfs_register_thread(io_write_thread_key);
-
-	} else {
-		pfs_register_thread(io_handler_thread_key);
-	}
-
-	while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS
-	       || buf_page_cleaner_is_active
-	       || !os_aio_all_slots_free()) {
-		fil_aio_wait(segment);
-	}
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit.
-	The thread actually never comes here because it is exited in an
-	os_event_wait(). */
+/** Initial number of the redo log file */
+static const char INIT_LOG_FILE0[]= "101";
 
-	os_thread_exit();
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*********************************************************************//**
-Creates a log file.
+/** Creates log file.
+@param[in]  create_new_db   whether the database is being initialized
+@param[in]  lsn		    FIL_PAGE_FILE_FLUSH_LSN value
+@param[out] logfile0        name of the log file
 @return DB_SUCCESS or error code */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-dberr_t
-create_log_file(
-/*============*/
-	pfs_os_file_t*	file,	/*!< out: file handle */
-	const char*	name)	/*!< in: log file name */
+static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
+                               std::string& logfile0)
 {
-	bool		ret;
-
-	*file = os_file_create(
-		innodb_log_file_key, name,
-		OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
-		OS_LOG_FILE, srv_read_only_mode, &ret);
-
-	if (!ret) {
-		ib::error() << "Cannot create " << name;
-		return(DB_ERROR);
-	}
-
-	ib::info() << "Setting log file " << name << " size to "
-		<< srv_log_file_size << " bytes";
-
-	ret = os_file_set_size(name, *file, srv_log_file_size);
-	if (!ret) {
-		ib::error() << "Cannot set log file " << name << " size to "
-			<< srv_log_file_size << " bytes";
-		return(DB_ERROR);
-	}
-
-	ret = os_file_close(*file);
-	ut_a(ret);
-
-	return(DB_SUCCESS);
-}
-
-/** Initial number of the first redo log file */
-#define INIT_LOG_FILE0	(SRV_N_LOG_FILES_MAX + 1)
-
-/** Delete all log files.
-@param[in,out]	logfilename	buffer for log file name
-@param[in]	dirnamelen	length of the directory path
-@param[in]	n_files		number of files to delete
-@param[in]	i		first file to delete */
-static
-void
-delete_log_files(char* logfilename, size_t dirnamelen, uint n_files, uint i=0)
-{
-	/* Remove any old log files. */
-	for (; i < n_files; i++) {
-		sprintf(logfilename + dirnamelen, "ib_logfile%u", i);
-
-		/* Ignore errors about non-existent files or files
-		that cannot be removed. The create_log_file() will
-		return an error when the file exists. */
-#ifdef _WIN32
-		DeleteFile((LPCTSTR) logfilename);
-#else
-		unlink(logfilename);
-#endif
-	}
-}
-
-/*********************************************************************//**
-Creates all log files.
-@return DB_SUCCESS or error code */
-static
-dberr_t
-create_log_files(
-/*=============*/
-	char*	logfilename,	/*!< in/out: buffer for log file name */
-	size_t	dirnamelen,	/*!< in: length of the directory path */
-	lsn_t	lsn,		/*!< in: FIL_PAGE_FILE_FLUSH_LSN value */
-	char*&	logfile0)	/*!< out: name of the first log file */
-{
-	dberr_t err;
-
 	if (srv_read_only_mode) {
-		ib::error() << "Cannot create log files in read-only mode";
-		return(DB_READ_ONLY);
+		ib::error() << "Cannot create log file in read-only mode";
+		return DB_READ_ONLY;
 	}
 
 	if (!log_set_capacity(srv_log_file_size_requested)) {
@@ -424,164 +247,141 @@ create_log_files(
 
 	/* Crashing after deleting the first file should be
 	recoverable. The buffer pool was clean, and we can simply
-	create all log files from the scratch. */
-	DBUG_EXECUTE_IF("innodb_log_abort_6",
-			delete_log_files(logfilename, dirnamelen, 1);
-			return(DB_ERROR););
+	create the log file from scratch. */
+	DBUG_EXECUTE_IF("innodb_log_abort_6", delete_log_file("0");
+			return DB_ERROR;);
 
-	delete_log_files(logfilename, dirnamelen, INIT_LOG_FILE0 + 1);
+	for (size_t i = 0; i < 102; i++) {
+		delete_log_file(std::to_string(i).c_str());
+	}
 
 	DBUG_PRINT("ib_log", ("After innodb_log_abort_6"));
-	ut_ad(!buf_pool_check_no_pending_io());
+	DBUG_ASSERT(!buf_pool.any_io_pending());
 
-	DBUG_EXECUTE_IF("innodb_log_abort_7", return(DB_ERROR););
+	DBUG_EXECUTE_IF("innodb_log_abort_7", return DB_ERROR;);
 	DBUG_PRINT("ib_log", ("After innodb_log_abort_7"));
 
-	for (unsigned i = 0; i < srv_n_log_files; i++) {
-		sprintf(logfilename + dirnamelen,
-			"ib_logfile%u", i ? i : INIT_LOG_FILE0);
+	logfile0 = get_log_file_path(LOG_FILE_NAME_PREFIX)
+			   .append(INIT_LOG_FILE0);
 
-		err = create_log_file(&files[i], logfilename);
+	bool ret;
+	pfs_os_file_t file = os_file_create(
+		innodb_log_file_key, logfile0.c_str(),
+		OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
+		OS_LOG_FILE, srv_read_only_mode, &ret);
 
-		if (err != DB_SUCCESS) {
-			return(err);
-		}
+	if (!ret) {
+		ib::error() << "Cannot create " << logfile0;
+		return DB_ERROR;
 	}
 
-	DBUG_EXECUTE_IF("innodb_log_abort_8", return(DB_ERROR););
-	DBUG_PRINT("ib_log", ("After innodb_log_abort_8"));
-
-	/* We did not create the first log file initially as
-	ib_logfile0, so that crash recovery cannot find it until it
-	has been completed and renamed. */
-	sprintf(logfilename + dirnamelen, "ib_logfile%u", INIT_LOG_FILE0);
+	ib::info() << "Setting log file " << logfile0 << " size to "
+		   << srv_log_file_size << " bytes";
 
-	fil_space_t*	log_space = fil_space_create(
-		"innodb_redo_log", SRV_LOG_SPACE_FIRST_ID, 0, FIL_TYPE_LOG,
-		NULL/* innodb_encrypt_log works at a different level */);
-
-	ut_a(fil_validate());
-	ut_a(log_space != NULL);
+	ret = os_file_set_size(logfile0.c_str(), file, srv_log_file_size);
+	if (!ret) {
+		os_file_close(file);
+		ib::error() << "Cannot set log file " << logfile0
+			    << " size to " << srv_log_file_size << " bytes";
+		return DB_ERROR;
+	}
 
-	const ulint size = ulint(srv_log_file_size >> srv_page_size_shift);
+	ret = os_file_close(file);
+	ut_a(ret);
 
-	logfile0 = log_space->add(logfilename, OS_FILE_CLOSED, size,
-				  false, false)->name;
-	ut_a(logfile0);
+	DBUG_EXECUTE_IF("innodb_log_abort_8", return(DB_ERROR););
+	DBUG_PRINT("ib_log", ("After innodb_log_abort_8"));
 
-	for (unsigned i = 1; i < srv_n_log_files; i++) {
+	/* We did not create the log file initially as LOG_FILE_NAME, so
+	that crash recovery cannot find it until it has been completed and
+	renamed. */
 
-		sprintf(logfilename + dirnamelen, "ib_logfile%u", i);
+	log_sys.log.create();
 
-		log_space->add(logfilename, OS_FILE_CLOSED, size,
-			       false, false);
+	log_sys.log.open_file(logfile0);
+	if (!fil_system.sys_space->open(create_new_db)) {
+		return DB_ERROR;
 	}
 
-	log_sys.log.create(srv_n_log_files);
-
-	fil_open_log_and_system_tablespace_files();
-
 	/* Create a log checkpoint. */
-	log_mutex_enter();
+	mysql_mutex_lock(&log_sys.mutex);
 	if (log_sys.is_encrypted() && !log_crypt_init()) {
 		return DB_ERROR;
 	}
 	ut_d(recv_no_log_write = false);
-	log_sys.lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE);
-
-	log_sys.log.set_lsn(log_sys.lsn);
+	lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE);
+	log_sys.set_lsn(lsn + LOG_BLOCK_HDR_SIZE);
+	log_sys.log.set_lsn(lsn);
 	log_sys.log.set_lsn_offset(LOG_FILE_HDR_SIZE);
 
 	log_sys.buf_next_to_write = 0;
-	log_sys.write_lsn = log_sys.lsn;
+	log_sys.write_lsn = lsn;
 
 	log_sys.next_checkpoint_no = 0;
 	log_sys.last_checkpoint_lsn = 0;
 
 	memset(log_sys.buf, 0, srv_log_buffer_size);
-	log_block_init(log_sys.buf, log_sys.lsn);
+	log_block_init(log_sys.buf, lsn);
 	log_block_set_first_rec_group(log_sys.buf, LOG_BLOCK_HDR_SIZE);
 	memset(log_sys.flush_buf, 0, srv_log_buffer_size);
 
 	log_sys.buf_free = LOG_BLOCK_HDR_SIZE;
-	log_sys.lsn += LOG_BLOCK_HDR_SIZE;
 
-	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
-		    (log_sys.lsn - log_sys.last_checkpoint_lsn));
-	log_mutex_exit();
+	log_sys.log.write_header_durable(lsn);
+
+	ut_ad(srv_startup_is_before_trx_rollback_phase);
+	if (create_new_db) {
+		srv_startup_is_before_trx_rollback_phase = false;
+	}
+
+	/* Enable checkpoints in buf_flush_page_cleaner(). */
+	recv_sys.recovery_on = false;
+	mysql_mutex_unlock(&log_sys.mutex);
 
 	log_make_checkpoint();
+	log_buffer_flush_to_disk();
 
-	return(DB_SUCCESS);
+	return DB_SUCCESS;
 }
 
 /** Rename the first redo log file.
-@param[in,out]	logfilename	buffer for the log file name
-@param[in]	dirnamelen	length of the directory path
 @param[in]	lsn		FIL_PAGE_FILE_FLUSH_LSN value
 @param[in,out]	logfile0	name of the first log file
 @return	error code
 @retval	DB_SUCCESS	on successful operation */
-MY_ATTRIBUTE((warn_unused_result, nonnull))
-static
-dberr_t
-create_log_files_rename(
-/*====================*/
-	char*	logfilename,	/*!< in/out: buffer for log file name */
-	size_t	dirnamelen,	/*!< in: length of the directory path */
-	lsn_t	lsn,		/*!< in: FIL_PAGE_FILE_FLUSH_LSN value */
-	char*	logfile0)	/*!< in/out: name of the first log file */
+MY_ATTRIBUTE((warn_unused_result))
+static dberr_t create_log_file_rename(lsn_t lsn, std::string &logfile0)
 {
-	/* If innodb_flush_method=O_DSYNC,
-	we need to explicitly flush the log buffers. */
-	fil_flush(SRV_LOG_SPACE_FIRST_ID);
-
-	ut_ad(!srv_log_files_created);
-	ut_d(srv_log_files_created = true);
+  ut_ad(!srv_log_file_created);
+  ut_d(srv_log_file_created= true);
 
-	DBUG_EXECUTE_IF("innodb_log_abort_9", return(DB_ERROR););
-	DBUG_PRINT("ib_log", ("After innodb_log_abort_9"));
+  DBUG_EXECUTE_IF("innodb_log_abort_9", return (DB_ERROR););
+  DBUG_PRINT("ib_log", ("After innodb_log_abort_9"));
 
-	/* Close the log files, so that we can rename
-	the first one. */
-	fil_close_log_files(false);
+  /* Rename the first log file, now that a log checkpoint has been created. */
+  auto new_name = get_log_file_path();
 
-	/* Rename the first log file, now that a log
-	checkpoint has been created. */
-	sprintf(logfilename + dirnamelen, "ib_logfile%u", 0);
+  ib::info() << "Renaming log file " << logfile0 << " to " << new_name;
 
-	ib::info() << "Renaming log file " << logfile0 << " to "
-		<< logfilename;
+  mysql_mutex_lock(&log_sys.mutex);
+  ut_ad(logfile0.size() == 2 + new_name.size());
+  logfile0= new_name;
+  dberr_t err= log_sys.log.rename(std::move(new_name));
 
-	log_mutex_enter();
-	ut_ad(strlen(logfile0) == 2 + strlen(logfilename));
-	dberr_t err = os_file_rename(
-		innodb_log_file_key, logfile0, logfilename)
-		? DB_SUCCESS : DB_ERROR;
+  mysql_mutex_unlock(&log_sys.mutex);
 
-	/* Replace the first file with ib_logfile0. */
-	strcpy(logfile0, logfilename);
-	log_mutex_exit();
+  DBUG_EXECUTE_IF("innodb_log_abort_10", err= DB_ERROR;);
 
-	DBUG_EXECUTE_IF("innodb_log_abort_10", err = DB_ERROR;);
+  if (err == DB_SUCCESS)
+    ib::info() << "New log file created, LSN=" << lsn;
 
-	if (err == DB_SUCCESS) {
-		fil_open_log_and_system_tablespace_files();
-		ib::info() << "New log files created, LSN=" << lsn;
-	}
-
-	return(err);
+  return err;
 }
 
-/*********************************************************************//**
-Create undo tablespace.
+/** Create an undo tablespace file
+@param[in] name  file name
 @return DB_SUCCESS or error code */
-static
-dberr_t
-srv_undo_tablespace_create(
-/*=======================*/
-	const char*	name,		/*!< in: tablespace name */
-	ulint		size)		/*!< in: tablespace size in pages */
+static dberr_t srv_undo_tablespace_create(const char* name)
 {
 	pfs_os_file_t	fh;
 	bool		ret;
@@ -595,11 +395,7 @@ srv_undo_tablespace_create(
 		srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE,
 		OS_FILE_NORMAL, OS_DATA_FILE, srv_read_only_mode, &ret);
 
-	if (srv_read_only_mode && ret) {
-
-		ib::info() << name << " opened in read-only mode";
-
-	} else if (ret == FALSE) {
+	if (!ret) {
 		if (os_file_get_last_error(false) != OS_FILE_ALREADY_EXISTS
 #ifdef UNIV_AIX
 			/* AIX 5.1 after security patch ML7 may have
@@ -612,27 +408,24 @@ srv_undo_tablespace_create(
 				<< name;
 		}
 		err = DB_ERROR;
+	} else if (srv_read_only_mode) {
+		ib::info() << name << " opened in read-only mode";
 	} else {
-		ut_a(!srv_read_only_mode);
-
 		/* We created the data file and now write it full of zeros */
 
 		ib::info() << "Data file " << name << " did not exist: new to"
 			" be created";
 
 		ib::info() << "Setting file " << name << " size to "
-			<< (size >> (20 - srv_page_size_shift)) << " MB";
+			<< (SRV_UNDO_TABLESPACE_SIZE_IN_PAGES >> (20 - srv_page_size_shift)) << " MB";
 
 		ib::info() << "Database physically writes the file full: "
 			<< "wait...";
 
-		ret = os_file_set_size(
-			name, fh, os_offset_t(size) << srv_page_size_shift);
-
-		if (!ret) {
-			ib::info() << "Error in creating " << name
-				<< ": probably out of disk space";
-
+		if (!os_file_set_size(name, fh, os_offset_t
+				      {SRV_UNDO_TABLESPACE_SIZE_IN_PAGES}
+				      << srv_page_size_shift)) {
+			ib::error() << "Unable to allocate " << name;
 			err = DB_ERROR;
 		}
 
@@ -642,78 +435,159 @@ srv_undo_tablespace_create(
 	return(err);
 }
 
-/** Open an undo tablespace.
-@param[in]	name		tablespace file name
-@param[in]	space_id	tablespace ID
-@param[in]	create_new_db	whether undo tablespaces are being created
-@return whether the tablespace was opened */
-static bool srv_undo_tablespace_open(const char* name, ulint space_id,
-				     bool create_new_db)
+/** Validate the number of opened undo tablespaces against the
+user-configured innodb_undo_tablespaces.
+@return DB_SUCCESS if it is valid */
+static dberr_t srv_validate_undo_tablespaces()
 {
-	pfs_os_file_t	fh;
-	bool		success;
-	char		undo_name[sizeof "innodb_undo000"];
-
-	snprintf(undo_name, sizeof(undo_name),
-		 "innodb_undo%03u", static_cast<unsigned>(space_id));
-
-	fh = os_file_create(
-		innodb_data_file_key, name, OS_FILE_OPEN
-		| OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT,
-		OS_FILE_AIO, OS_DATA_FILE, srv_read_only_mode, &success);
-	if (!success) {
-		return false;
-	}
-
-	os_offset_t size = os_file_get_size(fh);
-	ut_a(size != os_offset_t(-1));
-
-	/* Load the tablespace into InnoDB's internal data structures. */
-
-	/* We set the biggest space id to the undo tablespace
-	because InnoDB hasn't opened any other tablespace apart
-	from the system tablespace. */
-
-	fil_set_max_space_id_if_bigger(space_id);
-
-	ulint fsp_flags;
-	switch (srv_checksum_algorithm) {
-	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
-	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
-		fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER
-			     | FSP_FLAGS_FCRC32_PAGE_SSIZE());
-		break;
-	default:
-		fsp_flags = FSP_FLAGS_PAGE_SSIZE();
-	}
-
-	fil_space_t* space = fil_space_create(undo_name, space_id, fsp_flags,
-					      FIL_TYPE_TABLESPACE, NULL);
-
-	ut_a(fil_validate());
-	ut_a(space);
-
-	fil_node_t* file = space->add(name, fh, 0, false, true);
-
-	mutex_enter(&fil_system.mutex);
-
-	if (create_new_db) {
-		space->size = file->size = ulint(size >> srv_page_size_shift);
-		space->size_in_header = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
-		space->committed_size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
-	} else {
-		success = file->read_page0(true);
-		if (!success) {
-			os_file_close(file->handle);
-			file->handle = OS_FILE_CLOSED;
-			ut_a(fil_system.n_open > 0);
-			fil_system.n_open--;
-		}
-	}
+  /* Finding more undo tablespaces than innodb_undo_tablespaces asks for
+  is tolerated, because the surplus ones may be reserved for future use.
+  Finding fewer than requested is an error. */
+  if (srv_undo_tablespaces_open < srv_undo_tablespaces)
+  {
+    ib::error() << "Expected to open innodb_undo_tablespaces="
+                << srv_undo_tablespaces << " but was able to find only "
+                << srv_undo_tablespaces_open;
+    return DB_ERROR;
+  }
+
+  if (srv_undo_tablespaces_open == 0)
+    return DB_SUCCESS;
+
+  ib::info() << "Opened " << srv_undo_tablespaces_open
+             << " undo tablespaces";
+
+  /* Dedicated undo tablespaces were opened although the configuration
+  asks for none: tell the user what that setting means. */
+  if (srv_undo_tablespaces == 0)
+    ib::warn() << "innodb_undo_tablespaces=0 disables"
+               " dedicated undo log tablespaces";
+  return DB_SUCCESS;
+}
 
-	mutex_exit(&fil_system.mutex);
+/** Count the distinct non-system tablespaces referenced by the
+rollback segment slots of the TRX_SYS header page.
+@return the number of active undo tablespaces (except system tablespace) */
+static ulint trx_rseg_get_n_undo_tablespaces()
+{
+  std::set<uint32_t> undo_spaces;
+  mtr_t mtr;
+  mtr.start();
+  if (const buf_block_t *header= trx_sysf_get(&mtr, false))
+    for (ulint slot= 0; slot < TRX_SYS_N_RSEGS; slot++)
+      if (trx_sysf_rseg_get_page_no(header, slot) != FIL_NULL)
+        if (const uint32_t id= trx_sysf_rseg_get_space(header, slot))
+          undo_spaces.insert(id);
+  mtr.commit();
+  return undo_spaces.size();
+}
 
-	return success;
+/** Open an undo tablespace.
+@param[in]	create	whether undo tablespaces are being created
+@param[in]	name	tablespace file name
+@param[in]	i	offset from srv_undo_space_id_start (used only if create)
+@return undo tablespace identifier
+@retval 0 on failure (file cannot be opened, or its first page is bad) */
+static ulint srv_undo_tablespace_open(bool create, const char* name, ulint i)
+{
+  bool success;
+  char undo_name[sizeof "innodb_undo000"];
+  ulint space_id= 0;
+  ulint fsp_flags= 0;
+
+  if (create)
+  {
+    space_id= srv_undo_space_id_start + i;
+    snprintf(undo_name, sizeof(undo_name),
+             "innodb_undo%03u", static_cast<unsigned>(space_id));
+    switch (srv_checksum_algorithm) {
+    case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+    case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+      fsp_flags= FSP_FLAGS_FCRC32_MASK_MARKER | FSP_FLAGS_FCRC32_PAGE_SSIZE();
+      break;
+    default:
+      fsp_flags= FSP_FLAGS_PAGE_SSIZE();
+    }
+  }
+
+  pfs_os_file_t fh= os_file_create(innodb_data_file_key, name, OS_FILE_OPEN |
+                                   OS_FILE_ON_ERROR_NO_EXIT |
+                                   OS_FILE_ON_ERROR_SILENT,
+                                   OS_FILE_AIO, OS_DATA_FILE,
+                                   srv_read_only_mode, &success);
+
+  if (!success)
+    return 0;
+
+  os_offset_t size= os_file_get_size(fh);
+  ut_a(size != os_offset_t(-1));
+
+  if (!create)
+  {
+    page_t *page= static_cast<byte*>(aligned_malloc(srv_page_size,
+                                                    srv_page_size));
+    if (os_file_read(IORequestRead, fh, page, 0, srv_page_size) != DB_SUCCESS)
+    {
+err_exit:
+      /* The handle is not attached to any fil_node_t yet: close it here so
+      that it is not leaked, and report the failure as 0, not as a dberr_t. */
+      ib::error() << "Unable to read first page of file " << name;
+      aligned_free(page);
+      os_file_close(fh);
+      return 0;
+    }
+
+    uint32_t id= mach_read_from_4(FIL_PAGE_SPACE_ID + page);
+    if (id == 0 || id >= SRV_SPACE_ID_UPPER_BOUND ||
+        memcmp_aligned<2>(FIL_PAGE_SPACE_ID + page,
+                          FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4))
+    {
+      ib::error() << "Inconsistent tablespace ID in file " << name;
+      goto err_exit;
+    }
+
+    fsp_flags= mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+    if (buf_page_is_corrupted(false, page, fsp_flags))
+    {
+      ib::error() << "Checksum mismatch in the first page of file " << name;
+      goto err_exit;
+    }
+
+    space_id= id;
+    snprintf(undo_name, sizeof undo_name, "innodb_undo%03u", id);
+    aligned_free(page);
+  }
+
+  /* Load the tablespace into InnoDB's internal data structures. */
+
+  /* We set the biggest space id to the undo tablespace
+  because InnoDB hasn't opened any other tablespace apart
+  from the system tablespace. */
+
+  fil_set_max_space_id_if_bigger(space_id);
+
+  fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags,
+                                          FIL_TYPE_TABLESPACE, NULL);
+  ut_a(fil_validate());
+  ut_a(space);
+
+  fil_node_t *file= space->add(name, fh, 0, false, true);
+  mutex_enter(&fil_system.mutex);
+
+  if (create)
+  {
+    space->set_sizes(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
+    space->size= file->size= uint32_t(size >> srv_page_size_shift);
+  }
+  else if (!file->read_page0())
+  {
+    os_file_close(file->handle);
+    file->handle= OS_FILE_CLOSED;
+    ut_a(fil_system.n_open > 0);
+    fil_system.n_open--;
+  }
+
+  mutex_exit(&fil_system.mutex);
+  return space_id;
 }
 
 /** Check if undo tablespaces and redo log files exist before creating a
@@ -759,228 +633,147 @@ srv_check_undo_redo_logs_exists()
 		}
 	}
 
-	/* Check if any redo log files exist */
-	char	logfilename[OS_FILE_MAX_PATH];
-	size_t dirnamelen = strlen(srv_log_group_home_dir);
-	memcpy(logfilename, srv_log_group_home_dir, dirnamelen);
-
-	for (unsigned i = 0; i < srv_n_log_files; i++) {
-		sprintf(logfilename + dirnamelen,
-			"ib_logfile%u", i);
+	/* Check if redo log file exists */
+	auto logfilename = get_log_file_path();
 
-		fh = os_file_create(
-			innodb_log_file_key, logfilename,
-			OS_FILE_OPEN_RETRY
-			| OS_FILE_ON_ERROR_NO_EXIT
-			| OS_FILE_ON_ERROR_SILENT,
-			OS_FILE_NORMAL,
-			OS_LOG_FILE,
-			srv_read_only_mode,
-			&ret);
+	fh = os_file_create(innodb_log_file_key, logfilename.c_str(),
+			    OS_FILE_OPEN_RETRY | OS_FILE_ON_ERROR_NO_EXIT
+				    | OS_FILE_ON_ERROR_SILENT,
+			    OS_FILE_NORMAL, OS_LOG_FILE, srv_read_only_mode,
+			    &ret);
 
-		if (ret) {
-			os_file_close(fh);
-			ib::error() << "redo log file '" << logfilename
-				<< "' exists. Creating system tablespace with"
-				" existing redo log files is not recommended."
-				" Please delete all redo log files before"
-				" creating new system tablespace.";
-			return(DB_ERROR);
-		}
+	if (ret) {
+		os_file_close(fh);
+		ib::error() << "redo log file '" << logfilename
+			    << "' exists. Creating system tablespace with"
+			       " existing redo log file is not recommended."
+			       " Please delete redo log file before"
+			       " creating new system tablespace.";
+		return DB_ERROR;
 	}
 
 	return(DB_SUCCESS);
 }
 
+/** Open the undo tablespaces that are in use, then any unused extra ones.
+@param[in]	create_new_db	whether the database is being initialized
+@param[in]	n_undo		number of undo tablespaces to try to open first
+@return DB_SUCCESS or error code */
+static dberr_t srv_all_undo_tablespaces_open(bool create_new_db, ulint n_undo)
+{
+  /* Open all the undo tablespaces that are currently in use. If we
+  fail to open any of these it is a fatal error. The tablespace ids
+  should be contiguous. It is a fatal error because they are required
+  for recovery and are referenced by the UNDO logs (a.k.a RBS). */
+  ulint prev_id= create_new_db ? srv_undo_space_id_start - 1 : 0;
+
+  for (ulint i= 0; i < n_undo; ++i)
+  {
+    char name[OS_FILE_MAX_PATH];
+    snprintf(name, sizeof name, "%s%cundo%03zu", srv_undo_dir,
+             OS_PATH_SEPARATOR, i + 1);
+    ulint space_id= srv_undo_tablespace_open(create_new_db, name, i);
+    if (!space_id)
+    {
+      if (!create_new_db)
+        break;
+      ib::error() << "Unable to open undo tablespace '" << name << "'.";
+      return DB_ERROR;
+    }
+
+    /* Should be no gaps in undo tablespace ids. */
+    ut_a(!i || prev_id + 1 == space_id);
+    prev_id= space_id;
+
+    /* Remember the id of the first undo tablespace that was opened. */
+    if (0 == srv_undo_tablespaces_open++)
+      srv_undo_space_id_start= space_id;
+  }
+
+  /* Open any extra unused undo tablespaces. These must be contiguous.
+  We stop at the first failure. These are undo tablespaces that are
+  not in use and therefore not required by recovery. We only check
+  that there are no gaps. */
+  for (ulint i= prev_id + 1; i < srv_undo_space_id_start + TRX_SYS_N_RSEGS;
+       ++i)
+  {
+    char name[OS_FILE_MAX_PATH];
+    snprintf(name, sizeof name, "%s%cundo%03zu", srv_undo_dir,
+             OS_PATH_SEPARATOR, i);
+    if (!srv_undo_tablespace_open(create_new_db, name, i))
+      break;
+    ++srv_undo_tablespaces_open;
+  }
+
+  return srv_validate_undo_tablespaces();
+}
+
 /** Open the configured number of dedicated undo tablespaces.
 @param[in]	create_new_db	whether the database is being initialized
 @return DB_SUCCESS or error code */
 dberr_t
 srv_undo_tablespaces_init(bool create_new_db)
 {
-	ulint			i;
-	dberr_t			err = DB_SUCCESS;
-	ulint			prev_space_id = 0;
-	ulint			n_undo_tablespaces;
-	ulint			undo_tablespace_ids[TRX_SYS_N_RSEGS + 1];
-
-	srv_undo_tablespaces_open = 0;
-
-	ut_a(srv_undo_tablespaces <= TRX_SYS_N_RSEGS);
-	ut_a(!create_new_db || srv_operation == SRV_OPERATION_NORMAL);
-
-	if (srv_undo_tablespaces == 1) { /* 1 is not allowed, make it 0 */
-		srv_undo_tablespaces = 0;
-	}
-
-	memset(undo_tablespace_ids, 0x0, sizeof(undo_tablespace_ids));
-
-	/* Create the undo spaces only if we are creating a new
-	instance. We don't allow creating of new undo tablespaces
-	in an existing instance (yet).  This restriction exists because
-	we check in several places for SYSTEM tablespaces to be less than
-	the min of user defined tablespace ids. Once we implement saving
-	the location of the undo tablespaces and their space ids this
-	restriction will/should be lifted. */
-
-	for (i = 0; create_new_db && i < srv_undo_tablespaces; ++i) {
-		char	name[OS_FILE_MAX_PATH];
-		ulint	space_id  = i + 1;
-
-		DBUG_EXECUTE_IF("innodb_undo_upgrade",
-				space_id = i + 3;);
-
-		snprintf(
-			name, sizeof(name),
-			"%s%cundo%03zu",
-			srv_undo_dir, OS_PATH_SEPARATOR, space_id);
-
-		if (i == 0) {
-			srv_undo_space_id_start = space_id;
-			prev_space_id = srv_undo_space_id_start - 1;
-		}
-
-		undo_tablespace_ids[i] = space_id;
-
-		err = srv_undo_tablespace_create(
-			name, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
-
-		if (err != DB_SUCCESS) {
-			ib::error() << "Could not create undo tablespace '"
-				<< name << "'.";
-			return(err);
-		}
-	}
-
-	/* Get the tablespace ids of all the undo segments excluding
-	the system tablespace (0). If we are creating a new instance then
-	we build the undo_tablespace_ids ourselves since they don't
-	already exist. */
-	n_undo_tablespaces = create_new_db
-		|| srv_operation == SRV_OPERATION_BACKUP
-		|| srv_operation == SRV_OPERATION_RESTORE_DELTA
-		? srv_undo_tablespaces
-		: trx_rseg_get_n_undo_tablespaces(undo_tablespace_ids);
-	srv_undo_tablespaces_active = srv_undo_tablespaces;
-
-	switch (srv_operation) {
-	case SRV_OPERATION_RESTORE_DELTA:
-	case SRV_OPERATION_BACKUP:
-		for (i = 0; i < n_undo_tablespaces; i++) {
-			undo_tablespace_ids[i] = i + srv_undo_space_id_start;
-		}
-
-		prev_space_id = srv_undo_space_id_start - 1;
-		break;
-	case SRV_OPERATION_NORMAL:
-	case SRV_OPERATION_RESTORE_ROLLBACK_XA:
-	case SRV_OPERATION_RESTORE:
-	case SRV_OPERATION_RESTORE_EXPORT:
-		break;
-	}
-
-	/* Open all the undo tablespaces that are currently in use. If we
-	fail to open any of these it is a fatal error. The tablespace ids
-	should be contiguous. It is a fatal error because they are required
-	for recovery and are referenced by the UNDO logs (a.k.a RBS). */
-
-	for (i = 0; i < n_undo_tablespaces; ++i) {
-		char	name[OS_FILE_MAX_PATH];
-
-		snprintf(
-			name, sizeof(name),
-			"%s%cundo%03zu",
-			srv_undo_dir, OS_PATH_SEPARATOR,
-			undo_tablespace_ids[i]);
-
-		/* Should be no gaps in undo tablespace ids. */
-		ut_a(!i || prev_space_id + 1 == undo_tablespace_ids[i]);
-
-		/* The system space id should not be in this array. */
-		ut_a(undo_tablespace_ids[i] != 0);
-		ut_a(undo_tablespace_ids[i] != ULINT_UNDEFINED);
-
-		if (!srv_undo_tablespace_open(name, undo_tablespace_ids[i],
-					      create_new_db)) {
-			ib::error() << "Unable to open undo tablespace '"
-				<< name << "'.";
-			return DB_ERROR;
-		}
-
-		prev_space_id = undo_tablespace_ids[i];
-
-		/* Note the first undo tablespace id in case of
-		no active undo tablespace. */
-		if (0 == srv_undo_tablespaces_open++) {
-			srv_undo_space_id_start = undo_tablespace_ids[i];
-		}
-	}
-
-	/* Open any extra unused undo tablespaces. These must be contiguous.
-	We stop at the first failure. These are undo tablespaces that are
-	not in use and therefore not required by recovery. We only check
-	that there are no gaps. */
-
-	for (i = prev_space_id + 1;
-	     i < srv_undo_space_id_start + TRX_SYS_N_RSEGS; ++i) {
-		char	name[OS_FILE_MAX_PATH];
-
-		snprintf(
-			name, sizeof(name),
-			"%s%cundo%03zu", srv_undo_dir, OS_PATH_SEPARATOR, i);
-
-		if (!srv_undo_tablespace_open(name, i, create_new_db)) {
-			err = DB_ERROR;
-			break;
-		}
-
-		++n_undo_tablespaces;
-
-		++srv_undo_tablespaces_open;
-	}
-
-	/* Initialize srv_undo_space_id_start=0 when there are no
-	dedicated undo tablespaces. */
-	if (n_undo_tablespaces == 0) {
-		srv_undo_space_id_start = 0;
-	}
-
-	/* If the user says that there are fewer than what we find we
-	tolerate that discrepancy but not the inverse. Because there could
-	be unused undo tablespaces for future use. */
-
-	if (srv_undo_tablespaces > n_undo_tablespaces) {
-		ib::error() << "Expected to open innodb_undo_tablespaces="
-			<< srv_undo_tablespaces
-			<< " but was able to find only "
-			<< n_undo_tablespaces;
-
-		return(err != DB_SUCCESS ? err : DB_ERROR);
-
-	} else if (n_undo_tablespaces > 0) {
-
-		ib::info() << "Opened " << n_undo_tablespaces
-			<< " undo tablespaces";
-
-		if (srv_undo_tablespaces == 0) {
-			ib::warn() << "innodb_undo_tablespaces=0 disables"
-				" dedicated undo log tablespaces";
-		}
-	}
-
-	if (create_new_db) {
-		mtr_t	mtr;
-
-		for (i = 0; i < n_undo_tablespaces; ++i) {
-			mtr.start();
-			fsp_header_init(fil_space_get(undo_tablespace_ids[i]),
-					SRV_UNDO_TABLESPACE_SIZE_IN_PAGES,
-					&mtr);
-			mtr.commit();
-		}
-	}
-
-	return(DB_SUCCESS);
+  srv_undo_tablespaces_open= 0;
+
+  ut_a(srv_undo_tablespaces <= TRX_SYS_N_RSEGS);
+  ut_a(!create_new_db || srv_operation == SRV_OPERATION_NORMAL);
+  /* 1 is not allowed, make it 0 */
+  if (srv_undo_tablespaces == 1)
+    srv_undo_tablespaces= 0;
+
+  /* Create the undo spaces only if we are creating a new
+  instance. We don't allow creating of new undo tablespaces
+  in an existing instance (yet). */
+  if (create_new_db)
+  {
+    srv_undo_space_id_start= 1;
+    DBUG_EXECUTE_IF("innodb_undo_upgrade", srv_undo_space_id_start= 3;);
+
+    for (ulint i= 0; i < srv_undo_tablespaces; ++i)
+    {
+      char name[OS_FILE_MAX_PATH];
+      snprintf(name, sizeof name, "%s%cundo%03zu",
+               srv_undo_dir, OS_PATH_SEPARATOR, i + 1);
+      if (dberr_t err= srv_undo_tablespace_create(name))
+      {
+        ib::error() << "Could not create undo tablespace '" << name << "'.";
+        return err;
+      }
+    }
+  }
+
+  /* When creating a new instance, or in backup or delta-restore mode, try
+  to open innodb_undo_tablespaces files. Otherwise try up to
+  TRX_SYS_N_RSEGS files; srv_all_undo_tablespaces_open() then stops at the
+  first file that cannot be opened. */
+  srv_undo_tablespaces_active= srv_undo_tablespaces;
+
+  ulint n_undo= (create_new_db || srv_operation == SRV_OPERATION_BACKUP ||
+                 srv_operation == SRV_OPERATION_RESTORE_DELTA)
+    ? srv_undo_tablespaces : TRX_SYS_N_RSEGS;
+
+  if (dberr_t err= srv_all_undo_tablespaces_open(create_new_db, n_undo))
+    return err;
+
+  /* Initialize srv_undo_space_id_start=0 when there are no
+  dedicated undo tablespaces. */
+  if (srv_undo_tablespaces_open == 0)
+    srv_undo_space_id_start= 0;
+
+  if (create_new_db)
+  {
+    mtr_t mtr;
+    for (ulint i= 0; i < srv_undo_tablespaces; ++i)
+    {
+       mtr.start();
+       fsp_header_init(fil_space_get(srv_undo_space_id_start + i),
+                       SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
+       mtr.commit();
+    }
+  }
+
+  return DB_SUCCESS;
 }
 
 /** Create the temporary file tablespace.
@@ -1017,7 +810,7 @@ srv_open_tmp_tablespace(bool create_new_db)
 			    true, create_new_db, &sum_of_new_sizes, NULL))
 		   != DB_SUCCESS) {
 		ib::error() << "Unable to create the shared innodb_temporary";
-	} else if (fil_system.temp_space->open()) {
+	} else if (fil_system.temp_space->open(true)) {
 		/* Initialize the header page */
 		mtr_t mtr;
 		mtr.start();
@@ -1037,114 +830,22 @@ srv_open_tmp_tablespace(bool create_new_db)
 	return(err);
 }
 
-/****************************************************************//**
-Set state to indicate start of particular group of threads in InnoDB. */
-UNIV_INLINE
-void
-srv_start_state_set(
-/*================*/
-	srv_start_state_t state)	/*!< in: indicate current state of
-					thread startup */
-{
-	srv_start_state |= ulint(state);
-}
-
-/****************************************************************//**
-Check if following group of threads is started.
-@return true if started */
-UNIV_INLINE
-bool
-srv_start_state_is_set(
-/*===================*/
-	srv_start_state_t state)	/*!< in: state to check for */
-{
-	return(srv_start_state & ulint(state));
-}
-
-/**
-Shutdown all background threads created by InnoDB. */
-static
-void
-srv_shutdown_all_bg_threads()
+/** Shutdown background threads, except the page cleaner. */
+static void srv_shutdown_threads()
 {
 	ut_ad(!srv_undo_sources);
 	srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
 
-	/* All threads end up waiting for certain events. Put those events
-	to the signaled state. Then the threads will exit themselves after
-	os_event_wait(). */
-	for (uint i = 0; i < 1000; ++i) {
-		/* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
-		HERE OR EARLIER */
-
-		if (srv_start_state_is_set(SRV_START_STATE_LOCK_SYS)) {
-			/* a. Let the lock timeout thread exit */
-			os_event_set(lock_sys.timeout_event);
-		}
-
-		if (!srv_read_only_mode) {
-			/* b. srv error monitor thread exits automatically,
-			no need to do anything here */
-
-			if (srv_start_state_is_set(SRV_START_STATE_MASTER)) {
-				/* c. We wake the master thread so that
-				it exits */
-				srv_wake_master_thread();
-			}
-
-			if (srv_start_state_is_set(SRV_START_STATE_PURGE)) {
-				/* d. Wakeup purge threads. */
-				srv_purge_wakeup();
-			}
-
-			if (srv_n_fil_crypt_threads_started) {
-				os_event_set(fil_crypt_threads_event);
-			}
-
-			if (log_scrub_thread_active) {
-				os_event_set(log_scrub_event);
-			}
-		}
-
-		if (srv_start_state_is_set(SRV_START_STATE_IO)) {
-			ut_ad(!srv_read_only_mode);
+	lock_sys.timeout_timer.reset();
+	srv_master_timer.reset();
 
-			/* e. Exit the i/o threads */
-			if (recv_sys.flush_start != NULL) {
-				os_event_set(recv_sys.flush_start);
-			}
-			if (recv_sys.flush_end != NULL) {
-				os_event_set(recv_sys.flush_end);
-			}
-
-			os_event_set(buf_flush_event);
-		}
-
-		if (!os_thread_count) {
-			return;
-		}
-
-		switch (srv_operation) {
-		case SRV_OPERATION_BACKUP:
-		case SRV_OPERATION_RESTORE_DELTA:
-			break;
-		case SRV_OPERATION_NORMAL:
-		case SRV_OPERATION_RESTORE_ROLLBACK_XA:
-		case SRV_OPERATION_RESTORE:
-		case SRV_OPERATION_RESTORE_EXPORT:
-			if (!buf_page_cleaner_is_active
-			    && os_aio_all_slots_free()) {
-				os_aio_wake_all_threads_at_shutdown();
-			}
-		}
-
-		os_thread_sleep(100000);
+	if (purge_sys.enabled()) {
+		srv_purge_shutdown();
 	}
 
-	ib::warn() << os_thread_count << " threads created by InnoDB"
-		" had not exited at shutdown!";
-	ut_d(os_aio_print_pending_io(stderr));
-	ut_ad(0);
+	if (srv_n_fil_crypt_threads) {
+		fil_crypt_set_thread_cnt(0);
+	}
 }
 
 #ifdef UNIV_DEBUG
@@ -1172,6 +873,8 @@ srv_init_abort_low(
 #endif /* UNIV_DEBUG */
 	dberr_t		err)
 {
+	ut_ad(srv_is_being_started);
+
 	if (create_new_db) {
 		ib::error() << "Database creation was aborted"
 #ifdef UNIV_DEBUG
@@ -1189,108 +892,148 @@ srv_init_abort_low(
 	}
 
 	srv_shutdown_bg_undo_sources();
-	srv_shutdown_all_bg_threads();
+	srv_shutdown_threads();
 	return(err);
 }
 
-/** Prepare to delete the redo log files. Flush the dirty pages from all the
+/** Prepare to delete the redo log file. Flush the dirty pages from all the
 buffer pools.  Flush the redo log buffer to the redo log file.
-@param[in]	n_files		number of old redo log files
+@param[in]	old_exists	old redo log file exists
 @return lsn upto which data pages have been flushed. */
-static
-lsn_t
-srv_prepare_to_delete_redo_log_files(
-	ulint	n_files)
+static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
 {
-	DBUG_ENTER("srv_prepare_to_delete_redo_log_files");
-
-	lsn_t	flushed_lsn;
-	ulint	pending_io = 0;
-	ulint	count = 0;
-
-	if (log_sys.log.subformat != 2) {
-		srv_log_file_size = 0;
-	}
-
-	do {
-		/* Clean the buffer pool. */
-		buf_flush_sync_all_buf_pools();
-
-		DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
-		DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
-
-		log_mutex_enter();
-
-		fil_names_clear(log_sys.lsn, false);
-
-		flushed_lsn = log_sys.lsn;
-
-		{
-			ib::info	info;
-			if (srv_log_file_size == 0
-			    || (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
-			    != log_t::FORMAT_10_4) {
-				info << "Upgrading redo log: ";
-			} else if (n_files != srv_n_log_files
-				   || srv_log_file_size
-				   != srv_log_file_size_requested) {
-				if (srv_encrypt_log
-				    == (my_bool)log_sys.is_encrypted()) {
-					info << (srv_encrypt_log
-						 ? "Resizing encrypted"
-						 : "Resizing");
-				} else if (srv_encrypt_log) {
-					info << "Encrypting and resizing";
-				} else {
-					info << "Removing encryption"
-						" and resizing";
-				}
-
-				info << " redo log from " << n_files
-				     << "*" << srv_log_file_size << " to ";
-			} else if (srv_encrypt_log) {
-				info << "Encrypting redo log: ";
-			} else {
-				info << "Removing redo log encryption: ";
-			}
-
-			info << srv_n_log_files << "*"
-			     << srv_log_file_size_requested
-			     << " bytes; LSN=" << flushed_lsn;
-		}
-
-		srv_start_lsn = flushed_lsn;
-		/* Flush the old log files. */
-		log_mutex_exit();
-
-		log_write_up_to(flushed_lsn, true);
-
-		/* If innodb_flush_method=O_DSYNC,
-		we need to explicitly flush the log buffers. */
-		fil_flush(SRV_LOG_SPACE_FIRST_ID);
-
-		ut_ad(flushed_lsn == log_get_lsn());
-
-		/* Check if the buffer pools are clean.  If not
-		retry till it is clean. */
-		pending_io = buf_pool_check_no_pending_io();
-
-		if (pending_io > 0) {
-			count++;
-			/* Print a message every 60 seconds if we
-			are waiting to clean the buffer pools */
-			if (srv_print_verbose_log && count > 600) {
-				ib::info() << "Waiting for "
-					<< pending_io << " buffer "
-					<< "page I/Os to complete";
-				count = 0;
-			}
-		}
-		os_thread_sleep(100000);
-
-	} while (buf_pool_check_no_pending_io());
+  DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
+
+  /* Disable checkpoints in the page cleaner. */
+  ut_ad(!recv_sys.recovery_on);
+  recv_sys.recovery_on= true;
+
+  /* Clean the buffer pool. */
+  buf_flush_sync();
+  /* A zero srv_log_file_size selects the "Upgrading" message below. */
+  if (log_sys.log.subformat != 2)
+    srv_log_file_size= 0;
+
+  DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
+  DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
+
+  mysql_mutex_lock(&log_sys.mutex);
+  const bool latest_format= (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) ==
+    log_t::FORMAT_10_5;
+  lsn_t flushed_lsn= log_sys.get_lsn();
+
+  if (latest_format)
+  {
+    fil_names_clear(flushed_lsn, false);
+    flushed_lsn= log_sys.get_lsn();
+  }
+  /* Report what is about to be done to the redo log. */
+  {
+    const char *msg;
+    if (!latest_format || srv_log_file_size == 0)
+    {
+      msg= "Upgrading redo log: ";
+same_size:
+      ib::info() << msg << srv_log_file_size_requested << " bytes; LSN="
+                 << flushed_lsn;
+    }
+    else if (old_exists && srv_log_file_size == srv_log_file_size_requested)
+    {
+      msg= srv_encrypt_log
+        ? "Encrypting redo log: " : "Removing redo log encryption: ";
+      goto same_size;
+    }
+    else
+    {
+      if (srv_encrypt_log == (my_bool)log_sys.is_encrypted())
+        msg= srv_encrypt_log ? "Resizing encrypted" : "Resizing";
+      else
+        msg= srv_encrypt_log
+          ? "Encrypting and resizing"
+          : "Removing encryption and resizing";
+
+      ib::info() << msg << " redo log from " << srv_log_file_size << " to "
+                 << srv_log_file_size_requested
+                 << " bytes; LSN=" << flushed_lsn;
+    }
+  }
+
+  mysql_mutex_unlock(&log_sys.mutex);
+  /* Write and flush the redo log up to flushed_lsn if still needed. */
+  if (flushed_lsn != log_sys.get_flushed_lsn())
+  {
+    log_write_up_to(flushed_lsn, false);
+    log_sys.log.flush();
+  }
+
+  ut_ad(flushed_lsn == log_sys.get_lsn());
+  ut_ad(!buf_pool.any_io_pending());
+
+  DBUG_RETURN(flushed_lsn);
+}
 
-	DBUG_RETURN(flushed_lsn);
+/** Tries to locate LOG_FILE_NAME and check its size, etc
+@param[out]	log_file_found	returns true here if correct file was found
+@return	dberr_t with DB_SUCCESS or some error */
+static dberr_t find_and_check_log_file(bool &log_file_found)
+{
+  log_file_found= false;
+
+  auto logfile0= get_log_file_path();
+  os_file_stat_t stat_info;
+  const dberr_t err= os_file_get_status(logfile0.c_str(), &stat_info, false,
+                                        srv_read_only_mode);
+
+  auto is_operation_restore= []() -> bool {
+    return srv_operation == SRV_OPERATION_RESTORE ||
+           srv_operation == SRV_OPERATION_RESTORE_EXPORT;
+  };
+
+  if (err == DB_NOT_FOUND)
+    return is_operation_restore() ? DB_NOT_FOUND : DB_SUCCESS;
+
+  /* On any other failure stat_info may not have been filled in, so it
+  must not be inspected; report the error to the caller instead. */
+  if (err != DB_SUCCESS)
+    return err;
+
+  if (stat_info.type != OS_FILE_TYPE_FILE)
+    return DB_SUCCESS;
+
+  if (!srv_file_check_mode(logfile0.c_str()))
+    return DB_ERROR;
+
+  const os_offset_t size= stat_info.size;
+  ut_a(size != (os_offset_t) -1);
+
+  if (size % OS_FILE_LOG_BLOCK_SIZE)
+  {
+    ib::error() << "Log file " << logfile0 << " size " << size
+                << " is not a multiple of " << OS_FILE_LOG_BLOCK_SIZE
+                << " bytes";
+    return DB_ERROR;
+  }
+
+  if (size == 0 && is_operation_restore())
+  {
+    /* Tolerate an empty LOG_FILE_NAME from a previous run of
+    mariabackup --prepare. */
+    return DB_NOT_FOUND;
+  }
+  /* The first log file must consist of at least the following 512-byte pages:
+  header, checkpoint page 1, empty, checkpoint page 2, redo log page(s).
+
+  Mariabackup --prepare would create an empty LOG_FILE_NAME. Tolerate it. */
+  if (size != 0 && size <= OS_FILE_LOG_BLOCK_SIZE * 4)
+  {
+    ib::error() << "Log file " << logfile0 << " size " << size
+                << " is too small";
+    return DB_ERROR;
+  }
+  srv_log_file_size= size;
+
+  log_file_found= true;
+  return DB_SUCCESS;
 }
 
 /** Start InnoDB.
@@ -1300,16 +1043,12 @@ dberr_t srv_start(bool create_new_db)
 {
 	lsn_t		flushed_lsn;
 	dberr_t		err		= DB_SUCCESS;
-	ulint		srv_n_log_files_found = srv_n_log_files;
+	bool		srv_log_file_found = true;
 	mtr_t		mtr;
-	char		logfilename[10000];
-	char*		logfile0	= NULL;
-	size_t		dirnamelen;
-	unsigned	i = 0;
 
 	ut_ad(srv_operation == SRV_OPERATION_NORMAL
-	      || is_mariabackup_restore_or_export());
-
+	      || srv_operation == SRV_OPERATION_RESTORE
+	      || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
 
 	if (srv_force_recovery) {
 		ib::info() << "!!! innodb_force_recovery is set to "
@@ -1321,11 +1060,10 @@ dberr_t srv_start(bool create_new_db)
 	}
 
 	high_level_read_only = srv_read_only_mode
-		|| srv_force_recovery > SRV_FORCE_NO_TRX_UNDO
+		|| srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE
 		|| srv_sys_space.created_new_raw();
 
-	/* Reset the start state. */
-	srv_start_state = SRV_START_STATE_NONE;
+	srv_started_redo = false;
 
 	compile_time_assert(sizeof(ulint) == sizeof(void*));
 
@@ -1337,19 +1075,6 @@ dberr_t srv_start(bool create_new_db)
 	ib::info() << "!!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!";
 #endif
 
-#ifdef UNIV_LOG_LSN_DEBUG
-	ib::info() << "!!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!";
-#endif /* UNIV_LOG_LSN_DEBUG */
-
-#if defined(COMPILER_HINTS_ENABLED)
-	ib::info() << "Compiler hints enabled.";
-#endif /* defined(COMPILER_HINTS_ENABLED) */
-
-#ifdef _WIN32
-	ib::info() << "Mutexes and rw_locks use Windows interlocked functions";
-#else
-	ib::info() << "Mutexes and rw_locks use GCC atomic builtins";
-#endif
 	ib::info() << MUTEX_TYPE;
 
 	ib::info() << "Compressed tables use zlib " ZLIB_VERSION
@@ -1380,39 +1105,34 @@ dberr_t srv_start(bool create_new_db)
 
 	/* Register performance schema stages before any real work has been
 	started which may need to be instrumented. */
-	mysql_stage_register("innodb", srv_stages, UT_ARR_SIZE(srv_stages));
+	mysql_stage_register("innodb", srv_stages,
+			     static_cast<int>(UT_ARR_SIZE(srv_stages)));
 
 	/* Set the maximum number of threads which can wait for a semaphore
-	inside InnoDB: this is the 'sync wait array' size, as well as the
-	maximum number of threads that can wait in the 'srv_conc array' for
-	their time to enter InnoDB. */
+	inside InnoDB: this is the 'sync wait array' size */
 
 	srv_max_n_threads = 1   /* io_ibuf_thread */
 			    + 1 /* io_log_thread */
-			    + 1 /* lock_wait_timeout_thread */
-			    + 1 /* srv_error_monitor_thread */
-			    + 1 /* srv_monitor_thread */
-			    + 1 /* srv_master_thread */
+			    + 1 /* srv_print_monitor_task */
 			    + 1 /* srv_purge_coordinator_thread */
 			    + 1 /* buf_dump_thread */
 			    + 1 /* dict_stats_thread */
 			    + 1 /* fts_optimize_thread */
-			    + 1 /* recv_writer_thread */
 			    + 1 /* trx_rollback_all_recovered */
 			    + 128 /* added as margin, for use of
 				  InnoDB Memcached etc. */
+			    + 1/* buf_flush_page_cleaner */
 			    + max_connections
 			    + srv_n_read_io_threads
 			    + srv_n_write_io_threads
 			    + srv_n_purge_threads
-			    + srv_n_page_cleaners
 			    /* FTS Parallel Sort */
 			    + fts_sort_pll_degree * FTS_NUM_AUX_INDEX
 			      * max_connections;
 
 	srv_boot();
 
-	ib::info() << ut_crc32_implementation;
+	ib::info() << my_crc32c_implementation();
 
 	if (!srv_read_only_mode) {
 
@@ -1429,9 +1149,12 @@ dberr_t srv_start(bool create_new_db)
 			sprintf(srv_monitor_file_name,
 				"%s/innodb_status." ULINTPF,
 				fil_path_to_mysql_datadir,
-				os_proc_get_number());
+				static_cast<ulint>
+				(IF_WIN(GetCurrentProcessId(), getpid())));
 
-			srv_monitor_file = fopen(srv_monitor_file_name, "w+");
+			srv_monitor_file = my_fopen(srv_monitor_file_name,
+						    O_RDWR|O_TRUNC|O_CREAT,
+						    MYF(MY_WME));
 
 			if (!srv_monitor_file) {
 				ib::error() << "Unable to create "
@@ -1465,9 +1188,7 @@ dberr_t srv_start(bool create_new_db)
 		return(srv_init_abort(err));
 	}
 
-	srv_n_file_io_threads = srv_n_read_io_threads;
-
-	srv_n_file_io_threads += srv_n_write_io_threads;
+	srv_n_file_io_threads = srv_n_read_io_threads + srv_n_write_io_threads;
 
 	if (!srv_read_only_mode) {
 		/* Add the log and ibuf IO threads. */
@@ -1479,46 +1200,25 @@ dberr_t srv_start(bool create_new_db)
 
 	ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
 
-	if (!os_aio_init(srv_n_read_io_threads,
-			 srv_n_write_io_threads,
-			 SRV_MAX_N_PENDING_SYNC_IOS)) {
-
+	if (os_aio_init()) {
 		ib::error() << "Cannot initialize AIO sub-system";
 
 		return(srv_init_abort(DB_ERROR));
 	}
 
-	fil_system.create(srv_file_per_table ? 50000 : 5000);
-
-	double	size;
-	char	unit;
-
-	if (srv_buf_pool_size >= 1024 * 1024 * 1024) {
-		size = ((double) srv_buf_pool_size) / (1024 * 1024 * 1024);
-		unit = 'G';
-	} else {
-		size = ((double) srv_buf_pool_size) / (1024 * 1024);
-		unit = 'M';
+#ifdef LINUX_NATIVE_AIO
+	if (srv_use_native_aio) {
+		ib::info() << "Using Linux native AIO";
 	}
+#endif
 
-	double	chunk_size;
-	char	chunk_unit;
-
-	if (srv_buf_pool_chunk_unit >= 1024 * 1024 * 1024) {
-		chunk_size = srv_buf_pool_chunk_unit / 1024.0 / 1024 / 1024;
-		chunk_unit = 'G';
-	} else {
-		chunk_size = srv_buf_pool_chunk_unit / 1024.0 / 1024;
-		chunk_unit = 'M';
-	}
+	fil_system.create(srv_file_per_table ? 50000 : 5000);
 
 	ib::info() << "Initializing buffer pool, total size = "
-		<< size << unit << ", instances = " << srv_buf_pool_instances
-		<< ", chunk size = " << chunk_size << chunk_unit;
-
-	err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances);
+		<< srv_buf_pool_size
+		<< ", chunk size = " << srv_buf_pool_chunk_unit;
 
-	if (err != DB_SUCCESS) {
+	if (buf_pool.create()) {
 		ib::error() << "Cannot allocate memory for the buffer pool";
 
 		return(srv_init_abort(DB_ERROR));
@@ -1543,38 +1243,13 @@ dberr_t srv_start(bool create_new_db)
 	recv_sys.create();
 	lock_sys.create(srv_lock_table_size);
 
-	/* Create i/o-handler threads: */
-
-	for (ulint t = 0; t < srv_n_file_io_threads; ++t) {
-
-		n[t] = t;
-
-		thread_handles[t] = os_thread_create(io_handler_thread, n + t, thread_ids + t);
-		thread_started[t] = true;
-	}
+	srv_startup_is_before_trx_rollback_phase = true;
 
 	if (!srv_read_only_mode) {
 		buf_flush_page_cleaner_init();
-
-		buf_page_cleaner_is_active = true;
-		os_thread_create(buf_flush_page_cleaner_coordinator,
-				 NULL, NULL);
-
-		/* Create page cleaner workers if needed. For example
-		mariabackup could set srv_n_page_cleaners = 0. */
-		if (srv_n_page_cleaners > 1) {
-			buf_flush_set_page_cleaner_thread_cnt(srv_n_page_cleaners);
-		}
-
-#ifdef UNIV_LINUX
-		/* Wait for the setpriority() call to finish. */
-		os_event_wait(recv_sys.flush_end);
-#endif /* UNIV_LINUX */
-		srv_start_state_set(SRV_START_STATE_IO);
+		ut_ad(buf_page_cleaner_is_active);
 	}
 
-	srv_startup_is_before_trx_rollback_phase = !create_new_db;
-
 	/* Check if undo tablespaces and redo log files exist before creating
 	a new system tablespace */
 	if (create_new_db) {
@@ -1611,29 +1286,19 @@ dberr_t srv_start(bool create_new_db)
 		return(srv_init_abort(err));
 	}
 
-	dirnamelen = strlen(srv_log_group_home_dir);
-	ut_a(dirnamelen < (sizeof logfilename) - 10 - sizeof "ib_logfile");
-	memcpy(logfilename, srv_log_group_home_dir, dirnamelen);
-
-	/* Add a path separator if needed. */
-	if (dirnamelen && logfilename[dirnamelen - 1] != OS_PATH_SEPARATOR) {
-		logfilename[dirnamelen++] = OS_PATH_SEPARATOR;
-	}
-
 	srv_log_file_size_requested = srv_log_file_size;
 
 	if (innodb_encrypt_temporary_tables && !log_crypt_init()) {
 		return srv_init_abort(DB_ERROR);
 	}
 
+	std::string logfile0;
+	bool create_new_log = create_new_db;
 	if (create_new_db) {
+		flushed_lsn = log_sys.get_lsn();
+		log_sys.set_flushed_lsn(flushed_lsn);
 
-		buf_flush_sync_all_buf_pools();
-
-		flushed_lsn = log_get_lsn();
-
-		err = create_log_files(
-			logfilename, dirnamelen, flushed_lsn, logfile0);
+		err = create_log_file(true, flushed_lsn, logfile0);
 
 		if (err != DB_SUCCESS) {
 			for (Tablespace::const_iterator
@@ -1647,99 +1312,32 @@ dberr_t srv_start(bool create_new_db)
 	} else {
 		srv_log_file_size = 0;
 
-		for (i = 0; i < SRV_N_LOG_FILES_MAX; i++) {
-			os_file_stat_t	stat_info;
-
-			sprintf(logfilename + dirnamelen,
-				"ib_logfile%u", i);
-
-			err = os_file_get_status(
-				logfilename, &stat_info, false,
-				srv_read_only_mode);
-
+		bool log_file_found;
+		if (dberr_t err = find_and_check_log_file(log_file_found)) {
 			if (err == DB_NOT_FOUND) {
-				if (i == 0
-				    && is_mariabackup_restore_or_export())
-					return (DB_SUCCESS);
-
-				/* opened all files */
-				break;
-			}
-
-			if (stat_info.type != OS_FILE_TYPE_FILE) {
-				break;
-			}
-
-			if (!srv_file_check_mode(logfilename)) {
-				return(srv_init_abort(DB_ERROR));
-			}
-
-			const os_offset_t size = stat_info.size;
-			ut_a(size != (os_offset_t) -1);
-
-			if (size & (OS_FILE_LOG_BLOCK_SIZE - 1)) {
-
-				ib::error() << "Log file " << logfilename
-					<< " size " << size << " is not a"
-					" multiple of 512 bytes";
-				return(srv_init_abort(DB_ERROR));
-			}
-
-			if (i == 0) {
-				if (size == 0
-				    && is_mariabackup_restore_or_export()) {
-					/* Tolerate an empty ib_logfile0
-					from a previous run of
-					mariabackup --prepare. */
-					return(DB_SUCCESS);
-				}
-				/* The first log file must consist of
-				at least the following 512-byte pages:
-				header, checkpoint page 1, empty,
-				checkpoint page 2, redo log page(s).
-
-				Mariabackup --prepare would create an
-				empty ib_logfile0. Tolerate it if there
-				are no other ib_logfile* files. */
-				if ((size != 0 || i != 0)
-				    && size <= OS_FILE_LOG_BLOCK_SIZE * 4) {
-					ib::error() << "Log file "
-						<< logfilename << " size "
-						<< size << " is too small";
-					return(srv_init_abort(DB_ERROR));
-				}
-				srv_log_file_size = size;
-			} else if (size != srv_log_file_size) {
-
-				ib::error() << "Log file " << logfilename
-					<< " is of different size " << size
-					<< " bytes than other log files "
-					<< srv_log_file_size << " bytes!";
-				return(srv_init_abort(DB_ERROR));
+				return DB_SUCCESS;
 			}
+			return srv_init_abort(err);
 		}
 
-		if (srv_log_file_size == 0) {
+		create_new_log = srv_log_file_size == 0;
+		if (create_new_log) {
 			if (flushed_lsn < lsn_t(1000)) {
 				ib::error()
-					<< "Cannot create log files because"
+					<< "Cannot create log file because"
 					" data files are corrupt or the"
 					" database was not shut down cleanly"
 					" after creating the data files.";
 				return srv_init_abort(DB_ERROR);
 			}
 
-			strcpy(logfilename + dirnamelen, "ib_logfile0");
 			srv_log_file_size = srv_log_file_size_requested;
 
-			err = create_log_files(
-				logfilename, dirnamelen,
-				flushed_lsn, logfile0);
+			err = create_log_file(false, flushed_lsn, logfile0);
 
 			if (err == DB_SUCCESS) {
-				err = create_log_files_rename(
-					logfilename, dirnamelen,
-					flushed_lsn, logfile0);
+				err = create_log_file_rename(flushed_lsn,
+							     logfile0);
 			}
 
 			if (err != DB_SUCCESS) {
@@ -1748,54 +1346,32 @@ dberr_t srv_start(bool create_new_db)
 
 			/* Suppress the message about
 			crash recovery. */
-			flushed_lsn = log_get_lsn();
-			goto files_checked;
+			flushed_lsn = log_sys.get_lsn();
+			goto file_checked;
 		}
 
-		srv_n_log_files_found = i;
-
-		/* Create the in-memory file space objects. */
-
-		sprintf(logfilename + dirnamelen, "ib_logfile%u", 0);
-
-		/* Disable the doublewrite buffer for log files. */
-		fil_space_t*	log_space = fil_space_create(
-			"innodb_redo_log",
-			SRV_LOG_SPACE_FIRST_ID, 0,
-			FIL_TYPE_LOG,
-			NULL /* no encryption yet */);
+		srv_log_file_found = log_file_found;
 
-		ut_a(fil_validate());
-		ut_a(log_space);
+		log_sys.log.open_file(get_log_file_path());
 
-		ut_a(srv_log_file_size <= log_group_max_size);
-
-		const ulint size = 1 + ulint((srv_log_file_size - 1)
-					     >> srv_page_size_shift);
-
-		for (unsigned j = 0; j < srv_n_log_files_found; j++) {
-			sprintf(logfilename + dirnamelen, "ib_logfile%u", j);
-
-			log_space->add(logfilename, OS_FILE_CLOSED, size,
-				       false, false);
-		}
-
-		log_sys.log.create(srv_n_log_files_found);
+		log_sys.log.create();
 
 		if (!log_set_capacity(srv_log_file_size_requested)) {
 			return(srv_init_abort(DB_ERROR));
 		}
-	}
 
-files_checked:
-	/* Open all log files and data files in the system
-	tablespace: we keep them open until database
-	shutdown */
+		/* Enable checkpoints in the page cleaner. */
+		recv_sys.recovery_on = false;
+	}
 
-	fil_open_log_and_system_tablespace_files();
+file_checked:
+	/* Open log file and data files in the system tablespace: we keep
+	them open until database shutdown */
 	ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug);
 
-	err = srv_undo_tablespaces_init(create_new_db);
+	err = fil_system.sys_space->open(create_new_db)
+		? srv_undo_tablespaces_init(create_new_db)
+		: DB_ERROR;
 
 	/* If the force recovery is set very high then we carry on regardless
 	of all errors. Basically this is fingers crossed mode. */
@@ -1809,23 +1385,24 @@ files_checked:
 	/* Initialize objects used by dict stats gathering thread, which
 	can also be used by recovery if it tries to drop some table */
 	if (!srv_read_only_mode) {
-		dict_stats_thread_init();
+		dict_stats_init();
 	}
 
 	trx_sys.create();
 
 	if (create_new_db) {
-		ut_a(!srv_read_only_mode);
+		ut_ad(!srv_read_only_mode);
 
 		mtr_start(&mtr);
 		ut_ad(fil_system.sys_space->id == 0);
 		compile_time_assert(TRX_SYS_SPACE == 0);
 		compile_time_assert(IBUF_SPACE_ID == 0);
-		fsp_header_init(fil_system.sys_space, sum_of_new_sizes, &mtr);
+		fsp_header_init(fil_system.sys_space,
+				uint32_t(sum_of_new_sizes), &mtr);
 
 		ulint ibuf_root = btr_create(
 			DICT_CLUSTERED | DICT_IBUF, fil_system.sys_space,
-			DICT_IBUF_ID_MIN, dict_ind_redundant, &mtr);
+			DICT_IBUF_ID_MIN, nullptr, &mtr);
 
 		mtr_commit(&mtr);
 
@@ -1852,34 +1429,32 @@ files_checked:
 			return(srv_init_abort(err));
 		}
 
-		buf_flush_sync_all_buf_pools();
+		buf_flush_sync();
 
-		flushed_lsn = log_get_lsn();
+		flushed_lsn = log_sys.get_lsn();
 
 		err = fil_write_flushed_lsn(flushed_lsn);
 
 		if (err == DB_SUCCESS) {
-			err = create_log_files_rename(
-				logfilename, dirnamelen,
-				flushed_lsn, logfile0);
+			err = create_log_file_rename(flushed_lsn, logfile0);
 		}
 
 		if (err != DB_SUCCESS) {
 			return(srv_init_abort(err));
 		}
 	} else {
-		/* Work around the bug that we were performing a dirty read of
-		at least the TRX_SYS page into the buffer pool above, without
-		reading or applying any redo logs.
-
-		MDEV-19229 FIXME: Remove the dirty reads and this call.
-		Add an assertion that the buffer pool is empty. */
-		buf_pool_invalidate();
+		/* Suppress warnings in fil_space_t::create() for files
+		that are being read before dict_boot() has recovered
+		DICT_HDR_MAX_SPACE_ID. */
+		fil_system.space_id_reuse_warned = true;
 
 		/* We always try to do a recovery, even if the database had
 		been shut down normally: this is the normal startup path */
 
-		err = recv_recovery_from_checkpoint_start(flushed_lsn);
+		err = create_new_log
+			? DB_SUCCESS
+			: recv_recovery_from_checkpoint_start(flushed_lsn);
+		recv_sys.close_files();
 
 		recv_sys.dblwr.pages.clear();
 
@@ -1889,7 +1464,6 @@ files_checked:
 
 		switch (srv_operation) {
 		case SRV_OPERATION_NORMAL:
-		case SRV_OPERATION_RESTORE_ROLLBACK_XA:
 		case SRV_OPERATION_RESTORE_EXPORT:
 			/* Initialize the change buffer. */
 			err = dict_boot();
@@ -1898,8 +1472,13 @@ files_checked:
 			}
 			/* fall through */
 		case SRV_OPERATION_RESTORE:
-			/* This must precede
-			recv_apply_hashed_log_recs(true). */
+			/* This must precede recv_sys.apply(true). */
+			srv_undo_tablespaces_active
+				= trx_rseg_get_n_undo_tablespaces();
+			err = srv_validate_undo_tablespaces();
+			if (err != DB_SUCCESS) {
+				return srv_init_abort(err);
+			}
 			err = trx_lists_init_at_db_start();
 			if (err != DB_SUCCESS) {
 				return srv_init_abort(err);
@@ -1907,7 +1486,7 @@ files_checked:
 			break;
 		case SRV_OPERATION_RESTORE_DELTA:
 		case SRV_OPERATION_BACKUP:
-			ut_ad(!"wrong mariabackup mode");
+			ut_ad("wrong mariabackup mode" == 0);
 		}
 
 		if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
@@ -1915,7 +1494,7 @@ files_checked:
 			respective file pages, for the last batch of
 			recv_group_scan_log_recs(). */
 
-			recv_apply_hashed_log_recs(true);
+			recv_sys.apply(true);
 
 			if (recv_sys.found_corrupt_log
 			    || recv_sys.found_corrupt_fs) {
@@ -1929,6 +1508,8 @@ files_checked:
 			}
 		}
 
+		fil_system.space_id_reuse_warned = false;
+
 		if (!srv_read_only_mode) {
 			const ulint flags = FSP_FLAGS_PAGE_SSIZE();
 			for (ulint id = 0; id <= srv_undo_tablespaces; id++) {
@@ -1951,10 +1532,11 @@ files_checked:
 				ut_ad(size == fil_system.sys_space
 				      ->size_in_header);
 				size += sum_of_new_sizes;
-				mlog_write_ulint(FSP_HEADER_OFFSET + FSP_SIZE
-						 + block->frame, size,
-						 MLOG_4BYTES, &mtr);
-				fil_system.sys_space->size_in_header = size;
+				mtr.write<4>(*block,
+					     FSP_HEADER_OFFSET + FSP_SIZE
+					     + block->frame, size);
+				fil_system.sys_space->size_in_header
+					= uint32_t(size);
 				mtr.commit();
 				/* Immediately write the log record about
 				increased tablespace size to disk, so that it
@@ -2013,35 +1595,29 @@ files_checked:
 			}
 		}
 
-		/* recv_recovery_from_checkpoint_finish needs trx lists which
-		are initialized in trx_lists_init_at_db_start(). */
-
-		recv_recovery_from_checkpoint_finish();
+		recv_sys.debug_free();
 
-		if (is_mariabackup_restore_or_export()) {
+		if (srv_operation == SRV_OPERATION_RESTORE
+		    || srv_operation == SRV_OPERATION_RESTORE_EXPORT) {
 			/* After applying the redo log from
 			SRV_OPERATION_BACKUP, flush the changes
 			to the data files and truncate or delete the log.
 			Unless --export is specified, no further change to
 			InnoDB files is needed. */
 			ut_ad(srv_force_recovery <= SRV_FORCE_IGNORE_CORRUPT);
-			ut_ad(srv_n_log_files_found <= 1);
 			ut_ad(recv_no_log_write);
-			buf_flush_sync_all_buf_pools();
-			err = fil_write_flushed_lsn(log_get_lsn());
-			ut_ad(!buf_pool_check_no_pending_io());
-			fil_close_log_files(true);
+			err = fil_write_flushed_lsn(log_sys.get_lsn());
+			DBUG_ASSERT(!buf_pool.any_io_pending());
+			log_sys.log.close_file();
 			if (err == DB_SUCCESS) {
-				bool trunc = is_mariabackup_restore();
-				/* Delete subsequent log files. */
-				delete_log_files(logfilename, dirnamelen,
-						 (uint)srv_n_log_files_found, trunc);
-				if (trunc) {
+				bool trunc = srv_operation
+					== SRV_OPERATION_RESTORE;
+				if (!trunc) {
+					delete_log_file("0");
+				} else {
+					auto logfile0 = get_log_file_path();
 					/* Truncate the first log file. */
-					strcpy(logfilename + dirnamelen,
-					       "ib_logfile0");
-					FILE* f = fopen(logfilename, "w");
-					fclose(f);
+					fclose(fopen(logfile0.c_str(), "w"));
 				}
 			}
 			return(err);
@@ -2049,32 +1625,33 @@ files_checked:
 
 		/* Upgrade or resize or rebuild the redo logs before
 		generating any dirty pages, so that the old redo log
-		files will not be written to. */
+		file will not be written to. */
 
 		if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
 			/* Completely ignore the redo log. */
 		} else if (srv_read_only_mode) {
 			/* Leave the redo log alone. */
 		} else if (srv_log_file_size_requested == srv_log_file_size
-			   && srv_n_log_files_found == srv_n_log_files
+			   && srv_log_file_found
 			   && log_sys.log.format
 			   == (srv_encrypt_log
-			       ? log_t::FORMAT_ENC_10_4
-			       : log_t::FORMAT_10_4)
+			       ? log_t::FORMAT_ENC_10_5
+			       : log_t::FORMAT_10_5)
 			   && log_sys.log.subformat == 2) {
 			/* No need to add or remove encryption,
 			upgrade, downgrade, or resize. */
 		} else {
-			/* Prepare to delete the old redo log files */
-			flushed_lsn = srv_prepare_to_delete_redo_log_files(i);
+			/* Prepare to delete the old redo log file */
+			flushed_lsn = srv_prepare_to_delete_redo_log_file(
+				srv_log_file_found);
 
 			DBUG_EXECUTE_IF("innodb_log_abort_1",
 					return(srv_init_abort(DB_ERROR)););
 			/* Prohibit redo log writes from any other
 			threads until creating a log checkpoint at the
-			end of create_log_files(). */
+			end of create_log_file(). */
 			ut_d(recv_no_log_write = true);
-			ut_ad(!buf_pool_check_no_pending_io());
+			DBUG_ASSERT(!buf_pool.any_io_pending());
 
 			DBUG_EXECUTE_IF("innodb_log_abort_3",
 					return(srv_init_abort(DB_ERROR)););
@@ -2090,33 +1667,60 @@ files_checked:
 				return(srv_init_abort(err));
 			}
 
-			/* Close and free the redo log files, so that
-			we can replace them. */
-			fil_close_log_files(true);
+			/* Close the redo log file, so that we can replace it */
+			log_sys.log.close_file();
 
 			DBUG_EXECUTE_IF("innodb_log_abort_5",
 					return(srv_init_abort(DB_ERROR)););
 			DBUG_PRINT("ib_log", ("After innodb_log_abort_5"));
 
-			ib::info() << "Starting to delete and rewrite log"
-				" files.";
+			ib::info()
+				<< "Starting to delete and rewrite log file.";
 
 			srv_log_file_size = srv_log_file_size_requested;
 
-			err = create_log_files(
-				logfilename, dirnamelen, flushed_lsn,
-				logfile0);
+			err = create_log_file(false, flushed_lsn, logfile0);
 
 			if (err == DB_SUCCESS) {
-				err = create_log_files_rename(
-					logfilename, dirnamelen, flushed_lsn,
-					logfile0);
+				err = create_log_file_rename(flushed_lsn,
+							     logfile0);
 			}
 
 			if (err != DB_SUCCESS) {
 				return(srv_init_abort(err));
 			}
 		}
+	}
+
+	ut_ad(err == DB_SUCCESS);
+	ut_a(sum_of_new_sizes != ULINT_UNDEFINED);
+
+	/* Create the doublewrite buffer to a new tablespace */
+	if (!srv_read_only_mode && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+	    && !buf_dblwr.create()) {
+		return(srv_init_abort(DB_ERROR));
+	}
+
+	/* Here the double write buffer has already been created and so
+	any new rollback segments will be allocated after the double
+	write buffer. The default segment should already exist.
+	We create the new segments only if it's a new database or
+	the database was shutdown cleanly. */
+
+	/* Note: When creating the extra rollback segments during an upgrade
+	we violate the latching order, even if the change buffer is empty.
+	We make an exception in sync0sync.cc and check srv_is_being_started
+	for that violation. It cannot create a deadlock because we are still
+	running in single threaded mode essentially. Only the IO threads
+	should be running at this stage. */
+
+	if (!trx_sys_create_rsegs()) {
+		return(srv_init_abort(DB_ERROR));
+	}
+
+	if (!create_new_db) {
+		ut_ad(high_level_read_only
+		      || srv_force_recovery <= SRV_FORCE_NO_IBUF_MERGE);
 
 		/* Validate a few system page types that were left
 		uninitialized before MySQL or MariaDB 5.5. */
@@ -2151,24 +1755,29 @@ files_checked:
 			mtr.commit();
 		}
 
-		/* Roll back any recovered data dictionary transactions, so
-		that the data dictionary tables will be free of any locks.
-		The data dictionary latch should guarantee that there is at
-		most one data dictionary transaction active at a time. */
-		if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+		/* Roll back any recovered data dictionary
+		transactions, so that the data dictionary tables will
+		be free of any locks.  The data dictionary latch
+		should guarantee that there is at most one data
+		dictionary transaction active at a time. */
+		if (!high_level_read_only
+		    && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
 			/* If the following call is ever removed, the
 			first-time ha_innobase::open() must hold (or
 			acquire and release) a table lock that
 			conflicts with trx_resurrect_table_locks(), to
-			ensure that any recovered incomplete ALTER TABLE
-			will have been rolled back. Otherwise,
-			dict_table_t::instant could be cleared by rollback
-			invoking dict_index_t::clear_instant_alter() while
-			open table handles exist in client connections. */
+			ensure that any recovered incomplete ALTER
+			TABLE will have been rolled back. Otherwise,
+			dict_table_t::instant could be cleared by
+			rollback invoking
+			dict_index_t::clear_instant_alter() while open
+			table handles exist in client connections. */
 			trx_rollback_recovered(false);
 		}
 
-		if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
+		/* FIXME: Skip the following if srv_read_only_mode,
+		while avoiding "Allocated tablespace ID" warnings. */
+		if (srv_force_recovery <= SRV_FORCE_NO_IBUF_MERGE) {
 			/* Open or Create SYS_TABLESPACES and SYS_DATAFILES
 			so that tablespace names and other metadata can be
 			found. */
@@ -2194,70 +1803,38 @@ files_checked:
 			dict_check_tablespaces_and_store_max_id();
 		}
 
-		if (err != DB_SUCCESS) {
-			return(srv_init_abort(err));
+		if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+		    && !srv_read_only_mode) {
+			/* Drop partially created indexes. */
+			row_merge_drop_temp_indexes();
+			/* Drop garbage tables. */
+			row_mysql_drop_garbage_tables();
+
+			/* Drop any auxiliary tables that were not
+			dropped when the parent table was
+			dropped. This can happen if the parent table
+			was dropped but the server crashed before the
+			auxiliary tables were dropped. */
+			fts_drop_orphaned_tables();
+
+			/* Rollback incomplete non-DDL transactions */
+			trx_rollback_is_active = true;
+			os_thread_create(trx_rollback_all_recovered);
 		}
-
-		recv_recovery_rollback_active();
-		srv_startup_is_before_trx_rollback_phase = FALSE;
-	}
-
-	ut_ad(err == DB_SUCCESS);
-	ut_a(sum_of_new_sizes != ULINT_UNDEFINED);
-
-	/* Create the doublewrite buffer to a new tablespace */
-	if (!srv_read_only_mode && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
-	    && !buf_dblwr_create()) {
-		return(srv_init_abort(DB_ERROR));
-	}
-
-	/* Here the double write buffer has already been created and so
-	any new rollback segments will be allocated after the double
-	write buffer. The default segment should already exist.
-	We create the new segments only if it's a new database or
-	the database was shutdown cleanly. */
-
-	/* Note: When creating the extra rollback segments during an upgrade
-	we violate the latching order, even if the change buffer is empty.
-	We make an exception in sync0sync.cc and check srv_is_being_started
-	for that violation. It cannot create a deadlock because we are still
-	running in single threaded mode essentially. Only the IO threads
-	should be running at this stage. */
-
-	ut_a(srv_undo_logs > 0);
-	ut_a(srv_undo_logs <= TRX_SYS_N_RSEGS);
-
-	if (!trx_sys_create_rsegs()) {
-		return(srv_init_abort(DB_ERROR));
 	}
 
 	srv_startup_is_before_trx_rollback_phase = false;
 
 	if (!srv_read_only_mode) {
-		/* Create the thread which watches the timeouts
+		/* Create the timer task which watches the timeouts
 		for lock waits */
-		thread_handles[2 + SRV_MAX_N_IO_THREADS] = os_thread_create(
-			lock_wait_timeout_thread,
-			NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS);
-		thread_started[2 + SRV_MAX_N_IO_THREADS] = true;
-		lock_sys.timeout_thread_active = true;
+		lock_sys.timeout_timer.reset(srv_thread_pool->create_timer(
+			lock_wait_timeout_task));
 
 		DBUG_EXECUTE_IF("innodb_skip_monitors", goto skip_monitors;);
-		/* Create the thread which warns of long semaphore waits */
-		srv_error_monitor_active = true;
-		thread_handles[3 + SRV_MAX_N_IO_THREADS] = os_thread_create(
-			srv_error_monitor_thread,
-			NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS);
-		thread_started[3 + SRV_MAX_N_IO_THREADS] = true;
-
-		/* Create the thread which prints InnoDB monitor info */
-		srv_monitor_active = true;
-		thread_handles[4 + SRV_MAX_N_IO_THREADS] = os_thread_create(
-			srv_monitor_thread,
-			NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS);
-		thread_started[4 + SRV_MAX_N_IO_THREADS] = true;
-		srv_start_state |= SRV_START_STATE_LOCK_SYS
-			| SRV_START_STATE_MONITOR;
+		/* Create the task which warns of long semaphore waits */
+		srv_start_periodic_timer(srv_monitor_timer, srv_monitor_task,
+					 SRV_MONITOR_INTERVAL);
 
 #ifndef DBUG_OFF
 skip_monitors:
@@ -2267,11 +1844,8 @@ skip_monitors:
 
 		if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
 			srv_undo_sources = true;
-			/* Create the dict stats gathering thread */
-			srv_dict_stats_thread_active = true;
-			dict_stats_thread_handle = os_thread_create(
-				dict_stats_thread, NULL, NULL);
-
+			/* Create the dict stats gathering task */
+			dict_stats_start();
 			/* Create the thread that will optimize the
 			FULLTEXT search index subsystem. */
 			fts_optimize_init();
@@ -2311,67 +1885,30 @@ skip_monitors:
 		trx_temp_rseg_create();
 
 		if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
-			thread_handles[1 + SRV_MAX_N_IO_THREADS]
-				= os_thread_create(srv_master_thread, NULL,
-						   (1 + SRV_MAX_N_IO_THREADS)
-						   + thread_ids);
-			thread_started[1 + SRV_MAX_N_IO_THREADS] = true;
-			srv_start_state_set(SRV_START_STATE_MASTER);
+			srv_start_periodic_timer(srv_master_timer, srv_master_callback, 1000);
 		}
 	}
 
-	if (!srv_read_only_mode
-	    && (srv_operation == SRV_OPERATION_NORMAL
-		|| srv_operation == SRV_OPERATION_RESTORE_ROLLBACK_XA)
+	if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL
 	    && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
-
-		thread_handles[5 + SRV_MAX_N_IO_THREADS] = os_thread_create(
-			srv_purge_coordinator_thread,
-			NULL, thread_ids + 5 + SRV_MAX_N_IO_THREADS);
-
-		thread_started[5 + SRV_MAX_N_IO_THREADS] = true;
-
-		ut_a(UT_ARR_SIZE(thread_ids)
-		     > 5 + srv_n_purge_threads + SRV_MAX_N_IO_THREADS);
-
-		/* We've already created the purge coordinator thread above. */
-		for (i = 1; i < srv_n_purge_threads; ++i) {
-			thread_handles[5 + i + SRV_MAX_N_IO_THREADS] = os_thread_create(
-				srv_worker_thread, NULL,
-				thread_ids + 5 + i + SRV_MAX_N_IO_THREADS);
-			thread_started[5 + i + SRV_MAX_N_IO_THREADS] = true;
-		}
-
-		while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
-		       && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
-		       && !purge_sys.enabled()) {
-			ib::info() << "Waiting for purge to start";
-			os_thread_sleep(50000);
-		}
-
-		srv_start_state_set(SRV_START_STATE_PURGE);
+		srv_init_purge_tasks();
+		purge_sys.coordinator_startup();
+		srv_wake_purge_thread_if_not_active();
 	}
 
 	srv_is_being_started = false;
 
-	if (!srv_read_only_mode) {
-		/* wake main loop of page cleaner up */
-		os_event_set(buf_flush_event);
-	}
-
 	if (srv_print_verbose_log) {
 		ib::info() << INNODB_VERSION_STR
 			   << " started; log sequence number "
-			   << srv_start_lsn
+			   << recv_sys.recovered_lsn
 			   << "; transaction id " << trx_sys.get_max_trx_id();
 	}
 
 	if (srv_force_recovery == 0) {
-		/* In the insert buffer we may have even bigger tablespace
+		/* In the change buffer we may have even bigger tablespace
 		id's, because we may have dropped those tablespaces, but
-		insert buffer merge has not had time to clean the records from
-		the ibuf tree. */
-
+		the buffered records have not been cleaned yet. */
 		ibuf_update_max_tablespace_id();
 	}
 
@@ -2388,10 +1925,8 @@ skip_monitors:
 		if (!get_wsrep_recovery()) {
 #endif /* WITH_WSREP */
 
-		/* Create the buffer pool dump/load thread */
-		srv_buf_dump_thread_active = true;
-		buf_dump_thread_handle=
-			os_thread_create(buf_dump_thread, NULL, NULL);
+		/* Start buffer pool dump/load task */
+		buf_load_at_startup();
 
 #ifdef WITH_WSREP
 		} else {
@@ -2406,75 +1941,99 @@ skip_monitors:
 		will flush dirty pages and that might need e.g.
 		fil_crypt_threads_event. */
 		fil_system_enter();
-		btr_scrub_init();
 		fil_crypt_threads_init();
 		fil_system_exit();
 
 		/* Initialize online defragmentation. */
 		btr_defragment_init();
-		btr_defragment_thread_active = true;
-		os_thread_create(btr_defragment_thread, NULL, NULL);
 
-		srv_start_state |= SRV_START_STATE_REDO;
+		srv_started_redo = true;
 	}
 
-	/* Create the buffer pool resize thread */
-	srv_buf_resize_thread_active = true;
-	os_thread_create(buf_resize_thread, NULL, NULL);
-
 	return(DB_SUCCESS);
 }
 
 /** Shut down background threads that can generate undo log. */
 void srv_shutdown_bg_undo_sources()
 {
+	srv_shutdown_state = SRV_SHUTDOWN_INITIATED;
+	/* The state is set even if no undo sources were started. */
 	if (srv_undo_sources) {
 		ut_ad(!srv_read_only_mode);
-		srv_shutdown_state = SRV_SHUTDOWN_INITIATED;
 		fts_optimize_shutdown();
 		dict_stats_shutdown();
 		while (row_get_background_drop_list_len_low()) {
-			srv_wake_master_thread();
+			srv_inc_activity_count();
 			os_thread_yield();
 		}
 		srv_undo_sources = false;
 	}
 }
 
+/** Shut down purge and the other background undo sources, so that no
+  plugin code (e.g., audit) can be called inside virtual column computation.
+  Only the first call has any effect; nothing is done if srv_read_only_mode.
+*/
+void innodb_preshutdown()
+{
+  static bool first_time= true; /* cleared by the first call */
+  if (!first_time)
+    return;
+  first_time= false;
+
+  if (srv_read_only_mode)
+    return;
+  if (!srv_fast_shutdown && srv_operation == SRV_OPERATION_NORMAL)
+  {
+    /* Because a slow shutdown must empty the change buffer, we had
+    better prevent any further changes from being buffered. */
+    innodb_change_buffering= 0;
+    /* Poll until no active transactions remain. */
+    if (trx_sys.is_initialised())
+      while (trx_sys.any_active_transactions())
+        os_thread_sleep(1000);
+  }
+  srv_shutdown_bg_undo_sources();
+  srv_purge_shutdown();
+  /* Set the number of fil_crypt threads to 0 if it is nonzero. */
+  if (srv_n_fil_crypt_threads)
+    fil_crypt_set_thread_cnt(0);
+}
+
+
 /** Shut down InnoDB. */
 void innodb_shutdown()
 {
-	ut_ad(!srv_running.load(std::memory_order_relaxed));
+	innodb_preshutdown();
 	ut_ad(!srv_undo_sources);
-
 	switch (srv_operation) {
-	case SRV_OPERATION_RESTORE_ROLLBACK_XA:
-		if (dberr_t err = fil_write_flushed_lsn(log_sys.lsn))
-			ib::error() << "Writing flushed lsn " << log_sys.lsn
-				    << " failed; error=" << err;
-		/* fall through */
 	case SRV_OPERATION_BACKUP:
-	case SRV_OPERATION_RESTORE:
 	case SRV_OPERATION_RESTORE_DELTA:
+		break;
+	case SRV_OPERATION_RESTORE:
 	case SRV_OPERATION_RESTORE_EXPORT:
-		fil_close_all_files();
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+		while (buf_page_cleaner_is_active) {
+			pthread_cond_signal(&buf_pool.do_flush_list);
+			my_cond_wait(&buf_pool.done_flush_list,
+				     &buf_pool.flush_list_mutex.m_mutex);
+		}
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
 		break;
 	case SRV_OPERATION_NORMAL:
 		/* Shut down the persistent files. */
 		logs_empty_and_mark_files_at_shutdown();
-
-		if (ulint n_threads = srv_conc_get_active_threads()) {
-			ib::warn() << "Query counter shows "
-				   << n_threads << " queries still"
-				" inside InnoDB at shutdown";
-		}
 	}
 
+	os_aio_free();
+	fil_space_t::close_all();
 	/* Exit any remaining threads. */
-	srv_shutdown_all_bg_threads();
+	ut_ad(!buf_page_cleaner_is_active);
+	srv_shutdown_threads();
 
 	if (srv_monitor_file) {
-		fclose(srv_monitor_file);
+		my_fclose(srv_monitor_file, MYF(MY_WME));
 		srv_monitor_file = 0;
 		if (srv_monitor_file_name) {
 			unlink(srv_monitor_file_name);
@@ -2483,33 +2042,27 @@ void innodb_shutdown()
 	}
 
 	if (srv_misc_tmpfile) {
-		fclose(srv_misc_tmpfile);
+		my_fclose(srv_misc_tmpfile, MYF(MY_WME));
 		srv_misc_tmpfile = 0;
 	}
 
-	ut_ad(dict_stats_event || !srv_was_started || srv_read_only_mode);
 	ut_ad(dict_sys.is_initialised() || !srv_was_started);
 	ut_ad(trx_sys.is_initialised() || !srv_was_started);
-	ut_ad(buf_dblwr || !srv_was_started || srv_read_only_mode
+	ut_ad(buf_dblwr.is_initialised() || !srv_was_started
+	      || srv_read_only_mode
 	      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
 	ut_ad(lock_sys.is_initialised() || !srv_was_started);
 	ut_ad(log_sys.is_initialised() || !srv_was_started);
-#ifdef BTR_CUR_HASH_ADAPT
-	ut_ad(btr_search_sys || !srv_was_started);
-#endif /* BTR_CUR_HASH_ADAPT */
-	ut_ad(ibuf || !srv_was_started);
+	ut_ad(ibuf.index || !srv_was_started);
 
-	if (dict_stats_event) {
-		dict_stats_thread_deinit();
-	}
+	dict_stats_deinit();
 
-	if (srv_start_state_is_set(SRV_START_STATE_REDO)) {
+	if (srv_started_redo) {
 		ut_ad(!srv_read_only_mode);
 		/* srv_shutdown_bg_undo_sources() already invoked
 		fts_optimize_shutdown(); dict_stats_shutdown(); */
 
 		fil_crypt_threads_cleanup();
-		btr_scrub_cleanup();
 		btr_defragment_shutdown();
 	}
 
@@ -2521,15 +2074,11 @@ void innodb_shutdown()
 		btr_search_disable();
 	}
 #endif /* BTR_CUR_HASH_ADAPT */
-	if (ibuf) {
-		ibuf_close();
-	}
+	ibuf_close();
 	log_sys.close();
 	purge_sys.close();
 	trx_sys.close();
-	if (buf_dblwr) {
-		buf_dblwr_free();
-	}
+	buf_dblwr.close();
 	lock_sys.close();
 	trx_pool_close();
 
@@ -2540,24 +2089,14 @@ void innodb_shutdown()
 
 	dict_sys.close();
 	btr_search_sys_free();
-
-	/* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
-	them */
-	os_aio_free();
 	row_mysql_close();
 	srv_free();
 	fil_system.close();
-
-	/* 4. Free all allocated memory */
-
 	pars_lexer_close();
 	recv_sys.close();
 
-	ut_ad(buf_pool_ptr || !srv_was_started);
-	if (buf_pool_ptr) {
-		buf_pool_free(srv_buf_pool_instances);
-	}
-
+	ut_ad(buf_pool.is_initialised() || !srv_was_started);
+	buf_pool.close();
 	sync_check_close();
 
 	srv_sys_space.shutdown();
@@ -2574,8 +2113,8 @@ void innodb_shutdown()
 			   << srv_shutdown_lsn
 			   << "; transaction id " << trx_sys.get_max_trx_id();
 	}
-
-	srv_start_state = SRV_START_STATE_NONE;
+	srv_thread_pool_end();
+	srv_started_redo = false;
 	srv_was_started = false;
 	srv_start_has_been_called = false;
 }
@@ -2607,7 +2146,7 @@ srv_get_meta_data_filename(
 	}
 
 	ut_a(path);
-	len = ut_strlen(path);
+	len = strlen(path);
 	ut_a(max_len >= len);
 
 	strcpy(filename, path);
diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc
index 8f8ba52c584..5f39325dfdd 100644
--- a/storage/innobase/sync/sync0arr.cc
+++ b/storage/innobase/sync/sync0arr.cc
@@ -77,7 +77,6 @@ infinite wait The error_monitor thread scans the global wait array to signal
 any waiting threads who have missed the signal. */
 
 typedef TTASEventMutex<GenericPolicy> WaitMutex;
-typedef TTASEventMutex<BlockMutexPolicy> BlockWaitMutex;
 
 /** The latch types that use the sync array. */
 union sync_object_t {
@@ -87,9 +86,6 @@ union sync_object_t {
 
 	/** Mutex instance */
 	WaitMutex*	mutex;
-
-	/** Block mutex instance */
-	BlockWaitMutex*	bpmutex;
 };
 
 /** A cell where an individual thread may wait suspended until a resource
@@ -173,7 +169,7 @@ mutexes and read-write locks */
 sync_array_t**	sync_wait_array;
 
 /** count of how many times an object has been signalled */
-static ulint			sg_count;
+ulint sg_count;
 
 #define sync_array_exit(a)	mutex_exit(&(a)->mutex)
 #define sync_array_enter(a)	mutex_enter(&(a)->mutex)
@@ -294,22 +290,12 @@ sync_cell_get_event(
 /*================*/
 	sync_cell_t*	cell) /*!< in: non-empty sync array cell */
 {
-	ulint	type = cell->request_type;
-
-	if (type == SYNC_MUTEX) {
-
+	switch(cell->request_type) {
+	case SYNC_MUTEX:
 		return(cell->latch.mutex->event());
-
-	} else if (type == SYNC_BUF_BLOCK) {
-
-		return(cell->latch.bpmutex->event());
-
-	} else if (type == RW_LOCK_X_WAIT) {
-
+	case RW_LOCK_X_WAIT:
 		return(cell->latch.lock->wait_ex_event);
-
-	} else { /* RW_LOCK_S and RW_LOCK_X wait on the same event */
-
+	default:
 		return(cell->latch.lock->event);
 	}
 }
@@ -362,8 +348,6 @@ sync_array_reserve_cell(
 
 	if (cell->request_type == SYNC_MUTEX) {
 		cell->latch.mutex = reinterpret_cast<WaitMutex*>(object);
-	} else if (cell->request_type == SYNC_BUF_BLOCK) {
-		cell->latch.bpmutex = reinterpret_cast<BlockWaitMutex*>(object);
 	} else {
 		cell->latch.lock = reinterpret_cast<rw_lock_t*>(object);
 	}
@@ -468,7 +452,9 @@ sync_array_wait_event(
 #endif /* UNIV_DEBUG */
 	sync_array_exit(arr);
 
+	tpool::tpool_wait_begin();
 	os_event_wait_low(sync_cell_get_event(cell), cell->signal_count);
+	tpool::tpool_wait_end();
 
 	sync_array_free_cell(arr, cell);
 
@@ -491,71 +477,19 @@ sync_array_cell_print(
 	type = cell->request_type;
 
 	fprintf(file,
-		"--Thread %lu has waited at %s line %lu"
+		"--Thread " ULINTPF " has waited at %s line " ULINTPF
 		" for %.2f seconds the semaphore:\n",
-		(ulong) os_thread_pf(cell->thread_id),
-		innobase_basename(cell->file), (ulong) cell->line,
+		ulint(cell->thread_id),
+		innobase_basename(cell->file), cell->line,
 		difftime(time(NULL), cell->reservation_time));
 
-	if (type == SYNC_MUTEX) {
-		WaitMutex*	mutex = cell->latch.mutex;
-		const WaitMutex::MutexPolicy&	policy = mutex->policy();
-#ifdef UNIV_DEBUG
-		const char*	name = policy.context.get_enter_filename();
-		if (name == NULL) {
-			/* The mutex might have been released. */
-			name = "NULL";
-		}
-#endif /* UNIV_DEBUG */
-
-		if (mutex) {
-		fprintf(file,
-			"Mutex at %p, %s, lock var %x\n"
-#ifdef UNIV_DEBUG
-			"Last time reserved in file %s line %u"
-#endif /* UNIV_DEBUG */
-			"\n",
-			(void*) mutex,
-			policy.to_string().c_str(),
-			mutex->state()
-#ifdef UNIV_DEBUG
-			,name,
-			policy.context.get_enter_line()
-#endif /* UNIV_DEBUG */
-			);
-		}
-	} else if (type == SYNC_BUF_BLOCK) {
-		BlockWaitMutex*	mutex = cell->latch.bpmutex;
-
-		const BlockWaitMutex::MutexPolicy&	policy =
-			mutex->policy();
-#ifdef UNIV_DEBUG
-		const char*	name = policy.context.get_enter_filename();
-		if (name == NULL) {
-			/* The mutex might have been released. */
-			name = "NULL";
-		}
-#endif /* UNIV_DEBUG */
-
-		fprintf(file,
-			"Mutex at %p, %s, lock var %lu\n"
-#ifdef UNIV_DEBUG
-			"Last time reserved in file %s line %lu"
-#endif /* UNIV_DEBUG */
-			"\n",
-			(void*) mutex,
-			policy.to_string().c_str(),
-			(ulong) mutex->state()
-#ifdef UNIV_DEBUG
-			,name,
-			(ulong) policy.context.get_enter_line()
-#endif /* UNIV_DEBUG */
-		       );
-	} else if (type == RW_LOCK_X
-		   || type == RW_LOCK_X_WAIT
-		   || type == RW_LOCK_SX
-		   || type == RW_LOCK_S) {
-
+	switch (type) {
+	default:
+		ut_error;
+	case RW_LOCK_X:
+	case RW_LOCK_X_WAIT:
+	case RW_LOCK_SX:
+	case RW_LOCK_S:
 		fputs(type == RW_LOCK_X ? "X-lock on"
 		      : type == RW_LOCK_X_WAIT ? "X-lock (wait_ex) on"
 		      : type == RW_LOCK_SX ? "SX-lock on"
@@ -576,7 +510,7 @@ sync_array_cell_print(
 				fprintf(file,
 					"a writer (thread id " ULINTPF ") has"
 					" reserved it in mode %s",
-					os_thread_pf(rwlock->writer_thread),
+					ulint(rwlock->writer_thread),
 				writer == RW_LOCK_X ? " exclusive\n"
 				: writer == RW_LOCK_SX ? " SX\n"
 					: " wait exclusive\n");
@@ -598,15 +532,41 @@ sync_array_cell_print(
 				innobase_basename(rwlock->last_x_file_name),
 				rwlock->last_x_line
 #if 0 /* JAN: TODO: FIX LATER */
-				, os_thread_pf(rwlock->thread_id),
+				, ulint(rwlock->thread_id),
 				innobase_basename(rwlock->file_name),
 				rwlock->line
 #endif
 				);
 		}
+		break;
+	case SYNC_MUTEX:
+		WaitMutex*	mutex = cell->latch.mutex;
+		const WaitMutex::MutexPolicy&	policy = mutex->policy();
+#ifdef UNIV_DEBUG
+		const char*	name = policy.context.get_enter_filename();
+		if (name == NULL) {
+			/* The mutex might have been released. */
+			name = "NULL";
+		}
+#endif /* UNIV_DEBUG */
 
-	} else {
-		ut_error;
+		if (mutex) {
+		fprintf(file,
+			"Mutex at %p, %s, lock var %x\n"
+#ifdef UNIV_DEBUG
+			"Last time reserved in file %s line %u"
+#endif /* UNIV_DEBUG */
+			"\n",
+			(void*) mutex,
+			policy.to_string().c_str(),
+			mutex->state()
+#ifdef UNIV_DEBUG
+			,name,
+			policy.context.get_enter_line()
+#endif /* UNIV_DEBUG */
+			);
+		}
+		break;
 	}
 
 	if (!cell->waiting) {
@@ -765,7 +725,7 @@ sync_array_detect_deadlock(
 
 				ib::info()
 					<< "Mutex " << mutex << " owned by"
-					" thread " << os_thread_pf(thread)
+					" thread " << thread
 					<< " file " << name << " line "
 					<< policy.context.get_enter_line();
 
@@ -779,52 +739,6 @@ sync_array_detect_deadlock(
 		return(false);
 		}
 
-	case SYNC_BUF_BLOCK: {
-
-		BlockWaitMutex*	mutex = cell->latch.bpmutex;
-
-		const BlockWaitMutex::MutexPolicy&	policy =
-			mutex->policy();
-
-		if (mutex->state() != MUTEX_STATE_UNLOCKED) {
-			thread = policy.context.get_thread_id();
-
-			/* Note that mutex->thread_id above may be
-			also OS_THREAD_ID_UNDEFINED, because the
-			thread which held the mutex maybe has not
-			yet updated the value, or it has already
-			released the mutex: in this case no deadlock
-			can occur, as the wait array cannot contain
-			a thread with ID_UNDEFINED value. */
-			ret = sync_array_deadlock_step(
-				arr, start, thread, 0, depth);
-
-			if (ret) {
-				const char*	name;
-
-				name = policy.context.get_enter_filename();
-
-				if (name == NULL) {
-					/* The mutex might have been
-					released. */
-					name = "NULL";
-				}
-
-				ib::info()
-					<< "Mutex " << mutex << " owned by"
-					" thread " << os_thread_pf(thread)
-					<< " file " << name << " line "
-					<< policy.context.get_enter_line();
-
-
-				return(true);
-			}
-		}
-
-		/* No deadlock */
-		return(false);
-	}
-
 	case RW_LOCK_X:
 	case RW_LOCK_X_WAIT:
 
@@ -946,15 +860,6 @@ sync_array_detect_deadlock(
 #endif /* UNIV_DEBUG */
 
 /**********************************************************************//**
-Increments the signalled count. */
-void
-sync_array_object_signalled()
-/*=========================*/
-{
-	++sg_count;
-}
-
-/**********************************************************************//**
 Prints warnings of long semaphore waits to stderr.
 @return TRUE if fatal semaphore wait threshold was exceeded */
 static
@@ -966,7 +871,8 @@ sync_array_print_long_waits_low(
 	const void**	sema,	/*!< out: longest-waited-for semaphore */
 	ibool*		noticed)/*!< out: TRUE if long wait noticed */
 {
-	ulint		fatal_timeout = srv_fatal_semaphore_wait_threshold;
+	double		fatal_timeout = static_cast<double>(
+		srv_fatal_semaphore_wait_threshold);
 	ibool		fatal = FALSE;
 	double		longest_diff = 0;
 	ulint		i;
@@ -987,6 +893,7 @@ sync_array_print_long_waits_low(
 #else
 # define SYNC_ARRAY_TIMEOUT	240
 #endif
+	const time_t now = time(NULL);
 
 	for (ulint i = 0; i < arr->n_cells; i++) {
 
@@ -1002,7 +909,7 @@ sync_array_print_long_waits_low(
 			continue;
 		}
 
-		double	diff = difftime(time(NULL), cell->reservation_time);
+		double	diff = difftime(now, cell->reservation_time);
 
 		if (diff > SYNC_ARRAY_TIMEOUT) {
 			ib::warn() << "A long semaphore wait:";
@@ -1075,14 +982,7 @@ sync_array_print_long_waits(
 		sync_array_exit(arr);
 	}
 
-	if (noticed && srv_monitor_event) {
-
-		fprintf(stderr,
-			"InnoDB: ###### Starts InnoDB Monitor"
-			" for 30 secs to print diagnostic info:\n");
-
-		my_bool old_val = srv_print_innodb_monitor;
-
+	if (noticed) {
 		/* If some crucial semaphore is reserved, then also the InnoDB
 		Monitor can hang, and we do not get diagnostics. Since in
 		many cases an InnoDB hang is caused by a pwrite() or a pread()
@@ -1095,16 +995,7 @@ sync_array_print_long_waits(
 			MONITOR_VALUE(MONITOR_OS_PENDING_READS),
 			MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
 
-		srv_print_innodb_monitor = TRUE;
-
-		lock_set_timeout_event();
-
-		os_thread_sleep(30000000);
-
-		srv_print_innodb_monitor = static_cast<my_bool>(old_val);
-		fprintf(stderr,
-			"InnoDB: ###### Diagnostic info printed"
-			" to the standard error stream\n");
+		lock_wait_timeout_task(nullptr);
 	}
 
 	return(fatal);
@@ -1311,13 +1202,15 @@ sync_arr_fill_sys_semphore_waits_table(
 			WaitMutex* mutex;
 			type = cell->request_type;
 			/* JAN: FIXME
-			OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_THREAD_ID],
-			(longlong)os_thread_pf(cell->thread)));
+			OK(fields[SYS_SEMAPHORE_WAITS_THREAD_ID]->store(
+			ulint(cell->thread_id), true));
 			*/
 			OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_FILE], innobase_basename(cell->file)));
 			OK(fields[SYS_SEMAPHORE_WAITS_LINE]->store(cell->line, true));
 			fields[SYS_SEMAPHORE_WAITS_LINE]->set_notnull();
-			OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAIT_TIME], (ulint)difftime(time(NULL), cell->reservation_time)));
+			OK(fields[SYS_SEMAPHORE_WAITS_WAIT_TIME]->store(
+				   difftime(time(NULL),
+					    cell->reservation_time)));
 
 			if (type == SYNC_MUTEX) {
 				mutex = static_cast<WaitMutex*>(cell->latch.mutex);
@@ -1325,21 +1218,21 @@ sync_arr_fill_sys_semphore_waits_table(
 				if (mutex) {
 					// JAN: FIXME
 					// OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_OBJECT_NAME], mutex->cmutex_name));
-					OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT], (longlong)mutex));
+					OK(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT]->store((longlong)mutex, true));
 					OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "MUTEX"));
-					//OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID], (longlong)mutex->thread_id));
+					//OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID]->store(mutex->thread_id, true));
 					//OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_HOLDER_FILE], innobase_basename(mutex->file_name)));
 					//OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->store(mutex->line, true));
 					//fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->set_notnull();
 					//OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_CREATED_FILE], innobase_basename(mutex->cfile_name)));
 					//OK(fields[SYS_SEMAPHORE_WAITS_CREATED_LINE]->store(mutex->cline, true));
 					//fields[SYS_SEMAPHORE_WAITS_CREATED_LINE]->set_notnull();
-					//OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG], (longlong)mutex->waiters));
-					//OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD], (longlong)mutex->lock_word));
+					//OK(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG]->store(mutex->waiters, true));
+					//OK(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD]->store(mutex->lock_word, true));
 					//OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(mutex->file_name)));
 					//OK(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->store(mutex->line, true));
 					//fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->set_notnull();
-					//OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT], mutex->count_os_wait));
+					//OK(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT]->store(mutex->count_os_wait, true));
 				}
 			} else if (type == RW_LOCK_X_WAIT
 				|| type == RW_LOCK_X
@@ -1352,7 +1245,7 @@ sync_arr_fill_sys_semphore_waits_table(
 				if (rwlock) {
 					ulint writer = rw_lock_get_writer(rwlock);
 
-					OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT], (longlong)rwlock));
+					OK(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT]->store((longlong)rwlock, true));
 					if (type == RW_LOCK_X) {
 						OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_X"));
 					} else if (type == RW_LOCK_X_WAIT) {
@@ -1366,7 +1259,7 @@ sync_arr_fill_sys_semphore_waits_table(
 					if (writer != RW_LOCK_NOT_LOCKED) {
 						// JAN: FIXME
 						// OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_OBJECT_NAME], rwlock->lock_name));
-						OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WRITER_THREAD], (longlong)os_thread_pf(rwlock->writer_thread)));
+						OK(fields[SYS_SEMAPHORE_WAITS_WRITER_THREAD]->store(ulint(rwlock->writer_thread), true));
 
 						if (writer == RW_LOCK_X) {
 							OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_X"));
@@ -1376,19 +1269,21 @@ sync_arr_fill_sys_semphore_waits_table(
 							OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_SX"));
 						}
 
-						//OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID], (longlong)rwlock->thread_id));
+						//OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID]->store(rwlock->thread_id, true));
 						//OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_HOLDER_FILE], innobase_basename(rwlock->file_name)));
 						//OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->store(rwlock->line, true));
 						//fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->set_notnull();
-						OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_READERS], rw_lock_get_reader_count(rwlock)));
-						OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG],
-						   rwlock->waiters));
-						OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD],
-						   rwlock->lock_word));
+						OK(fields[SYS_SEMAPHORE_WAITS_READERS]->store(rw_lock_get_reader_count(rwlock), true));
+						OK(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG]->store(
+							   rwlock->waiters,
+							   true));
+						OK(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD]->store(
+							   rwlock->lock_word,
+							   true));
 						OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(rwlock->last_x_file_name)));
 						OK(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->store(rwlock->last_x_line, true));
 						fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->set_notnull();
-						OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT], rwlock->count_os_wait));
+						OK(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT]->store(rwlock->count_os_wait, true));
 					}
 				}
 			}
diff --git a/storage/innobase/sync/sync0debug.cc b/storage/innobase/sync/sync0debug.cc
index 377adc5b009..99d13b7b171 100644
--- a/storage/innobase/sync/sync0debug.cc
+++ b/storage/innobase/sync/sync0debug.cc
@@ -38,7 +38,7 @@ Created 2012-08-21 Sunny Bains
 #include <vector>
 #include <string>
 #include <algorithm>
-#include <iostream>
+#include <map>
 
 #ifdef UNIV_DEBUG
 
@@ -108,7 +108,7 @@ struct LatchDebug {
 			const os_thread_id_t& rhs) const
 			UNIV_NOTHROW
 		{
-			return(os_thread_pf(lhs) < os_thread_pf(rhs));
+			return(ulint(lhs) < ulint(rhs));
 		}
 	};
 
@@ -444,13 +444,7 @@ LatchDebug::LatchDebug()
 	LEVEL_MAP_INSERT(RW_LOCK_S);
 	LEVEL_MAP_INSERT(RW_LOCK_X);
 	LEVEL_MAP_INSERT(RW_LOCK_NOT_LOCKED);
-	LEVEL_MAP_INSERT(SYNC_MONITOR_MUTEX);
 	LEVEL_MAP_INSERT(SYNC_ANY_LATCH);
-	LEVEL_MAP_INSERT(SYNC_DOUBLEWRITE);
-	LEVEL_MAP_INSERT(SYNC_BUF_FLUSH_LIST);
-	LEVEL_MAP_INSERT(SYNC_BUF_BLOCK);
-	LEVEL_MAP_INSERT(SYNC_BUF_PAGE_HASH);
-	LEVEL_MAP_INSERT(SYNC_BUF_POOL);
 	LEVEL_MAP_INSERT(SYNC_POOL);
 	LEVEL_MAP_INSERT(SYNC_POOL_MANAGER);
 	LEVEL_MAP_INSERT(SYNC_SEARCH_SYS);
@@ -459,15 +453,11 @@ LatchDebug::LatchDebug()
 	LEVEL_MAP_INSERT(SYNC_FTS_OPTIMIZE);
 	LEVEL_MAP_INSERT(SYNC_FTS_CACHE_INIT);
 	LEVEL_MAP_INSERT(SYNC_RECV);
-	LEVEL_MAP_INSERT(SYNC_LOG_FLUSH_ORDER);
-	LEVEL_MAP_INSERT(SYNC_LOG);
-	LEVEL_MAP_INSERT(SYNC_LOG_WRITE);
-	LEVEL_MAP_INSERT(SYNC_PAGE_CLEANER);
 	LEVEL_MAP_INSERT(SYNC_PURGE_QUEUE);
 	LEVEL_MAP_INSERT(SYNC_TRX_SYS_HEADER);
-	LEVEL_MAP_INSERT(SYNC_THREADS);
 	LEVEL_MAP_INSERT(SYNC_TRX);
 	LEVEL_MAP_INSERT(SYNC_RW_TRX_HASH_ELEMENT);
+	LEVEL_MAP_INSERT(SYNC_READ_VIEW);
 	LEVEL_MAP_INSERT(SYNC_TRX_SYS);
 	LEVEL_MAP_INSERT(SYNC_LOCK_SYS);
 	LEVEL_MAP_INSERT(SYNC_LOCK_WAIT_SYS);
@@ -499,7 +489,6 @@ LatchDebug::LatchDebug()
 	LEVEL_MAP_INSERT(SYNC_FTS_CACHE);
 	LEVEL_MAP_INSERT(SYNC_DICT_OPERATION);
 	LEVEL_MAP_INSERT(SYNC_TRX_I_S_RWLOCK);
-	LEVEL_MAP_INSERT(SYNC_RECV_WRITER);
 	LEVEL_MAP_INSERT(SYNC_LEVEL_VARYING);
 	LEVEL_MAP_INSERT(SYNC_NO_ORDER_CHECK);
 
@@ -547,7 +536,7 @@ LatchDebug::crash(
 		get_level_name(latched->m_level);
 
 	ib::error()
-		<< "Thread " << os_thread_pf(os_thread_get_curr_id())
+		<< "Thread " << os_thread_get_curr_id()
 		<< " already owns a latch "
 		<< sync_latch_get_name(latch->m_id) << " at level"
 		<< " " << latched->m_level << " (" << latch_level_name
@@ -729,23 +718,17 @@ LatchDebug::check_order(
 
 		/* Fall through */
 
-	case SYNC_MONITOR_MUTEX:
 	case SYNC_RECV:
 	case SYNC_WORK_QUEUE:
 	case SYNC_FTS_TOKENIZE:
 	case SYNC_FTS_OPTIMIZE:
 	case SYNC_FTS_CACHE:
 	case SYNC_FTS_CACHE_INIT:
-	case SYNC_PAGE_CLEANER:
-	case SYNC_LOG:
-	case SYNC_LOG_WRITE:
-	case SYNC_LOG_FLUSH_ORDER:
-	case SYNC_DOUBLEWRITE:
 	case SYNC_SEARCH_SYS:
-	case SYNC_THREADS:
 	case SYNC_LOCK_SYS:
 	case SYNC_LOCK_WAIT_SYS:
 	case SYNC_RW_TRX_HASH_ELEMENT:
+	case SYNC_READ_VIEW:
 	case SYNC_TRX_SYS:
 	case SYNC_IBUF_BITMAP_MUTEX:
 	case SYNC_REDO_RSEG:
@@ -760,8 +743,6 @@ LatchDebug::check_order(
 	case SYNC_STATS_AUTO_RECALC:
 	case SYNC_POOL:
 	case SYNC_POOL_MANAGER:
-	case SYNC_RECV_WRITER:
-
 		basic_check(latches, level, level);
 		break;
 
@@ -800,35 +781,6 @@ LatchDebug::check_order(
 		}
 		break;
 
-	case SYNC_BUF_FLUSH_LIST:
-	case SYNC_BUF_POOL:
-
-		/* We can have multiple mutexes of this type therefore we
-		can only check whether the greater than condition holds. */
-
-		basic_check(latches, level, level - 1);
-		break;
-
-	case SYNC_BUF_PAGE_HASH:
-
-		/* Multiple page_hash locks are only allowed during
-		buf_validate and that is where buf_pool mutex is already
-		held. */
-
-		/* Fall through */
-
-	case SYNC_BUF_BLOCK:
-
-		/* Either the thread must own the (buffer pool) buf_pool->mutex
-		or it is allowed to latch only ONE of (buffer block)
-		block->mutex or buf_pool->zip_mutex. */
-
-		if (less(latches, level) != NULL) {
-			basic_check(latches, level, level - 1);
-			ut_a(find(latches, SYNC_BUF_POOL) != 0);
-		}
-		break;
-
 	case SYNC_IBUF_BITMAP:
 
 		/* Either the thread must own the master mutex to all
@@ -1178,8 +1130,7 @@ sync_check_iterate(const sync_check_functor_t& functor)
 
 Note: We don't enforce any synchronisation checks. The caller must ensure
 that no races can occur */
-void
-sync_check_enable()
+static void sync_check_enable()
 {
 	if (!srv_sync_debug) {
 
@@ -1247,22 +1198,11 @@ void
 sync_latch_meta_init()
 	UNIV_NOTHROW
 {
-	latch_meta.resize(LATCH_ID_MAX);
+	latch_meta.resize(LATCH_ID_MAX + 1);
 
 	/* The latches should be ordered on latch_id_t. So that we can
 	index directly into the vector to update and fetch meta-data. */
 
-#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
-	LATCH_ADD_MUTEX(BUF_BLOCK_MUTEX, SYNC_BUF_BLOCK, PFS_NOT_INSTRUMENTED);
-#else
-	LATCH_ADD_MUTEX(BUF_BLOCK_MUTEX, SYNC_BUF_BLOCK,
-			buffer_block_mutex_key);
-#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
-
-	LATCH_ADD_MUTEX(BUF_POOL, SYNC_BUF_POOL, buf_pool_mutex_key);
-
-	LATCH_ADD_MUTEX(BUF_POOL_ZIP, SYNC_BUF_BLOCK, buf_pool_zip_mutex_key);
-
 	LATCH_ADD_MUTEX(DICT_FOREIGN_ERR, SYNC_NO_ORDER_CHECK,
 			dict_foreign_err_mutex_key);
 
@@ -1270,8 +1210,6 @@ sync_latch_meta_init()
 
 	LATCH_ADD_MUTEX(FIL_SYSTEM, SYNC_ANY_LATCH, fil_system_mutex_key);
 
-	LATCH_ADD_MUTEX(FLUSH_LIST, SYNC_BUF_FLUSH_LIST, flush_list_mutex_key);
-
 	LATCH_ADD_MUTEX(FTS_DELETE, SYNC_FTS_OPTIMIZE, fts_delete_mutex_key);
 
 	LATCH_ADD_MUTEX(FTS_DOC_ID, SYNC_FTS_OPTIMIZE, fts_doc_id_mutex_key);
@@ -1279,9 +1217,6 @@ sync_latch_meta_init()
 	LATCH_ADD_MUTEX(FTS_PLL_TOKENIZE, SYNC_FTS_TOKENIZE,
 			fts_pll_tokenize_mutex_key);
 
-	LATCH_ADD_MUTEX(HASH_TABLE_MUTEX, SYNC_BUF_PAGE_HASH,
-			hash_table_mutex_key);
-
 	LATCH_ADD_MUTEX(IBUF_BITMAP, SYNC_IBUF_BITMAP_MUTEX,
 			ibuf_bitmap_mutex_key);
 
@@ -1290,18 +1225,6 @@ sync_latch_meta_init()
 	LATCH_ADD_MUTEX(IBUF_PESSIMISTIC_INSERT, SYNC_IBUF_PESS_INSERT_MUTEX,
 			ibuf_pessimistic_insert_mutex_key);
 
-	LATCH_ADD_MUTEX(LOG_SYS, SYNC_LOG, log_sys_mutex_key);
-
-	LATCH_ADD_MUTEX(LOG_WRITE, SYNC_LOG_WRITE, log_sys_write_mutex_key);
-
-	LATCH_ADD_MUTEX(LOG_FLUSH_ORDER, SYNC_LOG_FLUSH_ORDER,
-			log_flush_order_mutex_key);
-
-	LATCH_ADD_MUTEX(MUTEX_LIST, SYNC_NO_ORDER_CHECK, mutex_list_mutex_key);
-
-	LATCH_ADD_MUTEX(PAGE_CLEANER, SYNC_PAGE_CLEANER,
-			page_cleaner_mutex_key);
-
 	LATCH_ADD_MUTEX(PURGE_SYS_PQ, SYNC_PURGE_QUEUE,
 			purge_sys_pq_mutex_key);
 
@@ -1310,8 +1233,6 @@ sync_latch_meta_init()
 
 	LATCH_ADD_MUTEX(RECV_SYS, SYNC_RECV, recv_sys_mutex_key);
 
-	LATCH_ADD_MUTEX(RECV_WRITER, SYNC_RECV_WRITER, recv_writer_mutex_key);
-
 	LATCH_ADD_MUTEX(REDO_RSEG, SYNC_REDO_RSEG, redo_rseg_mutex_key);
 
 	LATCH_ADD_MUTEX(NOREDO_RSEG, SYNC_NOREDO_RSEG, noredo_rseg_mutex_key);
@@ -1335,8 +1256,6 @@ sync_latch_meta_init()
 	LATCH_ADD_MUTEX(RW_LOCK_LIST, SYNC_NO_ORDER_CHECK,
 			rw_lock_list_mutex_key);
 
-	LATCH_ADD_MUTEX(RW_LOCK_MUTEX, SYNC_NO_ORDER_CHECK, rw_lock_mutex_key);
-
 	LATCH_ADD_MUTEX(SRV_INNODB_MONITOR, SYNC_NO_ORDER_CHECK,
 			srv_innodb_monitor_mutex_key);
 
@@ -1346,8 +1265,6 @@ sync_latch_meta_init()
 	LATCH_ADD_MUTEX(SRV_MONITOR_FILE, SYNC_NO_ORDER_CHECK,
 			srv_monitor_file_mutex_key);
 
-	LATCH_ADD_MUTEX(BUF_DBLWR, SYNC_DOUBLEWRITE, buf_dblwr_mutex_key);
-
 	LATCH_ADD_MUTEX(TRX_POOL, SYNC_POOL, trx_pool_mutex_key);
 
 	LATCH_ADD_MUTEX(TRX_POOL_MANAGER, SYNC_POOL_MANAGER,
@@ -1362,41 +1279,14 @@ sync_latch_meta_init()
 
 	LATCH_ADD_MUTEX(TRX_SYS, SYNC_TRX_SYS, trx_sys_mutex_key);
 
-	LATCH_ADD_MUTEX(SRV_SYS, SYNC_THREADS, srv_sys_mutex_key);
-
 	LATCH_ADD_MUTEX(SRV_SYS_TASKS, SYNC_ANY_LATCH, srv_threads_mutex_key);
 
 	LATCH_ADD_MUTEX(PAGE_ZIP_STAT_PER_INDEX, SYNC_ANY_LATCH,
 			page_zip_stat_per_index_mutex_key);
 
-#ifndef PFS_SKIP_EVENT_MUTEX
-	LATCH_ADD_MUTEX(EVENT_MANAGER, SYNC_NO_ORDER_CHECK,
-			event_manager_mutex_key);
-#else
-	LATCH_ADD_MUTEX(EVENT_MANAGER, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-#endif /* !PFS_SKIP_EVENT_MUTEX */
-
-	LATCH_ADD_MUTEX(EVENT_MUTEX, SYNC_NO_ORDER_CHECK, event_mutex_key);
-
 	LATCH_ADD_MUTEX(SYNC_ARRAY_MUTEX, SYNC_NO_ORDER_CHECK,
 			sync_array_mutex_key);
 
-	LATCH_ADD_MUTEX(OS_AIO_READ_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-
-	LATCH_ADD_MUTEX(OS_AIO_WRITE_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-
-	LATCH_ADD_MUTEX(OS_AIO_LOG_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-
-	LATCH_ADD_MUTEX(OS_AIO_IBUF_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-
-	LATCH_ADD_MUTEX(OS_AIO_SYNC_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-
 	LATCH_ADD_MUTEX(ROW_DROP_LIST, SYNC_NO_ORDER_CHECK,
 			row_drop_list_mutex_key);
 
@@ -1409,18 +1299,16 @@ sync_latch_meta_init()
 	LATCH_ADD_RWLOCK(BTR_SEARCH, SYNC_SEARCH_SYS, btr_search_latch_key);
 
 	LATCH_ADD_RWLOCK(BUF_BLOCK_LOCK, SYNC_LEVEL_VARYING,
-			 buf_block_lock_key);
+			 PFS_NOT_INSTRUMENTED);
 
 #ifdef UNIV_DEBUG
 	LATCH_ADD_RWLOCK(BUF_BLOCK_DEBUG, SYNC_LEVEL_VARYING,
-			 buf_block_debug_latch_key);
+			 PFS_NOT_INSTRUMENTED);
 #endif /* UNIV_DEBUG */
 
 	LATCH_ADD_RWLOCK(DICT_OPERATION, SYNC_DICT_OPERATION,
 			 dict_operation_lock_key);
 
-	LATCH_ADD_RWLOCK(CHECKPOINT, SYNC_NO_ORDER_CHECK, checkpoint_lock_key);
-
 	LATCH_ADD_RWLOCK(FIL_SPACE, SYNC_FSP, fil_space_latch_key);
 
 	LATCH_ADD_RWLOCK(FTS_CACHE, SYNC_FTS_CACHE, fts_cache_rw_lock_key);
@@ -1438,15 +1326,7 @@ sync_latch_meta_init()
 
 	LATCH_ADD_RWLOCK(INDEX_TREE, SYNC_INDEX_TREE, index_tree_rw_lock_key);
 
-	LATCH_ADD_RWLOCK(HASH_TABLE_RW_LOCK, SYNC_BUF_PAGE_HASH,
-		  hash_table_locks_key);
-
-	LATCH_ADD_MUTEX(SYNC_DEBUG_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
-
 	/* JAN: TODO: Add PFS instrumentation */
-	LATCH_ADD_MUTEX(SCRUB_STAT_MUTEX, SYNC_NO_ORDER_CHECK,
-			PFS_NOT_INSTRUMENTED);
 	LATCH_ADD_MUTEX(DEFRAGMENT_MUTEX, SYNC_NO_ORDER_CHECK,
 			PFS_NOT_INSTRUMENTED);
 	LATCH_ADD_MUTEX(BTR_DEFRAGMENT_MUTEX, SYNC_NO_ORDER_CHECK,
@@ -1459,6 +1339,7 @@ sync_latch_meta_init()
 			PFS_NOT_INSTRUMENTED);
 	LATCH_ADD_MUTEX(RW_TRX_HASH_ELEMENT, SYNC_RW_TRX_HASH_ELEMENT,
 			rw_trx_hash_element_mutex_key);
+	LATCH_ADD_MUTEX(READ_VIEW, SYNC_READ_VIEW, read_view_mutex_key);
 
 	latch_id_t	id = LATCH_ID_NONE;
 
@@ -1499,173 +1380,6 @@ sync_latch_meta_destroy()
 	latch_meta.clear();
 }
 
-/** Track mutex file creation name and line number. This is to avoid storing
-{ const char* name; uint16_t line; } in every instance. This results in the
-sizeof(Mutex) > 64. We use a lookup table to store it separately. Fetching
-the values is very rare, only required for diagnostic purposes. And, we
-don't create/destroy mutexes that frequently. */
-struct CreateTracker {
-
-	/** Constructor */
-	CreateTracker()
-		UNIV_NOTHROW
-	{
-		m_mutex.init();
-	}
-
-	/** Destructor */
-	~CreateTracker()
-		UNIV_NOTHROW
-	{
-		ut_ad(m_files.empty());
-
-		m_mutex.destroy();
-	}
-
-	/** Register where the latch was created
-	@param[in]	ptr		Latch instance
-	@param[in]	filename	Where created
-	@param[in]	line		Line number in filename */
-	void register_latch(
-		const void*	ptr,
-		const char*	filename,
-		uint16_t	line)
-		UNIV_NOTHROW
-	{
-		m_mutex.enter();
-
-		Files::iterator	lb = m_files.lower_bound(ptr);
-
-		ut_ad(lb == m_files.end()
-		      || m_files.key_comp()(ptr, lb->first));
-
-		typedef Files::value_type value_type;
-
-		m_files.insert(lb, value_type(ptr, File(filename, line)));
-
-		m_mutex.exit();
-	}
-
-	/** Deregister a latch - when it is destroyed
-	@param[in]	ptr		Latch instance being destroyed */
-	void deregister_latch(const void* ptr)
-		UNIV_NOTHROW
-	{
-		m_mutex.enter();
-
-		Files::iterator	lb = m_files.lower_bound(ptr);
-
-		ut_ad(lb != m_files.end()
-		      && !(m_files.key_comp()(ptr, lb->first)));
-
-		m_files.erase(lb);
-
-		m_mutex.exit();
-	}
-
-	/** Get the create string, format is "name:line"
-	@param[in]	ptr		Latch instance
-	@return the create string or "" if not found */
-	std::string get(const void* ptr)
-		UNIV_NOTHROW
-	{
-		m_mutex.enter();
-
-		std::string	created;
-
-		Files::iterator	lb = m_files.lower_bound(ptr);
-
-		if (lb != m_files.end()
-		    && !(m_files.key_comp()(ptr, lb->first))) {
-
-			std::ostringstream	msg;
-
-			msg << lb->second.m_name << ":" << lb->second.m_line;
-
-			created = msg.str();
-		}
-
-		m_mutex.exit();
-
-		return(created);
-	}
-
-private:
-	/** For tracking the filename and line number */
-	struct File {
-
-		/** Constructor */
-		File() UNIV_NOTHROW : m_name(), m_line() { }
-
-		/** Constructor
-		@param[in]	name		Filename where created
-		@param[in]	line		Line number where created */
-		File(const char*  name, uint16_t line)
-			UNIV_NOTHROW
-			:
-			m_name(sync_basename(name)),
-			m_line(line)
-		{
-			/* No op */
-		}
-
-		/** Filename where created */
-		std::string		m_name;
-
-		/** Line number where created */
-		uint16_t		m_line;
-	};
-
-	/** Map the mutex instance to where it was created */
-	typedef std::map<
-		const void*,
-		File,
-		std::less<const void*>,
-		ut_allocator<std::pair<const void* const, File> > >
-		Files;
-
-	typedef OSMutex	Mutex;
-
-	/** Mutex protecting m_files */
-	Mutex			m_mutex;
-
-	/** Track the latch creation */
-	Files			m_files;
-};
-
-/** Track latch creation location. For reducing the size of the latches */
-static CreateTracker	create_tracker;
-
-/** Register a latch, called when it is created
-@param[in]	ptr		Latch instance that was created
-@param[in]	filename	Filename where it was created
-@param[in]	line		Line number in filename */
-void
-sync_file_created_register(
-	const void*	ptr,
-	const char*	filename,
-	uint16_t	line)
-{
-	create_tracker.register_latch(ptr, filename, line);
-}
-
-/** Deregister a latch, called when it is destroyed
-@param[in]	ptr		Latch to be destroyed */
-void
-sync_file_created_deregister(const void* ptr)
-{
-	create_tracker.deregister_latch(ptr);
-}
-
-/** Get the string where the file was created. Its format is "name:line"
-@param[in]	ptr		Latch instance
-@return created information or "" if can't be found */
-std::string
-sync_file_created_get(const void* ptr)
-{
-	return(create_tracker.get(ptr));
-}
-
 /** Initializes the synchronization data structures. */
 void
 sync_check_init()
@@ -1675,15 +1389,15 @@ sync_check_init()
 
 	sync_latch_meta_init();
 
-	/* Init the rw-lock & mutex list and create the mutex to protect it. */
-
-	UT_LIST_INIT(rw_lock_list, &rw_lock_t::list);
+	/* Create the mutex that protects rw_lock_list. */
 
 	mutex_create(LATCH_ID_RW_LOCK_LIST, &rw_lock_list_mutex);
 
 	ut_d(LatchDebug::init());
 
 	sync_array_init();
+
+	ut_d(sync_check_enable());
 }
 
 /** Free the InnoDB synchronization data structures. */
diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc
index 9f9453e20b5..2624ffb9e46 100644
--- a/storage/innobase/sync/sync0rw.cc
+++ b/storage/innobase/sync/sync0rw.cc
@@ -141,7 +141,7 @@ wait_ex_event:	A thread may only wait on the wait_ex_event after it has
 rw_lock_stats_t		rw_lock_stats;
 
 /* The global list of rw-locks */
-rw_lock_list_t		rw_lock_list;
+ilist<rw_lock_t> rw_lock_list;
 ib_mutex_t		rw_lock_list_mutex;
 
 #ifdef UNIV_DEBUG
@@ -224,8 +224,8 @@ rw_lock_create_func(
 	/* This should hold in practice. If it doesn't then we need to
 	split the source file anyway. Or create the locks on lines
 	less than 8192. cline is unsigned:13. */
-	ut_ad(cline <= 8192);
-	lock->cline = cline;
+	ut_ad(cline <= ((1U << 13) - 1));
+	lock->cline = cline & ((1U << 13) - 1);
 	lock->count_os_wait = 0;
 	lock->last_x_file_name = "not yet reserved";
 	lock->last_x_line = 0;
@@ -234,8 +234,10 @@ rw_lock_create_func(
 
 	lock->is_block_lock = 0;
 
+	ut_d(lock->created = true);
+
 	mutex_enter(&rw_lock_list_mutex);
-	UT_LIST_ADD_FIRST(rw_lock_list, lock);
+	rw_lock_list.push_front(*lock);
 	mutex_exit(&rw_lock_list_mutex);
 }
 
@@ -251,13 +253,15 @@ rw_lock_free_func(
 	ut_ad(rw_lock_validate(lock));
 	ut_a(lock->lock_word == X_LOCK_DECR);
 
+	ut_d(lock->created = false);
+
 	mutex_enter(&rw_lock_list_mutex);
 
 	os_event_destroy(lock->event);
 
 	os_event_destroy(lock->wait_ex_event);
 
-	UT_LIST_REMOVE(rw_lock_list, lock);
+	rw_lock_list.remove(*lock);
 
 	mutex_exit(&rw_lock_list_mutex);
 }
@@ -544,7 +548,7 @@ rw_lock_x_lock_low(
 	ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_X, file_name, line));
 
 	lock->last_x_file_name = file_name;
-	lock->last_x_line = line;
+	lock->last_x_line = line & ((1U << 14) - 1);
 
 	return(TRUE);
 }
@@ -623,7 +627,7 @@ rw_lock_sx_lock_low(
 	ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_SX, file_name, line));
 
 	lock->last_x_file_name = file_name;
-	lock->last_x_line = line;
+	lock->last_x_line = line & ((1U << 14) - 1);
 
 	return(TRUE);
 }
@@ -850,6 +854,8 @@ rw_lock_validate(
 {
 	ut_ad(lock);
 
+	ut_ad(lock->created);
+
 	int32_t lock_word = lock->lock_word;
 
 	ut_ad(lock->waiters < 2);
@@ -1095,17 +1101,15 @@ rw_lock_list_print_info(
 	      "RW-LATCH INFO\n"
 	      "-------------\n", file);
 
-	for (const rw_lock_t* lock = UT_LIST_GET_FIRST(rw_lock_list);
-	     lock != NULL;
-	     lock = UT_LIST_GET_NEXT(list, lock)) {
+	for (const rw_lock_t& lock : rw_lock_list) {
 
 		count++;
 
-		if (lock->lock_word != X_LOCK_DECR) {
+		if (lock.lock_word != X_LOCK_DECR) {
 
-			fprintf(file, "RW-LOCK: %p ", (void*) lock);
+			fprintf(file, "RW-LOCK: %p ", (void*) &lock);
 
-			if (int32_t waiters= lock->waiters) {
+			if (int32_t waiters= lock.waiters) {
 				fprintf(file, " (%d waiters)\n", waiters);
 			} else {
 				putc('\n', file);
@@ -1115,7 +1119,7 @@ rw_lock_list_print_info(
 
 			rw_lock_debug_mutex_enter();
 
-			for (info = UT_LIST_GET_FIRST(lock->debug_list);
+			for (info = UT_LIST_GET_FIRST(lock.debug_list);
 			     info != NULL;
 			     info = UT_LIST_GET_NEXT(list, info)) {
 
@@ -1140,10 +1144,10 @@ rw_lock_debug_print(
 {
 	ulint	rwt = info->lock_type;
 
-	fprintf(f, "Locked: thread %lu file %s line %lu  ",
-		static_cast<ulong>(os_thread_pf(info->thread_id)),
+	fprintf(f, "Locked: thread " ULINTPF " file %s line %u  ",
+		ulint(info->thread_id),
 		sync_basename(info->file_name),
-		static_cast<ulong>(info->line));
+		info->line);
 
 	switch (rwt) {
 	case RW_LOCK_S:
@@ -1183,7 +1187,7 @@ rw_lock_t::to_string() const
 	ut_ad(rw_lock_validate(this));
 
 	msg << "RW-LATCH: "
-	    << "thread id " << os_thread_pf(os_thread_get_curr_id())
+	    << "thread id " << os_thread_get_curr_id()
 	    << " addr: " << this
 	    << " Locked from: ";
 
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
index e7be502632d..0a6f8bfbebd 100644
--- a/storage/innobase/sync/sync0sync.cc
+++ b/storage/innobase/sync/sync0sync.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
 Copyright (c) 2008, Google Inc.
 Copyright (c) 2020, MariaDB Corporation.
 
@@ -35,9 +36,7 @@ Created 9/5/1995 Heikki Tuuri
 #include "sync0sync.h"
 
 #ifdef UNIV_PFS_MUTEX
-mysql_pfs_key_t	buffer_block_mutex_key;
 mysql_pfs_key_t	buf_pool_mutex_key;
-mysql_pfs_key_t	buf_pool_zip_mutex_key;
 mysql_pfs_key_t	dict_foreign_err_mutex_key;
 mysql_pfs_key_t	dict_sys_mutex_key;
 mysql_pfs_key_t	fil_system_mutex_key;
@@ -45,20 +44,15 @@ mysql_pfs_key_t	flush_list_mutex_key;
 mysql_pfs_key_t	fts_delete_mutex_key;
 mysql_pfs_key_t	fts_doc_id_mutex_key;
 mysql_pfs_key_t	fts_pll_tokenize_mutex_key;
-mysql_pfs_key_t	hash_table_mutex_key;
 mysql_pfs_key_t	ibuf_bitmap_mutex_key;
 mysql_pfs_key_t	ibuf_mutex_key;
 mysql_pfs_key_t	ibuf_pessimistic_insert_mutex_key;
 mysql_pfs_key_t	log_sys_mutex_key;
-mysql_pfs_key_t	log_sys_write_mutex_key;
 mysql_pfs_key_t	log_cmdq_mutex_key;
 mysql_pfs_key_t	log_flush_order_mutex_key;
-mysql_pfs_key_t	mutex_list_mutex_key;
 mysql_pfs_key_t	recalc_pool_mutex_key;
-mysql_pfs_key_t	page_cleaner_mutex_key;
 mysql_pfs_key_t	purge_sys_pq_mutex_key;
 mysql_pfs_key_t	recv_sys_mutex_key;
-mysql_pfs_key_t	recv_writer_mutex_key;
 mysql_pfs_key_t	redo_rseg_mutex_key;
 mysql_pfs_key_t	noredo_rseg_mutex_key;
 mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
@@ -69,7 +63,6 @@ mysql_pfs_key_t rtr_active_mutex_key;
 mysql_pfs_key_t	rtr_match_mutex_key;
 mysql_pfs_key_t	rtr_path_mutex_key;
 mysql_pfs_key_t	rw_lock_list_mutex_key;
-mysql_pfs_key_t	rw_lock_mutex_key;
 mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
 mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
 mysql_pfs_key_t	srv_monitor_file_mutex_key;
@@ -80,24 +73,16 @@ mysql_pfs_key_t	trx_pool_manager_mutex_key;
 mysql_pfs_key_t	lock_mutex_key;
 mysql_pfs_key_t	lock_wait_mutex_key;
 mysql_pfs_key_t	trx_sys_mutex_key;
-mysql_pfs_key_t	srv_sys_mutex_key;
 mysql_pfs_key_t	srv_threads_mutex_key;
-mysql_pfs_key_t	event_mutex_key;
-mysql_pfs_key_t	event_manager_mutex_key;
 mysql_pfs_key_t	sync_array_mutex_key;
 mysql_pfs_key_t	thread_mutex_key;
 mysql_pfs_key_t row_drop_list_mutex_key;
 mysql_pfs_key_t	rw_trx_hash_element_mutex_key;
+mysql_pfs_key_t	read_view_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 #ifdef UNIV_PFS_RWLOCK
 mysql_pfs_key_t	btr_search_latch_key;
-mysql_pfs_key_t	buf_block_lock_key;
-# ifdef UNIV_DEBUG
-mysql_pfs_key_t	buf_block_debug_latch_key;
-# endif /* UNIV_DEBUG */
-mysql_pfs_key_t	checkpoint_lock_key;
 mysql_pfs_key_t	dict_operation_lock_key;
-mysql_pfs_key_t	hash_table_locks_key;
 mysql_pfs_key_t	index_tree_rw_lock_key;
 mysql_pfs_key_t	index_online_log_key;
 mysql_pfs_key_t	fil_space_latch_key;
@@ -137,15 +122,18 @@ sync_print_wait_info(FILE* file)
 	fprintf(file,
 		"Spin rounds per wait: %.2f RW-shared,"
 		" %.2f RW-excl, %.2f RW-sx\n",
-		(double) rw_lock_stats.rw_s_spin_round_count /
-		(rw_lock_stats.rw_s_spin_wait_count
-		 ? rw_lock_stats.rw_s_spin_wait_count : 1LL),
-		(double) rw_lock_stats.rw_x_spin_round_count /
-		(rw_lock_stats.rw_x_spin_wait_count
-		 ? rw_lock_stats.rw_x_spin_wait_count : 1LL),
-		(double) rw_lock_stats.rw_sx_spin_round_count /
-		(rw_lock_stats.rw_sx_spin_wait_count
-		 ? rw_lock_stats.rw_sx_spin_wait_count : 1LL));
+		rw_lock_stats.rw_s_spin_wait_count
+		? static_cast<double>(rw_lock_stats.rw_s_spin_round_count) /
+		static_cast<double>(rw_lock_stats.rw_s_spin_wait_count)
+		: static_cast<double>(rw_lock_stats.rw_s_spin_round_count),
+		rw_lock_stats.rw_x_spin_wait_count
+		? static_cast<double>(rw_lock_stats.rw_x_spin_round_count) /
+		static_cast<double>(rw_lock_stats.rw_x_spin_wait_count)
+		: static_cast<double>(rw_lock_stats.rw_x_spin_round_count),
+		rw_lock_stats.rw_sx_spin_wait_count
+		? static_cast<double>(rw_lock_stats.rw_sx_spin_round_count) /
+		static_cast<double>(rw_lock_stats.rw_sx_spin_wait_count)
+		: static_cast<double>(rw_lock_stats.rw_sx_spin_round_count));
 }
 
 /**
@@ -250,11 +238,8 @@ MutexMonitor::reset()
 
 	mutex_enter(&rw_lock_list_mutex);
 
-	for (rw_lock_t* rw_lock = UT_LIST_GET_FIRST(rw_lock_list);
-	     rw_lock != NULL;
-	     rw_lock = UT_LIST_GET_NEXT(list, rw_lock)) {
-
-		rw_lock->count_os_wait = 0;
+	for (rw_lock_t& rw_lock : rw_lock_list) {
+		rw_lock.count_os_wait = 0;
 	}
 
 	mutex_exit(&rw_lock_list_mutex);
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
index c648acdf575..dc240387f21 100644
--- a/storage/innobase/trx/trx0i_s.cc
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -149,7 +149,7 @@ struct trx_i_s_cache_t {
 	i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
 /** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
 #define LOCKS_HASH_CELLS_NUM		10000
-	hash_table_t*	locks_hash;	/*!< hash table used to eliminate
+	hash_table_t	locks_hash;	/*!< hash table used to eliminate
 					duplicate entries in the
 					innodb_locks table */
 /** Initial size of the cache storage */
@@ -159,7 +159,7 @@ struct trx_i_s_cache_t {
 	ha_storage_t*	storage;	/*!< storage for external volatile
 					data that may become unavailable
 					when we release
-					lock_sys.mutex or trx_sys.mutex */
+					lock_sys.mutex */
 	ulint		mem_allocd;	/*!< the amount of memory
 					allocated with mem_alloc*() */
 	bool		is_truncated;	/*!< this is true if the memory
@@ -176,31 +176,13 @@ INFORMATION SCHEMA tables is fetched and later retrieved by the C++
 code in handler/i_s.cc. */
 trx_i_s_cache_t*	trx_i_s_cache = &trx_i_s_cache_static;
 
-/*******************************************************************//**
-For a record lock that is in waiting state retrieves the only bit that
-is set, for a table lock returns ULINT_UNDEFINED.
-@return record number within the heap */
-static
-ulint
-wait_lock_get_heap_no(
-/*==================*/
-	const lock_t*	lock)	/*!< in: lock */
+/** @return the heap number of a record lock
+@retval 0xFFFF for table locks */
+static uint16_t wait_lock_get_heap_no(const lock_t *lock)
 {
-	ulint	ret;
-
-	switch (lock_get_type(lock)) {
-	case LOCK_REC:
-		ret = lock_rec_find_set_bit(lock);
-		ut_a(ret != ULINT_UNDEFINED);
-		break;
-	case LOCK_TABLE:
-		ret = ULINT_UNDEFINED;
-		break;
-	default:
-		ut_error;
-	}
-
-	return(ret);
+  return lock_get_type(lock) == LOCK_REC
+    ? static_cast<uint16_t>(lock_rec_find_set_bit(lock))
+    : uint16_t{0xFFFF};
 }
 
 /*******************************************************************//**
@@ -402,25 +384,18 @@ i_s_locks_row_validate(
 /*===================*/
 	const i_s_locks_row_t*	row)	/*!< in: row to validate */
 {
-	ut_ad(row->lock_mode != NULL);
-	ut_ad(row->lock_type != NULL);
+	ut_ad(row->lock_mode);
 	ut_ad(row->lock_table != NULL);
 	ut_ad(row->lock_table_id != 0);
 
-	if (row->lock_space == ULINT_UNDEFINED) {
+	if (!row->lock_index) {
 		/* table lock */
-		ut_ad(!strcmp("TABLE", row->lock_type));
-		ut_ad(row->lock_index == NULL);
-		ut_ad(row->lock_data == NULL);
-		ut_ad(row->lock_page == ULINT_UNDEFINED);
-		ut_ad(row->lock_rec == ULINT_UNDEFINED);
+		ut_ad(!row->lock_data);
+		ut_ad(row->lock_page == page_id_t(0, 0));
+		ut_ad(!row->lock_rec);
 	} else {
 		/* record lock */
-		ut_ad(!strcmp("RECORD", row->lock_type));
-		ut_ad(row->lock_index != NULL);
 		/* row->lock_data == NULL if buf_page_try_get() == NULL */
-		ut_ad(row->lock_page != ULINT_UNDEFINED);
-		ut_ad(row->lock_rec != ULINT_UNDEFINED);
 	}
 
 	return(TRUE);
@@ -501,21 +476,7 @@ fill_trx_row(
 	}
 
 thd_done:
-	s = trx->op_info;
-
-	if (s != NULL && s[0] != '\0') {
-
-		TRX_I_S_STRING_COPY(s, row->trx_operation_state,
-				    TRX_I_S_TRX_OP_STATE_MAX_LEN, cache);
-
-		if (row->trx_operation_state == NULL) {
-
-			return(FALSE);
-		}
-	} else {
-
-		row->trx_operation_state = NULL;
-	}
+	row->trx_operation_state = trx->op_info;
 
 	row->trx_tables_in_use = trx->n_mysql_tables_in_use;
 
@@ -533,25 +494,7 @@ thd_done:
 
 	row->trx_rows_modified = trx->undo_no;
 
-	row->trx_concurrency_tickets = trx->n_tickets_to_enter_innodb;
-
-	switch (trx->isolation_level) {
-	case TRX_ISO_READ_UNCOMMITTED:
-		row->trx_isolation_level = "READ UNCOMMITTED";
-		break;
-	case TRX_ISO_READ_COMMITTED:
-		row->trx_isolation_level = "READ COMMITTED";
-		break;
-	case TRX_ISO_REPEATABLE_READ:
-		row->trx_isolation_level = "REPEATABLE READ";
-		break;
-	case TRX_ISO_SERIALIZABLE:
-		row->trx_isolation_level = "SERIALIZABLE";
-		break;
-	/* Should not happen as TRX_ISO_READ_COMMITTED is default */
-	default:
-		row->trx_isolation_level = "UNKNOWN";
-	}
+	row->trx_isolation_level = trx->isolation_level;
 
 	row->trx_unique_checks = (ibool) trx->check_unique_secondary;
 
@@ -683,9 +626,7 @@ fill_lock_data(
 
 	mtr_start(&mtr);
 
-	block = buf_page_try_get(page_id_t(lock_rec_get_space_id(lock),
-					   lock_rec_get_page_no(lock)),
-				 &mtr);
+	block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr);
 
 	if (block == NULL) {
 
@@ -749,22 +690,42 @@ fill_lock_data(
 /*******************************************************************//**
 Fills i_s_locks_row_t object. Returns its first argument.
 If memory can not be allocated then FALSE is returned.
-@return FALSE if allocation fails */
-static
-ibool
-fill_locks_row(
-/*===========*/
+@return false if allocation fails */
+static bool fill_locks_row(
 	i_s_locks_row_t* row,	/*!< out: result object that's filled */
 	const lock_t*	lock,	/*!< in: lock to get data from */
-	ulint		heap_no,/*!< in: lock's record number
-				or ULINT_UNDEFINED if the lock
+	uint16_t	heap_no,/*!< in: lock's record number
+				or 0xFFFF if the lock
 				is a table lock */
 	trx_i_s_cache_t* cache)	/*!< in/out: cache into which to copy
 				volatile strings */
 {
-	row->lock_trx_id = lock_get_trx_id(lock);
-	row->lock_mode = lock_get_mode_str(lock);
-	row->lock_type = lock_get_type_str(lock);
+	row->lock_trx_id = lock->trx->id;
+	const auto lock_type = lock_get_type(lock);
+	ut_ad(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
+
+	const bool is_gap_lock = lock_type == LOCK_REC
+		&& (lock->type_mode & LOCK_GAP);
+	switch (lock->type_mode & LOCK_MODE_MASK) {
+	case LOCK_S:
+		row->lock_mode = uint8_t(1 + is_gap_lock);
+		break;
+	case LOCK_X:
+		row->lock_mode = uint8_t(3 + is_gap_lock);
+		break;
+	case LOCK_IS:
+		row->lock_mode = uint8_t(5 + is_gap_lock);
+		break;
+	case LOCK_IX:
+		row->lock_mode = uint8_t(7 + is_gap_lock);
+		break;
+	case LOCK_AUTO_INC:
+		row->lock_mode = 9;
+		break;
+	default:
+		ut_ad("unknown lock mode" == 0);
+		row->lock_mode = 0;
+	}
 
 	row->lock_table = ha_storage_put_str_memlim(
 		cache->storage, lock_get_table_name(lock).m_name,
@@ -773,11 +734,10 @@ fill_locks_row(
 	/* memory could not be allocated */
 	if (row->lock_table == NULL) {
 
-		return(FALSE);
+		return false;
 	}
 
-	switch (lock_get_type(lock)) {
-	case LOCK_REC:
+	if (lock_type == LOCK_REC) {
 		row->lock_index = ha_storage_put_str_memlim(
 			cache->storage, lock_rec_get_index_name(lock),
 			MAX_ALLOWED_FOR_STORAGE(cache));
@@ -785,32 +745,24 @@ fill_locks_row(
 		/* memory could not be allocated */
 		if (row->lock_index == NULL) {
 
-			return(FALSE);
+			return false;
 		}
 
-		row->lock_space = lock_rec_get_space_id(lock);
-		row->lock_page = lock_rec_get_page_no(lock);
+		row->lock_page = lock->un_member.rec_lock.page_id;
 		row->lock_rec = heap_no;
 
 		if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
 
 			/* memory could not be allocated */
-			return(FALSE);
+			return false;
 		}
-
-		break;
-	case LOCK_TABLE:
+	} else {
 		row->lock_index = NULL;
 
-		row->lock_space = ULINT_UNDEFINED;
-		row->lock_page = ULINT_UNDEFINED;
-		row->lock_rec = ULINT_UNDEFINED;
+		row->lock_page = page_id_t(0, 0);
+		row->lock_rec = 0;
 
 		row->lock_data = NULL;
-
-		break;
-	default:
-		ut_error;
 	}
 
 	row->lock_table_id = lock_get_table_id(lock);
@@ -818,7 +770,7 @@ fill_locks_row(
 	row->hash_chain.value = row;
 	ut_ad(i_s_locks_row_validate(row));
 
-	return(TRUE);
+	return true;
 }
 
 /*******************************************************************//**
@@ -858,7 +810,7 @@ fold_lock(
 /*======*/
 	const lock_t*	lock,	/*!< in: lock object to fold */
 	ulint		heap_no)/*!< in: lock's record number
-				or ULINT_UNDEFINED if the lock
+				or 0xFFFF if the lock
 				is a table lock */
 {
 #ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
@@ -870,14 +822,10 @@ fold_lock(
 
 	switch (lock_get_type(lock)) {
 	case LOCK_REC:
-		ut_a(heap_no != ULINT_UNDEFINED);
-
-		ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock),
-					 lock_rec_get_space_id(lock));
-
-		ret = ut_fold_ulint_pair(ret,
-					 lock_rec_get_page_no(lock));
-
+		ut_a(heap_no != 0xFFFF);
+		ret = ut_fold_ulint_pair((ulint) lock->trx->id,
+					 lock->un_member.rec_lock.page_id.
+					 fold());
 		ret = ut_fold_ulint_pair(ret, heap_no);
 
 		break;
@@ -885,7 +833,7 @@ fold_lock(
 		/* this check is actually not necessary for continuing
 		correct operation, but something must have gone wrong if
 		it fails. */
-		ut_a(heap_no == ULINT_UNDEFINED);
+		ut_a(heap_no == 0xFFFF);
 
 		ret = (ulint) lock_get_table_id(lock);
 
@@ -908,7 +856,7 @@ locks_row_eq_lock(
 	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
 	const lock_t*		lock,	/*!< in: lock object */
 	ulint			heap_no)/*!< in: lock's record number
-					or ULINT_UNDEFINED if the lock
+					or 0xFFFF if the lock
 					is a table lock */
 {
 	ut_ad(i_s_locks_row_validate(row));
@@ -917,20 +865,19 @@ locks_row_eq_lock(
 #else
 	switch (lock_get_type(lock)) {
 	case LOCK_REC:
-		ut_a(heap_no != ULINT_UNDEFINED);
+		ut_a(heap_no != 0xFFFF);
 
-		return(row->lock_trx_id == lock_get_trx_id(lock)
-		       && row->lock_space == lock_rec_get_space_id(lock)
-		       && row->lock_page == lock_rec_get_page_no(lock)
+		return(row->lock_trx_id == lock->trx->id
+		       && row->lock_page == lock->un_member.rec_lock.page_id
 		       && row->lock_rec == heap_no);
 
 	case LOCK_TABLE:
 		/* this check is actually not necessary for continuing
 		correct operation, but something must have gone wrong if
 		it fails. */
-		ut_a(heap_no == ULINT_UNDEFINED);
+		ut_a(heap_no == 0xFFFF);
 
-		return(row->lock_trx_id == lock_get_trx_id(lock)
+		return(row->lock_trx_id == lock->trx->id
 		       && row->lock_table_id == lock_get_table_id(lock));
 
 	default:
@@ -951,8 +898,8 @@ search_innodb_locks(
 /*================*/
 	trx_i_s_cache_t*	cache,	/*!< in: cache */
 	const lock_t*		lock,	/*!< in: lock to search for */
-	ulint			heap_no)/*!< in: lock's record number
-					or ULINT_UNDEFINED if the lock
+	uint16_t		heap_no)/*!< in: lock's record number
+					or 0xFFFF if the lock
 					is a table lock */
 {
 	i_s_hash_chain_t*	hash_chain;
@@ -961,7 +908,7 @@ search_innodb_locks(
 		/* hash_chain->"next" */
 		next,
 		/* the hash table */
-		cache->locks_hash,
+		&cache->locks_hash,
 		/* fold */
 		fold_lock(lock, heap_no),
 		/* the type of the next variable */
@@ -994,8 +941,8 @@ add_lock_to_cache(
 /*==============*/
 	trx_i_s_cache_t*	cache,	/*!< in/out: cache */
 	const lock_t*		lock,	/*!< in: the element to add */
-	ulint			heap_no)/*!< in: lock's record number
-					or ULINT_UNDEFINED if the lock
+	uint16_t		heap_no)/*!< in: lock's record number
+					or 0xFFFF if the lock
 					is a table lock */
 {
 	i_s_locks_row_t*	dst_row;
@@ -1037,7 +984,7 @@ add_lock_to_cache(
 		/* hash_chain->"next" */
 		next,
 		/* the hash table */
-		cache->locks_hash,
+		&cache->locks_hash,
 		/* fold */
 		fold_lock(lock, heap_no),
 		/* add this data to the hash */
@@ -1109,13 +1056,12 @@ add_trx_relevant_locks_to_cache(
 	if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
 
 		const lock_t*		curr_lock;
-		ulint			wait_lock_heap_no;
 		i_s_locks_row_t*	blocking_lock_row;
 		lock_queue_iterator_t	iter;
 
 		ut_a(trx->lock.wait_lock != NULL);
 
-		wait_lock_heap_no
+		uint16_t wait_lock_heap_no
 			= wait_lock_get_heap_no(trx->lock.wait_lock);
 
 		/* add the requested lock */
@@ -1212,7 +1158,7 @@ trx_i_s_cache_clear(
 	cache->innodb_locks.rows_used = 0;
 	cache->innodb_lock_waits.rows_used = 0;
 
-	hash_table_clear(cache->locks_hash);
+	cache->locks_hash.clear();
 
 	ha_storage_empty(&cache->storage);
 }
@@ -1271,28 +1217,20 @@ static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
 
 static void fetch_data_into_cache(trx_i_s_cache_t *cache)
 {
-  const trx_t *const purge_trx= purge_sys.query ? purge_sys.query->trx : NULL;
-
   ut_ad(lock_mutex_own());
   trx_i_s_cache_clear(cache);
 
   /* Capture the state of transactions */
-  mutex_enter(&trx_sys.mutex);
-  for (trx_t *trx= UT_LIST_GET_FIRST(trx_sys.trx_list);
-       trx != NULL;
-       trx= UT_LIST_GET_NEXT(trx_list, trx))
-  {
-    if (trx != purge_trx && trx->state != TRX_STATE_NOT_STARTED)
+  trx_sys.trx_list.for_each([cache](trx_t &trx) {
+    if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
+        &trx != (purge_sys.query ? purge_sys.query->trx : nullptr))
     {
-      mutex_enter(&trx->mutex);
-      if (trx->state != TRX_STATE_NOT_STARTED)
-        fetch_data_into_cache_low(cache, trx);
-      mutex_exit(&trx->mutex);
-      if (cache->is_truncated)
-        break;
-     }
-  }
-  mutex_exit(&trx_sys.mutex);
+      mutex_enter(&trx.mutex);
+      if (trx.state != TRX_STATE_NOT_STARTED)
+        fetch_data_into_cache_low(cache, &trx);
+      mutex_exit(&trx.mutex);
+    }
+  });
   cache->is_truncated= false;
 }
 
@@ -1360,7 +1298,7 @@ trx_i_s_cache_init(
 	table_cache_init(&cache->innodb_lock_waits,
 			 sizeof(i_s_lock_waits_row_t));
 
-	cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM);
+	cache->locks_hash.create(LOCKS_HASH_CELLS_NUM);
 
 	cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
 					   CACHE_STORAGE_HASH_CELLS);
@@ -1379,7 +1317,7 @@ trx_i_s_cache_free(
 {
 	rw_lock_free(&cache->rw_lock);
 
-	hash_table_free(cache->locks_hash);
+	cache->locks_hash.free();
 	ha_storage_free(cache->storage);
 	table_cache_free(&cache->innodb_trx);
 	table_cache_free(&cache->innodb_locks);
@@ -1528,13 +1466,13 @@ trx_i_s_create_lock_id(
 
 	/* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
 
-	if (row->lock_space != ULINT_UNDEFINED) {
+	if (row->lock_index) {
 		/* record lock */
 		res_len = snprintf(lock_id, lock_id_size,
 				   TRX_ID_FMT
-				   ":" ULINTPF ":" ULINTPF ":" ULINTPF,
-				   row->lock_trx_id, row->lock_space,
-				   row->lock_page, row->lock_rec);
+				   ":%u:%u:%u",
+				   row->lock_trx_id, row->lock_page.space(),
+				   row->lock_page.page_no(), row->lock_rec);
 	} else {
 		/* table lock */
 		res_len = snprintf(lock_id, lock_id_size,
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index 3f2f3626bf2..4d84f295c0b 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -43,6 +43,8 @@ Created 3/26/1996 Heikki Tuuri
 #include "trx0trx.h"
 #include <mysql/service_wsrep.h>
 
+#include <unordered_map>
+
 /** Maximum allowable purge history length.  <=0 means 'infinite'. */
 ulong		srv_max_purge_lag = 0;
 
@@ -147,7 +149,7 @@ purge_graph_build()
 		NULL, NULL, QUE_FORK_PURGE, heap);
 	fork->trx = trx;
 
-	for (ulint i = 0; i < srv_n_purge_threads; ++i) {
+	for (auto i = innodb_purge_threads_MAX; i; i--) {
 		que_thr_t*	thr = que_thr_create(fork, heap, NULL);
 		thr->child = new(mem_heap_alloc(heap, sizeof(purge_node_t)))
 			purge_node_t(thr);
@@ -160,10 +162,8 @@ purge_graph_build()
 void purge_sys_t::create()
 {
   ut_ad(this == &purge_sys);
+  ut_ad(!heap);
   ut_ad(!enabled());
-  ut_ad(!event);
-  event= os_event_create(0);
-  ut_ad(event);
   m_paused= 0;
   query= purge_graph_build();
   next_stored= false;
@@ -176,16 +176,17 @@ void purge_sys_t::create()
   mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex);
   truncate.current= NULL;
   truncate.last= NULL;
+  heap= mem_heap_create(4096);
 }
 
 /** Close the purge subsystem on shutdown. */
 void purge_sys_t::close()
 {
   ut_ad(this == &purge_sys);
-  if (!event) return;
+  if (!heap)
+    return;
 
   ut_ad(!enabled());
-  ut_ad(n_tasks.load(std::memory_order_relaxed) == 0);
   trx_t* trx = query->trx;
   que_graph_free(query);
   ut_ad(!trx->id);
@@ -194,7 +195,8 @@ void purge_sys_t::close()
   trx->free();
   rw_lock_free(&latch);
   mutex_free(&pq_mutex);
-  os_event_destroy(event);
+  mem_heap_free(heap);
+  heap= nullptr;
 }
 
 /*================ UNDO LOG HISTORY LIST =============================*/
@@ -208,19 +210,20 @@ void
 trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
 {
 	DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")",
-			   trx->id, trx->no));
+			   trx->id, trx_id_t{trx->rw_trx_hash_element->no}));
 	ut_ad(undo == trx->rsegs.m_redo.undo);
 	trx_rseg_t*	rseg		= trx->rsegs.m_redo.rseg;
 	ut_ad(undo->rseg == rseg);
-	trx_rsegf_t*	rseg_header	= trx_rsegf_get(
+	buf_block_t*	rseg_header	= trx_rsegf_get(
 		rseg->space, rseg->page_no, mtr);
-	page_t*		undo_page	= trx_undo_set_state_at_finish(
+	buf_block_t*	undo_page	= trx_undo_set_state_at_finish(
 		undo, mtr);
-	trx_ulogf_t*	undo_header	= undo_page + undo->hdr_offset;
+	trx_ulogf_t*	undo_header	= undo_page->frame + undo->hdr_offset;
 
 	ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
 
-	if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG_FORMAT + rseg_header))) {
+	if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+					   + rseg_header->frame))) {
 		/* This database must have been upgraded from
 		before MariaDB 10.3.5. */
 		trx_rseg_format_upgrade(rseg_header, mtr);
@@ -229,23 +232,27 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
 	if (undo->state != TRX_UNDO_CACHED) {
 		/* The undo log segment will not be reused */
 		ut_a(undo->id < TRX_RSEG_N_SLOTS);
-		trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
+		compile_time_assert(FIL_NULL == 0xffffffff);
+		mtr->memset(rseg_header,
+			    TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+			    + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
 
 		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
 
 		uint32_t hist_size = mach_read_from_4(TRX_RSEG_HISTORY_SIZE
-						      + rseg_header);
+						      + TRX_RSEG
+						      + rseg_header->frame);
 
 		ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR
 						 + TRX_UNDO_PAGE_LIST
-						 + undo_page));
-
-		mlog_write_ulint(
-			rseg_header + TRX_RSEG_HISTORY_SIZE,
-			hist_size + undo->size, MLOG_4BYTES, mtr);
-
-		mlog_write_ull(rseg_header + TRX_RSEG_MAX_TRX_ID,
-			       trx_sys.get_max_trx_id(), mtr);
+						 + undo_page->frame));
+
+		mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+			      + rseg_header->frame,
+			      hist_size + undo->size);
+		mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+			      + rseg_header->frame,
+			      trx_sys.get_max_trx_id());
 	}
 
 	/* After the purge thread has been given permission to exit,
@@ -254,7 +261,7 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
 	or in trx_rollback_recovered() in slow shutdown.
 
 	Before any transaction-generating background threads or the
-	purge have been started, recv_recovery_rollback_active() can
+	purge have been started, we can
 	start transactions in row_merge_drop_temp_indexes() and
 	fts_drop_orphaned_tables(), and roll back recovered transactions.
 
@@ -288,21 +295,20 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
 	}
 
 	/* Add the log as the first in the history list */
-	flst_add_first(rseg_header + TRX_RSEG_HISTORY,
-		       undo_header + TRX_UNDO_HISTORY_NODE, mtr);
-
-	mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
-	/* This is needed for upgrading old undo log pages from
-	before MariaDB 10.3.1. */
-	if (UNIV_UNLIKELY(!mach_read_from_2(undo_header
-					    + TRX_UNDO_NEEDS_PURGE))) {
-		mlog_write_ulint(undo_header + TRX_UNDO_NEEDS_PURGE, 1,
-				 MLOG_2BYTES, mtr);
-	}
+	flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+		       static_cast<uint16_t>(undo->hdr_offset
+					     + TRX_UNDO_HISTORY_NODE), mtr);
+
+	mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page,
+				       undo_header + TRX_UNDO_TRX_NO,
+				       trx->rw_trx_hash_element->no);
+	mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header
+				       + TRX_UNDO_NEEDS_PURGE, 1U);
 
 	if (rseg->last_page_no == FIL_NULL) {
-		rseg->last_page_no = static_cast<uint32_t>(undo->hdr_page_no);
-		rseg->set_last_commit(undo->hdr_offset, trx->no);
+		rseg->last_page_no = undo->hdr_page_no;
+		rseg->set_last_commit(undo->hdr_offset,
+				      trx->rw_trx_hash_element->no);
 		rseg->needs_purge = true;
 	}
 
@@ -320,19 +326,16 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
 }
 
 /** Remove undo log header from the history list.
-@param[in,out]	rseg_hdr	rollback segment header
-@param[in]	log_hdr		undo log segment header
-@param[in,out]	mtr		mini transaction. */
-static
-void
-trx_purge_remove_log_hdr(
-	trx_rsegf_t*	rseg_hdr,
-	trx_ulogf_t*	log_hdr,
-	mtr_t*		mtr)
+@param[in,out]  rseg    rollback segment header page
+@param[in]      log     undo log segment header page
+@param[in]      offset  byte offset in the undo log segment header page
+@param[in,out]  mtr     mini-transaction */
+static void trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
+                                     uint16_t offset, mtr_t *mtr)
 {
-	flst_remove(rseg_hdr + TRX_RSEG_HISTORY,
-		    log_hdr + TRX_UNDO_HISTORY_NODE, mtr);
-	trx_sys.rseg_history_len--;
+  flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY,
+              log, static_cast<uint16_t>(offset + TRX_UNDO_HISTORY_NODE), mtr);
+  trx_sys.rseg_history_len--;
 }
 
 /** Free an undo log segment, and remove the header from the history list.
@@ -343,14 +346,12 @@ void
 trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
 {
 	mtr_t		mtr;
-	trx_rsegf_t*	rseg_hdr;
-	page_t*		undo_page;
 
 	mtr.start();
 	mutex_enter(&rseg->mutex);
 
-	rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
-	undo_page = trx_undo_page_get(
+	buf_block_t* rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+	buf_block_t* block = trx_undo_page_get(
 		page_id_t(rseg->space->id, hdr_addr.page), &mtr);
 
 	/* Mark the last undo log totally purged, so that if the
@@ -358,12 +359,12 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
 	again. The list of pages in the undo log tail gets
 	inconsistent during the freeing of the segment, and therefore
 	purge should not try to access them again. */
-	mlog_write_ulint(undo_page + hdr_addr.boffset + TRX_UNDO_NEEDS_PURGE,
-			 0, MLOG_2BYTES, &mtr);
+	mtr.write<2,mtr_t::MAYBE_NOP>(*block, block->frame + hdr_addr.boffset
+				      + TRX_UNDO_NEEDS_PURGE, 0U);
 
 	while (!fseg_free_step_not_header(
 		       TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
-		       + undo_page, &mtr)) {
+		       + block->frame, &mtr)) {
 		mutex_exit(&rseg->mutex);
 
 		mtr.commit();
@@ -373,7 +374,7 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
 
 		rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
 
-		undo_page = trx_undo_page_get(
+		block = trx_undo_page_get(
 			page_id_t(rseg->space->id, hdr_addr.page), &mtr);
 	}
 
@@ -381,15 +382,15 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
 	stored in the list base node tells us how big it was before we
 	started the freeing. */
 
-	const ulint seg_size = flst_get_len(
-		TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + undo_page);
+	const uint32_t seg_size = flst_get_len(
+		TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
 
 	/* We may free the undo log segment header page; it must be freed
 	within the same mtr as the undo log header is removed from the
 	history list: otherwise, in case of a database crash, the segment
 	could become inaccessible garbage in the file space. */
 
-	trx_purge_remove_log_hdr(rseg_hdr, undo_page + hdr_addr.boffset, &mtr);
+	trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
 
 	do {
 
@@ -399,14 +400,12 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
 		fsp0fsp.cc. */
 
 	} while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
-				 + undo_page, &mtr));
+				 + block->frame, &mtr));
 
-	const ulint hist_size = mach_read_from_4(rseg_hdr
-						 + TRX_RSEG_HISTORY_SIZE);
-	ut_ad(hist_size >= seg_size);
+	byte* hist = TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->frame;
+	ut_ad(mach_read_from_4(hist) >= seg_size);
 
-	mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
-			 hist_size - seg_size, MLOG_4BYTES, &mtr);
+	mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
 
 	ut_ad(rseg->curr_size >= seg_size);
 
@@ -428,10 +427,6 @@ trx_purge_truncate_rseg_history(
 {
 	fil_addr_t	hdr_addr;
 	fil_addr_t	prev_hdr_addr;
-	trx_rsegf_t*	rseg_hdr;
-	page_t*		undo_page;
-	trx_ulogf_t*	log_hdr;
-	trx_usegf_t*	seg_hdr;
 	mtr_t		mtr;
 	trx_id_t	undo_trx_no;
 
@@ -439,10 +434,13 @@ trx_purge_truncate_rseg_history(
 	ut_ad(rseg.is_persistent());
 	mutex_enter(&rseg.mutex);
 
-	rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+	buf_block_t* rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+	hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY
+				 + rseg_hdr->frame);
+	hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset
+						 - TRX_UNDO_HISTORY_NODE);
 
-	hdr_addr = trx_purge_get_log_from_hist(
-		flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
 loop:
 	if (hdr_addr.page == FIL_NULL) {
 func_exit:
@@ -451,12 +449,11 @@ func_exit:
 		return;
 	}
 
-	undo_page = trx_undo_page_get(page_id_t(rseg.space->id, hdr_addr.page),
-				      &mtr);
-
-	log_hdr = undo_page + hdr_addr.boffset;
-
-	undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+	buf_block_t* block = trx_undo_page_get(page_id_t(rseg.space->id,
+							 hdr_addr.page),
+					       &mtr);
+	undo_trx_no = mach_read_from_8(block->frame + hdr_addr.boffset
+				       + TRX_UNDO_TRX_NO);
 
 	if (undo_trx_no >= limit.trx_no) {
 		if (undo_trx_no == limit.trx_no) {
@@ -468,13 +465,15 @@ func_exit:
 		goto func_exit;
 	}
 
-	prev_hdr_addr = trx_purge_get_log_from_hist(
-		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+	prev_hdr_addr = flst_get_prev_addr(block->frame + hdr_addr.boffset
+					   + TRX_UNDO_HISTORY_NODE);
+	prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset
+						      - TRX_UNDO_HISTORY_NODE);
 
-	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
-
-	if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
-	    && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {
+	if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame)
+	    == TRX_UNDO_TO_PURGE
+	    && !mach_read_from_2(block->frame + hdr_addr.boffset
+				 + TRX_UNDO_NEXT_LOG)) {
 
 		/* We can free the whole log segment */
 
@@ -486,7 +485,8 @@ func_exit:
 		trx_purge_free_segment(&rseg, hdr_addr);
 	} else {
 		/* Remove the log hdr from the rseg history. */
-		trx_purge_remove_log_hdr(rseg_hdr, log_hdr, &mtr);
+		trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset,
+					 &mtr);
 
 		mutex_exit(&rseg.mutex);
 		mtr.commit();
@@ -540,262 +540,294 @@ static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
 	mutex_exit(&purge_sys.pq_mutex);
 }
 
+#if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__
+# if defined __arm__ || defined __aarch64__
+/* Work around an internal compiler error in GCC 4.8.5 */
+__attribute__((optimize(0)))
+# endif
+#endif
 /**
 Removes unnecessary history data from rollback segments. NOTE that when this
 function is called, the caller must not have any latches on undo log pages!
 */
 static void trx_purge_truncate_history()
 {
-	ut_ad(purge_sys.head <= purge_sys.tail);
-	purge_sys_t::iterator& head = purge_sys.head.trx_no
-		? purge_sys.head : purge_sys.tail;
+  ut_ad(purge_sys.head <= purge_sys.tail);
+  purge_sys_t::iterator &head= purge_sys.head.trx_no
+    ? purge_sys.head : purge_sys.tail;
 
-	if (head.trx_no >= purge_sys.view.low_limit_no()) {
-		/* This is sometimes necessary. TODO: find out why. */
-		head.trx_no = purge_sys.view.low_limit_no();
-		head.undo_no = 0;
-	}
-
-	for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
-		if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
-			ut_ad(rseg->id == i);
-			trx_purge_truncate_rseg_history(*rseg, head);
-		}
-	}
-
-	if (srv_undo_tablespaces_active < 2) {
-		return;
-	}
-
-	while (srv_undo_log_truncate && srv_undo_logs >= 3) {
-		if (!purge_sys.truncate.current) {
-			const ulint threshold = ulint(srv_max_undo_log_size
-						      >> srv_page_size_shift);
-			for (ulint i = purge_sys.truncate.last
-				     ? purge_sys.truncate.last->id
-				     - srv_undo_space_id_start
-				     : 0, j = i;; ) {
-				ulint space_id = srv_undo_space_id_start + i;
-				ut_ad(srv_is_undo_tablespace(space_id));
-
-				if (fil_space_get_size(space_id)
-				    > threshold) {
-					purge_sys.truncate.current
-						= fil_space_get(space_id);
-					break;
-				}
-
-				++i;
-				i %= srv_undo_tablespaces_active;
-				if (i == j) {
-					break;
-				}
-			}
-		}
+  if (head.trx_no >= purge_sys.low_limit_no())
+  {
+    /* This is sometimes necessary. TODO: find out why. */
+    head.trx_no= purge_sys.low_limit_no();
+    head.undo_no= 0;
+  }
 
-		if (!purge_sys.truncate.current) {
-			return;
-		}
+  for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+  {
+    if (trx_rseg_t *rseg= trx_sys.rseg_array[i])
+    {
+      ut_ad(rseg->id == i);
+      trx_purge_truncate_rseg_history(*rseg, head);
+    }
+  }
 
-		fil_space_t& space = *purge_sys.truncate.current;
-		/* Undo tablespace always are a single file. */
-		ut_a(UT_LIST_GET_LEN(space.chain) == 1);
-		fil_node_t* file = UT_LIST_GET_FIRST(space.chain);
-		/* The undo tablespace files are never closed. */
-		ut_ad(file->is_open());
-
-		DBUG_LOG("undo", "marking for truncate: " << file->name);
-
-		for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
-			if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
-				ut_ad(rseg->is_persistent());
-				if (rseg->space == &space) {
-					/* Once set, this rseg will
-					not be allocated to subsequent
-					transactions, but we will wait
-					for existing active
-					transactions to finish. */
-					rseg->skip_allocation = true;
-				}
-			}
-		}
+  if (srv_undo_tablespaces_active < 2)
+    return;
 
-		for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
-			trx_rseg_t*	rseg = trx_sys.rseg_array[i];
-			if (!rseg || rseg->space != &space) {
-				continue;
-			}
-			mutex_enter(&rseg->mutex);
-			ut_ad(rseg->skip_allocation);
-			if (rseg->trx_ref_count) {
+  while (srv_undo_log_truncate)
+  {
+    if (!purge_sys.truncate.current)
+    {
+      const ulint threshold=
+        ulint(srv_max_undo_log_size >> srv_page_size_shift);
+      for (ulint i= purge_sys.truncate.last
+           ? purge_sys.truncate.last->id - srv_undo_space_id_start : 0,
+           j= i;; )
+      {
+        const auto space_id= srv_undo_space_id_start + i;
+        ut_ad(srv_is_undo_tablespace(space_id));
+        fil_space_t *space= fil_space_get(space_id);
+        /* space is null-checked below; do not dereference it blindly. */
+        ut_a(!space || UT_LIST_GET_LEN(space->chain) == 1);
+        if (space && space->get_size() > threshold)
+        {
+          purge_sys.truncate.current= space;
+          break;
+        }
+
+        ++i;
+        i %= srv_undo_tablespaces_active;
+        if (i == j)
+          return;
+      }
+    }
+
+    fil_space_t &space= *purge_sys.truncate.current;
+    /* An undo tablespace always consists of a single file. */
+    fil_node_t *file= UT_LIST_GET_FIRST(space.chain);
+    /* The undo tablespace files are never closed. */
+    ut_ad(file->is_open());
+
+    DBUG_LOG("undo", "marking for truncate: " << file->name);
+
+    for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+      if (trx_rseg_t *rseg= trx_sys.rseg_array[i])
+        if (rseg->space == &space)
+          /* Once set, this rseg will not be allocated to subsequent
+          transactions, but we will wait for existing active
+          transactions to finish. */
+          rseg->skip_allocation= true;
+
+    for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+    {
+      trx_rseg_t *rseg= trx_sys.rseg_array[i];
+      if (!rseg || rseg->space != &space)
+        continue;
+      mutex_enter(&rseg->mutex);
+      ut_ad(rseg->skip_allocation);
+      ut_ad(rseg->is_persistent());
+      if (rseg->trx_ref_count)
+      {
 not_free:
-				mutex_exit(&rseg->mutex);
-				return;
-			}
-
-			if (rseg->curr_size != 1) {
-				/* Check if all segments are
-				cached and safe to remove. */
-				ulint cached = 0;
-
-				for (trx_undo_t* undo = UT_LIST_GET_FIRST(
-					     rseg->undo_cached);
-				     undo;
-				     undo = UT_LIST_GET_NEXT(undo_list,
-							     undo)) {
-					if (head.trx_no < undo->trx_id) {
-						goto not_free;
-					} else {
-						cached += undo->size;
-					}
-				}
-
-				ut_ad(rseg->curr_size > cached);
-
-				if (rseg->curr_size > cached + 1) {
-					goto not_free;
-				}
-			}
-
-			mutex_exit(&rseg->mutex);
-		}
-
-		ib::info() << "Truncating " << file->name;
-		trx_purge_cleanse_purge_queue(space);
-
-		/* Flush all to-be-discarded pages of the tablespace.
-
-		During truncation, we do not want any writes to the
-		to-be-discarded area, because we must set the space.size
-		early in order to have deterministic page allocation.
-
-		If a log checkpoint was completed at LSN earlier than our
-		mini-transaction commit and the server was killed, then
-		discarding the to-be-trimmed pages without flushing would
-		break crash recovery. So, we cannot avoid the write. */
-		{
-			FlushObserver observer(
-				purge_sys.truncate.current,
-				UT_LIST_GET_FIRST(purge_sys.query->thrs)
-				->graph->trx,
-				NULL);
-			buf_LRU_flush_or_remove_pages(space.id, &observer);
-		}
-
-		log_free_check();
-
-		/* Adjust the tablespace metadata. */
-		if (!fil_truncate_prepare(space.id)) {
-			ib::error() << "Failed to find UNDO tablespace "
-				<< file->name;
-			return;
-		}
-
-		/* Re-initialize tablespace, in a single mini-transaction. */
-		mtr_t mtr;
-		const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
-		mtr.start();
-		mtr_x_lock_space(purge_sys.truncate.current, &mtr);
-		fil_truncate_log(purge_sys.truncate.current, size, &mtr);
-		fsp_header_init(purge_sys.truncate.current, size, &mtr);
-		mutex_enter(&fil_system.mutex);
-		purge_sys.truncate.current->size = file->size = size;
-		mutex_exit(&fil_system.mutex);
-
-		buf_block_t* sys_header = trx_sysf_get(&mtr);
-
-		for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
-			trx_rseg_t* rseg = trx_sys.rseg_array[i];
-			if (!rseg || rseg->space != &space) {
-				continue;
-			}
-
-			ut_ad(rseg->is_persistent());
-			ut_d(const ulint old_page = rseg->page_no);
-
-			buf_block_t* rblock = trx_rseg_header_create(
-				purge_sys.truncate.current,
-				rseg->id, trx_sys.get_max_trx_id(),
-				sys_header, &mtr);
-			ut_ad(rblock);
-			rseg->page_no = rblock
-				? rblock->page.id.page_no() : FIL_NULL;
-			ut_ad(old_page == rseg->page_no);
-
-			/* Before re-initialization ensure that we
-			free the existing structure. There can't be
-			any active transactions. */
-			ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
-
-			trx_undo_t*	next_undo;
-
-			for (trx_undo_t* undo = UT_LIST_GET_FIRST(
-				     rseg->undo_cached);
-			     undo; undo = next_undo) {
-
-				next_undo = UT_LIST_GET_NEXT(undo_list, undo);
-				UT_LIST_REMOVE(rseg->undo_cached, undo);
-				MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
-				ut_free(undo);
-			}
-
-			UT_LIST_INIT(rseg->undo_list,
-				     &trx_undo_t::undo_list);
-			UT_LIST_INIT(rseg->undo_cached,
-				     &trx_undo_t::undo_list);
-
-			/* These were written by trx_rseg_header_create(). */
-			ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
-						+ rblock->frame));
-			ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
-						+ rblock->frame));
-
-			/* Initialize the undo log lists according to
-			the rseg header */
-			rseg->curr_size = 1;
-			rseg->trx_ref_count = 0;
-			rseg->last_page_no = FIL_NULL;
-			rseg->last_commit_and_offset = 0;
-			rseg->needs_purge = false;
-		}
-
-		mtr.commit_shrink(space);
-		/* No mutex; this is only updated by the purge coordinator. */
-		export_vars.innodb_undo_truncations++;
-
-		if (purge_sys.rseg
-		    && purge_sys.rseg->last_page_no == FIL_NULL) {
-			/* If purge_sys.rseg is pointing to rseg that
-			was recently truncated then move to next rseg
-			element.  Note: Ideally purge_sys.rseg should
-			be NULL because purge should complete
-			processing of all the records but there is
-			purge_batch_size that can force the purge loop
-			to exit before all the records are purged and
-			in this case purge_sys.rseg could point to a
-			valid rseg waiting for next purge cycle. */
-			purge_sys.next_stored = false;
-			purge_sys.rseg = NULL;
-		}
-
-		DBUG_EXECUTE_IF("ib_undo_trunc",
-				ib::info() << "ib_undo_trunc";
-				log_write_up_to(LSN_MAX, true);
-				DBUG_SUICIDE(););
-
-		for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
-			if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
-				ut_ad(rseg->is_persistent());
-				if (rseg->space == &space) {
-					rseg->skip_allocation = false;
-				}
-			}
-		}
-
-		ib::info() << "Truncated " << file->name;
-		purge_sys.truncate.last = purge_sys.truncate.current;
-		purge_sys.truncate.current = NULL;
-	}
+        mutex_exit(&rseg->mutex);
+        return;
+      }
+
+      if (rseg->curr_size != 1)
+      {
+        /* Check if all segments are cached and safe to remove. */
+        ulint cached= 0;
+        for (trx_undo_t *undo= UT_LIST_GET_FIRST(rseg->undo_cached); undo;
+             undo= UT_LIST_GET_NEXT(undo_list, undo))
+        {
+          if (head.trx_no < undo->trx_id)
+            goto not_free;
+          else
+            cached+= undo->size;
+        }
+
+        ut_ad(rseg->curr_size > cached);
+
+        if (rseg->curr_size > cached + 1)
+          goto not_free;
+      }
+
+      mutex_exit(&rseg->mutex);
+    }
+
+    ib::info() << "Truncating " << file->name;
+    trx_purge_cleanse_purge_queue(space);
+
+    log_free_check();
+
+    mtr_t mtr;
+    mtr.start();
+    mtr_x_lock_space(&space, &mtr);
+
+    /* Lock all modified pages of the tablespace.
+
+    During truncation, we do not want any writes to the file.
+
+    If a log checkpoint was completed at LSN earlier than our
+    mini-transaction commit and the server was killed, then
+    discarding the to-be-trimmed pages without flushing would
+    break crash recovery. */
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+    for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+    {
+      ut_ad(bpage->oldest_modification());
+      ut_ad(bpage->in_file());
+
+      buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+      if (bpage->id().space() == space.id &&
+          bpage->oldest_modification() != 1)
+      {
+        ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+        auto block= reinterpret_cast<buf_block_t*>(bpage);
+        block->fix();
+        ut_ad(rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
+        buf_pool.flush_hp.set(prev);
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+        ut_ad(!block->index); /* There is no AHI on undo tablespaces. */
+#endif
+        rw_lock_x_lock(&block->lock);
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        ut_ad(bpage->io_fix() == BUF_IO_NONE);
+
+        if (bpage->oldest_modification() > 1)
+        {
+          bpage->clear_oldest_modification(false);
+          mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+        }
+        else
+        {
+          rw_lock_x_unlock(&block->lock);
+          block->unfix();
+        }
+
+        if (prev != buf_pool.flush_hp.get())
+        {
+          /* Rescan, because we may have lost the position. */
+          bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+          continue;
+        }
+      }
+
+      bpage= prev;
+    }
+
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    /* Adjust the tablespace metadata. */
+    if (!fil_truncate_prepare(space.id))
+    {
+      ib::error() << "Failed to find UNDO tablespace " << file->name;
+      mtr.commit();
+      return;
+    }
+
+    /* Re-initialize tablespace, in a single mini-transaction. */
+    const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+    /* Associate the undo tablespace with mtr.
+    During mtr_t::commit_shrink(), InnoDB can use the undo
+    tablespace object to clear all freed ranges. */
+    mtr.set_named_space(&space);
+    mtr.trim_pages(page_id_t(space.id, size));
+    fsp_header_init(&space, size, &mtr);
+    mutex_enter(&fil_system.mutex);
+    space.size= file->size= size;
+    mutex_exit(&fil_system.mutex);
+
+    buf_block_t *sys_header= trx_sysf_get(&mtr);
+
+    for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+    {
+      trx_rseg_t *rseg= trx_sys.rseg_array[i];
+      if (!rseg || rseg->space != &space)
+        continue;
+
+      ut_ad(rseg->id == i);
+      ut_ad(rseg->is_persistent());
+      ut_d(const auto old_page= rseg->page_no);
+
+      buf_block_t *rblock= trx_rseg_header_create(&space, i,
+                                                  trx_sys.get_max_trx_id(),
+                                                  sys_header, &mtr);
+      ut_ad(rblock);
+      rseg->page_no= rblock ? rblock->page.id().page_no() : FIL_NULL;
+      ut_ad(old_page == rseg->page_no);
+
+      /* Before re-initialization ensure that we free the existing
+      structure. There can't be any active transactions. */
+      ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+      for (trx_undo_t *undo= UT_LIST_GET_FIRST(rseg->undo_cached), *next_undo;
+           undo; undo= next_undo)
+      {
+        next_undo= UT_LIST_GET_NEXT(undo_list, undo);
+        UT_LIST_REMOVE(rseg->undo_cached, undo);
+        MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+        ut_free(undo);
+      }
+
+      UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list);
+      UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list);
+
+      /* These were written by trx_rseg_header_create(). */
+      ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rblock->frame));
+      ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+                              rblock->frame));
+      /* Initialize the undo log lists according to
+      the rseg header */
+      rseg->curr_size= 1;
+      rseg->trx_ref_count= 0;
+      rseg->last_page_no= FIL_NULL;
+      rseg->last_commit_and_offset= 0;
+      rseg->needs_purge= false;
+    }
+
+    mtr.commit_shrink(space);
+
+    /* No mutex; this is only updated by the purge coordinator. */
+    export_vars.innodb_undo_truncations++;
+
+    if (purge_sys.rseg && purge_sys.rseg->last_page_no == FIL_NULL)
+    {
+      /* If purge_sys.rseg is pointing to rseg that was recently
+      truncated then move to next rseg element.
+
+      Note: Ideally purge_sys.rseg should be NULL because purge should
+      complete processing of all the records but srv_purge_batch_size
+      can force the purge loop to exit before all the records are purged. */
+      purge_sys.rseg= nullptr;
+      purge_sys.next_stored= false;
+    }
+
+    DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc";
+                    log_buffer_flush_to_disk();
+                    DBUG_SUICIDE(););
+
+    for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+    {
+      if (trx_rseg_t *rseg= trx_sys.rseg_array[i])
+      {
+        ut_ad(rseg->id == i);
+        ut_ad(rseg->is_persistent());
+        if (rseg->space == &space)
+          rseg->skip_allocation= false;
+      }
+    }
+
+    ib::info() << "Truncated " << file->name;
+    purge_sys.truncate.last= purge_sys.truncate.current;
+    ut_ad(&space == purge_sys.truncate.current);
+    purge_sys.truncate.current= nullptr;
+  }
 }
 
 /***********************************************************************//**
@@ -805,7 +837,6 @@ static void trx_purge_rseg_get_next_history_log(
 	ulint*		n_pages_handled)/*!< in/out: number of UNDO pages
 					handled */
 {
-	page_t*		undo_page;
 	fil_addr_t	prev_log_addr;
 	trx_id_t	trx_no;
 	mtr_t		mtr;
@@ -820,18 +851,21 @@ static void trx_purge_rseg_get_next_history_log(
 
 	mtr.start();
 
-	undo_page = trx_undo_page_get_s_latched(
+	const buf_block_t* undo_page = trx_undo_page_get_s_latched(
 		page_id_t(purge_sys.rseg->space->id,
 			  purge_sys.rseg->last_page_no), &mtr);
 
-	const trx_ulogf_t* log_hdr = undo_page + purge_sys.rseg->last_offset();
+	const trx_ulogf_t* log_hdr = undo_page->frame
+		+ purge_sys.rseg->last_offset();
 
 	/* Increase the purge page count by one for every handled log */
 
 	(*n_pages_handled)++;
 
-	prev_log_addr = trx_purge_get_log_from_hist(
-		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+	prev_log_addr = flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+	prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset
+						      - TRX_UNDO_HISTORY_NODE);
+
 
 	const bool empty = prev_log_addr.page == FIL_NULL;
 
@@ -852,7 +886,7 @@ static void trx_purge_rseg_get_next_history_log(
 
 	log_hdr = trx_undo_page_get_s_latched(
 		page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
-		&mtr)
+		&mtr)->frame
 		+ prev_log_addr.boffset;
 
 	trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
@@ -863,8 +897,7 @@ static void trx_purge_rseg_get_next_history_log(
 
 	mutex_enter(&purge_sys.rseg->mutex);
 
-	purge_sys.rseg->last_page_no = static_cast<uint32_t>(
-		prev_log_addr.page);
+	purge_sys.rseg->last_page_no = prev_log_addr.page;
 	purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
 	purge_sys.rseg->needs_purge = needs_purge != 0;
 
@@ -885,8 +918,8 @@ static void trx_purge_rseg_get_next_history_log(
 /** Position the purge sys "iterator" on the undo record to use for purging. */
 static void trx_purge_read_undo_rec()
 {
-	ulint		offset;
-	ulint		page_no;
+	uint16_t	offset;
+	uint32_t	page_no;
 	ib_uint64_t	undo_no;
 
 	purge_sys.hdr_offset = purge_sys.rseg->last_offset();
@@ -895,13 +928,15 @@ static void trx_purge_read_undo_rec()
 	if (purge_sys.rseg->needs_purge) {
 		mtr_t		mtr;
 		mtr.start();
+		buf_block_t* undo_page;
 		if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec(
-			    purge_sys.rseg->space, purge_sys.hdr_page_no,
-			    purge_sys.hdr_offset, RW_S_LATCH, &mtr)) {
+			    *purge_sys.rseg->space, purge_sys.hdr_page_no,
+			    purge_sys.hdr_offset, RW_S_LATCH,
+			    undo_page, &mtr)) {
 
 			offset = page_offset(undo_rec);
 			undo_no = trx_undo_rec_get_undo_no(undo_rec);
-			page_no = page_get_page_no(page_align(undo_rec));
+			page_no = undo_page->page.id().page_no();
 		} else {
 			offset = 0;
 			undo_no = 0;
@@ -951,22 +986,14 @@ trx_purge_get_next_rec(
 					handled */
 	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
-	trx_undo_rec_t*	rec;
-	trx_undo_rec_t*	rec_copy;
-	trx_undo_rec_t*	rec2;
-	page_t*		undo_page;
-	page_t*		page;
-	ulint		offset;
-	ulint		page_no;
-	ulint		space;
 	mtr_t		mtr;
 
 	ut_ad(purge_sys.next_stored);
-	ut_ad(purge_sys.tail.trx_no < purge_sys.view.low_limit_no());
+	ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no());
 
-	space = purge_sys.rseg->space->id;
-	page_no = purge_sys.page_no;
-	offset = purge_sys.offset;
+	const ulint space = purge_sys.rseg->space->id;
+	const uint32_t page_no = purge_sys.page_no;
+	const uint16_t offset = purge_sys.offset;
 
 	if (offset == 0) {
 		/* It is the dummy undo log record, which means that there is
@@ -983,16 +1010,16 @@ trx_purge_get_next_rec(
 
 	mtr_start(&mtr);
 
-	undo_page = trx_undo_page_get_s_latched(page_id_t(space, page_no),
-						&mtr);
-
-	rec = undo_page + offset;
+	buf_block_t* undo_page = trx_undo_page_get_s_latched(
+		page_id_t(space, page_no), &mtr);
+	buf_block_t* rec2_page = undo_page;
 
-	rec2 = trx_undo_page_get_next_rec(rec, purge_sys.hdr_page_no,
-					  purge_sys.hdr_offset);
+	const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec(
+		undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset);
 
 	if (rec2 == NULL) {
-		rec2 = trx_undo_get_next_rec(rec, purge_sys.hdr_page_no,
+		rec2 = trx_undo_get_next_rec(rec2_page, offset,
+					     purge_sys.hdr_page_no,
 					     purge_sys.hdr_offset, &mtr);
 	}
 
@@ -1009,22 +1036,19 @@ trx_purge_get_next_rec(
 
 		undo_page = trx_undo_page_get_s_latched(
 			page_id_t(space, page_no), &mtr);
-
-		rec = undo_page + offset;
 	} else {
-		page = page_align(rec2);
-
-		purge_sys.offset = ulint(rec2 - page);
-		purge_sys.page_no = page_get_page_no(page);
+		purge_sys.offset = page_offset(rec2);
+		purge_sys.page_no = rec2_page->page.id().page_no();
 		purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2);
 
-		if (undo_page != page) {
+		if (undo_page != rec2_page) {
 			/* We advance to a new page of the undo log: */
 			(*n_pages_handled)++;
 		}
 	}
 
-	rec_copy = trx_undo_rec_copy(rec, heap);
+	trx_undo_rec_t*	rec_copy = trx_undo_rec_copy(undo_page->frame + offset,
+						     heap);
 
 	mtr_commit(&mtr);
 
@@ -1055,7 +1079,7 @@ trx_purge_fetch_next_rec(
 		}
 	}
 
-	if (purge_sys.tail.trx_no >= purge_sys.view.low_limit_no()) {
+	if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) {
 
 		return(NULL);
 	}
@@ -1105,7 +1129,7 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
 		node = (purge_node_t*) thr->child;
 
 		ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
-		ut_ad(node->undo_recs == NULL);
+		ut_ad(node->undo_recs.empty());
 		ut_ad(!node->in_progress);
 		ut_d(node->in_progress = true);
 	}
@@ -1124,11 +1148,13 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
 
 	i = 0;
 
-	const ulint batch_size = srv_purge_batch_size;
+	const ulint		batch_size = srv_purge_batch_size;
+	std::unordered_map<table_id_t, purge_node_t*> table_id_map;
+	mem_heap_empty(purge_sys.heap);
 
 	while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) {
 		purge_node_t*		node;
-		trx_purge_rec_t*	purge_rec;
+		trx_purge_rec_t		purge_rec;
 
 		ut_a(!thr->is_active);
 
@@ -1136,9 +1162,6 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
 		node = (purge_node_t*) thr->child;
 		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
 
-		purge_rec = static_cast<trx_purge_rec_t*>(
-			mem_heap_zalloc(node->heap, sizeof(*purge_rec)));
-
 		/* Track the max {trx_id, undo_no} for truncating the
 		UNDO logs once we have purged the records. */
 
@@ -1147,37 +1170,40 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
 		}
 
 		/* Fetch the next record, and advance the purge_sys.tail. */
-		purge_rec->undo_rec = trx_purge_fetch_next_rec(
-			&purge_rec->roll_ptr, &n_pages_handled, node->heap);
-
-		if (purge_rec->undo_rec != NULL) {
-
-			if (node->undo_recs == NULL) {
-				node->undo_recs = ib_vector_create(
-					ib_heap_allocator_create(node->heap),
-					sizeof(trx_purge_rec_t),
-					batch_size);
-			} else {
-				ut_a(!ib_vector_is_empty(node->undo_recs));
-			}
+		purge_rec.undo_rec = trx_purge_fetch_next_rec(
+			&purge_rec.roll_ptr, &n_pages_handled,
+			purge_sys.heap);
 
-			ib_vector_push(node->undo_recs, purge_rec);
+		if (purge_rec.undo_rec == NULL) {
+			break;
+		} else if (purge_rec.undo_rec == &trx_purge_dummy_rec) {
+			continue;
+		}
 
-			if (n_pages_handled >= batch_size) {
+		table_id_t table_id = trx_undo_rec_get_table_id(
+			purge_rec.undo_rec);
 
-				break;
-			}
+		purge_node_t *& table_node = table_id_map[table_id];
+
+		if (table_node) {
+			node = table_node;
 		} else {
-			break;
-		}
+			thr = UT_LIST_GET_NEXT(thrs, thr);
 
-		thr = UT_LIST_GET_NEXT(thrs, thr);
+			if (!(++i % n_purge_threads)) {
+				thr = UT_LIST_GET_FIRST(
+					purge_sys.query->thrs);
+			}
 
-		if (!(++i % n_purge_threads)) {
-			thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+			ut_a(thr != NULL);
+			table_node = node;
 		}
 
-		ut_a(thr != NULL);
+		node->undo_recs.push(purge_rec);
+
+		if (n_pages_handled >= batch_size) {
+			break;
+		}
 	}
 
 	ut_ad(purge_sys.head <= purge_sys.tail);
@@ -1198,14 +1224,11 @@ trx_purge_dml_delay(void)
 	thread. */
 	ulint	delay = 0; /* in microseconds; default: no delay */
 
-	/* If purge lag is set (ie. > 0) then calculate the new DML delay.
-	Note: we do a dirty read of the trx_sys_t data structure here,
-	without holding trx_sys.mutex. */
+	/* If purge lag is set then calculate the new DML delay. */
 
 	if (srv_max_purge_lag > 0) {
-		float	ratio;
-
-		ratio = float(trx_sys.rseg_history_len) / srv_max_purge_lag;
+		double ratio = static_cast<double>(trx_sys.rseg_history_len) /
+			static_cast<double>(srv_max_purge_lag);
 
 		if (ratio > 1.0) {
 			/* If the history list length exceeds the
@@ -1225,54 +1248,41 @@ trx_purge_dml_delay(void)
 	return(delay);
 }
 
+extern tpool::waitable_task purge_worker_task;
+
 /** Wait for pending purge jobs to complete. */
-static
-void
-trx_purge_wait_for_workers_to_complete()
+static void trx_purge_wait_for_workers_to_complete()
 {
-	/* Ensure that the work queue empties out. */
-	while (purge_sys.n_tasks.load(std::memory_order_acquire)) {
+  const bool notify_wait= purge_worker_task.is_running();
 
-		if (srv_get_task_queue_length() > 0) {
-			srv_release_threads(SRV_WORKER, 1);
-		}
+  if (notify_wait)
+    tpool::tpool_wait_begin();
 
-		os_thread_yield();
-	}
+  purge_worker_task.wait();
 
-	/* There should be no outstanding tasks as long
-	as the worker threads are active. */
-	ut_a(srv_get_task_queue_length() == 0);
+  if (notify_wait)
+    tpool::tpool_wait_end();
+
+  /* There should be no outstanding tasks as long
+  as the worker threads are active. */
+  ut_ad(srv_get_task_queue_length() == 0);
 }
 
-/*******************************************************************//**
-This function runs a purge batch.
+/**
+Run a purge batch.
+@param n_tasks   number of purge tasks to submit to the queue
+@param truncate  whether to truncate the history at the end of the batch
 @return number of undo log pages handled in the batch */
-ulint
-trx_purge(
-/*======*/
-	ulint	n_purge_threads,	/*!< in: number of purge tasks
-					to submit to the work queue */
-	bool	truncate		/*!< in: truncate history if true */
-#ifdef UNIV_DEBUG
-	, srv_slot_t *slot		/*!< in/out: purge coordinator
-					thread slot */
-#endif
-)
+ulint trx_purge(ulint n_tasks, bool truncate)
 {
 	que_thr_t*	thr = NULL;
 	ulint		n_pages_handled;
 
-	ut_a(n_purge_threads > 0);
+	ut_ad(n_tasks > 0);
 
 	srv_dml_needed_delay = trx_purge_dml_delay();
 
-	/* All submitted tasks should be completed. */
-	ut_ad(purge_sys.n_tasks.load(std::memory_order_relaxed) == 0);
-
-	rw_lock_x_lock(&purge_sys.latch);
-	trx_sys.clone_oldest_view();
-	rw_lock_x_unlock(&purge_sys.latch);
+	purge_sys.clone_oldest_view();
 
 #ifdef UNIV_DEBUG
 	if (srv_purge_view_update_only_debug) {
@@ -1281,25 +1291,22 @@ trx_purge(
 #endif /* UNIV_DEBUG */
 
 	/* Fetch the UNDO recs that need to be purged. */
-	n_pages_handled = trx_purge_attach_undo_recs(n_purge_threads);
-	purge_sys.n_tasks.store(n_purge_threads - 1, std::memory_order_relaxed);
+	n_pages_handled = trx_purge_attach_undo_recs(n_tasks);
 
 	/* Submit tasks to workers queue if using multi-threaded purge. */
-	for (ulint i = n_purge_threads; --i; ) {
+	for (ulint i = n_tasks; --i; ) {
 		thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
 		ut_a(thr);
 		srv_que_task_enqueue_low(thr);
+		srv_thread_pool->submit_task(&purge_worker_task);
 	}
 
 	thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
 
-	ut_d(thr->thread_slot = slot);
 	que_run_threads(thr);
 
 	trx_purge_wait_for_workers_to_complete();
 
-	ut_ad(purge_sys.n_tasks.load(std::memory_order_relaxed) == 0);
-
 	if (truncate) {
 		trx_purge_truncate_history();
 	}
@@ -1309,63 +1316,3 @@ trx_purge(
 
 	return(n_pages_handled);
 }
-
-/** Stop purge during FLUSH TABLES FOR EXPORT */
-void purge_sys_t::stop()
-{
-  rw_lock_x_lock(&latch);
-
-  if (!enabled())
-  {
-    /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
-    ut_ad(!srv_undo_sources);
-    rw_lock_x_unlock(&latch);
-    return;
-  }
-
-  ut_ad(srv_n_purge_threads > 0);
-
-  if (m_paused++ == 0)
-  {
-    /* We need to wakeup the purge thread in case it is suspended, so
-    that it can acknowledge the state change. */
-    const int64_t sig_count = os_event_reset(event);
-    rw_lock_x_unlock(&latch);
-    ib::info() << "Stopping purge";
-    srv_purge_wakeup();
-    /* Wait for purge coordinator to signal that it is suspended. */
-    os_event_wait_low(event, sig_count);
-    MONITOR_ATOMIC_INC(MONITOR_PURGE_STOP_COUNT);
-    return;
-  }
-
-  rw_lock_x_unlock(&latch);
-
-  if (running())
-  {
-    ib::info() << "Waiting for purge to stop";
-    while (running())
-      os_thread_sleep(10000);
-  }
-}
-
-/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
-void purge_sys_t::resume()
-{
-   if (!enabled())
-   {
-     /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
-     ut_ad(!srv_undo_sources);
-     return;
-   }
-
-   int32_t paused= m_paused--;
-   ut_a(paused);
-
-   if (paused == 1)
-   {
-     ib::info() << "Resuming purge";
-     srv_purge_wakeup();
-     MONITOR_ATOMIC_INC(MONITOR_PURGE_RESUME_COUNT);
-   }
-}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
index 6faa479a322..438dfcf9352 100644
--- a/storage/innobase/trx/trx0rec.cc
+++ b/storage/innobase/trx/trx0rec.cc
@@ -52,96 +52,6 @@ const dtuple_t trx_undo_metadata = {
 
 /*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
 
-/** Write redo log of writing an undo log record.
-@param[in]	undo_block	undo log page
-@param[in]	old_free	start offset of the undo log record
-@param[in]	new_free	end offset of the undo log record
-@param[in,out]	mtr		mini-transaction */
-static void trx_undof_page_add_undo_rec_log(const buf_block_t* undo_block,
-					    ulint old_free, ulint new_free,
-					    mtr_t* mtr)
-{
-	ut_ad(old_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-	ut_ad(new_free >= old_free);
-	ut_ad(new_free < srv_page_size);
-	ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-			       + undo_block->frame)
-	      == new_free);
-	mtr->set_modified();
-	switch (mtr->get_log_mode()) {
-	case MTR_LOG_NONE:
-	case MTR_LOG_NO_REDO:
-		return;
-	case MTR_LOG_SHORT_INSERTS:
-		ut_ad(0);
-		/* fall through */
-	case MTR_LOG_ALL:
-		break;
-	}
-
-	const uint32_t
-		len = uint32_t(new_free - old_free - 4),
-		reserved = std::min<uint32_t>(11 + 13 + len,
-					      mtr->get_log()->MAX_DATA_SIZE);
-	byte* log_ptr = mtr->get_log()->open(reserved);
-	const byte* log_end = log_ptr + reserved;
-	log_ptr = mlog_write_initial_log_record_low(
-		MLOG_UNDO_INSERT,
-		undo_block->page.id.space(), undo_block->page.id.page_no(),
-		log_ptr, mtr);
-	mach_write_to_2(log_ptr, len);
-	if (log_ptr + 2 + len <= log_end) {
-		memcpy(log_ptr + 2, undo_block->frame + old_free + 2, len);
-		mlog_close(mtr, log_ptr + 2 + len);
-	} else {
-		mlog_close(mtr, log_ptr + 2);
-		mtr->get_log()->push(undo_block->frame + old_free + 2, len);
-	}
-}
-
-/** Parse MLOG_UNDO_INSERT.
-@param[in]	ptr	log record
-@param[in]	end_ptr	end of log record buffer
-@param[in,out]	page	page or NULL
-@return	end of log record
-@retval	NULL	if the log record is incomplete */
-byte*
-trx_undo_parse_add_undo_rec(
-	const byte*	ptr,
-	const byte*	end_ptr,
-	page_t*		page)
-{
-	ulint	len;
-
-	if (end_ptr < ptr + 2) {
-
-		return(NULL);
-	}
-
-	len = mach_read_from_2(ptr);
-	ptr += 2;
-
-	if (end_ptr < ptr + len) {
-
-		return(NULL);
-	}
-
-	if (page) {
-		ulint first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR
-						    + TRX_UNDO_PAGE_FREE);
-		byte* rec = page + first_free;
-
-		mach_write_to_2(rec, first_free + 4 + len);
-		mach_write_to_2(rec + 2 + len, first_free);
-
-		mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
-				first_free + 4 + len);
-		memcpy(rec + 2, ptr, len);
-	}
-
-	return(const_cast<byte*>(ptr + len));
-}
-
 /** Calculate the free space left for extending an undo log record.
 @param undo_block    undo log page
 @param ptr           current end of the undo page
@@ -163,7 +73,7 @@ that was written to ptr. Update the first free value by the number of bytes
 written for this undo record.
 @return offset of the inserted entry on the page if succeeded, 0 if fail */
 static
-ulint
+uint16_t
 trx_undo_page_set_next_prev_and_add(
 /*================================*/
 	buf_block_t*	undo_block,	/*!< in/out: undo log page */
@@ -171,39 +81,31 @@ trx_undo_page_set_next_prev_and_add(
 					written on this undo page. */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint		first_free;	/*!< offset within undo_page */
-	ulint		end_of_rec;	/*!< offset within undo_page */
-	byte*		ptr_to_first_free;
-					/* pointer within undo_page
-					that points to the next free
-					offset value within undo_page.*/
-
-	if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2)) {
-		return(0);
-	}
+  ut_ad(page_align(ptr) == undo_block->frame);
 
-	ptr_to_first_free = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-		+ undo_block->frame;
+  if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
+    return 0;
 
-	first_free = mach_read_from_2(ptr_to_first_free);
+  byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
+						TRX_UNDO_PAGE_FREE +
+						undo_block->frame);
 
-	/* Write offset of the previous undo log record */
-	mach_write_to_2(ptr, first_free);
-	ptr += 2;
-
-	end_of_rec = ulint(ptr - undo_block->frame);
+  const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
 
-	/* Write offset of the next undo log record */
-	mach_write_to_2(undo_block->frame + first_free, end_of_rec);
+  /* Write offset of the previous undo log record */
+  memcpy(ptr, ptr_to_first_free, 2);
+  ptr += 2;
 
-	/* Update the offset to first free undo record */
-	mach_write_to_2(ptr_to_first_free, end_of_rec);
+  const uint16_t end_of_rec= static_cast<uint16_t>(ptr - undo_block->frame);
 
-	/* Write this log entry to the UNDO log */
-	trx_undof_page_add_undo_rec_log(undo_block, first_free,
-					end_of_rec, mtr);
+  /* Update the offset to first free undo record */
+  mach_write_to_2(ptr_to_first_free, end_of_rec);
+  /* Write offset of the next undo log record */
+  memcpy(undo_block->frame + first_free, ptr_to_first_free, 2);
+  const byte *start= undo_block->frame + first_free + 2;
 
-	return(first_free);
+  mtr->undo_append(*undo_block, start, ptr - start - 2);
+  return first_free;
 }
 
 /** Virtual column undo log version. To distinguish it from a length value
@@ -291,7 +193,7 @@ trx_undo_log_v_idx(
 indexed, and return its position
 @param[in]	table		the table
 @param[in]	ptr		undo log pointer
-@param[out]	col_pos		the column number or ULINT_UNDEFINED
+@param[out]	col_pos		the column number or FIL_NULL
 				if the column is not indexed any more
 @return remaining part of undo log record after reading these values */
 static
@@ -299,12 +201,12 @@ const byte*
 trx_undo_read_v_idx_low(
 	const dict_table_t*	table,
 	const byte*		ptr,
-	ulint*			col_pos)
+	uint32_t*		col_pos)
 {
 	ulint		len = mach_read_from_2(ptr);
 	const byte*	old_ptr = ptr;
 
-	*col_pos = ULINT_UNDEFINED;
+	*col_pos = FIL_NULL;
 
 	ptr += 2;
 
@@ -352,7 +254,7 @@ still indexed, and output its position
 				check to see if this is undo log. When
 				first_v_col is true, is_undo_log is output,
 				when first_v_col is false, is_undo_log is input
-@param[in,out]	field_no	the column number
+@param[out]	field_no	the column number, or FIL_NULL if not indexed
 @return remaining part of undo log record after reading these values */
 const byte*
 trx_undo_read_v_idx(
@@ -360,7 +262,7 @@ trx_undo_read_v_idx(
 	const byte*		ptr,
 	bool			first_v_col,
 	bool*			is_undo_log,
-	ulint*			field_no)
+	uint32_t*		field_no)
 {
 	/* Version marker only put on the first virtual column */
 	if (first_v_col) {
@@ -473,7 +375,7 @@ trx_undo_report_insert_virtual(
 Reports in the undo log of an insert of a clustered index record.
 @return offset of the inserted entry on the page if succeed, 0 if fail */
 static
-ulint
+uint16_t
 trx_undo_page_report_insert(
 /*========================*/
 	buf_block_t*	undo_block,	/*!< in: undo log page */
@@ -483,11 +385,7 @@ trx_undo_page_report_insert(
 					inserted to the clustered index */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint		first_free;
-	byte*		ptr;
-	ulint		i;
-
-	ut_ad(dict_index_is_clust(index));
+	ut_ad(index->is_primary());
 	/* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
 	TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
 	TRX_UNDO_INSERT == 1 into insert_undo pages,
@@ -495,9 +393,11 @@ trx_undo_page_report_insert(
 	ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
 			       + undo_block->frame) <= 2);
 
-	first_free = mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-				      + undo_block->frame);
-	ptr = undo_block->frame + first_free;
+	uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
+					       (TRX_UNDO_PAGE_HDR
+						+ TRX_UNDO_PAGE_FREE
+						+ undo_block->frame));
+	byte* ptr = undo_block->frame + first_free;
 
 	if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
 		/* Not enough space for writing the general parameters */
@@ -523,7 +423,7 @@ trx_undo_page_report_insert(
 		goto done;
 	}
 
-	for (i = 0; i < dict_index_get_n_unique(index); i++) {
+	for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) {
 
 		const dfield_t*	field	= dtuple_get_nth_field(clust_entry, i);
 		ulint		flen	= dfield_get_len(field);
@@ -586,12 +486,14 @@ trx_undo_rec_get_pars(
 
 	*updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
 	type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
-
 	*type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+	ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
+	ut_ad(*type <= TRX_UNDO_DEL_MARK_REC);
 	*cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
 
 	*undo_no = mach_read_next_much_compressed(&ptr);
 	*table_id = mach_read_next_much_compressed(&ptr);
+	ut_ad(*table_id);
 
 	return(const_cast<byte*>(ptr));
 }
@@ -607,8 +509,8 @@ byte*
 trx_undo_rec_get_col_val(
 	const byte*	ptr,
 	const byte**	field,
-	ulint*		len,
-	ulint*		orig_len)
+	uint32_t*	len,
+	uint32_t*	orig_len)
 {
 	*len = mach_read_next_compressed(&ptr);
 	*orig_len = 0;
@@ -681,8 +583,7 @@ trx_undo_rec_get_row_ref(
 
 	for (i = 0; i < ref_len; i++) {
 		const byte*	field;
-		ulint		len;
-		ulint		orig_len;
+		uint32_t	len, orig_len;
 
 		dfield_t* dfield = dtuple_get_nth_field(tuple, i);
 
@@ -715,8 +616,7 @@ trx_undo_rec_skip_row_ref(
 
 	for (i = 0; i < ref_len; i++) {
 		const byte*	field;
-		ulint		len;
-		ulint		orig_len;
+		uint32_t len, orig_len;
 
 		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
 	}
@@ -870,7 +770,7 @@ record.
 @return byte offset of the inserted undo log entry on the page if
 succeed, 0 if fail */
 static
-ulint
+uint16_t
 trx_undo_page_report_modify(
 /*========================*/
 	buf_block_t*	undo_block,	/*!< in: undo log page */
@@ -889,9 +789,6 @@ trx_undo_page_report_modify(
 					virtual column info */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint		first_free;
-	byte*		ptr;
-
 	ut_ad(index->is_primary());
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	/* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
@@ -901,9 +798,12 @@ trx_undo_page_report_modify(
 	ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
 			       + undo_block->frame) <= 2);
 
-	first_free = mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-				      + undo_block->frame);
-	ptr = undo_block->frame + first_free;
+	byte* ptr_to_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+						       + TRX_UNDO_PAGE_FREE
+						       + undo_block->frame);
+
+	const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
+	byte *ptr = undo_block->frame + first_free;
 
 	if (trx_undo_left(undo_block, ptr) < 50) {
 		/* NOTE: the value 50 must be big enough so that the general
@@ -1491,15 +1391,14 @@ already_logged:
 	}
 
 	mach_write_to_2(ptr, first_free);
-	ptr += 2;
-	const ulint new_free = ulint(ptr - undo_block->frame);
+	const uint16_t new_free = static_cast<uint16_t>(
+		ptr + 2 - undo_block->frame);
 	mach_write_to_2(undo_block->frame + first_free, new_free);
 
-	mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-			+ undo_block->frame, new_free);
+	mach_write_to_2(ptr_to_first_free, new_free);
 
-	/* Write to the REDO log about this change in the UNDO log */
-	trx_undof_page_add_undo_rec_log(undo_block, first_free, new_free, mtr);
+	const byte* start = &undo_block->frame[first_free + 2];
+	mtr->undo_append(*undo_block, start, ptr - start);
 	return(first_free);
 }
 
@@ -1515,11 +1414,10 @@ trx_undo_update_rec_get_sys_cols(
 					general parameters */
 	trx_id_t*	trx_id,		/*!< out: trx id */
 	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
-	ulint*		info_bits)	/*!< out: info bits state */
+	byte*		info_bits)	/*!< out: info bits state */
 {
 	/* Read the state of the info bits */
-	*info_bits = mach_read_from_1(ptr);
-	ptr += 1;
+	*info_bits = *ptr++;
 
 	/* Read the values of the system columns */
 
@@ -1550,7 +1448,7 @@ trx_undo_update_rec_get_update(
 				the update vector */
 	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
 	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
-	ulint		info_bits,/*!< in: info bits from this undo record */
+	byte		info_bits,/*!< in: info bits from this undo record */
 	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
 				needed is allocated */
 	upd_t**		upd)	/*!< out, own: update vector */
@@ -1598,12 +1496,11 @@ trx_undo_update_rec_get_update(
 	/* Store then the updated ordinary columns to the update vector */
 
 	for (ulint i = 0; i < n_fields; i++) {
-		const byte*	field;
-		ulint		len;
-		ulint		orig_len;
+		const byte* field;
+		uint32_t len, orig_len;
 
 		upd_field = upd_get_nth_field(update, i);
-		ulint field_no = mach_read_next_compressed(&ptr);
+		uint32_t field_no = mach_read_next_compressed(&ptr);
 
 		const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
 
@@ -1615,7 +1512,7 @@ trx_undo_update_rec_get_update(
 				&field_no);
 			first_v_col = false;
 			/* This column could be dropped or no longer indexed */
-			if (field_no == ULINT_UNDEFINED) {
+			if (field_no >= index->n_fields) {
 				/* Mark this is no longer needed */
 				upd_field->field_no = REC_MAX_N_FIELDS;
 
@@ -1627,12 +1524,14 @@ trx_undo_update_rec_get_update(
 				continue;
 			}
 
-			upd_field_set_v_field_no(upd_field, field_no, index);
+			upd_field_set_v_field_no(
+				upd_field, static_cast<uint16_t>(field_no),
+				index);
 		} else if (UNIV_UNLIKELY((update->info_bits
 					  & ~REC_INFO_DELETED_FLAG)
 					 == REC_INFO_MIN_REC_FLAG)) {
 			ut_ad(type == TRX_UNDO_UPD_EXIST_REC);
-			const ulint uf = index->first_user_field();
+			const uint32_t uf = index->first_user_field();
 			ut_ad(field_no >= uf);
 
 			if (update->info_bits != REC_INFO_MIN_REC_FLAG) {
@@ -1682,9 +1581,12 @@ trx_undo_update_rec_get_update(
 							       field_no),
 					&upd_field->new_val.type);
 			}
-			upd_field->field_no = field_no;
+			upd_field->field_no = field_no
+				& dict_index_t::MAX_N_FIELDS;
 		} else if (field_no < index->n_fields) {
-			upd_field_set_field_no(upd_field, field_no, index);
+			upd_field_set_field_no(upd_field,
+					       static_cast<uint16_t>(field_no),
+					       index);
 		} else {
 			ib::error() << "Trying to access update undo rec"
 				" field " << field_no
@@ -1704,7 +1606,7 @@ trx_undo_update_rec_get_update(
 
 		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
 
-		upd_field->orig_len = orig_len;
+		upd_field->orig_len = static_cast<uint16_t>(orig_len);
 
 		if (len == UNIV_SQL_NULL) {
 			dfield_set_null(&upd_field->new_val);
@@ -1824,15 +1726,13 @@ trx_undo_rec_get_partial_row(
 	while (ptr != end_ptr) {
 		dfield_t*	dfield;
 		const byte*	field;
-		ulint		field_no;
+		uint32_t	field_no;
 		const dict_col_t* col;
-		ulint		len;
-		ulint		orig_len;
-		bool		is_virtual;
+		uint32_t len, orig_len;
 
 		field_no = mach_read_next_compressed(&ptr);
 
-		is_virtual = (field_no >= REC_MAX_N_FIELDS);
+		const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
 
 		if (is_virtual) {
 			ptr = trx_undo_read_v_idx(
@@ -1844,7 +1744,7 @@ trx_undo_rec_get_partial_row(
 		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
 
 		/* This column could be dropped or no longer indexed */
-		if (field_no == ULINT_UNDEFINED) {
+		if (field_no == FIL_NULL) {
 			ut_ad(is_virtual);
 			continue;
 		}
@@ -1941,22 +1841,6 @@ trx_undo_rec_get_partial_row(
 	return(const_cast<byte*>(ptr));
 }
 
-/** Erase the unused undo log page end.
-@param[in,out]	undo_page	undo log page
-@return whether the page contained something */
-bool
-trx_undo_erase_page_end(page_t* undo_page)
-{
-	ulint	first_free;
-
-	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
-				      + TRX_UNDO_PAGE_FREE);
-	memset(undo_page + first_free, 0,
-	       (srv_page_size - FIL_PAGE_DATA_END) - first_free);
-
-	return(first_free != TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-}
-
 /** Report a RENAME TABLE operation.
 @param[in,out]	trx	transaction
 @param[in]	table	table that is being renamed
@@ -1965,16 +1849,17 @@ trx_undo_erase_page_end(page_t* undo_page)
 @return	byte offset of the undo log record
 @retval	0	in case of failure */
 static
-ulint
+uint16_t
 trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
 			    buf_block_t* block, mtr_t* mtr)
 {
-	byte*	ptr_first_free  = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
-		+ block->frame;
-	ulint	first_free = mach_read_from_2(ptr_first_free);
+	byte*	ptr_first_free  = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+						       + TRX_UNDO_PAGE_FREE
+						       + block->frame);
+	const uint16_t first_free = mach_read_from_2(ptr_first_free);
 	ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-	ut_ad(first_free <= srv_page_size);
-	byte* start = block->frame + first_free;
+	ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
+	byte* const start = block->frame + first_free;
 	size_t len = strlen(table->name.m_name);
 	const size_t fixed = 2 + 1 + 11 + 11 + 2;
 	ut_ad(len <= NAME_LEN * 2 + 1);
@@ -1996,12 +1881,9 @@ trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
 	memcpy(ptr, table->name.m_name, len);
 	ptr += len;
 	mach_write_to_2(ptr, first_free);
-	ptr += 2;
-	ulint offset = page_offset(ptr);
-	mach_write_to_2(start, offset);
-	mach_write_to_2(ptr_first_free, offset);
-
-	trx_undof_page_add_undo_rec_log(block, first_free, offset, mtr);
+	mach_write_to_2(ptr_first_free, ptr + 2 - block->frame);
+	memcpy(start, ptr_first_free, 2);
+	mtr->undo_append(*block, start + 2, ptr - start - 2);
 	return first_free;
 }
 
@@ -2024,9 +1906,10 @@ dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
 		ut_ad(undo);
 		for (ut_d(int loop_count = 0);;) {
 			ut_ad(loop_count++ < 2);
-			ut_ad(undo->last_page_no == block->page.id.page_no());
+			ut_ad(undo->last_page_no
+			      == block->page.id().page_no());
 
-			if (ulint offset = trx_undo_page_report_rename(
+			if (uint16_t offset = trx_undo_page_report_rename(
 				    trx, table, block, &mtr)) {
 				undo->top_page_no = undo->last_page_no;
 				undo->top_offset  = offset;
@@ -2128,7 +2011,7 @@ trx_undo_report_row_operation(
 	ut_ad(undo != NULL);
 
 	do {
-		ulint	offset = !rec
+		uint16_t offset = !rec
 			? trx_undo_page_report_insert(
 				undo_block, trx, index, clust_entry, &mtr)
 			: trx_undo_page_report_modify(
@@ -2136,7 +2019,15 @@ trx_undo_report_row_operation(
 				cmpl_info, clust_entry, &mtr);
 
 		if (UNIV_UNLIKELY(offset == 0)) {
-			if (!trx_undo_erase_page_end(undo_block->frame)) {
+			const uint16_t first_free = mach_read_from_2(
+				TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+				+ undo_block->frame);
+			memset(undo_block->frame + first_free, 0,
+			       (srv_page_size - FIL_PAGE_DATA_END)
+			       - first_free);
+
+			if (first_free
+			    == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
 				/* The record did not fit on an empty
 				undo page. Discard the freshly allocated
 				page and return an error. */
@@ -2162,12 +2053,27 @@ trx_undo_report_row_operation(
 
 				err = DB_UNDO_RECORD_TOO_BIG;
 				goto err_exit;
+			} else {
+				/* Write log for clearing the unused
+				tail of the undo page. It might
+				contain some garbage from a previously
+				written record, and mtr_t::write()
+				will optimize away writes of unchanged
+				bytes. Failure to write this caused a
+				recovery failure when we avoided
+				reading the undo log page from the
+				data file and initialized it based on
+				redo log records (which included the
+				write of the previous garbage). */
+				mtr.memset(*undo_block, first_free,
+					   srv_page_size - first_free
+					   - FIL_PAGE_DATA_END, 0);
 			}
 
 			mtr.commit();
 		} else {
 			/* Success */
-			undo->top_page_no = undo_block->page.id.page_no();
+			undo->top_page_no = undo_block->page.id().page_no();
 			mtr.commit();
 			undo->top_offset  = offset;
 			undo->top_undo_no = trx->undo_no++;
@@ -2200,7 +2106,7 @@ trx_undo_report_row_operation(
 			return(DB_SUCCESS);
 		}
 
-		ut_ad(undo_block->page.id.page_no() == undo->last_page_no);
+		ut_ad(undo_block->page.id().page_no() == undo->last_page_no);
 
 		/* We have to extend the undo log by one page */
 
@@ -2248,11 +2154,10 @@ trx_undo_get_undo_rec_low(
 {
 	trx_undo_rec_t*	undo_rec;
 	ulint		rseg_id;
-	ulint		page_no;
-	ulint		offset;
-	const page_t*	undo_page;
+	uint32_t	page_no;
+	uint16_t	offset;
 	trx_rseg_t*	rseg;
-	ibool		is_insert;
+	bool		is_insert;
 	mtr_t		mtr;
 
 	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
@@ -2262,14 +2167,14 @@ trx_undo_get_undo_rec_low(
 	rseg = trx_sys.rseg_array[rseg_id];
 	ut_ad(rseg->is_persistent());
 
-	mtr_start(&mtr);
+	mtr.start();
 
-	undo_page = trx_undo_page_get_s_latched(
+	buf_block_t* undo_page = trx_undo_page_get_s_latched(
 		page_id_t(rseg->space->id, page_no), &mtr);
 
-	undo_rec = trx_undo_rec_copy(undo_page + offset, heap);
+	undo_rec = trx_undo_rec_copy(undo_page->frame + offset, heap);
 
-	mtr_commit(&mtr);
+	mtr.commit();
 
 	return(undo_rec);
 }
@@ -2295,11 +2200,9 @@ trx_undo_get_undo_rec(
 	const table_name_t&	name,
 	trx_undo_rec_t**	undo_rec)
 {
-	bool		missing_history;
-
 	rw_lock_s_lock(&purge_sys.latch);
 
-	missing_history = purge_sys.view.changes_visible(trx_id, name);
+	bool missing_history = purge_sys.changes_visible(trx_id, name);
 	if (!missing_history) {
 		*undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
 	}
@@ -2361,16 +2264,16 @@ trx_undo_prev_version_build(
 	roll_ptr_t	roll_ptr;
 	upd_t*		update;
 	byte*		ptr;
-	ulint		info_bits;
+	byte		info_bits;
 	ulint		cmpl_info;
 	bool		dummy_extern;
 	byte*		buf;
 
 	ut_ad(!index->table->is_temporary());
 	ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S));
-	ut_ad(mtr_memo_contains_page_flagged(index_mtr, index_rec,
-					     MTR_MEMO_PAGE_S_FIX
-					     | MTR_MEMO_PAGE_X_FIX));
+	ut_ad(index_mtr->memo_contains_page_flagged(index_rec,
+						    MTR_MEMO_PAGE_S_FIX
+						    | MTR_MEMO_PAGE_X_FIX));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_a(index->is_primary());
 
@@ -2465,7 +2368,7 @@ trx_undo_prev_version_build(
 
 			rw_lock_s_lock(&purge_sys.latch);
 
-			missing_extern = purge_sys.view.changes_visible(
+			missing_extern = purge_sys.changes_visible(
 				trx_id,	index->table->name);
 
 			rw_lock_s_unlock(&purge_sys.latch);
@@ -2508,7 +2411,53 @@ trx_undo_prev_version_build(
 
 		*old_vers = rec_copy(buf, rec, offsets);
 		rec_offs_make_valid(*old_vers, index, true, offsets);
-		row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
+		rec_set_bit_field_1(*old_vers, update->info_bits,
+				    rec_offs_comp(offsets)
+				    ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+				    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+		for (ulint i = 0; i < update->n_fields; i++) {
+			const upd_field_t* uf = upd_get_nth_field(update, i);
+			if (upd_fld_is_virtual_col(uf)) {
+				/* There are no virtual columns in
+				a clustered index record. */
+				continue;
+			}
+			const ulint n = uf->field_no;
+			ut_ad(!dfield_is_ext(&uf->new_val)
+			      == !rec_offs_nth_extern(offsets, n));
+			ut_ad(!rec_offs_nth_default(offsets, n));
+
+			if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+				if (rec_offs_nth_sql_null(offsets, n)) {
+					ut_ad(index->table->is_instant());
+					ut_ad(n >= index->n_core_fields);
+					continue;
+				}
+				ut_ad(!index->table->not_redundant());
+				ulint l = rec_get_1byte_offs_flag(*old_vers)
+					? (n + 1) : (n + 1) * 2;
+				byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES
+					- l;
+				*b= byte(*b | REC_1BYTE_SQL_NULL_MASK);
+				compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+						    == REC_2BYTE_SQL_NULL_MASK);
+				continue;
+			}
+
+			ulint len;
+			memcpy(rec_get_nth_field(*old_vers, offsets, n, &len),
+			       uf->new_val.data, uf->new_val.len);
+			if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+				ut_ad(len == UNIV_SQL_NULL);
+				ut_ad(!rec_offs_comp(offsets));
+				ut_ad(uf->new_val.len
+				      == rec_get_nth_field_size(rec, n));
+				ulint l = rec_get_1byte_offs_flag(*old_vers)
+					? (n + 1) : (n + 1) * 2;
+				*(*old_vers - REC_N_OLD_EXTRA_BYTES - l)
+					&= byte(~REC_1BYTE_SQL_NULL_MASK);
+			}
+		}
 	}
 
 	/* Set the old value (which is the after image of an update) in the
@@ -2563,17 +2512,14 @@ trx_undo_read_v_cols(
 	end_ptr = ptr + mach_read_from_2(ptr);
 	ptr += 2;
 	while (ptr < end_ptr) {
-		dfield_t*	dfield;
-		const byte*	field;
-		ulint		field_no;
-		ulint		len;
-		ulint		orig_len;
-		bool		is_virtual;
+		dfield_t* dfield;
+		const byte* field;
+		uint32_t field_no, len, orig_len;
 
 		field_no = mach_read_next_compressed(
 				const_cast<const byte**>(&ptr));
 
-		is_virtual = (field_no >= REC_MAX_N_FIELDS);
+		const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
 
 		if (is_virtual) {
 			ptr = trx_undo_read_v_idx(
@@ -2588,7 +2534,7 @@ trx_undo_read_v_cols(
 		/* The virtual column is no longer indexed or does not exist.
 		This needs to put after trx_undo_rec_get_col_val() so the
 		undo ptr advances */
-		if (field_no == ULINT_UNDEFINED) {
+		if (field_no == FIL_NULL) {
 			ut_ad(is_virtual);
 			continue;
 		}
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
index 841827b0709..23aa950a14a 100644
--- a/storage/innobase/trx/trx0roll.cc
+++ b/storage/innobase/trx/trx0roll.cc
@@ -44,6 +44,10 @@ Created 3/26/1996 Heikki Tuuri
 #include "trx0trx.h"
 #include "trx0undo.h"
 
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t	trx_rollback_clean_thread_key;
+#endif
+
 /** true if trx_rollback_all_recovered() thread is active */
 bool			trx_rollback_is_active;
 
@@ -51,144 +55,122 @@ bool			trx_rollback_is_active;
 const trx_t*		trx_roll_crash_recv_trx;
 
 /** Finish transaction rollback.
-@param[in,out]	trx	transaction
 @return	whether the rollback was completed normally
 @retval	false	if the rollback was aborted by shutdown  */
-static bool trx_rollback_finish(trx_t* trx)
+inline bool trx_t::rollback_finish()
 {
-	trx->mod_tables.clear();
-	bool finished = trx->error_state == DB_SUCCESS;
-	if (UNIV_LIKELY(finished)) {
-		trx->commit();
-	} else {
-		ut_a(trx->error_state == DB_INTERRUPTED);
-		ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
-		ut_a(!srv_undo_sources);
-		ut_ad(srv_fast_shutdown);
-		ut_d(trx->in_rollback = false);
-		if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
-			UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list,
-				       undo);
-			ut_free(undo);
-			undo = NULL;
-		}
-		if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
-			UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list,
-				       undo);
-			ut_free(undo);
-			undo = NULL;
-		}
-		trx->commit_low();
-	}
-
-	trx->lock.que_state = TRX_QUE_RUNNING;
+  mod_tables.clear();
+  if (UNIV_LIKELY(error_state == DB_SUCCESS))
+  {
+    commit();
+    return true;
+  }
 
-	return finished;
+  ut_a(error_state == DB_INTERRUPTED);
+  ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
+  ut_a(!srv_undo_sources);
+  ut_ad(srv_fast_shutdown);
+  ut_d(in_rollback= false);
+  if (trx_undo_t *&undo= rsegs.m_redo.undo)
+  {
+    UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo);
+    ut_free(undo);
+    undo= nullptr;
+  }
+  if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+  {
+    UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo);
+    ut_free(undo);
+    undo= nullptr;
+  }
+  commit_low();
+  lock.que_state= TRX_QUE_RUNNING;
+  return false;
 }
 
-/*******************************************************************//**
-Rollback a transaction used in MySQL. */
-static
-void
-trx_rollback_to_savepoint_low(
-/*==========================*/
-	trx_t*		trx,	/*!< in: transaction handle */
-	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
-				partial rollback requested, or NULL for
-				complete rollback */
+/** Roll back an active transaction. */
+inline void trx_t::rollback_low(trx_savept_t *savept)
 {
-	que_thr_t*	thr;
-	mem_heap_t*	heap;
-	roll_node_t*	roll_node;
-
-	heap = mem_heap_create(512);
-
-	roll_node = roll_node_create(heap);
-	ut_ad(!trx->in_rollback);
-
-	if (savept != NULL) {
-		roll_node->savept = savept;
-		ut_ad(trx->mysql_thd);
-		ut_ad(!trx->is_recovered);
-		ut_ad(trx->state == TRX_STATE_ACTIVE);
-	} else {
-		ut_d(trx_state_t state = trx->state);
-		ut_ad(state == TRX_STATE_ACTIVE
-		      || state == TRX_STATE_PREPARED
-		      || state == TRX_STATE_PREPARED_RECOVERED);
-	}
-
-	trx->error_state = DB_SUCCESS;
-
-	if (trx->has_logged()) {
-
-		ut_ad(trx->rsegs.m_redo.rseg != 0
-		      || trx->rsegs.m_noredo.rseg != 0);
+  mem_heap_t *heap= mem_heap_create(512);
+  roll_node_t *roll_node= roll_node_create(heap);
+  roll_node->savept= savept;
 
-		thr = pars_complete_graph_for_exec(roll_node, trx, heap, NULL);
-
-		ut_a(thr == que_fork_start_command(
-			static_cast<que_fork_t*>(que_node_get_parent(thr))));
-
-		que_run_threads(thr);
-
-		ut_a(roll_node->undo_thr != NULL);
-		que_run_threads(roll_node->undo_thr);
+  ut_ad(!in_rollback);
+#ifdef UNIV_DEBUG
+  {
+    const auto s= state;
+    ut_ad(s == TRX_STATE_ACTIVE ||
+          s == TRX_STATE_PREPARED ||
+          s == TRX_STATE_PREPARED_RECOVERED);
+    if (savept)
+    {
+      ut_ad(s == TRX_STATE_ACTIVE);
+      ut_ad(mysql_thd);
+      ut_ad(!is_recovered);
+    }
+  }
+#endif
 
-		/* Free the memory reserved by the undo graph. */
-		que_graph_free(static_cast<que_t*>(
-				       roll_node->undo_thr->common.parent));
-	}
+  error_state = DB_SUCCESS;
 
-	if (savept == NULL) {
-		trx_rollback_finish(trx);
-		MONITOR_INC(MONITOR_TRX_ROLLBACK);
-	} else {
-		ut_a(trx->error_state == DB_SUCCESS);
-		const undo_no_t limit = savept->least_undo_no;
-		for (trx_mod_tables_t::iterator i = trx->mod_tables.begin();
-		     i != trx->mod_tables.end(); ) {
-			trx_mod_tables_t::iterator j = i++;
-			ut_ad(j->second.valid());
-			if (j->second.rollback(limit)) {
-				trx->mod_tables.erase(j);
-			}
-		}
-		trx->lock.que_state = TRX_QUE_RUNNING;
-		MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
-	}
+  if (has_logged())
+  {
+    ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg);
+    que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap,
+                                                 nullptr);
+    ut_a(thr == que_fork_start_command(static_cast<que_fork_t*>
+                                       (que_node_get_parent(thr))));
+    que_run_threads(thr);
+    que_run_threads(roll_node->undo_thr);
+
+    /* Free the memory reserved by the undo graph. */
+    que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent));
+  }
 
-	mem_heap_free(heap);
+  if (!savept)
+  {
+    rollback_finish();
+    MONITOR_INC(MONITOR_TRX_ROLLBACK);
+  }
+  else
+  {
+    ut_a(error_state == DB_SUCCESS);
+    const undo_no_t limit= savept->least_undo_no;
+    for (trx_mod_tables_t::iterator i= mod_tables.begin();
+	 i != mod_tables.end(); )
+    {
+      trx_mod_tables_t::iterator j= i++;
+      ut_ad(j->second.valid());
+      if (j->second.rollback(limit))
+        mod_tables.erase(j);
+    }
+    lock.que_state= TRX_QUE_RUNNING;
+    MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+  }
 
-	/* There might be work for utility threads.*/
-	srv_active_wake_master_thread();
+  mem_heap_free(heap);
 
-	MONITOR_DEC(MONITOR_TRX_ACTIVE);
+  MONITOR_DEC(MONITOR_TRX_ACTIVE);
 }
 
-/*******************************************************************//**
-Rollback a transaction to a given savepoint or do a complete rollback.
+/** Initiate rollback.
+@param savept     savepoint
 @return error code or DB_SUCCESS */
-dberr_t
-trx_rollback_to_savepoint(
-/*======================*/
-	trx_t*		trx,	/*!< in: transaction handle */
-	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
-				partial rollback requested, or NULL for
-				complete rollback */
+dberr_t trx_t::rollback(trx_savept_t *savept)
 {
+  ut_ad(!trx_mutex_own(this));
+  if (state == TRX_STATE_NOT_STARTED)
+  {
+    error_state= DB_SUCCESS;
+    return DB_SUCCESS;
+  }
+  ut_ad(state == TRX_STATE_ACTIVE);
 #ifdef WITH_WSREP
-	if (!savept && trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) {
-		wsrep_handle_SR_rollback(NULL, trx->mysql_thd);
-	}
+  if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd))
+    wsrep_handle_SR_rollback(nullptr, mysql_thd);
 #endif /* WITH_WSREP */
-	ut_ad(!trx_mutex_own(trx));
-
-	trx_start_if_not_started_xa(trx, true);
-
-	trx_rollback_to_savepoint_low(trx, savept);
-
-	return(trx->error_state);
+  rollback_low(savept);
+  return error_state;
 }
 
 /*******************************************************************//**
@@ -207,7 +189,7 @@ trx_rollback_for_mysql_low(
 	object, and we set a dummy session that we use for all MySQL
 	transactions. */
 
-	trx_rollback_to_savepoint_low(trx, NULL);
+	trx->rollback_low();
 
 	trx->op_info = "";
 
@@ -219,7 +201,7 @@ trx_rollback_for_mysql_low(
 @return error code or DB_SUCCESS */
 dberr_t trx_rollback_for_mysql(trx_t* trx)
 {
-	/* We are reading trx->state without holding trx_sys.mutex
+	/* We are reading trx->state without holding trx->mutex
 	here, because the rollback should be invoked for a running
 	active MySQL transaction (or recovered prepared transaction)
 	that is associated with the current thread. */
@@ -243,7 +225,7 @@ dberr_t trx_rollback_for_mysql(trx_t* trx)
 	case TRX_STATE_PREPARED:
 	case TRX_STATE_PREPARED_RECOVERED:
 		ut_ad(!trx->is_autocommit_non_locking());
-		if (trx->has_logged_persistent()) {
+		if (trx->rsegs.m_redo.undo) {
 			/* The XA ROLLBACK of a XA PREPARE transaction
 			will consist of multiple mini-transactions.
 
@@ -299,7 +281,7 @@ trx_rollback_last_sql_stat_for_mysql(
 {
 	dberr_t	err;
 
-	/* We are reading trx->state without holding trx_sys.mutex
+	/* We are reading trx->state without holding trx->mutex
 	here, because the statement rollback should be invoked for a
 	running active MySQL transaction that is associated with the
 	current thread. */
@@ -316,8 +298,7 @@ trx_rollback_last_sql_stat_for_mysql(
 
 		trx->op_info = "rollback of SQL statement";
 
-		err = trx_rollback_to_savepoint(
-			trx, &trx->last_sql_stat_start);
+		err = trx->rollback(&trx->last_sql_stat_start);
 
 		if (trx->fts_trx != NULL) {
 			fts_savepoint_rollback_last_stmt(trx);
@@ -358,8 +339,7 @@ trx_savepoint_find(
 	for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
 	     savep != NULL;
 	     savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
-
-		if (0 == ut_strcmp(savep->name, name)) {
+		if (!strcmp(savep->name, name)) {
 			return(savep);
 		}
 	}
@@ -439,7 +419,7 @@ trx_rollback_to_savepoint_for_mysql_low(
 
 	trx->op_info = "rollback to a savepoint";
 
-	err = trx_rollback_to_savepoint(trx, &savep->savept);
+	err = trx->rollback(&savep->savept);
 
 	/* Store the current undo_no of the transaction so that
 	we know where to roll back if we have to roll back the
@@ -477,7 +457,7 @@ trx_rollback_to_savepoint_for_mysql(
 {
 	trx_named_savept_t*	savep;
 
-	/* We are reading trx->state without holding trx_sys.mutex
+	/* We are reading trx->state without holding trx->mutex
 	here, because the savepoint rollback should be invoked for a
 	running active MySQL transaction that is associated with the
 	current thread. */
@@ -650,7 +630,7 @@ trx_rollback_active(
 	que_graph_free(
 		static_cast<que_t*>(roll_node->undo_thr->common.parent));
 
-	if (UNIV_UNLIKELY(!trx_rollback_finish(trx))) {
+	if (UNIV_UNLIKELY(!trx->rollback_finish())) {
 		ut_ad(!dictionary_locked);
 		goto func_exit;
 	}
@@ -721,8 +701,7 @@ void trx_roll_report_progress()
 		rows they modified. Numbers must be accurate, because only this
 		thread is allowed to touch recovered transactions. */
 		trx_sys.rw_trx_hash.iterate_no_dups(
-			reinterpret_cast<my_hash_walk_action>
-			(trx_roll_count_callback), &arg);
+			trx_roll_count_callback, &arg);
 
 		if (arg.n_rows > 0) {
 			service_manager_extend_timeout(
@@ -780,8 +759,7 @@ void trx_rollback_recovered(bool all)
     other thread is allowed to modify or remove these transactions from
     rw_trx_hash.
   */
-  trx_sys.rw_trx_hash.iterate_no_dups(reinterpret_cast<my_hash_walk_action>
-                                      (trx_rollback_recovered_callback),
+  trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback,
                                       &trx_list);
 
   while (!trx_list.empty())
@@ -835,7 +813,6 @@ discard:
   }
 }
 
-
 /*******************************************************************//**
 Rollback or clean up any incomplete transactions which were
 encountered in crash recovery.  If the transaction already was
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
index 49b93541620..34e1ccfc277 100644
--- a/storage/innobase/trx/trx0rseg.cc
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -31,8 +31,6 @@ Created 3/26/1996 Heikki Tuuri
 #include "trx0purge.h"
 #include "srv0mon.h"
 
-#include <algorithm>
-
 #ifdef WITH_WSREP
 #include <mysql/service_wsrep.h>
 
@@ -49,7 +47,7 @@ static unsigned char wsrep_uuid[16];
 @param[in,out]	mtr		mini transaction */
 static void
 trx_rseg_write_wsrep_checkpoint(
-	trx_rsegf_t*	rseg_header,
+	buf_block_t*	rseg_header,
 	const XID*	xid,
 	mtr_t*		mtr)
 {
@@ -57,26 +55,34 @@ trx_rseg_write_wsrep_checkpoint(
 	DBUG_ASSERT(xid->bqual_length >= 0);
 	DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
 
-	mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header,
-			 uint32_t(xid->formatID),
-			 MLOG_4BYTES, mtr);
+	mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+				       + rseg_header->frame,
+				       uint32_t(xid->formatID));
 
-	mlog_write_ulint(TRX_RSEG_WSREP_XID_GTRID_LEN + rseg_header,
-			 uint32_t(xid->gtrid_length),
-			 MLOG_4BYTES, mtr);
+	mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+				       + rseg_header->frame,
+				       uint32_t(xid->gtrid_length));
 
-	mlog_write_ulint(TRX_RSEG_WSREP_XID_BQUAL_LEN + rseg_header,
-			 uint32_t(xid->bqual_length),
-			 MLOG_4BYTES, mtr);
+	mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+				       + rseg_header->frame,
+				       uint32_t(xid->bqual_length));
 
 	const ulint xid_length = static_cast<ulint>(xid->gtrid_length
 						    + xid->bqual_length);
-	mlog_write_string(TRX_RSEG_WSREP_XID_DATA + rseg_header,
-			  reinterpret_cast<const byte*>(xid->data),
-			  xid_length, mtr);
-	if (UNIV_LIKELY(xid_length < XIDDATASIZE)) {
-		mlog_memset(TRX_RSEG_WSREP_XID_DATA + rseg_header + xid_length,
-			    XIDDATASIZE - xid_length, 0, mtr);
+	mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
+				      TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+				      + rseg_header->frame,
+				      xid->data, xid_length);
+	/* Clear the unused tail of the XID data unless it is zero. */
+	if (xid_length < XIDDATASIZE
+	    && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length
+		      + rseg_header->frame, field_ref_zero,
+		      XIDDATASIZE - xid_length)) {
+		mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+			    + xid_length, XIDDATASIZE - xid_length, 0);
 	}
 }
 
@@ -86,7 +92,7 @@ trx_rseg_write_wsrep_checkpoint(
 @param[in,out]	mtr		mini-transaction */
 void
 trx_rseg_update_wsrep_checkpoint(
-	trx_rsegf_t*	rseg_header,
+	buf_block_t*	rseg_header,
 	const XID*	xid,
 	mtr_t*		mtr)
 {
@@ -109,16 +115,13 @@ trx_rseg_update_wsrep_checkpoint(
 }
 
 /** Clear the WSREP XID information from rollback segment header.
-@param[in,out]	rseg_header	Rollback segment header
-@param[in,out]	mtr 		mini-transaction */
-static void
-trx_rseg_clear_wsrep_checkpoint(
-	trx_rsegf_t*	rseg_header,
-	mtr_t*		mtr)
+@param[in,out]	block	rollback segment header
+@param[in,out]	mtr 	mini-transaction */
+static void trx_rseg_clear_wsrep_checkpoint(buf_block_t *block, mtr_t *mtr)
 {
-	mlog_memset(rseg_header + TRX_RSEG_WSREP_XID_INFO,
-		    TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE
-		    - TRX_RSEG_WSREP_XID_INFO, 0, mtr);
+  mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+              TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - TRX_RSEG_WSREP_XID_INFO,
+              0);
 }
 
 static void
@@ -133,9 +136,10 @@ trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
 					     sizeof wsrep_uuid);
 	const trx_rseg_t* rseg = trx_sys.rseg_array[0];
 
-	trx_rsegf_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
+	buf_block_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
 						 mtr);
-	if (UNIV_UNLIKELY(mach_read_from_4(rseg_header + TRX_RSEG_FORMAT))) {
+	if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+					   + rseg_header->frame))) {
 		trx_rseg_format_upgrade(rseg_header, mtr);
 	}
 
@@ -178,25 +182,26 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid)
 @param[out]	xid		Transaction XID
 @return	whether the WSREP XID was present */
 static
-bool trx_rseg_read_wsrep_checkpoint(const trx_rsegf_t* rseg_header, XID& xid)
+bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
 {
 	int formatID = static_cast<int>(
-		mach_read_from_4(
-			TRX_RSEG_WSREP_XID_FORMAT + rseg_header));
+		mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+				 + rseg_header->frame));
 	if (formatID == 0) {
 		return false;
 	}
 
 	xid.formatID = formatID;
 	xid.gtrid_length = static_cast<int>(
-		mach_read_from_4(
-			TRX_RSEG_WSREP_XID_GTRID_LEN + rseg_header));
+		mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+				 + rseg_header->frame));
 
 	xid.bqual_length = static_cast<int>(
-		mach_read_from_4(
-			TRX_RSEG_WSREP_XID_BQUAL_LEN + rseg_header));
+		mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+				 + rseg_header->frame));
 
-	memcpy(xid.data, TRX_RSEG_WSREP_XID_DATA + rseg_header, XIDDATASIZE);
+	memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+	       + rseg_header->frame, XIDDATASIZE);
 
 	return true;
 }
@@ -252,10 +257,11 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
 			continue;
 		}
 
-		const trx_rsegf_t* rseg_header = trx_rsegf_get_new(
+		const buf_block_t* rseg_header = trx_rsegf_get_new(
 			trx_sysf_rseg_get_space(sys, rseg_id), page_no, &mtr);
 
-		if (mach_read_from_4(rseg_header + TRX_RSEG_FORMAT)) {
+		if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+				     + rseg_header->frame)) {
 			continue;
 		}
 
@@ -279,17 +285,15 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
 /** Upgrade a rollback segment header page to MariaDB 10.3 format.
 @param[in,out]	rseg_header	rollback segment header page
 @param[in,out]	mtr		mini-transaction */
-void trx_rseg_format_upgrade(trx_rsegf_t* rseg_header, mtr_t* mtr)
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
 {
-	ut_ad(page_offset(rseg_header) == TRX_RSEG);
-	byte* rseg_format = TRX_RSEG_FORMAT + rseg_header;
-	mlog_write_ulint(rseg_format, 0, MLOG_4BYTES, mtr);
-	/* Clear also possible garbage at the end of the page. Old
-	InnoDB versions did not initialize unused parts of pages. */
-	mlog_memset(TRX_RSEG_MAX_TRX_ID + 8 + rseg_header,
-		    srv_page_size
-		    - (FIL_PAGE_DATA_END
-		       + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8), 0, mtr);
+  mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0);
+  /* Clear also possible garbage at the end of the page. Old
+  InnoDB versions did not initialize unused parts of pages. */
+  mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8,
+              srv_page_size
+              - (FIL_PAGE_DATA_END + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8),
+              0);
 }
 
 /** Create a rollback segment header.
@@ -310,7 +314,7 @@ trx_rseg_header_create(
 {
 	buf_block_t*	block;
 
-	ut_ad(mtr_memo_contains(mtr, space, MTR_MEMO_SPACE_X_LOCK));
+	ut_ad(mtr->memo_contains(*space));
 	ut_ad(!sys_header == (space == fil_system.temp_space));
 
 	/* Allocate a new file segment for the rollback segment */
@@ -331,31 +335,30 @@ trx_rseg_header_create(
 				    + block->frame));
 
 	/* Initialize the history list */
-	if (max_trx_id) {
-		mlog_write_ull(TRX_RSEG + TRX_RSEG_MAX_TRX_ID + block->frame,
-			       max_trx_id, mtr);
-	}
-
 	flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
 
+	mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+				       TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+				       + block->frame, max_trx_id);
+
 	/* Reset the undo log slots */
-	mlog_memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG,
-		    TRX_RSEG_N_SLOTS * 4, 0xff, mtr);
+	mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG,
+		    TRX_RSEG_N_SLOTS * 4, 0xff);
 
 	if (sys_header) {
 		/* Add the rollback segment info to the free slot in
 		the trx system header */
 
-		mlog_write_ulint(TRX_SYS + TRX_SYS_RSEGS
-				 + TRX_SYS_RSEG_SPACE
-				 + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
-				 + sys_header->frame,
-				 space->id, MLOG_4BYTES, mtr);
-		mlog_write_ulint(TRX_SYS + TRX_SYS_RSEGS
-				 + TRX_SYS_RSEG_PAGE_NO
-				 + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
-				 + sys_header->frame,
-				 block->page.id.page_no(), MLOG_4BYTES, mtr);
+		mtr->write<4,mtr_t::MAYBE_NOP>(
+			*sys_header,
+			TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+			+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+			+ sys_header->frame, space->id);
+		mtr->write<4,mtr_t::MAYBE_NOP>(
+			*sys_header,
+			TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
+			+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+			+ sys_header->frame, block->page.id().page_no());
 	}
 
 	return block;
@@ -395,7 +398,7 @@ trx_rseg_mem_free(trx_rseg_t* rseg)
 @param[in]	page_no		page number of the segment header */
 static
 trx_rseg_t*
-trx_rseg_mem_create(ulint id, fil_space_t* space, ulint page_no)
+trx_rseg_mem_create(ulint id, fil_space_t* space, uint32_t page_no)
 {
 	trx_rseg_t* rseg = static_cast<trx_rseg_t*>(
 		ut_zalloc_nokey(sizeof *rseg));
@@ -422,7 +425,7 @@ trx_rseg_mem_create(ulint id, fil_space_t* space, ulint page_no)
 @param[in]      rseg_header     rollback segment header
 @return error code */
 static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id,
-                                   const trx_rsegf_t *rseg_header)
+                                   const buf_block_t *rseg_header)
 {
   ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
 
@@ -451,42 +454,41 @@ static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id,
 static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id,
                                     mtr_t *mtr)
 {
-	/* This is based on trx_rsegf_get_new().
-	We need to access buf_block_t. */
-	buf_block_t *block = buf_page_get(
-		page_id_t(rseg->space->id, rseg->page_no), 0, RW_S_LATCH, mtr);
-	buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+	buf_block_t* rseg_hdr = trx_rsegf_get_new(
+		rseg->space->id, rseg->page_no, mtr);
 
-	const trx_rsegf_t* rseg_header = TRX_RSEG + block->frame;
-
-	if (mach_read_from_4(rseg_header + TRX_RSEG_FORMAT) == 0) {
-		trx_id_t id = mach_read_from_8(rseg_header
-					       + TRX_RSEG_MAX_TRX_ID);
+	if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->frame)) {
+		trx_id_t id = mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+					       + rseg_hdr->frame);
 
 		if (id > max_trx_id) {
 			max_trx_id = id;
 		}
 
-		if (rseg_header[TRX_RSEG_BINLOG_NAME]) {
-			lsn_t lsn = std::max(block->page.newest_modification,
-					     mach_read_from_8(FIL_PAGE_LSN
-							      + block->frame));
+		const byte* binlog_name = TRX_RSEG + TRX_RSEG_BINLOG_NAME
+			+ rseg_hdr->frame;
+		if (*binlog_name) {
+			lsn_t lsn = mach_read_from_8(my_assume_aligned<8>(
+							     FIL_PAGE_LSN
+							     + rseg_hdr
+							     ->frame));
 			compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof
 					    trx_sys.recovered_binlog_filename);
 			if (lsn > trx_sys.recovered_binlog_lsn) {
 				trx_sys.recovered_binlog_lsn = lsn;
 				trx_sys.recovered_binlog_offset
 					= mach_read_from_8(
-						rseg_header
-						+ TRX_RSEG_BINLOG_OFFSET);
+						TRX_RSEG
+						+ TRX_RSEG_BINLOG_OFFSET
+						+ rseg_hdr->frame);
 				memcpy(trx_sys.recovered_binlog_filename,
-				       rseg_header + TRX_RSEG_BINLOG_NAME,
+				       binlog_name,
 				       TRX_RSEG_BINLOG_NAME_LEN);
 			}
 
 #ifdef WITH_WSREP
 			trx_rseg_read_wsrep_checkpoint(
-				rseg_header, trx_sys.recovered_wsrep_xid);
+				rseg_hdr, trx_sys.recovered_wsrep_xid);
 #endif
 		}
 	}
@@ -500,35 +502,43 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id,
 
 	/* Initialize the undo log lists according to the rseg header */
 
-	rseg->curr_size = mach_read_from_4(rseg_header + TRX_RSEG_HISTORY_SIZE)
+	rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+					   + rseg_hdr->frame)
 		+ 1;
-	if (dberr_t err = trx_undo_lists_init(rseg, max_trx_id, rseg_header)) {
+	if (dberr_t err = trx_undo_lists_init(rseg, max_trx_id, rseg_hdr)) {
 		return err;
 	}
 
-	if (auto len = flst_get_len(rseg_header + TRX_RSEG_HISTORY)) {
+	if (auto len = flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY
+				    + rseg_hdr->frame)) {
 		trx_sys.rseg_history_len += len;
 
-		fil_addr_t	node_addr = trx_purge_get_log_from_hist(
-			flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+		fil_addr_t node_addr = flst_get_last(TRX_RSEG
+						     + TRX_RSEG_HISTORY
+						     + rseg_hdr->frame);
+		node_addr.boffset = static_cast<uint16_t>(
+			node_addr.boffset - TRX_UNDO_HISTORY_NODE);
 
-		rseg->last_page_no = static_cast<uint32_t>(node_addr.page);
+		rseg->last_page_no = node_addr.page;
 
-		const trx_ulogf_t*	undo_log_hdr = trx_undo_page_get(
-			page_id_t(rseg->space->id, node_addr.page), mtr)
-			+ node_addr.boffset;
+		const buf_block_t* block = trx_undo_page_get(
+			page_id_t(rseg->space->id, node_addr.page), mtr);
 
-		trx_id_t id = mach_read_from_8(undo_log_hdr + TRX_UNDO_TRX_ID);
+		trx_id_t id = mach_read_from_8(block->frame + node_addr.boffset
+					       + TRX_UNDO_TRX_ID);
 		if (id > max_trx_id) {
 			max_trx_id = id;
 		}
-		id = mach_read_from_8(undo_log_hdr + TRX_UNDO_TRX_NO);
+		id = mach_read_from_8(block->frame + node_addr.boffset
+				      + TRX_UNDO_TRX_NO);
 		if (id > max_trx_id) {
 			max_trx_id = id;
 		}
+
 		rseg->set_last_commit(node_addr.boffset, id);
-		unsigned purge = mach_read_from_2(
-			undo_log_hdr + TRX_UNDO_NEEDS_PURGE);
+		unsigned purge = mach_read_from_2(block->frame
+						  + node_addr.boffset
+						  + TRX_UNDO_NEEDS_PURGE);
 		ut_ad(purge <= 1);
 		rseg->needs_purge = purge != 0;
 
@@ -662,9 +672,9 @@ dberr_t trx_rseg_array_init()
 		}
 
 		/* Finally, clear WSREP XID in TRX_SYS page. */
-		const buf_block_t* sys = trx_sysf_get(&mtr);
-		mlog_memset(TRX_SYS + TRX_SYS_WSREP_XID_INFO + sys->frame,
-			    TRX_SYS_WSREP_XID_LEN, 0, &mtr);
+		mtr.memset(trx_sysf_get(&mtr),
+			   TRX_SYS + TRX_SYS_WSREP_XID_INFO,
+			   TRX_SYS_WSREP_XID_LEN, 0);
 		mtr.commit();
 	}
 #endif
@@ -685,8 +695,6 @@ trx_rseg_create(ulint space_id)
 
 	mtr.start();
 
-	/* To obey the latching order, acquire the file space
-	x-latch before the trx_sys.mutex. */
 	fil_space_t*	space = mtr_x_lock_space(space_id, &mtr);
 	ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
 
@@ -699,7 +707,8 @@ trx_rseg_create(ulint space_id)
 			ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id)
 			      == space_id);
 			rseg = trx_rseg_mem_create(rseg_id, space,
-						   rblock->page.id.page_no());
+						   rblock->page.id().
+						   page_no());
 			ut_ad(rseg->id == rseg_id);
 			ut_ad(rseg->is_persistent());
 			ut_ad(!trx_sys.rseg_array[rseg->id]);
@@ -726,7 +735,7 @@ trx_temp_rseg_create()
 		buf_block_t* rblock = trx_rseg_header_create(
 			fil_system.temp_space, i, 0, NULL, &mtr);
 		trx_rseg_t* rseg = trx_rseg_mem_create(
-			i, fil_system.temp_space, rblock->page.id.page_no());
+			i, fil_system.temp_space, rblock->page.id().page_no());
 		ut_ad(!rseg->is_persistent());
 		ut_ad(!trx_sys.temp_rsegs[i]);
 		trx_sys.temp_rsegs[i] = rseg;
@@ -734,55 +743,6 @@ trx_temp_rseg_create()
 	}
 }
 
-/********************************************************************
-Get the number of unique rollback tablespaces in use except space id 0.
-The last space id will be the sentinel value ULINT_UNDEFINED. The array
-will be sorted on space id. Note: space_ids should have have space for
-TRX_SYS_N_RSEGS + 1 elements.
-@return number of unique rollback tablespaces in use. */
-ulint
-trx_rseg_get_n_undo_tablespaces(
-/*============================*/
-	ulint*		space_ids)	/*!< out: array of space ids of
-					UNDO tablespaces */
-{
-	mtr_t mtr;
-	mtr.start();
-
-	buf_block_t* sys_header = trx_sysf_get(&mtr, false);
-	if (!sys_header) {
-		mtr.commit();
-		return 0;
-	}
-
-	ulint* end = space_ids;
-
-	for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
-		uint32_t page_no = trx_sysf_rseg_get_page_no(sys_header,
-							     rseg_id);
-
-		if (page_no == FIL_NULL) {
-			continue;
-		}
-
-		if (ulint space = trx_sysf_rseg_get_space(sys_header,
-							  rseg_id)) {
-			if (std::find(space_ids, end, space) == end) {
-				*end++ = space;
-			}
-		}
-	}
-
-	mtr.commit();
-
-	ut_a(end - space_ids <= TRX_SYS_N_RSEGS);
-	*end = ULINT_UNDEFINED;
-
-	std::sort(space_ids, end);
-
-	return ulint(end - space_ids);
-}
-
 /** Update the offset information about the end of the binlog entry
 which corresponds to the transaction just being committed.
 In a replication slave, this updates the master binlog position
@@ -790,8 +750,8 @@ up to which replication has proceeded.
 @param[in,out]	rseg_header	rollback segment header
 @param[in]	trx		committing transaction
 @param[in,out]	mtr		mini-transaction */
-void
-trx_rseg_update_binlog_offset(byte* rseg_header, const trx_t* trx, mtr_t* mtr)
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+                                   mtr_t *mtr)
 {
 	DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset);
 
@@ -803,13 +763,14 @@ trx_rseg_update_binlog_offset(byte* rseg_header, const trx_t* trx, mtr_t* mtr)
 		return;
 	}
 
-	mlog_write_ull(rseg_header + TRX_RSEG_BINLOG_OFFSET,
-		       trx->mysql_log_offset, mtr);
-	byte* p = rseg_header + TRX_RSEG_BINLOG_NAME;
-	const byte* binlog_name = reinterpret_cast<const byte*>
-		(trx->mysql_log_file_name);
+	mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_BINLOG_OFFSET
+				       + rseg_header->frame,
+				       trx->mysql_log_offset);
+
+	void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame;
 
-	if (memcmp(binlog_name, p, len)) {
-		mlog_write_string(p, binlog_name, len, mtr);
+	if (memcmp(trx->mysql_log_file_name, name, len)) {
+		mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
 	}
 }
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
index 22ed8d0c1cb..bcde969eb41 100644
--- a/storage/innobase/trx/trx0sys.cc
+++ b/storage/innobase/trx/trx0sys.cc
@@ -48,7 +48,7 @@ trx_sys_t		trx_sys;
 @param[in]	id              transaction id to check
 @param[in]      name            table name */
 void
-ReadView::check_trx_id_sanity(
+ReadViewBase::check_trx_id_sanity(
 	trx_id_t		id,
 	const table_name_t&	name)
 {
@@ -147,8 +147,6 @@ trx_sysf_create(
 {
 	ulint		slot_no;
 	buf_block_t*	block;
-	page_t*		page;
-	byte*		ptr;
 
 	ut_ad(mtr);
 
@@ -165,33 +163,31 @@ trx_sysf_create(
 			    mtr);
 	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
 
-	ut_a(block->page.id.page_no() == TRX_SYS_PAGE_NO);
+	ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
 
-	page = buf_block_get_frame(block);
+	mtr->write<2>(*block, FIL_PAGE_TYPE + block->frame,
+		      FIL_PAGE_TYPE_TRX_SYS);
 
-	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
-			 MLOG_2BYTES, mtr);
-
-	/* Reset the doublewrite buffer magic number to zero so that we
-	know that the doublewrite buffer has not yet been created (this
-	suppresses a Valgrind warning) */
-
-	mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
-			 + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+	ut_ad(!mach_read_from_4(block->frame
+				+ TRX_SYS_DOUBLEWRITE
+				+ TRX_SYS_DOUBLEWRITE_MAGIC));
 
 	/* Reset the rollback segment slots.  Old versions of InnoDB
 	(before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect
 	that the whole array is initialized. */
-	ptr = TRX_SYS + TRX_SYS_RSEGS + page;
 	compile_time_assert(256 >= TRX_SYS_N_RSEGS);
-	memset(ptr, 0xff, 256 * TRX_SYS_RSEG_SLOT_SIZE);
-	ptr += 256 * TRX_SYS_RSEG_SLOT_SIZE;
-	ut_a(ptr <= page + (srv_page_size - FIL_PAGE_DATA_END));
-
+	compile_time_assert(TRX_SYS + TRX_SYS_RSEGS
+			    + 256 * TRX_SYS_RSEG_SLOT_SIZE
+			    <= UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END);
+	mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS,
+		    256 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
 	/* Initialize all of the page.  This part used to be uninitialized. */
-	mlog_memset(block, ptr - page,
-		    srv_page_size - FIL_PAGE_DATA_END + size_t(page - ptr),
-		    0, mtr);
+	mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS
+		    + 256 * TRX_SYS_RSEG_SLOT_SIZE,
+		    srv_page_size
+		    - (FIL_PAGE_DATA_END + TRX_SYS + TRX_SYS_RSEGS
+		       + 256 * TRX_SYS_RSEG_SLOT_SIZE),
+		    0);
 
 	/* Create the first rollback segment in the SYSTEM tablespace */
 	slot_no = trx_sys_rseg_find_free(block);
@@ -199,7 +195,7 @@ trx_sysf_create(
 						     slot_no, 0, block, mtr);
 
 	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
-	ut_a(rblock->page.id.page_no() == FSP_FIRST_RSEG_PAGE_NO);
+	ut_a(rblock->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
 }
 
 /** Create the instance */
@@ -209,8 +205,7 @@ trx_sys_t::create()
 	ut_ad(this == &trx_sys);
 	ut_ad(!is_initialised());
 	m_initialised = true;
-	mutex_create(LATCH_ID_TRX_SYS, &mutex);
-	UT_LIST_INIT(trx_list, &trx_t::trx_list);
+	trx_list.create();
 	rseg_history_len= 0;
 
 	rw_trx_hash.init();
@@ -238,16 +233,11 @@ trx_sys_create_rsegs()
 {
 	/* srv_available_undo_logs reflects the number of persistent
 	rollback segments that have been initialized in the
-	transaction system header page.
-
-	srv_undo_logs determines how many of the
-	srv_available_undo_logs rollback segments may be used for
-	logging new transactions. */
+	transaction system header page. */
 	ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES);
-	ut_ad(srv_undo_logs <= TRX_SYS_N_RSEGS);
 
-	if (srv_read_only_mode) {
-		srv_undo_logs = srv_available_undo_logs = ULONG_UNDEFINED;
+	if (high_level_read_only) {
+		srv_available_undo_logs = 0;
 		return(true);
 	}
 
@@ -262,43 +252,35 @@ trx_sys_create_rsegs()
 	in the system tablespace. */
 	ut_a(srv_available_undo_logs > 0);
 
-	if (srv_force_recovery) {
-		/* Do not create additional rollback segments if
-		innodb_force_recovery has been set. */
-		if (srv_undo_logs > srv_available_undo_logs) {
-			srv_undo_logs = srv_available_undo_logs;
+	for (ulint i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS;
+	     i++, srv_available_undo_logs++) {
+		/* Tablespace 0 is the system tablespace.
+		Dedicated undo log tablespaces start from 1. */
+		ulint space = srv_undo_tablespaces > 0
+			? (i % srv_undo_tablespaces)
+			+ srv_undo_space_id_start
+			: TRX_SYS_SPACE;
+
+		if (!trx_rseg_create(space)) {
+			ib::error() << "Unable to allocate the"
+				" requested innodb_undo_logs";
+			return(false);
 		}
-	} else {
-		for (ulint i = 0; srv_available_undo_logs < srv_undo_logs;
-		     i++, srv_available_undo_logs++) {
-			/* Tablespace 0 is the system tablespace.
-			Dedicated undo log tablespaces start from 1. */
-			ulint space = srv_undo_tablespaces > 0
-				? (i % srv_undo_tablespaces)
-				+ srv_undo_space_id_start
-				: TRX_SYS_SPACE;
-
-			if (!trx_rseg_create(space)) {
-				ib::error() << "Unable to allocate the"
-					" requested innodb_undo_logs";
-				return(false);
-			}
-
-			/* Increase the number of active undo
-			tablespace in case new rollback segment
-			assigned to new undo tablespace. */
-			if (space > srv_undo_tablespaces_active) {
-				srv_undo_tablespaces_active++;
-
-				ut_ad(srv_undo_tablespaces_active == space);
-			}
+
+		/* Increase the number of active undo
+		tablespaces if the new rollback segment
+		was assigned to a new undo tablespace. */
+		if (space > srv_undo_tablespaces_active) {
+			srv_undo_tablespaces_active++;
+
+			ut_ad(srv_undo_tablespaces_active == space);
 		}
 	}
 
-	ut_ad(srv_undo_logs <= srv_available_undo_logs);
+	ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
 
 	ib::info info;
-	info << srv_undo_logs << " out of " << srv_available_undo_logs;
+	info << srv_available_undo_logs;
 	if (srv_undo_tablespaces_active) {
 		info << " rollback segments in " << srv_undo_tablespaces_active
 		<< " undo tablespaces are active.";
@@ -337,8 +319,8 @@ trx_sys_t::close()
 		}
 	}
 
-	ut_a(UT_LIST_GET_LEN(trx_list) == 0);
-	mutex_free(&mutex);
+	ut_a(trx_list.empty());
+	trx_list.close();
 	m_initialised = false;
 }
 
@@ -347,15 +329,11 @@ ulint trx_sys_t::any_active_transactions()
 {
   uint32_t total_trx= 0;
 
-  mutex_enter(&mutex);
-  for (trx_t* trx= UT_LIST_GET_FIRST(trx_sys.trx_list);
-       trx != NULL;
-       trx= UT_LIST_GET_NEXT(trx_list, trx))
-  {
-    if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY ||
-        (trx->state == TRX_STATE_ACTIVE && trx->id))
+  trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) {
+    if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY ||
+        (trx.state == TRX_STATE_ACTIVE && trx.id))
       total_trx++;
-  }
-  mutex_exit(&mutex);
+  });
+
   return total_trx;
 }
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 038cf9be825..ffb514a97c4 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -35,7 +35,6 @@ Created 3/26/1996 Heikki Tuuri
 #include "btr0sea.h"
 #include "lock0lock.h"
 #include "log0log.h"
-#include "os0proc.h"
 #include "que0que.h"
 #include "srv0mon.h"
 #include "srv0srv.h"
@@ -104,8 +103,6 @@ trx_init(
 /*=====*/
 	trx_t*	trx)
 {
-	trx->no = TRX_ID_MAX;
-
 	trx->state = TRX_STATE_NOT_STARTED;
 
 	trx->is_recovered = false;
@@ -163,11 +160,8 @@ trx_init(
 	trx->lock.table_cached = 0;
 #ifdef WITH_WSREP
 	ut_ad(!trx->wsrep);
-	ut_ad(!trx->wsrep_event);
 	ut_ad(!trx->wsrep_UK_scan);
 #endif /* WITH_WSREP */
-
-	ut_ad(trx->get_flush_observer() == NULL);
 }
 
 /** For managing the life-cycle of the trx_t instance that we get
@@ -222,9 +216,12 @@ struct TrxFactory {
 #ifdef __SANITIZE_ADDRESS__
 		/* Unpoison the memory for AddressSanitizer */
 		MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
-#else
-		/* Declare the contents as initialized for Valgrind;
-		we checked this in trx_t::free(). */
+#elif !__has_feature(memory_sanitizer)
+		/* In Valgrind, we cannot cancel MEM_NOACCESS() without
+		changing the state of the V bits (which indicate
+		which bits are initialized).
+		We will declare the contents as initialized.
+		We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
 		MEM_MAKE_DEFINED(trx, sizeof *trx);
 #endif
 
@@ -341,11 +338,12 @@ trx_t *trx_create()
 	/* Unpoison the memory for AddressSanitizer.
 	It may have been poisoned in trx_t::free().*/
 	MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
-#else
-	/* Declare the memory initialized for Valgrind.
-	The trx_t that are released to the pool are
-	actually initialized; we checked that by
-	MEM_CHECK_DEFINED() in trx_t::free(). */
+#elif !__has_feature(memory_sanitizer)
+	/* In Valgrind, we cannot cancel MEM_NOACCESS() without
+	changing the state of the V bits (which indicate
+	which bits are initialized).
+	We will declare the contents as initialized.
+	We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
 	MEM_MAKE_DEFINED(trx, sizeof *trx);
 #endif
 
@@ -373,7 +371,6 @@ trx_t *trx_create()
 	ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
 
 #ifdef WITH_WSREP
-	trx->wsrep_event= NULL;
 	ut_ad(!trx->wsrep_UK_scan);
 #endif /* WITH_WSREP */
 
@@ -388,9 +385,9 @@ void trx_t::free()
   MEM_CHECK_DEFINED(this, sizeof *this);
 
   ut_ad(!n_mysql_tables_in_use);
+  ut_ad(!mysql_log_file_name);
   ut_ad(!mysql_n_tables_locked);
   ut_ad(!internal);
-  ut_ad(!declared_to_be_inside_innodb);
   ut_ad(!will_lock);
   ut_ad(error_state == DB_SUCCESS);
   ut_ad(magic_n == TRX_MAGIC_N);
@@ -402,8 +399,7 @@ void trx_t::free()
   assert_freed();
   trx_sys.rw_trx_hash.put_pins(this);
 
-  mysql_thd= NULL;
-  mysql_log_file_name= NULL;
+  mysql_thd= nullptr;
 
   // FIXME: We need to avoid this heap free/alloc for each commit.
   if (autoinc_locks)
@@ -419,14 +415,12 @@ void trx_t::free()
   MEM_NOACCESS(&n_ref, sizeof n_ref);
   /* do not poison mutex */
   MEM_NOACCESS(&id, sizeof id);
-  MEM_NOACCESS(&no, sizeof no);
   MEM_NOACCESS(&state, sizeof state);
   MEM_NOACCESS(&is_recovered, sizeof is_recovered);
 #ifdef WITH_WSREP
   MEM_NOACCESS(&wsrep, sizeof wsrep);
 #endif
-  MEM_NOACCESS(&read_view, sizeof read_view);
-  MEM_NOACCESS(&trx_list, sizeof trx_list);
+  read_view.mem_noaccess();
   MEM_NOACCESS(&lock, sizeof lock);
   MEM_NOACCESS(&op_info, sizeof op_info);
   MEM_NOACCESS(&isolation_level, sizeof isolation_level);
@@ -438,8 +432,6 @@ void trx_t::free()
   MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
   MEM_NOACCESS(&duplicates, sizeof duplicates);
   MEM_NOACCESS(&dict_operation, sizeof dict_operation);
-  MEM_NOACCESS(&declared_to_be_inside_innodb, sizeof declared_to_be_inside_innodb);
-  MEM_NOACCESS(&n_tickets_to_enter_innodb, sizeof n_tickets_to_enter_innodb);
   MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
   MEM_NOACCESS(&start_time, sizeof start_time);
   MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
@@ -478,9 +470,7 @@ void trx_t::free()
   MEM_NOACCESS(&xid, sizeof xid);
   MEM_NOACCESS(&mod_tables, sizeof mod_tables);
   MEM_NOACCESS(&detailed_error, sizeof detailed_error);
-  MEM_NOACCESS(&flush_observer, sizeof flush_observer);
 #ifdef WITH_WSREP
-  MEM_NOACCESS(&wsrep_event, sizeof wsrep_event);
   ut_ad(!wsrep_UK_scan);
   MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
 #endif /* WITH_WSREP */
@@ -538,7 +528,9 @@ trx_free_at_shutdown(trx_t *trx)
 	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
 	     || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
 	     || (trx_state_eq(trx, TRX_STATE_ACTIVE)
-		 && (!srv_was_started || is_mariabackup_restore_or_export()
+		 && (!srv_was_started
+		     || srv_operation == SRV_OPERATION_RESTORE
+		     || srv_operation == SRV_OPERATION_RESTORE_EXPORT
 		     || srv_read_only_mode
 		     || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
 		     || (!srv_is_being_started
@@ -567,11 +559,13 @@ void trx_disconnect_prepared(trx_t *trx)
 {
   ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
   ut_ad(trx->mysql_thd);
+  ut_ad(!trx->mysql_log_file_name);
   trx->read_view.close();
   trx->is_recovered= true;
   trx->mysql_thd= NULL;
   /* todo/fixme: suggest to do it at innodb prepare */
   trx->will_lock= false;
+  trx_sys.rw_trx_hash.put_pins(trx);
 }
 
 /****************************************************************//**
@@ -584,8 +578,6 @@ trx_resurrect_table_locks(
 	const trx_undo_t*	undo)	/*!< in: undo log */
 {
 	mtr_t			mtr;
-	page_t*			undo_page;
-	trx_undo_rec_t*		undo_rec;
 	table_id_set		tables;
 
 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
@@ -600,11 +592,11 @@ trx_resurrect_table_locks(
 
 	/* trx_rseg_mem_create() may have acquired an X-latch on this
 	page, so we cannot acquire an S-latch. */
-	undo_page = trx_undo_page_get(
+	buf_block_t* block = trx_undo_page_get(
 		page_id_t(trx->rsegs.m_redo.rseg->space->id,
 			  undo->top_page_no), &mtr);
-
-	undo_rec = undo_page + undo->top_offset;
+	buf_block_t* undo_block = block;
+	trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;
 
 	do {
 		ulint		type;
@@ -613,11 +605,9 @@ trx_resurrect_table_locks(
 		ulint		cmpl_info;
 		bool		updated_extern;
 
-		page_t*		undo_rec_page = page_align(undo_rec);
-
-		if (undo_rec_page != undo_page) {
-			mtr.release_page(undo_page, MTR_MEMO_PAGE_X_FIX);
-			undo_page = undo_rec_page;
+		if (undo_block != block) {
+			mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
+			undo_block = block;
 		}
 
 		trx_undo_rec_get_pars(
@@ -626,7 +616,7 @@ trx_resurrect_table_locks(
 		tables.insert(table_id);
 
 		undo_rec = trx_undo_get_prev_rec(
-			undo_rec, undo->hdr_page_no,
+			block, page_offset(undo_rec), undo->hdr_page_no,
 			undo->hdr_offset, false, &mtr);
 	} while (undo_rec);
 
@@ -673,7 +663,7 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
   trx_state_t state;
   /*
     This is single-threaded startup code, we do not need the
-    protection of trx->mutex or trx_sys.mutex here.
+    protection of trx->mutex here.
   */
   switch (undo->state)
   {
@@ -698,7 +688,6 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
   trx->state= state;
   ut_d(trx->start_file= __FILE__);
   ut_d(trx->start_line= __LINE__);
-  ut_ad(trx->no == TRX_ID_MAX);
 
   trx->rsegs.m_redo.undo= undo;
   trx->undo_no= undo->top_undo_no + 1;
@@ -801,16 +790,15 @@ dberr_t trx_lists_init_at_db_start()
 		}
 	}
 
-	if (trx_sys.rw_trx_hash.size()) {
-
-		ib::info() << trx_sys.rw_trx_hash.size()
+	if (const auto size = trx_sys.rw_trx_hash.size()) {
+		ib::info() << size
 			<< " transaction(s) which must be rolled back or"
 			" cleaned up in total " << rows_to_undo
 			<< " row operations to undo";
-
 		ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
 	}
-	trx_sys.clone_oldest_view();
+
+	purge_sys.clone_oldest_view();
 	return DB_SUCCESS;
 }
 
@@ -820,11 +808,13 @@ evenly distributed between 0 and innodb_undo_logs-1
 @retval	NULL	if innodb_read_only */
 static trx_rseg_t* trx_assign_rseg_low()
 {
-	if (srv_read_only_mode) {
-		ut_ad(srv_undo_logs == ULONG_UNDEFINED);
+	if (high_level_read_only) {
+		ut_ad(!srv_available_undo_logs);
 		return(NULL);
 	}
 
+	ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
 	/* The first slot is always assigned to the system tablespace. */
 	ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);
 
@@ -832,7 +822,8 @@ static trx_rseg_t* trx_assign_rseg_low()
 	innodb_undo_logs-1 in a round-robin fashion, skipping those
 	undo tablespaces that are scheduled for truncation. */
 	static Atomic_counter<unsigned>	rseg_slot;
-	ulong	slot = ulong{rseg_slot++} % srv_undo_logs;
+	unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
+	ut_d(if (trx_rseg_n_slots_debug) slot = 0);
 	trx_rseg_t*	rseg;
 
 #ifdef UNIV_DEBUG
@@ -855,7 +846,8 @@ static trx_rseg_t* trx_assign_rseg_low()
 			look_for_rollover = true;
 #endif /* UNIV_DEBUG */
 
-			slot = (slot + 1) % srv_undo_logs;
+			ut_d(if (!trx_rseg_n_slots_debug))
+			slot = (slot + 1) % TRX_SYS_N_RSEGS;
 
 			if (rseg == NULL) {
 				continue;
@@ -899,21 +891,6 @@ static trx_rseg_t* trx_assign_rseg_low()
 	return(rseg);
 }
 
-/** Set the innodb_log_optimize_ddl page flush observer
-@param[in,out]	space	tablespace
-@param[in,out]	stage	performance_schema accounting */
-void trx_t::set_flush_observer(fil_space_t* space, ut_stage_alter_t* stage)
-{
-	flush_observer = UT_NEW_NOKEY(FlushObserver(space, this, stage));
-}
-
-/** Remove the flush observer */
-void trx_t::remove_flush_observer()
-{
-	UT_DELETE(flush_observer);
-	flush_observer = NULL;
-}
-
 /** Assign a rollback segment for modifying temporary tables.
 @return the assigned rollback segment */
 trx_rseg_t *trx_t::assign_temp_rseg()
@@ -975,18 +952,13 @@ trx_start_low(
 	trx->xid->null();
 #endif /* WITH_WSREP */
 
-	/* The initial value for trx->no: TRX_ID_MAX is used in
-	read_view_open_now: */
-
-	trx->no = TRX_ID_MAX;
-
 	ut_a(ib_vector_is_empty(trx->autoinc_locks));
 	ut_a(trx->lock.table_locks.empty());
 
-	/* No other thread can access this trx object through rw_trx_hash, thus
-	we don't need trx_sys.mutex protection for that purpose. Still this
-	trx can be found through trx_sys.trx_list, which means state
-	change must be protected by e.g. trx->mutex.
+	/* No other thread can access this trx object through rw_trx_hash,
+	still it can be found through trx_sys.trx_list. Sometimes it's
+	possible to indirectly protect trx_t::state by freezing
+	trx_sys.trx_list.
 
 	For now we update it without mutex protection, because original code
 	did it this way. It has to be reviewed and fixed properly. */
@@ -1056,7 +1028,8 @@ trx_serialise(trx_t* trx)
 	already in the rollback segment. User threads only
 	produce events when a rollback segment is empty. */
 	if (rseg->last_page_no == FIL_NULL) {
-		purge_sys.purge_queue.push(TrxUndoRsegs(trx->no, *rseg));
+		purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
+							*rseg));
 		mutex_exit(&purge_sys.pq_mutex);
 	}
 }
@@ -1123,8 +1096,6 @@ trx_write_serialisation_history(
 	mutex_exit(&rseg->mutex);
 
 	MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
-
-	trx->mysql_log_file_name = NULL;
 }
 
 /********************************************************************
@@ -1206,6 +1177,7 @@ trx_flush_log_if_needed_low(
 	case 3:
 		/* Write the log and optionally flush it to disk */
 		log_write_up_to(lsn, flush);
+		srv_inc_activity_count();
 		return;
 	case 0:
 		/* Do nothing */
@@ -1322,9 +1294,9 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
     /* This state change is not protected by any mutex, therefore
     there is an inherent race here around state transition during
     printouts. We ignore this race for the sake of efficiency.
-    However, the trx_sys_t::mutex will protect the trx_t instance
-    and it cannot be removed from the trx_list and freed
-    without first acquiring the trx_sys_t::mutex. */
+    However, the freezing of trx_sys.trx_list will protect the trx_t
+    instance and it cannot be removed from the trx_list and freed
+    without first unfreezing trx_list. */
     state= TRX_STATE_NOT_STARTED;
 
     MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
@@ -1421,11 +1393,6 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
       must_flush_log_later= true;
     else if (srv_flush_log_at_trx_commit)
       trx_flush_log_if_needed(commit_lsn, this);
-
-    /* Tell server some activity has happened, since the trx does
-    changes something. Background utility threads like master thread,
-    purge thread or page_cleaner thread might have some work to do. */
-    srv_active_wake_master_thread();
   }
 
   ut_ad(!rsegs.m_noredo.undo);
@@ -1549,7 +1516,7 @@ trx_commit_or_rollback_prepare(
 /*===========================*/
 	trx_t*	trx)		/*!< in/out: transaction */
 {
-	/* We are reading trx->state without holding trx_sys.mutex
+	/* We are reading trx->state without holding trx->mutex
 	here, because the commit or rollback should be invoked for a
 	running (or recovered prepared) transaction that is associated
 	with the current thread. */
@@ -1574,7 +1541,7 @@ trx_commit_or_rollback_prepare(
 			trx->lock.que_state = TRX_QUE_RUNNING;
 		}
 
-		ut_a(trx->lock.n_active_thrs == 1);
+		ut_ad(trx->lock.n_active_thrs == 1);
 		return;
 
 	case TRX_STATE_COMMITTED_IN_MEMORY:
@@ -1755,9 +1722,6 @@ trx_print_low(
 
 	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
 
-	/* trx->state cannot change from or to NOT_STARTED while we
-	are holding the trx_sys.mutex. It may change from ACTIVE to
-	PREPARED or COMMITTED. */
 	switch (trx->state) {
 	case TRX_STATE_NOT_STARTED:
 		fputs(", not started", f);
@@ -1789,11 +1753,6 @@ state_ok:
 		fputs(" recovered trx", f);
 	}
 
-	if (trx->declared_to_be_inside_innodb) {
-		fprintf(f, ", thread declared inside InnoDB %lu",
-			(ulong) trx->n_tickets_to_enter_innodb);
-	}
-
 	putc('\n', f);
 
 	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
@@ -2014,6 +1973,20 @@ trx_prepare(
 		We must not be holding any mutexes or latches here. */
 
 		trx_flush_log_if_needed(lsn, trx);
+
+		if (!UT_LIST_GET_LEN(trx->lock.trx_locks)
+		    || trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+			/* Do not release any locks at the
+			SERIALIZABLE isolation level. */
+		} else if (!trx->mysql_thd
+			   || thd_sql_command(trx->mysql_thd)
+			   != SQLCOM_XA_PREPARE) {
+			/* Do not release locks for XA COMMIT ONE PHASE
+			or for internal distributed transactions
+			(XID::get_my_xid() would be nonzero). */
+		} else {
+			lock_release_on_prepare(trx);
+		}
 	}
 }
 
@@ -2106,8 +2079,7 @@ int trx_recover_for_mysql(XID *xid_list, uint len)
   ut_ad(len);
 
   /* Fill xid_list with PREPARED transactions. */
-  trx_sys.rw_trx_hash.iterate_no_dups(reinterpret_cast<my_hash_walk_action>
-                                      (trx_recover_for_mysql_callback), &arg);
+  trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
   if (arg.count)
   {
     ib::info() << arg.count
@@ -2117,8 +2089,7 @@ int trx_recover_for_mysql(XID *xid_list, uint len)
     transactions twice, by first calling tc_log->open() and then
     ha_recover() directly. */
     if (arg.count <= len)
-      trx_sys.rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
-                                  (trx_recover_reset_callback), NULL);
+      trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
   }
   return int(std::min(arg.count, len));
 }
@@ -2172,8 +2143,7 @@ trx_t* trx_get_trx_by_xid(const XID* xid)
   trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
 
   if (xid)
-    trx_sys.rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
-                                (trx_get_trx_by_xid_callback), &arg);
+    trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
   return arg.trx;
 }
 
@@ -2321,13 +2291,6 @@ trx_set_rw_mode(
 		return;
 	}
 
-	/* Function is promoting existing trx from ro mode to rw mode.
-	In this process it has acquired trx_sys.mutex as it plan to
-	move trx from ro list to rw list. If in future, some other thread
-	looks at this trx object while it is being promoted then ensure
-	that both threads are synced by acquring trx->mutex to avoid decision
-	based on in-consistent view formed during promotion. */
-
 	trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
 	ut_ad(trx->rsegs.m_redo.rseg != 0);
 
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index 8768b934dba..b14d625993a 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -75,9 +75,9 @@ can still remove old versions from the bottom of the stack. */
    -------------------------------------------------------------------
 latches?
 -------
-The contention of the trx_sys.mutex should be minimized. When a transaction
-does its first insert or modify in an index, an undo log is assigned for it.
-Then we must have an x-latch to the rollback segment header.
+When a transaction does its first insert or modify in the clustered index, an
+undo log is assigned for it. Then we must have an x-latch to the rollback
+segment header.
 	When the transaction performs modifications or rolls back, its
 undo log is protected by undo page latches.
 Only the thread that is associated with the transaction may hold multiple
@@ -106,37 +106,36 @@ trx_undo_mem_create(
 	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
 				is created */
 	const XID*	xid,	/*!< in: X/Open XA transaction identification*/
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset);/*!< in: undo log header byte offset on page */
+	uint32_t	page_no,/*!< in: undo log header page number */
+	uint16_t	offset);/*!< in: undo log header byte offset on page */
 
 /** Determine the start offset of undo log records of an undo log page.
-@param[in]	undo_page	undo log page
+@param[in]	block	undo log page
 @param[in]	page_no		undo log header page number
 @param[in]	offset		undo log header offset
 @return start offset */
 static
-uint16_t
-trx_undo_page_get_start(const page_t* undo_page, ulint page_no, ulint offset)
+uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
+                                 uint16_t offset)
 {
-	return page_no == page_get_page_no(undo_page)
-		? mach_read_from_2(offset + TRX_UNDO_LOG_START + undo_page)
-		: TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+  return page_no == block->page.id().page_no()
+    ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->frame)
+    : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
 }
 
 /** Get the first undo log record on a page.
-@param[in]	page	undo log page
+@param[in]	block	undo log page
 @param[in]	page_no	undo log header page number
 @param[in]	offset	undo log header page offset
 @return	pointer to first record
 @retval	NULL	if none exists */
-static
-trx_undo_rec_t*
-trx_undo_page_get_first_rec(page_t* page, ulint page_no, ulint offset)
+static trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+                            uint16_t offset)
 {
-	ulint start = trx_undo_page_get_start(page, page_no, offset);
-	return start == trx_undo_page_get_end(page, page_no, offset)
-		? NULL
-		: page + start;
+  uint16_t start= trx_undo_page_get_start(block, page_no, offset);
+  return start == trx_undo_page_get_end(block, page_no, offset)
+    ? nullptr : block->frame + start;
 }
 
 /** Get the last undo log record on a page.
@@ -147,58 +146,43 @@ trx_undo_page_get_first_rec(page_t* page, ulint page_no, ulint offset)
 @retval	NULL	if none exists */
 static
 trx_undo_rec_t*
-trx_undo_page_get_last_rec(page_t* page, ulint page_no, ulint offset)
+trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
+                           uint16_t offset)
 {
-	ulint end = trx_undo_page_get_end(page, page_no, offset);
-
-	return trx_undo_page_get_start(page, page_no, offset) == end
-		? NULL
-		: page + mach_read_from_2(page + end - 2);
+  uint16_t end= trx_undo_page_get_end(block, page_no, offset);
+  return trx_undo_page_get_start(block, page_no, offset) == end
+    ? nullptr : block->frame + mach_read_from_2(block->frame + end - 2);
 }
 
-/***********************************************************************//**
-Gets the previous record in an undo log from the previous page.
-@return undo log record, the page s-latched, NULL if none */
-static
-trx_undo_rec_t*
-trx_undo_get_prev_rec_from_prev_page(
-/*=================================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset,	/*!< in: undo log header offset on page */
-	bool		shared,	/*!< in: true=S-latch, false=X-latch */
-	mtr_t*		mtr)	/*!< in: mtr */
+/** Get the previous record in an undo log from the previous page.
+@param[in,out]  block   undo log page
+@param[in]      rec     undo record offset in the page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      shared  latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out]  mtr     mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
+                                     uint32_t page_no, uint16_t offset,
+                                     bool shared, mtr_t *mtr)
 {
-	ulint	space;
-	ulint	prev_page_no;
-	page_t* prev_page;
-	page_t*	undo_page;
-
-	undo_page = page_align(rec);
-
-	prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR
-					  + TRX_UNDO_PAGE_NODE, mtr)
-		.page;
-
-	if (prev_page_no == FIL_NULL) {
+  uint32_t prev_page_no= flst_get_prev_addr(TRX_UNDO_PAGE_HDR +
+                                            TRX_UNDO_PAGE_NODE +
+                                            block->frame).page;
 
-		return(NULL);
-	}
-
-	space = page_get_space_id(undo_page);
-
-	buf_block_t*	block = buf_page_get(
-		page_id_t(space, prev_page_no), 0,
-		shared ? RW_S_LATCH : RW_X_LATCH, mtr);
-
-	buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+  if (prev_page_no == FIL_NULL)
+    return nullptr;
 
-	prev_page = buf_block_get_frame(block);
+  block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
+                      0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+  buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
-	return(trx_undo_page_get_last_rec(prev_page, page_no, offset));
+  return trx_undo_page_get_last_rec(block, page_no, offset);
 }
 
 /** Get the previous undo log record.
+@param[in]	block	undo log page
 @param[in]	rec	undo log record
 @param[in]	page_no	undo log header page number
 @param[in]	offset	undo log header page offset
@@ -206,287 +190,160 @@ trx_undo_get_prev_rec_from_prev_page(
 @retval	NULL if none */
 static
 trx_undo_rec_t*
-trx_undo_page_get_prev_rec(trx_undo_rec_t* rec, ulint page_no, ulint offset)
+trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
+                           uint32_t page_no, uint16_t offset)
 {
-	page_t*	undo_page;
-	ulint	start;
-
-	undo_page = (page_t*) ut_align_down(rec, srv_page_size);
-
-	start = trx_undo_page_get_start(undo_page, page_no, offset);
-
-	if (start + undo_page == rec) {
-
-		return(NULL);
-	}
-
-	return(undo_page + mach_read_from_2(rec - 2));
+  ut_ad(block->frame == page_align(rec));
+  return rec == block->frame + trx_undo_page_get_start(block, page_no, offset)
+    ? nullptr
+    : block->frame + mach_read_from_2(rec - 2);
 }
 
-/***********************************************************************//**
-Gets the previous record in an undo log.
-@return undo log record, the page s-latched, NULL if none */
+/** Get the previous record in an undo log.
+@param[in,out]  block   undo log page
+@param[in]      rec     undo record offset in the page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      shared  latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out]  mtr     mini-transaction
+@return undo log record, the page latched, NULL if none */
 trx_undo_rec_t*
-trx_undo_get_prev_rec(
-/*==================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset,	/*!< in: undo log header offset on page */
-	bool		shared,	/*!< in: true=S-latch, false=X-latch */
-	mtr_t*		mtr)	/*!< in: mtr */
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+                      uint16_t offset, bool shared, mtr_t *mtr)
 {
-	trx_undo_rec_t*	prev_rec;
-
-	prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset);
+  if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block,
+                                                       block->frame + rec,
+                                                       page_no, offset))
+    return prev;
 
-	if (prev_rec) {
+  /* We have to go to the previous undo log page to look for the
+  previous record */
 
-		return(prev_rec);
-	}
-
-	/* We have to go to the previous undo log page to look for the
-	previous record */
-
-	return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset,
-						    shared, mtr));
+  return trx_undo_get_prev_rec_from_prev_page(block, rec, page_no, offset,
+                                              shared, mtr);
 }
 
-/** Gets the next record in an undo log from the next page.
-@param[in]	space		undo log header space
-@param[in]	undo_page	undo log page
-@param[in]	page_no		undo log header page number
-@param[in]	offset		undo log header offset on page
-@param[in]	mode		latch mode: RW_S_LATCH or RW_X_LATCH
-@param[in,out]	mtr		mini-transaction
+/** Get the next record in an undo log from the next page.
+@param[in,out]  block   undo log page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      mode    latching mode: RW_S_LATCH or RW_X_LATCH
+@param[in,out]  mtr     mini-transaction
 @return undo log record, the page latched, NULL if none */
-static
-trx_undo_rec_t*
-trx_undo_get_next_rec_from_next_page(
-	ulint			space,
-	const page_t*		undo_page,
-	ulint			page_no,
-	ulint			offset,
-	ulint			mode,
-	mtr_t*			mtr)
+static trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no,
+                                     uint16_t offset, ulint mode, mtr_t *mtr)
 {
-	const trx_ulogf_t*	log_hdr;
-	ulint			next_page_no;
-	page_t*			next_page;
-	ulint			next;
-
-	if (page_no == page_get_page_no(undo_page)) {
-
-		log_hdr = undo_page + offset;
-		next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
-
-		if (next != 0) {
-
-			return(NULL);
-		}
-	}
-
-	next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR
-					  + TRX_UNDO_PAGE_NODE, mtr)
-		.page;
-	if (next_page_no == FIL_NULL) {
-
-		return(NULL);
-	}
+  if (page_no == block->page.id().page_no() &&
+      mach_read_from_2(block->frame + offset + TRX_UNDO_NEXT_LOG))
+    return NULL;
 
-	const page_id_t	next_page_id(space, next_page_no);
+  uint32_t next= flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+				    block->frame).page;
+  if (next == FIL_NULL)
+    return NULL;
 
-	if (mode == RW_S_LATCH) {
-		next_page = trx_undo_page_get_s_latched(
-			next_page_id, mtr);
-	} else {
-		ut_ad(mode == RW_X_LATCH);
-		next_page = trx_undo_page_get(next_page_id, mtr);
-	}
+  block= buf_page_get(page_id_t(block->page.id().space(), next), 0, mode, mtr);
+  buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
-	return(trx_undo_page_get_first_rec(next_page, page_no, offset));
+  return trx_undo_page_get_first_rec(block, page_no, offset);
 }
 
-/***********************************************************************//**
-Gets the next record in an undo log.
-@return undo log record, the page s-latched, NULL if none */
+/** Get the next record in an undo log.
+@param[in,out]  block   undo log page
+@param[in]      rec     undo record offset in the page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in,out]  mtr     mini-transaction
+@return undo log record, the page latched, NULL if none */
 trx_undo_rec_t*
-trx_undo_get_next_rec(
-/*==================*/
-	trx_undo_rec_t*	rec,	/*!< in: undo record */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset,	/*!< in: undo log header offset on page */
-	mtr_t*		mtr)	/*!< in: mtr */
+trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+                      uint16_t offset, mtr_t *mtr)
 {
-	ulint		space;
-	trx_undo_rec_t*	next_rec;
-
-	next_rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+  if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no,
+                                                       offset))
+    return next;
 
-	if (next_rec) {
-		return(next_rec);
-	}
-
-	space = page_get_space_id(page_align(rec));
-
-	return(trx_undo_get_next_rec_from_next_page(space,
-						    page_align(rec),
-						    page_no, offset,
-						    RW_S_LATCH, mtr));
+  return trx_undo_get_next_rec_from_next_page(block, page_no, offset,
+                                              RW_S_LATCH, mtr);
 }
 
-/** Gets the first record in an undo log.
-@param[in]	space		undo log header space
-@param[in]	page_no		undo log header page number
-@param[in]	offset		undo log header offset on page
-@param[in]	mode		latching mode: RW_S_LATCH or RW_X_LATCH
-@param[in,out]	mtr		mini-transaction
+/** Get the first record in an undo log.
+@param[in]      space   undo log header space
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      mode    latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out]     block   undo log page
+@param[in,out]  mtr     mini-transaction
 @return undo log record, the page latched, NULL if none */
 trx_undo_rec_t*
-trx_undo_get_first_rec(
-	fil_space_t*		space,
-	ulint			page_no,
-	ulint			offset,
-	ulint			mode,
-	mtr_t*			mtr)
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+                       uint16_t offset, ulint mode, buf_block_t*& block,
+                       mtr_t *mtr)
 {
-	page_t*		undo_page;
-	trx_undo_rec_t*	rec;
-
-	const page_id_t	page_id(space->id, page_no);
+  block = buf_page_get(page_id_t(space.id, page_no), 0, mode, mtr);
+  buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
-	if (mode == RW_S_LATCH) {
-		undo_page = trx_undo_page_get_s_latched(page_id, mtr);
-	} else {
-		undo_page = trx_undo_page_get(page_id, mtr);
-	}
-
-	rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);
+  if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
+    return rec;
 
-	if (rec) {
-		return(rec);
-	}
-
-	return(trx_undo_get_next_rec_from_next_page(space->id,
-						    undo_page, page_no, offset,
-						    mode, mtr));
+  return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
+                                              mtr);
 }
 
 /*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
 
-/** Parse MLOG_UNDO_INIT.
-@param[in]	ptr	log record
-@param[in]	end_ptr	end of log record buffer
-@param[in,out]	page	page or NULL
-@return	end of log record
-@retval	NULL	if the log record is incomplete */
-byte*
-trx_undo_parse_page_init(const byte* ptr, const byte* end_ptr, page_t* page)
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out]	block	undo log page */
+void trx_undo_page_init(const buf_block_t &block)
 {
-	if (end_ptr <= ptr) {
-		return NULL;
-	}
-
-	const ulint type = *ptr++;
-
-	if (type > TRX_UNDO_UPDATE) {
-		recv_sys.found_corrupt_log = true;
-	} else if (page) {
-		/* Starting with MDEV-12288 in MariaDB 10.3.1, we use
-		type=0 for the combined insert/update undo log
-		pages. MariaDB 10.2 would use TRX_UNDO_INSERT or
-		TRX_UNDO_UPDATE. */
-		mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_UNDO_LOG);
-		mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + page,
-				type);
-		mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + page,
-				TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-		mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + page,
-				TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-	}
-
-	return(const_cast<byte*>(ptr));
+  mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.frame),
+                  FIL_PAGE_UNDO_LOG);
+  static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
+  memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.frame,
+                    0, 2);
+  mach_write_to_2(my_assume_aligned<2>
+                  (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame),
+                  TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+  memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame,
+                    TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame, 2);
+  /* The following corresponds to flst_zero_both(), but without writing log. */
+  memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+                    FIL_ADDR_PAGE + block.frame, 0xff, 4);
+  memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+                    FIL_ADDR_BYTE + block.frame, 0, 2);
+  memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+                    FIL_ADDR_PAGE + block.frame, 0xff, 4);
+  memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+                    FIL_ADDR_BYTE + block.frame, 0, 2);
+  static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
+                TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
+  /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
+  memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + block.frame, 0,
+                    srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+                                     FIL_PAGE_DATA_END));
 }
 
-/** Parse MLOG_UNDO_HDR_REUSE for crash-upgrade from MariaDB 10.2.
-@param[in]	ptr	redo log record
-@param[in]	end_ptr	end of log buffer
-@param[in,out]	page	undo log page or NULL
-@return end of log record or NULL */
-byte*
-trx_undo_parse_page_header_reuse(
-	const byte*	ptr,
-	const byte*	end_ptr,
-	page_t*		undo_page)
+/** Look for a free slot for an undo log segment.
+@param rseg_header   rollback segment header
+@return slot index
+@retval ULINT_UNDEFINED if not found */
+static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header)
 {
-	trx_id_t	trx_id = mach_u64_parse_compressed(&ptr, end_ptr);
+  ulint max_slots= TRX_RSEG_N_SLOTS;
 
-	if (!ptr || !undo_page) {
-		return(const_cast<byte*>(ptr));
-	}
-
-	compile_time_assert(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE
-			    + TRX_UNDO_LOG_XA_HDR_SIZE
-			    < UNIV_PAGE_SIZE_MIN - 100);
-
-	const ulint new_free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE
-		+ TRX_UNDO_LOG_OLD_HDR_SIZE;
-
-	/* Insert undo data is not needed after commit: we may free all
-	the space on the page */
-
-	ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
-			       + undo_page) == 1);
-
-	byte*	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
-	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
-	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
-	mach_write_to_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page,
-			TRX_UNDO_ACTIVE);
-
-	byte* log_hdr = undo_page + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE;
-
-	mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
-	mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
-
-	mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
-	mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
-
-	return(const_cast<byte*>(ptr));
-}
+#ifdef UNIV_DEBUG
+  if (trx_rseg_n_slots_debug)
+    max_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS);
+#endif
 
-/** Initialize the fields in an undo log segment page.
-@param[in,out]	undo_block	undo page
-@param[in,out]	mtr		mini-transaction */
-static void trx_undo_page_init(buf_block_t* undo_block, mtr_t* mtr)
-{
-	page_t* page = undo_block->frame;
-	mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_UNDO_LOG);
-	mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + page, 0);
-	mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + page,
-			TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-	mach_write_to_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + page,
-			TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
-
-	mtr->set_modified();
-	switch (mtr->get_log_mode()) {
-	case MTR_LOG_NONE:
-	case MTR_LOG_NO_REDO:
-		return;
-	case MTR_LOG_SHORT_INSERTS:
-		ut_ad(0);
-		/* fall through */
-	case MTR_LOG_ALL:
-		break;
-	}
+  for (ulint i= 0; i < max_slots; i++)
+    if (trx_rsegf_get_nth_undo(rseg_header, i) == FIL_NULL)
+      return i;
 
-	byte* log_ptr = mtr->get_log()->open(11 + 1);
-	log_ptr = mlog_write_initial_log_record_low(
-		MLOG_UNDO_INIT,
-		undo_block->page.id.space(),
-		undo_block->page.id.page_no(),
-		log_ptr, mtr);
-	*log_ptr++ = 0;
-	mlog_close(mtr, log_ptr);
+  return ULINT_UNDEFINED;
 }
 
 /** Create an undo log segment.
@@ -499,15 +356,14 @@ static void trx_undo_page_init(buf_block_t* undo_block, mtr_t* mtr)
 @retval	NULL	on failure */
 static MY_ATTRIBUTE((nonnull, warn_unused_result))
 buf_block_t*
-trx_undo_seg_create(fil_space_t* space, trx_rsegf_t* rseg_hdr, ulint* id,
-		    dberr_t* err, mtr_t* mtr)
+trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
+                    dberr_t *err, mtr_t *mtr)
 {
-	ulint		slot_no;
 	buf_block_t*	block;
-	ulint		n_reserved;
+	uint32_t	n_reserved;
 	bool		success;
 
-	slot_no = trx_rsegf_undo_find_free(rseg_hdr);
+	const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
 
 	if (slot_no == ULINT_UNDEFINED) {
 		ib::warn() << "Cannot find a free slot for an undo log. Do"
@@ -518,6 +374,8 @@ trx_undo_seg_create(fil_space_t* space, trx_rsegf_t* rseg_hdr, ulint* id,
 		return NULL;
 	}
 
+	ut_ad(slot_no < TRX_RSEG_N_SLOTS);
+
 	success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
 					   mtr);
 	if (!success) {
@@ -538,24 +396,26 @@ trx_undo_seg_create(fil_space_t* space, trx_rsegf_t* rseg_hdr, ulint* id,
 
 	buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
-	trx_undo_page_init(block, mtr);
-
-	mlog_write_ulint(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block->frame,
-			 TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE,
-			 MLOG_2BYTES, mtr);
+	mtr->undo_create(*block);
+	trx_undo_page_init(*block);
 
-	mlog_write_ulint(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + block->frame,
-			 0, MLOG_2BYTES, mtr);
+	mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+		      + block->frame,
+		      TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
+	mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+				       TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+				       + block->frame, 0U);
 
-	flst_init(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame, mtr);
+	flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame,
+		  mtr);
 
-	flst_add_last(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame,
-		      TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + block->frame,
-		      mtr);
+	flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+		      block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
 
 	*id = slot_no;
-	trx_rsegf_set_nth_undo(rseg_hdr, slot_no, block->page.id.page_no(),
-			       mtr);
+	mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+		      + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->frame,
+		      block->page.id().page_no());
 
 	MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
 
@@ -563,130 +423,111 @@ trx_undo_seg_create(fil_space_t* space, trx_rsegf_t* rseg_hdr, ulint* id,
 	return block;
 }
 
-/**********************************************************************//**
-Writes the mtr log entry of an undo log header initialization. */
-UNIV_INLINE
-void
-trx_undo_header_create_log(
-/*=======================*/
-	const page_t*	undo_page,	/*!< in: undo log header page */
-	trx_id_t	trx_id,		/*!< in: transaction id */
-	mtr_t*		mtr)		/*!< in: mtr */
-{
-	mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr);
-
-	mlog_catenate_ull_compressed(mtr, trx_id);
-}
-
-/***************************************************************//**
-Creates a new undo log header in file. NOTE that this function has its own
-log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of
-this function!
+/** Initialize an undo log header.
+@param[in,out]  undo_page   undo log segment header page
+@param[in]      trx_id      transaction identifier
+@param[in,out]  mtr         mini-transaction
 @return header byte offset on page */
-static
-ulint
-trx_undo_header_create(
-/*===================*/
-	page_t*		undo_page,	/*!< in/out: undo log segment
-					header page, x-latched; it is
-					assumed that there is
-					TRX_UNDO_LOG_XA_HDR_SIZE bytes
-					free space on it */
-	trx_id_t	trx_id,		/*!< in: transaction id */
-	mtr_t*		mtr)		/*!< in: mtr */
+static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
+                                       mtr_t* mtr)
 {
-	trx_upagef_t*	page_hdr;
-	trx_usegf_t*	seg_hdr;
-	trx_ulogf_t*	log_hdr;
-	ulint		prev_log;
-	ulint		free;
-	ulint		new_free;
-
-	ut_ad(mtr && undo_page);
-
-	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
-	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
-
-	free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
-
-	log_hdr = undo_page + free;
-
-	new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
-
-	ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
-
-	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
-
-	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
-
-	mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
-
-	prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
-
-	if (prev_log != 0) {
-		trx_ulogf_t*	prev_log_hdr;
-
-		prev_log_hdr = undo_page + prev_log;
-
-		mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free);
-	}
-
-	mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free);
-
-	log_hdr = undo_page + free;
-
-	mach_write_to_2(log_hdr + TRX_UNDO_NEEDS_PURGE, 1);
-
-	mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
-	memset(log_hdr + TRX_UNDO_TRX_NO, 0, 8);
-	mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
-
-	mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
-	mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
-
-	mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0);
-	mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log);
-
-	/* Write the log record about the header creation */
-	trx_undo_header_create_log(undo_page, trx_id, mtr);
-
-	return(free);
+  /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+  repurposed after upgrading to MariaDB 10.3. */
+  byte *undo_type= my_assume_aligned<2>
+    (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame);
+  ut_ad(mach_read_from_2(undo_type) <= 2);
+  mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
+  byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+                                    undo_page->frame);
+  const uint16_t free= mach_read_from_2(start + 2);
+  static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+                "compatibility");
+  ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+
+  mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+  /* A WRITE of 2 bytes is never longer than a MEMMOVE.
+  So, WRITE 2+2 bytes is better than WRITE+MEMMOVE.
+  But, a MEMSET will only be 1+2 bytes, that is, 1 byte shorter! */
+  memcpy_aligned<2>(start + 2, start, 2);
+  mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+              start, 2);
+  uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+                                      undo_page->frame);
+  alignas(4) byte buf[4];
+  mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+  mach_write_to_2(buf + 2, free);
+  static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+  static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+  mtr->memcpy(*undo_page, my_assume_aligned<4>
+              (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame),
+              buf, 4);
+  if (prev_log)
+    mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame,
+                  free);
+  mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
+                                 undo_page->frame, trx_id);
+  if (UNIV_UNLIKELY(mach_read_from_8(free + TRX_UNDO_TRX_NO +
+                                     undo_page->frame) != 0))
+    mtr->memset(undo_page, free + TRX_UNDO_TRX_NO, 8, 0);
+
+  /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+  mach_write_to_2(buf, 1);
+  memcpy_aligned<2>(buf + 2, start, 2);
+  static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+                "compatibility");
+  mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
+                                undo_page->frame, buf, 4);
+  /* Initialize the fields TRX_UNDO_XID_EXISTS through TRX_UNDO_HISTORY_NODE. */
+  if (prev_log)
+  {
+    mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+                TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+    mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
+                                   undo_page->frame, prev_log);
+    static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+                  "compatibility");
+    mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+    static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+                  FLST_NODE_SIZE, "compatibility");
+  }
+  else
+    mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+                TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
+  return free;
 }
 
-/********************************************************************//**
-Write X/Open XA Transaction Identification (XID) to undo log header */
-static
-void
-trx_undo_write_xid(
-/*===============*/
-	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
-	const XID*	xid,	/*!< in: X/Open XA Transaction Identification */
-	mtr_t*		mtr)	/*!< in: mtr */
+/** Write X/Open XA Transaction Identifier (XID) to undo log header
+@param[in,out]  block   undo header page
+@param[in]      offset  undo header record offset
+@param[in]      xid     distributed transaction identifier
+@param[in,out]  mtr     mini-transaction */
+static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
+                               const XID &xid, mtr_t *mtr)
 {
-	DBUG_ASSERT(xid->gtrid_length >= 0);
-	DBUG_ASSERT(xid->bqual_length >= 0);
-	DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
-
-	mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
-			 static_cast<ulint>(xid->formatID),
-			 MLOG_4BYTES, mtr);
-
-	mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
-			 static_cast<ulint>(xid->gtrid_length),
-			 MLOG_4BYTES, mtr);
-
-	mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
-			 static_cast<ulint>(xid->bqual_length),
-			 MLOG_4BYTES, mtr);
-	const ulint xid_length = static_cast<ulint>(xid->gtrid_length
-						    + xid->bqual_length);
-	mlog_write_string(log_hdr + TRX_UNDO_XA_XID,
-			  reinterpret_cast<const byte*>(xid->data),
-			  xid_length, mtr);
-	if (UNIV_LIKELY(xid_length < XIDDATASIZE)) {
-		mlog_memset(log_hdr + TRX_UNDO_XA_XID + xid_length,
-			    XIDDATASIZE - xid_length, 0, mtr);
-	}
+  DBUG_ASSERT(xid.gtrid_length > 0);
+  DBUG_ASSERT(xid.bqual_length >= 0);
+  DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE);
+  DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE);
+  static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
+                "gtrid and bqual don't fit xid data");
+  DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+                               block->frame) == offset);
+
+  trx_ulogf_t* log_hdr= block->frame + offset;
+
+  mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
+                                 static_cast<uint32_t>(xid.formatID));
+  mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN,
+                                 static_cast<uint32_t>(xid.gtrid_length));
+  mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+                                 static_cast<uint32_t>(xid.bqual_length));
+  const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+                                             + xid.bqual_length);
+  mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID],
+              xid.data, xid_length);
+  if (UNIV_LIKELY(xid_length < XIDDATASIZE))
+    mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
+                XIDDATASIZE - xid_length, 0);
 }
 
 /********************************************************************//**
@@ -707,67 +548,6 @@ trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
 	memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
 }
 
-/***************************************************************//**
-Adds space for the XA XID after an undo log old-style header. */
-static
-void
-trx_undo_header_add_space_for_xid(
-/*==============================*/
-	page_t*		undo_page,/*!< in: undo log segment header page */
-	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	trx_upagef_t*	page_hdr;
-	ulint		free;
-	ulint		new_free;
-
-	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
-
-	free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
-
-	/* free is now the end offset of the old style undo log header */
-
-	ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE);
-
-	new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE
-			   - TRX_UNDO_LOG_OLD_HDR_SIZE);
-
-	/* Add space for a XID after the header, update the free offset
-	fields on the undo log page and in the undo log header */
-
-	mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free,
-			 MLOG_2BYTES, mtr);
-
-	mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free,
-			 MLOG_2BYTES, mtr);
-
-	mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free,
-			 MLOG_2BYTES, mtr);
-}
-
-/** Parse the redo log entry of an undo log page header create.
-@param[in]	ptr	redo log record
-@param[in]	end_ptr	end of log buffer
-@param[in,out]	page	page frame or NULL
-@param[in,out]	mtr	mini-transaction or NULL
-@return end of log record or NULL */
-byte*
-trx_undo_parse_page_header(
-	const byte*	ptr,
-	const byte*	end_ptr,
-	page_t*		page,
-	mtr_t*		mtr)
-{
-	trx_id_t	trx_id = mach_u64_parse_compressed(&ptr, end_ptr);
-
-	if (ptr != NULL && page != NULL) {
-		trx_undo_header_create(page, trx_id, mtr);
-		return(const_cast<byte*>(ptr));
-	}
-
-	return(const_cast<byte*>(ptr));
-}
-
 /** Allocate an undo log page.
 @param[in,out]	undo	undo log
 @param[in,out]	mtr	mini-transaction that does not hold any page latch
@@ -777,8 +557,7 @@ buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
 {
 	trx_rseg_t*	rseg		= undo->rseg;
 	buf_block_t*	new_block	= NULL;
-	ulint		n_reserved;
-	page_t*		header_page;
+	uint32_t	n_reserved;
 
 	/* When we add a page to an undo log, this is analogous to
 	a pessimistic insert in a B-tree, and we must reserve the
@@ -786,7 +565,7 @@ buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
 
 	mutex_enter(&rseg->mutex);
 
-	header_page = trx_undo_page_get(
+	buf_block_t* header_block = trx_undo_page_get(
 		page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
 
 	if (!fsp_reserve_free_extents(&n_reserved, undo->rseg->space, 1,
@@ -796,8 +575,8 @@ buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
 
 	new_block = fseg_alloc_free_page_general(
 		TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
-		+ header_page,
-		undo->top_page_no + 1, FSP_UP, TRUE, mtr, mtr);
+		+ header_block->frame,
+		undo->top_page_no + 1, FSP_UP, true, mtr, mtr);
 
 	rseg->space->release_free_extents(n_reserved);
 
@@ -807,15 +586,13 @@ buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
 
 	ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
 	buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
-	undo->last_page_no = new_block->page.id.page_no();
+	undo->last_page_no = new_block->page.id().page_no();
 
-	trx_undo_page_init(new_block, mtr);
+	mtr->undo_create(*new_block);
+	trx_undo_page_init(*new_block);
 
-	flst_add_last(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
-		      + header_page,
-		      TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE
-		      + new_block->frame,
-		      mtr);
+	flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+		      new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
 	undo->size++;
 	rseg->curr_size++;
 
@@ -828,14 +605,14 @@ func_exit:
 Frees an undo log page that is not the header page.
 @return last page number in remaining log */
 static
-ulint
+uint32_t
 trx_undo_free_page(
 /*===============*/
 	trx_rseg_t* rseg,	/*!< in: rollback segment */
 	bool	in_history,	/*!< in: TRUE if the undo log is in the history
 				list */
-	ulint	hdr_page_no,	/*!< in: header page number */
-	ulint	page_no,	/*!< in: page number to free: must not be the
+	uint32_t hdr_page_no,	/*!< in: header page number */
+	uint32_t page_no,	/*!< in: page number to free: must not be the
 				header page */
 	mtr_t*	mtr)		/*!< in: mtr which does not have a latch to any
 				undo log page; the caller must have reserved
@@ -846,28 +623,32 @@ trx_undo_free_page(
 	ut_a(hdr_page_no != page_no);
 	ut_ad(mutex_own(&(rseg->mutex)));
 
-	page_t*	undo_page = trx_undo_page_get(page_id_t(space, page_no), mtr);
-	page_t* header_page = trx_undo_page_get(page_id_t(space, hdr_page_no),
-						mtr);
+	buf_block_t* undo_block = trx_undo_page_get(page_id_t(space, page_no),
+						    mtr);
+	buf_block_t* header_block = trx_undo_page_get(page_id_t(space,
+								hdr_page_no),
+						      mtr);
 
-	flst_remove(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_page,
-		    TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + undo_page, mtr);
+	flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+		    undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
 
-	fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + header_page,
-		       rseg->space, page_no, true, mtr);
+	fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+		       + header_block->frame,
+		       rseg->space, page_no, mtr);
+	buf_page_free(rseg->space, page_no, mtr, __FILE__, __LINE__);
 
 	const fil_addr_t last_addr = flst_get_last(
-		TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_page, mtr);
+		TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_block->frame);
 	rseg->curr_size--;
 
 	if (in_history) {
-		trx_rsegf_t* rseg_header = trx_rsegf_get(
+		buf_block_t* rseg_header = trx_rsegf_get(
 			rseg->space, rseg->page_no, mtr);
-		uint32_t hist_size = mach_read_from_4(
-			rseg_header + TRX_RSEG_HISTORY_SIZE);
+		byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+			+ rseg_header->frame;
+		uint32_t hist_size = mach_read_from_4(rseg_hist_size);
 		ut_ad(hist_size > 0);
-		mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
-				 hist_size - 1, MLOG_4BYTES, mtr);
+		mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
 	}
 
 	return(last_addr.page);
@@ -906,11 +687,11 @@ void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp)
 
 		trx_undo_rec_t* trunc_here = NULL;
 		mutex_enter(&undo.rseg->mutex);
-		page_t*		undo_page = trx_undo_page_get(
+		buf_block_t* undo_block = trx_undo_page_get(
 			page_id_t(undo.rseg->space->id, undo.last_page_no),
 			&mtr);
 		trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
-			undo_page, undo.hdr_page_no, undo.hdr_offset);
+			undo_block, undo.hdr_page_no, undo.hdr_offset);
 		while (rec) {
 			if (trx_undo_rec_get_undo_no(rec) < limit) {
 				goto func_exit;
@@ -918,7 +699,7 @@ void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp)
 			/* Truncate at least this record off, maybe more */
 			trunc_here = rec;
 
-			rec = trx_undo_page_get_prev_rec(rec,
+			rec = trx_undo_page_get_prev_rec(undo_block, rec,
 							 undo.hdr_page_no,
 							 undo.hdr_offset);
 		}
@@ -934,10 +715,10 @@ func_exit:
 		mutex_exit(&undo.rseg->mutex);
 
 		if (trunc_here) {
-			mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR
-					 + TRX_UNDO_PAGE_FREE,
-					 ulint(trunc_here - undo_page),
-					 MLOG_2BYTES, &mtr);
+			mtr.write<2>(*undo_block,
+				     TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+				     + undo_block->frame,
+				     ulint(trunc_here - undo_block->frame));
 		}
 
 		mtr.commit();
@@ -956,14 +737,12 @@ freed, but emptied, if all the records there are below the limit.
 void
 trx_undo_truncate_start(
 	trx_rseg_t*	rseg,
-	ulint		hdr_page_no,
-	ulint		hdr_offset,
+	uint32_t	hdr_page_no,
+	uint16_t	hdr_offset,
 	undo_no_t	limit)
 {
-	page_t*		undo_page;
 	trx_undo_rec_t* rec;
 	trx_undo_rec_t* last_rec;
-	ulint		page_no;
 	mtr_t		mtr;
 
 	ut_ad(mutex_own(&(rseg->mutex)));
@@ -978,42 +757,36 @@ loop:
 		mtr.set_log_mode(MTR_LOG_NO_REDO);
 	}
 
-	rec = trx_undo_get_first_rec(rseg->space, hdr_page_no, hdr_offset,
-				     RW_X_LATCH, &mtr);
+	buf_block_t* undo_page;
+	rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+				     RW_X_LATCH, undo_page, &mtr);
 	if (rec == NULL) {
 		/* Already empty */
-
-		mtr_commit(&mtr);
-
+done:
+		mtr.commit();
 		return;
 	}
 
-	undo_page = page_align(rec);
-
 	last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
 					      hdr_offset);
 	if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
-
-		mtr_commit(&mtr);
-
-		return;
+		goto done;
 	}
 
-	page_no = page_get_page_no(undo_page);
-
-	if (page_no == hdr_page_no) {
+	if (undo_page->page.id().page_no() == hdr_page_no) {
 		uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
-						+ undo_page);
+						+ undo_page->frame);
 		if (end == 0) {
 			end = mach_read_from_2(TRX_UNDO_PAGE_HDR
 					       + TRX_UNDO_PAGE_FREE
-					       + undo_page);
+					       + undo_page->frame);
 		}
 
-		mlog_write_ulint(undo_page + hdr_offset + TRX_UNDO_LOG_START,
-				 end, MLOG_2BYTES, &mtr);
+		mtr.write<2>(*undo_page, undo_page->frame + hdr_offset
+			     + TRX_UNDO_LOG_START, end);
 	} else {
-		trx_undo_free_page(rseg, true, hdr_page_no, page_no, &mtr);
+		trx_undo_free_page(rseg, true, hdr_page_no,
+				   undo_page->page.id().page_no(), &mtr);
 	}
 
 	mtr_commit(&mtr);
@@ -1025,42 +798,37 @@ loop:
 @param undo	temporary undo log */
 static void trx_undo_seg_free(const trx_undo_t *undo)
 {
-	trx_rseg_t*	rseg;
-	fseg_header_t*	file_seg;
-	trx_rsegf_t*	rseg_header;
-	trx_usegf_t*	seg_header;
-	ibool		finished;
-	mtr_t		mtr;
+	ut_ad(undo->id < TRX_RSEG_N_SLOTS);
 
-	rseg = undo->rseg;
+	trx_rseg_t* const rseg = undo->rseg;
+	bool		finished;
+	mtr_t		mtr;
+	ut_ad(rseg->space == fil_system.temp_space);
 
 	do {
 		mtr.start();
 		mtr.set_log_mode(MTR_LOG_NO_REDO);
 
-		mutex_enter(&(rseg->mutex));
+		buf_block_t* block = trx_undo_page_get(
+			page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), &mtr);
 
-		seg_header = trx_undo_page_get(page_id_t(SRV_TMP_SPACE_ID,
-							 undo->hdr_page_no),
-					       &mtr)
-			+ TRX_UNDO_SEG_HDR;
-
-		file_seg = seg_header + TRX_UNDO_FSEG_HEADER;
+		fseg_header_t* file_seg = TRX_UNDO_SEG_HDR
+			+ TRX_UNDO_FSEG_HEADER + block->frame;
 
 		finished = fseg_free_step(file_seg, &mtr);
 
 		if (finished) {
 			/* Update the rseg header */
-			rseg_header = trx_rsegf_get(
+			buf_block_t* rseg_header = trx_rsegf_get(
 				rseg->space, rseg->page_no, &mtr);
-			trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
-					       &mtr);
-
+			compile_time_assert(FIL_NULL == 0xffffffff);
+			memset(TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+			       + undo->id * TRX_RSEG_SLOT_SIZE +
+			       rseg_header->frame, 0xff, 4);
 			MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
 		}
 
-		mutex_exit(&(rseg->mutex));
-		mtr_commit(&mtr);
+		mtr.commit();
 	} while (!finished);
 }
 
@@ -1083,11 +851,11 @@ trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no,
 	ut_ad(id < TRX_RSEG_N_SLOTS);
 
 	mtr.start();
-	const page_t* undo_page = trx_undo_page_get(
+	const buf_block_t* block = trx_undo_page_get(
 		page_id_t(rseg->space->id, page_no), &mtr);
 	const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
 					       + TRX_UNDO_PAGE_TYPE
-					       + undo_page);
+					       + block->frame);
 	if (UNIV_UNLIKELY(type > 2)) {
 corrupted_type:
 		sql_print_error("InnoDB: unsupported undo header type %u",
@@ -1098,7 +866,7 @@ corrupted:
 	}
 
 	uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
-					   + undo_page);
+					   + block->frame);
 	if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
 	    offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
 		sql_print_error("InnoDB: invalid undo header offset %u",
@@ -1106,9 +874,9 @@ corrupted:
 		goto corrupted;
 	}
 
-	const trx_ulogf_t* const undo_header = undo_page + offset;
+	const trx_ulogf_t* const undo_header = block->frame + offset;
 	uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
-					  + undo_page);
+					  + block->frame);
 	switch (state) {
 	case TRX_UNDO_ACTIVE:
 	case TRX_UNDO_PREPARED:
@@ -1174,20 +942,20 @@ corrupted:
 	undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
 	undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
 	undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
-				  + undo_page);
+				  + block->frame);
 
 	fil_addr_t	last_addr = flst_get_last(
-		TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + undo_page, &mtr);
+		TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
 
 	undo->last_page_no = last_addr.page;
 	undo->top_page_no = last_addr.page;
 
-	page_t* last_page = trx_undo_page_get(
+	const buf_block_t* last = trx_undo_page_get(
 		page_id_t(rseg->space->id, undo->last_page_no), &mtr);
 
 	if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
-		    last_page, page_no, offset)) {
-		undo->top_offset = ulint(rec - last_page);
+		    last, page_no, offset)) {
+		undo->top_offset = static_cast<uint16_t>(rec - last->frame);
 		undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
 		ut_ad(!undo->empty());
 	} else {
@@ -1220,8 +988,8 @@ trx_undo_mem_create(
 	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
 				is created */
 	const XID*	xid,	/*!< in: X/Open transaction identification */
-	ulint		page_no,/*!< in: undo log header page number */
-	ulint		offset)	/*!< in: undo log header byte offset on page */
+	uint32_t	page_no,/*!< in: undo log header page number */
+	uint16_t	offset)	/*!< in: undo log header byte offset on page */
 {
 	trx_undo_t*	undo;
 
@@ -1268,7 +1036,7 @@ trx_undo_mem_init_for_reuse(
 	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
 				is created */
 	const XID*	xid,	/*!< in: X/Open XA transaction identification*/
-	ulint		offset)	/*!< in: undo log header byte offset on page */
+	uint16_t	offset)	/*!< in: undo log header byte offset on page */
 {
 	ut_ad(mutex_own(&((undo->rseg)->mutex)));
 
@@ -1312,13 +1080,10 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
 
 	rseg->curr_size++;
 
-	ulint offset = trx_undo_header_create(block->frame, trx->id, mtr);
-
-	trx_undo_header_add_space_for_xid(block->frame, block->frame + offset,
-					  mtr);
+	uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
 
 	*undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid,
-				    block->page.id.page_no(), offset);
+				    block->page.id().page_no(), offset);
 	if (*undo == NULL) {
 		*err = DB_OUT_OF_MEMORY;
 		 /* FIXME: this will not free the undo block to the file */
@@ -1337,10 +1102,11 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
 	case TRX_DICT_OP_TABLE:
 		(*undo)->table_id = trx->table_id;
 		(*undo)->dict_operation = TRUE;
-		mlog_write_ulint(block->frame + offset + TRX_UNDO_DICT_TRANS,
-				 TRUE, MLOG_1BYTE, mtr);
-		mlog_write_ull(block->frame + offset + TRX_UNDO_TABLE_ID,
-			       trx->table_id, mtr);
+		mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+					       + TRX_UNDO_DICT_TRANS, 1U);
+		mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+					       + TRX_UNDO_TABLE_ID,
+					       trx->table_id);
 	}
 
 	*err = DB_SUCCESS;
@@ -1385,19 +1151,7 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
 
 	*pundo = undo;
 
-	ulint offset = trx_undo_header_create(block->frame, trx->id, mtr);
-	/* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
-	repurposed after upgrading to MariaDB 10.3. */
-	if (ut_d(ulint type =) UNIV_UNLIKELY(
-		    mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
-				     + block->frame))) {
-		ut_ad(type == 1 || type == 2);
-		mlog_write_ulint(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
-				 + block->frame, 0, MLOG_2BYTES, mtr);
-	}
-
-	trx_undo_header_add_space_for_xid(block->frame, block->frame + offset,
-					  mtr);
+	uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
 
 	trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset);
 
@@ -1415,10 +1169,11 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
 	case TRX_DICT_OP_TABLE:
 		undo->table_id = trx->table_id;
 		undo->dict_operation = TRUE;
-		mlog_write_ulint(block->frame + offset + TRX_UNDO_DICT_TRANS,
-				 TRUE, MLOG_1BYTE, mtr);
-		mlog_write_ull(block->frame + offset + TRX_UNDO_TABLE_ID,
-			       trx->table_id, mtr);
+		mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+					       + TRX_UNDO_DICT_TRANS, 1U);
+		mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+					       + TRX_UNDO_TABLE_ID,
+					       trx->table_id);
 	}
 
 	return block;
@@ -1528,69 +1283,48 @@ func_exit:
 /******************************************************************//**
 Sets the state of the undo log segment at a transaction finish.
 @return undo log segment header page, x-latched */
-page_t*
+buf_block_t*
 trx_undo_set_state_at_finish(
 /*=========================*/
 	trx_undo_t*	undo,	/*!< in: undo log memory copy */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	trx_usegf_t*	seg_hdr;
-	trx_upagef_t*	page_hdr;
-	page_t*		undo_page;
-	ulint		state;
-
 	ut_a(undo->id < TRX_RSEG_N_SLOTS);
 
-	undo_page = trx_undo_page_get(
+	buf_block_t* block = trx_undo_page_get(
 		page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
 
-	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
-	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
-
-	if (undo->size == 1
-	    && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE)
-	       < TRX_UNDO_PAGE_REUSE_LIMIT) {
-
-		state = TRX_UNDO_CACHED;
-	} else {
-		state = TRX_UNDO_TO_PURGE;
-	}
+	const uint16_t state = undo->size == 1
+		&& TRX_UNDO_PAGE_REUSE_LIMIT
+		> mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+				   + block->frame)
+		? TRX_UNDO_CACHED
+		: TRX_UNDO_TO_PURGE;
 
 	undo->state = state;
-
-	mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr);
-
-	return(undo_page);
+	mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+		      + block->frame, state);
+	return block;
 }
 
 /** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
 @param[in,out]	trx		transaction
 @param[in,out]	undo		undo log
 @param[in]	rollback	false=XA PREPARE, true=XA ROLLBACK
-@param[in,out]	mtr		mini-transaction */
-void
-trx_undo_set_state_at_prepare(
-	trx_t*		trx,
-	trx_undo_t*	undo,
-	bool		rollback,
-	mtr_t*		mtr)
+@param[in,out]	mtr		mini-transaction
+@note This function does not return a value. */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+				   mtr_t *mtr)
 {
-	trx_usegf_t*	seg_hdr;
-	trx_ulogf_t*	undo_header;
-	page_t*		undo_page;
-	ulint		offset;
-
 	ut_a(undo->id < TRX_RSEG_N_SLOTS);
 
-	undo_page = trx_undo_page_get(
+	buf_block_t* block = trx_undo_page_get(
 		page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
 
-	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
-
 	if (rollback) {
 		ut_ad(undo->state == TRX_UNDO_PREPARED);
-		mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE,
-				 MLOG_2BYTES, mtr);
+		mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+			      + block->frame, TRX_UNDO_ACTIVE);
 		return;
 	}
 
@@ -1600,16 +1334,13 @@ trx_undo_set_state_at_prepare(
 	undo->xid   = *trx->xid;
 	/*------------------------------*/
 
-	mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
-			 MLOG_2BYTES, mtr);
-
-	offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
-	undo_header = undo_page + offset;
-
-	mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
-			 TRUE, MLOG_1BYTE, mtr);
+	mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame,
+		      undo->state);
+	uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+					   + block->frame);
+	mtr->write<1>(*block, block->frame + offset + TRX_UNDO_XID_EXISTS, 1U);
 
-	trx_undo_write_xid(undo_header, &undo->xid, mtr);
+	trx_undo_write_xid(block, offset, undo->xid, mtr);
 }
 
 /** Free temporary undo log after commit or rollback.
@@ -1632,9 +1363,7 @@ void trx_undo_commit_cleanup(trx_undo_t *undo)
 		ut_ad(undo->state == TRX_UNDO_TO_PURGE);
 
 		/* Delete first the undo log segment in the file */
-		mutex_exit(&rseg->mutex);
 		trx_undo_seg_free(undo);
-		mutex_enter(&rseg->mutex);
 
 		ut_ad(rseg->curr_size > undo->size);
 		rseg->curr_size -= undo->size;
diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
deleted file mode 100644
index bb2c530a174..00000000000
--- a/storage/innobase/ut/ut0crc32.cc
+++ /dev/null
@@ -1,552 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2009, 2010 Facebook, Inc. All Rights Reserved.
-Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2018, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/***************************************************************//**
-@file ut/ut0crc32.cc
-CRC32 implementation from Facebook, based on the zlib implementation.
-
-Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and
-mysys/my_perf.c, contributed by Facebook under the following license.
-********************************************************************/
-
-/* Copyright (C) 2009-2010 Facebook, Inc.  All Rights Reserved.
-
-   Dual licensed under BSD license and GPLv2.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-   1. Redistributions of source code must retain the above copyright notice,
-      this list of conditions and the following disclaimer.
-   2. Redistributions in binary form must reproduce the above copyright notice,
-      this list of conditions and the following disclaimer in the documentation
-      and/or other materials provided with the distribution.
-
-   THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR
-   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
-   EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-   OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   This program is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by the Free
-   Software Foundation; version 2 of the License.
-
-   This program is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   You should have received a copy of the GNU General Public License along with
-   this program; if not, write to the Free Software Foundation, Inc.,
-   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
-
-/* The below CRC32 implementation is based on the implementation included with
- * zlib with modifications to process 8 bytes at a time and using SSE 4.2
- * extensions when available.  The polynomial constant has been changed to
- * match the one used by SSE 4.2 and does not return the same value as the
- * version used by zlib.  The original zlib copyright notice follows. */
-
-/* crc32.c -- compute the CRC-32 of a buf stream
- * Copyright (C) 1995-2005 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
- * CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing
- * tables for updating the shift register in one step with three exclusive-ors
- * instead of four steps with four exclusive-ors.  This results in about a
- * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
- */
-
-// First include (the generated) my_config.h, to get correct platform defines.
-#include "my_config.h"
-#include <string.h>
-
-#include "ut0crc32.h"
-#include "my_valgrind.h"
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
-/** Swap the byte order of an 8 byte integer.
-@param[in]	i	8-byte integer
-@return 8-byte integer */
-inline
-uint64_t
-ut_crc32_swap_byteorder(
-	uint64_t	i)
-{
-	return(i << 56
-	       | (i & 0x000000000000FF00ULL) << 40
-	       | (i & 0x0000000000FF0000ULL) << 24
-	       | (i & 0x00000000FF000000ULL) << 8
-	       | (i & 0x000000FF00000000ULL) >> 8
-	       | (i & 0x0000FF0000000000ULL) >> 24
-	       | (i & 0x00FF000000000000ULL) >> 40
-	       | i >> 56);
-}
-
-/* CRC32 hardware implementation. */
-
-#ifdef HAVE_CRC32_VPMSUM
-extern "C" {
-unsigned int crc32c_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len);
-};
-UNIV_INLINE
-ib_uint32_t
-ut_crc32_power8(
-/*===========*/
-		const byte*		buf,		/*!< in: data over which to calculate CRC32 */
-		ulint			len)		/*!< in: data length */
-{
-	return crc32c_vpmsum(0, buf, len);
-}
-
-ut_crc32_func_t	ut_crc32 = ut_crc32_power8;
-const char*	ut_crc32_implementation = "Using POWER8 crc32 instructions";
-#else
-uint32_t ut_crc32_sw(const byte* buf, ulint len);
-ut_crc32_func_t	ut_crc32 = ut_crc32_sw;
-const char*	ut_crc32_implementation = "Using generic crc32 instructions";
-#endif
-
-#if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
-/********************************************************************//**
-Fetches CPU info */
-static
-void
-ut_cpuid(
-/*=====*/
-	uint32_t	vend[3],	/*!< out: CPU vendor */
-	uint32_t*	model,		/*!< out: CPU model */
-	uint32_t*	family,		/*!< out: CPU family */
-	uint32_t*	stepping,	/*!< out: CPU stepping */
-	uint32_t*	features_ecx,	/*!< out: CPU features ecx */
-	uint32_t*	features_edx)	/*!< out: CPU features edx */
-{
-	uint32_t	sig;
-#ifdef _MSC_VER
-	int data[4];
-	__cpuid(data, 0);
-	/* ebx */
-	vend[0] = data[1];
-	/* edx */
-	vend[1] = data[3];
-	/* ecx */
-	vend[2] = data[2];
-
-	__cpuid(data, 1);
-	/* eax */
-	sig = data[0];
-	/* ecx */
-	*features_ecx = data[2];
-	/* edx */
-	*features_edx = data[3];
-#else
-	asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0));
-	asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx)
-	    : "a" (1)
-	    : "ebx");
-#endif
-
-	*model = ((sig >> 4) & 0xF);
-	*family = ((sig >> 8) & 0xF);
-	*stepping = (sig & 0xF);
-
-	if (memcmp(vend, "GenuineIntel", 12) == 0
-	    || (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) {
-
-		*model += (((sig >> 16) & 0xF) << 4);
-		*family += ((sig >> 20) & 0xFF);
-	}
-}
-
-/** Calculate CRC32 over 8-bit data using a hardware/CPU instruction.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 1 byte
-@param[in,out]	len	remaining bytes, it will be decremented with 1 */
-inline
-void
-ut_crc32_8_hw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
-{
-#  ifdef _MSC_VER
-	*crc = _mm_crc32_u8(*crc, (*data)[0]);
-#  elif __has_feature(memory_sanitizer)
-	*crc = __builtin_ia32_crc32qi(*crc, (*data)[0]);
-#  else
-	asm("crc32b %1, %0"
-	    /* output operands */
-	    : "+r" (*crc)
-	    /* input operands */
-	    : "rm" ((*data)[0]));
-#  endif
-
-	(*data)++;
-	(*len)--;
-}
-
-/** Calculate CRC32 over a 64-bit integer using a hardware/CPU instruction.
-@param[in]	crc	crc32 checksum so far
-@param[in]	data	data to be checksummed
-@return resulting checksum of crc + crc(data) */
-inline
-uint32_t
-ut_crc32_64_low_hw(
-	uint32_t	crc,
-	uint64_t	data)
-{
-	uint64_t	crc_64bit = crc;
-#  ifdef _MSC_VER
-#   ifdef _M_X64
-	crc_64bit = _mm_crc32_u64(crc_64bit, data);
-#   elif defined(_M_IX86)
-	crc = _mm_crc32_u32(crc, static_cast<uint32_t>(data));
-	crc_64bit = _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
-#   else
-#    error Not Supported processors type.
-#   endif
-#  elif __has_feature(memory_sanitizer)
-	crc_64bit = __builtin_ia32_crc32di(crc_64bit, data);
-#  else
-	asm("crc32q %1, %0"
-	    /* output operands */
-	    : "+r" (crc_64bit)
-	    /* input operands */
-	    : "rm" (data));
-#  endif
-
-	return(static_cast<uint32_t>(crc_64bit));
-}
-
-/** Calculate CRC32 over 64-bit byte string using a hardware/CPU instruction.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 8 bytes
-@param[in,out]	len	remaining bytes, it will be decremented with 8 */
-inline
-void
-ut_crc32_64_hw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
-{
-	uint64_t	data_int = *reinterpret_cast<const uint64_t*>(*data);
-
-#ifdef WORDS_BIGENDIAN
-	/* Currently we only support x86_64 (little endian) CPUs. In case
-	some big endian CPU supports a CRC32 instruction, then maybe we will
-	need a byte order swap here. */
-#error Dont know how to handle big endian CPUs
-	/*
-	data_int = ut_crc32_swap_byteorder(data_int);
-	*/
-#endif /* WORDS_BIGENDIAN */
-
-	*crc = ut_crc32_64_low_hw(*crc, data_int);
-
-	*data += 8;
-	*len -= 8;
-}
-
-/** Calculates CRC32 using hardware/CPU instructions.
-@param[in]	buf	data over which to calculate CRC32
-@param[in]	len	data length
-@return CRC-32C (polynomial 0x11EDC6F41) */
-uint32_t
-ut_crc32_hw(
-	const byte*	buf,
-	ulint		len)
-{
-	uint32_t	crc = 0xFFFFFFFFU;
-
-	/* Calculate byte-by-byte up to an 8-byte aligned address. After
-	this consume the input 8-bytes at a time. */
-	while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) {
-		ut_crc32_8_hw(&crc, &buf, &len);
-	}
-
-	/* Perf testing
-	./unittest/gunit/innodb/merge_innodb_tests-t --gtest_filter=ut0crc32.perf
-	on CPU "Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz"
-	with different N in "while (len >= N) {" shows:
-	N=16
-	2.867254 sec
-	2.866860 sec
-	2.867973 sec
-
-	N=32
-	2.715725 sec
-	2.713008 sec
-	2.712520 sec
-	(5.36% speedup over N=16)
-
-	N=64
-	2.634140 sec
-	2.636558 sec
-	2.636488 sec
-	(2.88% speedup over N=32)
-
-	N=128
-	2.599534 sec
-	2.599919 sec
-	2.598035 sec
-	(1.39% speedup over N=64)
-
-	N=256
-	2.576993 sec
-	2.576748 sec
-	2.575700 sec
-	(0.87% speedup over N=128)
-
-	N=512
-	2.693928 sec
-	2.691663 sec
-	2.692142 sec
-	(4.51% slowdown over N=256)
-	*/
-	while (len >= 128) {
-		/* This call is repeated 16 times. 16 * 8 = 128. */
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-		ut_crc32_64_hw(&crc, &buf, &len);
-	}
-
-	while (len >= 8) {
-		ut_crc32_64_hw(&crc, &buf, &len);
-	}
-
-	while (len > 0) {
-		ut_crc32_8_hw(&crc, &buf, &len);
-	}
-
-	return(~crc);
-}
-#endif /* defined(__GNUC__) && defined(__x86_64__) || (_WIN64) */
-
-/* CRC32 software implementation. */
-
-/* Precalculated table used to generate the CRC32 if the CPU does not
-have support for it */
-static uint32_t	ut_crc32_slice8_table[8][256];
-static bool	ut_crc32_slice8_table_initialized = false;
-
-/********************************************************************//**
-Initializes the table that is used to generate the CRC32 if the CPU does
-not have support for it. */
-static
-void
-ut_crc32_slice8_table_init()
-/*========================*/
-{
-	/* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
-	static const uint32_t	poly = 0x82f63b78;
-	uint32_t		n;
-	uint32_t		k;
-	uint32_t		c;
-
-	for (n = 0; n < 256; n++) {
-		c = n;
-		for (k = 0; k < 8; k++) {
-			c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
-		}
-		ut_crc32_slice8_table[0][n] = c;
-	}
-
-	for (n = 0; n < 256; n++) {
-		c = ut_crc32_slice8_table[0][n];
-		for (k = 1; k < 8; k++) {
-			c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
-			ut_crc32_slice8_table[k][n] = c;
-		}
-	}
-
-	ut_crc32_slice8_table_initialized = true;
-}
-
-/** Calculate CRC32 over 8-bit data using a software implementation.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 1 byte
-@param[in,out]	len	remaining bytes, it will be decremented with 1 */
-inline
-void
-ut_crc32_8_sw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
-{
-	const uint8_t	i = (*crc ^ (*data)[0]) & 0xFF;
-
-	*crc = (*crc >> 8) ^ ut_crc32_slice8_table[0][i];
-
-	(*data)++;
-	(*len)--;
-}
-
-/** Calculate CRC32 over a 64-bit integer using a software implementation.
-@param[in]	crc	crc32 checksum so far
-@param[in]	data	data to be checksummed
-@return resulting checksum of crc + crc(data) */
-inline
-uint32_t
-ut_crc32_64_low_sw(
-	uint32_t	crc,
-	uint64_t	data)
-{
-	const uint64_t	i = crc ^ data;
-
-	return(
-		ut_crc32_slice8_table[7][(i      ) & 0xFF] ^
-		ut_crc32_slice8_table[6][(i >>  8) & 0xFF] ^
-		ut_crc32_slice8_table[5][(i >> 16) & 0xFF] ^
-		ut_crc32_slice8_table[4][(i >> 24) & 0xFF] ^
-		ut_crc32_slice8_table[3][(i >> 32) & 0xFF] ^
-		ut_crc32_slice8_table[2][(i >> 40) & 0xFF] ^
-		ut_crc32_slice8_table[1][(i >> 48) & 0xFF] ^
-		ut_crc32_slice8_table[0][(i >> 56)]
-	);
-}
-
-/** Calculate CRC32 over 64-bit byte string using a software implementation.
-@param[in,out]	crc	crc32 checksum so far when this function is called,
-when the function ends it will contain the new checksum
-@param[in,out]	data	data to be checksummed, the pointer will be advanced
-with 8 bytes
-@param[in,out]	len	remaining bytes, it will be decremented with 8 */
-inline
-void
-ut_crc32_64_sw(
-	uint32_t*	crc,
-	const byte**	data,
-	ulint*		len)
-{
-	uint64_t	data_int = *reinterpret_cast<const uint64_t*>(*data);
-
-#ifdef WORDS_BIGENDIAN
-	data_int = ut_crc32_swap_byteorder(data_int);
-#endif /* WORDS_BIGENDIAN */
-
-	*crc = ut_crc32_64_low_sw(*crc, data_int);
-
-	*data += 8;
-	*len -= 8;
-}
-
-/** Calculates CRC32 in software, without using CPU instructions.
-@param[in]	buf	data over which to calculate CRC32
-@param[in]	len	data length
-@return CRC-32C (polynomial 0x11EDC6F41) */
-uint32_t
-ut_crc32_sw(
-	const byte*	buf,
-	ulint		len)
-{
-	uint32_t	crc = 0xFFFFFFFFU;
-
-	ut_a(ut_crc32_slice8_table_initialized);
-
-	/* Calculate byte-by-byte up to an 8-byte aligned address. After
-	this consume the input 8-bytes at a time. */
-	while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) {
-		ut_crc32_8_sw(&crc, &buf, &len);
-	}
-
-	while (len >= 128) {
-		/* This call is repeated 16 times. 16 * 8 = 128. */
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-		ut_crc32_64_sw(&crc, &buf, &len);
-	}
-
-	while (len >= 8) {
-		ut_crc32_64_sw(&crc, &buf, &len);
-	}
-
-	while (len > 0) {
-		ut_crc32_8_sw(&crc, &buf, &len);
-	}
-
-	return(~crc);
-}
-
-/********************************************************************//**
-Initializes the data structures used by ut_crc32*(). Does not do any
-allocations, would not hurt if called twice, but would be pointless. */
-void
-ut_crc32_init()
-/*===========*/
-{
-	ut_crc32_slice8_table_init();
-
-#if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
-	uint32_t	vend[3];
-	uint32_t	model;
-	uint32_t	family;
-	uint32_t	stepping;
-	uint32_t	features_ecx;
-	uint32_t	features_edx;
-
-	ut_cpuid(vend, &model, &family, &stepping,
-		 &features_ecx, &features_edx);
-
-	if (features_ecx & 1 << 20) {
-		ut_crc32 = ut_crc32_hw;
-		ut_crc32_implementation = "Using SSE2 crc32 instructions";
-	}
-#endif
-}
diff --git a/storage/innobase/ut/ut0new.cc b/storage/innobase/ut/ut0new.cc
index f47a5112fd7..5e00a4ca0ea 100644
--- a/storage/innobase/ut/ut0new.cc
+++ b/storage/innobase/ut/ut0new.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -25,6 +25,10 @@ Created May 26, 2014 Vasil Dimov
 *******************************************************/
 
 #include "univ.i"
+#include <algorithm>
+/** The total amount of memory currently allocated from the operating
+system with allocate_large(). */
+Atomic_counter<ulint> os_total_large_mem_allocated;
 
 /** Maximum number of retries to allocate memory. */
 const size_t	alloc_max_retries = 60;
@@ -59,165 +63,50 @@ the list below:
 Keep this list alphabetically sorted. */
 static PSI_memory_info	pfs_info[] = {
 #ifdef BTR_CUR_HASH_ADAPT
-	{&mem_key_ahi, "adaptive hash index", 0},
+  {&mem_key_ahi, "adaptive hash index", 0},
 #endif /* BTR_CUR_HASH_ADAPT */
-	{&mem_key_buf_buf_pool, "buf_buf_pool", 0},
-	{&mem_key_dict_stats_bg_recalc_pool_t, "dict_stats_bg_recalc_pool_t", 0},
-	{&mem_key_dict_stats_index_map_t, "dict_stats_index_map_t", 0},
-	{&mem_key_dict_stats_n_diff_on_level, "dict_stats_n_diff_on_level", 0},
-	{&mem_key_other, "other", 0},
-	{&mem_key_row_log_buf, "row_log_buf", 0},
-	{&mem_key_row_merge_sort, "row_merge_sort", 0},
-	{&mem_key_std, "std", 0},
+  {&mem_key_buf_buf_pool, "buf_buf_pool", 0},
+  {&mem_key_dict_stats_bg_recalc_pool_t, "dict_stats_bg_recalc_pool_t", 0},
+  {&mem_key_dict_stats_index_map_t, "dict_stats_index_map_t", 0},
+  {&mem_key_dict_stats_n_diff_on_level, "dict_stats_n_diff_on_level", 0},
+  {&mem_key_other, "other", 0},
+  {&mem_key_row_log_buf, "row_log_buf", 0},
+  {&mem_key_row_merge_sort, "row_merge_sort", 0},
+  {&mem_key_std, "std", 0},
 };
 
-/** Map used for default performance schema keys, based on file name of the
-caller. The key is the file name of the caller and the value is a pointer
-to a PSI_memory_key variable to be passed to performance schema methods.
-We use ut_strcmp_functor because by default std::map will compare the pointers
-themselves (cont char*) and not do strcmp(). */
-typedef std::map<const char*, PSI_memory_key*, ut_strcmp_functor>
-	mem_keys_auto_t;
-
-/** Map of filename/pfskey, used for tracing allocations that have not
-provided a manually created pfs key. This map is only ever modified (bulk
-insert) at startup in a single-threaded environment by ut_new_boot().
-Later it is only read (only std::map::find() is called) from multithreaded
-environment, thus it is not protected by any latch. */
-static mem_keys_auto_t	mem_keys_auto;
-
-#endif /* UNIV_PFS_MEMORY */
+static const int NKEYS = static_cast<int>UT_ARR_SIZE(auto_event_names)-1;
+static PSI_memory_key auto_event_keys[NKEYS];
 
 /** Setup the internal objects needed for UT_NEW() to operate.
 This must be called before the first call to UT_NEW(). */
-void
-ut_new_boot()
+void ut_new_boot()
 {
-#ifdef UNIV_PFS_MEMORY
-	static const char*	auto_event_names[] = {
-		/* Keep this list alphabetically sorted. */
-		"btr0btr",
-		"btr0bulk",
-		"btr0cur",
-		"btr0pcur",
-		"btr0sea",
-		"buf0buf",
-		"buf0dblwr",
-		"buf0dump",
-		"buf0flu",
-		"buf0lru",
-		"dict0dict",
-		"dict0mem",
-		"dict0stats",
-		"dict0stats_bg",
-		"eval0eval",
-		"fil0fil",
-		"fsp0file",
-		"fsp0space",
-		"fsp0sysspace",
-		"fts0ast",
-		"fts0config",
-		"fts0fts",
-		"fts0opt",
-		"fts0pars",
-		"fts0que",
-		"fts0sql",
-		"gis0sea",
-		"ha0ha",
-		"ha_innodb",
-		"handler0alter",
-		"hash0hash",
-		"i_s",
-		"ibuf0ibuf",
-		"lexyy",
-		"lock0lock",
-		"log0log",
-		"log0recv",
-		"mem0mem",
-		"os0event",
-		"os0file",
-		"page0cur",
-		"page0zip",
-		"pars0lex",
-		"read0read",
-		"rem0rec",
-		"row0ftsort",
-		"row0import",
-		"row0log",
-		"row0merge",
-		"row0mysql",
-		"row0sel",
-		"srv0conc",
-		"srv0srv",
-		"srv0start",
-		"sync0arr",
-		"sync0debug",
-		"sync0rw",
-		"sync0types",
-		"trx0i_s",
-		"trx0purge",
-		"trx0roll",
-		"trx0rseg",
-		"trx0sys",
-		"trx0trx",
-		"trx0undo",
-		"ut0list",
-		"ut0mem",
-		"ut0mutex",
-		"ut0pool",
-		"ut0rbt",
-		"ut0wqueue",
-	};
-	static const size_t	n_auto = UT_ARR_SIZE(auto_event_names);
-	static PSI_memory_key	auto_event_keys[n_auto];
-	static PSI_memory_info	pfs_info_auto[n_auto];
-
-	for (size_t i = 0; i < n_auto; i++) {
-
-		const std::pair<mem_keys_auto_t::iterator, bool>	ret
-			MY_ATTRIBUTE((unused))
-			= mem_keys_auto.insert(
-			mem_keys_auto_t::value_type(auto_event_names[i],
-						    &auto_event_keys[i]));
-
-		/* ret.second is true if new element has been inserted */
-		ut_a(ret.second);
-
-		/* e.g. "btr0btr" */
-		pfs_info_auto[i].m_name = auto_event_names[i];
-
-		/* a pointer to the pfs key */
-		pfs_info_auto[i].m_key = &auto_event_keys[i];
-
-		pfs_info_auto[i].m_flags = 0;
-	}
-
-	PSI_MEMORY_CALL(register_memory)("innodb",
-					 pfs_info,
-					 UT_ARR_SIZE(pfs_info));
-	PSI_MEMORY_CALL(register_memory)("innodb",
-					 pfs_info_auto,
-					 n_auto);
-#endif /* UNIV_PFS_MEMORY */
-}
+  PSI_MEMORY_CALL(register_memory)("innodb", pfs_info, static_cast<int>
+                                   UT_ARR_SIZE(pfs_info));
 
-#ifdef UNIV_PFS_MEMORY
+  PSI_memory_info pfs_info_auto[NKEYS];
+  for (int i= 0; i < NKEYS; i++)
+  {
+    pfs_info_auto[i]= {&auto_event_keys[i], auto_event_names[i], 0};
+  }
 
-/** Retrieve a memory key (registered with PFS), given a portion of the file
-name of the caller.
-@param[in]	file	portion of the filename - basename without an extension
-@return registered memory key or PSI_NOT_INSTRUMENTED if not found */
-PSI_memory_key
-ut_new_get_key_by_file(
-	const char*	file)
-{
-	mem_keys_auto_t::const_iterator	el = mem_keys_auto.find(file);
+  PSI_MEMORY_CALL(register_memory)("innodb", pfs_info_auto,NKEYS);
+}
 
-	if (el != mem_keys_auto.end()) {
-		return(*(el->second));
-	}
+/** Retrieve a memory key (registered with PFS) corresponding to a source file.
 
-	return(PSI_NOT_INSTRUMENTED);
+@param[in] autoevent_idx index into auto_event_names that corresponds to the
+file name of the caller.
+
+@return registered memory key or PSI_NOT_INSTRUMENTED
+*/
+PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx)
+{
+  ut_ad(autoevent_idx < NKEYS);
+  return auto_event_keys[autoevent_idx];
 }
 
-#endif /* UNIV_PFS_MEMORY */
+#else /* UNIV_PFS_MEMORY */
+void ut_new_boot(){}
+#endif
diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc
index a6a8661f699..1dd1cff6ece 100644
--- a/storage/innobase/ut/ut0ut.cc
+++ b/storage/innobase/ut/ut0ut.cc
@@ -62,42 +62,39 @@ ut_print_timestamp(
 /*===============*/
 	FILE*  file) /*!< in: file where to print */
 {
-	ulint thread_id = 0;
-
-#ifndef UNIV_INNOCHECKSUM
-	thread_id = os_thread_pf(os_thread_get_curr_id());
-#endif /* !UNIV_INNOCHECKSUM */
-
 #ifdef _WIN32
 	SYSTEMTIME cal_tm;
-
 	GetLocalTime(&cal_tm);
-
-	fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %#zx",
-		(int) cal_tm.wYear,
-		(int) cal_tm.wMonth,
-		(int) cal_tm.wDay,
-		(int) cal_tm.wHour,
-		(int) cal_tm.wMinute,
-		(int) cal_tm.wSecond,
-		thread_id);
 #else
-	struct tm* cal_tm_ptr;
 	time_t	   tm;
-
 	struct tm  cal_tm;
 	time(&tm);
 	localtime_r(&tm, &cal_tm);
-	cal_tm_ptr = &cal_tm;
-	fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %#zx",
-		cal_tm_ptr->tm_year + 1900,
-		cal_tm_ptr->tm_mon + 1,
-		cal_tm_ptr->tm_mday,
-		cal_tm_ptr->tm_hour,
-		cal_tm_ptr->tm_min,
-		cal_tm_ptr->tm_sec,
-		thread_id);
 #endif
+	fprintf(file,
+		IF_WIN("%u-%02u-%02u %02u:%02u:%02u %#zx",
+		       "%d-%02d-%02d %02d:%02d:%02d %#zx"),
+#ifdef _WIN32
+		cal_tm.wYear,
+		cal_tm.wMonth,
+		cal_tm.wDay,
+		cal_tm.wHour,
+		cal_tm.wMinute,
+		cal_tm.wSecond,
+#else
+		cal_tm.tm_year + 1900,
+		cal_tm.tm_mon + 1,
+		cal_tm.tm_mday,
+		cal_tm.tm_hour,
+		cal_tm.tm_min,
+		cal_tm.tm_sec,
+#endif
+#ifdef UNIV_INNOCHECKSUM
+		ulint{0}
+#else
+		ulint(os_thread_get_curr_id())
+#endif
+		);
 }
 
 #ifndef UNIV_INNOCHECKSUM
@@ -111,31 +108,27 @@ ut_sprintf_timestamp(
 {
 #ifdef _WIN32
 	SYSTEMTIME cal_tm;
-
 	GetLocalTime(&cal_tm);
 
-	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
-		(int) cal_tm.wYear % 100,
-		(int) cal_tm.wMonth,
-		(int) cal_tm.wDay,
-		(int) cal_tm.wHour,
-		(int) cal_tm.wMinute,
-		(int) cal_tm.wSecond);
+	sprintf(buf, "%02u%02u%02u %2u:%02u:%02u",
+		cal_tm.wYear % 100,
+		cal_tm.wMonth,
+		cal_tm.wDay,
+		cal_tm.wHour,
+		cal_tm.wMinute,
+		cal_tm.wSecond);
 #else
-	struct tm* cal_tm_ptr;
 	time_t	   tm;
-
 	struct tm  cal_tm;
 	time(&tm);
 	localtime_r(&tm, &cal_tm);
-	cal_tm_ptr = &cal_tm;
 	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
-		cal_tm_ptr->tm_year % 100,
-		cal_tm_ptr->tm_mon + 1,
-		cal_tm_ptr->tm_mday,
-		cal_tm_ptr->tm_hour,
-		cal_tm_ptr->tm_min,
-		cal_tm_ptr->tm_sec);
+		cal_tm.tm_year % 100,
+		cal_tm.tm_mon + 1,
+		cal_tm.tm_mday,
+		cal_tm.tm_hour,
+		cal_tm.tm_min,
+		cal_tm.tm_sec);
 #endif
 }
 
@@ -215,27 +208,6 @@ ut_print_buf(
 	ut_print_buf_hex(o, buf, len);
 }
 
-/*************************************************************//**
-Calculates fast the number rounded up to the nearest power of 2.
-@return first power of 2 which is >= n */
-ulint
-ut_2_power_up(
-/*==========*/
-	ulint	n)	/*!< in: number != 0 */
-{
-	ulint	res;
-
-	res = 1;
-
-	ut_ad(n > 0);
-
-	while (res < n) {
-		res = res * 2;
-	}
-
-	return(res);
-}
-
 /** Get a fixed-length string, quoted as an SQL identifier.
 If the string contains a slash '/', the string will be
 output as two identifiers separated by a period (.),
diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc
index 45af449cff9..af56bb5cba0 100644
--- a/storage/innobase/ut/ut0wqueue.cc
+++ b/storage/innobase/ut/ut0wqueue.cc
@@ -44,7 +44,6 @@ ib_wqueue_create(void)
 	mutex_create(LATCH_ID_WORK_QUEUE, &wq->mutex);
 
 	wq->items = ib_list_create();
-	wq->event = os_event_create(0);
 	wq->length = 0;
 
 	return(wq);
@@ -59,7 +58,6 @@ ib_wqueue_free(
 {
 	mutex_free(&wq->mutex);
 	ib_list_free(wq->items);
-	os_event_destroy(wq->event);
 
 	ut_free(wq);
 }
@@ -79,94 +77,12 @@ ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, bool wq_locked)
 	ib_list_add_last(wq->items, item, heap);
 	wq->length++;
 	ut_ad(wq->length == ib_list_len(wq->items));
-	os_event_set(wq->event);
 
 	if (!wq_locked) {
 		mutex_exit(&wq->mutex);
 	}
 }
 
-/****************************************************************//**
-Wait for a work item to appear in the queue.
-@return work item */
-void*
-ib_wqueue_wait(
-/*===========*/
-	ib_wqueue_t*	wq)	/*!< in: work queue */
-{
-	ib_list_node_t*	node;
-
-	for (;;) {
-		os_event_wait(wq->event);
-
-		mutex_enter(&wq->mutex);
-
-		node = ib_list_get_first(wq->items);
-
-		if (node) {
-			ib_list_remove(wq->items, node);
-			if (!--wq->length) {
-				/* We must reset the event when the list
-				gets emptied. */
-				os_event_reset(wq->event);
-			}
-			ut_ad(wq->length == ib_list_len(wq->items));
-
-			break;
-		}
-
-		mutex_exit(&wq->mutex);
-	}
-
-	mutex_exit(&wq->mutex);
-
-	return(node->data);
-}
-
-
-/********************************************************************
-Wait for a work item to appear in the queue for specified time. */
-void*
-ib_wqueue_timedwait(
-/*================*/
-					/* out: work item or NULL on timeout*/
-	ib_wqueue_t*	wq,		/* in: work queue */
-	ulint		wait_in_usecs)	/* in: wait time in micro seconds */
-{
-	ib_list_node_t*	node = NULL;
-
-	for (;;) {
-		ulint		error;
-		int64_t		sig_count;
-
-		mutex_enter(&wq->mutex);
-
-		node = ib_list_get_first(wq->items);
-
-		if (node) {
-			ib_list_remove(wq->items, node);
-			wq->length--;
-			ut_ad(wq->length == ib_list_len(wq->items));
-			mutex_exit(&wq->mutex);
-			break;
-		}
-
-		sig_count = os_event_reset(wq->event);
-
-		mutex_exit(&wq->mutex);
-
-		error = os_event_wait_time_low(wq->event,
-					       (ulint) wait_in_usecs,
-					       sig_count);
-
-		if (error == OS_SYNC_TIME_EXCEEDED) {
-			break;
-		}
-	}
-
-	return(node ? node->data : NULL);
-}
-
 /********************************************************************
 Return first item on work queue or NULL if queue is empty
 @return work item or NULL */
@@ -184,16 +100,11 @@ ib_wqueue_nowait(
 
 		if (node) {
 			ib_list_remove(wq->items, node);
-
+			--wq->length;
+			ut_ad(wq->length == ib_list_len(wq->items));
 		}
 	}
 
-	/* We must reset the event when the list
-	gets emptied. */
-	if(ib_list_is_empty(wq->items)) {
-		os_event_reset(wq->event);
-	}
-
 	mutex_exit(&wq->mutex);
 
 	return (node ? node->data : NULL);
diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt
index 248f7cbe177..13d8035bdc8 100644
--- a/storage/maria/CMakeLists.txt
+++ b/storage/maria/CMakeLists.txt
@@ -1,4 +1,5 @@
 # Copyright (C) 2007 MySQL AB
+# Copyright (C) 2009,2020 MariaDB Corporation Ab
 # 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -13,14 +14,10 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1335  USA
 
-INCLUDE(CMakeDependentOption)
-
-INCLUDE_DIRECTORIES(
-${SSL_INCLUDE_DIRS}
-)
+INCLUDE_DIRECTORIES(${SSL_INCLUDE_DIRS})
 
 IF(SSL_DEFINES)
-SET_SOURCE_FILES_PROPERTIES(ma_crypt.c PROPERTIES COMPILE_FLAGS ${SSL_DEFINES})
+  SET_SOURCE_FILES_PROPERTIES(ma_crypt.c PROPERTIES COMPILE_FLAGS ${SSL_DEFINES})
 ENDIF()
 
 SET(ARIA_SOURCES ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c 
@@ -30,14 +27,14 @@ SET(ARIA_SOURCES ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c
             ma_rrnd.c ma_scan.c ma_cache.c 
             ma_statrec.c ma_packrec.c ma_dynrec.c 
             ma_blockrec.c ma_bitmap.c 
-            ma_update.c ma_write.c ma_unique.c 
+            ma_update.c ma_write.c ma_unique.c
             ma_delete.c 
             ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c 
             ma_rsamepos.c ma_panic.c ma_close.c ma_create.c
             ma_range.c ma_dbug.c ma_checksum.c 
             ma_changed.c ma_static.c ma_delete_all.c 
             ma_delete_table.c ma_rename.c  ma_check.c 
-            ma_keycache.c ma_preload.c ma_ft_parser.c 
+            ma_keycache.c ma_preload.c ma_ft_parser.c
             ma_ft_update.c ma_ft_boolean_search.c 
             ma_ft_nlq_search.c ft_maria.c ma_sort.c 
             ha_maria.cc trnman.c lockman.c
@@ -55,32 +52,29 @@ IF(APPLE)
   ADD_DEFINITIONS(-fno-common)
 ENDIF()
 
-MYSQL_ADD_PLUGIN(aria ${ARIA_SOURCES} 
-  STORAGE_ENGINE
-  MANDATORY
-  RECOMPILE_FOR_EMBEDDED)
-
-IF(NOT WITH_ARIA_STORAGE_ENGINE)
-  RETURN()
+IF(CMAKE_SYSTEM_NAME MATCHES AIX)
+  # Workaround linker bug on AIX
+  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-berok")
 ENDIF()
 
-TARGET_LINK_LIBRARIES(aria myisam
-  mysys mysys_ssl)
+MYSQL_ADD_PLUGIN(aria ${ARIA_SOURCES} STORAGE_ENGINE MANDATORY
+                 LINK_LIBRARIES myisam mysys mysys_ssl
+                 RECOMPILE_FOR_EMBEDDED)
 
-MYSQL_ADD_EXECUTABLE(aria_ftdump maria_ftdump.c COMPONENT Server)
+MYSQL_ADD_EXECUTABLE(aria_ftdump aria_ftdump.c COMPONENT Server)
 TARGET_LINK_LIBRARIES(aria_ftdump aria)
 
-MYSQL_ADD_EXECUTABLE(aria_chk maria_chk.c COMPONENT Server)
+MYSQL_ADD_EXECUTABLE(aria_chk aria_chk.c COMPONENT Server)
 TARGET_LINK_LIBRARIES(aria_chk aria)
 
-MYSQL_ADD_EXECUTABLE(aria_read_log maria_read_log.c COMPONENT Server)
+MYSQL_ADD_EXECUTABLE(aria_read_log aria_read_log.c COMPONENT Server)
 TARGET_LINK_LIBRARIES(aria_read_log aria)
 
-MYSQL_ADD_EXECUTABLE(aria_dump_log  maria_dump_log.c unittest/ma_loghandler_examples.c COMPONENT Server)
+MYSQL_ADD_EXECUTABLE(aria_dump_log aria_dump_log.c unittest/ma_loghandler_examples.c COMPONENT Server)
 TARGET_LINK_LIBRARIES(aria_dump_log aria)
 SET_TARGET_PROPERTIES(aria_dump_log PROPERTIES COMPILE_FLAGS "-DMARIA_DUMP_LOG")
 
-MYSQL_ADD_EXECUTABLE(aria_pack maria_pack.c COMPONENT Server)
+MYSQL_ADD_EXECUTABLE(aria_pack aria_pack.c COMPONENT Server)
 TARGET_LINK_LIBRARIES(aria_pack aria)
 
 IF(WITH_UNIT_TESTS)
@@ -110,6 +104,35 @@ IF (MSVC)
   SET_TARGET_PROPERTIES(aria_chk aria_pack PROPERTIES LINK_FLAGS "setargv.obj")
 ENDIF()
 
-CMAKE_DEPENDENT_OPTION(USE_ARIA_FOR_TMP_TABLES "Use Aria for temporary tables" ON
-                       "WITH_ARIA_STORAGE_ENGINE" OFF)
+OPTION(USE_ARIA_FOR_TMP_TABLES "Use Aria for temporary tables" ON)
 
+#
+# S3
+#
+INCLUDE (CheckIncludeFiles)
+
+SET(S3_SOURCES s3_func.c
+    libmarias3/src/debug.c libmarias3/src/error.c libmarias3/src/marias3.c
+    libmarias3/src/request.c libmarias3/src/response.c libmarias3/src/sha256.c
+    libmarias3/src/sha256-internal.c libmarias3/src/xml.c
+    libmarias3/src/assume_role.c)
+
+IF(NOT PLUGIN_S3 STREQUAL NO AND NOT WIN32)
+  FIND_PACKAGE(CURL)
+ENDIF()
+
+IF (CURL_FOUND)
+  INCLUDE_DIRECTORIES(${CURL_INCLUDE_DIRS})
+  MYSQL_ADD_PLUGIN(s3 ha_s3.cc ${S3_SOURCES} COMPONENT s3-engine
+    LINK_LIBRARIES ${CURL_LIBRARIES} z STORAGE_ENGINE NOT_EMBEDDED CONFIG s3.cnf)
+ENDIF()
+
+SET(CPACK_RPM_s3-engine_PACKAGE_SUMMARY "Amazon S3 archival storage engine for MariaDB" PARENT_SCOPE)
+SET(CPACK_RPM_s3-engine_PACKAGE_DESCRIPTION "The S3 storage engine allows one to archive MariaDB tables in Amazon S3 (or any third-party public or private cloud that implements S3 API), but still have them accessible in MariaDB in read-only mode." PARENT_SCOPE)
+
+IF(TARGET s3)  # the s3 plugin target is only created inside IF (CURL_FOUND) above
+  MYSQL_ADD_EXECUTABLE(aria_s3_copy aria_s3_copy.cc ${S3_SOURCES} COMPONENT s3-engine)
+  TARGET_LINK_LIBRARIES(aria_s3_copy aria myisam mysys mysys_ssl ${CURL_LIBRARIES} z)  # same libcurl as the s3 plugin
+  INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/libmarias3)
+  ADD_DEFINITIONS(-DWITH_S3_STORAGE_ENGINE)
+ENDIF()
diff --git a/storage/maria/maria_chk.c b/storage/maria/aria_chk.c
index 2f130de1c7a..30bb2cf0d96 100644
--- a/storage/maria/maria_chk.c
+++ b/storage/maria/aria_chk.c
@@ -19,10 +19,12 @@
 #include <myisamchk.h>
 #include <my_bit.h>
 #include <m_ctype.h>
-#include <stdarg.h>
 #include <my_getopt.h>
 #include <my_check_opt.h>
 #include <my_handler_errors.h>
+/* Remove next line if you want aria_chk to produce a stack trace */
+#undef HAVE_BACKTRACE
+#include <my_stacktrace.h>
 
 static uint decode_bits;
 static char **default_argv;
@@ -35,6 +37,7 @@ static MY_TMPDIR maria_chk_tmpdir;
 static my_bool opt_transaction_logging, opt_debug;
 static my_bool opt_ignore_control_file, opt_require_control_file;
 static my_bool opt_warning_for_wrong_transid, opt_update_state;
+static my_bool have_control_file= 0;
 
 static const char *type_names[]=
 {
@@ -117,16 +120,16 @@ static void my_exit(int exit_code)
          MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
   exit(exit_code);
 }
-  
 
-	/* Main program */
+/* Main program */
 
 int main(int argc, char **argv)
 {
   int error;
   MY_INIT(argv[0]);
 
-  default_log_dir= opt_log_dir= maria_data_root= (char *)".";
+  my_setup_stacktrace();
+  default_log_dir= opt_log_dir= maria_data_root= ".";
   maria_chk_init(&check_param);
   check_param.opt_lock_memory= 1;		/* Lock memory if possible */
   check_param.using_global_keycache = 0;
@@ -138,15 +141,24 @@ int main(int argc, char **argv)
                     HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
 
   maria_block_size= 0;                 /* Use block size from control file */
-  if (!opt_ignore_control_file &&
-      (ma_control_file_open(FALSE, opt_require_control_file ||
-                            !(check_param.testflag & T_SILENT)) &&
-       (opt_require_control_file ||
-        (opt_transaction_logging && (check_param.testflag & T_REP_ANY)))))
+  if (!opt_ignore_control_file)
   {
-    error= 1;
-    goto end;
+    if ((ma_control_file_open(FALSE, opt_require_control_file ||
+                              !(check_param.testflag & T_SILENT),
+                              TRUE)))
+    {
+      if (opt_require_control_file ||
+          (opt_transaction_logging && (check_param.testflag & T_REP_ANY)))
+      {
+        error= 1;
+        goto end;
+      }
+    }
+    else
+      have_control_file= 1;
   }
+  if (!have_control_file)
+    opt_warning_for_wrong_transid= 0;
 
   /*
     If we are doing a repair, user may want to store this repair into the log
@@ -315,7 +327,7 @@ static struct my_option my_long_options[] =
    0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0},
   {"datadir", 'h',
    "Path for control file (and logs if --logdir not used).",
-   &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
+   (char**) &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
    0, 0, 0, 0, 0, 0},
   {"logdir", OPT_LOG_DIR,
    "Path for log files.",
@@ -422,8 +434,8 @@ static struct my_option my_long_options[] =
     ~0UL, (long) MALLOC_OVERHEAD, (long) 1L, 0},
   { "sort_buffer_size", OPT_SORT_BUFFER_SIZE,
     "Size of sort buffer. Used by --recover",
-    &check_param.sort_buffer_length,
-    &check_param.sort_buffer_length, 0, GET_ULL, REQUIRED_ARG,
+    &check_param.orig_sort_buffer_length,
+    &check_param.orig_sort_buffer_length, 0, GET_ULL, REQUIRED_ARG,
     SORT_BUFFER_INIT, MIN_SORT_BUFFER, SIZE_T_MAX, MALLOC_OVERHEAD, 1L, 0},
   { "sort_key_blocks", OPT_SORT_KEY_BLOCKS,
     "Internal buffer for sorting keys; Don't touch :)",
@@ -449,7 +461,7 @@ static struct my_option my_long_options[] =
     (char**) &maria_stats_method_str, (char**) &maria_stats_method_str, 0,
     GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   { "zerofill", 'z',
-    "Fill empty space in data and index files with zeroes. This makes the data file movable between different servers.",
+    "Fill empty space in data and index files with zeroes. This makes the data file movable between different servers. It also fixes any wrong transaction or LSN numbers in the table after a crash or if someone removed the Aria log files.",
     0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
   { "zerofill-keep-lsn", OPT_ZEROFILL_KEEP_LSN,
     "Like --zerofill but does not zero out LSN of data/index pages;"
@@ -461,7 +473,7 @@ static struct my_option my_long_options[] =
 
 static void print_version(void)
 {
-  printf("%s  Ver 1.2 for %s on %s\n", my_progname, SYSTEM_TYPE,
+  printf("%s  Ver 1.3 for %s on %s\n", my_progname, SYSTEM_TYPE,
 	 MACHINE_TYPE);
 }
 
@@ -579,7 +591,7 @@ Recover (repair)/ options (When using '--recover' or '--safe-recover'):\n\
   -q, --quick         Faster repair by not modifying the data file.\n\
                       One can give a second '-q' to force aria_chk to\n\
 		      modify the original datafile in case of duplicate keys.\n\
-		      NOTE: Tables where the data file is currupted can't be\n\
+		      NOTE: Tables where the data file is corrupted can't be\n\
 		      fixed with this option.\n\
   -u, --unpack        Unpack file packed with ariapack.\n\
 ");
@@ -607,7 +619,9 @@ Recover (repair)/ options (When using '--recover' or '--safe-recover'):\n\
                       Find a record, a block at given offset belongs to.\n\
   -z,  --zerofill     Fill empty space in data and index files with zeroes.\n\
                       This makes the data file movable between different \n\
-                      servers.\n\
+                      servers.  It also fixes any wrong transaction or LSN\n\
+                      numbers in the table after a crash or if someone\n\
+                      removed the Aria log files.\n\
   --zerofill-keep-lsn Like --zerofill but does not zero out LSN of\n\
                       data/index pages.");
 
@@ -631,11 +645,11 @@ TYPELIB maria_stats_method_typelib= {
 	 /* Read options */
 
 static my_bool
-get_one_option(int optid,
-	       const struct my_option *opt __attribute__((unused)),
-	       char *argument)
+get_one_option(const struct my_option *opt,
+	       const char *argument,
+               const char *filename __attribute__((unused)))
 {
-  switch (optid) {
+  switch (opt->id) {
 #ifdef __NETWARE__
   case OPT_AUTO_CLOSE:
     setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
@@ -1023,10 +1037,11 @@ static int maria_chk(HA_CHECK *param, char *filename)
                         ((param->testflag & T_WAIT_FOREVER) ?
                          HA_OPEN_WAIT_IF_LOCKED :
                          (param->testflag & T_DESCRIPT) ?
-                         HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED))))
+                         HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED),
+                        0)))
   {
     /* Avoid twice printing of isam file name */
-    param->error_printed=1;
+    param->error_printed++;
     switch (my_errno) {
     case HA_ERR_CRASHED:
       _ma_check_print_error(param,"'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
@@ -1091,6 +1106,15 @@ static int maria_chk(HA_CHECK *param, char *filename)
       param->testflag|= T_REP_BY_SORT;
     }
   }
+  if ((share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED) &&
+      !(param->testflag & T_DESCRIPT))
+  {
+    _ma_check_print_warning(param,
+                            "Table %s is encrypted. Only --description (-d) "
+                            "option is supported", filename);
+    param->warning_printed= 0;
+    goto end2;
+  }
 
   /*
     Skip the checking of the file if:
@@ -1342,7 +1366,7 @@ static int maria_chk(HA_CHECK *param, char *filename)
       error= maria_zerofill(param, info, filename);
     if (!error)
     {
-      DBUG_PRINT("info", ("Reseting crashed state"));
+      DBUG_PRINT("info", ("Resetting crashed state"));
       share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS |
                                STATE_IN_REPAIR);
     }
@@ -1406,7 +1430,7 @@ static int maria_chk(HA_CHECK *param, char *filename)
            share->state.open_count != 0)
           && (param->testflag & T_UPDATE_STATE))
         info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
-      DBUG_PRINT("info", ("Reseting crashed state"));
+      DBUG_PRINT("info", ("Resetting crashed state"));
       share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS |
                                STATE_IN_REPAIR);
     }
@@ -1489,8 +1513,7 @@ end2:
                 "the --force (-f) option or by not using the --quick (-q) "
                 "flag\n");
     }
-    else if (!(param->error_printed & 2) &&
-	     !(param->testflag & T_FORCE_CREATE))
+    else if (!(param->testflag & T_FORCE_CREATE))
       fprintf(stderr, "Aria table '%s' is corrupted\nFix it using switch "
               "\"-r\" or \"-o\"\n", filename);
   }
@@ -1543,6 +1566,8 @@ static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
 
   if (param->testflag & T_VERBOSE)
   {
+    if (share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)
+      printf("Encrypted:           yes\n");
     printf("File-version:        %d\n",
 	   (int) share->state.header.file_version[3]);
     if (share->state.create_time)
@@ -1573,6 +1598,10 @@ static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
     if (share->state.changed & STATE_CRASHED)
       strmov(buff, share->state.changed & STATE_CRASHED_ON_REPAIR ?
              "crashed on repair" : "crashed");
+    else if (have_control_file &&
+             (share->state.changed & (STATE_MOVED | STATE_NOT_ZEROFILLED)) ==
+             (STATE_MOVED | STATE_NOT_ZEROFILLED))
+      strmov(buff, "moved from another system. Use --zerofill to fix it");
     else
     {
       if (share->state.open_count)
@@ -1591,6 +1620,8 @@ static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
 	pos=strmov(pos,"zerofilled,");
       if (!(share->state.changed & STATE_NOT_MOVABLE))
 	pos=strmov(pos,"movable,");
+      if (have_control_file && (share->state.changed & STATE_MOVED))
+	pos=strmov(pos,"moved,");
       pos[-1]=0;				/* Remove extra ',' */
     }
     printf("Status:              %s\n",buff);
@@ -1755,21 +1786,26 @@ static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
 	type=share->columndef[field].base_type;
       else
 	type=(enum en_fieldtype) share->columndef[field].type;
-      end=strmov(buff,field_pack[type]);
+      end= strmov(buff, field_pack[type]);
+      if (end != buff)
+      {
+        *(end++)=',';
+        *(end++)=' ';
+      }
       if (share->options & HA_OPTION_COMPRESS_RECORD)
       {
 	if (share->columndef[field].pack_type & PACK_TYPE_SELECTED)
-	  end=strmov(end,", not_always");
+	  end=strmov(end,"not_always, ");
 	if (share->columndef[field].pack_type & PACK_TYPE_SPACE_FIELDS)
-	  end=strmov(end,", no empty");
+	  end=strmov(end,"no empty, ");
 	if (share->columndef[field].pack_type & PACK_TYPE_ZERO_FILL)
 	{
-	  sprintf(end,", zerofill(%d)",share->columndef[field].space_length_bits);
+	  sprintf(end,"zerofill(%d), ",share->columndef[field].space_length_bits);
 	  end=strend(end);
 	}
       }
-      if (buff[0] == ',')
-	strmov(buff,buff+2);
+      if (end != buff)
+        end[-2]= 0;
       int10_to_str((long) share->columndef[field].length,length,10);
       null_bit[0]=null_pos[0]=0;
       if (share->columndef[field].null_bit)
@@ -1887,8 +1923,8 @@ static int maria_sort_records(HA_CHECK *param,
     goto err;
   }
 
-  if (!(sort_param.record=
-        (uchar*) my_malloc((uint) share->base.default_rec_buff_size, MYF(0))))
+  if (!(sort_param.record= (uchar*) my_malloc(PSI_INSTRUMENT_ME,
+                           (uint) share->base.default_rec_buff_size, MYF(0))))
   {
     _ma_check_print_error(param,"Not enough memory for record");
     goto err;
@@ -2005,9 +2041,10 @@ static int sort_record_index(MARIA_SORT_PARAM *sort_param,
   MARIA_HA *info= ma_page->info;
   MARIA_SHARE *share= info->s;
   uint	page_flag, nod_flag,used_length;
+  my_bool buff_alloced;
   uchar *temp_buff,*keypos,*endpos;
   my_off_t next_page,rec_pos;
-  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  uchar *lastkey;
   char llbuff[22];
   MARIA_SORT_INFO *sort_info= sort_param->sort_info;
   HA_CHECK *param=sort_info->param;
@@ -2016,20 +2053,24 @@ static int sort_record_index(MARIA_SORT_PARAM *sort_param,
   const MARIA_KEYDEF *keyinfo= ma_page->keyinfo;
   DBUG_ENTER("sort_record_index");
 
+  temp_buff=0;
   page_flag= ma_page->flag;
   nod_flag=  ma_page->node;
-  temp_buff=0;
   tmp_key.keyinfo= (MARIA_KEYDEF*) keyinfo;
-  tmp_key.data=    lastkey;
 
-  if (nod_flag)
+  alloc_on_stack(*info->stack_end_ptr, lastkey, buff_alloced,
+                 (nod_flag ? keyinfo->block_length  : 0) +
+                 ALIGN_SIZE(keyinfo->max_store_length));
+  if (!lastkey)
   {
-    if (!(temp_buff= (uchar*) my_alloca(tmp_key.keyinfo->block_length)))
-    {
-      _ma_check_print_error(param,"Not Enough memory");
-      DBUG_RETURN(-1);
-    }
+    _ma_check_print_error(param,"Not Enough memory");
+    DBUG_RETURN(-1);
   }
+  if (nod_flag)
+    temp_buff= lastkey + ALIGN_SIZE(keyinfo->max_store_length);
+
+  tmp_key.data=    lastkey;
+
   used_length= ma_page->size;
   keypos= ma_page->buff + share->keypage_header + nod_flag;
   endpos= ma_page->buff + used_length;
@@ -2084,12 +2125,11 @@ static int sort_record_index(MARIA_SORT_PARAM *sort_param,
     _ma_check_print_error(param,"%d when updating keyblock",my_errno);
     goto err;
   }
-  if (temp_buff)
-    my_afree(temp_buff);
+  stack_alloc_free(lastkey, buff_alloced);
   DBUG_RETURN(0);
+
 err:
-  if (temp_buff)
-    my_afree(temp_buff);
+  stack_alloc_free(lastkey, buff_alloced);
   DBUG_RETURN(1);
 } /* sort_record_index */
 
@@ -2100,7 +2140,7 @@ static my_bool write_log_record(HA_CHECK *param)
     Now that all operations including O_NEW_DATA|INDEX are successfully
     done, we can write a log record.
   */
-  MARIA_HA *info= maria_open(param->isam_file_name, O_RDWR, 0);
+  MARIA_HA *info= maria_open(param->isam_file_name, O_RDWR, 0, 0);
   if (info == NULL)
     _ma_check_print_error(param, default_open_errmsg, my_errno,
                           param->isam_file_name);
diff --git a/storage/maria/maria_dump_log.c b/storage/maria/aria_dump_log.c
index 998d2e74014..8e065e9ff9d 100644
--- a/storage/maria/maria_dump_log.c
+++ b/storage/maria/aria_dump_log.c
@@ -87,11 +87,11 @@ static void usage(void)
 
 
 static my_bool
-get_one_option(int optid __attribute__((unused)),
-               const struct my_option *opt __attribute__((unused)),
-               char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+               const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch (optid) {
+  switch (opt->id) {
   case '?':
     usage();
     exit(0);
@@ -143,7 +143,7 @@ int main(int argc, char **argv)
     translog_table_init();
   translog_fill_overhead_table();
 
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
 
   if ((handler= my_open(opt_file, O_RDONLY, MYF(MY_WME))) < 0)
   {
diff --git a/storage/maria/maria_ftdump.c b/storage/maria/aria_ftdump.c
index 75ff9bd2642..8526f5fa74a 100644
--- a/storage/maria/maria_ftdump.c
+++ b/storage/maria/aria_ftdump.c
@@ -21,7 +21,8 @@
 
 static void usage();
 static void complain(int val);
-static my_bool get_one_option(int, const struct my_option *, char *);
+static my_bool get_one_option(const struct my_option *, const char *,
+                              const char*);
 
 static int count=0, stats=0, dump=0, lstats=0;
 static my_bool verbose;
@@ -88,7 +89,7 @@ int main(int argc,char *argv[])
                  MARIA_KEY_BLOCK_LENGTH, 0, MY_WME);
 
   if (!(info=maria_open(argv[0], O_RDONLY,
-                        HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
+                        HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER, 0)))
   {
     error=my_errno;
     goto err;
@@ -232,10 +233,11 @@ err:
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+	       const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch(optid) {
+  switch(opt->id) {
   case 'd':
     dump=1;
     complain(count || query);
diff --git a/storage/maria/maria_pack.c b/storage/maria/aria_pack.c
index 24438f49c41..40e7e399613 100644
--- a/storage/maria/maria_pack.c
+++ b/storage/maria/aria_pack.c
@@ -204,6 +204,7 @@ static QUEUE queue;
 static HUFF_COUNTS *global_count;
 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 static const char *load_default_groups[]= { "ariapack",0 };
+static char **default_argv;
 
 /*
   Register handler error messages for usage with my_error()
@@ -225,11 +226,10 @@ int main(int argc, char **argv)
 {
   int error,ok;
   PACK_MRG_INFO merge;
-  char **default_argv;
   my_bool no_control_file= 0;
   MY_INIT(argv[0]);
 
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
   load_defaults_or_exit("my", load_default_groups, &argc, &argv);
   default_argv= argv;
   get_options(&argc,&argv);
@@ -239,7 +239,7 @@ int main(int argc, char **argv)
   if (!opt_ignore_control_file &&
       (no_control_file= ma_control_file_open(FALSE,
                                              (opt_require_control_file ||
-                                              !silent))) &&
+                                              !silent), FALSE)) &&
        opt_require_control_file)
   {
     error= 1;
@@ -293,6 +293,14 @@ end:
 #endif
 }
 
+static void my_exit(int error)          /* Release resources, then end the process with exit status 'error' */
+{
+  free_defaults(default_argv);          /* default_argv is assigned in main() right after load_defaults_or_exit() */
+  maria_end();
+  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); /* MY_GIVE_INFO is added only when verbose is set */
+  exit(error);                          /* does not return to the caller */
+}
+
 enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
 
 static struct my_option my_long_options[] =
@@ -308,7 +316,7 @@ static struct my_option my_long_options[] =
    (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"datadir", 'h',
    "Path for control file (and logs if --logdir not used).",
-   &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
+   (char**) &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
    0, 0, 0, 0, 0, 0},
   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
@@ -372,12 +380,13 @@ static void usage(void)
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument)
+get_one_option(const struct my_option *opt,
+	       const char *argument,
+               const char *filename __attribute__((unused)))
 {
   uint length;
 
-  switch(optid) {
+  switch(opt->id) {
 #ifdef __NETWARE__
   case OPT_AUTO_CLOSE:
     setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
@@ -414,11 +423,12 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
     break;
   case 'V':
     print_version();
-    exit(0);
+    my_exit(0);
+    break;
   case 'I':
   case '?':
     usage();
-    exit(0);
+    my_exit(0);
   }
   return 0;
 }
@@ -435,12 +445,12 @@ static void get_options(int *argc,char ***argv)
     write_loop=1;
 
   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
-    exit(ho_error);
+    my_exit(ho_error);
 
   if (!*argc)
   {
     usage();
-    exit(1);
+    my_exit(1);
   }
   if (join_table)
   {
@@ -500,7 +510,7 @@ static MARIA_HA *open_maria_file(char *name,int mode)
 
   if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
                              (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
-                              HA_OPEN_ABORT_IF_LOCKED))))
+                              HA_OPEN_ABORT_IF_LOCKED), 0)))
   {
     print_error(my_errno, name);
     DBUG_RETURN(0);
@@ -537,7 +547,7 @@ static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
   uint i,j;
   mrg->count=0;
   mrg->current=0;
-  mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
+  mrg->file=(MARIA_HA**) my_malloc(PSI_NOT_INSTRUMENTED, sizeof(MARIA_HA*)*count,MYF(MY_FAE));
   mrg->free_file=1;
   mrg->src_file_has_indexes_disabled= 0;
   for (i=0; i < count ; i++)
@@ -624,7 +634,7 @@ static int compress(PACK_MRG_INFO *mrg,char *result_table)
 	< 0)
       goto err;
     length=(uint) share->base.keystart;
-    if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
+    if (!(buff= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, length, MYF(MY_WME))))
       goto err;
     if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
 	my_write(join_maria_file,buff,length,
@@ -871,9 +881,8 @@ static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
 {
   reg2 uint i;
   reg1 HUFF_COUNTS *count;
-  if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
-					sizeof(HUFF_COUNTS),
-					MYF(MY_ZEROFILL | MY_WME))))
+  if ((count = (HUFF_COUNTS*) my_malloc(PSI_NOT_INSTRUMENTED,
+        info->s->base.fields*sizeof(HUFF_COUNTS), MYF(MY_ZEROFILL | MY_WME))))
   {
     for (i=0 ; i < info->s->base.fields ; i++)
     {
@@ -900,7 +909,8 @@ static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
 		NULL, MYF(0));
       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
 	count[col_nr].tree_pos=count[col_nr].tree_buff =
-	  my_malloc(count[col_nr].field_length > 1 ? tree_buff_length : 2,
+	  my_malloc(PSI_NOT_INSTRUMENTED,
+                    count[col_nr].field_length > 1 ? tree_buff_length : 2,
 		    MYF(MY_WME));
     }
   }
@@ -1544,8 +1554,8 @@ static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
   HUFF_TREE *huff_tree;
   DBUG_ENTER("make_huff_trees");
 
-  if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
-					 MYF(MY_WME | MY_ZEROFILL))))
+  if (!(huff_tree=(HUFF_TREE*) my_malloc(PSI_NOT_INSTRUMENTED,
+                         trees*sizeof(HUFF_TREE), MYF(MY_WME | MY_ZEROFILL))))
     DBUG_RETURN(0);
 
   for (tree=0 ; tree < trees ; tree++)
@@ -1622,16 +1632,15 @@ static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
   if (!huff_tree->element_buffer)
   {
     if (!(huff_tree->element_buffer=
-	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
+	 (HUFF_ELEMENT*) my_malloc(PSI_NOT_INSTRUMENTED,
+                                   found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
       return 1;
   }
   else
   {
     HUFF_ELEMENT *temp;
-    if (!(temp=
-	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
-				     found*2*sizeof(HUFF_ELEMENT),
-				     MYF(MY_WME))))
+    if (!(temp= (HUFF_ELEMENT*) my_realloc(PSI_NOT_INSTRUMENTED,
+           (uchar*) huff_tree->element_buffer, found*2*sizeof(HUFF_ELEMENT), MYF(MY_WME))))
       return 1;
     huff_tree->element_buffer=temp;
   }
@@ -1997,8 +2006,8 @@ static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
     {
       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
       if (!(huff_tree->code =
-            (ulonglong*) my_malloc(elements*
-                                   (sizeof(ulonglong) + sizeof(uchar)),
+            (ulonglong*) my_malloc(PSI_NOT_INSTRUMENTED,
+                                elements* (sizeof(ulonglong) + sizeof(uchar)),
                                    MYF(MY_WME | MY_ZEROFILL))))
 	return 1;
       huff_tree->code_len=(uchar*) (huff_tree->code+elements);
@@ -2899,8 +2908,8 @@ static char *make_old_name(char *new_name, char *old_name)
 static void init_file_buffer(File file, pbool read_buffer)
 {
   file_buffer.file=file;
-  file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
-					 MYF(MY_WME));
+  file_buffer.buffer= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED,
+                                   ALIGN_SIZE(RECORD_CACHE_SIZE), MYF(MY_WME));
   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
   file_buffer.pos_in_file=0;
   error_on_write=0;
@@ -2956,7 +2965,8 @@ static int flush_buffer(ulong neaded_length)
   {
     uchar *tmp;
     neaded_length+=256;				/* some margin */
-    tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
+    tmp= (uchar*) my_realloc(PSI_NOT_INSTRUMENTED, file_buffer.buffer,
+                             neaded_length,MYF(MY_WME));
     if (!tmp)
       return 1;
     file_buffer.pos=    (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
diff --git a/storage/maria/maria_read_log.c b/storage/maria/aria_read_log.c
index 59d5c8abce8..51bfa879702 100644
--- a/storage/maria/maria_read_log.c
+++ b/storage/maria/aria_read_log.c
@@ -1,5 +1,6 @@
 /* Copyright (C) 2007 MySQL AB
    Copyright (C) 2010 Monty Program Ab
+   Copyright (C) 2020 MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -29,15 +30,51 @@ const char *default_dbug_option= "d:t:O,\\aria_read_log.trace";
 const char *default_dbug_option= "d:t:o,/tmp/aria_read_log.trace";
 #endif
 #endif /* DBUG_OFF */
-static my_bool opt_display_only, opt_apply, opt_apply_undo, opt_silent;
-static my_bool opt_check;
+static my_bool opt_display_only, opt_apply, opt_silent, opt_apply_undo;
+static my_bool opt_check, opt_start_from_checkpoint;
 static my_bool opt_print_aria_log_control;
 static const char *opt_tmpdir;
 static ulong opt_translog_buffer_size;
 static ulonglong opt_page_buffer_size;
-static ulonglong opt_start_from_lsn, opt_end_lsn, opt_start_from_checkpoint;
+static ulonglong opt_start_from_lsn, opt_lsn_redo_end, opt_lsn_undo_end;
+static char *start_from_lsn_buf, *lsn_redo_end_buf, *lsn_undo_end_buf;
 static MY_TMPDIR maria_chk_tmpdir;
 
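+/*
+  Parse an LSN given as a command line string.
+  Formats supported:
+    number           (the whole LSN as one unsigned decimal integer)
+    file,0xoffset    (decimal file number, then offset as a hex number)
+*/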
+
+static ulonglong get_lsn(const char *lsn_str)
+{
+  ulong file, pos;                      /* parts of the "file,0xoffset" form */
+  unsigned long long lsn;               /* 64 bits even where long is 32 bits */
+  if (sscanf(lsn_str, " %lu,0x%lx", &file, &pos) == 2)
+    return MAKE_LSN(file, pos);
+  if (sscanf(lsn_str, " %llu", &lsn) == 1)      /* plain number: no truncation */
+    return (ulonglong) lsn;
+  return ~(ulonglong) 0;                        /* Error */
+}
+
+static my_bool get_lsn_arg(const char *lsn_string, ulonglong *lsn,
+                           const char *name)
+{
+  /* get_lsn() signals a string it cannot parse with an all-ones value */
+  const ulonglong parsed= get_lsn(lsn_string);
+  if (parsed == ~(ulonglong) 0)
+  {
+    fprintf(stderr,
+            "Wrong value '%s' for option %s. Value should be in format: "
+            "number,0xhexnumber\n",
+            lsn_string, name);
+    return 1;                                   /* *lsn is left untouched */
+  }
+  *lsn= parsed;                                 /* stored only on success */
+  return 0;
+}
+
 
 int main(int argc, char **argv)
 {
@@ -46,7 +83,7 @@ int main(int argc, char **argv)
   uint warnings_count;
   MY_INIT(argv[0]);
 
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
   sf_leaking_memory=1; /* don't report memory leaks on early exits */
   load_defaults_or_exit("my", load_default_groups, &argc, &argv);
   default_argv= argv;
@@ -67,7 +104,7 @@ int main(int argc, char **argv)
     goto end;
   }
   /* we don't want to create a control file, it MUST exist */
-  if (ma_control_file_open(FALSE, TRUE))
+  if (ma_control_file_open(FALSE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't open control file (%d)\n", errno);
     goto err;
@@ -136,17 +173,12 @@ int main(int argc, char **argv)
             LSN_IN_PARTS(lsn));
   }
 
-  if (opt_end_lsn != LSN_IMPOSSIBLE)
-  {
-    /* We can't apply undo if we use end_lsn */
-    opt_apply_undo= 0;
-  }
-
   fprintf(stdout, "TRACE of the last aria_read_log\n");
-  if (maria_apply_log(lsn, opt_end_lsn, opt_apply ?  MARIA_LOG_APPLY :
+  if (maria_apply_log(lsn, opt_lsn_redo_end, opt_lsn_undo_end,
+                      opt_apply ?  MARIA_LOG_APPLY :
                       (opt_check ? MARIA_LOG_CHECK :
                        MARIA_LOG_DISPLAY_HEADER), opt_silent ? NULL : stdout,
-                      opt_apply_undo, FALSE, FALSE, &warnings_count))
+                      FALSE, FALSE, &warnings_count))
     goto err;
   if (warnings_count == 0)
     fprintf(stdout, "%s: SUCCESS\n", my_progname_short);
@@ -204,12 +236,19 @@ static struct my_option my_long_options[] =
   {"display-only", 'd', "display brief info read from records' header",
    &opt_display_only, &opt_display_only, 0, GET_BOOL,
    NO_ARG,0, 0, 0, 0, 0, 0},
-  { "end-lsn", 'e', "Stop applying at this lsn. If end-lsn is used, UNDO:s "
-    "will not be applied", &opt_end_lsn, &opt_end_lsn,
-    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  { "end-lsn", 'e', "Alias for lsn-redo-end",
+    &lsn_redo_end_buf, &lsn_redo_end_buf, 0, GET_STR, REQUIRED_ARG, 0, 0,
+    0, 0, 0, 0 },
+  { "lsn-redo-end", 'e', "Stop applying at this lsn during redo. If "
+    "this option is used UNDO:s will not be applied unless --lsn-undo-end is "
+    "given", &lsn_redo_end_buf,
+    &lsn_redo_end_buf, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0 },
+  { "lsn-undo-end", 'E', "Stop applying undo after this lsn has been applied",
+    &lsn_undo_end_buf, &lsn_undo_end_buf, 0, GET_STR, REQUIRED_ARG, 0, 0,
+    0, 0, 0, 0 },
   {"aria-log-dir-path", 'h',
     "Path to the directory where to store transactional log",
-    (uchar **) &maria_data_root, (uchar **) &maria_data_root, 0,
+    (char **) &maria_data_root, (char **) &maria_data_root, 0,
     GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   { "page-buffer-size", 'P',
     "The size of the buffer used for index blocks for Aria tables",
@@ -230,7 +269,7 @@ static struct my_option my_long_options[] =
    &opt_silent, &opt_silent, 0,
    GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
   {"tables-to-redo", 'T',
-   "List of tables sepearated with , that we should apply REDO on. Use this if you only want to recover some tables",
+   "List of tables separated with , that we should apply REDO on. Use this if you only want to recover some tables",
    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"tmpdir", 't', "Path for temporary files. Multiple paths can be specified, "
    "separated by "
@@ -246,7 +285,9 @@ static struct my_option my_long_options[] =
     GET_ULONG, REQUIRED_ARG, (long) TRANSLOG_PAGECACHE_SIZE,
     1024L*1024L, (long) ~(ulong) 0, (long) MALLOC_OVERHEAD,
     (long) IO_SIZE, 0},
-  {"undo", 'u', "Apply UNDO records to tables. (disable with --disable-undo)",
+  {"undo", 'u',
+   "Apply UNDO records to tables. (disable with --disable-undo). "
+   "Will be automatically set if lsn-undo-end is used",
    (uchar **) &opt_apply_undo, (uchar **) &opt_apply_undo, 0,
    GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
   {"verbose", 'v', "Print more information during apply/undo phase",
@@ -257,10 +298,9 @@ static struct my_option my_long_options[] =
   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
 };
 
-
 static void print_version(void)
 {
-  printf("%s Ver 1.4 for %s on %s\n",
+  printf("%s Ver 1.5 for %s on %s\n",
               my_progname_short, SYSTEM_TYPE, MACHINE_TYPE);
 }
 
@@ -268,7 +308,7 @@ static void print_version(void)
 static void usage(void)
 {
   print_version();
-  puts("Copyright (C) 2007 MySQL AB, 2009-2011 Monty Program Ab");
+  puts("Copyright (C) 2007 MySQL AB, 2009-2011 Monty Program Ab, 2020 MariaDB Corporation");
   puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
   puts("and you are welcome to modify and redistribute it under the GPL license\n");
 
@@ -284,7 +324,7 @@ static void usage(void)
          my_progname_short);
   printf("or\n");
   printf("Usage: %s OPTIONS -h `aria_log_directory` "
-         "--print-aria-log-control\n\n",
+         "--print-log-control-file\n\n",
          my_progname_short);
 
   my_print_help(my_long_options);
@@ -302,23 +342,26 @@ static uchar* my_hash_get_string(const uchar *record, size_t *length,
 
 
 static my_bool
-get_one_option(int optid __attribute__((unused)),
-               const struct my_option *opt __attribute__((unused)),
-               char *argument)
+get_one_option(const struct my_option *opt,
+               const char *argument,
+               const char *filename __attribute__((unused)))
 {
-  switch (optid) {
+  switch (opt->id) {
   case '?':
     usage();
     exit(0);
   case 'V':
     print_version();
     exit(0);
+  case 'E':
+    opt_apply_undo= TRUE;
+    break;
   case 'T':
   {
     char *pos;
     if (!my_hash_inited(&tables_to_redo))
     {
-      my_hash_init2(&tables_to_redo, 16, &my_charset_bin,
+      my_hash_init2(PSI_INSTRUMENT_ME, &tables_to_redo, 16, &my_charset_bin,
                     16, 0, 0, my_hash_get_string, 0, 0, HASH_UNIQUE);
     }
     do
@@ -342,13 +385,34 @@ get_one_option(int optid __attribute__((unused)),
 static void get_options(int *argc,char ***argv)
 {
   int ho_error;
-  my_bool need_help= 0;
+  my_bool need_help= 0, need_abort= 0;
 
   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
     exit(ho_error);
 
+  if (start_from_lsn_buf)
+  {
+    if (get_lsn_arg(start_from_lsn_buf, &opt_start_from_lsn,
+                    "start-from-lsn"))
+      need_abort= 1;
+  }
+  if (lsn_redo_end_buf)
+  {
+    if (get_lsn_arg(lsn_redo_end_buf, &opt_lsn_redo_end,
+                    "lsn-redo-end"))
+      need_abort= 1;
+  }
+  if (lsn_undo_end_buf)
+  {
+    if (get_lsn_arg(lsn_undo_end_buf, &opt_lsn_undo_end,
+                    "lsn-undo-end"))
+      need_abort= 1;
+  }
+
   if (!opt_apply)
     opt_apply_undo= FALSE;
+  if (!opt_apply_undo)
+    opt_lsn_undo_end= LSN_MAX;
 
   if (*argc > 0)
   {
@@ -357,21 +421,20 @@ static void get_options(int *argc,char ***argv)
   }
   if ((opt_display_only + opt_apply + opt_print_aria_log_control) != 1)
   {
-    need_help= 1;
+    need_abort= 1;
     fprintf(stderr,
             "You must use one and only one of the options 'display-only', \n"
             "'print-log-control-file' and 'apply'\n");
   }
 
-  if (need_help)
+  if (need_help || need_abort)
   {
     fflush(stderr);
-    need_help =1;
-    usage();
+    if (need_help)
+      usage();
     exit(1);
   }
   if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir))
     exit(1);
   maria_tmpdir= &maria_chk_tmpdir;
 }
-
diff --git a/storage/maria/aria_s3_copy.cc b/storage/maria/aria_s3_copy.cc
new file mode 100644
index 00000000000..77c41ba4572
--- /dev/null
+++ b/storage/maria/aria_s3_copy.cc
@@ -0,0 +1,348 @@
+/* Copyright (C) 2019 MariaDB corporation
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
+
+/*
+  Allow copying of Aria tables to and from S3 and also delete them from S3
+*/
+
+#include <my_global.h>
+#include <m_string.h>
+#include "maria_def.h"
+#include <aria_backup.h>
+#include <my_getopt.h>
+#include <my_check_opt.h>
+#include <mysys_err.h>
+#include <mysqld_error.h>
+#include <zlib.h>
+#include <libmarias3/marias3.h>
+#include "s3_func.h"
+
+static const char *op_types[]= {"to_s3", "from_s3", "delete_from_s3", NullS};
+static TYPELIB op_typelib= {array_elements(op_types)-1,"", op_types, NULL};
+#define OP_IMPOSSIBLE array_elements(op_types) /* marker: no --op was given */
+
+static const char *load_default_groups[]= { "aria_s3_copy", 0 };
+static const char *opt_s3_access_key, *opt_s3_secret_key;
+static const char *opt_s3_region="eu-north-1";
+static const char *opt_s3_host_name= DEFAULT_AWS_HOST_NAME;
+static const char *opt_database;        /* NULL: derive database from path */
+static const char *opt_s3_bucket="MariaDB";
+static my_bool opt_compression, opt_verbose, opt_force, opt_s3_debug;
+static my_bool opt_s3_use_http;
+static ulong opt_operation= OP_IMPOSSIBLE, opt_protocol_version= 1;
+static ulong opt_block_size;
+static ulong opt_s3_port;               /* 0: use the default port */
+static char **default_argv=0;           /* set in get_options, freed in my_exit */
+static ms3_st *global_s3_client= 0;     /* set in main, released in my_exit */
+
+
+/* Command line options; s3_port is limited to the valid TCP range 0..65535 */
+static struct my_option my_long_options[] =
+{
+  {"help", '?', "Display this help and exit.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0,
+   0, 0, 0, 0, 0},
+  {"s3_access_key", 'k', "AWS access key ID",
+   (char**) &opt_s3_access_key, (char**) &opt_s3_access_key, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"s3_region", 'r', "AWS region",
+   (char**) &opt_s3_region, (char**) &opt_s3_region, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"s3_secret_key", 'K', "AWS secret access key ID",
+   (char**) &opt_s3_secret_key, (char**) &opt_s3_secret_key, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"s3_bucket", 'b', "AWS prefix for tables",
+   (char**) &opt_s3_bucket, (char**) &opt_s3_bucket, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"s3_host_name", 'h', "Host name to S3 provider",
+   (char**) &opt_s3_host_name, (char**) &opt_s3_host_name, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"s3_port", 'p', "Port number to connect to (0 means use default)",
+   (char**) &opt_s3_port, (char**) &opt_s3_port, 0, GET_ULONG, REQUIRED_ARG,
+   0, 0, 65535, 0, 1, 0 },
+  {"s3_use_http", 'P', "If true, force use of HTTP protocol",
+   (char**) &opt_s3_use_http, (char**) &opt_s3_use_http,
+   0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"compress", 'c', "Use compression", &opt_compression, &opt_compression,
+   0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"op", 'o', "Operation to execute. One of 'from_s3', 'to_s3' or "
+   "'delete_from_s3'",
+   &opt_operation, &opt_operation, &op_typelib,
+   GET_ENUM, REQUIRED_ARG, OP_IMPOSSIBLE, 0, 0, 0, 0, 0},
+  {"database", 'd', "Database for copied table (second prefix). "
+   "If not given, the directory of the table file is used",
+   &opt_database, &opt_database, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"s3_block_size", 'B', "Block size for data/index blocks in s3",
+   &opt_block_size, &opt_block_size, 0, GET_ULONG, REQUIRED_ARG,
+   4*1024*1024, 64*1024, 16*1024*1024, MALLOC_OVERHEAD, 1024, 0 },
+  {"s3_protocol_version", 'L',
+   "Protocol used to communication with S3. One of \"Auto\", \"Amazon\" or \"Original\".",
+   &opt_protocol_version, &opt_protocol_version, &s3_protocol_typelib,
+   GET_ENUM, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"force", 'f', "Force copy even if target exists",
+   &opt_force, &opt_force, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Write more information", &opt_verbose, &opt_verbose,
+   0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"s3_debug",0, "Output debug log from marias3 to stdout",
+  &opt_s3_debug, &opt_s3_debug, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+};
+
+
+static bool get_database_from_path(char *to, size_t to_length, const char *path);
+
+
+static void print_version(void)
+{
+  /* Banner: program name, tool version, SYSTEM_TYPE and MACHINE_TYPE */
+  printf("%s  Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
+}
+
+/* Print banner, synopsis (using the option names that exist) and option help */
+static void usage(void)
+{
+  print_version();
+  puts("\nThis software comes with NO WARRANTY:  see the PUBLIC for details.\n");
+  puts("Copy an Aria table to and from s3");
+  printf("Usage: %s --s3-access-key=# --s3-secret-key=# --s3-region=# "
+         "--op=(from_s3 | to_s3 | delete_from_s3) [OPTIONS] tables[.MAI]\n",
+         my_progname_short);
+  print_defaults("my", load_default_groups);
+  puts("");
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
+
+
+ATTRIBUTE_NORETURN static void my_exit(int exit_code)
+{
+  /* The S3 connection may not exist yet when exiting from get_options() */
+  if (global_s3_client != 0)
+    ms3_deinit(global_s3_client);
+  global_s3_client= 0;
+  /* Then release the option defaults, the S3 library and finally mysys */
+  free_defaults(default_argv);
+  s3_deinit_library();
+  my_end(MY_CHECK_ERROR);
+  exit(exit_code);
+}
+
+extern "C" my_bool get_one_option(const struct my_option *opt,
+                                  const char *argument,
+                                  const char *filename __attribute__((unused)))
+{
+  switch (opt->id) {                    /* id as given in my_long_options */
+  case 'V':
+    print_version();
+    my_exit(0);                         /* my_exit() does not return */
+  case '?':
+    usage();
+    my_exit(0);                         /* my_exit() does not return */
+  case '#':
+    DBUG_SET_INITIAL(argument ? argument : "d:t:o,/tmp/aria_s3_copy.trace");
+    break;
+  }
+  return 0;                             /* all other options: nothing to do */
+}
+
+
+/* Load defaults, parse the command line and check the mandatory options */
+static void get_options(int *argc, char ***argv)
+{
+  int ho_error;
+
+  load_defaults_or_exit("my", load_default_groups, argc, argv);
+  default_argv= *argv;                  /* kept so my_exit() can free it */
+
+  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+    my_exit(ho_error);
+
+  if (*argc == 0)                       /* no table arguments left */
+  {
+    usage();
+    my_exit(-1);
+  }
+  /* Report missing options under the names used in my_long_options */
+  if (!opt_s3_access_key)
+  {
+    fprintf(stderr, "--s3-access-key was not given\n");
+    my_exit(-1);
+  }
+  if (!opt_s3_secret_key)
+  {
+    fprintf(stderr, "--s3-secret-key was not given\n");
+    my_exit(-1);
+  }
+  if (opt_operation == OP_IMPOSSIBLE)
+  {
+    fprintf(stderr, "You must specify an operation with --op=[from_s3|to_s3|delete_from_s3]\n");
+    my_exit(-1);
+  }
+  if (opt_s3_debug)
+    ms3_debug();
+} /* get_options */
+
+
+/* Connect to S3, apply connection options, then run --op on every table */
+int main(int argc, char** argv)
+{
+  MY_INIT(argv[0]);
+  get_options(&argc,(char***) &argv);
+  size_t block_size= opt_block_size;
+
+  s3_init_library();
+  if (!(global_s3_client= ms3_init(opt_s3_access_key,
+                                   opt_s3_secret_key,
+                                   opt_s3_region, opt_s3_host_name)))
+  {
+    fprintf(stderr, "Can't open connection to S3, error: %d %s\n", errno,
+            ms3_error(errno));
+    my_exit(1);
+  }
+
+  ms3_set_option(global_s3_client, MS3_OPT_BUFFER_CHUNK_SIZE, &block_size);
+
+  if (opt_protocol_version)
+  {
+    uint8_t protocol_version= (uint8_t) opt_protocol_version;
+    ms3_set_option(global_s3_client, MS3_OPT_FORCE_PROTOCOL_VERSION,
+                   &protocol_version);
+  }
+  if (opt_s3_port)
+  {
+    int port= (int) opt_s3_port;
+    ms3_set_option(global_s3_client, MS3_OPT_PORT_NUMBER, &port);
+  }
+  if (opt_s3_use_http)
+    ms3_set_option(global_s3_client, MS3_OPT_USE_HTTP, NULL);
+
+  for (; *argv ; argv++)                /* one iteration per table argument */
+  {
+    char database[FN_REFLEN], table_name[FN_REFLEN], *path;
+    const char *db;
+
+    path= *argv;
+
+    fn_format(table_name, path, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT);
+
+    /* Get database from option, path or current directory */
+    if (!(db= opt_database))
+    {
+      if (get_database_from_path(database, sizeof(database), path))
+      {
+        fprintf(stderr, "Aborting copying of %s\n", path);
+        my_exit(-1);
+      }
+      db= database;
+    }
+
+    switch (opt_operation) {            /* index into op_types[] */
+    case 0:                             /* to_s3 */
+      /* Don't copy .frm file for partitioned table */
+      if (aria_copy_to_s3(global_s3_client, opt_s3_bucket, path,
+                          db, table_name, opt_block_size, opt_compression,
+                          opt_force, opt_verbose, !strstr(table_name, "#P#")))
+      {
+        fprintf(stderr, "Aborting copying of %s\n", path);
+        my_exit(-1);
+      }
+      break;
+    case 1:                             /* from_s3 */
+      if (aria_copy_from_s3(global_s3_client, opt_s3_bucket, path,
+                          db, opt_compression, opt_force, opt_verbose))
+      {
+        fprintf(stderr, "Aborting copying of %s\n", path);
+        my_exit(-1);
+      }
+      break;
+    case 2:                             /* delete_from_s3 */
+      if (aria_delete_from_s3(global_s3_client, opt_s3_bucket, db,
+                              table_name, opt_verbose))
+      {
+        fprintf(stderr, "Aborting delete of %s\n", path);
+        my_exit(-1);
+      }
+      break;
+    }
+  }
+  my_exit(0);
+  return 0;
+}
+
+
+/**
+  Calculate database name based on path of Aria file; if that fails, retry
+  once with the current working directory (no unbounded recursion)
+
+  @return 0 ok, database name copied to 'to'; 1 error
+*/
+
+static bool get_database_from_path(char *to, size_t to_length,
+                                   const char *path)
+{
+  S3_INFO s3;
+  if (set_database_and_table_from_path(&s3, path))
+  {
+    /* Retry with cwd exactly once; 'to' doubles as the cwd buffer */
+    if (my_getwd(to, to_length-1, MYF(MY_WME)) ||
+        set_database_and_table_from_path(&s3, to))
+      return 1;
+  }
+  strmake(to, s3.database.str, MY_MIN(s3.database.length, to_length-1));
+  return 0;
+}
+
+
+#include "ma_check_standalone.h"
+
+/*
+  Declare all symbols from libmyisam.a, to ensure that we don't have
+  to include the library as it pulls in ha_myisam.cc
+*/
+
+const char *ft_boolean_syntax= 0;               /* placeholder definition */
+ulong ft_min_word_len=0, ft_max_word_len=0;     /* placeholder definitions */
+const HA_KEYSEG ft_keysegs[FT_SEGS]= {
+{
+  0,                                            /* charset  */
+  HA_FT_WLEN,                                   /* start */
+  0,                                            /* null_pos */
+  0,                                            /* Bit pos */
+  HA_VAR_LENGTH_PART | HA_PACK_KEY,             /* flag */
+  HA_FT_MAXBYTELEN,                             /* length */
+  63,                                           /* language (will be overwritten
+) */
+  HA_KEYTYPE_VARTEXT2,                          /* type */
+  0,                                            /* null_bit */
+  2, 0                                          /* bit_start, bit_length */
+},
+{
+  0, 0, 0, 0, HA_NO_SORT, HA_FT_WLEN, 63, HA_FT_WTYPE, 0, 0, 0
+}
+};
+
+struct st_mysql_ftparser ft_default_parser=
+{
+  MYSQL_FTPARSER_INTERFACE_VERSION, 0, 0, 0
+};
+
+C_MODE_START
+int is_stopword(const char *word, size_t len) { return 0; } /* always 0 */
+C_MODE_END
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 0c9db691ca5..cc18163194d 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -1,6 +1,6 @@
 /* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
    Copyright (C) 2008-2009 Sun Microsystems, Inc.
-   Copyright (c) 2009, 2017, MariaDB Corporation.
+   Copyright (c) 2009, 2021, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -50,19 +50,18 @@ C_MODE_END
   Note that in future versions, only *transactional* Maria tables can
   rollback, so this flag should be up or down conditionally.
 */
-#ifdef MARIA_CANNOT_ROLLBACK
-#define CANNOT_ROLLBACK_FLAG HA_NO_TRANSACTIONS
-#define trans_register_ha(A, B, C)  do { /* nothing */ } while(0)
+#ifdef ARIA_HAS_TRANSACTIONS
+#define TRANSACTION_STATE
 #else
-#define CANNOT_ROLLBACK_FLAG 0
+#define TRANSACTION_STATE HA_NO_TRANSACTIONS
 #endif
-#define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))
+
+#define THD_TRN (TRN*) thd_get_ha_data(thd, maria_hton)
 
 ulong pagecache_division_limit, pagecache_age_threshold, pagecache_file_hash_size;
 ulonglong pagecache_buffer_size;
 const char *zerofill_error_msg=
-  "Table is from another system and must be zerofilled or repaired to be "
-  "usable on this system";
+  "Table is probably from another system and must be zerofilled or repaired ('REPAIR TABLE table_name') to be usable on this system";
 
 /**
    As the auto-repair is initiated when opened from the SQL layer
@@ -187,12 +186,11 @@ static MYSQL_SYSVAR_BOOL(page_checksum, maria_page_checksums, 0,
        "with PAGE_CHECKSUM clause in CREATE TABLE)", 0, 0, 1);
 
 /* It is only command line argument */
-static MYSQL_SYSVAR_STR(log_dir_path, maria_data_root,
+static MYSQL_SYSVAR_CONST_STR(log_dir_path, maria_data_root,
        PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
        "Path to the directory where to store transactional log",
        NULL, NULL, mysql_real_data_home);
 
-
 static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size,
        PLUGIN_VAR_RQCMDARG,
        "Limit for transaction log size",
@@ -210,7 +208,7 @@ static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
 
 static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
        PLUGIN_VAR_RQCMDARG,
-       "Interval between commite in microseconds (1/1000000c)."
+       "Interval between commits in microseconds (1/1000000 sec)."
        " 0 stands for no waiting"
        " for other threads to come and do a commit in \"hard\" mode and no"
        " sync()/commit at all in \"soft\" mode.  Option has only an effect"
@@ -268,8 +266,9 @@ static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG,
 
 static MYSQL_THDVAR_ULONGLONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG,
        "The buffer that is allocated when sorting the index when doing a "
-       "REPAIR or when creating indexes with CREATE INDEX or ALTER TABLE.", NULL, NULL,
-       SORT_BUFFER_INIT, MIN_SORT_BUFFER, SIZE_T_MAX, 1);
+       "REPAIR or when creating indexes with CREATE INDEX or ALTER TABLE.",
+       NULL, NULL,
+       SORT_BUFFER_INIT, MIN_SORT_BUFFER, SIZE_T_MAX/2, 1);
 
 static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG,
        "Specifies how Aria index statistics collection code should treat "
@@ -287,7 +286,7 @@ static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir, PLUGIN_VAR_RQCMDARG,
 #endif
 my_bool use_maria_for_temp_tables= USE_ARIA_FOR_TMP_TABLES_VAL;
 
-static MYSQL_SYSVAR_BOOL(used_for_temp_tables, 
+static MYSQL_SYSVAR_BOOL(used_for_temp_tables,
        use_maria_for_temp_tables, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT,
        "Whether temporary tables should be MyISAM or Aria", 0, 0,
        1);
@@ -527,7 +526,7 @@ static int table2maria(TABLE *table_arg, data_file_type row_type,
   if (row_type == BLOCK_RECORD)
     options|= HA_OPTION_PACK_RECORD;
 
-  if (!(my_multi_malloc(MYF(MY_WME),
+  if (!(my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME),
           recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF),
           keydef_out, share->keys * sizeof(MARIA_KEYDEF),
           &keyseg,
@@ -911,7 +910,7 @@ void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...)
 {
   va_list args;
   DBUG_ENTER("_ma_check_print_error");
-  param->error_printed |= 1;
+  param->error_printed++;
   param->out_flag |= O_DATA_LOST;
   if (param->testflag & T_SUPPRESS_ERR_HANDLING)
     DBUG_VOID_RETURN;
@@ -937,7 +936,7 @@ void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...)
 {
   va_list args;
   DBUG_ENTER("_ma_check_print_warning");
-  param->warning_printed= 1;
+  param->warning_printed++;
   param->out_flag |= O_DATA_LOST;
   va_start(args, fmt);
   _ma_check_print_msg(param, MA_CHECK_WARNING, fmt, args);
@@ -964,17 +963,17 @@ static int maria_create_trn_for_mysql(MARIA_HA *info)
 
   if (!trn)  /* no transaction yet - open it now */
   {
-    trn= trnman_new_trn(& thd->transaction.wt);
+    trn= trnman_new_trn(& thd->transaction->wt);
     if (unlikely(!trn))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-    THD_TRN= trn;
+    thd_set_ha_data(thd, maria_hton, trn);
     if (thd->variables.option_bits & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
-      trans_register_ha(thd, TRUE, maria_hton);
+      trans_register_ha(thd, TRUE, maria_hton, trn->trid);
   }
   _ma_set_trn_for_table(info, trn);
   if (!trnman_increment_locked_tables(trn))
   {
-    trans_register_ha(thd, FALSE, maria_hton);
+    trans_register_ha(thd, FALSE, maria_hton, trn->trid);
     trnman_new_statement(trn);
   }
 #ifdef EXTRA_DEBUG
@@ -992,7 +991,7 @@ static int maria_create_trn_for_mysql(MARIA_HA *info)
     DBUG_PRINT("info", ("lock_type: %d  trnman_flags: %u",
                         info->lock_type, trnman_get_flags(trn)));
   }
-  
+
 #endif
   DBUG_RETURN(0);
 }
@@ -1025,7 +1024,7 @@ handler(hton, table_arg), file(0),
 int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER |
                 HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
                 HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY |
-                HA_FILE_BASED | HA_CAN_GEOMETRY | CANNOT_ROLLBACK_FLAG |
+                HA_FILE_BASED | HA_CAN_GEOMETRY | TRANSACTION_STATE |
                 HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS | HA_CAN_REPAIR |
                 HA_CAN_VIRTUAL_COLUMNS | HA_CAN_EXPORT |
                 HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT |
@@ -1076,7 +1075,7 @@ ulong ha_maria::index_flags(uint inx, uint part, bool all_parts) const
   ulong flags;
   if (table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT)
     flags= 0;
-  else 
+  else
   if ((table_share->key_info[inx].flags & HA_SPATIAL ||
       table_share->key_info[inx].algorithm == HA_KEY_ALG_RTREE))
   {
@@ -1084,7 +1083,7 @@ ulong ha_maria::index_flags(uint inx, uint part, bool all_parts) const
     flags= HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE |
            HA_READ_ORDER | HA_KEYREAD_ONLY | HA_KEY_SCAN_NOT_ROR;
   }
-  else 
+  else
   {
     flags= HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE |
           HA_READ_ORDER | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN;
@@ -1096,7 +1095,8 @@ ulong ha_maria::index_flags(uint inx, uint part, bool all_parts) const
 double ha_maria::scan_time()
 {
   if (file->s->data_file_type == BLOCK_RECORD)
-    return ulonglong2double(stats.data_file_length - file->s->block_size) / MY_MAX(file->s->block_size / 2, IO_SIZE) + 2;
+    return (ulonglong2double(stats.data_file_length - file->s->block_size) /
+            file->s->block_size) + 2;
   return handler::scan_time();
 }
 
@@ -1105,11 +1105,8 @@ double ha_maria::scan_time()
   splitting algorithms depends on this. (With only one key on a page
   we also can't use any compression, which may make the index file much
   larger)
-  We use HA_MAX_KEY_LENGTH as this is a stack restriction imposed by the
-  handler interface.  If we want to increase this, we have also to
-  increase HA_MARIA_KEY_BUFF and MARIA_MAX_KEY_BUFF as the buffer needs
-  to take be able to store the extra lenght bytes that is part of the stored
-  key.
+  We use MARIA_MAX_KEY_LENGTH to limit the key size as we don't want to use
+  too much stack when searching in the b_tree.
 
   We also need to reserve place for a record pointer (8) and 3 bytes
   per key segment to store the length of the segment + possible null bytes.
@@ -1154,7 +1151,11 @@ int ha_maria::open(const char *name, int mode, uint test_if_locked)
     test_if_locked|= HA_OPEN_ABORT_IF_CRASHED;
   }
 
-  if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER)))
+  if (aria_readonly)
+    test_if_locked|= HA_OPEN_IGNORE_MOVED_STATE;
+
+  if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER,
+                         s3_open_args())))
   {
     if (my_errno == HA_ERR_OLD_FILE)
     {
@@ -1164,6 +1165,8 @@ int ha_maria::open(const char *name, int mode, uint test_if_locked)
     }
     return (my_errno ? my_errno : -1);
   }
+  if (aria_readonly)
+    file->s->options|= HA_OPTION_READ_ONLY_DATA;
 
   file->s->chst_invalidator= query_cache_invalidate_by_MyISAM_filename_ref;
   /* Set external_ref, mainly for temporary tables */
@@ -1184,10 +1187,13 @@ int ha_maria::open(const char *name, int mode, uint test_if_locked)
       stand up to "when client gets ok the data is safe on disk": the record
       may not even be inserted). In the future, we could enable it back (as a
       client doing INSERT DELAYED knows the specificities; but we then should
-      make sure to regularly commit in the delayed_insert thread). 
+      make sure to regularly commit in the delayed_insert thread).
     */
-    int_table_flags|= HA_CAN_INSERT_DELAYED;
+    int_table_flags|= HA_CAN_INSERT_DELAYED | HA_NO_TRANSACTIONS;
   }
+  else
+    int_table_flags|= HA_CRASH_SAFE;
+
   if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
     int_table_flags |= HA_HAS_NEW_CHECKSUM;
 
@@ -1260,7 +1266,7 @@ int ha_maria::write_row(const uchar * buf)
 
 int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
 {
-  int error;
+  int error, fatal_error;
   HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param);
   MARIA_SHARE *share= file->s;
   const char *old_proc_info;
@@ -1269,6 +1275,7 @@ int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
   if (!file || !param) return HA_ADMIN_INTERNAL_ERROR;
 
   unmap_file(file);
+  register_handler(file);
   maria_chk_init(param);
   param->thd= thd;
   param->op_name= "check";
@@ -1292,6 +1299,7 @@ int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
     return HA_ADMIN_ALREADY_DONE;
 
   maria_chk_init_for_check(param, file);
+  param->max_allowed_lsn= translog_get_horizon();
 
   if ((file->s->state.changed & (STATE_CRASHED_FLAGS | STATE_MOVED)) ==
       STATE_MOVED)
@@ -1323,33 +1331,53 @@ int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
     {
       ulonglong old_testflag= param->testflag;
       param->testflag |= T_MEDIUM;
-      if (!(error= init_io_cache(&param->read_cache, file->dfile.file,
-                                 my_default_record_cache_size, READ_CACHE,
-                                 share->pack.header_length, 1, MYF(MY_WME))))
-      {
+
+      /* BLOCK_RECORD does not need a cache as it is using the page cache */
+      if (file->s->data_file_type != BLOCK_RECORD)
+        error= init_io_cache(&param->read_cache, file->dfile.file,
+                             my_default_record_cache_size, READ_CACHE,
+                             share->pack.header_length, 1, MYF(MY_WME));
+      if (!error)
         error= maria_chk_data_link(param, file,
                                    MY_TEST(param->testflag & T_EXTEND));
+
+      if (file->s->data_file_type != BLOCK_RECORD)
         end_io_cache(&param->read_cache);
-      }
       param->testflag= old_testflag;
     }
   }
-  if (!error)
+  fatal_error= error;
+  if (param->error_printed &&
+      param->error_printed == (param->skip_lsn_error_count +
+                               param->not_visible_rows_found) &&
+      !(share->state.changed & (STATE_CRASHED_FLAGS | STATE_IN_REPAIR)))
   {
-    if ((share->state.changed & (STATE_CHANGED |
+    _ma_check_print_error(param, "%s", zerofill_error_msg);
+    /* This ensures that a future REPAIR TABLE will only do a zerofill */
+    file->update|= STATE_MOVED;
+    share->state.changed|= STATE_MOVED;
+    fatal_error= 0;
+  }
+  if (!fatal_error)
+  {
+    if ((share->state.changed & (STATE_CHANGED | STATE_MOVED |
                                  STATE_CRASHED_FLAGS |
                                  STATE_IN_REPAIR | STATE_NOT_ANALYZED)) ||
         (param->testflag & T_STATISTICS) || maria_is_crashed(file))
     {
       file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
       mysql_mutex_lock(&share->intern_lock);
-      DBUG_PRINT("info", ("Reseting crashed state"));
+      DBUG_PRINT("info", ("Resetting crashed state"));
       share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS |
                                STATE_IN_REPAIR);
       if (!(table->db_stat & HA_READ_ONLY))
-        error= maria_update_state_info(param, file,
-                                       UPDATE_TIME | UPDATE_OPEN_COUNT |
-                                       UPDATE_STAT);
+      {
+        int tmp;
+        if ((tmp= maria_update_state_info(param, file,
+                                          UPDATE_TIME | UPDATE_OPEN_COUNT |
+                                          UPDATE_STAT)))
+          error= tmp;
+      }
       mysql_mutex_unlock(&share->intern_lock);
       info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
            HA_STATUS_CONST);
@@ -1441,10 +1469,24 @@ int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt)
   maria_chk_init(param);
   param->thd= thd;
   param->op_name= "repair";
+
+  /*
+    The following can only be true if the table was marked as STATE_MOVED
+    during a CHECK TABLE and the table has not been used since then
+  */
+  if ((file->s->state.changed & STATE_MOVED) &&
+      !(file->s->state.changed & STATE_CRASHED_FLAGS))
+  {
+    param->db_name= table->s->db.str;
+    param->table_name= table->alias.c_ptr();
+    _ma_check_print_info(param, "Running zerofill on moved table");
+    return zerofill(thd, check_opt);
+  }
+
   param->testflag= ((check_opt->flags & ~(T_EXTEND)) |
                    T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM |
                    (check_opt->flags & T_EXTEND ? T_REP : T_REP_BY_SORT));
-  param->sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  param->orig_sort_buffer_length= THDVAR(thd, sort_buffer_size);
   param->backup_time= check_opt->start_time;
   start_records= file->state->records;
   old_proc_info= thd_proc_info(thd, "Checking table");
@@ -1478,6 +1520,13 @@ int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt)
     }
     break;
   }
+  /*
+    Commit is needed in the case of tables are locked to ensure that repair
+    is registered in the recovery log
+  */
+  if (implicit_commit(thd, TRUE))
+    error= HA_ADMIN_COMMIT_ERROR;
+
   if (!error && start_records != file->state->records &&
       !(check_opt->flags & T_VERY_SILENT))
   {
@@ -1508,7 +1557,10 @@ int ha_maria::zerofill(THD * thd, HA_CHECK_OPT *check_opt)
   param->thd= thd;
   param->op_name= "zerofill";
   param->testflag= check_opt->flags | T_SILENT | T_ZEROFILL;
-  param->sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  param->orig_sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  param->db_name= table->s->db.str;
+  param->table_name= table->alias.c_ptr();
+
   error=maria_zerofill(param, file, share->open_file_name.str);
 
   /* Reset trn, that may have been set by repair */
@@ -1541,7 +1593,7 @@ int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt)
   param->op_name= "optimize";
   param->testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE |
                    T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX);
-  param->sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  param->orig_sort_buffer_length= THDVAR(thd, sort_buffer_size);
   thd_progress_init(thd, 1);
   if ((error= repair(thd, param, 1)) && param->retry_repair)
   {
@@ -1657,11 +1709,11 @@ int ha_maria::repair(THD *thd, HA_CHECK *param, bool do_optimize)
         error= maria_repair_by_sort(param, file, fixed_name,
                                     MY_TEST(param->testflag & T_QUICK));
       }
-      if (error && file->create_unique_index_by_sort && 
+      if (error && file->create_unique_index_by_sort &&
           share->state.dupp_key != MAX_KEY)
       {
         my_errno= HA_ERR_FOUND_DUPP_KEY;
-        print_keydup_error(table, &table->key_info[share->state.dupp_key], 
+        print_keydup_error(table, &table->key_info[share->state.dupp_key],
                            MYF(0));
       }
     }
@@ -1713,7 +1765,7 @@ int ha_maria::repair(THD *thd, HA_CHECK *param, bool do_optimize)
   {
     if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file))
     {
-      DBUG_PRINT("info", ("Reseting crashed state"));
+      DBUG_PRINT("info", ("Resetting crashed state"));
       share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS |
                                STATE_IN_REPAIR | STATE_MOVED);
       file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
@@ -2009,7 +2061,7 @@ int ha_maria::enable_indexes(uint mode)
     }
 
     param->myf_rw &= ~MY_WAIT_IF_FULL;
-    param->sort_buffer_length= THDVAR(thd,sort_buffer_size);
+    param->orig_sort_buffer_length= THDVAR(thd,sort_buffer_size);
     param->stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
     param->tmpdir= &mysql_tmpdir_list;
 
@@ -2104,7 +2156,7 @@ void ha_maria::start_bulk_insert(ha_rows rows, uint flags)
   DBUG_PRINT("info", ("start_bulk_insert: rows %lu", (ulong) rows));
 
   /* don't enable row cache if too few rows */
-  if (!rows || (rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE))
+  if ((!rows || rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE) && !has_long_unique())
   {
     ulonglong size= thd->variables.read_buff_size, tmp;
     if (rows)
@@ -2287,7 +2339,8 @@ bool ha_maria::check_and_repair(THD *thd)
   check_opt.flags= T_MEDIUM | T_AUTO_REPAIR;
 
   error= 1;
-  if ((file->s->state.changed & (STATE_CRASHED_FLAGS | STATE_MOVED)) ==
+  if (!aria_readonly &&
+      (file->s->state.changed & (STATE_CRASHED_FLAGS | STATE_MOVED)) ==
       STATE_MOVED)
   {
     /* Remove error about crashed table */
@@ -2374,6 +2427,7 @@ int ha_maria::index_read_map(uchar * buf, const uchar * key,
 			     enum ha_rkey_function find_flag)
 {
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   int error= maria_rkey(file, buf, active_index, key, keypart_map, find_flag);
   return error;
 }
@@ -2384,13 +2438,15 @@ int ha_maria::index_read_idx_map(uchar * buf, uint index, const uchar * key,
 				 enum ha_rkey_function find_flag)
 {
   int error;
+  register_handler(file);
+
   /* Use the pushed index condition if it matches the index we're scanning */
   end_range= NULL;
   if (index == pushed_idx_cond_keyno)
     ma_set_index_cond_func(file, handler_index_cond_check, this);
-  
+
   error= maria_rkey(file, buf, index, key, keypart_map, find_flag);
-   
+
   ma_set_index_cond_func(file, NULL, 0);
   return error;
 }
@@ -2401,6 +2457,7 @@ int ha_maria::index_read_last_map(uchar * buf, const uchar * key,
 {
   DBUG_ENTER("ha_maria::index_read_last_map");
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   int error= maria_rkey(file, buf, active_index, key, keypart_map,
                         HA_READ_PREFIX_LAST);
   DBUG_RETURN(error);
@@ -2410,6 +2467,7 @@ int ha_maria::index_read_last_map(uchar * buf, const uchar * key,
 int ha_maria::index_next(uchar * buf)
 {
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   int error= maria_rnext(file, buf, active_index);
   return error;
 }
@@ -2418,6 +2476,7 @@ int ha_maria::index_next(uchar * buf)
 int ha_maria::index_prev(uchar * buf)
 {
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   int error= maria_rprev(file, buf, active_index);
   return error;
 }
@@ -2426,6 +2485,7 @@ int ha_maria::index_prev(uchar * buf)
 int ha_maria::index_first(uchar * buf)
 {
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   int error= maria_rfirst(file, buf, active_index);
   return error;
 }
@@ -2434,6 +2494,7 @@ int ha_maria::index_first(uchar * buf)
 int ha_maria::index_last(uchar * buf)
 {
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   int error= maria_rlast(file, buf, active_index);
   return error;
 }
@@ -2445,6 +2506,7 @@ int ha_maria::index_next_same(uchar * buf,
 {
   int error;
   DBUG_ASSERT(inited == INDEX);
+  register_handler(file);
   /*
     TODO: Delete this loop in Maria 1.5 as versioning will ensure this never
     happens
@@ -2458,11 +2520,11 @@ int ha_maria::index_next_same(uchar * buf,
 
 
 int ha_maria::index_init(uint idx, bool sorted)
-{ 
+{
   active_index=idx;
   if (pushed_idx_cond_keyno == idx)
     ma_set_index_cond_func(file, handler_index_cond_check, this);
-  return 0; 
+  return 0;
 }
 
 
@@ -2472,7 +2534,7 @@ int ha_maria::index_end()
   ma_set_index_cond_func(file, NULL, 0);
   in_range_check_pushed_down= FALSE;
   ds_mrr.dsmrr_close();
-  return 0; 
+  return 0;
 }
 
 
@@ -2495,13 +2557,14 @@ int ha_maria::rnd_end()
 
 int ha_maria::rnd_next(uchar *buf)
 {
-  int error= maria_scan(file, buf);
-  return error;
+  register_handler(file);
+  return maria_scan(file, buf);
 }
 
 
 int ha_maria::remember_rnd_pos()
 {
+  register_handler(file);
   return (*file->s->scan_remember_pos)(file, &remember_pos);
 }
 
@@ -2509,6 +2572,7 @@ int ha_maria::remember_rnd_pos()
 int ha_maria::restart_rnd_next(uchar *buf)
 {
   int error;
+  register_handler(file);
   if ((error= (*file->s->scan_restore_pos)(file, remember_pos)))
     return error;
   return rnd_next(buf);
@@ -2517,6 +2581,7 @@ int ha_maria::restart_rnd_next(uchar *buf)
 
 int ha_maria::rnd_pos(uchar *buf, uchar *pos)
 {
+  register_handler(file);
   int error= maria_rrnd(file, buf, my_get_ptr(pos, ref_length));
   return error;
 }
@@ -2577,11 +2642,13 @@ int ha_maria::info(uint flag)
     data_file_name= index_file_name= 0;
     fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_DEXT,
               MY_APPEND_EXT | MY_UNPACK_FILENAME);
-    if (strcmp(name_buff, maria_info.data_file_name))
-      data_file_name =maria_info.data_file_name;
+    if (strcmp(name_buff, maria_info.data_file_name) &&
+        maria_info.data_file_name[0])
+      data_file_name= maria_info.data_file_name;
     fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_IEXT,
               MY_APPEND_EXT | MY_UNPACK_FILENAME);
-    if (strcmp(name_buff, maria_info.index_file_name))
+    if (strcmp(name_buff, maria_info.index_file_name) &&
+        maria_info.index_file_name[0])
       index_file_name=maria_info.index_file_name;
   }
   if (flag & HA_STATUS_ERRKEY)
@@ -2608,6 +2675,8 @@ int ha_maria::extra(enum ha_extra_function operation)
   if (operation == HA_EXTRA_MMAP && !opt_maria_use_mmap)
     return 0;
 #endif
+  if (operation == HA_EXTRA_WRITE_CACHE && has_long_unique())
+    return 0;
 
   /*
     We have to set file->trn here because in some cases we call
@@ -2672,6 +2741,16 @@ int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size)
 }
 
 
+bool ha_maria::auto_repair(int error) const
+{
+  /* Always auto-repair moved tables (error == HA_ERR_OLD_FILE) */
+  if (error == HA_ERR_OLD_FILE)
+    return true;
+  return (error == HA_ERR_CRASHED_ON_USAGE &&
+          MY_TEST(maria_recover_options & HA_RECOVER_ANY));
+}
+
+
 int ha_maria::delete_all_rows()
 {
   THD *thd= table->in_use;
@@ -2716,9 +2795,9 @@ int ha_maria::delete_table(const char *name)
 
 void ha_maria::drop_table(const char *name)
 {
-  DBUG_ASSERT(file->s->temporary);
+  DBUG_ASSERT(!file || file->s->temporary);
   (void) ha_close();
-  (void) maria_delete_table_files(name, 1, 0);
+  (void) maria_delete_table_files(name, 1, MY_WME);
 }
 
 
@@ -2732,6 +2811,7 @@ void ha_maria::change_table_ptr(TABLE *table_arg, TABLE_SHARE *share)
 
 int ha_maria::external_lock(THD *thd, int lock_type)
 {
+  int result= 0, result2;
   DBUG_ENTER("ha_maria::external_lock");
   file->external_ref= (void*) table;            // For ma_killed()
   /*
@@ -2759,7 +2839,7 @@ int ha_maria::external_lock(THD *thd, int lock_type)
         trnman_increment_locked_tables(file->trn);
       }
 
-      if (!thd->transaction.on)
+      if (!thd->transaction->on)
       {
         /*
           No need to log REDOs/UNDOs. If this is an internal temporary table
@@ -2772,7 +2852,19 @@ int ha_maria::external_lock(THD *thd, int lock_type)
         */
         DBUG_PRINT("info", ("Disabling logging for table"));
         _ma_tmp_disable_logging_for_table(file, TRUE);
+        file->autocommit= 0;
       }
+      else
+        file->autocommit= !(thd->variables.option_bits &
+                            (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN));
+#ifndef ARIA_HAS_TRANSACTIONS
+      /*
+        Until Aria has full transactions support, including MVCC support for
+        delete and update and purging of old states, we have to commit for
+        every statement.
+      */
+      file->autocommit=1;
+#endif
     }
     else
     {
@@ -2814,29 +2906,32 @@ int ha_maria::external_lock(THD *thd, int lock_type)
           */
           DBUG_ASSERT(!thd->get_stmt_da()->is_sent() ||
                       thd->killed);
-          /* autocommit ? rollback a transaction */
-#ifdef MARIA_CANNOT_ROLLBACK
-          if (ma_commit(trn))
-            DBUG_RETURN(1);
-          THD_TRN= 0;
-#else
-          if (!(thd->variables.option_bits & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+          /*
+            If autocommit, commit transaction. This can happen when opening
+            and locking tables as part of creating triggers, in which case
+            commit is not called.
+            As long as ARIA_HAS_TRANSACTIONS is not defined, always commit.
+          */
+          if (file->autocommit)
           {
-            trnman_rollback_trn(trn);
-            DBUG_PRINT("info", ("THD_TRN set to 0x0"));
-            THD_TRN= 0;
+            if (ma_commit(trn))
+              result= HA_ERR_COMMIT_ERROR;
+            thd_set_ha_data(thd, maria_hton, 0);
           }
-#endif
         }
         trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED);
       }
     }
   } /* if transactional table */
-  int result = maria_lock_database(file, !table->s->tmp_table ?
-                                  lock_type : ((lock_type == F_UNLCK) ?
-                                               F_UNLCK : F_EXTRA_LCK));
+  if ((result2= maria_lock_database(file, !table->s->tmp_table ?
+                                    lock_type : ((lock_type == F_UNLCK) ?
+                                                 F_UNLCK : F_EXTRA_LCK))))
+    result= result2;
   if (!file->s->base.born_transactional)
     file->state= &file->s->state.state;         // Restore state if clone
+
+  /* Remember stack end for this thread */
+  file->stack_end_ptr= &ha_thd()->mysys_var->stack_ends_here;
   DBUG_RETURN(result);
 }
 
@@ -2884,7 +2979,7 @@ int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type)
 static void reset_thd_trn(THD *thd, MARIA_HA *first_table)
 {
   DBUG_ENTER("reset_thd_trn");
-  THD_TRN= NULL;
+  thd_set_ha_data(thd, maria_hton, 0);
   MARIA_HA *next;
   for (MARIA_HA *table= first_table; table ; table= next)
   {
@@ -2959,7 +3054,7 @@ int ha_maria::implicit_commit(THD *thd, bool new_trn)
 
   error= 0;
   if (unlikely(ma_commit(trn)))
-    error= 1;
+    error= HA_ERR_COMMIT_ERROR;
   if (!new_trn)
   {
     reset_thd_trn(thd, used_tables);
@@ -2971,8 +3066,8 @@ int ha_maria::implicit_commit(THD *thd, bool new_trn)
     tables may be under LOCK TABLES, and so they will start the next
     statement assuming they have a trn (see ha_maria::start_stmt()).
   */
-  trn= trnman_new_trn(& thd->transaction.wt);
-  THD_TRN= trn;
+  trn= trnman_new_trn(& thd->transaction->wt);
+  thd_set_ha_data(thd, maria_hton, trn);
   if (unlikely(trn == NULL))
   {
     reset_thd_trn(thd, used_tables);
@@ -3126,6 +3221,7 @@ int ha_maria::create(const char *name, TABLE *table_arg,
   MARIA_CREATE_INFO create_info;
   TABLE_SHARE *share= table_arg->s;
   uint options= share->db_options_in_use;
+  ha_table_option_struct *table_options= table_arg->s->option_struct;
   enum data_file_type row_type;
   THD *thd= current_thd;
   DBUG_ENTER("ha_maria::create");
@@ -3170,6 +3266,12 @@ int ha_maria::create(const char *name, TABLE *table_arg,
   create_info.data_file_name= ha_create_info->data_file_name;
   create_info.index_file_name= ha_create_info->index_file_name;
   create_info.language= share->table_charset->number;
+  if (ht != maria_hton)
+  {
+    /* S3 engine */
+    create_info.s3_block_size= (ulong) table_options->s3_block_size;
+    create_info.compression_algorithm= table_options->compression_algorithm;
+  }
 
   /*
     Table is transactional:
@@ -3205,6 +3307,7 @@ int ha_maria::create(const char *name, TABLE *table_arg,
   (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
                                  (uchar*) thd->query(), thd->query_length());
 
+  create_info.encrypted= maria_encrypt_tables && ht == maria_hton;
   /* TODO: Check that the following fn_format is really needed */
   error=
     maria_create(fn_format(buff, name, "", "",
@@ -3284,6 +3387,8 @@ void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment,
     inx                 Index to use
     min_key             Start of range.  Null pointer if from first key
     max_key             End of range. Null pointer if to last key
+    pages               Store first and last page for the range in case of
+                        b-trees. In other cases it's not touched.
 
   NOTES
     min_key.flag can have one of the following values:
@@ -3301,10 +3406,20 @@ void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment,
                         the range.
 */
 
-ha_rows ha_maria::records_in_range(uint inx, key_range *min_key,
-                                   key_range *max_key)
+ha_rows ha_maria::records_in_range(uint inx, const key_range *min_key,
+                                   const key_range *max_key, page_range *pages)
 {
-  return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key);
+  register_handler(file);
+  return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key,
+                                          pages);
+}
+
+
+FT_INFO *ha_maria::ft_init_ext(uint flags, uint inx, String * key)
+{
+  uchar *key_bytes= (uchar *) key->ptr();       /* raw bytes of key */
+  return maria_ft_init_search(flags, file, inx, key_bytes, key->length(),
+                              key->charset(), table->record[0]);
+}
 
 
@@ -3315,6 +3430,8 @@ int ha_maria::ft_read(uchar * buf)
   if (!ft_handler)
     return -1;
 
+  register_handler(file);
+
   thread_safe_increment(table->in_use->status_var.ha_read_next_count,
                         &LOCK_status);  // why ?
 
@@ -3360,7 +3477,7 @@ static int maria_hton_panic(handlerton *hton, ha_panic_function flag)
   /* If no background checkpoints, we need to do one now */
   int ret=0;
 
-  if (!checkpoint_interval)
+  if (!checkpoint_interval && !aria_readonly)
     ret= ma_checkpoint_execute(CHECKPOINT_FULL, FALSE);
 
   ret|= maria_panic(flag);
@@ -3374,23 +3491,51 @@ static int maria_commit(handlerton *hton __attribute__ ((unused)),
                         THD *thd, bool all)
 {
   TRN *trn= THD_TRN;
-  int res;
-  MARIA_HA *used_instances= (MARIA_HA*) trn->used_instances;
+  int res= 0;
+  MARIA_HA *used_instances;
   DBUG_ENTER("maria_commit");
 
-  DBUG_ASSERT(trnman_has_locked_tables(trn) == 0);
-  trnman_reset_locked_tables(trn, 0);
-  trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+  /* No commit inside lock_tables() */
+  if ((!trn ||
+       thd->locked_tables_mode == LTM_LOCK_TABLES ||
+       thd->locked_tables_mode == LTM_PRELOCKED_UNDER_LOCK_TABLES))
+    DBUG_RETURN(0);
 
   /* statement or transaction ? */
   if ((thd->variables.option_bits & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) &&
       !all)
     DBUG_RETURN(0); // end of statement
-  res= ma_commit(trn);
+
+  used_instances= (MARIA_HA*) trn->used_instances;
+  trnman_reset_locked_tables(trn, 0);
+  trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+  trn->used_instances= 0;
+  if (ma_commit(trn))
+    res= HA_ERR_COMMIT_ERROR;
   reset_thd_trn(thd, used_instances);
+  thd_set_ha_data(thd, maria_hton, 0);
   DBUG_RETURN(res);
 }
 
+#ifdef MARIA_CANNOT_ROLLBACK
+static int maria_rollback(handlerton *hton, THD *thd, bool all)
+{
+  TRN *trn= THD_TRN;
+  DBUG_ENTER("maria_rollback");
+  /* Built with MARIA_CANNOT_ROLLBACK: nothing is undone here */
+  if (trn && trn->undo_lsn)
+    push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE,
+                        ER_DATA_WAS_COMMITED_UNDER_ROLLBACK,
+                        ER_THD(thd, ER_DATA_WAS_COMMITED_UNDER_ROLLBACK),
+                        "Aria");
+  /* No trn, or statement rollback (ignored; external_lock will commit) */
+  if (!trn || !all)
+    DBUG_RETURN(0);
+  /* A full rollback is turned into a commit */
+  DBUG_RETURN(maria_commit(hton, thd, all));
+}
+
+#else
 
 static int maria_rollback(handlerton *hton __attribute__ ((unused)),
                           THD *thd, bool all)
@@ -3411,7 +3556,7 @@ static int maria_rollback(handlerton *hton __attribute__ ((unused)),
   DBUG_RETURN(trnman_rollback_trn(trn) ?
               HA_ERR_OUT_OF_MEM : 0); // end of transaction
 }
-
+#endif /* MARIA_CANNOT_ROLLBACK */
 
 
 /**
@@ -3630,15 +3775,24 @@ bool ha_maria::is_changed() const
 
 static int ha_maria_init(void *p)
 {
-  int res;
+  int res= 0, tmp;
   const char *log_dir= maria_data_root;
 
+  /*
+    If aria_readonly is set, then we don't run recovery and we don't allow
+    opening of tables that are crashed. Used by mysqld --help
+   */
+  if ((aria_readonly= opt_help != 0))
+  {
+    maria_recover_options= 0;
+    checkpoint_interval= 0;
+  }
+
 #ifdef HAVE_PSI_INTERFACE
   init_aria_psi_keys();
 #endif
 
   maria_hton= (handlerton *)p;
-  maria_hton->state= SHOW_OPTION_YES;
   maria_hton->db_type= DB_TYPE_ARIA;
   maria_hton->create= maria_create_handler;
   maria_hton->panic= maria_hton_panic;
@@ -3646,20 +3800,25 @@ static int ha_maria_init(void *p)
   maria_hton->commit= maria_commit;
   maria_hton->rollback= maria_rollback;
   maria_hton->checkpoint_state= maria_checkpoint_state;
-#ifdef MARIA_CANNOT_ROLLBACK
-  maria_hton->commit= 0;
-#endif
   maria_hton->flush_logs= maria_flush_logs;
   maria_hton->show_status= maria_show_status;
   maria_hton->prepare_for_backup= maria_prepare_for_backup;
   maria_hton->end_backup= maria_end_backup;
 
   /* TODO: decide if we support Maria being used for log tables */
-  maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES;
+  maria_hton->flags= (HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES |
+                      HTON_NO_ROLLBACK |
+                      HTON_TRANSACTIONAL_AND_NON_TRANSACTIONAL);
   bzero(maria_log_pagecache, sizeof(*maria_log_pagecache));
   maria_tmpdir= &mysql_tmpdir_list;             /* For REDO */
-  res= maria_upgrade() || maria_init() || ma_control_file_open(TRUE, TRUE) ||
-    ((force_start_after_recovery_failures != 0) &&
+
+  res= !aria_readonly && maria_upgrade();
+  res= res || maria_init();
+  tmp= ma_control_file_open(!aria_readonly, !aria_readonly, !aria_readonly);
+  /* ?: binds looser than ||: parenthesize so an earlier failure is kept */
+  res= res || (aria_readonly ? tmp == CONTROL_FILE_LOCKED : tmp != 0);
+  res= res ||
+    ((force_start_after_recovery_failures != 0 && !aria_readonly) &&
      mark_recovery_start(log_dir)) ||
     !init_pagecache(maria_pagecache,
                     (size_t) pagecache_buffer_size, pagecache_division_limit,
@@ -3668,13 +3827,16 @@ static int ha_maria_init(void *p)
     !init_pagecache(maria_log_pagecache,
                     TRANSLOG_PAGECACHE_SIZE, 0, 0,
                     TRANSLOG_PAGE_SIZE, 0, 0) ||
-    translog_init(maria_data_root, log_file_size,
-                  MYSQL_VERSION_ID, server_id, maria_log_pagecache,
-                  TRANSLOG_DEFAULT_FLAGS, 0) ||
-    maria_recovery_from_log() ||
-    ((force_start_after_recovery_failures != 0 ||
-      maria_recovery_changed_data || recovery_failures) &&
-     mark_recovery_success()) ||
+    (!aria_readonly &&
+     translog_init(maria_data_root, log_file_size,
+                   MYSQL_VERSION_ID, server_id, maria_log_pagecache,
+                   TRANSLOG_DEFAULT_FLAGS, 0)) ||
+    (!aria_readonly &&
+     (maria_recovery_from_log() ||
+      ((force_start_after_recovery_failures != 0 ||
+        maria_recovery_changed_data || recovery_failures) &&
+       mark_recovery_success()))) ||
+    (aria_readonly && trnman_init(MAX_INTERNAL_TRID-16)) ||
     ma_checkpoint_init(checkpoint_interval);
   maria_multi_threaded= maria_in_ha_maria= TRUE;
   maria_create_trn_hook= maria_create_trn_for_mysql;
@@ -3688,6 +3850,8 @@ static int ha_maria_init(void *p)
   }
 
   ma_killed= ma_killed_in_mariadb;
+  if (res)
+    maria_panic(HA_PANIC_CLOSE);
 
   return res ? HA_ERR_INITIALIZATION : 0;
 }
@@ -3768,7 +3932,7 @@ my_bool ha_maria::register_query_cache_table(THD *thd, const char *table_name,
 }
 #endif
 
-struct st_mysql_sys_var* system_variables[]= {
+static struct st_mysql_sys_var *system_variables[]= {
   MYSQL_SYSVAR(block_size),
   MYSQL_SYSVAR(checkpoint_interval),
   MYSQL_SYSVAR(checkpoint_log_activity),
@@ -3908,7 +4072,7 @@ static void update_log_file_size(MYSQL_THD thd,
 }
 
 
-SHOW_VAR status_variables[]= {
+static SHOW_VAR status_variables[]= {
   {"pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG},
   {"pagecache_blocks_unused",      (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG},
   {"pagecache_blocks_used",        (char*) &maria_pagecache_var.blocks_used, SHOW_LONG},
@@ -3925,7 +4089,7 @@ SHOW_VAR status_variables[]= {
  ***************************************************************************/
 
 int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                    uint n_ranges, uint mode, 
+                                    uint n_ranges, uint mode,
                                     HANDLER_BUFFER *buf)
 {
   return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
@@ -3937,7 +4101,7 @@ int ha_maria::multi_range_read_next(range_id_t *range_info)
 }
 
 ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
-                                               void *seq_init_param, 
+                                               void *seq_init_param,
                                                uint n_ranges, uint *bufsz,
                                                uint *flags, Cost_estimate *cost)
 {
@@ -3952,14 +4116,14 @@ ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 }
 
 ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                       uint key_parts, uint *bufsz, 
+                                       uint key_parts, uint *bufsz,
                                        uint *flags, Cost_estimate *cost)
 {
   ds_mrr.init(this, table);
   return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
 }
 
-int ha_maria::multi_range_read_explain_info(uint mrr_mode, char *str, 
+int ha_maria::multi_range_read_explain_info(uint mrr_mode, char *str,
                                             size_t size)
 {
   return ds_mrr.dsmrr_explain_info(mrr_mode, str, size);
@@ -4016,6 +4180,7 @@ Item *ha_maria::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
 int ha_maria::find_unique_row(uchar *record, uint constrain_no)
 {
   int rc;
+  register_handler(file);
   if (file->s->state.header.uniques)
   {
     DBUG_ASSERT(file->s->state.header.uniques > constrain_no);
@@ -4050,6 +4215,26 @@ int ha_maria::find_unique_row(uchar *record, uint constrain_no)
   return rc;
 }
 
+
+/**
+   Check if a table needs to be rebuilt after a server upgrade
+*/
+
+int ha_maria::check_for_upgrade(HA_CHECK_OPT *check)
+{
+  const auto version= table->s->mysql_version;
+  if (!version || version > 100509)
+    return HA_ADMIN_OK;
+  /*
+    Encrypted tables whose recorded server version is 10.5.9 or older had
+    a bug where LSN was not stored on the pages. These must be repaired!
+  */
+  if (file->s->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)
+    return HA_ADMIN_NEEDS_ALTER;
+  return HA_ADMIN_OK;
+}
+
+
 struct st_mysql_storage_engine maria_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
index 73f3576a34e..6b4302145dd 100644
--- a/storage/maria/ha_maria.h
+++ b/storage/maria/ha_maria.h
@@ -1,6 +1,7 @@
 #ifndef HA_MARIA_INCLUDED
 #define HA_MARIA_INCLUDED
-/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006, 2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2009, 2020, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -21,7 +22,7 @@
 
 /* class for the maria handler */
 
-#include <maria.h>
+#include "maria_def.h"
 #include "handler.h"
 #include "table.h"
 
@@ -38,9 +39,16 @@ C_MODE_END
 extern TYPELIB maria_recover_typelib;
 extern ulonglong maria_recover_options;
 
-class ha_maria :public handler
+/*
+  In the ha_maria class there are a few virtual methods that are not marked as
+  'final'. This is because they are re-defined by the ha_s3 engine.
+*/
+
+class __attribute__((visibility("default"))) ha_maria :public handler
 {
+public:
   MARIA_HA *file;
+private:
   ulonglong int_table_flags;
   MARIA_RECORD_POS remember_pos;
   char *data_file_name, *index_file_name;
@@ -48,7 +56,7 @@ class ha_maria :public handler
   bool can_enable_indexes;
   /**
     If a transactional table is doing bulk insert with a single
-    UNDO_BULK_INSERT with/without repair. 
+    UNDO_BULK_INSERT with/without repair.
   */
   uint8 bulk_insert_single_undo;
   int repair(THD * thd, HA_CHECK *param, bool optimize);
@@ -57,109 +65,100 @@ class ha_maria :public handler
 public:
   ha_maria(handlerton *hton, TABLE_SHARE * table_arg);
   ~ha_maria() {}
-  handler *clone(const char *name, MEM_ROOT *mem_root);
-  const char *index_type(uint key_number);
-  ulonglong table_flags() const
+  handler *clone(const char *name, MEM_ROOT *mem_root) override final;
+  const char *index_type(uint key_number) override final;
+  ulonglong table_flags() const override final
   { return int_table_flags; }
-  ulong index_flags(uint inx, uint part, bool all_parts) const;
-  uint max_supported_keys() const
+  ulong index_flags(uint inx, uint part, bool all_parts) const override final;
+  uint max_supported_keys() const override final
   { return MARIA_MAX_KEY; }
-  uint max_supported_key_length() const;
-  uint max_supported_key_part_length() const
+  uint max_supported_key_length() const override final;
+  uint max_supported_key_part_length() const override final
   { return max_supported_key_length(); }
-  enum row_type get_row_type() const;
-  void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share);
-  virtual double scan_time();
-
-  int open(const char *name, int mode, uint test_if_locked);
-  int close(void);
-  int write_row(const uchar * buf);
-  int update_row(const uchar * old_data, const uchar * new_data);
-  int delete_row(const uchar * buf);
+  enum row_type get_row_type() const override final;
+  void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share) override final;
+  virtual double scan_time() override final;
+
+  int open(const char *name, int mode, uint test_if_locked) override;
+  int close(void) override final;
+  int write_row(const uchar * buf) override;
+  int update_row(const uchar * old_data, const uchar * new_data) override;
+  int delete_row(const uchar * buf) override;
   int index_read_map(uchar * buf, const uchar * key, key_part_map keypart_map,
-		     enum ha_rkey_function find_flag);
+		     enum ha_rkey_function find_flag) override final;
   int index_read_idx_map(uchar * buf, uint idx, const uchar * key,
 			 key_part_map keypart_map,
-			 enum ha_rkey_function find_flag);
+			 enum ha_rkey_function find_flag) override final;
   int index_read_last_map(uchar * buf, const uchar * key,
-			  key_part_map keypart_map);
-  int index_next(uchar * buf);
-  int index_prev(uchar * buf);
-  int index_first(uchar * buf);
-  int index_last(uchar * buf);
-  int index_next_same(uchar * buf, const uchar * key, uint keylen);
-  int ft_init()
+			  key_part_map keypart_map) override final;
+  int index_next(uchar * buf) override final;
+  int index_prev(uchar * buf) override final;
+  int index_first(uchar * buf) override final;
+  int index_last(uchar * buf) override final;
+  int index_next_same(uchar * buf, const uchar * key, uint keylen) override final;
+  int ft_init() override final
   {
     if (!ft_handler)
       return 1;
     ft_handler->please->reinit_search(ft_handler);
     return 0;
   }
-  FT_INFO *ft_init_ext(uint flags, uint inx, String * key)
-  {
-    return maria_ft_init_search(flags, file, inx,
-                                (uchar *) key->ptr(), key->length(),
-                                key->charset(), table->record[0]);
-  }
-  int ft_read(uchar * buf);
-  int index_init(uint idx, bool sorted);
-  int index_end();
-  int rnd_init(bool scan);
-  int rnd_end(void);
-  int rnd_next(uchar * buf);
-  int rnd_pos(uchar * buf, uchar * pos);
-  int remember_rnd_pos();
-  int restart_rnd_next(uchar * buf);
-  void position(const uchar * record);
-  int info(uint);
+  FT_INFO *ft_init_ext(uint flags, uint inx, String * key) override final;
+  int ft_read(uchar * buf) override final;
+  int index_init(uint idx, bool sorted) override final;
+  int index_end() override final;
+  int rnd_init(bool scan) override final;
+  int rnd_end(void) override final;
+  int rnd_next(uchar * buf) override final;
+  int rnd_pos(uchar * buf, uchar * pos) override final;
+  int remember_rnd_pos() override final;
+  int restart_rnd_next(uchar * buf) override final;
+  void position(const uchar * record) override final;
+  int info(uint) override final;
   int info(uint, my_bool);
-  int extra(enum ha_extra_function operation);
-  int extra_opt(enum ha_extra_function operation, ulong cache_size);
-  int reset(void);
-  int external_lock(THD * thd, int lock_type);
-  int start_stmt(THD *thd, thr_lock_type lock_type);
-  int delete_all_rows(void);
-  int disable_indexes(uint mode);
-  int enable_indexes(uint mode);
-  int indexes_are_disabled(void);
-  void start_bulk_insert(ha_rows rows, uint flags);
-  int end_bulk_insert();
-  ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);
-  void update_create_info(HA_CREATE_INFO * create_info);
-  int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info);
+  int extra(enum ha_extra_function operation) override final;
+  int extra_opt(enum ha_extra_function operation, ulong cache_size) override final;
+  int reset(void) override final;
+  int external_lock(THD * thd, int lock_type) override;
+  int start_stmt(THD *thd, thr_lock_type lock_type) override final;
+  int delete_all_rows(void) override final;
+  int disable_indexes(uint mode) override final;
+  int enable_indexes(uint mode) override final;
+  int indexes_are_disabled(void) override final;
+  void start_bulk_insert(ha_rows rows, uint flags) override final;
+  int end_bulk_insert() override final;
+  ha_rows records_in_range(uint inx, const key_range *min_key,
+                           const key_range *max_key,
+                           page_range *pages) override final;
+  void update_create_info(HA_CREATE_INFO * create_info) override final;
+  int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info) override;
   THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to,
-                             enum thr_lock_type lock_type);
+                             enum thr_lock_type lock_type) override final;
   virtual void get_auto_increment(ulonglong offset, ulonglong increment,
                                   ulonglong nb_desired_values,
                                   ulonglong *first_value,
-                                  ulonglong *nb_reserved_values);
-  int rename_table(const char *from, const char *to);
-  int delete_table(const char *name);
-  void drop_table(const char *name);
-  int check(THD * thd, HA_CHECK_OPT * check_opt);
-  int analyze(THD * thd, HA_CHECK_OPT * check_opt);
-  int repair(THD * thd, HA_CHECK_OPT * check_opt);
-  bool check_and_repair(THD * thd);
-  bool is_crashed() const;
+                                  ulonglong *nb_reserved_values) override final;
+  int rename_table(const char *from, const char *to) override;
+  int delete_table(const char *name) override;
+  void drop_table(const char *name) override;
+  int check(THD * thd, HA_CHECK_OPT * check_opt) override;
+  int analyze(THD * thd, HA_CHECK_OPT * check_opt) override;
+  int repair(THD * thd, HA_CHECK_OPT * check_opt) override;
+  int check_for_upgrade(HA_CHECK_OPT *check_opt) override;
+  bool check_and_repair(THD * thd) override final;
+  bool is_crashed() const override final;
   bool is_changed() const;
-  bool auto_repair(int error) const
-  {
-    /* Always auto-repair moved tables (error == HA_ERR_OLD_FILE) */
-    return ((MY_TEST(maria_recover_options & HA_RECOVER_ANY) &&
-             error == HA_ERR_CRASHED_ON_USAGE) ||
-            error == HA_ERR_OLD_FILE);
-
-  }
-  int optimize(THD * thd, HA_CHECK_OPT * check_opt);
-  int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt);
-  int preload_keys(THD * thd, HA_CHECK_OPT * check_opt);
-  bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
+  bool auto_repair(int error) const override final;
+  int optimize(THD * thd, HA_CHECK_OPT * check_opt) override final;
+  int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt) override final;
+  int preload_keys(THD * thd, HA_CHECK_OPT * check_opt) override;
+  bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) override final;
 #ifdef HAVE_QUERY_CACHE
   my_bool register_query_cache_table(THD *thd, const char *table_key,
                                      uint key_length,
                                      qc_engine_callback
                                      *engine_callback,
-                                     ulonglong *engine_data);
+                                     ulonglong *engine_data) override final;
 #endif
   MARIA_HA *file_ptr(void)
   {
@@ -171,25 +170,31 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
-  int multi_range_read_next(range_id_t *range_info);
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf) override final;
+  int multi_range_read_next(range_id_t *range_info) override final;
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
-                                      void *seq_init_param, 
+                                      void *seq_init_param,
                                       uint n_ranges, uint *bufsz,
-                                      uint *flags, Cost_estimate *cost);
+                                      uint *flags, Cost_estimate *cost) override final;
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint key_parts, uint *bufsz, 
-                                uint *flags, Cost_estimate *cost);
-  int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size);
-  
+                                uint key_parts, uint *bufsz,
+                                uint *flags, Cost_estimate *cost) override final;
+  int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size) override final;
+
   /* Index condition pushdown implementation */
-  Item *idx_cond_push(uint keyno, Item* idx_cond);
+  Item *idx_cond_push(uint keyno, Item* idx_cond) override final;
+
+  int find_unique_row(uchar *record, uint unique_idx) override final;
+
+  /* Following functions are needed by the S3 handler */
+  virtual S3_INFO *s3_open_args() { return 0; }
+  virtual void register_handler(MARIA_HA *file) {}
 
-  int find_unique_row(uchar *record, uint unique_idx);
 private:
   DsMrr_impl ds_mrr;
   friend check_result_t index_cond_func_maria(void *arg);
   friend void reset_thd_trn(THD *thd);
+  friend class ha_s3;
 };
 
 #endif /* HA_MARIA_INCLUDED */
diff --git a/storage/maria/ha_s3.cc b/storage/maria/ha_s3.cc
new file mode 100644
index 00000000000..9a0a458bfe5
--- /dev/null
+++ b/storage/maria/ha_s3.cc
@@ -0,0 +1,1116 @@
+/* Copyright (C) 2019, 2021 MariaDB Corporation Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the
+   Free Software Foundation, Inc.
+   51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA
+*/
+
+/*
+  Implementation of S3 storage engine.
+
+  Storage format:
+
+  The S3 engine is a read-only storage engine. The data is stored in the
+  same format as a non-transactional Aria table in BLOCK_RECORD format.
+  This makes it easy to cache both index and rows in the page cache.
+  Data and index file are split into blocks of 's3_block_size', default
+  4M.
+
+  The table and its associated files are stored in S3 in the following
+  locations:
+
+  frm file (for discovery):
+  aws_bucket/database/table/frm
+
+  First index block (contains description of the Aria file):
+  aws_bucket/database/table/aria
+
+  Rest of the index file:
+  aws_bucket/database/table/index/block_number
+
+  Data file:
+  aws_bucket/database/table/data/block_number
+
+  block_number is 6 digits decimal number, prefixed with 0
+  (Can be larger than 6 numbers, the prefix is just for nice output)
+
+  frm and base blocks are small (just the needed data).
+  index and data blocks are of size 's3_block_size'
+
+  If compression is used, then original block size is s3_block_size
+  but the stored block will be the size of the compressed block.
+
+  Implementation:
+  The s3 engine inherits from the ha_maria handler.
+
+  It uses Aria code and relies on Aria being enabled. We don't have to check
+  that Aria is enabled though, because Aria is a mandatory plugin, and
+  the server will refuse to start if Aria failed to initialize.
+
+  s3 will use its own page cache to not interfere with normal Aria
+  usage but also to ensure that the S3 page cache is large enough
+  (with a 4M s3_block_size the engine will need a large cache to work,
+  at least s3_block_size * 32). The default cache is 128M.
+*/
+
+#define MYSQL_SERVER 1
+#include <my_global.h>
+#include <m_string.h>
+#include "maria_def.h"
+#include "sql_class.h"
+#include <mysys_err.h>
+#include <libmarias3/marias3.h>
+#include <discover.h>
+#include "ha_s3.h"
+#include "s3_func.h"
+#include "aria_backup.h"
+
+#define DEFAULT_AWS_HOST_NAME "s3.amazonaws.com"
+
+static PAGECACHE s3_pagecache;          /* Page cache set on tables in S3 */
+static ulong s3_block_size, s3_protocol_version;
+static ulong s3_pagecache_division_limit, s3_pagecache_age_threshold;
+static ulong s3_pagecache_file_hash_size;
+static ulonglong s3_pagecache_buffer_size;
+static char *s3_bucket, *s3_access_key=0, *s3_secret_key=0, *s3_region;
+static char *s3_host_name;
+static int s3_port;
+static my_bool s3_use_http;
+static char *s3_tmp_access_key=0, *s3_tmp_secret_key=0; /* Sysvar storage */
+static my_bool s3_debug= 0, s3_slave_ignore_updates= 0;
+static my_bool s3_replicate_alter_as_create_select= 0;
+handlerton *s3_hton= 0;                 /* Reset to 0 by s3_hton_panic() */
+
+/* Don't show access or secret keys to users if they exist */
+
+static void update_access_key(MYSQL_THD thd,
+                              struct st_mysql_sys_var *var,
+                              void *var_ptr, const void *save)
+{
+  /* Forget the previously saved key; it stays unset for an empty value */
+  my_free(s3_access_key);
+  s3_access_key= 0;
+  if (!s3_tmp_access_key[0])
+    return;
+  /* Keep the real key private and leave a mask for SHOW VARIABLES */
+  s3_access_key= s3_tmp_access_key;
+  s3_tmp_access_key= my_strdup(PSI_NOT_INSTRUMENTED, "*****", MYF(MY_WME));
+}
+
+static void update_secret_key(MYSQL_THD thd,
+                              struct st_mysql_sys_var *var,
+                              void *var_ptr, const void *save)
+{
+  /* Forget the previously saved key; it stays unset for an empty value */
+  my_free(s3_secret_key);
+  s3_secret_key= 0;
+  if (!s3_tmp_secret_key[0])
+    return;
+  /* Keep the real key private and leave a mask for SHOW VARIABLES */
+  s3_secret_key= s3_tmp_secret_key;
+  s3_tmp_secret_key= my_strdup(PSI_NOT_INSTRUMENTED, "*****", MYF(MY_WME));
+}
+
+/* Define system variables for S3; each is bound to a static declared above */
+
+static MYSQL_SYSVAR_ULONG(block_size, s3_block_size,
+       PLUGIN_VAR_RQCMDARG,
+       "Block size for S3", 0, 0,
+       4*1024*1024, 65536, 16*1024*1024, 8192); /* default, min, max, blk */
+
+static MYSQL_SYSVAR_BOOL(debug, s3_debug,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "Generates trace file from libmarias3 on stderr for debugging",
+       0, 0, 0);
+
+static MYSQL_SYSVAR_BOOL(slave_ignore_updates, s3_slave_ignore_updates,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "If the slave has shares same S3 storage as the master",
+       0, 0, 0);
+
+static MYSQL_SYSVAR_BOOL(replicate_alter_as_create_select,
+                         s3_replicate_alter_as_create_select,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "When converting S3 table to local table, log all rows in binary log",
+       0, 0, 1);                                /* check, update, default */
+
+static MYSQL_SYSVAR_ENUM(protocol_version, s3_protocol_version,
+                         PLUGIN_VAR_RQCMDARG,
+                         "Protocol used to communication with S3. One of "
+                         "\"Auto\", \"Amazon\" or \"Original\".",
+                         NULL, NULL, 0, &s3_protocol_typelib);
+
+static MYSQL_SYSVAR_ULONG(pagecache_age_threshold,
+       s3_pagecache_age_threshold, PLUGIN_VAR_RQCMDARG,
+       "This characterizes the number of hits a hot block has to be untouched "
+       "until it is considered aged enough to be downgraded to a warm block. "
+       "This specifies the percentage ratio of that number of hits to the "
+       "total number of blocks in the page cache.", 0, 0,
+       300, 100, ~ (ulong) 0L, 100);
+
+static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, s3_pagecache_buffer_size,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "The size of the buffer used for index blocks for S3 tables. "
+       "Increase this to get better index handling (for all reads and "
+       "multiple writes) to as much as you can afford.", 0, 0,
+        128*1024*1024, 1024*1024*32, ~(ulonglong) 0, 8192);
+
+static MYSQL_SYSVAR_ULONG(pagecache_division_limit,
+                          s3_pagecache_division_limit,
+       PLUGIN_VAR_RQCMDARG,
+       "The minimum percentage of warm blocks in key cache", 0, 0,
+       100,  1, 100, 1);
+
+static MYSQL_SYSVAR_ULONG(pagecache_file_hash_size,
+                          s3_pagecache_file_hash_size,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Number of hash buckets for open files.  If you have a lot "
+       "of S3 files open you should increase this for faster flush of "
+       "changes. A good value is probably 1/10 of number of possible open "
+       "S3 files.", 0,0, 512, 32, 16384, 1);
+
+static MYSQL_SYSVAR_STR(bucket, s3_bucket,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "AWS bucket",
+       0, 0, "MariaDB");
+static MYSQL_SYSVAR_STR(host_name, s3_host_name,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "AWS host name",
+       0, 0, DEFAULT_AWS_HOST_NAME);
+static MYSQL_SYSVAR_INT(port, s3_port,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "Port number to connect to (0 means use default)",
+       NULL /*check*/, NULL /*update*/, 0 /*default*/,
+       0 /*min*/, 65535 /*max*/, 1 /*blk*/);
+static MYSQL_SYSVAR_BOOL(use_http, s3_use_http,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "If true, force use of HTTP protocol",
+       NULL /*check*/, NULL /*update*/, 0 /*default*/);
+static MYSQL_SYSVAR_STR(access_key, s3_tmp_access_key,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC,
+      "AWS access key",
+       0, update_access_key, "");               /* update hook masks the key */
+static MYSQL_SYSVAR_STR(secret_key, s3_tmp_secret_key,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC,
+      "AWS secret key",
+       0, update_secret_key, "");               /* update hook masks the key */
+static MYSQL_SYSVAR_STR(region, s3_region,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+      "AWS region",
+       0, 0, "");
+
+ha_create_table_option s3_table_option_list[]=
+{
+  /*
+    Per-table options: "s3_block_size", which is tied to the block_size
+    system variable, and "compression_algorithm", an enum with the values
+    "none,zlib" (the trailing 0 presumably selects the default; confirm).
+  */
+  HA_TOPTION_SYSVAR("s3_block_size", s3_block_size, block_size),
+  HA_TOPTION_ENUM("compression_algorithm", compression_algorithm, "none,zlib",
+                  0),
+  HA_TOPTION_END
+};
+
+
+/*****************************************************************************
+ S3 handler code
+******************************************************************************/
+
+/**
+   Create S3 handler on top of ha_maria; it starts in state S3_NO_ALTER
+*/
+
+
+ha_s3::ha_s3(handlerton *hton, TABLE_SHARE *table_arg)
+  :ha_maria(hton, table_arg), in_alter_table(S3_NO_ALTER)
+{
+  /* Clear the binary logging and export flags; S3 doesn't support them */
+  int_table_flags&= ~(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
+                      HA_CAN_EXPORT);
+  can_enable_indexes= 0;
+}
+
+
+/**
+   Remember the handler to use for s3_block_read()
+
+   @note
+   In the future the ms3_st objects could instead be kept in a
+   list in the share, but taking the next free one would then
+   need a mutex. Keeping the handle in st_my_thread_var avoids
+   the mutex, at the small cost of having to call
+   register_handler() in every handler function that will
+   access the page cache
+*/
+
+void ha_s3::register_handler(MARIA_HA *file)
+{
+  /* keycache_file is assigned through a void* cast of the handle */
+  my_thread_var->keycache_file= (void*) file;
+}
+
+
+/**
+   Write a row
+
+   Writes are only allowed while the table is generated as part of
+   ALTER TABLE. Once the table is moved to S3, it is read only.
+*/
+
+int ha_s3::write_row(const uchar *buf)
+{
+  DBUG_ENTER("ha_s3::write_row");
+  if (!in_alter_table)
+    DBUG_RETURN(HA_ERR_TABLE_READONLY);
+  DBUG_RETURN(ha_maria::write_row(buf));
+}
+
+/* Return true if the access key, secret key, region and bucket are all set */
+
+static my_bool s3_usable()
+{
+  const my_bool has_credentials= s3_access_key != 0 && s3_secret_key != 0;
+  return has_credentials && s3_region != 0 && s3_bucket != 0;
+}
+
+
+static my_bool s3_info_init(S3_INFO *info)
+{
+  if (!s3_usable())
+    return 1;                                   /* S3 is not set up */
+  lex_string_set(&info->bucket,     s3_bucket);
+  lex_string_set(&info->region,     s3_region);
+  lex_string_set(&info->secret_key, s3_secret_key);
+  lex_string_set(&info->access_key, s3_access_key);
+  lex_string_set(&info->host_name,  s3_host_name);
+  info->use_http= s3_use_http;
+  info->port= s3_port;
+  info->protocol_version= (uint8_t) s3_protocol_version;
+  return 0;                                     /* info is filled in */
+}
+
+
+/**
+   Fill information in S3_INFO including paths to table and database
+
+   Notes:
+     Database and table name are set before the s3 variables are
+     checked, so they are available even if this function returns 1.
+     This is needed by s3::drop_table
+*/
+
+static my_bool s3_info_init(S3_INFO *s3_info, const char *path,
+                            char *database_buff, size_t database_length)
+{
+  size_t copy_length;
+  set_database_and_table_from_path(s3_info, path);
+  /* The database name is not \0 terminated; copy it to the given buffer */
+  copy_length= MY_MIN(database_length, s3_info->database.length);
+  strmake(database_buff, s3_info->database.str, copy_length);
+  s3_info->base_table= s3_info->table;
+  s3_info->database.str= database_buff;
+  return s3_info_init(s3_info);
+}
+
+/*
+  Check if table is a temporary table
+
+  Returns 1 if table is a temporary table that should be stored in Aria
+  (to later be copied to S3 with a name change), otherwise 0.
+  Two kinds of names give 1: those starting with tmp_file_prefix and '-'
+  (apart from the three kinds listed in the function) and those ending
+  with #TMP#.
+*/
+
+static int is_mariadb_internal_tmp_table(const char *table_name)
+{
+  const int prefix_length= sizeof(tmp_file_prefix);  // prefix + '-'
+  /* Temporary table from ALTER TABLE */
+  if (strncmp(table_name, tmp_file_prefix "-" , prefix_length) == 0)
+  {
+    const char *kind= table_name + prefix_length;
+    /*
+      Internal temporary tables used by ALTER TABLE and ALTER PARTITION
+      should be stored in S3, all other ones in Aria on local disk
+    */
+    return (strncmp(kind, "backup-", sizeof("backup-")-1) != 0 &&
+            strncmp(kind, "exchange-", sizeof("exchange-")-1) != 0 &&
+            strncmp(kind, "temptable-", sizeof("temptable-")-1) != 0);
+  }
+  /* Otherwise only names with a #TMP# suffix are temporary */
+  const int name_length= strlen(table_name);
+  return (name_length > 5 &&
+          !strncmp(table_name + name_length - 5, "#TMP#", 5));
+}
+
+
+/**
+  Drop S3 table (or the local Aria table, for internal temporary tables)
+*/
+
+int ha_s3::delete_table(const char *name)
+{
+  char db_buff[NAME_LEN+1];
+  S3_INFO info;
+  ms3_st *connection;
+  int res;
+  DBUG_ENTER("ha_s3::delete_table");
+
+  /* The table name is filled in also when S3 is not set up (res != 0) */
+  res= s3_info_init(&info, name, db_buff, sizeof(db_buff)-1);
+
+  /* If internal on disk temporary table, let Aria take care of it */
+  if (is_mariadb_internal_tmp_table(info.table.str))
+    DBUG_RETURN(ha_maria::delete_table(name));
+  if (res)
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+  connection= s3_open_connection(&info);
+  if (!connection)
+    DBUG_RETURN(HA_ERR_NO_CONNECTION);
+  res= aria_delete_from_s3(connection, info.bucket.str, info.database.str,
+                           info.table.str, 0);
+  s3_deinit(connection);
+  DBUG_RETURN(res);
+}
+
+/*
+  The table is a temporary table as part of ALTER TABLE.
+
+  Copy the on disk 'temporary' Aria table to S3 and delete the Aria table
+*/
+
+static int move_table_to_s3(ms3_st *s3_client,
+                            S3_INFO *to_s3_info,
+                            const char *local_name,
+                            bool is_partition)
+{
+  int res;
+  DBUG_ASSERT(!is_mariadb_internal_tmp_table(to_s3_info->table.str));
+
+  res= aria_copy_to_s3(s3_client, to_s3_info->bucket.str, local_name,
+                       to_s3_info->database.str, to_s3_info->table.str,
+                       0, 0, 1, 0, !is_partition);
+  if (res)
+    return res;                                 /* Local files are kept */
+
+  /* Table is now in S3. Remove the local table files, keep .frm */
+  res= maria_delete_table_files(local_name, 1, 0);
+  return res;
+}
+
+
+/**
+   Copy an Aria table to S3 or rename a table in S3
+
+   The copy happens as part of the rename in ALTER TABLE when all data
+   is in an Aria table and we now have to copy it to S3.
+   If the table is an old table already in S3, we should just rename it.
+   Returns HA_ERR_NO_CONNECTION if the connection to S3 can't be opened.
+*/
+
+int ha_s3::rename_table(const char *from, const char *to)
+{
+  S3_INFO to_s3_info;
+  char to_name[NAME_LEN+1], frm_name[FN_REFLEN];
+  ms3_st *s3_client;
+  MY_STAT stat_info;
+  int error;
+  bool is_partition= (strstr(from, "#P#") != NULL) ||
+                     (strstr(to, "#P#") != NULL);
+  DBUG_ENTER("ha_s3::rename_table");
+
+  if (s3_info_init(&to_s3_info, to, to_name, sizeof(to_name)-1))
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+  if (!(s3_client= s3_open_connection(&to_s3_info)))
+    DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+  /*
+    Check if this is a on disk table created by ALTER TABLE that should be
+    copied to S3. We know this is the case if the table is a temporary table
+    and the file with extension reg_ext for the table is on disk
+  */
+  fn_format(frm_name, from, "", reg_ext, MYF(0));
+  if (is_mariadb_internal_tmp_table(from + dirname_length(from)) &&
+      (is_partition || my_stat(frm_name, &stat_info, MYF(0))))
+  {
+    error= move_table_to_s3(s3_client, &to_s3_info, from, is_partition);
+  }
+  else
+  {
+    char from_name[NAME_LEN+1];
+    S3_INFO from_s3_info;
+    /* The table is an internal S3 table. Do the renames */
+    s3_info_init(&from_s3_info, from, from_name, sizeof(from_name)-1);
+
+    if (is_mariadb_internal_tmp_table(to + dirname_length(to)))
+    {
+      /*
+        The table is renamed to a temporary table. This only happens
+        in the case of an ALTER PARTITION failure and there will be soon
+        a delete issued for the temporary table. The only thing we can do
+        is to remove the from table. We will get extra errors for the
+        upcoming delete but we will ignore this minor problem for now as
+        this is an unlikely event and the extra warnings are just annoying,
+        not critical.
+      */
+      error= aria_delete_from_s3(s3_client, from_s3_info.bucket.str,
+                                 from_s3_info.database.str,
+                                 from_s3_info.table.str,0);
+    }
+    else
+      error= aria_rename_s3(s3_client, to_s3_info.bucket.str,
+                          from_s3_info.database.str,
+                          from_s3_info.table.str,
+                          to_s3_info.database.str,
+                          to_s3_info.table.str,
+                          !is_partition &&
+                          !current_thd->lex->alter_info.partition_flags);
+  }
+  s3_deinit(s3_client);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   Create a s3 table.
+
+   @notes
+   One can only create an s3 table as part of ALTER TABLE
+   The table is created as a non transactional Aria table with
+   BLOCK_RECORD format
+*/
+
+int ha_s3::create(const char *name, TABLE *table_arg,
+                  HA_CREATE_INFO *ha_create_info)
+{
+  uchar *frm_ptr;
+  size_t frm_len;
+  int error;
+  TABLE_SHARE *share= table_arg->s;
+  DBUG_ENTER("ha_s3::create");
+
+  if (!(ha_create_info->options & HA_CREATE_TMP_ALTER) ||
+      ha_create_info->tmp_table())
+    DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+
+  if (share->table_type == TABLE_TYPE_SEQUENCE)
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+  /* When using partitions, S3 only supports adding and remove partitions */
+  if ((table_arg->in_use->lex->alter_info.partition_flags &
+       ~(ALTER_PARTITION_REMOVE | ALTER_PARTITION_ADD | ALTER_PARTITION_INFO)))
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+  if (!s3_usable())
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+  /* Force the table to a format suitable for S3 */
+  ha_create_info->row_type= ROW_TYPE_PAGE;
+  ha_create_info->transactional= HA_CHOICE_NO;
+  error= ha_maria::create(name, table_arg, ha_create_info);
+  if (error)
+    DBUG_RETURN(error);
+
+#ifdef MOVE_FILES_TO_S3_ON_CREATE
+  /*
+    If we are in ADD PARTITION and we created a new table (not
+    temporary table, which will be moved as part of the final rename),
+    we should move it S3 right away. The other option would to move
+    it as part of close(). We prefer to do this here as there is no error
+    checking with close() which would leave incomplete tables around in
+    case of failures. The downside is that we can't move rows around as
+    part of changing partitions, but that is not a big problem with S3
+    as it's readonly anyway.
+  */
+  if (!is_mariadb_internal_tmp_table(name + dirname_length(name)) &&
+      strstr(name, "#P#"))
+  {
+    S3_INFO to_s3_info;
+    char database[NAME_LEN+1];
+    ms3_st *s3_client;
+    if (s3_info_init(&to_s3_info, name, database, sizeof(database)-1))
+      DBUG_RETURN(HA_ERR_UNSUPPORTED);
+    if (!(s3_client= s3_open_connection(&to_s3_info)))
+      DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+    /* Note that if error is set, then the empty temp table was not removed */
+    error= move_table_to_s3(s3_client, &to_s3_info, name, 1);
+    s3_deinit(s3_client);
+    if (error)
+      maria_delete_table_files(name, 1, 0);
+  }
+  else
+#endif /* MOVE_FILES_TO_S3_ON_CREATE */
+  {
+    /* Create the .frm file. Needed for ha_s3::rename_table() later  */
+    if (!table_arg->s->read_frm_image((const uchar**) &frm_ptr, &frm_len))
+    {
+      table_arg->s->write_frm_image(frm_ptr, frm_len);
+      table_arg->s->free_frm_image(frm_ptr);
+    }
+  }
+  DBUG_RETURN(error);
+}
+
+/**
+   Open table
+
+   @notes
+   Table is read only, except if opened by ALTER as in this case we
+   are creating the S3 table.
+*/
+
+int ha_s3::open(const char *name, int mode, uint open_flags)
+{
+  bool internal_tmp_table= 0;
+  int res;
+  S3_INFO s3_info;
+  DBUG_ENTER("ha_s3::open");
+
+  if (!s3_usable())
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+  /*
+    On slaves with s3_slave_ignore_updates set we allow tables to be
+    opened in write mode to be able to ignore queries that modify
+    the table through handler::check_if_updates_are_ignored().
+
+    This is needed for the slave to be able to handle
+    CREATE TABLE t1...
+    INSERT INTO TABLE t1 ....
+    ALTER TABLE t1 ENGINE=S3
+    If this is not done, the insert will fail on the slave if the
+    master has already executed the ALTER TABLE.
+
+    We also have to allow open for create, as part of
+    ALTER TABLE ... ENGINE=S3.
+
+    Otherwise we only allow the table to be open in read mode
+  */
+  if (mode != O_RDONLY && !(open_flags & HA_OPEN_FOR_CREATE) &&
+      !s3_slave_ignore_updates)
+    DBUG_RETURN(EACCES);
+
+  open_args= 0;
+  internal_tmp_table= is_mariadb_internal_tmp_table(name +
+                                                    dirname_length(name));
+
+  if (!(open_flags & HA_OPEN_FOR_CREATE) && !internal_tmp_table)
+  {
+    (void) s3_info_init(&s3_info);
+    s3_info.tabledef_version= table->s->tabledef_version;
+    s3_info.base_table= table->s->table_name;
+
+    /* Pass the above arguments to maria_open() */
+    open_args= &s3_info;
+    in_alter_table= S3_NO_ALTER;
+  }
+  else
+  {
+    /*
+      Table was created as an Aria table that will be moved to S3 either
+      by rename_table() or external_lock()
+    */
+    bool is_partition= (strstr(name, "#P#") != NULL);
+    in_alter_table= (!is_partition ? S3_ALTER_TABLE :
+                     internal_tmp_table ? S3_ADD_TMP_PARTITION :
+                     S3_ADD_PARTITION);
+  }
+  DBUG_PRINT("info", ("in_alter_table: %d", in_alter_table));
+
+  if (!(res= ha_maria::open(name, mode, open_flags)))
+  {
+    if (open_args)
+    {
+      /*
+        Table is in S3. We have to modify the pagecache callbacks for the
+        data file, index file and for bitmap handling.
+      */
+      file->s->pagecache= &s3_pagecache;
+      file->dfile.big_block_size= file->s->kfile.big_block_size=
+        file->s->bitmap.file.big_block_size= file->s->base.s3_block_size;
+      file->s->kfile.head_blocks= file->s->base.keystart / file->s->block_size;
+    }
+  }
+  open_args= 0;                      /* s3_info is a local; don't keep it */
+  DBUG_RETURN(res);
+}
+
+
+int ha_s3::external_lock(THD * thd, int lock_type)
+{
+  int error;
+  DBUG_ENTER("ha_s3::external_lock");
+  /* Lock as ha_maria does; on unlock of a new partition, move it to S3 */
+  error= ha_maria::external_lock(thd, lock_type);
+  if (in_alter_table == S3_ADD_PARTITION && !error && lock_type == F_UNLCK)
+  {
+    /*
+      This was a new partition. All data is now copied to the table
+      so it's time to move it to S3
+    */
+
+    MARIA_SHARE *share= file->s;
+    uint org_open_count;
+
+    /* First, flush all data to the Aria table */
+    if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+                               FLUSH_RELEASE))
+      error= my_errno;
+    if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
+                               FLUSH_RELEASE))
+      error= my_errno;
+    org_open_count= share->state.open_count;
+    if (share->global_changed)
+      share->state.open_count--;
+    if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                             MA_STATE_INFO_WRITE_LOCK))
+      error= my_errno;
+    share->state.open_count= org_open_count;  /* Restore in-memory count */
+
+    if (!error)
+    {
+      S3_INFO to_s3_info;
+      char database[NAME_LEN+1], *name= file->s->open_file_name.str;
+      ms3_st *s3_client;
+
+      /* Copy data to S3 */
+      if (s3_info_init(&to_s3_info, name, database, sizeof(database)-1))
+        DBUG_RETURN(HA_ERR_UNSUPPORTED);
+      if (!(s3_client= s3_open_connection(&to_s3_info)))
+        DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+      /*
+        Note that if error is set, then the empty temp table was not
+        removed
+      */
+      error= move_table_to_s3(s3_client, &to_s3_info, name, 1);
+      s3_deinit(s3_client);
+      /* Always called, so the local files also go away if the move failed */
+      maria_delete_table_files(name, 1, 0);
+    }
+  }
+  DBUG_RETURN(error);
+}
+
+
+/******************************************************************************
+ Storage engine handler definitions
+******************************************************************************/
+
+/**
+   Create a ha_s3 handler object, allocated on mem_root
+*/
+
+static handler *s3_create_handler(handlerton *hton,
+                                  TABLE_SHARE * table,
+                                  MEM_ROOT *mem_root)
+{
+  return new (mem_root) ha_s3(hton, table);
+}
+
+
+static int s3_hton_panic(handlerton *hton, ha_panic_function flag)
+{
+  /* Free all resources for s3; only on HA_PANIC_CLOSE while s3_hton is set */
+  if (flag != HA_PANIC_CLOSE || !s3_hton)
+    return 0;
+  end_pagecache(&s3_pagecache, TRUE);
+  s3_deinit_library();
+  my_free(s3_access_key);
+  my_free(s3_secret_key);
+  s3_access_key= s3_secret_key= 0;
+  s3_hton= 0;
+  return 0;
+}
+
+
+/**
+  Check if a table is in S3 as part of discovery
+*/
+
+static int s3_discover_table(handlerton *hton, THD* thd, TABLE_SHARE *share)
+{
+  ms3_st *connection;
+  S3_INFO info;
+  S3_BLOCK frm, par;
+  int res;
+  DBUG_ENTER("s3_discover_table");
+
+  if (s3_info_init(&info))
+    DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);          // S3 is not set up
+  connection= s3_open_connection(&info);
+  if (!connection)
+    DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+  info.database= share->db;
+  info.table= info.base_table= share->table_name;
+
+  if (s3_get_def(connection, &info, &frm, "frm"))
+  {
+    s3_free(&frm);
+    s3_deinit(connection);
+    DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);          // Could not get the frm
+  }
+  /* A failure to get the par definition is ignored */
+  (void) s3_get_def(connection, &info, &par, "par");
+
+  res= share->init_from_binary_frm_image(thd, 1, frm.str, frm.length,
+                                         par.str, par.length);
+  s3_free(&frm);
+  s3_free(&par);
+  s3_deinit(connection);
+  DBUG_RETURN((my_errno= res));
+}
+
+
+/**
+  Check if a table exists
+
+   @return 0 frm doesn't exist (or S3 can't be used or reached)
+   @return 1 frm exists
+*/
+
+static int s3_discover_table_existance(handlerton *hton, const char *db,
+                                       const char *table_name)
+{
+  ms3_st *connection;
+  S3_INFO info;
+  int found;
+  DBUG_ENTER("s3_discover_table_existance");
+
+  /* Ignore names in "mysql" database to speed up boot */
+  if (strcmp(db, MYSQL_SCHEMA_NAME.str) == 0)
+    DBUG_RETURN(0);
+  if (s3_info_init(&info))
+    DBUG_RETURN(0);
+  connection= s3_open_connection(&info);
+  if (!connection)
+    DBUG_RETURN(0);
+
+  info.table.str=       table_name;
+  info.table.length=    strlen(table_name);
+  info.database.str=    db;
+  info.database.length= strlen(db);
+
+  found= (s3_frm_exists(connection, &info) == 0);
+  s3_deinit(connection);
+  DBUG_PRINT("exit", ("exists: %d", found));
+  DBUG_RETURN(found);                           // Return 1 if exists
+}
+
+
+/**
+  Return a list of all S3 tables in a database
+
+  Entries for partitions (names containing #P#) are not shown
+*/
+
+static int s3_discover_table_names(handlerton *hton __attribute__((unused)),
+                                   LEX_CSTRING *db,
+                                   MY_DIR *dir __attribute__((unused)),
+                                   handlerton::discovered_list *result)
+{
+  char aws_path[AWS_PATH_LENGTH];
+  S3_INFO s3_info;
+  ms3_st *s3_client;
+  ms3_list_st *list, *org_list= 0;
+  int error;
+  DBUG_ENTER("s3_discover_table_names");
+
+  /* Ignore names in "mysql" database to speed up boot */
+  if (!strcmp(db->str, MYSQL_SCHEMA_NAME.str))
+    DBUG_RETURN(0);
+
+  if (s3_info_init(&s3_info))
+    DBUG_RETURN(0);
+  if (!(s3_client= s3_open_connection(&s3_info)))
+    DBUG_RETURN(0);
+
+  strxnmov(aws_path, sizeof(aws_path)-1, db->str, "/", NullS);
+  if ((error= ms3_list_dir(s3_client, s3_info.bucket.str, aws_path, &org_list)))
+    goto end;
+
+  for (list= org_list ; list ; list= list->next)
+  {
+    const char *name= list->key + db->length + 1;   // Skip database and '/'
+    /* Skip an empty name, as strlen(name)-1 would wrap around for it */
+    if (*name && !strstr(name, "#P#"))
+    {
+      size_t name_length= strlen(name)-1;             // Remove end '/'
+      result->add_table(name, name_length);
+    }
+  }
+  if (org_list)
+    ms3_list_free(org_list);
+end:
+  s3_deinit(s3_client);
+  DBUG_RETURN(0);
+}
+
+/*
+  Check if definition of table in S3 is same as in MariaDB.
+  This also covers the case where the table is not in S3 anymore.
+
+  Called when a copy of the S3 table is taken from the MariaDB table cache
+
+  TODO: Could possibly be optimized by checking if the file on S3 is
+        of same time, data and size since when table was originally opened.
+*/
+
+int ha_s3::discover_check_version()
+{
+  S3_INFO check_info= *file->s->s3_path;
+  /*
+    The table may be one part of a partitioned table. The frm file to
+    check is then the one of the partitioned table, not of the part.
+  */
+  check_info.base_table= table->s->table_name;
+  check_info.tabledef_version= table->s->tabledef_version;
+  if (s3_check_frm_version(file->s3, &check_info))
+    return HA_ERR_TABLE_DEF_CHANGED;
+  return 0;
+}
+
+
+/**
+  Update the .frm file in S3 (returns 0 on success or when nothing is done)
+*/
+
+static int s3_notify_tabledef_changed(handlerton *,
+                                      LEX_CSTRING *db, LEX_CSTRING *table,
+                                      LEX_CUSTRING *frm,
+                                      LEX_CUSTRING *org_tabledef_version,
+                                      handler *)
+{
+  ms3_st *connection;
+  S3_INFO info;
+  char frm_path[AWS_PATH_LENGTH];
+  int res= 0;
+  DBUG_ENTER("s3_notify_tabledef_changed");
+
+  if (strstr(table->str, "#P#"))
+    DBUG_RETURN(0);                             // Ignore partitions
+
+  if (s3_info_init(&info))
+    DBUG_RETURN(0);
+  connection= s3_open_connection(&info);
+  if (!connection)
+    DBUG_RETURN(0);
+
+  info.tabledef_version= *org_tabledef_version;
+  info.base_table=  *table;
+  info.database=    *db;
+
+  if (s3_check_frm_version(connection, &info))
+    res= 1;                                     // Version check failed
+  else
+  {
+    /* Store the new frm as database/table/frm in the bucket */
+    strxnmov(frm_path, sizeof(frm_path)-1, db->str, "/", table->str, "/frm",
+             NullS);
+    if (s3_put_object(connection, info.bucket.str, frm_path,
+                      (uchar*) frm->str, frm->length, 0))
+      res= 2;
+  }
+
+  s3_deinit(connection);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   Update the .frm and .par file of a partitioned table stored in s3
+
+   Logic is:
+   - Skip temporary tables used internally by ALTER TABLE and ALTER PARTITION
+   - In case of delete, delete the .frm and .par file from S3
+   - In case of create, copy the .frm and .par files to S3
+   - In case of rename:
+      - Delete from old_path if not internal temporary file and if exists
+      - Copy new .frm and .par file to S3
+   Returns 0, a HA_ERR_ code on setup failure or partition_copy_to_s3() result.
+   To ensure that this works with the reply logic from ALTER PARTITION
+   there should be no errors, only notes, for deletes.
+*/
+
+static int s3_create_partitioning_metadata(const char *path,
+                                           const char *old_path,
+                                           chf_create_flags action_flag)
+{
+  ms3_st *s3_client;
+  S3_INFO s3_info;
+  int error= 0;
+  char database[NAME_LEN+1];
+  const char *tmp_path;
+  DBUG_ENTER("s3_create_partitioning_metadata");
+
+  /* Path is empty in case of delete */
+  tmp_path= path ? path : old_path;
+
+  if (s3_info_init(&s3_info, tmp_path, database, sizeof(database)-1))
+    DBUG_RETURN(HA_ERR_UNSUPPORTED);
+  if (!(s3_client= s3_open_connection(&s3_info)))
+    DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+  switch (action_flag) {
+  case CHF_DELETE_FLAG:
+  case CHF_RENAME_FLAG:
+  {
+    if (!is_mariadb_internal_tmp_table(old_path + dirname_length(old_path)))
+    {
+      S3_INFO s3_info2;              // For old_path; init result is not checked
+      char database2[NAME_LEN+1];
+      s3_info_init(&s3_info2, old_path, database2, sizeof(database2)-1);
+
+      partition_delete_from_s3(s3_client, s3_info2.bucket.str,
+                               s3_info2.database.str, s3_info2.table.str,
+                               MYF(ME_NOTE)); // Return value is not used
+    }
+    if (action_flag == CHF_DELETE_FLAG)
+      break;                                  // Rename continues with the copy
+  }
+  /* Fall through */
+  case CHF_CREATE_FLAG:
+    if (!is_mariadb_internal_tmp_table(path + dirname_length(path)))
+      error= partition_copy_to_s3(s3_client, s3_info.bucket.str,
+                                  path, old_path,
+                                  s3_info.database.str, s3_info.table.str);
+    break;
+  case CHF_INDEX_FLAG:                        // Nothing to do
+    break;
+  }
+  s3_deinit(s3_client);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   Initialize s3 plugin. Returns HA_ERR_INITIALIZATION if init_pagecache() fails
+*/
+
+static int ha_s3_init(void *p)
+{
+  bool res;
+  static const char *no_exts[]= { 0 };          // Empty list of extensions
+
+  s3_hton= (handlerton *)p;
+  s3_hton->db_type= DB_TYPE_S3;
+  s3_hton->create= s3_create_handler;
+  s3_hton->panic=  s3_hton_panic;
+  s3_hton->table_options= s3_table_option_list;
+  s3_hton->discover_table= s3_discover_table;
+  s3_hton->discover_table_names= s3_discover_table_names;
+  s3_hton->discover_table_existence= s3_discover_table_existance;
+  s3_hton->notify_tabledef_changed= s3_notify_tabledef_changed;
+  s3_hton->create_partitioning_metadata= s3_create_partitioning_metadata;
+  s3_hton->tablefile_extensions= no_exts;
+  s3_hton->commit= 0;                           // Hooks below are set to 0
+  s3_hton->rollback= 0;
+  s3_hton->checkpoint_state= 0;
+  s3_hton->flush_logs= 0;
+  s3_hton->show_status= 0;
+  s3_hton->prepare_for_backup= 0;
+  s3_hton->end_backup= 0;
+  s3_hton->flags= ((s3_slave_ignore_updates ? HTON_IGNORE_UPDATES : 0) |
+                   (s3_replicate_alter_as_create_select ?
+                    HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE : 0));
+  /* Copy global arguments to s3_access_key and s3_secret_key */
+  update_access_key(0,0,0,0);
+  update_secret_key(0,0,0,0);
+
+  if ((res= !init_pagecache(&s3_pagecache,
+                            (size_t) s3_pagecache_buffer_size,
+                            s3_pagecache_division_limit,
+                            s3_pagecache_age_threshold, maria_block_size,
+                            s3_pagecache_file_hash_size, 0)))
+    s3_hton= 0;                                 // init_pagecache() returned 0
+  s3_pagecache.big_block_read= s3_block_read;   // Note: done even if res is set
+  s3_pagecache.big_block_free= s3_free;
+  s3_init_library();
+  if (s3_debug)
+    ms3_debug();
+  /* Fill in the s3f function table */
+  struct s3_func s3f_real =
+  {
+    ms3_set_option, s3_free, ms3_deinit, s3_unique_file_number,
+    read_index_header, s3_check_frm_version, s3_info_copy,
+    set_database_and_table_from_path, s3_open_connection
+  };
+  s3f= s3f_real;
+
+  return res ? HA_ERR_INITIALIZATION : 0;
+}
+
+static int ha_s3_deinit(void*)                  // Plugin deinit; always returns 0
+{
+  bzero(&s3f, sizeof(s3f));                     // Zero the s3f function table
+  return 0;
+}
+
+static SHOW_VAR status_variables[]= {           // All values are s3_pagecache members
+  {"pagecache_blocks_not_flushed",
+   (char*) &s3_pagecache.global_blocks_changed, SHOW_LONG},
+  {"pagecache_blocks_unused",
+   (char*) &s3_pagecache.blocks_unused, SHOW_LONG},
+  {"pagecache_blocks_used",
+   (char*) &s3_pagecache.blocks_used, SHOW_LONG},
+  {"pagecache_read_requests",
+   (char*) &s3_pagecache.global_cache_r_requests, SHOW_LONGLONG},
+  {"pagecache_reads",
+   (char*) &s3_pagecache.global_cache_read, SHOW_LONGLONG},
+  {NullS, NullS, SHOW_LONG}                     // Last entry: NullS name and value
+};
+
+
+static struct st_mysql_sys_var* system_variables[]= { // Used in plugin declaration
+  MYSQL_SYSVAR(block_size),
+  MYSQL_SYSVAR(debug),
+  MYSQL_SYSVAR(protocol_version),
+  MYSQL_SYSVAR(pagecache_age_threshold),
+  MYSQL_SYSVAR(pagecache_buffer_size),
+  MYSQL_SYSVAR(pagecache_division_limit),
+  MYSQL_SYSVAR(pagecache_file_hash_size),
+  MYSQL_SYSVAR(host_name),
+  MYSQL_SYSVAR(port),
+  MYSQL_SYSVAR(use_http),
+  MYSQL_SYSVAR(bucket),
+  MYSQL_SYSVAR(access_key),
+  MYSQL_SYSVAR(secret_key),
+  MYSQL_SYSVAR(region),
+  MYSQL_SYSVAR(slave_ignore_updates),
+  MYSQL_SYSVAR(replicate_alter_as_create_select),
+  NULL                                          // Last entry
+};
+
+struct st_mysql_storage_engine s3_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };         // Used by maria_declare_plugin(s3)
+
+maria_declare_plugin(s3)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,  /* Plugin type      */
+  &s3_storage_engine,           /* Plugin info      */
+  "S3",                         /* Plugin name      */
+  "MariaDB Corporation Ab",     /* Author           */
+  "Read only table stored in S3. Created by running "
+  "ALTER TABLE table_name ENGINE=s3",  /* Description */
+  PLUGIN_LICENSE_GPL,           /* License          */
+  ha_s3_init,                   /* Plugin Init      */
+  ha_s3_deinit,                 /* Plugin Deinit    */
+  0x0100,                       /* 1.0              */
+  status_variables,             /* status variables */
+  system_variables,             /* system variables */
+  "1.0",                        /* string version   */
+  MariaDB_PLUGIN_MATURITY_STABLE/* maturity         */
+}
+maria_declare_plugin_end;
diff --git a/storage/maria/ha_s3.h b/storage/maria/ha_s3.h
new file mode 100644
index 00000000000..d4b9e954154
--- /dev/null
+++ b/storage/maria/ha_s3.h
@@ -0,0 +1,75 @@
+#ifndef HA_S3_INCLUDED
+#define HA_S3_INCLUDED
+/* Copyright (C) 2019, 2020, MariaDB Corporation AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the
+   Free Software Foundation, Inc.
+   51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA
+*/
+
+#include "ha_maria.h"
+
+class ha_s3 final :public ha_maria      /* S3 handler, built on ha_maria */
+{
+  enum alter_table_op
+  { S3_NO_ALTER, S3_ALTER_TABLE, S3_ADD_PARTITION, S3_ADD_TMP_PARTITION };
+  alter_table_op in_alter_table;        /* One of the values above */
+  S3_INFO *open_args;                   /* Returned by s3_open_args() */
+
+public:
+  ha_s3(handlerton *hton, TABLE_SHARE * table_arg);
+  ~ha_s3() {}
+
+  int create(const char *name, TABLE *table_arg,
+             HA_CREATE_INFO *ha_create_info) override;
+  int open(const char *name, int mode, uint open_flags) override;
+  int write_row(const uchar *buf) override;
+  int update_row(const uchar *, const uchar *) override /* Always rejected */
+  {
+    DBUG_ENTER("update_row");
+    DBUG_RETURN(HA_ERR_TABLE_READONLY);
+  }
+  int delete_row(const uchar *) override                /* Always rejected */
+  {
+    DBUG_ENTER("delete_row");
+    DBUG_RETURN(HA_ERR_TABLE_READONLY);
+  }
+  int analyze(THD *, HA_CHECK_OPT *) override           /* Always rejected */
+  {
+    DBUG_ENTER("analyze");
+    DBUG_RETURN(HA_ERR_TABLE_READONLY);
+  }
+  int repair(THD *, HA_CHECK_OPT *) override            /* Always rejected */
+  {
+    DBUG_ENTER("repair");
+    DBUG_RETURN(HA_ERR_TABLE_READONLY);
+  }
+  int preload_keys(THD *, HA_CHECK_OPT *) override      /* Always rejected */
+  {
+    DBUG_ENTER("preload_keys");
+    DBUG_RETURN(HA_ERR_TABLE_READONLY);
+  }
+  int external_lock(THD * thd, int lock_type) override;
+  /*
+    drop_table() is only used for internal temporary tables,
+    not applicable for s3
+  */
+  void drop_table(const char *) override {}
+  int delete_table(const char *name) override;
+  int rename_table(const char *from, const char *to) override;
+  int discover_check_version() override;
+  int rebind();
+  S3_INFO *s3_open_args() override { return open_args; }
+  void register_handler(MARIA_HA *file) override;
+};
+#endif /* HA_S3_INCLUDED */
diff --git a/storage/maria/libmarias3 b/storage/maria/libmarias3
new file mode 160000
+Subproject 3846890513df0653b8919bc45a7600f9b55cab3
diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c
index 916b70b8344..4cf6a46eff4 100644
--- a/storage/maria/lockman.c
+++ b/storage/maria/lockman.c
@@ -555,7 +555,7 @@ static void initialize_bucket(LOCKMAN *lm, LOCK * volatile *node,
 {
   int res;
   uint parent= my_clear_highest_bit(bucket);
-  LOCK *dummy= (LOCK *)my_malloc(sizeof(LOCK), MYF(MY_WME));
+  LOCK *dummy= (LOCK *)my_malloc(PSI_INSTRUMENT_ME, sizeof(LOCK), MYF(MY_WME));
   LOCK **tmp= 0, *cur;
   LOCK * volatile *el= lf_dynarray_lvalue(&lm->array, parent);
 
diff --git a/storage/maria/ma_backup.c b/storage/maria/ma_backup.c
index 8f20209c48a..0384dfb4cc5 100644
--- a/storage/maria/ma_backup.c
+++ b/storage/maria/ma_backup.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2018 MariaDB corporation
+/* Copyright (C) 2018, 2020 MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -20,10 +20,8 @@
 #include "ma_checkpoint.h"
 #include <aria_backup.h>
 
-static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
-
 /**
-  @brief Get capabilites for an Aria table
+  @brief Get capabilities for an Aria table
 
   @param kfile   key file (.MAI)
   @param cap     Capabilities are stored here
@@ -32,6 +30,7 @@ static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
   @return X      errno
 */
 
+int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap)__attribute__((visibility("default"))) ;
 int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap)
 {
   MARIA_SHARE share;
@@ -59,7 +58,7 @@ int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap)
     Allocate space for header information and for data that is too
     big to keep on stack
   */
-  if (!(disc_cache= my_malloc(info_length, MYF(MY_WME))))
+  if (!(disc_cache= my_malloc(PSI_NOT_INSTRUMENTED, info_length, MYF(MY_WME))))
     DBUG_RETURN(ENOMEM);
 
   if (my_pread(kfile, disc_cache, info_length, 0L, MYF(MY_NABP)))
@@ -77,6 +76,11 @@ int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap)
                          0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE +
                         KEYPAGE_USED_SIZE);
   cap->block_size= share.base.block_size;
+  cap->data_file_type= share.state.header.data_file_type;
+  cap->s3_block_size=  share.base.s3_block_size;
+  cap->compression=    share.base.compression_algorithm;
+  cap->encrypted=      MY_TEST(share.base.extra_options &
+                               MA_EXTRA_OPTIONS_ENCRYPTED);
 
   if (share.state.header.data_file_type == BLOCK_RECORD)
   {
@@ -97,21 +101,13 @@ int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap)
 err:
   my_free(disc_cache);
   DBUG_RETURN(error);
-} /* maria_get_capabilities */
-
-
-/*
-  This is a copy of my_base_info_read from ma_open().
-  The base information will never change (something may be added
-  last, but not relevant for maria_get_capabilities), so it's safe to
-  copy it here.
-
-  The copy is done to avoid linking in the fill Aria library just
-  because maria_backup uses maria_get_capabilities()
-*/
+} /* aria_get_capabilities */
 
+/****************************************************************************
+**  store MARIA_BASE_INFO
+****************************************************************************/
 
-static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
+uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
 {
   bmove(base->uuid, ptr, MY_UUID_SIZE);                 ptr+= MY_UUID_SIZE;
   base->keystart= mi_sizekorr(ptr);			ptr+= 8;
@@ -142,14 +138,15 @@ static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
   base->keys=	       *ptr++;
   base->auto_key=      *ptr++;
   base->born_transactional= *ptr++;
-  ptr++;
+  base->compression_algorithm= *ptr++;
   base->pack_bytes= mi_uint2korr(ptr);			ptr+= 2;
   base->blobs= mi_uint2korr(ptr);			ptr+= 2;
   base->max_key_block_length= mi_uint2korr(ptr);	ptr+= 2;
   base->max_key_length= mi_uint2korr(ptr);		ptr+= 2;
   base->extra_alloc_bytes= mi_uint2korr(ptr);		ptr+= 2;
   base->extra_alloc_procent= *ptr++;
-  ptr+= 16;
+  base->s3_block_size= mi_uint3korr(ptr);               ptr+= 3;
+  ptr+= 13;
   return ptr;
 }
 
diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c
index 4f3a2ae5f89..49604fa43f6 100644
--- a/storage/maria/ma_bitmap.c
+++ b/storage/maria/ma_bitmap.c
@@ -239,8 +239,8 @@ my_bool _ma_bitmap_init(MARIA_SHARE *share, File file,
   size*= 2;
 #endif
 
-  if (((bitmap->map= (uchar*) my_malloc(size, flag)) == NULL) ||
-      my_init_dynamic_array(&bitmap->pinned_pages,
+  if (!((bitmap->map= (uchar*) my_malloc(PSI_INSTRUMENT_ME, size, flag))) ||
+      my_init_dynamic_array(PSI_INSTRUMENT_ME, &bitmap->pinned_pages,
                             sizeof(MARIA_PINNED_PAGE), 1, 1, flag))
     return 1;
 
@@ -520,9 +520,10 @@ my_bool _ma_bitmap_flush_all(MARIA_SHARE *share)
 #ifdef EXTRA_DEBUG_BITMAP
     {
       char tmp[MAX_BITMAP_INFO_LENGTH];      
-      _ma_get_bitmap_description(bitmap, bitmap->map, bitmap->page, tmp);
+      size_t len;
+      len= _ma_get_bitmap_description(bitmap, bitmap->map, bitmap->page, tmp);
       (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
-                                     (uchar*) tmp, strlen(tmp));
+                                     (uchar*) tmp, len);
     }
 #endif
 
@@ -958,13 +959,13 @@ void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
   Return content of bitmap as a printable string
 */
 
-void _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap,
-                                uchar *bitmap_data,
-                                pgcache_page_no_t page,
-                                char *out)
+size_t _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap,
+                                  uchar *bitmap_data,
+                                  pgcache_page_no_t page,
+                                  char *out)
 {
   uchar *pos, *end;
-  uint count=0, dot_printed= 0, len;
+  size_t count=0, dot_printed= 0, len;
   char buff[80], last[80];
 
   page++;
@@ -981,7 +982,7 @@ void _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap,
         if (memcmp(buff, last, count))
         {
           memcpy(last, buff, count);
-          len= sprintf(out, "%8lu: ", (ulong) page - count);
+          len= sprintf(out, "%8lu: ", (ulong) (page - count));
           memcpy(out+len, buff, count);
           out+= len + count + 1;
           out[-1]= '\n';
@@ -997,10 +998,11 @@ void _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap,
       page++;
     }
   }
-  len= sprintf(out, "%8lu: ", (ulong) page - count);
+  len= sprintf(out, "%8lu: ", (ulong) (page - count));
   memcpy(out+len, buff, count);
   out[len + count]= '\n';
   out[len + count + 1]= 0;
+  return len + count + 1;
 }
 
 
diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c
index 667cc26758d..e628c5ba5f3 100644
--- a/storage/maria/ma_blockrec.c
+++ b/storage/maria/ma_blockrec.c
@@ -455,11 +455,14 @@ my_bool _ma_once_end_block_record(MARIA_SHARE *share)
       File must be synced as it is going out of the maria_open_list and so
       becoming unknown to Checkpoint.
     */
-    if (share->now_transactional &&
-        mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
-      res= 1;
-    if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
-      res= 1;
+    if (!share->s3_path)
+    {
+      if (share->now_transactional &&
+          mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
+        res= 1;
+      if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
+        res= 1;
+    }
     /*
       Trivial assignment to guard against multiple invocations
       (May happen if file are closed but we want to keep the maria object
@@ -489,7 +492,7 @@ my_bool _ma_init_block_record(MARIA_HA *info)
   uint default_extents;
   DBUG_ENTER("_ma_init_block_record");
 
-  if (!my_multi_malloc(flag,
+  if (!my_multi_malloc(PSI_INSTRUMENT_ME, flag,
                        &row->empty_bits, share->base.pack_bytes,
                        &row->field_lengths,
                        share->base.max_field_lengths + 2,
@@ -528,11 +531,13 @@ my_bool _ma_init_block_record(MARIA_HA *info)
                      FULL_PAGE_SIZE(share) /
                      BLOB_SEGMENT_MIN_SIZE));
 
-  if (my_init_dynamic_array(&info->bitmap_blocks, sizeof(MARIA_BITMAP_BLOCK),
+  if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &info->bitmap_blocks,
+                            sizeof(MARIA_BITMAP_BLOCK),
                             default_extents, 64, flag))
     goto err;
   info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
-  if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
+  if (!(info->cur_row.extents= my_malloc(PSI_INSTRUMENT_ME,
+                                         info->cur_row.extents_buffer_length,
                                          flag)))
     goto err;
 
@@ -2455,11 +2460,12 @@ static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
     /* Compact events by removing filler and tail events */
     uchar *new_block= 0;
     uchar *end, *to, *compact_extent_info;
-    my_bool res;
+    my_bool res, buff_alloced;
     uint extents_count;
 
-    if (!(compact_extent_info= my_alloca(row->extents_count *
-                                         ROW_EXTENT_SIZE)))
+    alloc_on_stack(*info->stack_end_ptr, compact_extent_info, buff_alloced,
+                   row->extents_count * ROW_EXTENT_SIZE);
+    if (!compact_extent_info)
       DBUG_RETURN(1);
 
     to= compact_extent_info;
@@ -2498,7 +2504,7 @@ static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
         No ranges. This happens in the rear case when we have a allocated
         place for a blob on a tail page but it did fit into the main page.
       */
-      my_afree(compact_extent_info);
+      stack_alloc_free(compact_extent_info, buff_alloced);
       DBUG_RETURN(0);
     }
     extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
@@ -2513,7 +2519,7 @@ static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
                                                   extents_length),
                                TRANSLOG_INTERNAL_PARTS + 2, log_array,
                                log_data, NULL);
-    my_afree(compact_extent_info);
+    stack_alloc_free(compact_extent_info, buff_alloced);
     if (res)
       DBUG_RETURN(1);
   }
@@ -3232,7 +3238,7 @@ static my_bool write_block_record(MARIA_HA *info,
     }
     else
     {
-      if (!my_multi_malloc(MY_WME, &log_array,
+      if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME), &log_array,
                           (uint) ((bitmap_blocks->count +
                                    TRANSLOG_INTERNAL_PARTS + 2) *
                                   sizeof(*log_array)),
@@ -3678,6 +3684,15 @@ my_bool _ma_write_abort_block_record(MARIA_HA *info)
   _ma_bitmap_unlock(share);
   if (share->now_transactional)
   {
+    /*
+      Write clr to mark end of aborted row insert.
+      The above delete_head_or_tail() calls will only log redo, not undo.
+      The undo just before the row insert is stored in row->orig_undo_lsn.
+
+      When applying undo's, we can skip all undo records between current
+      lsn and row->orig_undo_lsn as logically things are as before the
+      attempted insert.
+    */
     if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
                       LOGREC_UNDO_ROW_INSERT,
                       share->calc_checksum != 0,
@@ -5192,13 +5207,12 @@ my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
   uchar *org_rec_buff, *old_record;
   size_t org_rec_buff_size;
   int error;
+  my_bool buff_alloced;
   DBUG_ENTER("_ma_cmp_block_unique");
 
-  /*
-    Don't allocate more than 16K on the stack to ensure we don't get
-    stack overflow.
-  */
-  if (!(old_record= my_safe_alloca(info->s->base.reclength)))
+  alloc_on_stack(*info->stack_end_ptr, old_record, buff_alloced,
+                 info->s->base.reclength);
+  if (!old_record)
     DBUG_RETURN(1);
 
   /* Don't let the compare destroy blobs that may be in use */
@@ -5220,7 +5234,7 @@ my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
     info->rec_buff_size= org_rec_buff_size;
   }
   DBUG_PRINT("exit", ("result: %d", error));
-  my_safe_afree(old_record, info->s->base.reclength);
+  stack_alloc_free(old_record, buff_alloced);
   DBUG_RETURN(error != 0);
 }
 
@@ -5257,7 +5271,8 @@ my_bool _ma_scan_init_block_record(MARIA_HA *info)
   */
   if (!(info->scan.bitmap_buff ||
         ((info->scan.bitmap_buff=
-          (uchar *) my_malloc(share->block_size * 2, flag)))))
+          (uchar *) my_malloc(PSI_INSTRUMENT_ME, share->block_size * 2,
+                              flag)))))
     DBUG_RETURN(1);
   info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
   info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size;
@@ -5312,7 +5327,8 @@ int _ma_scan_remember_block_record(MARIA_HA *info,
   DBUG_ENTER("_ma_scan_remember_block_record");
   if (!(info->scan_save))
   {
-    if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) +
+    if (!(info->scan_save= my_malloc(PSI_INSTRUMENT_ME,
+                                     ALIGN_SIZE(sizeof(*info->scan_save)) +
                                      info->s->block_size * 2,
                                      MYF(MY_WME))))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -7136,7 +7152,8 @@ my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
     row.blob_length= ma_get_length(&header);
 
   /* We need to build up a record (without blobs) in rec_buff */
-  if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
+  if (!(record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength,
+                          MYF(MY_WME))))
     DBUG_RETURN(1);
 
   memcpy(record, null_bits, share->base.null_bytes);
@@ -7361,7 +7378,8 @@ my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
   field_length_data_end= header;
 
   /* Allocate buffer for current row & original row */
-  if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
+  if (!(current_record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength * 2,
+                                  MYF(MY_WME))))
     DBUG_RETURN(1);
   orig_record= current_record+ share->base.reclength;
 
diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h
index 0382eb44006..42546ebdd3f 100644
--- a/storage/maria/ma_blockrec.h
+++ b/storage/maria/ma_blockrec.h
@@ -245,10 +245,10 @@ void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file,
 void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
                       pgcache_page_no_t page);
 #endif
-void _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap,
-                                uchar *bitmap_data,
-                                pgcache_page_no_t page,
-                                char *out);
+size_t _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap,
+                                  uchar *bitmap_data,
+                                  pgcache_page_no_t page,
+                                  char *out);
 
 uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                             uint page_type,
@@ -306,7 +306,7 @@ my_bool write_hook_for_file_id(enum translog_record_type type,
 my_bool write_hook_for_commit(enum translog_record_type type,
                               TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
                               void *hook_arg);
-void _ma_block_get_status(void *param, my_bool concurrent_insert);
+my_bool _ma_block_get_status(void *param, my_bool concurrent_insert);
 my_bool _ma_block_start_trans(void* param);
 my_bool _ma_block_start_trans_no_versioning(void *param);
 void _ma_block_update_status(void *param);
diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c
index 98f9c49b9fa..f8377df8350 100644
--- a/storage/maria/ma_check.c
+++ b/storage/maria/ma_check.c
@@ -46,8 +46,7 @@
 #include "trnman.h"
 #include "ma_key_recover.h"
 #include <my_check_opt.h>
-
-#include <stdarg.h>
+#include <my_stack_alloc.h>
 #include <my_getopt.h>
 #ifdef HAVE_SYS_VADVISE_H
 #include <sys/vadvise.h>
@@ -75,11 +74,11 @@ static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
 static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a);
 static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo, const uchar *key);
 static int sort_insert_key(MARIA_SORT_PARAM  *sort_param,
-                           reg1 SORT_KEY_BLOCKS *key_block,
+                           reg1 MA_SORT_KEY_BLOCKS *key_block,
 			   const uchar *key, my_off_t prev_block);
 static int sort_delete_record(MARIA_SORT_PARAM *sort_param);
 /*static int _ma_flush_pending_blocks(HA_CHECK *param);*/
-static SORT_KEY_BLOCKS	*alloc_key_blocks(HA_CHECK *param, uint blocks,
+static MA_SORT_KEY_BLOCKS	*alloc_key_blocks(HA_CHECK *param, uint blocks,
 					  uint buffer_length);
 static ha_checksum maria_byte_checksum(const uchar *buf, uint length);
 static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share);
@@ -115,7 +114,7 @@ void maria_chk_init(HA_CHECK *param)
   param->use_buffers= PAGE_BUFFER_INIT;
   param->read_buffer_length=READ_BUFFER_INIT;
   param->write_buffer_length=READ_BUFFER_INIT;
-  param->sort_buffer_length=SORT_BUFFER_INIT;
+  param->orig_sort_buffer_length=SORT_BUFFER_INIT;
   param->sort_key_blocks=BUFFERS_WHEN_SORTING;
   param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
   param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL);
@@ -124,6 +123,8 @@ void maria_chk_init(HA_CHECK *param)
   param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE;
   param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL;
   param->max_stage= 1;
+  param->stack_end_ptr= &my_thread_var->stack_ends_here;
+  param->max_allowed_lsn= (LSN) ~0ULL;
 }
 
 
@@ -412,6 +413,12 @@ int maria_chk_size(HA_CHECK *param, register MARIA_HA *info)
   char buff[22],buff2[22];
   DBUG_ENTER("maria_chk_size");
 
+  if (info->s3)
+  {
+    /* We cannot check file sizes for S3 */
+    DBUG_RETURN(0);
+  }
+
   if (!(param->testflag & T_SILENT))
     puts("- check file-size");
 
@@ -590,8 +597,8 @@ int maria_chk_key(HA_CHECK *param, register MARIA_HA *info)
       {
 	_ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff),
 		    llstr(share->state.state.records,buff2));
-	if (!(param->testflag & T_INFO))
-	DBUG_RETURN(-1);
+	if (!(param->testflag & (T_INFO | T_EXTEND)))
+          DBUG_RETURN(-1);
 	result= -1;
 	continue;
       }
@@ -860,7 +867,8 @@ static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   MARIA_SHARE *share= info->s;
   char llbuff[22];
   uint diff_pos[2];
-  uchar tmp_key_buff[MARIA_MAX_KEY_BUFF];
+  uchar *tmp_key_buff;
+  my_bool temp_buff_alloced;
   MARIA_KEY tmp_key;
   DBUG_ENTER("chk_index");
   DBUG_DUMP("buff", anc_page->buff, anc_page->size);
@@ -869,11 +877,14 @@ static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   if (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX))
     DBUG_RETURN(0);
 
-  if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+  alloc_on_stack(*param->stack_end_ptr, temp_buff, temp_buff_alloced,
+                 (keyinfo->block_length + keyinfo->max_store_length));
+  if (!temp_buff)
   {
     _ma_check_print_error(param,"Not enough memory for keyblock");
     DBUG_RETURN(-1);
   }
+  tmp_key_buff= temp_buff+ keyinfo->block_length;
 
   if (keyinfo->flag & HA_NOSAME)
   {
@@ -899,15 +910,33 @@ static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
     _ma_check_print_error(param, "Page at %s is not marked for index %u",
                           llstr(anc_page->pos, llbuff),
                           (uint) keyinfo->key_nr);
-  if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
-      !share->base.born_transactional)
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
   {
-    _ma_check_print_error(param,
-                          "Page at %s is marked with HAS_TRANSID even if "
-                          "table is not transactional",
-                          llstr(anc_page->pos, llbuff));
+    if (!share->base.born_transactional)
+    {
+      _ma_check_print_error(param,
+                            "Page at %s is marked with HAS_TRANSID even if "
+                            "table is not transactional",
+                            llstr(anc_page->pos, llbuff));
+    }
+  }
+  if (share->base.born_transactional)
+  {
+    LSN lsn= lsn_korr(anc_page->buff);
+    if ((ulonglong) lsn > param->max_allowed_lsn)
+    {
+      /* Avoid flooding of errors */
+      if (param->skip_lsn_error_count++ < MAX_LSN_ERRORS)
+      {
+        _ma_check_print_error(param,
+                              "Page at %s as wrong LSN " LSN_FMT ". Current "
+                              "LSN is " LSN_FMT,
+                              llstr(anc_page->pos, llbuff),
+                              LSN_IN_PARTS(lsn),
+                              LSN_IN_PARTS(param->max_allowed_lsn));
+      }
+    }
   }
-
   if (anc_page->size > share->max_index_block_size)
   {
     _ma_check_print_error(param,
@@ -1065,10 +1094,10 @@ static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
                           (uint) (keypos - anc_page->buff));
     goto err;
   }
-  my_afree(temp_buff);
+  stack_alloc_free(temp_buff, temp_buff_alloced);
   DBUG_RETURN(0);
  err:
-  my_afree(temp_buff);
+  stack_alloc_free(temp_buff, temp_buff_alloced);
   DBUG_RETURN(1);
 } /* chk_index */
 
@@ -1115,8 +1144,8 @@ static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo)
 
 
 
-static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos,
-                              char *buff)
+static char * record_pos_to_txt(MARIA_HA *info, my_off_t recpos,
+                                char *buff)
 {
   if (info->s->data_file_type != BLOCK_RECORD)
     llstr(recpos, buff);
@@ -1128,6 +1157,7 @@ static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos,
     *(end++)= ':';
     longlong10_to_str(row, end, 10);
   }
+  return buff;
 }
 
 
@@ -1191,11 +1221,14 @@ static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend,
             _ma_search(info, &key, SEARCH_SAME, share->state.key_root[keynr]);
           if (search_result)
           {
-            record_pos_to_txt(info, start_recpos, llbuff);
             _ma_check_print_error(param,
                                   "Record at: %14s  "
                                   "Can't find key for index: %2d",
-                                  llbuff, keynr+1);
+                                  record_pos_to_txt(info, start_recpos,
+                                                    llbuff),
+                                  keynr+1);
+            if (param->testflag & T_VERBOSE)
+              _ma_print_key(stdout, &key);
             if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
               return -1;
           }
@@ -1538,6 +1571,7 @@ static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend,
                             my_errno, llstr(block_info.filepos, llbuff));
       DBUG_RETURN(1);
     }
+    info->rec_buff[block_info.rec_len]= 0;  /* Keep valgrind happy */
     if (_ma_pack_rec_unpack(info, &info->bit_buff, record,
                             info->rec_buff, block_info.rec_len))
     {
@@ -1841,6 +1875,7 @@ static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend,
   ha_rows full_page_count, tail_count;
   my_bool UNINIT_VAR(full_dir), now_transactional;
   uint offset_page, offset, free_count;
+  LSN lsn;
 
   if (_ma_scan_init_block_record(info))
   {
@@ -1985,6 +2020,23 @@ static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend,
       if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
         goto err;
     }
+    if (share->base.born_transactional)
+    {
+      lsn= lsn_korr(page_buff);
+      if ((ulonglong) lsn > param->max_allowed_lsn)
+      {
+        /* Avoid flooding of errors */
+        if (param->skip_lsn_error_count++ < MAX_LSN_ERRORS)
+        {
+          _ma_check_print_error(param,
+                                "Page %9s:  Wrong LSN " LSN_FMT ". Current "
+                                "LSN is " LSN_FMT,
+                                llstr(page, llbuff),
+                                LSN_IN_PARTS(lsn),
+                                LSN_IN_PARTS(param->max_allowed_lsn));
+        }
+      }
+    }
     if ((enum en_page_type) page_type == BLOB_PAGE)
       continue;
     param->empty+= empty_space;
@@ -2075,7 +2127,8 @@ int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend)
       puts("- check record links");
   }
 
-  if (!(record= (uchar*) my_malloc(share->base.default_rec_buff_size, MYF(0))))
+  if (!(record= (uchar*) my_malloc(PSI_INSTRUMENT_ME,
+                                   share->base.default_rec_buff_size, MYF(0))))
   {
     _ma_check_print_error(param,"Not enough memory for record");
     DBUG_RETURN(-1);
@@ -2350,6 +2403,8 @@ static int initialize_variables_for_repair(HA_CHECK *param,
                                            MARIA_SHARE *org_share)
 {
   MARIA_SHARE *share= info->s;
+  size_t tmp;
+  uint threads;
 
   /*
     We have to clear these variables first, as the cleanup-in-case-of-error
@@ -2371,7 +2426,7 @@ static int initialize_variables_for_repair(HA_CHECK *param,
 
   /* Repair code relies on share->state.state so we have to update it here */
   if (share->lock.update_status)
-    (*share->lock.update_status)(info);
+    (*share->lock.update_status)(info->lock.status_param);
 
   param->testflag|= T_REP;                     /* for easy checking */
   if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
@@ -2410,6 +2465,7 @@ static int initialize_variables_for_repair(HA_CHECK *param,
 
   /* calculate max_records */
   sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+
   param->max_progress= sort_info->filelength;
   if ((param->testflag & T_CREATE_MISSING_KEYS) ||
       sort_info->org_data_file_type == COMPRESSED_RECORD)
@@ -2422,6 +2478,21 @@ static int initialize_variables_for_repair(HA_CHECK *param,
     sort_info->max_records= (ha_rows) (sort_info->filelength / rec_length);
   }
 
+  /* We don't need a bigger sort buffer than file_length * 8 */
+  threads= (param->testflag & T_REP_PARALLEL) ? (uint) share->base.keys : 1;
+  tmp= (size_t) MY_MIN(sort_info->filelength,
+                       (my_off_t) (SIZE_T_MAX/10/threads));
+  tmp= MY_MAX(tmp * 8 * threads, (size_t) 65536);         /* Some margin */
+  param->sort_buffer_length= MY_MIN(param->orig_sort_buffer_length,
+                                    tmp);
+  set_if_smaller(param->sort_buffer_length, tmp);
+  /* Protect against too big sort buffer length */
+#if SIZEOF_SIZE_T >= 8
+  set_if_smaller(param->sort_buffer_length, 16LL*1024LL*1024LL*1024LL);
+#else
+  set_if_smaller(param->sort_buffer_length, 1L*1024L*1024L*1024L);
+#endif
+
   /* Set up transaction handler so that we can see all rows */
   if (param->max_trid == 0)
   {
@@ -2692,7 +2763,7 @@ int maria_repair(HA_CHECK *param, register MARIA_HA *info,
   }
 
   if (!(sort_param.record=
-        (uchar *) my_malloc((uint)
+        (uchar *) my_malloc(PSI_INSTRUMENT_ME, (uint)
                             share->base.default_rec_buff_size, MYF(0))) ||
       _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
                        share->base.default_rec_buff_size, MYF(0)))
@@ -2734,8 +2805,11 @@ int maria_repair(HA_CHECK *param, register MARIA_HA *info,
                               "Duplicate key %2d for record at %10s against "
                               "new record at %10s",
                               info->errkey+1,
-                              llstr(sort_param.current_filepos, llbuff),
-                              llstr(info->dup_key_pos,llbuff2));
+                              record_pos_to_txt(info,
+                                                sort_param.current_filepos,
+                                                llbuff),
+                              record_pos_to_txt(info,
+                                                info->dup_key_pos, llbuff2));
       if (param->testflag & T_VERBOSE)
       {
         MARIA_KEY tmp_key;
@@ -2749,7 +2823,7 @@ int maria_repair(HA_CHECK *param, register MARIA_HA *info,
       if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
       {
         param->testflag|=T_RETRY_WITHOUT_QUICK;
-	param->error_printed=1;
+	param->error_printed++;
 	goto err;
       }
       /* purecov: begin tested */
@@ -3237,6 +3311,7 @@ static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
   MARIA_SHARE *share= info->s;
   MARIA_KEY key;
   MARIA_PAGE page;
+  my_bool buff_alloced;
   DBUG_ENTER("sort_one_index");
 
   /* cannot walk over R-tree indices */
@@ -3245,11 +3320,11 @@ static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
   param->new_file_pos+=keyinfo->block_length;
   key.keyinfo= keyinfo;
 
-  if (!(buff= (uchar*) my_alloca((uint) keyinfo->block_length +
-                                 keyinfo->maxlength +
-                                 MARIA_INDEX_OVERHEAD_SIZE)))
+  alloc_on_stack(*param->stack_end_ptr, buff, buff_alloced,
+                 keyinfo->block_length + keyinfo->max_store_length);
+  if (!buff)
   {
-    _ma_check_print_error(param,"Not enough memory for key block");
+    _ma_check_print_error(param,"Not enough memory for keyblock");
     DBUG_RETURN(-1);
   }
   key.data= buff + keyinfo->block_length;
@@ -3316,10 +3391,10 @@ static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
     _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno);
     goto err;
   }
-  my_afree(buff);
+  stack_alloc_free(buff, buff_alloced);
   DBUG_RETURN(0);
 err:
-  my_afree(buff);
+  stack_alloc_free(buff, buff_alloced);
   DBUG_RETURN(1);
 } /* sort_one_index */
 
@@ -3346,6 +3421,9 @@ static my_bool maria_zerofill_index(HA_CHECK *param, MARIA_HA *info,
   my_bool zero_lsn= (share->base.born_transactional &&
                      !(param->testflag & T_ZEROFILL_KEEP_LSN));
   int error= 1;
+  enum pagecache_page_type page_type= (share->base.born_transactional ?
+                                       PAGECACHE_LSN_PAGE :
+                                       PAGECACHE_PLAIN_PAGE);
   DBUG_ENTER("maria_zerofill_index");
 
   if (!(param->testflag & T_SILENT))
@@ -3360,7 +3438,7 @@ static my_bool maria_zerofill_index(HA_CHECK *param, MARIA_HA *info,
     if (!(buff= pagecache_read(share->pagecache,
                                &share->kfile, page,
                                DFLT_INIT_HITS, 0,
-                               PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                               page_type, PAGECACHE_LOCK_WRITE,
                                &page_link.link)))
     {
       pagecache_unlock_by_link(share->pagecache, page_link.link,
@@ -3437,6 +3515,9 @@ static my_bool maria_zerofill_data(HA_CHECK *param, MARIA_HA *info,
   uint block_size= share->block_size;
   MARIA_FILE_BITMAP *bitmap= &share->bitmap;
   my_bool zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN), error;
+  enum pagecache_page_type read_page_type= (share->base.born_transactional ?
+                                            PAGECACHE_LSN_PAGE :
+                                            PAGECACHE_PLAIN_PAGE);
   DBUG_ENTER("maria_zerofill_data");
 
   /* This works only with BLOCK_RECORD files */
@@ -3460,7 +3541,7 @@ static my_bool maria_zerofill_data(HA_CHECK *param, MARIA_HA *info,
     if (!(buff= pagecache_read(share->pagecache,
                                &info->dfile,
                                page, 1, 0,
-                               PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                               read_page_type, PAGECACHE_LOCK_WRITE,
                                &page_link.link)))
     {
       _ma_check_print_error(param,
@@ -3636,7 +3717,7 @@ int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start,
   DBUG_ENTER("maria_filecopy");
 
   buff_length=(ulong) MY_MIN(param->write_buffer_length,length);
-  if (!(buff=my_malloc(buff_length,MYF(0))))
+  if (!(buff=my_malloc(PSI_INSTRUMENT_ME, buff_length, MYF(0))))
   {
     buff=tmp_buff; buff_length=IO_SIZE;
   }
@@ -3780,7 +3861,8 @@ int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info,
   }
 
   if (!(sort_param.record=
-        (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+        (uchar*) my_malloc(PSI_INSTRUMENT_ME,
+                           (size_t) share->base.default_rec_buff_size,
                            MYF(0))) ||
       _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
                        share->base.default_rec_buff_size, MYF(0)))
@@ -3800,7 +3882,7 @@ int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info,
 
   param->read_cache.end_of_file= sort_info.filelength;
   sort_param.wordlist=NULL;
-  init_alloc_root(&sort_param.wordroot, "sort", FTPARSER_MEMROOT_ALLOC_SIZE, 0,
+  init_alloc_root(PSI_INSTRUMENT_ME, &sort_param.wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0,
                   MYF(param->malloc_flags));
 
   sort_param.key_cmp=sort_key_cmp;
@@ -4367,7 +4449,7 @@ int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
   del=share->state.state.del;
 
   if (!(sort_param=(MARIA_SORT_PARAM *)
-        my_malloc((uint) share->base.keys *
+        my_malloc(PSI_INSTRUMENT_ME, (uint) share->base.keys *
 		  (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength),
 		  MYF(MY_ZEROFILL))))
   {
@@ -4455,8 +4537,7 @@ int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
         (FT_MAX_WORD_LEN_FOR_SORT *
          sort_param[i].keyinfo->seg->charset->mbmaxlen);
       sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
-      init_alloc_root(&sort_param[i].wordroot, "sort",
-                      FTPARSER_MEMROOT_ALLOC_SIZE, 0,
+      init_alloc_root(PSI_INSTRUMENT_ME, &sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0,
                       MYF(param->malloc_flags));
     }
   }
@@ -4489,6 +4570,7 @@ int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
 
   (void) pthread_attr_init(&thr_attr);
   (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
+  (void) my_setstacksize(&thr_attr, (size_t)my_thread_stack_size);
 
   for (i=0 ; i < sort_info.total_keys ; i++)
   {
@@ -4911,10 +4993,12 @@ static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
           {
             if (param->testflag & T_VERBOSE)
             {
-              record_pos_to_txt(info, info->cur_row.lastpos, llbuff);
               _ma_check_print_info(param,
                                    "Found record with wrong checksum at %s",
-                                   llbuff);
+                                   record_pos_to_txt(info,
+                                                     info->cur_row.lastpos,
+                                                     llbuff));
+
             }
             continue;
           }
@@ -5025,7 +5109,7 @@ static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
 	}
 	if (searching && ! sort_param->fix_datafile)
 	{
-	  param->error_printed=1;
+	  param->error_printed++;
           param->retry_repair=1;
           param->testflag|=T_RETRY_WITHOUT_QUICK;
           my_errno= HA_ERR_WRONG_IN_RECORD;
@@ -5299,7 +5383,7 @@ static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
 	DBUG_RETURN(-1);
       if (searching && ! sort_param->fix_datafile)
       {
-	param->error_printed=1;
+	param->error_printed++;
         param->retry_repair=1;
         param->testflag|=T_RETRY_WITHOUT_QUICK;
         my_errno= HA_ERR_WRONG_IN_RECORD;
@@ -5335,10 +5419,7 @@ static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
 			      llstr(sort_param->pos,llbuff));
 	continue;
       }
-#ifdef HAVE_valgrind
-      bzero(sort_param->rec_buff + block_info.rec_len,
-            share->base.extra_rec_buff_size);
-#endif
+      sort_param->rec_buff[block_info.rec_len]= 0;  /* Keep valgrind happy */
       if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record,
                               sort_param->rec_buff, block_info.rec_len))
       {
@@ -5441,7 +5522,7 @@ int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param)
 	  MARIA_DYN_DELETE_BLOCK_HEADER;
 	if (sort_info->buff_length < reclength)
 	{
-	  if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength,
+	  if (!(sort_info->buff=my_realloc(PSI_INSTRUMENT_ME, sort_info->buff, (uint) reclength,
 					   MYF(MY_FREE_ON_ERROR |
 					       MY_ALLOW_ZERO_PTR))))
 	    DBUG_RETURN(1);
@@ -5534,6 +5615,7 @@ static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a)
   char llbuff[22],llbuff2[22];
   MARIA_SORT_INFO *sort_info=sort_param->sort_info;
   HA_CHECK *param= sort_info->param;
+  MARIA_HA *info= sort_info->info;
   int cmp;
 
   if (sort_info->key_block->inited)
@@ -5576,11 +5658,14 @@ static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a)
 			   "Duplicate key %2u for record at %10s against "
                             "record at %10s",
                             sort_param->key + 1,
-                            llstr(sort_info->info->cur_row.lastpos, llbuff),
-                            llstr(get_record_for_key(sort_param->keyinfo,
-                                                     sort_info->key_block->
-                                                     lastkey),
-                                  llbuff2));
+                            record_pos_to_txt(info,
+                                              sort_info->info->cur_row.lastpos,
+                                              llbuff),
+                            record_pos_to_txt(info,
+                                              get_record_for_key(sort_param->
+                                                                 keyinfo,
+                                                                 sort_info->key_block->lastkey),
+                                              llbuff2));
     param->testflag|=T_RETRY_WITHOUT_QUICK;
     if (sort_info->param->testflag & T_VERBOSE)
       _ma_print_keydata(stdout,sort_param->seg, a, USE_WHOLE_KEY);
@@ -5602,11 +5687,11 @@ static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a)
 int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param)
 {
   MARIA_SORT_INFO *sort_info=sort_param->sort_info;
-  SORT_KEY_BLOCKS *key_block=sort_info->key_block;
+  MA_SORT_KEY_BLOCKS *key_block=sort_info->key_block;
   MARIA_SHARE *share=sort_info->info->s;
   uint val_off, val_len;
   int error;
-  SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf;
+  MA_SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf;
   uchar *from, *to;
 
   val_len=share->ft2_keyinfo.keylength;
@@ -5650,8 +5735,8 @@ static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
 {
   uint a_len, val_off, val_len, error;
   MARIA_SORT_INFO *sort_info= sort_param->sort_info;
-  SORT_FT_BUF *ft_buf= sort_info->ft_buf;
-  SORT_KEY_BLOCKS *key_block= sort_info->key_block;
+  MA_SORT_FT_BUF *ft_buf= sort_info->ft_buf;
+  MA_SORT_KEY_BLOCKS *key_block= sort_info->key_block;
   MARIA_SHARE *share= sort_info->info->s;
 
   val_len=HA_FT_WLEN+share->rec_reflength;
@@ -5667,8 +5752,8 @@ static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
          share->rec_reflength) &&
         (share->options &
           (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)))
-      ft_buf= (SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length +
-                                       sizeof(SORT_FT_BUF), MYF(MY_WME));
+      ft_buf= (MA_SORT_FT_BUF *)my_malloc(PSI_INSTRUMENT_ME, sort_param->keyinfo->block_length +
+                                       sizeof(MA_SORT_FT_BUF), MYF(MY_WME));
 
     if (!ft_buf)
     {
@@ -5752,7 +5837,7 @@ static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo,
 /* Insert a key in sort-key-blocks */
 
 static int sort_insert_key(MARIA_SORT_PARAM *sort_param,
-			   register SORT_KEY_BLOCKS *key_block,
+			   register MA_SORT_KEY_BLOCKS *key_block,
                            const uchar *key,
 			   my_off_t prev_block)
 {
@@ -5934,7 +6019,7 @@ int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param)
 {
   uint nod_flag,length;
   my_off_t filepos;
-  SORT_KEY_BLOCKS *key_block;
+  MA_SORT_KEY_BLOCKS *key_block;
   MARIA_SORT_INFO *sort_info= sort_param->sort_info;
   myf myf_rw=sort_info->param->myf_rw;
   MARIA_HA *info=sort_info->info;
@@ -5986,16 +6071,15 @@ err:
 
 	/* alloc space and pointers for key_blocks */
 
-static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+static MA_SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
                                          uint buffer_length)
 {
   reg1 uint i;
-  SORT_KEY_BLOCKS *block;
+  MA_SORT_KEY_BLOCKS *block;
   DBUG_ENTER("alloc_key_blocks");
 
-  if (!(block= (SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+
-                                             buffer_length+IO_SIZE)*blocks,
-                                            MYF(0))))
+  if (!(block= (MA_SORT_KEY_BLOCKS*) my_malloc(PSI_INSTRUMENT_ME,
+                         (sizeof(MA_SORT_KEY_BLOCKS)+buffer_length+IO_SIZE)*blocks, MYF(0))))
   {
     _ma_check_print_error(param,"Not enough memory for sort-key-blocks");
     return(0);
@@ -6177,7 +6261,7 @@ int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename)
                           HA_OPEN_WAIT_IF_LOCKED :
                           (param->testflag & T_DESCRIPT) ?
                           HA_OPEN_IGNORE_IF_LOCKED :
-                          HA_OPEN_ABORT_IF_LOCKED)));
+                          HA_OPEN_ABORT_IF_LOCKED)), 0);
   if (!*org_info)
   {
     _ma_check_print_error(param,
@@ -6327,7 +6411,7 @@ void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
     We have to use an allocated buffer instead of info->rec_buff as
     _ma_put_key_in_record() may use info->rec_buff
   */
-  if (!(record= (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+  if (!(record= (uchar*) my_malloc(PSI_INSTRUMENT_ME, (size_t) share->base.default_rec_buff_size,
                                    MYF(0))))
   {
     _ma_check_print_error(param,"Not enough memory for extra record");
@@ -6400,7 +6484,7 @@ void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
          keypart_k=c_k for arbitrary constants c_1 ... c_k)
 
      = {assuming that values have uniform distribution and index contains all
-        tuples from the domain (or that {c_1, ..., c_k} tuple is choosen from
+        tuples from the domain (or that {c_1, ..., c_k} tuple is chosen from
         index tuples}
 
      = #tuples-in-the-index / #distinct-tuples-in-the-index.
@@ -6526,7 +6610,7 @@ static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file)
 
   if (!(sort_info->new_info= maria_open(info->s->open_file_name.str, O_RDWR,
                                         HA_OPEN_COPY | HA_OPEN_FOR_REPAIR |
-                                        HA_OPEN_INTERNAL_TABLE)))
+                                        HA_OPEN_INTERNAL_TABLE, 0)))
     DBUG_RETURN(1);
 
   new_info= sort_info->new_info;
@@ -6991,7 +7075,7 @@ static void print_bitmap_description(MARIA_SHARE *share,
                                      pgcache_page_no_t page,
                                      uchar *bitmap_data)
 {
-  char *tmp= my_malloc(MAX_BITMAP_INFO_LENGTH, MYF(MY_WME));
+  char *tmp= my_malloc(PSI_INSTRUMENT_ME, MAX_BITMAP_INFO_LENGTH, MYF(MY_WME));
   if (!tmp)
     return;
   _ma_get_bitmap_description(&share->bitmap, bitmap_data, page, tmp);
diff --git a/storage/maria/ma_check.h b/storage/maria/ma_check.h
new file mode 100644
index 00000000000..fa78ada6d38
--- /dev/null
+++ b/storage/maria/ma_check.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2019, 2022, MariaDB Corporation AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1335 USA
+*/
+
+/*
+  Types that are different in Aria from those used by MyISAM check tables
+  in myisamchk.h
+*/
+
+struct st_sort_key_blocks		/* Used when sorting; Aria's variant of the MyISAM type (see file comment) */
+{
+  uchar *buff, *end_pos;        /* NOTE(review): presumably block buffer and its current end - confirm */
+  uchar lastkey[MARIA_MAX_POSSIBLE_KEY_BUFF]; /* NOTE(review): presumably last key added - confirm */
+  uint last_length;             /* NOTE(review): presumably length of lastkey - confirm */
+  int inited;                   /* NOTE(review): presumably non-zero once the block is set up - confirm */
+};
+
+struct st_sort_ftbuf            /* NOTE(review): presumably the fulltext sort buffer - confirm */
+{
+  uchar *buf, *end;             /* NOTE(review): presumably buffer start and end - confirm */
+  int count;
+  uchar lastkey[MARIA_MAX_KEY_BUFF]; /* Note: MARIA_MAX_KEY_BUFF here, not MARIA_MAX_POSSIBLE_KEY_BUFF */
+};
diff --git a/storage/maria/ma_check_standalone.h b/storage/maria/ma_check_standalone.h
index e2e651b43f3..9442800a0c7 100644
--- a/storage/maria/ma_check_standalone.h
+++ b/storage/maria/ma_check_standalone.h
@@ -124,7 +124,7 @@ void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...)
 	      param->isam_file_name);
     param->out_flag|= O_DATA_LOST;
   }
-  param->warning_printed=1;
+  param->warning_printed++;
   va_start(args,fmt);
   fprintf(stderr,"%s: warning: ",my_progname_short);
   vfprintf(stderr, fmt, args);
@@ -149,7 +149,7 @@ void _ma_check_print_error(HA_CHECK *param, const char *fmt,...)
       fprintf(stderr,"%s: Aria file %s\n",my_progname_short,param->isam_file_name);
     param->out_flag|= O_DATA_LOST;
   }
-  param->error_printed|=1;
+  param->error_printed++;
   va_start(args,fmt);
   fprintf(stderr,"%s: error: ",my_progname_short);
   vfprintf(stderr, fmt, args);
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
index e525602f7a1..2741f54d7d7 100644
--- a/storage/maria/ma_checkpoint.c
+++ b/storage/maria/ma_checkpoint.c
@@ -153,8 +153,10 @@ end:
 static int really_execute_checkpoint(void)
 {
   uint i, error= 0;
+  int error_errno= 0;
   /** @brief checkpoint_start_log_horizon will be stored there */
   char *ptr;
+  const char *error_place= 0;
   LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
   LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
   TRANSLOG_ADDRESS checkpoint_start_log_horizon;
@@ -191,13 +193,19 @@ static int really_execute_checkpoint(void)
                                            &record_pieces[1],
                                            &min_trn_rec_lsn,
                                            &min_first_undo_lsn)))
+  {
+    error_place= "trnman_collect_transaction";
     goto err;
+  }
 
 
   /* STEP 3: fetch information about table files */
   if (unlikely(collect_tables(&record_pieces[2],
                               checkpoint_start_log_horizon)))
+  {
+    error_place= "collect_tables";
     goto err;
+  }
 
 
   /* STEP 4: fetch information about dirty pages */
@@ -211,7 +219,10 @@ static int really_execute_checkpoint(void)
   if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
                                                          &record_pieces[3],
                                                          &min_page_rec_lsn)))
+  {
+    error_place= "collect_pages";
     goto err;
+  }
 
 
   /* LAST STEP: now write the checkpoint log record */
@@ -240,7 +251,10 @@ static int really_execute_checkpoint(void)
                                        sizeof(log_array)/sizeof(log_array[0]),
                                        log_array, NULL, NULL) ||
                  translog_flush(lsn)))
+    {
+      error_place= "translog_write_record";
       goto err;
+    }
     translog_lock();
     /*
       This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
@@ -251,6 +265,8 @@ static int really_execute_checkpoint(void)
                                                  max_trid_in_control_file,
                                                  recovery_failures)))
     {
+      error_place= "ma_control_file_write";
+      error_errno= my_errno;
       translog_unlock();
       goto err;
     }
@@ -287,7 +303,9 @@ static int really_execute_checkpoint(void)
 
 err:
   error= 1;
-  ma_message_no_user(0, "checkpoint failed");
+  my_printf_error(HA_ERR_GENERIC, "Aria engine: checkpoint failed at %s with "
+                  "error %d", MYF(ME_ERROR_LOG),
+                  error_place, (error_errno ? error_errno : my_errno));
   /* we were possibly not able to determine what pages to flush */
   pages_to_flush_before_next_checkpoint= 0;
 
@@ -562,7 +580,7 @@ pthread_handler_t ma_checkpoint_background(void *arg)
   DBUG_PRINT("info",("Maria background checkpoint thread starts"));
   DBUG_ASSERT(interval > 0);
 
-  PSI_CALL_set_thread_user_host(0,0,0,0);
+  PSI_CALL_set_thread_account(0,0,0,0);
 
   /*
     Recovery ended with all tables closed and a checkpoint: no need to take
@@ -790,7 +808,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
     }
   }
   if (unlikely((distinct_shares=
-                (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
+                (MARIA_SHARE **)my_malloc(PSI_INSTRUMENT_ME, nb * sizeof(MARIA_SHARE *),
                                           MYF(MY_WME))) == NULL))
     goto err;
   for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
@@ -823,7 +841,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
      LSN_STORE_SIZE + /* first_log_write_at_lsn */
      1                /* end-of-name 0 */
      ) * nb + total_names_length;
-  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
+  if (unlikely((str->str= my_malloc(PSI_INSTRUMENT_ME, str->length, MYF(MY_WME))) == NULL))
     goto err;
 
   ptr= str->str;
@@ -853,12 +871,12 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
   */
 #define STATE_COPIES 1024
   state_copies= (struct st_state_copy *)
-    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
-  dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
+    my_malloc(PSI_INSTRUMENT_ME, STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
+  dfiles= (PAGECACHE_FILE *)my_realloc(PSI_INSTRUMENT_ME, (uchar *)dfiles,
                                        /* avoid size of 0 for my_realloc */
                                        MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                        MYF(MY_WME | MY_ALLOW_ZERO_PTR));
-  kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
+  kfiles= (PAGECACHE_FILE *)my_realloc(PSI_INSTRUMENT_ME, (uchar *)kfiles,
                                        /* avoid size of 0 for my_realloc */
                                        MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                        MYF(MY_WME | MY_ALLOW_ZERO_PTR));
@@ -1218,10 +1236,9 @@ err:
       MARIA_SHARE *share= distinct_shares[i];
       if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
       {
+        share->in_checkpoint&= ~MARIA_CHECKPOINT_SHOULD_FREE_ME;
         /* maria_close() left us to free the share */
-        mysql_mutex_destroy(&share->intern_lock);
-        ma_crypt_free(share);
-        my_free(share);
+        free_maria_share(share);
       }
       else
       {
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
index 8a054ec044f..7441e29a97b 100644
--- a/storage/maria/ma_close.c
+++ b/storage/maria/ma_close.c
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2010, 2020, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -22,11 +23,13 @@
 
 #include "ma_ftdefs.h"
 #include "ma_crypt.h"
+#ifdef WITH_S3_STORAGE_ENGINE
+#include "s3_func.h"
+#endif /* WITH_S3_STORAGE_ENGINE */
 
 int maria_close(register MARIA_HA *info)
 {
   int error=0,flag;
-  my_bool share_can_be_freed= FALSE;
   MARIA_SHARE *share= info->s;
   my_bool internal_table= share->internal_table;
   DBUG_ENTER("maria_close");
@@ -92,12 +95,16 @@ int maria_close(register MARIA_HA *info)
 
   if (flag)
   {
-    /* Last close of file; Flush everything */
+    /* Last close of file */
 
-    /* Check that we don't have any dangling pointers from the transaction */
-    DBUG_ASSERT(share->in_trans == 0);
+    /*
+      Check that we don't have any dangling open files.
+      We may still have some open transactions. In this case the share
+      will be kept around until the transaction has closed
+    */
     DBUG_ASSERT(share->open_list == 0);
 
+    /* Flush everything */
     if (share->kfile.file >= 0)
     {
       my_bool save_global_changed= share->global_changed;
@@ -105,6 +112,7 @@ int maria_close(register MARIA_HA *info)
       /* Avoid _ma_mark_file_changed() when flushing pages */
       share->global_changed= 1;
 
+      /* Flush page cache if BLOCK format */
       if ((*share->once_end)(share))
         error= my_errno;
       /*
@@ -154,9 +162,10 @@ int maria_close(register MARIA_HA *info)
         File must be synced as it is going out of the maria_open_list and so
         becoming unknown to future Checkpoints.
       */
-      if (share->now_transactional && mysql_file_sync(share->kfile.file, MYF(MY_WME)))
+      if (share->now_transactional &&
+          mysql_file_sync(share->kfile.file, MYF(MY_WME)))
         error= my_errno;
-      if (mysql_file_close(share->kfile.file, MYF(0)))
+      if (!share->s3_path && mysql_file_close(share->kfile.file, MYF(0)))
         error= my_errno;
     }
     thr_lock_delete(&share->lock);
@@ -170,7 +179,8 @@ int maria_close(register MARIA_HA *info)
 	mysql_rwlock_destroy(&share->keyinfo[i].root_lock);
       }
     }
-    DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+    DBUG_ASSERT(share->now_transactional == share->base.born_transactional ||
+                share->internal_table);
     /*
       We assign -1 because checkpoint does not need to flush (in case we
       have concurrent checkpoint if no then we do not need it here also)
@@ -194,8 +204,6 @@ int maria_close(register MARIA_HA *info)
       /* we cannot my_free() the share, Checkpoint would see a bad pointer */
       share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME;
     }
-    else
-      share_can_be_freed= TRUE;
 
     if (share->state_history)
     {
@@ -210,7 +218,7 @@ int maria_close(register MARIA_HA *info)
           wrong status information.
         */
         if ((history= (MARIA_STATE_HISTORY_CLOSED *)
-             my_malloc(sizeof(*history), MYF(MY_WME))))
+             my_malloc(PSI_INSTRUMENT_ME, sizeof(*history), MYF(MY_WME))))
         {
           history->create_rename_lsn= share->state.create_rename_lsn;
           history->state_history= share->state_history;
@@ -227,24 +235,14 @@ int maria_close(register MARIA_HA *info)
   if (!internal_table)
   {
     mysql_mutex_unlock(&THR_LOCK_maria);
-    mysql_mutex_unlock(&share->intern_lock);
     mysql_mutex_unlock(&share->close_lock);
   }
-  if (share_can_be_freed)
-  {
-    ma_crypt_free(share);
-    (void) mysql_mutex_destroy(&share->intern_lock);
-    (void) mysql_mutex_destroy(&share->close_lock);
-    (void) mysql_cond_destroy(&share->key_del_cond);
-    my_free(share);
-    /*
-      If share cannot be freed, it's because checkpoint has previously
-      recorded to include this share in the checkpoint and so is soon going to
-      look at some of its content (share->in_checkpoint/id/last_version).
-    */
-  }
+
+  /* free_maria_share() unlocks (and may destroy) share->intern_lock */
+  free_maria_share(share);
+
   my_free(info->ftparser_param);
-  if (info->dfile.file >= 0)
+  if (info->dfile.file >= 0 && ! info->s3)
   {
     /*
       This is outside of mutex so would confuse a concurrent
@@ -255,6 +253,10 @@ int maria_close(register MARIA_HA *info)
   }
 
   delete_dynamic(&info->pinned_pages);
+#ifdef WITH_S3_STORAGE_ENGINE
+  if (info->s3)
+    s3f.deinit(info->s3);
+#endif /* WITH_S3_STORAGE_ENGINE */
   my_free(info);
 
   if (error)
@@ -264,3 +266,35 @@ int maria_close(register MARIA_HA *info)
   }
   DBUG_RETURN(0);
 } /* maria_close */
+
+
+/**
+  Free Aria table share
+
+  Note that share will not be freed as long as there are active checkpoints
+  or transactions pointing at the shared object
+*/
+
+void free_maria_share(MARIA_SHARE *share)
+{
+  if (!share->internal_table)
+    mysql_mutex_assert_owner(&share->intern_lock);
+
+  if (!share->reopen && !share->in_trans &&
+      !(share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME))
+  {
+    /* No one can access this share anymore, time to delete it ! */
+    if (!share->internal_table)
+      mysql_mutex_unlock(&share->intern_lock);
+    ma_crypt_free(share);
+    my_free(share->s3_path);
+    (void) mysql_mutex_destroy(&share->intern_lock);
+    (void) mysql_mutex_destroy(&share->close_lock);
+    (void) mysql_cond_destroy(&share->key_del_cond);
+    my_free(share);
+    return;
+  }
+  if (!share->internal_table)
+    mysql_mutex_unlock(&share->intern_lock);
+  return;
+}
diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c
index d619a58ff29..af1708c72b2 100644
--- a/storage/maria/ma_control_file.c
+++ b/storage/maria/ma_control_file.c
@@ -215,7 +215,7 @@ static CONTROL_FILE_ERROR create_control_file(const char *name,
   file.
 */
 
-static int lock_control_file(const char *name)
+static int lock_control_file(const char *name, my_bool do_retry)
 {
   /*
     On Windows, my_lock() uses locking() which is mandatory locking and so
@@ -228,6 +228,8 @@ static int lock_control_file(const char *name)
   */
 #ifndef __WIN__
   uint retry= 0;
+  uint retry_count= do_retry ? MARIA_MAX_CONTROL_FILE_LOCK_RETRY : 0;
+
   /*
     We can't here use the automatic wait in my_lock() as the alarm thread
     may not yet exists.
@@ -239,8 +241,8 @@ static int lock_control_file(const char *name)
       my_printf_error(HA_ERR_INITIALIZATION,
                       "Can't lock aria control file '%s' for exclusive use, "
                       "error: %d. Will retry for %d seconds", 0,
-                      name, my_errno, MARIA_MAX_CONTROL_FILE_LOCK_RETRY);
-    if (retry++ > MARIA_MAX_CONTROL_FILE_LOCK_RETRY)
+                      name, my_errno, retry_count);
+    if (++retry > retry_count)
       return 1;
     sleep(1);
   }
@@ -269,7 +271,8 @@ static int lock_control_file(const char *name)
 */
 
 CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
-                                        my_bool print_error)
+                                        my_bool print_error,
+                                        my_bool wait_for_lock)
 {
   uchar buffer[CF_MAX_SIZE];
   char name[FN_REFLEN], errmsg_buff[256];
@@ -311,8 +314,9 @@ CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
       errmsg= "Can't create file";
       goto err;
     }
-    if (lock_control_file(name))
+    if (!aria_readonly && lock_control_file(name, wait_for_lock))
     {
+      error= CONTROL_FILE_LOCKED;
       errmsg= lock_failed_errmsg;
       goto err;
     }
@@ -320,7 +324,6 @@ CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
   }
 
   /* Otherwise, file exists */
-
   if ((control_file_fd= mysql_file_open(key_file_control, name,
                                         open_flags, MYF(MY_WME))) < 0)
   {
@@ -328,8 +331,10 @@ CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
     goto err;
   }
 
-  if (lock_control_file(name)) /* lock it before reading content */
+  /* lock it before reading content */
+  if (!aria_readonly && lock_control_file(name, wait_for_lock))
   {
+    error= CONTROL_FILE_LOCKED;
     errmsg= lock_failed_errmsg;
     goto err;
   }
@@ -716,7 +721,7 @@ my_bool print_aria_log_control()
   {
     recovery_fails=
       (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0];
-    printf("recovery_failuers:   %u\n", recovery_fails);
+    printf("recovery_failures:   %u\n", recovery_fails);
   }
 
   DBUG_RETURN(0);
diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h
index 535b0c71e64..40428f665f4 100644
--- a/storage/maria/ma_control_file.h
+++ b/storage/maria/ma_control_file.h
@@ -21,6 +21,8 @@
 #ifndef _ma_control_file_h
 #define _ma_control_file_h
 
+C_MODE_START
+
 #define CONTROL_FILE_BASE_NAME "aria_log_control"
 /*
   Major version for control file. Should only be changed when doing
@@ -59,12 +61,14 @@ typedef enum enum_control_file_error {
   CONTROL_FILE_MISSING,
   CONTROL_FILE_INCONSISTENT_INFORMATION,
   CONTROL_FILE_WRONG_BLOCKSIZE,
+  CONTROL_FILE_LOCKED,
   CONTROL_FILE_UNKNOWN_ERROR /* any other error */
 } CONTROL_FILE_ERROR;
 
-C_MODE_START
+
 CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
-                                        my_bool print_error);
+                                        my_bool print_error,
+                                        my_bool wait_for_lock);
 int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg,
                                     uint32 last_logno_arg, TrID max_trid_arg,
                                     uint8 recovery_failures_arg);
diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c
index 8ef3249c2e4..83d4f584e69 100644
--- a/storage/maria/ma_create.c
+++ b/storage/maria/ma_create.c
@@ -75,7 +75,7 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   uint max_field_lengths, extra_header_size, column_nr;
   uint internal_table= flags & HA_CREATE_INTERNAL_TABLE;
   ulong reclength, real_reclength,min_pack_length;
-  char kfilename[FN_REFLEN], klinkname[FN_REFLEN], *klinkname_ptr= NullS;
+  char kfilename[FN_REFLEN], klinkname[FN_REFLEN], *klinkname_ptr= 0;
   char dfilename[FN_REFLEN], dlinkname[FN_REFLEN], *dlinkname_ptr= 0;
   ulong pack_reclength;
   ulonglong tot_length,max_rows, tmp;
@@ -94,7 +94,7 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */
   my_bool forced_packed;
   uchar   *log_data= NULL;
-  my_bool encrypted= maria_encrypt_tables && datafile_type == BLOCK_RECORD;
+  my_bool encrypted= ci->encrypted && datafile_type == BLOCK_RECORD;
   my_bool insert_order= MY_TEST(flags & HA_PRESERVE_INSERT_ORDER);
   uint crypt_page_header_space= 0;
   DBUG_ENTER("maria_create");
@@ -147,7 +147,8 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   }
 
   if (!(rec_per_key_part=
-	(double*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(double) +
+	(double*) my_malloc(PSI_INSTRUMENT_ME,
+                            (keys + uniques)*HA_MAX_KEY_SEG*sizeof(double) +
                             (keys + uniques)*HA_MAX_KEY_SEG*sizeof(ulong) +
                             sizeof(uint16) * columns,
                             MYF(common_flag | MY_ZEROFILL))))
@@ -330,6 +331,8 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   share.base.born_transactional= ci->transactional;
   share.base.max_field_lengths= max_field_lengths;
   share.base.field_offsets= 0;                  /* for future */
+  share.base.compression_algorithm= ci->compression_algorithm;
+  share.base.s3_block_size=         ci->s3_block_size;
 
   if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM))
   {
@@ -896,6 +899,7 @@ int maria_create(const char *name, enum data_file_type datafile_type,
     fn_format(kfilename, name, "", MARIA_NAME_IEXT, MY_UNPACK_FILENAME |
               (internal_table ? 0 : MY_RETURN_REAL_PATH) |
               (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+    klinkname_ptr= NullS;
     /*
       Replace the current file.
       Don't sync dir now if the data file has the same path.
@@ -1027,7 +1031,8 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   {
     /* Store columns in a more efficent order */
     MARIA_COLUMNDEF **col_order, **pos;
-    if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(share.base.fields *
+    if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(PSI_INSTRUMENT_ME,
+                                                   share.base.fields *
                                                    sizeof(MARIA_COLUMNDEF*),
                                                    common_flag)))
       goto err;
@@ -1089,7 +1094,8 @@ int maria_create(const char *name, enum data_file_type datafile_type,
     log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 +
       (uint) kfile_size_before_extension;
     /* we are needing maybe 64 kB, so don't use the stack */
-    log_data= my_malloc(log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0));
+    log_data= my_malloc(PSI_INSTRUMENT_ME,
+                        log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0));
     if ((log_data == NULL) ||
         mysql_file_pread(file, 1 + 2 + 2 + log_data,
                  (size_t) kfile_size_before_extension, 0, MYF(MY_NABP)))
diff --git a/storage/maria/ma_crypt.c b/storage/maria/ma_crypt.c
index 33cf72b0b1e..9282405bae9 100644
--- a/storage/maria/ma_crypt.c
+++ b/storage/maria/ma_crypt.c
@@ -101,7 +101,7 @@ int
 ma_crypt_create(MARIA_SHARE* share)
 {
   MARIA_CRYPT_DATA *crypt_data=
-    (MARIA_CRYPT_DATA*)my_malloc(sizeof(MARIA_CRYPT_DATA), MYF(MY_ZEROFILL));
+    (MARIA_CRYPT_DATA*)my_malloc(PSI_INSTRUMENT_ME, sizeof(MARIA_CRYPT_DATA), MYF(MY_ZEROFILL));
   crypt_data->scheme.type= CRYPT_SCHEME_1;
   crypt_data->scheme.locker= crypt_data_scheme_locker;
   mysql_mutex_init(key_CRYPT_DATA_lock, &crypt_data->lock, MY_MUTEX_INIT_FAST);
@@ -165,7 +165,7 @@ ma_crypt_read(MARIA_SHARE* share, uchar *buff)
   {
     /* opening a table */
     MARIA_CRYPT_DATA *crypt_data=
-      (MARIA_CRYPT_DATA*)my_malloc(sizeof(MARIA_CRYPT_DATA), MYF(MY_ZEROFILL));
+      (MARIA_CRYPT_DATA*)my_malloc(PSI_INSTRUMENT_ME, sizeof(MARIA_CRYPT_DATA), MYF(MY_ZEROFILL));
 
     crypt_data->scheme.type= type;
     mysql_mutex_init(key_CRYPT_DATA_lock, &crypt_data->lock,
@@ -189,7 +189,7 @@ static int ma_decrypt(MARIA_SHARE *, MARIA_CRYPT_DATA *, const uchar *,
 static my_bool ma_crypt_pre_read_hook(PAGECACHE_IO_HOOK_ARGS *args)
 {
   MARIA_SHARE *share= (MARIA_SHARE*) args->data;
-  uchar *crypt_buf= my_malloc(share->block_size, MYF(0));
+  uchar *crypt_buf= my_malloc(PSI_INSTRUMENT_ME, share->block_size, MYF(0));
   if (crypt_buf == NULL)
   {
     args->crypt_buf= NULL; /* for post-hook */
@@ -260,7 +260,7 @@ static my_bool ma_crypt_data_pre_write_hook(PAGECACHE_IO_HOOK_ARGS *args)
   MARIA_SHARE *share= (MARIA_SHARE*) args->data;
   const uint size= share->block_size;
   uint key_version;
-  uchar *crypt_buf= my_malloc(share->block_size, MYF(0));
+  uchar *crypt_buf= my_malloc(PSI_INSTRUMENT_ME, share->block_size, MYF(0));
 
   if (crypt_buf == NULL)
   {
@@ -392,7 +392,7 @@ static my_bool ma_crypt_index_pre_write_hook(PAGECACHE_IO_HOOK_ARGS *args)
   const uint block_size= share->block_size;
   const uint page_used= _ma_get_page_used(share, args->page);
   uint key_version;
-  uchar *crypt_buf= my_malloc(block_size, MYF(0));
+  uchar *crypt_buf= my_malloc(PSI_INSTRUMENT_ME, block_size, MYF(0));
   if (crypt_buf == NULL)
   {
     args->crypt_buf= NULL; /* for post-hook */
diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c
index 7377f3bf5ad..cbba9d975dc 100644
--- a/storage/maria/ma_delete.c
+++ b/storage/maria/ma_delete.c
@@ -158,14 +158,20 @@ my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key)
 {
   MARIA_SHARE *share= info->s;
   int res;
+  my_bool buff_alloced;
   LSN lsn= LSN_IMPOSSIBLE;
   my_off_t new_root= share->state.key_root[key->keyinfo->key_nr];
-  uchar key_buff[MARIA_MAX_KEY_BUFF], *save_key_data;
+  uchar *key_buff, *save_key_data;
   MARIA_KEY org_key;
   DBUG_ENTER("_ma_ck_delete");
 
   LINT_INIT_STRUCT(org_key);
 
+  alloc_on_stack(*info->stack_end_ptr, key_buff, buff_alloced,
+                 key->keyinfo->max_store_length);
+  if (!key_buff)
+    DBUG_RETURN(1);
+
   save_key_data= key->data;
   if (share->now_transactional)
   {
@@ -190,6 +196,8 @@ my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key)
     _ma_fast_unlock_key_del(info);
   }
   _ma_unpin_all_pages_and_finalize_row(info, lsn);
+
+  stack_alloc_free(key_buff, buff_alloced);
   DBUG_RETURN(res != 0);
 } /* _ma_ck_delete */
 
@@ -198,7 +206,7 @@ my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
                            my_off_t *root)
 {
   int error;
-  my_bool result= 0;
+  my_bool result= 0, buff_alloced;
   my_off_t old_root;
   uchar *root_buff;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
@@ -210,13 +218,12 @@ my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
     _ma_set_fatal_error(info->s, HA_ERR_CRASHED);
     DBUG_RETURN(1);
   }
-  if (!(root_buff= (uchar*)  my_alloca((uint) keyinfo->block_length+
-                                       MARIA_MAX_KEY_BUFF*2)))
-  {
-    DBUG_PRINT("error",("Couldn't allocate memory"));
-    my_errno=ENOMEM;
+
+  alloc_on_stack(*info->stack_end_ptr, root_buff, buff_alloced,
+                 (keyinfo->block_length + keyinfo->max_store_length*2));
+  if (!root_buff)
     DBUG_RETURN(1);
-  }
+
   DBUG_PRINT("info",("root_page: %lu",
                      (ulong) (old_root / keyinfo->block_length)));
   if (_ma_fetch_keypage(&page, info, keyinfo, old_root,
@@ -261,7 +268,7 @@ my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
     }
   }
 err:
-  my_afree(root_buff);
+  stack_alloc_free(root_buff, buff_alloced);
   DBUG_PRINT("exit",("Return: %d",result));
   DBUG_RETURN(result);
 } /* _ma_ck_real_delete */
@@ -284,9 +291,8 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
 {
   int flag,ret_value,save_flag;
   uint nod_flag, page_flag;
-  my_bool last_key;
-  uchar *leaf_buff,*keypos;
-  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  my_bool last_key, buff_alloced= 0, lastkey_alloced;
+  uchar *leaf_buff=0, *keypos, *lastkey;
   MARIA_KEY_PARAM s_temp;
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
@@ -294,12 +300,17 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
   DBUG_ENTER("d_search");
   DBUG_DUMP("page", anc_page->buff, anc_page->size);
 
+  alloc_on_stack(*info->stack_end_ptr, lastkey, lastkey_alloced,
+                 keyinfo->max_store_length);
+  if (!lastkey)
+    DBUG_RETURN(1);
+
   flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, lastkey,
                               &last_key);
   if (flag == MARIA_FOUND_WRONG_KEY)
   {
     DBUG_PRINT("error",("Found wrong key"));
-    DBUG_RETURN(-1);
+    goto err;
   }
   page_flag= anc_page->flag;
   nod_flag=  anc_page->node;
@@ -344,14 +355,14 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
                                                &kpos)))
       {
         _ma_set_fatal_error(share, HA_ERR_CRASHED);
-        DBUG_RETURN(-1);
+        goto err;
       }
       root= _ma_row_pos_from_key(&tmp_key);
       if (subkeys == -1)
       {
         /* the last entry in sub-tree */
         if (_ma_dispose(info, root, 1))
-          DBUG_RETURN(-1);
+          goto err;
         /* fall through to normal delete */
       }
       else
@@ -378,23 +389,20 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
                                        PAGECACHE_LOCK_LEFT_WRITELOCKED,
                                        DFLT_INIT_HITS);
         }
-        DBUG_PRINT("exit",("Return: %d",ret_value));
-        DBUG_RETURN(ret_value);
+        goto end;
       }
     }
   }
-  leaf_buff=0;
   if (nod_flag)
   {
     /* Read left child page */
     leaf_page.pos= _ma_kpos(nod_flag,keypos);
-    if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-                                       MARIA_MAX_KEY_BUFF*2)))
-    {
-      DBUG_PRINT("error", ("Couldn't allocate memory"));
-      my_errno=ENOMEM;
-      DBUG_RETURN(-1);
-    }
+
+    alloc_on_stack(*info->stack_end_ptr, leaf_buff, buff_alloced,
+                   (keyinfo->block_length + keyinfo->max_store_length*2));
+    if (!leaf_buff)
+      goto err;
+
     if (_ma_fetch_keypage(&leaf_page, info,keyinfo, leaf_page.pos,
                           PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, leaf_buff,
                           0))
@@ -439,7 +447,7 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
         _ma_log_delete(anc_page, s_temp.key_pos,
                        s_temp.changed_length, s_temp.move_length,
                        0, KEY_OP_DEBUG_LOG_DEL_CHANGE_1))
-      DBUG_RETURN(-1);
+      goto err;
 
     if (!nod_flag)
     {						/* On leaf page */
@@ -448,12 +456,15 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
                               (uint) keyinfo->underflow_block_length))
       {
         /* Page will be written by caller if we return 1 */
-        DBUG_RETURN(1);
+        ret_value= 1;
+        goto end;
       }
       if (_ma_write_keypage(anc_page,
                             PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
-	DBUG_RETURN(-1);
-      DBUG_RETURN(0);
+	goto err;
+
+      ret_value= 0;                             /* Return ok */
+      goto end;
     }
     save_flag=1;                         /* Mark that anc_buff is changed */
     ret_value= del(info, key, anc_page, &leaf_page,
@@ -506,12 +517,16 @@ static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
   {
     DBUG_DUMP("page", anc_page->buff, anc_page->size);
   }
-  my_afree(leaf_buff);
+
+end:
+  stack_alloc_free(leaf_buff, buff_alloced);
+  stack_alloc_free(lastkey, lastkey_alloced);
   DBUG_PRINT("exit",("Return: %d",ret_value));
   DBUG_RETURN(ret_value);
 
 err:
-  my_afree(leaf_buff);
+  stack_alloc_free(leaf_buff, buff_alloced);
+  stack_alloc_free(lastkey, lastkey_alloced);
   DBUG_PRINT("exit",("Error: %d",my_errno));
   DBUG_RETURN (-1);
 } /* d_search */
@@ -550,8 +565,9 @@ static int del(MARIA_HA *info, MARIA_KEY *key,
 {
   int ret_value,length;
   uint a_length, page_flag, nod_flag, leaf_length, new_leaf_length;
-  uchar keybuff[MARIA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
+  uchar *keybuff,*endpos,*next_buff,*key_start, *prev_key;
   uchar *anc_buff;
+  my_bool buff_alloced= 0, keybuff_alloced;
   MARIA_KEY_PARAM s_temp;
   MARIA_KEY tmp_key;
   MARIA_SHARE *share= info->s;
@@ -564,6 +580,11 @@ static int del(MARIA_HA *info, MARIA_KEY *key,
 		      keypos));
   DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size);
 
+  alloc_on_stack(*info->stack_end_ptr, keybuff, keybuff_alloced,
+                 keyinfo->max_store_length);
+  if (!keybuff)
+    DBUG_RETURN(1);
+
   page_flag=   leaf_page->flag;
   leaf_length= leaf_page->size;
   nod_flag=    leaf_page->node;
@@ -574,14 +595,17 @@ static int del(MARIA_HA *info, MARIA_KEY *key,
   next_buff= 0;
 
   if (!(key_start= _ma_get_last_key(&tmp_key, leaf_page, endpos)))
-    DBUG_RETURN(-1);
+    goto err;
 
   if (nod_flag)
   {
     next_page.pos= _ma_kpos(nod_flag,endpos);
-    if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-					MARIA_MAX_KEY_BUFF*2)))
-      DBUG_RETURN(-1);
+
+    alloc_on_stack(*info->stack_end_ptr, next_buff, buff_alloced,
+                   (keyinfo->block_length + keyinfo->max_store_length*2));
+    if (!next_buff)
+      goto err;
+
     if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
                           PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, next_buff, 0))
       ret_value= -1;
@@ -634,7 +658,8 @@ static int del(MARIA_HA *info, MARIA_KEY *key,
                                               DFLT_INIT_HITS))
 	goto err;
     }
-    my_afree(next_buff);
+    stack_alloc_free(next_buff, buff_alloced);
+    stack_alloc_free(keybuff, keybuff_alloced);
     DBUG_ASSERT(leaf_page->size <= share->max_index_block_size);
     DBUG_RETURN(ret_value);
   }
@@ -712,13 +737,15 @@ static int del(MARIA_HA *info, MARIA_KEY *key,
     goto err;
 
   DBUG_ASSERT(leaf_page->size <= share->max_index_block_size);
+  stack_alloc_free(next_buff, buff_alloced);
+  stack_alloc_free(keybuff, keybuff_alloced);
   DBUG_RETURN(new_leaf_length <=
               (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
                (uint) keyinfo->underflow_block_length));
-err:
-  if (next_buff)
-    my_afree(next_buff);
 
+err:
+  stack_alloc_free(next_buff, buff_alloced);
+  stack_alloc_free(keybuff, keybuff_alloced);
   DBUG_RETURN(-1);
 } /* del */
 
@@ -761,13 +788,13 @@ static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   uint next_buff_length, new_buff_length, key_reflength;
   uint unchanged_leaf_length, new_leaf_length, new_anc_length;
   uint anc_page_flag, page_flag;
-  uchar anc_key_buff[MARIA_MAX_KEY_BUFF], leaf_key_buff[MARIA_MAX_KEY_BUFF];
+  uchar *anc_key_buff, *leaf_key_buff;
   uchar *endpos, *next_keypos, *anc_pos, *half_pos, *prev_key;
   uchar *anc_buff, *leaf_buff;
   uchar *after_key, *anc_end_pos;
   MARIA_KEY_PARAM key_deleted, key_inserted;
   MARIA_SHARE *share= info->s;
-  my_bool first_key;
+  my_bool first_key, buff_alloced;
   MARIA_KEY tmp_key, anc_key, leaf_key;
   MARIA_PAGE next_page;
   DBUG_ENTER("underflow");
@@ -777,6 +804,13 @@ static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   DBUG_DUMP("anc_buff", anc_page->buff,  anc_page->size);
   DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size);
 
+  alloc_on_stack(*info->stack_end_ptr, anc_key_buff, buff_alloced,
+                 keyinfo->max_store_length*2);
+  if (!anc_key_buff)
+    DBUG_RETURN(1);
+
+  leaf_key_buff= anc_key_buff+ keyinfo->max_store_length;
+
   anc_page_flag= anc_page->flag;
   anc_buff= anc_page->buff;
   leaf_buff= leaf_page->buff;
@@ -1035,6 +1069,7 @@ static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
     if (_ma_write_keypage(leaf_page,
                           PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
       goto err;
+    stack_alloc_free(anc_key_buff, buff_alloced);
     DBUG_RETURN(new_anc_length <=
                 ((info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
                   (uint) keyinfo->underflow_block_length)));
@@ -1264,11 +1299,13 @@ static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
                         PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
     goto err;
 
+  stack_alloc_free(anc_key_buff, buff_alloced);
   DBUG_RETURN(new_anc_length <=
               ((info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
                 (uint) keyinfo->underflow_block_length)));
 
 err:
+  stack_alloc_free(anc_key_buff, buff_alloced);
   DBUG_RETURN(-1);
 } /* underflow */
 
diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c
index 0c78476ad44..fa93a7100b0 100644
--- a/storage/maria/ma_delete_table.c
+++ b/storage/maria/ma_delete_table.c
@@ -43,7 +43,7 @@ int maria_delete_table(const char *name)
     'open_for_repair' to be able to open even a crashed table.
   */
   my_errno= 0;
-  if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR)))
+  if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0)))
   {
     sync_dir= 0;
     /* Ignore not found errors and wrong symlink errors */
@@ -83,24 +83,34 @@ int maria_delete_table(const char *name)
       DBUG_RETURN(1);
   }
 
-  if (!(error= maria_delete_table_files(name, 0, sync_dir)))
+  if (!(error= maria_delete_table_files(name, 0, sync_dir | MY_WME)))
     error= got_error;
   DBUG_RETURN(error);
 }
 
+/**
+   Delete all files related to an Aria table
+*/
 
-int maria_delete_table_files(const char *name, my_bool temporary, myf sync_dir)
+int maria_delete_table_files(const char *name, my_bool temporary, myf flags)
 {
+  int error= 0;
   DBUG_ENTER("maria_delete_table_files");
 
-  if (mysql_file_delete_with_symlink(key_file_kfile, name, MARIA_NAME_IEXT, MYF(MY_WME | sync_dir)) ||
-      mysql_file_delete_with_symlink(key_file_dfile, name, MARIA_NAME_DEXT, MYF(MY_WME | sync_dir)))
-    DBUG_RETURN(my_errno);
-
+  if (mysql_file_delete_with_symlink(key_file_kfile, name, MARIA_NAME_IEXT,
+                                     flags))
+    error= my_errno;
+  if (mysql_file_delete_with_symlink(key_file_dfile, name, MARIA_NAME_DEXT,
+                                     flags))
+    error= my_errno;
   if (!temporary)
   {
-    mysql_file_delete_with_symlink(key_file_dfile, name, ".TMD", MYF(0));
+    /* This deletes a possible temporary aria_chk file */
+    mysql_file_delete_with_symlink(key_file_dfile, name, DATA_TMP_EXT, MYF(0));
+#ifdef SUPPORT_ARIA_PACK
+    /* This deletes a possible temporary aria_pack file */
     mysql_file_delete_with_symlink(key_file_dfile, name, ".OLD", MYF(0));
+#endif
   }
-  DBUG_RETURN(0);
+  DBUG_RETURN(error);
 }
diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c
index 829e5b5cd02..f36e7dd9363 100644
--- a/storage/maria/ma_dynrec.c
+++ b/storage/maria/ma_dynrec.c
@@ -249,16 +249,20 @@ my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record)
   uchar *rec_buff;
   int error;
   ulong reclength,reclength2,extra;
+  my_bool buff_alloced;
 
   extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
 	  MARIA_DYN_DELETE_BLOCK_HEADER+1);
   reclength= (info->s->base.pack_reclength +
 	      _ma_calc_total_blob_length(info,record)+ extra);
-  if (!(rec_buff=(uchar*) my_safe_alloca(reclength)))
+
+  alloc_on_stack(*info->stack_end_ptr, rec_buff, buff_alloced, reclength);
+  if (!rec_buff)
   {
     my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
     return(1);
   }
+
   reclength2= _ma_rec_pack(info,
                            rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
 			   record);
@@ -275,7 +279,7 @@ my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record)
                               rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
                               reclength2);
 err:
-  my_safe_afree(rec_buff, reclength);
+  stack_alloc_free(rec_buff, buff_alloced);
   return(error != 0);
 }
 
@@ -287,6 +291,7 @@ my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos,
   uchar *rec_buff;
   int error;
   ulong reclength,reclength2,extra;
+  my_bool buff_alloced;
 
   extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
 	  MARIA_DYN_DELETE_BLOCK_HEADER);
@@ -299,11 +304,14 @@ my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos,
     return 1;
   }
 #endif
-  if (!(rec_buff=(uchar*) my_safe_alloca(reclength)))
+
+  alloc_on_stack(*info->stack_end_ptr, rec_buff, buff_alloced, reclength);
+  if (!rec_buff)
   {
     my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
     return(1);
   }
+
   reclength2= _ma_rec_pack(info, rec_buff+
                            ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
                            record);
@@ -317,7 +325,7 @@ my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos,
 			      rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
 			      reclength2);
 err:
-  my_safe_afree(rec_buff, reclength);
+  stack_alloc_free(rec_buff, buff_alloced);
   return(error != 0);
 }
 
@@ -1581,10 +1589,12 @@ my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
 {
   uchar *old_rec_buff,*old_record;
   size_t old_rec_buff_size;
-  my_bool error;
+  my_bool error, buff_alloced;
   DBUG_ENTER("_ma_cmp_dynamic_unique");
 
-  if (!(old_record= my_safe_alloca(info->s->base.reclength)))
+  alloc_on_stack(*info->stack_end_ptr, old_record, buff_alloced,
+                 info->s->base.reclength);
+  if (!old_record)
     DBUG_RETURN(1);
 
   /* Don't let the compare destroy blobs that may be in use */
@@ -1605,7 +1615,7 @@ my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
     info->rec_buff=      old_rec_buff;
     info->rec_buff_size= old_rec_buff_size;
   }
-  my_safe_afree(old_record, info->s->base.reclength);
+  stack_alloc_free(old_record, buff_alloced);
   DBUG_RETURN(error);
 }
 
@@ -1619,7 +1629,7 @@ my_bool _ma_cmp_dynamic_record(register MARIA_HA *info,
   my_off_t filepos;
   uchar *buffer;
   MARIA_BLOCK_INFO block_info;
-  my_bool error= 1;
+  my_bool error= 1, buff_alloced= 0;
   size_t UNINIT_VAR(buffer_length);
   DBUG_ENTER("_ma_cmp_dynamic_record");
 
@@ -1640,8 +1650,10 @@ my_bool _ma_cmp_dynamic_record(register MARIA_HA *info,
     {
       buffer_length= (info->s->base.pack_reclength +
                       _ma_calc_total_blob_length(info,record));
-      if (!(buffer=(uchar*) my_safe_alloca(buffer_length)))
-	DBUG_RETURN(1);
+
+      alloc_on_stack(*info->stack_end_ptr, buffer, buff_alloced, buffer_length);
+      if (!buffer)
+        DBUG_RETURN(1);
     }
     if (!(reclength= _ma_rec_pack(info,buffer,record)))
       goto err;
@@ -1693,8 +1705,7 @@ my_bool _ma_cmp_dynamic_record(register MARIA_HA *info,
   my_errno=0;
   error= 0;
 err:
-  if (buffer != info->rec_buff)
-    my_safe_afree(buffer, buffer_length);
+  stack_alloc_free(buffer, buff_alloced);
   DBUG_PRINT("exit", ("result: %d", error));
   DBUG_RETURN(error);
 }
diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c
index fe2a4c9b8ac..94e5e448b09 100644
--- a/storage/maria/ma_extra.c
+++ b/storage/maria/ma_extra.c
@@ -235,6 +235,9 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function,
     info->lock_wait= MY_SHORT_WAIT;
     break;
   case HA_EXTRA_NO_KEYS:
+    if (share->s3_path)                    /* Not supported with S3 */
+      break;
+
     /* we're going to modify pieces of the state, stall Checkpoint */
     mysql_mutex_lock(&share->intern_lock);
     if (info->lock_type == F_UNLCK)
@@ -375,16 +378,16 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function,
       if (end_io_cache(&info->rec_cache))
         error= 1;
     }
-    if (share->kfile.file >= 0)
+    if (share->kfile.file >= 0 && share->s3_path == 0)
     {
       if (do_flush)
       {
         /* Save the state so that others can find it from disk. */
-        if ((share->changed &&
-             _ma_state_info_write(share,
+        if (share->changed &&
+            (_ma_state_info_write(share,
                                   MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
-                                  MA_STATE_INFO_WRITE_FULL_INFO)) ||
-            mysql_file_sync(share->kfile.file, MYF(0)))
+                                  MA_STATE_INFO_WRITE_FULL_INFO) ||
+             mysql_file_sync(share->kfile.file, MYF(0))))
           error= my_errno;
       }
       else
@@ -396,7 +399,7 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function,
       }
     }
     if (share->data_file_type == BLOCK_RECORD &&
-        share->bitmap.file.file >= 0)
+        share->bitmap.file.file >= 0 && share->s3_path == 0)
     {
       DBUG_ASSERT(share->bitmap.non_flushable == 0 &&
                   share->bitmap.changed == 0);
diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c
index 7fe6d8ca2cc..91e39716a2b 100644
--- a/storage/maria/ma_ft_boolean_search.c
+++ b/storage/maria/ma_ft_boolean_search.c
@@ -557,7 +557,7 @@ FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr,
   FTB_EXPR  *ftbe;
   FTB_WORD  *ftbw;
 
-  if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
+  if (!(ftb=(FTB *)my_malloc(PSI_INSTRUMENT_ME, sizeof(FTB), MYF(MY_WME))))
     return 0;
   ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean;
   ftb->state=UNINITIALIZED;
@@ -570,7 +570,7 @@ FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr,
   bzero(& ftb->no_dupes, sizeof(TREE));
   ftb->last_word= 0;
 
-  init_alloc_root(&ftb->mem_root, "fulltext", 1024, 1024, 0);
+  init_alloc_root(PSI_INSTRUMENT_ME, &ftb->mem_root, 1024, 1024, 0);
   ftb->queue.max_elements= 0;
   if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
     goto err;
diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c
index f7aa3afec9a..6c4e30bca83 100644
--- a/storage/maria/ma_ft_nlq_search.c
+++ b/storage/maria/ma_ft_nlq_search.c
@@ -291,7 +291,7 @@ FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query,
     If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
     so if ndocs == 0, FT_INFO.doc[] must not be accessed.
    */
-  dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
+  dlist=(FT_INFO *)my_malloc(PSI_INSTRUMENT_ME, sizeof(FT_INFO)+
 			     sizeof(FT_DOC)*
 			     (int)(aio.dtree.elements_in_tree-1),
 			     MYF(0));
diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c
index dbad6c4e7f5..00532af0b47 100644
--- a/storage/maria/ma_ft_parser.c
+++ b/storage/maria/ma_ft_parser.c
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -126,7 +127,7 @@ uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start,
   {
     for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      mbl= my_ci_ctype(cs, &ctype, doc, end);
       if (true_word_char(ctype, *doc))
         break;
       if (*doc == FTB_RQUOT && param->quot)
@@ -166,7 +167,7 @@ uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start,
     for (word->pos= doc; doc < end; length++,
          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      mbl= my_ci_ctype(cs, &ctype, doc, end);
       if (true_word_char(ctype, *doc))
         mwc=0;
       else if (!misc_word_char(*doc) || mwc)
@@ -219,7 +220,7 @@ uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start,
     {
       if (doc >= end)
         DBUG_RETURN(0);
-      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      mbl= my_ci_ctype(cs, &ctype, doc, end);
       if (true_word_char(ctype, *doc))
         break;
     }
@@ -228,7 +229,7 @@ uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start,
     for (word->pos= doc; doc < end; length++,
          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      mbl= my_ci_ctype(cs, &ctype, doc, end);
       if (true_word_char(ctype, *doc))
         mwc= 0;
       else if (!misc_word_char(*doc) || mwc)
@@ -346,9 +347,9 @@ MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info)
       (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
     */
     info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
-      my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
+      my_malloc(PSI_INSTRUMENT_ME, MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
                 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
-    init_alloc_root(&info->ft_memroot, "fulltext_parser",
+    init_alloc_root(PSI_INSTRUMENT_ME, &info->ft_memroot,
                     FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0));
   }
   return info->ftparser_param;
diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h
index 0f4997a7142..df86eeceb66 100644
--- a/storage/maria/ma_ft_test1.h
+++ b/storage/maria/ma_ft_test1.h
@@ -311,7 +311,7 @@ struct { const char *f0, *f2; } data[NDATAS] = {
    {"18.4.49", "Problems linking with the C API"},
    {"18.4.50", "How to make a thread-safe client"},
    {"18.5", "MySQL Perl API's"},
-   {"18.5.1", "DBI with DBD::mysql"},
+   {"18.5.1", "DBI with DBD::MariaDB"},
    {"18.5.1.1", "The DBI interface"},
    {"18.5.1.2", "More DBI/DBD information"},
    {"18.6", "MySQL Java connectivity (JDBC)"},
diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c
index 6d40f804880..f0b04e020c2 100644
--- a/storage/maria/ma_info.c
+++ b/storage/maria/ma_info.c
@@ -31,7 +31,7 @@ MARIA_RECORD_POS maria_position(MARIA_HA *info)
 uint maria_max_key_length()
 {
   uint tmp= (_ma_max_key_length() - 8 - HA_MAX_KEY_SEG*3);
-  return MY_MIN(HA_MAX_KEY_LENGTH, tmp);
+  return MY_MIN(MARIA_MAX_KEY_LENGTH, tmp);
 }
 
 /* Get information about the table */
diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c
index 12d39db806a..029ce4b9128 100644
--- a/storage/maria/ma_init.c
+++ b/storage/maria/ma_init.c
@@ -71,10 +71,9 @@ int maria_init(void)
     trnman_end_trans_hook= _ma_trnman_end_trans_hook;
     maria_create_trn_hook= dummy_maria_create_trn_hook;
   }
-  my_hash_init(&maria_stored_state, &my_charset_bin, 32,
-            0, sizeof(LSN), 0, (my_hash_free_key) history_state_free, 0);
-  DBUG_PRINT("info",("dummy_transaction_object: %p",
-                     &dummy_transaction_object));
+  my_hash_init(PSI_INSTRUMENT_ME, &maria_stored_state, &my_charset_bin, 32, 0,
+               sizeof(LSN), 0, (my_hash_free_key) history_state_free, 0);
+  DBUG_PRINT("info",("dummy_transaction_object: %p", &dummy_transaction_object));
   return 0;
 }
 
@@ -88,12 +87,13 @@ void maria_end(void)
     maria_inited= maria_multi_threaded= FALSE;
     ft_free_stopwords();
     ma_checkpoint_end();
-    if (translog_status == TRANSLOG_OK)
+    if (translog_status == TRANSLOG_OK && !aria_readonly)
     {
       translog_soft_sync_end();
       translog_sync();
     }
-    if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
+    if ((trid= trnman_get_max_trid()) > max_trid_in_control_file &&
+        !aria_readonly)
     {
       /*
         Store max transaction id into control file, in case logs are removed
diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c
index 5061f3d0143..51a042b381e 100644
--- a/storage/maria/ma_key.c
+++ b/storage/maria/ma_key.c
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -32,7 +33,9 @@ static int _ma_put_key_in_record(MARIA_HA *info, uint keynr,
 #define FIX_LENGTH(cs, pos, length, char_length)                            \
             do {                                                            \
               if (length > char_length)                                     \
-                char_length= (uint) my_charpos(cs, pos, pos+length, char_length); \
+                char_length= (uint) my_ci_charpos(cs, (const char *) pos,   \
+                                                      (const char *) pos+length, \
+                                                      char_length);         \
               set_if_smaller(char_length,length);                           \
             } while(0)
 
@@ -237,7 +240,7 @@ MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr,
     {
       if (type != HA_KEYTYPE_NUM)
       {
-        length= (uint) cs->cset->lengthsp(cs, (const char*)pos, length);
+        length= (uint) my_ci_lengthsp(cs, (const char*)pos, length);
       }
       else
       {
@@ -312,7 +315,7 @@ MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr,
     FIX_LENGTH(cs, pos, length, char_length);
     memcpy(key, pos, char_length);
     if (length > char_length)
-      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+      my_ci_fill(cs, (char*) key+char_length, length-char_length, ' ');
     key+= length;
   }
   _ma_dpointer(info->s, key, filepos);
@@ -438,7 +441,7 @@ MARIA_KEY *_ma_pack_key(register MARIA_HA *info, MARIA_KEY *int_key,
     FIX_LENGTH(cs, pos, length, char_length);
     memcpy(key, pos, char_length);
     if (length > char_length)
-      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+      my_ci_fill(cs, (char*) key+char_length, length-char_length, ' ');
     key+= length;
   }
   if (last_used_keyseg)
@@ -545,8 +548,7 @@ static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr,
       if (keyseg->type != (int) HA_KEYTYPE_NUM)
       {
         memcpy(pos,key,(size_t) length);
-        keyseg->charset->cset->fill(keyseg->charset,
-                                    (char*) pos + length,
+        my_ci_fill(keyseg->charset, (char*) pos + length,
                                     keyseg->length - length,
                                     ' ');
       }
diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c
index 8455cbb80de..0725959841b 100644
--- a/storage/maria/ma_loghandler.c
+++ b/storage/maria/ma_loghandler.c
@@ -1649,7 +1649,7 @@ static void translog_file_init(TRANSLOG_FILE *file, uint32 number,
 
 static my_bool translog_create_new_file()
 {
-  TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE),
+  TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE),
                                                  MYF(0));
 
   TRANSLOG_FILE *old= get_current_logfile();
@@ -1909,7 +1909,7 @@ static void translog_put_sector_protection(uchar *page,
 static uint32 translog_crc(uchar *area, uint length)
 {
   DBUG_ENTER("translog_crc");
-  DBUG_RETURN(crc32(0L, (unsigned char*) area, length));
+  DBUG_RETURN(my_checksum(0L, area, length));
 }
 
 
@@ -3660,9 +3660,9 @@ my_bool translog_init_with_table(const char *directory,
                       &log_descriptor.new_goal_cond, 0) ||
       mysql_rwlock_init(key_TRANSLOG_DESCRIPTOR_open_files_lock,
                         &log_descriptor.open_files_lock) ||
-      my_init_dynamic_array(&log_descriptor.open_files,
+      my_init_dynamic_array(PSI_INSTRUMENT_ME, &log_descriptor.open_files,
                             sizeof(TRANSLOG_FILE*), 10, 10, MYF(0)) ||
-      my_init_dynamic_array(&log_descriptor.unfinished_files,
+      my_init_dynamic_array(PSI_INSTRUMENT_ME, &log_descriptor.unfinished_files,
                             sizeof(struct st_file_counter),
                             10, 10, MYF(0)))
     goto err;
@@ -3814,7 +3814,7 @@ my_bool translog_init_with_table(const char *directory,
           We can't allocate all file together because they will be freed
           one by one
         */
-        TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(sizeof(TRANSLOG_FILE),
+        TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE),
                                                         MYF(0));
 
         compile_time_assert(MY_FILEPOS_ERROR > 0xffffffffULL);
@@ -4016,8 +4016,8 @@ my_bool translog_init_with_table(const char *directory,
                       logs_found, old_log_was_recovered));
   if (!logs_found)
   {
-    TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE),
-                                                   MYF(MY_WME));
+    TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(PSI_INSTRUMENT_ME,
+                                           sizeof(TRANSLOG_FILE), MYF(MY_WME));
     DBUG_PRINT("info", ("The log is not found => we will create new log"));
     if (file == NULL)
        goto err;
@@ -4084,7 +4084,7 @@ my_bool translog_init_with_table(const char *directory,
     Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up
     structures for generating 2-byte ids:
   */
-  id_to_share= (MARIA_SHARE **) my_malloc(SHARE_ID_MAX * sizeof(MARIA_SHARE*),
+  id_to_share= (MARIA_SHARE **) my_malloc(PSI_INSTRUMENT_ME, SHARE_ID_MAX * sizeof(MARIA_SHARE*),
                                           MYF(MY_WME | MY_ZEROFILL));
   if (unlikely(!id_to_share))
     goto err;
@@ -5649,7 +5649,7 @@ translog_write_variable_record_mgroup(LSN *lsn,
   used_buffs_init(&cursor.buffs);
   chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
 
-  if (my_init_dynamic_array(&groups,
+  if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &groups,
                             sizeof(struct st_translog_group_descriptor),
                             10, 10, MYF(0)))
   {
@@ -6994,7 +6994,7 @@ translog_variable_length_header(uchar *page, translog_size_t page_offset,
     DBUG_PRINT("info", ("multi-group"));
     grp_no= buff->groups_no= uint2korr(src + 2);
     if (!(buff->groups=
-          (TRANSLOG_GROUP*) my_malloc(sizeof(TRANSLOG_GROUP) * grp_no,
+          (TRANSLOG_GROUP*) my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_GROUP) * grp_no,
                                       MYF(0))))
       DBUG_RETURN(RECHEADER_READ_ERROR);
     DBUG_PRINT("info", ("Groups: %u", (uint) grp_no));
@@ -7925,7 +7925,8 @@ void check_skipped_lsn(MARIA_HA *info, LSN lsn, my_bool index_file,
   else
   {
     /* Give error, but don't flood the log */
-    if (skipped_lsn_err_count++ < 10 && ! info->s->redo_error_given++)
+    if (skipped_lsn_err_count++ < MAX_LSN_ERRORS &&
+        ! info->s->redo_error_given++)
     {
       eprint(tracef, "Table %s has wrong LSN: " LSN_FMT " on page: %llu",
              (index_file ? info->s->data_file_name.str :
diff --git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h
index c99f0d0af97..c5bd76bb6b8 100644
--- a/storage/maria/ma_loghandler_lsn.h
+++ b/storage/maria/ma_loghandler_lsn.h
@@ -109,4 +109,7 @@ typedef LSN LSN_WITH_FLAGS;
 */
 #define LSN_MAX (LSN)0x00FFFFFFFFFFFFFFULL
 
+/* Max number of LSN errors to print on check or recovery */
+#define MAX_LSN_ERRORS 10
+
 #endif
diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c
index 06183c72895..e3385a73f84 100644
--- a/storage/maria/ma_open.c
+++ b/storage/maria/ma_open.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
-   Copyright (c) 2009, 2019, MariaDB Corporation.
+   Copyright (c) 2009, 2022, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -24,6 +24,7 @@
 #include "ma_trnman.h"
 #include <m_ctype.h>
 #include "ma_crypt.h"
+#include "s3_func.h"
 
 #if defined(MSDOS) || defined(__WIN__)
 #ifdef __WIN__
@@ -38,7 +39,6 @@ static my_bool maria_scan_init_dummy(MARIA_HA *info);
 static void maria_scan_end_dummy(MARIA_HA *info);
 static my_bool maria_once_init_dummy(MARIA_SHARE *, File);
 static my_bool maria_once_end_dummy(MARIA_SHARE *);
-static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
 static uchar *_ma_state_info_read(uchar *, MARIA_STATE_INFO *, myf);
 
 #define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \
@@ -89,10 +89,10 @@ MARIA_HA *_ma_test_if_reopen(const char *filename)
     0   Error
 */
 
-
 static MARIA_HA *maria_clone_internal(MARIA_SHARE *share,
                                       int mode, File data_file,
-                                      uint internal_table)
+                                      uint internal_table,
+                                      struct ms3_st *s3)
 {
   int save_errno;
   uint errpos;
@@ -116,7 +116,7 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share,
   errpos= 5;
 
   /* alloc and set up private structure parts */
-  if (!my_multi_malloc(flag,
+  if (!my_multi_malloc(PSI_INSTRUMENT_ME, flag,
 		       &m_info,sizeof(MARIA_HA),
 		       &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs,
 		       &info.buff,(share->base.max_key_block_length*2+
@@ -131,6 +131,7 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share,
     goto err;
   errpos= 6;
 
+  info.s3= s3;
   memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs);
   info.lastkey_buff2= info.lastkey_buff + share->base.max_key_length;
   info.last_key.data= info.lastkey_buff;
@@ -150,7 +151,8 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share,
   info.last_loop=   share->state.update_count;
 #endif
   info.errkey= -1;
-  info.page_changed=1;
+  info.page_changed= 1;
+  info.autocommit= 1;
   info.keyread_buff= info.buff + share->base.max_key_block_length;
 
   info.lock_type= F_UNLCK;
@@ -164,7 +166,8 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share,
     goto err;
 
   /* The following should be big enough for all pinning purposes */
-  if (my_init_dynamic_array(&info.pinned_pages, sizeof(MARIA_PINNED_PAGE),
+  if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &info.pinned_pages,
+                            sizeof(MARIA_PINNED_PAGE),
                             MY_MAX(share->base.blobs*2 + 4,
                             MARIA_MAX_TREE_LEVELS*3), 16, flag))
     goto err;
@@ -184,9 +187,9 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share,
       maria_delay_key_write)
     share->delay_key_write=1;
 
-  if (!share->base.born_transactional)   /* For transactional ones ... */
+  if (!share->now_transactional)       /* If not transactional table */
   {
-    /* ... force crash if no trn given */
+    /* Pagecache requires access to info->trn->rec_lsn */
     _ma_set_tmp_trn_for_table(&info, &dummy_transaction_object);
     info.state= &share->state.state;	/* Change global values by default */
   }
@@ -238,6 +241,7 @@ err:
   case 6:
     (*share->end)(&info);
     delete_dynamic(&info.pinned_pages);
+    my_free(m_info->s3);
     my_free(m_info);
     /* fall through */
   case 5:
@@ -259,9 +263,10 @@ err:
   have an open count of 0.
 ******************************************************************************/
 
-MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
+MARIA_HA *maria_open(const char *name, int mode, uint open_flags,
+                     S3_INFO *s3)
 {
-  int kfile,open_mode,save_errno;
+  int open_mode= 0,save_errno;
   uint i,j,len,errpos,head_length,base_pos,keys, realpath_err,
     key_parts,base_key_parts,unique_key_parts,fulltext_keys,uniques;
   uint internal_table= MY_TEST(open_flags & HA_OPEN_INTERNAL_TABLE);
@@ -271,37 +276,57 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
   char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN],
        data_name[FN_REFLEN];
   uchar *UNINIT_VAR(disk_cache), *disk_pos, *end_pos;
-  MARIA_HA info, *UNINIT_VAR(m_info), *old_info;
+  MARIA_HA info, *UNINIT_VAR(m_info), *old_info= NULL;
   MARIA_SHARE share_buff,*share;
   double *rec_per_key_part;
   ulong  *nulls_per_key_part;
   my_off_t key_root[HA_MAX_POSSIBLE_KEY];
   ulonglong max_key_file_length, max_data_file_length;
-  my_bool versioning= 1;
-  File data_file= -1;
+  my_bool versioning= 1, born_transactional;
+  File data_file= -1, kfile= -1;
+  struct ms3_st *s3_client= 0;
+  S3_INFO *share_s3= 0;
+  S3_BLOCK index_header;
   DBUG_ENTER("maria_open");
 
-  kfile= -1;
   errpos= 0;
   head_length=sizeof(share_buff.state.header);
   bzero((uchar*) &info,sizeof(info));
+  bzero((uchar*) &index_header, sizeof(index_header));
 
-  realpath_err= my_realpath(name_buff, fn_format(org_name, name, "",
-                                                 MARIA_NAME_IEXT,
-                                                 MY_UNPACK_FILENAME),MYF(0));
-  if (realpath_err > 0) /* File not found, no point in looking further. */
+#ifndef WITH_S3_STORAGE_ENGINE
+  DBUG_ASSERT(!s3);
+#else
+  if (!s3)
+#endif /* WITH_S3_STORAGE_ENGINE */
   {
-    DBUG_RETURN(NULL);
-  }
+    realpath_err= my_realpath(name_buff, fn_format(org_name, name, "",
+                                                   MARIA_NAME_IEXT,
+                                                   MY_UNPACK_FILENAME),MYF(0));
+    if (realpath_err > 0) /* File not found, no point in looking further. */
+    {
+      DBUG_RETURN(NULL);
+    }
 
-  if (my_is_symlink(org_name) &&
-      (realpath_err || mysys_test_invalid_symlink(name_buff)))
+    if (my_is_symlink(org_name) &&
+        (realpath_err || mysys_test_invalid_symlink(name_buff)))
+    {
+      my_errno= HA_WRONG_CREATE_OPTION;
+      DBUG_RETURN(0);
+    }
+  }
+#ifdef WITH_S3_STORAGE_ENGINE
+  else
   {
-    my_errno= HA_WRONG_CREATE_OPTION;
-    DBUG_RETURN(0);
+    strmake(name_buff, name, sizeof(name_buff)-1); /* test_if_reopen() */
+    if (!(s3_client= s3f.open_connection(s3)))
+    {
+      internal_table= 1;                        /* Avoid unlock on error */
+      goto err;
+    }
   }
+#endif /* WITH_S3_STORAGE_ENGINE */
 
-  old_info= 0;
   if (!internal_table)
     mysql_mutex_lock(&THR_LOCK_maria);
   if ((open_flags & HA_OPEN_COPY) ||
@@ -314,32 +339,71 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
 						 (uint) strlen(name_buff),
                                                  maria_pagecache);
 
-    DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open",
-                    if (strstr(name, "/t1"))
-                    {
-                      my_errno= HA_ERR_CRASHED;
-                      goto err;
-                    });
-    DEBUG_SYNC_C("mi_open_kfile");
-    if ((kfile=mysql_file_open(key_file_kfile, name_buff,
-                               (open_mode=O_RDWR) | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
-                               MYF(common_flag | MY_NOSYMLINKS))) < 0)
+    if (!s3)
     {
-      if ((errno != EROFS && errno != EACCES) ||
-	  mode != O_RDONLY ||
-	  (kfile=mysql_file_open(key_file_kfile, name_buff,
-                                 (open_mode=O_RDONLY) | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
+      DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open",
+                      if (strstr(name, "/t1"))
+                      {
+                        my_errno= HA_ERR_CRASHED;
+                        goto err;
+                      });
+      DEBUG_SYNC_C("mi_open_kfile");
+      if ((kfile=mysql_file_open(key_file_kfile, name_buff,
+                                 (open_mode=O_RDWR) | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
                                  MYF(common_flag | MY_NOSYMLINKS))) < 0)
-	goto err;
+      {
+        if ((errno != EROFS && errno != EACCES) ||
+            mode != O_RDONLY ||
+            (kfile=mysql_file_open(key_file_kfile, name_buff,
+                                   (open_mode=O_RDONLY) | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
+                                   MYF(common_flag | MY_NOSYMLINKS))) < 0)
+          goto err;
+      }
+      errpos= 1;
+      if (mysql_file_pread(kfile,share->state.header.file_version, head_length,
+                           0, MYF(MY_NABP)))
+      {
+        my_errno= HA_ERR_NOT_A_TABLE;
+        goto err;
+      }
     }
-    share->mode=open_mode;
-    errpos= 1;
-    if (mysql_file_pread(kfile,share->state.header.file_version, head_length,
-                         0, MYF(MY_NABP)))
+#ifdef WITH_S3_STORAGE_ENGINE
+    else
     {
-      my_errno= HA_ERR_NOT_A_TABLE;
-      goto err;
+      open_mode= mode;
+      errpos= 1;
+      if (s3f.set_database_and_table_from_path(s3, name_buff))
+      {
+        my_printf_error(HA_ERR_NO_SUCH_TABLE,
+                        "Can't find database and path from %s",  MYF(0),
+                        name_buff);
+        my_errno= HA_ERR_NO_SUCH_TABLE;
+        goto err;
+      }
+      if (!(share_s3= share->s3_path= s3f.info_copy(s3)))
+        goto err;                             /* EiOM */
+
+      /* Check if table has changed in S3 */
+      if (s3f.check_frm_version(s3_client, share_s3) == 1)
+      {
+        my_errno= HA_ERR_TABLE_DEF_CHANGED;
+        goto err;
+      }
+
+      if (s3f.read_index_header(s3_client, share_s3, &index_header))
+        goto err;
+      if (index_header.length < head_length)
+      {
+        my_errno=HA_ERR_NOT_A_TABLE;
+        goto err;
+      }
+      memcpy(share->state.header.file_version, index_header.str,
+             head_length);
+      kfile= s3f.unique_file_number();
     }
+#endif /* WITH_S3_STORAGE_ENGINE */
+
+    share->mode=open_mode;
     if (memcmp(share->state.header.file_version, maria_file_magic, 4))
     {
       DBUG_PRINT("error",("Wrong header in %s",name_buff));
@@ -368,23 +432,31 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       my_errno= HA_ERR_UNSUPPORTED;
       goto err;
     }
-    /* Don't call realpath() if the name can't be a link */
-    if (!strcmp(name_buff, org_name) ||
-        my_readlink(index_name, org_name, MYF(0)) == -1)
-      (void) strmov(index_name, org_name);
-    *strrchr(org_name, FN_EXTCHAR)= '\0';
-    (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT,
-                     MY_APPEND_EXT|MY_UNPACK_FILENAME);
-    if (my_is_symlink(data_name))
+    if (!s3)
     {
-      if (my_realpath(data_name, data_name, MYF(0)))
-        goto err;
-      if (mysys_test_invalid_symlink(data_name))
+      /* Don't call realpath() if the name can't be a link */
+      if (!strcmp(name_buff, org_name) ||
+          my_readlink(index_name, org_name, MYF(0)) == -1)
+        (void) strmov(index_name, org_name);
+      *strrchr(org_name, FN_EXTCHAR)= '\0';
+      (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT,
+                       MY_APPEND_EXT|MY_UNPACK_FILENAME);
+      if (my_is_symlink(data_name))
       {
-        my_errno= HA_WRONG_CREATE_OPTION;
-        goto err;
+        if (my_realpath(data_name, data_name, MYF(0)))
+          goto err;
+        if (mysys_test_invalid_symlink(data_name))
+        {
+          my_errno= HA_WRONG_CREATE_OPTION;
+          goto err;
+        }
+        share->mode|= O_NOFOLLOW; /* all symlinks are resolved by realpath() */
       }
-      share->mode|= O_NOFOLLOW; /* all symlinks are resolved by realpath() */
+    }
+    else
+    {
+      /* Don't show DIRECTORY in show create table */
+      index_name[0]= data_name[0]= 0;
     }
 
     info_length=mi_uint2korr(share->state.header.header_length);
@@ -394,7 +466,8 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       Allocate space for header information and for data that is too
       big to keep on stack
     */
-    if (!(disk_cache= my_malloc(info_length+128, MYF(MY_WME | common_flag))))
+    if (!(disk_cache= my_malloc(PSI_INSTRUMENT_ME, info_length+128,
+                                MYF(MY_WME | common_flag))))
     {
       my_errno=ENOMEM;
       goto err;
@@ -402,11 +475,26 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
 
     end_pos=disk_cache+info_length;
     errpos= 3;
-    if (mysql_file_pread(kfile, disk_cache, info_length, 0L, MYF(MY_NABP)))
+    if (!s3)
     {
-      _ma_set_fatal_error(share, HA_ERR_CRASHED);
-      goto err;
+      if (mysql_file_pread(kfile, disk_cache, info_length, 0L, MYF(MY_NABP)))
+      {
+        _ma_set_fatal_error(share, HA_ERR_CRASHED);
+        goto err;
+      }
     }
+#ifdef WITH_S3_STORAGE_ENGINE
+    else
+    {
+      if (index_header.length < info_length)
+      {
+        my_errno=HA_ERR_NOT_A_TABLE;
+        goto err;
+      }
+      memcpy(disk_cache, index_header.str, info_length);
+    }
+#endif /* WITH_S3_STORAGE_ENGINE */
+
     len=mi_uint2korr(share->state.header.state_info_length);
     keys=    (uint) share->state.header.keys;
     uniques= (uint) share->state.header.uniques;
@@ -437,7 +525,8 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     file_version= (share->state.header.not_used == 0);
     if (file_version == 0)
       share->base.language= share->state.header.not_used;
-    
+    born_transactional= share->base.born_transactional;
+
     share->state.state_length=base_pos;
     /* For newly opened tables we reset the error-has-been-printed flag */
     share->state.changed&= ~STATE_CRASHED_PRINTED;
@@ -464,25 +553,28 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       - share->state.create_trid > trnman_get_max_trid()
         - Critical as trid as stored releative to create_trid.
       - uuid is different
-      
+
         STATE_NOT_MOVABLE is reset when a table is zerofilled
         (has no LSN's and no trids)
 
       We can ignore testing uuid if STATE_NOT_MOVABLE is not set, as in this
       case the uuid will be set in _ma_mark_file_changed().
     */
-    if (share->base.born_transactional &&
+    if (born_transactional &&
         ((share->state.create_trid > trnman_get_max_trid() &&
          !maria_in_recovery) ||
          ((share->state.changed & STATE_NOT_MOVABLE) &&
           ((!(open_flags & HA_OPEN_IGNORE_MOVED_STATE) &&
-            memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE))))))
+            memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE)))) ||
+         ((share->state.changed & (STATE_MOVED | STATE_NOT_ZEROFILLED)) ==
+          (STATE_MOVED | STATE_NOT_ZEROFILLED))))
     {
-      DBUG_PRINT("warning", ("table is moved from another system.  uuid_diff: %d  create_trid: %lu  max_trid: %lu",
+      DBUG_PRINT("warning", ("table is moved from another system.  uuid_diff: %d  create_trid: %lu  max_trid: %lu  moved: %d",
                             memcmp(share->base.uuid, maria_uuid,
                                    MY_UUID_SIZE) != 0,
                              (ulong) share->state.create_trid,
-                             (ulong) trnman_get_max_trid()));
+                             (ulong) trnman_get_max_trid(),
+                             MY_TEST((share->state.changed & STATE_MOVED))));
       if (open_flags & HA_OPEN_FOR_REPAIR)
         share->state.changed|= STATE_MOVED;
       else
@@ -509,7 +601,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     }
 
     /* Ensure we have space in the key buffer for transaction id's */
-    if (share->base.born_transactional)
+    if (born_transactional)
       share->base.max_key_length= ALIGN_SIZE(share->base.max_key_length +
                                              MARIA_MAX_PACK_TRANSID_SIZE);
 
@@ -528,7 +620,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       my_errno=HA_ERR_UNSUPPORTED;
       my_printf_error(my_errno, "Wrong block size %u; Expected %u",
                       MYF(0),
-                      (uint) share->base.block_size, 
+                      (uint) share->base.block_size,
                       (uint) maria_block_size);
       goto err;
     }
@@ -562,7 +654,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     share->index_file_name.length=  strlen(index_name);
     share->data_file_name.length=   strlen(data_name);
     share->open_file_name.length=   strlen(name);
-    if (!my_multi_malloc(MYF(MY_WME | common_flag),
+    if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME | common_flag),
 			 &share,sizeof(*share),
 			 &rec_per_key_part, sizeof(double) * key_parts,
                          &nulls_per_key_part, sizeof(long)* key_parts,
@@ -608,7 +700,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
 
     share->block_size= share->base.block_size;   /* Convenience */
     share->max_index_block_size= share->block_size - KEYPAGE_CHECKSUM_SIZE;
-    share->keypage_header= ((share->base.born_transactional ?
+    share->keypage_header= ((born_transactional ?
                              LSN_STORE_SIZE + TRANSID_SIZE :
                              0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE +
                             KEYPAGE_USED_SIZE);
@@ -628,6 +720,12 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
 	disk_pos=_ma_keydef_read(disk_pos, keyinfo);
         keyinfo->key_nr= i;
 
+        /* Calculate length to store a key + nod flag and transaction info */
+        keyinfo->max_store_length= (keyinfo->maxlength +
+                                    share->base.key_reflength);
+        if (born_transactional)
+          keyinfo->max_store_length+= MARIA_INDEX_OVERHEAD_SIZE;
+
         /* See ma_delete.cc::underflow() */
         if (!(keyinfo->flag & (HA_BINARY_PACK_KEY | HA_PACK_KEY)))
           keyinfo->underflow_block_length= keyinfo->block_length/3;
@@ -764,9 +862,9 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
         file for REPAIR. Don't do logging. This base information will not go
         to disk.
       */
-      share->base.born_transactional= FALSE;
+      born_transactional= FALSE;
     }
-    if (share->base.born_transactional)
+    if (born_transactional)
     {
       share->page_type= PAGECACHE_LSN_PAGE;
       if (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS)
@@ -817,7 +915,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     }
     else
       share->page_type= PAGECACHE_PLAIN_PAGE;
-    share->now_transactional= share->base.born_transactional;
+    share->now_transactional= born_transactional;
 
     /* Use pack_reclength as we don't want to modify base.pack_recklength */
     if (share->state.header.org_data_file_type == DYNAMIC_RECORD)
@@ -870,9 +968,16 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     if ((share->data_file_type == BLOCK_RECORD ||
          share->data_file_type == COMPRESSED_RECORD))
     {
-      if (_ma_open_datafile(&info, share))
-        goto err;
-      data_file= info.dfile.file;
+      if (!s3)
+      {
+        if (_ma_open_datafile(&info, share))
+          goto err;
+        data_file= info.dfile.file;
+      }
+#ifdef WITH_S3_STORAGE_ENGINE
+      else
+        data_file= info.dfile.file= s3f.unique_file_number();
+#endif /* WITH_S3_STORAGE_ENGINE */
     }
     errpos= 5;
 
@@ -914,6 +1019,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     max_data_file_length= share->base.max_data_file_length;
     if ((*share->once_init)(share, info.dfile.file))
       goto err;
+    errpos= 6;
     if (internal_table)
       set_if_smaller(share->base.max_data_file_length,
                      max_data_file_length);
@@ -941,13 +1047,15 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       {
         /* Table is not part of any active transaction; Create new history */
         if (!(share->state_history= (MARIA_STATE_HISTORY *)
-              my_malloc(sizeof(*share->state_history), MYF(MY_WME))))
+              my_malloc(PSI_INSTRUMENT_ME, sizeof(*share->state_history),
+                        MYF(MY_WME))))
           goto err;
         share->state_history->trid= 0;          /* Visible by all */
         share->state_history->state= share->state.state;
         share->state_history->next= 0;
       }
     }
+    errpos= 7;
     thr_lock_init(&share->lock);
     mysql_mutex_init(key_SHARE_intern_lock,
                      &share->intern_lock, MY_MUTEX_INIT_FAST);
@@ -1042,6 +1150,13 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       info.s= share;
       maria_extra(&info, HA_EXTRA_MMAP, 0);
     }
+#ifdef WITH_S3_STORAGE_ENGINE
+    if (s3_client)
+    {
+      size_t block_size= share->base.s3_block_size;
+      s3f.set_option(s3_client, MS3_OPT_BUFFER_CHUNK_SIZE, &block_size);
+    }
+#endif /* WITH_S3_STORAGE_ENGINE */
   }
   else
   {
@@ -1050,8 +1165,13 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
       data_file= share->bitmap.file.file;       /* Only opened once */
   }
 
+#ifdef WITH_S3_STORAGE_ENGINE
+  if (index_header.alloc_ptr)
+    s3f.free(&index_header);
+#endif /* WITH_S3_STORAGE_ENGINE */
+
   if (!(m_info= maria_clone_internal(share, mode, data_file,
-                                     internal_table)))
+                                     internal_table, s3_client)))
     goto err;
 
   if (maria_is_crashed(m_info))
@@ -1062,6 +1182,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     mysql_mutex_unlock(&THR_LOCK_maria);
 
   m_info->open_flags= open_flags;
+  m_info->stack_end_ptr= &my_thread_var->stack_ends_here;
   DBUG_PRINT("exit", ("table: %p  name: %s",m_info, name));
   DBUG_RETURN(m_info);
 
@@ -1078,12 +1199,19 @@ err:
     _ma_report_error(save_errno, &tmp_name);
   }
   switch (errpos) {
+  case 7:
+    thr_lock_delete(&share->lock);
+    /* fall through */
+  case 6:
+    /* Avoid mutex test in _ma_bitmap_end() */
+    share->internal_table= 1;
+    (*share->once_end)(share);
+    /* fall through */
   case 5:
-    if (data_file >= 0)
+    if (data_file >= 0 && !s3_client)
       mysql_file_close(data_file, MYF(0));
     if (old_info)
       break;					/* Don't remove open table */
-    (*share->once_end)(share);
     /* fall through */
   case 4:
     ma_crypt_free(share);
@@ -1094,12 +1222,20 @@ err:
     my_free(share_buff.state.rec_per_key_part);
     /* fall through */
   case 1:
-    mysql_file_close(kfile,MYF(0));
+    if (!s3)
+      mysql_file_close(kfile,MYF(0));
+    my_free(share_s3);
     /* fall through */
   case 0:
   default:
     break;
   }
+#ifdef WITH_S3_STORAGE_ENGINE
+  if (s3_client)
+    s3f.deinit(s3_client);
+  if (index_header.alloc_ptr)
+    s3f.free(&index_header);
+#endif /* WITH_S3_STORAGE_ENGINE */
   if (!internal_table)
     mysql_mutex_unlock(&THR_LOCK_maria);
   my_errno= save_errno;
@@ -1117,7 +1253,7 @@ my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
   if (*old_size < new_size)
   {
     uchar *addr;
-    if (!(addr= (uchar*) my_realloc(*old_addr, new_size,
+    if (!(addr= (uchar*) my_realloc(PSI_INSTRUMENT_ME, *old_addr, new_size,
                                     MYF(MY_ALLOW_ZERO_PTR | flag))))
       return 1;
     *old_addr= addr;
@@ -1509,7 +1645,7 @@ static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state, myf flag)
 
   /* Allocate memory for key parts if not already done */
   if (!state->rec_per_key_part &&
-      !my_multi_malloc(MYF(MY_WME | flag),
+      !my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME | flag),
                        &state->rec_per_key_part,
                        sizeof(*state->rec_per_key_part) * key_parts,
                        &state->nulls_per_key_part,
@@ -1597,7 +1733,7 @@ uint _ma_state_info_read_dsk(File file __attribute__((unused)),
 
 
 /****************************************************************************
-**  store and read of MARIA_BASE_INFO
+**  store MARIA_BASE_INFO
 ****************************************************************************/
 
 uint _ma_base_info_write(File file, MARIA_BASE_INFO *base)
@@ -1633,61 +1769,20 @@ uint _ma_base_info_write(File file, MARIA_BASE_INFO *base)
   *ptr++= base->keys;
   *ptr++= base->auto_key;
   *ptr++= base->born_transactional;
-  *ptr++= 0;                                    /* Reserved */
+  *ptr++= base->compression_algorithm;
   mi_int2store(ptr,base->pack_bytes);			ptr+= 2;
   mi_int2store(ptr,base->blobs);			ptr+= 2;
   mi_int2store(ptr,base->max_key_block_length);		ptr+= 2;
   mi_int2store(ptr,base->max_key_length);		ptr+= 2;
   mi_int2store(ptr,base->extra_alloc_bytes);		ptr+= 2;
   *ptr++= base->extra_alloc_procent;
-  bzero(ptr,16);					ptr+= 16; /* extra */
+  mi_int3store(ptr, base->s3_block_size);               ptr+= 3;
+  bzero(ptr,13);					ptr+= 13; /* extra */
   DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE);
   return mysql_file_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
 }
 
 
-static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
-{
-  bmove(base->uuid, ptr, MY_UUID_SIZE);                 ptr+= MY_UUID_SIZE;
-  base->keystart= mi_sizekorr(ptr);			ptr+= 8;
-  base->max_data_file_length= mi_sizekorr(ptr); 	ptr+= 8;
-  base->max_key_file_length= mi_sizekorr(ptr);		ptr+= 8;
-  base->records=  (ha_rows) mi_sizekorr(ptr);		ptr+= 8;
-  base->reloc= (ha_rows) mi_sizekorr(ptr);		ptr+= 8;
-  base->mean_row_length= mi_uint4korr(ptr);		ptr+= 4;
-  base->reclength= mi_uint4korr(ptr);			ptr+= 4;
-  base->pack_reclength= mi_uint4korr(ptr);		ptr+= 4;
-  base->min_pack_length= mi_uint4korr(ptr);		ptr+= 4;
-  base->max_pack_length= mi_uint4korr(ptr);		ptr+= 4;
-  base->min_block_length= mi_uint4korr(ptr);		ptr+= 4;
-  base->fields= mi_uint2korr(ptr);			ptr+= 2;
-  base->fixed_not_null_fields= mi_uint2korr(ptr);       ptr+= 2;
-  base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2;
-  base->max_field_lengths= mi_uint2korr(ptr);	        ptr+= 2;
-  base->pack_fields= mi_uint2korr(ptr);			ptr+= 2;
-  base->extra_options= mi_uint2korr(ptr);		ptr+= 2;
-  base->null_bytes= mi_uint2korr(ptr);			ptr+= 2;
-  base->original_null_bytes= mi_uint2korr(ptr);		ptr+= 2;
-  base->field_offsets= mi_uint2korr(ptr);		ptr+= 2;
-  base->language= mi_uint2korr(ptr);		        ptr+= 2;
-  base->block_size= mi_uint2korr(ptr);			ptr+= 2;
-
-  base->rec_reflength= *ptr++;
-  base->key_reflength= *ptr++;
-  base->keys=	       *ptr++;
-  base->auto_key=      *ptr++;
-  base->born_transactional= *ptr++;
-  ptr++;
-  base->pack_bytes= mi_uint2korr(ptr);			ptr+= 2;
-  base->blobs= mi_uint2korr(ptr);			ptr+= 2;
-  base->max_key_block_length= mi_uint2korr(ptr);	ptr+= 2;
-  base->max_key_length= mi_uint2korr(ptr);		ptr+= 2;
-  base->extra_alloc_bytes= mi_uint2korr(ptr);		ptr+= 2;
-  base->extra_alloc_procent= *ptr++;
-  ptr+= 16;
-  return ptr;
-}
-
 /*--------------------------------------------------------------------------
   maria_keydef
 ---------------------------------------------------------------------------*/
@@ -1835,7 +1930,7 @@ uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef)
   columndef->empty_pos= mi_uint2korr(ptr);	ptr+= 2;
   columndef->null_bit=  (uint8) *ptr++;
   columndef->empty_bit= (uint8) *ptr++;
-  high_offset=       mi_uint2korr(ptr);         ptr+= 2;  
+  high_offset=       mi_uint2korr(ptr);         ptr+= 2;
   columndef->offset|= ((ulong) high_offset << 16);
   ptr+= 2;
   return ptr;
@@ -1953,13 +2048,13 @@ void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file,
 
 int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share)
 {
-  myf flags= MY_WME | (share->mode & O_NOFOLLOW ? MY_NOSYMLINKS : 0);
+  myf flags= (share->mode & O_NOFOLLOW) ? MY_NOSYMLINKS | MY_WME : MY_WME;
   if (share->temporary)
     flags|= MY_THREAD_SPECIFIC;
   DEBUG_SYNC_C("mi_open_datafile");
   info->dfile.file= share->bitmap.file.file=
     mysql_file_open(key_file_dfile, share->data_file_name.str,
-                    share->mode | O_SHARE | O_CLOEXEC, MYF(flags));
+                    share->mode | O_SHARE | O_CLOEXEC, flags);
   return info->dfile.file >= 0 ? 0 : 1;
 }
 
diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c
index d1c30a57146..d7f86a9a7ae 100644
--- a/storage/maria/ma_packrec.c
+++ b/storage/maria/ma_packrec.c
@@ -229,7 +229,7 @@ static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
     - Distinct column values
   */
   if (!(share->decode_trees=(MARIA_DECODE_TREE*)
-	my_malloc((uint) (trees*sizeof(MARIA_DECODE_TREE)+
+	my_malloc(PSI_INSTRUMENT_ME, (uint) (trees*sizeof(MARIA_DECODE_TREE)+
 			  intervall_length*sizeof(uchar)),
 		  MYF(MY_WME))))
     goto err0;
@@ -245,7 +245,7 @@ static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
   */
   length=(uint) (elements*2+trees*(1 << maria_quick_table_bits));
   if (!(share->decode_tables=(uint16*)
-	my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+
+	my_malloc(PSI_INSTRUMENT_ME, (length+OFFSET_TABLE_SIZE)*sizeof(uint16)+
 		  (uint) (share->pack.header_length - sizeof(header)) +
                   share->base.extra_rec_buff_size,
 		  MYF(MY_WME | MY_ZEROFILL))))
@@ -292,9 +292,9 @@ static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
       goto err3;
   /* Reallocate the decoding tables to the used size. */
   decode_table=(uint16*)
-    my_realloc((uchar*) share->decode_tables,
+    my_realloc(PSI_INSTRUMENT_ME, (uchar*) share->decode_tables,
 	       (uint) ((uchar*) decode_table - (uchar*) share->decode_tables),
-	       MYF(MY_HOLD_ON_ERROR));
+	       MYF(0));
   /* Fix the table addresses in the tree heads. */
   {
     my_ptrdiff_t diff= PTR_BYTE_DIFF(decode_table,share->decode_tables);
@@ -757,6 +757,8 @@ int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
 	      block_info.rec_len - block_info.offset, MYF(MY_NABP)))
     goto panic;
   info->update|= HA_STATE_AKTIV;
+
+  info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */
   DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf,
                                   info->rec_buff, block_info.rec_len));
 panic:
@@ -1397,8 +1399,9 @@ int _ma_read_rnd_pack_record(MARIA_HA *info,
   info->cur_row.nextpos= block_info.filepos+block_info.rec_len;
   info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
 
-  DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
-                                   info->rec_buff, block_info.rec_len));
+  info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */
+  DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                  info->rec_buff, block_info.rec_len));
  err:
   DBUG_RETURN(my_errno);
 }
diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c
index e411cfbd54c..a2e9a5cc172 100644
--- a/storage/maria/ma_pagecache.c
+++ b/storage/maria/ma_pagecache.c
@@ -85,6 +85,9 @@
     #define PAGECACHE_DEBUG
     #define PAGECACHE_DEBUG_LOG  "my_pagecache_debug.log"
 */
+#undef PAGECACHE_DEBUG
+#define PAGECACHE_DEBUG_LOG  "my_pagecache_debug.log"
+#define _VARARGS(X) X
 
 /*
   In key cache we have external raw locking here we use
@@ -146,7 +149,7 @@ struct st_pagecache_hash_link
   struct st_pagecache_block_link
     *block;                          /* reference to the block for the page: */
   PAGECACHE_FILE file;               /* from such a file                     */
-  pgcache_page_no_t pageno;            /* this page                            */
+  pgcache_page_no_t pageno;          /* this page                            */
   uint requests;                     /* number of requests for the page      */
 };
 
@@ -174,6 +177,9 @@ struct st_pagecache_hash_link
 #define PCBLOCK_CHANGED    32 /* block buffer contains a dirty page          */
 #define PCBLOCK_DIRECT_W   64 /* possible direct write to the block          */
 #define PCBLOCK_DEL_WRITE 128 /* should be written on delete                 */
+#define PCBLOCK_BIG_READ  256 /* the first block of a big read in progress,
+                                 or a non-first block that another thread is
+                                 waiting on to be read by a big read        */
 
 /* page status, returned by find_block */
 #define PAGE_READ               0
@@ -507,37 +513,45 @@ static void test_key_cache(PAGECACHE *pagecache,
 
 #define DEFAULT_PAGECACHE_DEBUG_LOG  "pagecache_debug.log"
 
-#if defined(PAGECACHE_DEBUG) && ! defined(PAGECACHE_DEBUG_LOG)
-#define PAGECACHE_DEBUG_LOG  DEFAULT_PAGECACHE_DEBUG_LOG
-#endif
-
-#if defined(PAGECACHE_DEBUG_LOG)
+#if defined(PAGECACHE_DEBUG)
 static FILE *pagecache_debug_log= NULL;
 static void pagecache_debug_print _VARARGS((const char *fmt, ...));
-#define PAGECACHE_DEBUG_OPEN                                                  \
-          if (!pagecache_debug_log)                                           \
-          {                                                                   \
-            pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w");             \
-            (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ);        \
+#define PAGECACHE_DEBUG_OPEN                                                 \
+          if (!pagecache_debug_log)                                          \
+          {                                                                  \
+            if ((pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w")))      \
+              (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ);     \
           }
 
-#define PAGECACHE_DEBUG_CLOSE                                                 \
-          if (pagecache_debug_log)                                            \
-          {                                                                   \
-            fclose(pagecache_debug_log);                                      \
-            pagecache_debug_log= 0;                                           \
+#define PAGECACHE_DEBUG_CLOSE                                                \
+          if (pagecache_debug_log)                                           \
+          {                                                                  \
+            fclose(pagecache_debug_log);                                     \
+            pagecache_debug_log= 0;                                          \
           }
 #else
 #define PAGECACHE_DEBUG_OPEN
 #define PAGECACHE_DEBUG_CLOSE
 #endif /* defined(PAGECACHE_DEBUG_LOG) */
 
-#if defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG)
+#if defined(PAGECACHE_DEBUG)
 #define KEYCACHE_PRINT(l, m) KEYCACHE_DBUG_PRINT(l,m)
+
+#ifdef PAGECACHE_DEBUG_DLOG
+#define KEYCACHE_DBUG_PRINT(l, m)                                             \
+            { if (pagecache_debug_log)                                        \
+              {                                                               \
+                fprintf(pagecache_debug_log, "%s: ", l);                      \
+                DBUG_PRINT("PCDEBUG", ("%s: ", l));                           \
+              }                                                               \
+              pagecache_debug_print m; }
+#else
 #define KEYCACHE_DBUG_PRINT(l, m)                                             \
             { if (pagecache_debug_log)                                        \
                 fprintf(pagecache_debug_log, "%s: ", l);                      \
               pagecache_debug_print m; }
+#endif
+
 
 #define KEYCACHE_DBUG_ASSERT(a)                                               \
             { if (! (a) && pagecache_debug_log)                               \
@@ -547,20 +561,21 @@ static void pagecache_debug_print _VARARGS((const char *fmt, ...));
 #define KEYCACHE_PRINT(l, m)
 #define KEYCACHE_DBUG_PRINT(l, m)  DBUG_PRINT(l, m)
 #define KEYCACHE_DBUG_ASSERT(a)    DBUG_ASSERT(a)
-#endif /* defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) */
+#endif /* defined(PAGECACHE_DEBUG) */
 
 #if defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF)
-static long pagecache_thread_id;
+static my_thread_id pagecache_thread_id;
 #define KEYCACHE_THREAD_TRACE(l)                                              \
-             KEYCACHE_DBUG_PRINT(l,("|thread %ld",pagecache_thread_id))
+             KEYCACHE_DBUG_PRINT(l,("|thread %lld",pagecache_thread_id))
 
 #define KEYCACHE_THREAD_TRACE_BEGIN(l)                                        \
             { struct st_my_thread_var *thread_var= my_thread_var;             \
               pagecache_thread_id= thread_var->id;                            \
-              KEYCACHE_DBUG_PRINT(l,("[thread %ld",pagecache_thread_id)) }
+              KEYCACHE_DBUG_PRINT(l,("[thread %lld",pagecache_thread_id));    \
+ }
 
 #define KEYCACHE_THREAD_TRACE_END(l)                                          \
-            KEYCACHE_DBUG_PRINT(l,("]thread %ld",pagecache_thread_id))
+            KEYCACHE_DBUG_PRINT(l,("]thread %lld",pagecache_thread_id))
 #else
 #define KEYCACHE_PRINT(l,m)
 #define KEYCACHE_THREAD_TRACE_BEGIN(l)
@@ -586,13 +601,13 @@ static int ___pagecache_pthread_mutex_lock(mysql_mutex_t *mutex);
 static void ___pagecache_pthread_mutex_unlock(mysql_mutex_t *mutex);
 static int ___pagecache_pthread_cond_signal(mysql_cond_t *cond);
 #define pagecache_pthread_mutex_lock(M) \
-{ DBUG_PRINT("lock", ("mutex lock 0x%lx %u", (ulong)(M), __LINE__)); \
+{ DBUG_PRINT("lock", ("mutex lock %p %u", (M), __LINE__)); \
   ___pagecache_pthread_mutex_lock(M);}
 #define pagecache_pthread_mutex_unlock(M) \
-{ DBUG_PRINT("lock", ("mutex unlock 0x%lx %u", (ulong)(M), __LINE__)); \
+{ DBUG_PRINT("lock", ("mutex unlock %p %u", (M), __LINE__)); \
   ___pagecache_pthread_mutex_unlock(M);}
 #define pagecache_pthread_cond_signal(M) \
-{ DBUG_PRINT("lock", ("signal 0x%lx %u", (ulong)(M), __LINE__)); \
+{ DBUG_PRINT("lock", ("signal %p %u", (M), __LINE__)); \
   ___pagecache_pthread_cond_signal(M);}
 #else
 #define pagecache_pthread_mutex_lock mysql_mutex_lock
@@ -667,6 +682,10 @@ static my_bool pagecache_fwrite(PAGECACHE *pagecache,
     DBUG_PRINT("error", ("write callback problem"));
     DBUG_RETURN(1);
   }
+#if __has_feature(memory_sanitizer) /* FIXME: encryption.aria_tiny etc. fail */
+  /* FIXME: ENGINE=Aria occasionally writes uninitialized data */
+  __msan_unpoison(args.page, pagecache->block_size);
+#endif
   res= (int)my_pwrite(filedesc->file, args.page, pagecache->block_size,
                  ((my_off_t) pageno << pagecache->shift), flags);
   (*filedesc->post_write_hook)(res, &args);
@@ -748,7 +767,8 @@ static inline uint next_power(uint value)
 
 size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
                      uint division_limit, uint age_threshold,
-                     uint block_size, uint changed_blocks_hash_size,
+                     uint block_size,
+                     uint changed_blocks_hash_size,
                      myf my_readwrite_flags)
 {
   size_t blocks, hash_links, length;
@@ -756,6 +776,10 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
   DBUG_ENTER("init_pagecache");
   DBUG_ASSERT(block_size >= 512);
 
+  // By default we init a regular cache (these members are set later to switch to S3)
+  pagecache->big_block_read= NULL;
+  pagecache->big_block_free= NULL;
+
   PAGECACHE_DEBUG_OPEN;
   if (pagecache->inited && pagecache->disk_blocks > 0)
   {
@@ -770,8 +794,8 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
   {
     if (mysql_mutex_init(key_PAGECACHE_cache_lock,
                          &pagecache->cache_lock, MY_MUTEX_INIT_FAST) ||
-        my_hash_init(&pagecache->files_in_flush, &my_charset_bin, 32,
-                     offsetof(struct st_file_in_flush, file),
+        my_hash_init(PSI_INSTRUMENT_ME, &pagecache->files_in_flush,
+                     &my_charset_bin, 32, offsetof(struct st_file_in_flush, file),
                      sizeof(((struct st_file_in_flush *)NULL)->file),
                      NULL, NULL, 0))
       goto err;
@@ -782,7 +806,7 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
 
   pagecache->mem_size= use_mem;
   pagecache->block_size= block_size;
-  pagecache->shift= my_bit_log2(block_size);
+  pagecache->shift= my_bit_log2_uint64(block_size);
   pagecache->readwrite_flags= my_readwrite_flags | MY_NABP | MY_WAIT_IF_FULL;
   pagecache->org_readwrite_flags= pagecache->readwrite_flags;
   DBUG_PRINT("info", ("block_size: %u", block_size));
@@ -826,15 +850,15 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
            (blocks << pagecache->shift) > use_mem && blocks > 8)
       blocks--;
     /* Allocate memory for cache page buffers */
+    pagecache->mem_size= blocks * pagecache->block_size;
     if ((pagecache->block_mem=
-      my_large_malloc(blocks * pagecache->block_size,
-                         MYF(MY_WME))))
+      my_large_malloc(&pagecache->mem_size, MYF(MY_WME))))
     {
       /*
         Allocate memory for blocks, hash_links and hash entries;
         For each block 2 hash links are allocated
       */
-      if (my_multi_malloc_large(MYF(MY_ZEROFILL),
+      if (my_multi_malloc_large(PSI_INSTRUMENT_ME, MYF(MY_ZEROFILL),
                                 &pagecache->block_root,
                                 (ulonglong) (blocks *
                                              sizeof(PAGECACHE_BLOCK_LINK)),
@@ -852,7 +876,7 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
                                              changed_blocks_hash_size),
                                 NullS))
         break;
-      my_large_free(pagecache->block_mem);
+      my_large_free(pagecache->block_mem, pagecache->mem_size);
       pagecache->block_mem= 0;
     }
     blocks= blocks / 4*3;
@@ -903,7 +927,7 @@ err:
   pagecache->blocks=  0;
   if (pagecache->block_mem)
   {
-    my_large_free(pagecache->block_mem);
+    my_large_free(pagecache->block_mem, pagecache->mem_size);
     pagecache->block_mem= NULL;
   }
   if (pagecache->block_root)
@@ -1177,7 +1201,7 @@ void end_pagecache(PAGECACHE *pagecache, my_bool cleanup)
 
     if (pagecache->block_mem)
     {
-      my_large_free(pagecache->block_mem);
+      my_large_free(pagecache->block_mem, pagecache->mem_size);
       pagecache->block_mem= NULL;
       my_free(pagecache->block_root);
       pagecache->block_root= NULL;
@@ -1350,6 +1374,8 @@ static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
       }
     }
     while (thread != last_thread);
+    DBUG_PRINT("hash", ("hash_link (link block): %p,  hash_link: %p -> %p",
+                        hash_link, hash_link->block, block));
     hash_link->block= block;
     /* Ensure that no other thread tries to use this block */
     block->status|= PCBLOCK_REASSIGNED;
@@ -1637,8 +1663,8 @@ static inline void link_hash(PAGECACHE_HASH_LINK **start,
 static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link)
 {
   DBUG_ENTER("unlink_hash");
-  DBUG_PRINT("enter", ("hash_link: %p  fd: %u  pos: %lu  requests: %u",
-                       hash_link, (uint) hash_link->file.file,
+  DBUG_PRINT("enter", ("hash_link: %p  block: %p  fd: %u  pos: %lu  requests: %u",
+                       hash_link, hash_link->block, (uint) hash_link->file.file,
                        (ulong) hash_link->pageno,
                        hash_link->requests));
   DBUG_ASSERT(hash_link->requests == 0);
@@ -1646,6 +1672,7 @@ static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link)
 
   if ((*hash_link->prev= hash_link->next))
     hash_link->next->prev= hash_link->prev;
+
   hash_link->block= NULL;
   if (pagecache->waiting_for_hash_link.last_thread)
   {
@@ -1893,6 +1920,7 @@ static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache,
                                         my_bool wrmode,
                                         my_bool block_is_copied,
                                         my_bool reg_req,
+                                        my_bool fast,
                                         int *page_st)
 {
   PAGECACHE_HASH_LINK *hash_link;
@@ -1909,6 +1937,7 @@ static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache,
   DBUG_EXECUTE("check_pagecache",
                test_key_cache(pagecache, "start of find_block", 0););
 #endif
+  DBUG_ASSERT(!fast || !wrmode);
 
 restart:
   /* Find the hash link for the requested page (file, pageno) */
@@ -2018,9 +2047,11 @@ restart:
     /* This is a request for a new page or for a page not to be removed */
     if (! block)
     {
+      DBUG_PRINT("info", ("request for a new page"));
       /* No block is assigned for the page yet */
       if (pagecache->blocks_unused)
       {
+        DBUG_PRINT("info", ("there is never used blocks"));
         if (pagecache->free_block_list)
         {
           /* There is a block in the free list. */
@@ -2054,7 +2085,13 @@ restart:
         block->last_hit_time= 0;
         block->rec_lsn= LSN_MAX;
         link_to_file_list(pagecache, block, file, 0);
+        DBUG_PRINT("hash",
+                   ("block (no block assigned): %p  hash_link: %p -> %p",
+                    block, block->hash_link, hash_link));
         block->hash_link= hash_link;
+        DBUG_PRINT("hash",
+                   ("hash_link (no block assignment): %p  hash_link: %p -> %p",
+                    hash_link, hash_link->block, block));
         hash_link->block= block;
         page_status= PAGE_TO_BE_READ;
         DBUG_PRINT("info", ("page to be read set for page %p (%u)",
@@ -2065,6 +2102,7 @@ restart:
       }
       else
       {
+        DBUG_PRINT("info", ("there is NOT never used blocks"));
 	/* There are no never used blocks, use a block from the LRU chain */
 
         /*
@@ -2076,6 +2114,8 @@ restart:
 
         if (! pagecache->used_last)
         {
+          struct st_my_thread_var *thread;
+          DBUG_PRINT("info", ("there is NOT UNUSED blocks"));
           /*
             Wait until a new block is added to the LRU chain;
             several threads might wait here for the same page,
@@ -2084,8 +2124,18 @@ restart:
             The block is given to us by the next thread executing
             link_block().
           */
+          if (fast)
+          {
+            DBUG_ASSERT(hash_link->requests == 0);
+            unlink_hash(pagecache, hash_link);
+            DBUG_PRINT("info", ("fast and no blocks in LRU"));
+
+            KEYCACHE_DBUG_PRINT("find_block",
+                                ("fast and no blocks in LRU"));
+            DBUG_RETURN(0);
+          }
 
-          struct st_my_thread_var *thread= my_thread_var;
+          thread= my_thread_var;
           thread->keycache_link= (void *) hash_link;
           wqueue_link_into_queue(&pagecache->waiting_for_block, thread);
           do
@@ -2104,13 +2154,30 @@ restart:
         }
         else
         {
+          DBUG_PRINT("info", ("take a block from LRU"));
           /*
              Take the first block from the LRU chain
              unlinking it from the chain
           */
           block= pagecache->used_last->next_used;
+          if (fast &&
+              ((block->status & (PCBLOCK_IN_FLUSH | PCBLOCK_CHANGED)) ||
+               (block->hash_link && block->hash_link != hash_link &&
+                block->hash_link->requests)))
+          {
+            DBUG_ASSERT(hash_link->requests == 0);
+            unlink_hash(pagecache, hash_link);
+            DBUG_PRINT("info", ("fast and LRU block is in switch or has "
+                                 "readers"));
+            KEYCACHE_DBUG_PRINT("find_block",
+                                ("fast and LRU block is in switch or has "
+                                 "readers"));
+            DBUG_RETURN (0);
+          }
 	  if (reg_req)
             reg_requests(pagecache, block, 1);
+          DBUG_PRINT("hash", ("hash_link (LRU): %p,  hash_link: %p -> %p",
+                              hash_link, hash_link->block, block));
           hash_link->block= block;
           DBUG_ASSERT(block->requests == 1);
         }
@@ -2181,6 +2248,8 @@ restart:
           link_to_file_list(pagecache, block, file,
                             (my_bool)(block->hash_link ? 1 : 0));
 
+          DBUG_PRINT("hash", ("block (LRU): %p,  hash_link: %p -> %p",
+                              block, block->hash_link, hash_link));
           block->hash_link= hash_link;
           PCBLOCK_INFO(block);
           block->hits_left= init_hits_left;
@@ -2266,7 +2335,7 @@ static void add_pin(PAGECACHE_BLOCK_LINK *block)
 #ifndef DBUG_OFF
   {
     PAGECACHE_PIN_INFO *info=
-      (PAGECACHE_PIN_INFO *)my_malloc(sizeof(PAGECACHE_PIN_INFO), MYF(0));
+      (PAGECACHE_PIN_INFO *)my_malloc(PSI_INSTRUMENT_ME, sizeof(PAGECACHE_PIN_INFO), MYF(0));
     info->thread= my_thread_var;
     info_link(&block->pin_list, info);
   }
@@ -2300,7 +2369,7 @@ static void remove_pin(PAGECACHE_BLOCK_LINK *block, my_bool any
 static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
 {
   PAGECACHE_LOCK_INFO *info=
-    (PAGECACHE_LOCK_INFO *)my_malloc(sizeof(PAGECACHE_LOCK_INFO), MYF(0));
+    (PAGECACHE_LOCK_INFO *)my_malloc(PSI_INSTRUMENT_ME, sizeof(PAGECACHE_LOCK_INFO), MYF(0));
   info->thread= my_thread_var;
   info->write_lock= wl;
   info_link((PAGECACHE_PIN_INFO **)&block->lock_list,
@@ -2665,8 +2734,303 @@ retry:
   DBUG_ASSERT(block->hash_link->requests > 0);
   block->hash_link->requests--;
   DBUG_RETURN(1);
+}
+
+
+/**
+   @brief Reading of a big block in the S3 storage engine.
+
+   @param pagecache    Page cache
+   @param block        Block to read
+
+   @note
+
+   Page cache is segmented in logical blocks of size 'block_size'. All
+   read requests are for blocks of 'block_size'.
+
+   When using a file with 'big blocks', the file is split into a
+   header, header size (for index information) and then blocks of
+   big_block_size.  The last block may be smaller than big_block_size.
+   All 'big blocks' are a multiple of block_size.
+   The header is never read into the page cache. It's used to store
+   the table definition and status and is only read by open().
+
+   When wanting to read a block, we register a read request for that
+   block and for the first block that is part of the big block read.  We
+   also put a special flag on the first block so that if another thread
+   would want to do a big block read, it will wait on signal, and then
+   check if the block it requested is now in the page cache. If it's
+   not in the cache it will retry.
+
+   After the big block is read, we add to the page cache every block read
+   that was not already there. Blocks that were already in the page cache
+   will not be touched and will not be added first in the FIFO.
+
+   The block for which we had a read request is added first in FIFO and
+   returned.
+*/
+
+#ifdef WITH_S3_STORAGE_ENGINE
+static void read_big_block(PAGECACHE *pagecache,
+                           PAGECACHE_BLOCK_LINK *block)
+{
+  int page_st;
+  size_t big_block_size_in_pages;
+  size_t offset;
+  pgcache_page_no_t page, our_page;
+  pgcache_page_no_t page_to_read;
+  PAGECACHE_BLOCK_LINK *block_to_read= NULL;
+  PAGECACHE_IO_HOOK_ARGS args;
+  S3_BLOCK data;
+  DBUG_ENTER("read_big_block");
+  DBUG_PRINT("enter", ("read BIG block: %p", block));
+  bzero((void*) &data, sizeof(data));
+
+  DBUG_ASSERT(block->hash_link->file.big_block_size %
+              pagecache->block_size == 0);
+  big_block_size_in_pages=
+    block->hash_link->file.big_block_size / pagecache->block_size;
+
+  our_page= block->hash_link->pageno;
+
+  /* find first page of the big block (page_to_read) */
+  page_to_read= ((block->hash_link->pageno -
+                  block->hash_link->file.head_blocks) /
+                 big_block_size_in_pages);
+  page_to_read= (page_to_read * big_block_size_in_pages +
+                 block->hash_link->file.head_blocks);
+  if (page_to_read != our_page)
+  {
+    block_to_read= find_block(pagecache, &block->hash_link->file,
+                              page_to_read, 1,
+                              FALSE, TRUE /* copy under protection (?)*/,
+                              TRUE /*register*/, FALSE, &page_st);
+    DBUG_ASSERT(block_to_read == block_to_read->hash_link->block);
+
+    if (block_to_read->status & PCBLOCK_ERROR)
+    {
+      /* The first block is in error state, so the whole operation fails */
+      DBUG_PRINT("error", ("Got error when reading first page"));
+      block->status|= PCBLOCK_ERROR;
+      block->error= block_to_read->error;
+      remove_reader(block_to_read);
+      unreg_request(pagecache, block_to_read, 1);
+      DBUG_VOID_RETURN;
+    }
+    if (block_to_read->status & PCBLOCK_BIG_READ)
+    {
+      /*
+        Another thread is reading the big block, so we wait until it
+        has read our block for us
+      */
+      struct st_my_thread_var *thread;
+      DBUG_ASSERT(page_st == PAGE_WAIT_TO_BE_READ);
+      DBUG_ASSERT(page_st != PAGE_TO_BE_READ);
+      block->status|= PCBLOCK_BIG_READ; // will be read by other thread
+      /*
+        Block read failed because somebody else is reading the first block
+        (and all other blocks part of this one).
+        Wait until block is available.
+      */
+      thread= my_thread_var;
+      /* Put the request into a queue and wait until it can be processed */
+      wqueue_add_to_queue(&block_to_read->wqueue[COND_FOR_REQUESTED], thread);
+      do
+      {
+        DBUG_PRINT("wait",
+                   ("suspend thread %s %ld", thread->name,
+                    (ulong) thread->id));
+        pagecache_pthread_cond_wait(&thread->suspend,
+                                   &pagecache->cache_lock);
+      }
+      while (thread->next);
+      // page should have been read by the other thread
+      DBUG_ASSERT(block->status & PCBLOCK_READ ||
+                  block->status & PCBLOCK_ERROR);
+      /*
+        It is possible that another thread already removed the flag (in
+        case of two threads waiting), but it does no harm to try to
+        remove it even in that case.
+      */
+      block->status&= ~PCBLOCK_BIG_READ;
+      // all is read => finish cleanly
+      DBUG_ASSERT(block_to_read != block);
+      remove_reader(block_to_read);
+      unreg_request(pagecache, block_to_read, 1);
+      DBUG_VOID_RETURN;
+    }
+    else
+    {
+     // only primary request here, PAGE_WAIT_TO_BE_READ is impossible
+     DBUG_ASSERT(page_st != PAGE_WAIT_TO_BE_READ);
+    }
+  }
+  else
+  {
+    block_to_read= block;
+    page_st= PAGE_TO_BE_READ;
+  }
+
+  DBUG_ASSERT(!(block_to_read->status & PCBLOCK_BIG_READ));
+  // Mark the first page of a big block
+  block_to_read->status|= PCBLOCK_BIG_READ;
+
+  // Don't keep cache locked during the possibly slow read from S3
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+  // perform read of big block
+  args.page= NULL;
+  args.pageno= page_to_read;
+  args.data= block->hash_link->file.callback_data;
+
+  if (pagecache->big_block_read(pagecache, &args, &block->hash_link->file,
+                                &data))
+  {
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    block_to_read->status|= PCBLOCK_ERROR;
+    block_to_read->error= (int16) my_errno;
+    pagecache->big_block_free(&data);
+    goto error;
+  }
+
+  /*
+    We need to keep the mutex locked while filling pages.
+    As there are no changed blocks to flush, this operation should
+    be reasonably fast
+  */
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+  /* Copy the first page to the cache */
+  if (page_st != PAGE_READ)
+  {
+    DBUG_ASSERT(page_st != PAGE_WAIT_TO_BE_READ);
+    memcpy(block_to_read->buffer, data.str, pagecache->block_size);
+    block_to_read->status|= PCBLOCK_READ;
+  }
+  else
+  {
+    DBUG_ASSERT(block_to_read->status & PCBLOCK_READ);
+  }
+  /* Signal that all pending requests for this page now can be processed */
+  if (block_to_read->wqueue[COND_FOR_REQUESTED].last_thread)
+    wqueue_release_queue(&block_to_read->wqueue[COND_FOR_REQUESTED]);
+
+  /* Copy the rest of the pages */
+  for (offset= pagecache->block_size, page= page_to_read + 1;
+       offset < data.length;
+       offset+= pagecache->block_size, page++)
+  {
+    DBUG_ASSERT(offset + pagecache->block_size <= data.length);
+    if (page == our_page)
+    {
+      DBUG_ASSERT(!(block->status & PCBLOCK_READ));
+      memcpy(block->buffer, data.str + offset, pagecache->block_size);
+      block->status|= PCBLOCK_READ;
+    }
+    else
+    {
+      PAGECACHE_BLOCK_LINK *bl;
+      bl= find_block(pagecache,  &block->hash_link->file, page, 1,
+                     FALSE, TRUE /* copy under protection (?)*/,
+                     TRUE /*register*/, TRUE /*fast*/, &page_st);
+      if (!bl)
+      {
+        /*
+          We cannot get this page easily.
+          Maybe we will be lucky with other pages;
+          among them there can also be a page awaited by another thread
+        */
+        continue;
+      }
+      DBUG_ASSERT(bl == bl->hash_link->block);
+      if ((bl->status & PCBLOCK_ERROR) == 0 &&
+          (page_st == PAGE_TO_BE_READ ||       // page should be read
+           (page_st == PAGE_WAIT_TO_BE_READ &&
+            (bl->status & PCBLOCK_BIG_READ)))) // or page awaited by other thread
+      {
+        memcpy(bl->buffer, data.str + offset, pagecache->block_size);
+        bl->status|= PCBLOCK_READ;
+      }
+      remove_reader(bl);
+      unreg_request(pagecache, bl, 1);
+      /* Signal that all pending requests for this page now can be processed */
+      if (bl->wqueue[COND_FOR_REQUESTED].last_thread)
+        wqueue_release_queue(&bl->wqueue[COND_FOR_REQUESTED]);
+    }
+  }
+  if (page < our_page)
+  {
+    /* loop ended before our page, but we still have to fill the requested page */
+    DBUG_ASSERT(!(block->status & PCBLOCK_READ));
+    memcpy(block->buffer,
+           data.str + ((our_page - page_to_read) * pagecache->block_size),
+           pagecache->block_size);
+    block->status|= PCBLOCK_READ;
+  }
+  pagecache->big_block_free(&data);
 
+end:
+  block_to_read->status&= ~PCBLOCK_BIG_READ;
+  if (block_to_read != block)
+  {
+    remove_reader(block_to_read);
+    unreg_request(pagecache, block_to_read, 1);
+  }
+  /* Signal that all pending requests for this page now can be processed */
+  if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+    wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+  DBUG_VOID_RETURN;
+
+error:
+  /*
+    Read failed. Set PCBLOCK_ERROR on the blocks covered by the big block.
+    NOTE(review): data already went to big_block_free(); verify data.length
+  */
+  for (offset= pagecache->block_size, page= page_to_read + 1;
+       offset < data.length;
+       offset+= pagecache->block_size, page++)
+  {
+    DBUG_ASSERT(offset + pagecache->block_size <= data.length);
+    if (page == our_page)
+    {
+      DBUG_ASSERT(!(block->status & PCBLOCK_READ));
+      block->status|= PCBLOCK_ERROR;
+      block->error= (int16) my_errno;
+    }
+    else
+    {
+      PAGECACHE_BLOCK_LINK *bl;
+      bl= find_block(pagecache,  &block->hash_link->file, page, 1,
+                     FALSE, TRUE /* copy under protection (?)*/,
+                     TRUE /*register*/, TRUE /*fast*/, &page_st);
+      if (!bl)
+      {
+        /*
+          We cannot get this page easily.
+          Maybe we will be lucky with other pages;
+          among them there can also be a page awaited by another thread
+        */
+        continue;
+      }
+      DBUG_ASSERT(bl == bl->hash_link->block);
+      if ((bl->status & PCBLOCK_ERROR) == 0 &&
+          (page_st == PAGE_TO_BE_READ ||       // page should be read
+           (page_st == PAGE_WAIT_TO_BE_READ &&
+            (bl->status & PCBLOCK_BIG_READ)))) // or page awaited by other thread
+      {
+        bl->status|= PCBLOCK_ERROR;
+        bl->error= (int16) my_errno;
+      }
+      remove_reader(bl);
+      unreg_request(pagecache, bl, 1);
+      /* Signal that all pending requests for this page now can be processed */
+      if (bl->wqueue[COND_FOR_REQUESTED].last_thread)
+        wqueue_release_queue(&bl->wqueue[COND_FOR_REQUESTED]);
+    }
+  }
+  goto end;
 }
+#endif /* WITH_S3_STORAGE_ENGINE */
 
 
 /*
@@ -2861,7 +3225,7 @@ void pagecache_unlock(PAGECACHE *pagecache,
   inc_counter_for_resize_op(pagecache);
   /* See NOTE for pagecache_unlock about registering requests */
   block= find_block(pagecache, file, pageno, 0, 0, 0,
-                    pin == PAGECACHE_PIN_LEFT_UNPINNED, &page_st);
+                    pin == PAGECACHE_PIN_LEFT_UNPINNED, FALSE, &page_st);
   PCBLOCK_INFO(block);
   DBUG_ASSERT(block != 0 && page_st == PAGE_READ);
   if (first_REDO_LSN_for_page)
@@ -2948,7 +3312,7 @@ void pagecache_unpin(PAGECACHE *pagecache,
 
   inc_counter_for_resize_op(pagecache);
   /* See NOTE for pagecache_unlock about registering requests */
-  block= find_block(pagecache, file, pageno, 0, 0, 0, 0, &page_st);
+  block= find_block(pagecache, file, pageno, 0, 0, 0, 0, FALSE, &page_st);
   DBUG_ASSERT(block != 0);
   DBUG_ASSERT(page_st == PAGE_READ);
   /* we can't unpin such page without unlock */
@@ -3349,7 +3713,7 @@ uchar *pagecache_read(PAGECACHE *pagecache,
   char llbuf[22];
   DBUG_ENTER("pagecache_read");
   DBUG_PRINT("enter", ("fd: %u  page: %s  buffer: %p  level: %u  "
-                       "t:%s  (%d)%s->%s  %s->%s",
+                       "t:%s  (%d)%s->%s  %s->%s  big block: %d",
                        (uint) file->file, ullstr(pageno, llbuf),
                        buff, level,
                        page_cache_page_type_str[type],
@@ -3357,7 +3721,8 @@ uchar *pagecache_read(PAGECACHE *pagecache,
                        page_cache_page_lock_str[lock_to_read[lock].new_lock],
                        page_cache_page_lock_str[lock_to_read[lock].unlock_lock],
                        page_cache_page_pin_str[new_pin],
-                       page_cache_page_pin_str[unlock_pin]));
+                       page_cache_page_pin_str[unlock_pin],
+                       MY_TEST(pagecache->big_block_read)));
   DBUG_ASSERT(buff != 0 || (buff == 0 && (unlock_pin == PAGECACHE_PIN ||
                                           unlock_pin == PAGECACHE_PIN_LEFT_PINNED)));
   DBUG_ASSERT(pageno < ((1ULL) << 40));
@@ -3369,6 +3734,14 @@ uchar *pagecache_read(PAGECACHE *pagecache,
 
 restart:
 
+  /*
+   If we use big blocks, then the big block size is a multiple of the block
+   size and we have enough blocks in cache
+  */
+  DBUG_ASSERT(!pagecache->big_block_read ||
+              (file->big_block_size != 0 &&
+               file->big_block_size % pagecache->block_size == 0));
+
   if (pagecache->can_be_used)
   {
     /* Key cache is used */
@@ -3387,19 +3760,40 @@ restart:
     pagecache->global_cache_r_requests++;
     /* See NOTE for pagecache_unlock about registering requests. */
     reg_request= ((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
-                  (new_pin == PAGECACHE_PIN));
+                  (new_pin == PAGECACHE_PIN) ||
+                  pagecache->big_block_read);
     block= find_block(pagecache, file, pageno, level,
                       lock == PAGECACHE_LOCK_WRITE, buff != 0,
-                      reg_request, &page_st);
+                      reg_request, FALSE, &page_st);
     DBUG_PRINT("info", ("Block type: %s current type %s",
                         page_cache_page_type_str[block->type],
                         page_cache_page_type_str[type]));
     if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ))
     {
-      /* The requested page is to be read into the block buffer */
-      read_block(pagecache, block,
-                 (my_bool)(page_st == PAGE_TO_BE_READ));
-      DBUG_PRINT("info", ("read is done"));
+#ifdef WITH_S3_STORAGE_ENGINE
+      if (!pagecache->big_block_read || page_st == PAGE_WAIT_TO_BE_READ)
+#endif /* WITH_S3_STORAGE_ENGINE */
+      {
+        /* The requested page is to be read into the block buffer */
+        read_block(pagecache, block, page_st == PAGE_TO_BE_READ);
+        DBUG_PRINT("info", ("read is done"));
+      }
+#ifdef WITH_S3_STORAGE_ENGINE
+      else
+      {
+        /* It is a big read and this thread should do the read */
+        DBUG_ASSERT(page_st == PAGE_TO_BE_READ);
+
+        read_big_block(pagecache, block);
+
+        if (!((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+              (new_pin == PAGECACHE_PIN)))
+        {
+          /* we registered request only for big_block_read */
+          unreg_request(pagecache, block, 1);
+        }
+      }
+#endif /* WITH_S3_STORAGE_ENGINE */
     }
     /*
       Assert after block is read. Imagine two concurrent SELECTs on same
@@ -3563,7 +3957,6 @@ void pagecache_set_write_on_delete_by_link(PAGECACHE_BLOCK_LINK *block)
 
   @retval 0 deleted or was not present at all
   @retval 1 error
-
 */
 
 static my_bool pagecache_delete_internal(PAGECACHE *pagecache,
@@ -3855,6 +4248,7 @@ restart:
         unreg_request(pagecache, block, 1);
       dec_counter_for_resize_op(pagecache);
       pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      dec_counter_for_resize_op(pagecache);
       DBUG_PRINT("info", ("restarting..."));
       goto restart;
     }
@@ -3992,6 +4386,7 @@ my_bool pagecache_write_part(PAGECACHE *pagecache,
   DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK);
   DBUG_ASSERT(offset + size <= pagecache->block_size);
   DBUG_ASSERT(pageno < ((1ULL) << 40));
+  DBUG_ASSERT(pagecache->big_block_read == 0);
 #endif
 
   if (!page_link)
@@ -4028,7 +4423,7 @@ restart:
                   (pin == PAGECACHE_PIN));
     block= find_block(pagecache, file, pageno, level,
                       TRUE, FALSE,
-                      reg_request, &page_st);
+                      reg_request, FALSE, &page_st);
     if (!block)
     {
       DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE);
@@ -4280,6 +4675,8 @@ static my_bool free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
   block->type= PAGECACHE_EMPTY_PAGE;
 #endif
   block->rec_lsn= LSN_MAX;
+  DBUG_PRINT("hash", ("block (Free): %p,  hash_link: %p -> NULL",
+                      block, block->hash_link));
   block->hash_link= NULL;
   if (block->temperature == PCBLOCK_WARM)
     pagecache->warm_blocks--;
@@ -4622,7 +5019,7 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
       if (count > FLUSH_CACHE &&
           !(cache=
             (PAGECACHE_BLOCK_LINK**)
-            my_malloc(sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0))))
+            my_malloc(PSI_INSTRUMENT_ME, sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0))))
       {
         cache= cache_buff;
         count= FLUSH_CACHE;
@@ -4978,7 +5375,7 @@ my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
      5 + /* pageno */
      LSN_STORE_SIZE /* rec_lsn */
      ) * stored_list_size;
-  if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
+  if (NULL == (str->str= my_malloc(PSI_INSTRUMENT_ME, str->length, MYF(MY_WME))))
     goto err;
   ptr= str->str;
   int8store(ptr, (ulonglong)stored_list_size);
@@ -5232,6 +5629,7 @@ static int pagecache_pthread_cond_wait(mysql_cond_t *cond,
 #endif
 #endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */
 
+
 #if defined(PAGECACHE_DEBUG)
 static int ___pagecache_pthread_mutex_lock(mysql_mutex_t *mutex)
 {
@@ -5258,32 +5656,26 @@ static int ___pagecache_pthread_cond_signal(mysql_cond_t *cond)
 }
 
 
-#if defined(PAGECACHE_DEBUG_LOG)
-
-
 static void pagecache_debug_print(const char * fmt, ...)
 {
   va_list args;
   va_start(args,fmt);
   if (pagecache_debug_log)
   {
-    VOID(vfprintf(pagecache_debug_log, fmt, args));
-    VOID(fputc('\n',pagecache_debug_log));
+    vfprintf(pagecache_debug_log, fmt, args);
+    fputc('\n',pagecache_debug_log);
+#ifdef PAGECACHE_DEBUG_DLOG
+    _db_doprnt_(fmt, args); /* NOTE(review): args reused after vfprintf() */
+#endif
   }
   va_end(args);
 }
-#endif /* defined(PAGECACHE_DEBUG_LOG) */
-
-#if defined(PAGECACHE_DEBUG_LOG)
-
 
 void pagecache_debug_log_close(void)
 {
   if (pagecache_debug_log)
     fclose(pagecache_debug_log);
 }
-#endif /* defined(PAGECACHE_DEBUG_LOG) */
-
 #endif /* defined(PAGECACHE_DEBUG) */
 
 /**
@@ -5309,8 +5701,7 @@ static void null_post_write_hook(int res __attribute__((unused)),
   return;
 }
 
-void
-pagecache_file_set_null_hooks(PAGECACHE_FILE *file)
+void pagecache_file_set_null_hooks(PAGECACHE_FILE *file)
 {
   file->pre_read_hook= null_pre_hook;
   file->post_read_hook= null_post_read_hook;
@@ -5318,4 +5709,5 @@ pagecache_file_set_null_hooks(PAGECACHE_FILE *file)
   file->post_write_hook= null_post_write_hook;
   file->flush_log_callback= null_pre_hook;
   file->callback_data= NULL;
+  file->head_blocks= file->big_block_size= 0;
 }
diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h
index 1fb677995fb..dbd86fc0def 100644
--- a/storage/maria/ma_pagecache.h
+++ b/storage/maria/ma_pagecache.h
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB
+   Copyright (c) 2011, 2020, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -86,9 +87,25 @@ typedef struct st_pagecache_io_hook_args
   uchar *crypt_buf; /* when using encryption */
 } PAGECACHE_IO_HOOK_ARGS;
 
+struct st_pagecache;
+
+/* Buffer descriptor filled in by big_block_read() (things from get_object) */
+
+typedef struct st_S3_BLOCK
+{
+  uchar *str, *alloc_ptr; /* str: data read; alloc_ptr: TODO confirm role */
+  size_t length;          /* length in bytes of the data at str */
+} S3_BLOCK;
+
+
 /* file descriptor for Maria */
 typedef struct st_pagecache_file
 {
+  /* Number of pages in the header which are not read with big blocks */
+  size_t head_blocks;
+  /* size of a big block for S3 or 0 */
+  size_t big_block_size;
+  /* File number */
   File file;
 
   /** Cannot be NULL */
@@ -99,9 +116,9 @@ typedef struct st_pagecache_file
   my_bool (*pre_write_hook)(PAGECACHE_IO_HOOK_ARGS *args);
   void (*post_write_hook)(int error, PAGECACHE_IO_HOOK_ARGS *args);
 
-  /** Cannot be NULL */
   my_bool (*flush_log_callback)(PAGECACHE_IO_HOOK_ARGS *args);
 
+  /** Cannot be NULL. NOTE(review): the null hooks setter stores NULL here */
   uchar *callback_data;
 } PAGECACHE_FILE;
 
@@ -164,6 +181,17 @@ typedef struct st_pagecache
   /* hash for other file bl.*/
   PAGECACHE_BLOCK_LINK **file_blocks;
 
+  /**
+    Function for reading a file in big chunks from S3.
+    'data' will be filled with a pointer to, and the length of, the data read.
+    start_page will contain the first page read.
+  */
+  my_bool (*big_block_read)(struct st_pagecache *pagecache,
+                            PAGECACHE_IO_HOOK_ARGS *args,
+                            struct st_pagecache_file *file, S3_BLOCK *data);
+  void (*big_block_free)(S3_BLOCK *data);
+
+
   /*
     The following variables are and variables used to hold parameters for
     initializing the key cache.
@@ -211,7 +239,7 @@ extern PAGECACHE dflt_pagecache_var, *dflt_pagecache;
 extern size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem,
                             uint division_limit, uint age_threshold,
                             uint block_size, uint changed_blocks_hash_size,
-                            myf my_read_flags);
+                            myf my_read_flags)__attribute__((visibility("default"))) ;
 extern size_t resize_pagecache(PAGECACHE *pagecache,
                               size_t use_mem, uint division_limit,
                               uint age_threshold, uint changed_blocks_hash_size);
@@ -291,7 +319,7 @@ extern int flush_pagecache_blocks_with_filter(PAGECACHE *keycache,
                                               PAGECACHE_FILE *file,
                                               enum flush_type type,
                                               PAGECACHE_FLUSH_FILTER filter,
-                                              void *filter_arg);
+                                              void *filter_arg)__attribute__((visibility("default"))) ;
 extern my_bool pagecache_delete(PAGECACHE *pagecache,
                                 PAGECACHE_FILE *file,
                                 pgcache_page_no_t pageno,
@@ -307,7 +335,7 @@ extern my_bool pagecache_delete_pages(PAGECACHE *pagecache,
                                       uint page_count,
                                       enum pagecache_page_lock lock,
                                       my_bool flush);
-extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
+extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup)__attribute__((visibility("default"))) ;
 extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
                                                          LEX_STRING *str,
                                                          LSN *min_lsn);
@@ -327,8 +355,6 @@ extern my_bool multi_pagecache_set(const uchar *key, uint length,
 				   PAGECACHE *pagecache);
 extern void multi_pagecache_change(PAGECACHE *old_data,
 				   PAGECACHE *new_data);
-extern int reset_pagecache_counters(const char *name,
-                                    PAGECACHE *pagecache);
 #ifndef DBUG_OFF
 void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file);
 #else
diff --git a/storage/maria/ma_pagecrc.c b/storage/maria/ma_pagecrc.c
index 8da6369011c..4e1389b1163 100644
--- a/storage/maria/ma_pagecrc.c
+++ b/storage/maria/ma_pagecrc.c
@@ -28,7 +28,7 @@
 
 static uint32 maria_page_crc(uint32 start, uchar *data, uint length)
 {
-  uint32 crc= crc32(start, data, length);
+  uint32 crc= my_checksum(start, data, length);
 
   /* we need this assert to get following comparison working */
   compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE ==
@@ -244,10 +244,9 @@ my_bool maria_page_crc_check_index(int res, PAGECACHE_IO_HOOK_ARGS *args)
   pgcache_page_no_t page_no= args->pageno;
   MARIA_SHARE *share= (MARIA_SHARE *)args->data;
   uint length= _ma_get_page_used(share, page);
+
   if (res)
-  {
     return 1;
-  }
   if (length > share->block_size - CRC_SIZE)
   {
     DBUG_PRINT("error", ("Wrong page length: %u", length));
@@ -261,7 +260,7 @@ my_bool maria_page_crc_check_index(int res, PAGECACHE_IO_HOOK_ARGS *args)
 
 
 /**
-  @brief Maria pages dumme read callback for temporary tables
+  @brief Maria pages dummy read callback for temporary tables
 
   @retval 0 OK
   @retval 1 Error
diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c
index eefc9777be6..60fd9b09bb3 100644
--- a/storage/maria/ma_preload.c
+++ b/storage/maria/ma_preload.c
@@ -56,7 +56,7 @@ int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves)
 
   block_length= share->pagecache->block_size;
 
-  if (!(buff= (uchar *) my_malloc(block_length, MYF(MY_WME))))
+  if (!(buff= (uchar *) my_malloc(PSI_INSTRUMENT_ME, block_length, MYF(MY_WME))))
     DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
 
   if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c
index bd434bc48e1..442adc35858 100644
--- a/storage/maria/ma_range.c
+++ b/storage/maria/ma_range.c
@@ -22,8 +22,9 @@
 #include "ma_rt_index.h"
 
 static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map,
-			      enum ha_rkey_function);
-static double _ma_search_pos(MARIA_HA *, MARIA_KEY *, uint32, my_off_t);
+			      enum ha_rkey_function, ulonglong *);
+static double _ma_search_pos(MARIA_HA *, MARIA_KEY *, uint32, my_off_t,
+                             ulonglong *page);
 static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key);
 
 
@@ -43,8 +44,9 @@ static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key);
      @retval number        Estimated number of rows
 */
 
-ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
-                            key_range *max_key)
+ha_rows maria_records_in_range(MARIA_HA *info, int inx,
+                               const key_range *min_key,
+                               const key_range *max_key, page_range *pages)
 {
   ha_rows start_pos,end_pos,res;
   MARIA_SHARE *share= info->s;
@@ -96,11 +98,11 @@ ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
   default:
     start_pos= (min_key ?
                 _ma_record_pos(info, min_key->key, min_key->keypart_map,
-                               min_key->flag) :
+                               min_key->flag, &pages->first_page) :
                 (ha_rows) 0);
     end_pos=   (max_key ?
                 _ma_record_pos(info, max_key->key, max_key->keypart_map,
-                               max_key->flag) :
+                               max_key->flag, &pages->last_page) :
                 info->state->records + (ha_rows) 1);
     res= (end_pos < start_pos ? (ha_rows) 0 :
           (end_pos == start_pos ? (ha_rows) 1 : end_pos-start_pos));
@@ -128,7 +130,8 @@ ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
 
 static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data,
                               key_part_map keypart_map,
-			      enum ha_rkey_function search_flag)
+                              enum ha_rkey_function search_flag,
+                              ulonglong *final_page)
 {
   uint inx= (uint) info->lastinx;
   uint32 nextflag;
@@ -185,7 +188,7 @@ static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data,
   */
   pos= _ma_search_pos(info, &key,
                       nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE,
-                      info->s->state.key_root[inx]);
+                      info->s->state.key_root[inx], final_page);
   if (pos >= 0.0)
   {
     DBUG_PRINT("exit",("pos: %ld",(ulong) (pos*info->state->records)));
@@ -206,7 +209,8 @@ static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data,
 */
 
 static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
-			     uint32 nextflag, my_off_t pos)
+                             uint32 nextflag, my_off_t pos,
+                             ulonglong *final_page)
 {
   int flag;
   uint keynr, UNINIT_VAR(max_keynr);
@@ -224,6 +228,7 @@ static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
                         PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS,
                         info->buff, 1))
     goto err;
+  *final_page= pos;
   flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos,
                                info->lastkey_buff, &after_key);
   keynr= _ma_keynr(&page, keypos, &max_keynr);
@@ -240,7 +245,8 @@ static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
     if (! page.node)
       offset= 0.0;
     else if ((offset= _ma_search_pos(info, key, nextflag,
-                                     _ma_kpos(page.node,keypos))) < 0)
+                                     _ma_kpos(page.node,keypos),
+                                     final_page)) < 0)
       DBUG_RETURN(offset);
   }
   else
@@ -252,7 +258,7 @@ static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
       pages we are counting keys.
 
       If this is a node then we have to search backwards to find the
-      first occurence of the key.  The row position in a node tree
+      first occurrence of the key.  The row position in a node tree
       is keynr (starting from 0) + offset for sub tree.  If there is
       no sub tree to search, then we are at start of next sub tree.
 
@@ -269,7 +275,8 @@ static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
         Matches keynr + [0-1]
       */
       if ((offset= _ma_search_pos(info, key, SEARCH_FIND,
-                                  _ma_kpos(page.node,keypos))) < 0)
+                                  _ma_kpos(page.node,keypos),
+                                  final_page)) < 0)
 	DBUG_RETURN(offset);			/* Read error */
     }
   }
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
index be8a9fe8b2a..ef8bf3b169b 100644
--- a/storage/maria/ma_recovery.c
+++ b/storage/maria/ma_recovery.c
@@ -54,6 +54,7 @@ static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
 static my_bool checkpoint_useful;
 static my_bool in_redo_phase;
 static my_bool trns_created;
+static int aria_undo_aborted= 0;
 static ulong skipped_undo_phase;
 static ulonglong now; /**< for tracking execution time of phases */
 static void (*save_error_handler_hook)(uint, const char *,myf);
@@ -115,7 +116,7 @@ prototype_undo_exec_hook(UNDO_BULK_INSERT);
 static int run_redo_phase(LSN lsn, LSN end_lsn,
                           enum maria_apply_log_way apply);
 static uint end_of_redo_phase(my_bool prepare_for_undo_phase);
-static int run_undo_phase(uint uncommitted);
+static int run_undo_phase(LSN end_undo_lsn, uint uncommitted);
 static void display_record_position(const LOG_DESC *log_desc,
                                     const TRANSLOG_HEADER_BUFFER *rec,
                                     uint number);
@@ -152,7 +153,7 @@ static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec)
   if (log_record_buffer.length < rec->record_length)
   {
     log_record_buffer.length= rec->record_length;
-    log_record_buffer.str= my_realloc(log_record_buffer.str,
+    log_record_buffer.str= my_realloc(PSI_INSTRUMENT_ME, log_record_buffer.str,
                                       rec->record_length,
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
   }
@@ -236,8 +237,8 @@ int maria_recovery_from_log(void)
 #endif
   tprint(trace_file, "TRACE of the last Aria recovery from mysqld\n");
   DBUG_ASSERT(maria_pagecache->inited);
-  res= maria_apply_log(LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, MARIA_LOG_APPLY,
-                       trace_file, TRUE, TRUE, TRUE, &warnings_count);
+  res= maria_apply_log(LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, MARIA_LOG_APPLY,
+                       trace_file, TRUE, TRUE, &warnings_count);
   if (!res)
   {
     if (warnings_count == 0 && recovery_found_crashed_tables == 0)
@@ -258,7 +259,9 @@ int maria_recovery_from_log(void)
 
    @param  from_lsn        LSN from which log reading/applying should start;
                            LSN_IMPOSSIBLE means "use last checkpoint"
-   @param  end_lsn         Apply until this. LSN_IMPOSSIBLE means until end.
+   @param  end_redo_lsn    Apply until this. LSN_IMPOSSIBLE means until end.
+   @param  end_undo_lsn    Apply all undo >= end_undo_lsn. Set to LSN_MAX if
+                           no undo's should be applied.
    @param  apply           how log records should be applied or not
    @param  trace_file      trace file where progress/debug messages will go
    @param  skip_DDLs_arg   Should DDL records (CREATE/RENAME/DROP/REPAIR)
@@ -275,10 +278,10 @@ int maria_recovery_from_log(void)
      @retval !=0    Error
 */
 
-int maria_apply_log(LSN from_lsn, LSN end_lsn,
+int maria_apply_log(LSN from_lsn, LSN end_redo_lsn, LSN end_undo_lsn,
                     enum maria_apply_log_way apply,
                     FILE *trace_file,
-                    my_bool should_run_undo_phase, my_bool skip_DDLs_arg,
+                    my_bool skip_DDLs_arg,
                     my_bool take_checkpoints, uint *warnings_count)
 {
   int error= 0;
@@ -287,19 +290,18 @@ int maria_apply_log(LSN from_lsn, LSN end_lsn,
   my_bool abort_message_printed= 0;
   DBUG_ENTER("maria_apply_log");
 
-  DBUG_ASSERT(apply == MARIA_LOG_APPLY || !should_run_undo_phase);
+  DBUG_ASSERT(apply == MARIA_LOG_APPLY || end_undo_lsn == LSN_MAX);
   DBUG_ASSERT(!maria_multi_threaded);
   recovery_warnings= recovery_found_crashed_tables= 0;
   skipped_lsn_err_count= 0;
   maria_recovery_changed_data= 0;
   /* checkpoints can happen only if TRNs have been built */
-  DBUG_ASSERT(should_run_undo_phase || !take_checkpoints);
-  DBUG_ASSERT(end_lsn == LSN_IMPOSSIBLE || should_run_undo_phase == 0);
+  DBUG_ASSERT(end_undo_lsn != LSN_MAX || !take_checkpoints);
   all_active_trans= (struct st_trn_for_recovery *)
-    my_malloc((SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery),
+    my_malloc(PSI_INSTRUMENT_ME, (SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery),
               MYF(MY_ZEROFILL));
   all_tables= (struct st_table_for_recovery *)
-    my_malloc((SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery),
+    my_malloc(PSI_INSTRUMENT_ME, (SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery),
               MYF(MY_ZEROFILL));
 
   save_error_handler_hook= error_handler_hook;
@@ -313,6 +315,7 @@ int maria_apply_log(LSN from_lsn, LSN end_lsn,
 
   recovery_message_printed= REC_MSG_NONE;
   checkpoint_useful= trns_created= FALSE;
+  aria_undo_aborted= 0;
   tracef= trace_file;
 #ifdef INSTANT_FLUSH_OF_MESSAGES
   /* enable this for instant flush of messages to trace file */
@@ -347,7 +350,7 @@ int maria_apply_log(LSN from_lsn, LSN end_lsn,
 
   now= microsecond_interval_timer();
   in_redo_phase= TRUE;
-  if (run_redo_phase(from_lsn, end_lsn, apply))
+  if (run_redo_phase(from_lsn, end_redo_lsn, apply))
   {
     ma_message_no_user(0, "Redo phase failed");
     trnman_destroy();
@@ -355,7 +358,8 @@ int maria_apply_log(LSN from_lsn, LSN end_lsn,
   }
   trnman_destroy();
 
-  if (end_lsn != LSN_IMPOSSIBLE)
+  if (end_redo_lsn != LSN_IMPOSSIBLE &&
+      (end_undo_lsn == LSN_MAX || end_undo_lsn == LSN_IMPOSSIBLE))
   {
     abort_message_printed= 1;
     if (!trace_file)
@@ -367,7 +371,7 @@ int maria_apply_log(LSN from_lsn, LSN end_lsn,
   }
 
   if ((uncommitted_trans=
-       end_of_redo_phase(should_run_undo_phase)) == (uint)-1)
+       end_of_redo_phase(end_undo_lsn != LSN_MAX)) == (uint)-1)
   {
     ma_message_no_user(0, "End of redo phase failed");
     goto err;
@@ -417,13 +421,19 @@ int maria_apply_log(LSN from_lsn, LSN end_lsn,
   }
 #endif
 
-  if (should_run_undo_phase)
+  if (end_undo_lsn != LSN_MAX)
   {
-    if (run_undo_phase(uncommitted_trans))
+    if (run_undo_phase(end_undo_lsn, uncommitted_trans))
     {
       ma_message_no_user(0, "Undo phase failed");
       goto err;
     }
+    if (aria_undo_aborted)
+      ma_message_no_user(0, "Undo phase aborted in the middle on user request");
+    else if (end_redo_lsn != LSN_IMPOSSIBLE)
+      my_message(HA_ERR_INITIALIZATION,
+                 "Maria recovery aborted as end_lsn followed by end_undo was "
+                 "reached", MYF(0));
   }
   else if (uncommitted_trans > 0)
   {
@@ -493,7 +503,8 @@ err:
 err2:
   if (trns_created)
     delete_all_transactions();
-  error= 1;
+  if (!abort_message_printed)
+    error= 1;
   if (close_all_tables())
   {
     ma_message_no_user(0, "closing of tables failed");
@@ -521,7 +532,7 @@ end:
       fprintf(stderr, "\n");
       fflush(stderr);
     }
-    if (!error)
+    if (!error && !abort_message_printed)
     {
       ma_message_no_user(ME_NOTE, "recovery done");
       maria_recovery_changed_data= 1;
@@ -540,7 +551,7 @@ end:
   {
     my_message(HA_ERR_INITIALIZATION,
                "Aria recovery failed. Please run aria_chk -r on all Aria "
-               "tables and delete all aria_log.######## files", MYF(0));
+               "tables (*.MAI) and delete all aria_log.######## files", MYF(0));
   }
   procent_printed= 0;
   /*
@@ -813,7 +824,7 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
     goto end;
   }
   /* we try hard to get create_rename_lsn, to avoid mistakes if possible */
-  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0);
   if (info)
   {
     MARIA_SHARE *share= info->s;
@@ -842,7 +853,7 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
     if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
     {
       tprint(tracef, "Table '%s' has create_rename_lsn " LSN_FMT " more "
-             "recent than record, ignoring creation",
+             "recent than record, ignoring creation\n",
              name, LSN_IN_PARTS(share->state.create_rename_lsn));
       error= 0;
       goto end;
@@ -934,7 +945,7 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
       correctly filled. So we just open the table (fortunately, an empty
       data file does not preclude this).
     */
-    if (((info= maria_open(name, O_RDONLY, 0)) == NULL) ||
+    if (((info= maria_open(name, O_RDONLY, 0, 0)) == NULL) ||
         _ma_initialize_data_file(info->s, info->dfile.file))
     {
       eprint(tracef, "Failed to open new table or write to data file");
@@ -975,7 +986,7 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
   }
   old_name= (char *)log_record_buffer.str;
   new_name= old_name + strlen(old_name) + 1;
-  tprint(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name,
+  tprint(tracef, "Table '%s' to be renamed to '%s'; old-name table ", old_name,
          new_name);
   /*
     Here is why we skip CREATE/DROP/RENAME when doing a recovery from
@@ -1005,20 +1016,20 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
     log insertions of records into the temporary table, so replaying may
     fail (grep for INCOMPLETE_LOG in files).
   */
-  info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0);
   if (info)
   {
     MARIA_SHARE *share= info->s;
     if (!share->base.born_transactional)
     {
-      tprint(tracef, ", is not transactional, ignoring renaming\n");
+      tprint(tracef, "is not transactional, ignoring renaming");
       ALERT_USER();
       error= 0;
       goto end;
     }
     if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
     {
-      tprint(tracef, ", has create_rename_lsn " LSN_FMT " more recent than"
+      tprint(tracef, "has create_rename_lsn " LSN_FMT " more recent than"
              " record, ignoring renaming",
              LSN_IN_PARTS(share->state.create_rename_lsn));
       error= 0;
@@ -1054,26 +1065,26 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
     t, renames it to u (if not testing create_rename_lsn) thus overwriting
     old-named v, drops u, and we are stuck, we have lost data.
   */
-  info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0);
   if (info)
   {
     MARIA_SHARE *share= info->s;
     /* We should not have open instances on this table. */
     if (share->reopen != 1)
     {
-      tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+      tprint(tracef, "is already open (reopen=%u)", share->reopen);
       ALERT_USER();
       goto end;
     }
     if (!share->base.born_transactional)
     {
-      tprint(tracef, ", is not transactional, ignoring renaming\n");
+      tprint(tracef, "is not transactional, ignoring renaming");
       ALERT_USER();
       goto drop;
     }
     if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
     {
-      tprint(tracef, ", has create_rename_lsn " LSN_FMT " more recent than"
+      tprint(tracef, "has create_rename_lsn " LSN_FMT " more recent than"
              " record, ignoring renaming",
              LSN_IN_PARTS(share->state.create_rename_lsn));
       /*
@@ -1091,7 +1102,7 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
     }
     if (maria_is_crashed(info))
     {
-      tprint(tracef, ", is crashed, can't rename it");
+      tprint(tracef, "is crashed, can't rename it");
       ALERT_USER();
       goto end;
     }
@@ -1118,7 +1129,7 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
     eprint(tracef, "Failed to rename table");
     goto end;
   }
-  info= maria_open(new_name, O_RDONLY, 0);
+  info= maria_open(new_name, O_RDONLY, 0, 0);
   if (info == NULL)
   {
     eprint(tracef, "Failed to open renamed table");
@@ -1177,7 +1188,7 @@ prototype_redo_exec_hook(REDO_REPAIR_TABLE)
   if (!info)
   {
     /* no such table, don't need to warn */
-    return 0;
+    DBUG_RETURN(0);
   }
 
   if (maria_is_crashed(info))
@@ -1245,7 +1256,7 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
   }
   name= (char *)log_record_buffer.str;
   tprint(tracef, "Table '%s'", name);
-  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0);
   if (info)
   {
     MARIA_SHARE *share= info->s;
@@ -1387,7 +1398,7 @@ static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id)
     goto end;
   }
   tprint(tracef, "Table '%s', id %u", name, sid);
-  info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR);
+  info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR, 0);
   if (info == NULL)
   {
     tprint(tracef, ", is absent (must have been dropped later?)"
@@ -1917,7 +1928,7 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT)
   if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
   {
     tprint(tracef, "   state has LSN " LSN_FMT " older than record, updating"
-           " rows' count\n", LSN_IN_PARTS(share->state.is_of_horizon));
+           " row count\n", LSN_IN_PARTS(share->state.is_of_horizon));
     share->state.state.records++;
     if (share->calc_checksum)
     {
@@ -1935,7 +1946,7 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT)
     info->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
                               STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
   }
-  tprint(tracef, "   rows' count %lu\n", (ulong)info->s->state.state.records);
+  tprint(tracef, "   row count: %lu\n", (ulong)info->s->state.state.records);
   /* Unpin all pages, stamp them with UNDO's LSN */
   _ma_unpin_all_pages(info, rec->lsn);
   return 0;
@@ -1973,7 +1984,7 @@ prototype_redo_exec_hook(UNDO_ROW_DELETE)
                             STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
                             STATE_NOT_MOVABLE);
   }
-  tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
+  tprint(tracef, "   row count: %lu\n", (ulong)share->state.state.records);
   _ma_unpin_all_pages(info, rec->lsn);
   return 0;
 }
@@ -2179,7 +2190,7 @@ prototype_redo_exec_hook(CLR_END)
   if (info == NULL)
     DBUG_RETURN(0);
   share= info->s;
-  tprint(tracef, "   CLR_END was about %s, undo_lsn now LSN " LSN_FMT "\n",
+  tprint(tracef, "   CLR_END was about %s, undo_lsn " LSN_FMT "\n",
          log_desc->name, LSN_IN_PARTS(previous_undo_lsn));
 
   enlarge_buffer(rec);
@@ -2236,7 +2247,7 @@ prototype_redo_exec_hook(CLR_END)
                             STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
   }
   if (row_entry)
-    tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
+    tprint(tracef, "   row count: %lu\n", (ulong)share->state.state.records);
   _ma_unpin_all_pages(info, rec->lsn);
   DBUG_RETURN(0);
 }
@@ -2248,7 +2259,7 @@ prototype_redo_exec_hook(CLR_END)
 
 prototype_redo_exec_hook(DEBUG_INFO)
 {
-  uchar *data;
+  char *data;
   enum translog_debug_info_type debug_info;
 
   enlarge_buffer(rec);
@@ -2261,11 +2272,10 @@ prototype_redo_exec_hook(DEBUG_INFO)
     return 1;
   }
   debug_info= (enum translog_debug_info_type) log_record_buffer.str[0];
-  data= log_record_buffer.str + 1;
+  data= (char*) log_record_buffer.str + 1;
   switch (debug_info) {
   case LOGREC_DEBUG_INFO_QUERY:
-    tprint(tracef, "Query: %.*s\n", rec->record_length - 1,
-           (char*) data);
+    tprint(tracef, "Query: %.*s\n", (int) rec->record_length - 1, data);
     break;
   default:
     DBUG_ASSERT(0);
@@ -2338,7 +2348,7 @@ prototype_undo_exec_hook(UNDO_ROW_INSERT)
                                    FILEID_STORE_SIZE);
   info->trn= 0;
   /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
-  tprint(tracef, "   rows' count %lu\n", (ulong)info->s->state.state.records);
+  tprint(tracef, "   row count: %lu\n", (ulong)info->s->state.state.records);
   tprint(tracef, "   undo_lsn now LSN " LSN_FMT "\n",
          LSN_IN_PARTS(trn->undo_lsn));
   return error;
@@ -2378,7 +2388,7 @@ prototype_undo_exec_hook(UNDO_ROW_DELETE)
                                    rec->record_length -
                                    (LSN_STORE_SIZE + FILEID_STORE_SIZE));
   info->trn= 0;
-  tprint(tracef, "   rows' count %lu\n   undo_lsn now LSN " LSN_FMT "\n",
+  tprint(tracef, "   row count: %lu\n   undo_lsn now LSN " LSN_FMT "\n",
          (ulong)share->state.state.records, LSN_IN_PARTS(trn->undo_lsn));
   return error;
 }
@@ -2706,8 +2716,8 @@ static int run_redo_phase(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply)
           if (lsn_end != LSN_IMPOSSIBLE && rec2.lsn >= lsn_end)
           {
             tprint(tracef,
-                   "lsn_end reached at " LSN_FMT ". "
-                   "Skipping rest of redo entries",
+                   "lsn_redo_end reached at " LSN_FMT ". "
+                   "Skipping rest of redo entries\n",
                    LSN_IN_PARTS(rec2.lsn));
             translog_destroy_scanner(&scanner);
             translog_free_record_header(&rec);
@@ -2792,7 +2802,7 @@ static int run_redo_phase(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply)
       switch (len)
       {
       case RECHEADER_READ_EOF:
-        tprint(tracef, "EOF on the log\n");
+        tprint(tracef, "*** End of log ***\n");
         break;
       case RECHEADER_READ_ERROR:
         tprint(tracef, "Error reading log\n");
@@ -2941,7 +2951,7 @@ static uint end_of_redo_phase(my_bool prepare_for_undo_phase)
 }
 
 
-static int run_undo_phase(uint uncommitted)
+static int run_undo_phase(LSN end_undo_lsn, uint uncommitted)
 {
   LSN last_undo __attribute__((unused));
   DBUG_ENTER("run_undo_phase");
@@ -2967,7 +2977,20 @@ static int run_undo_phase(uint uncommitted)
         fflush(stderr);
       }
       if ((uncommitted--) == 0)
+      {
+        if (aria_undo_aborted <= 0)
+        {
+          aria_undo_aborted= 0;
+          break;
+        }
+      }
+      if (aria_undo_aborted)
+      {
+        tprint(tracef,
+               "lsn_undo_end found. Skipping rest of undo entries\n");
         break;
+      }
+
       trn= trnman_get_any_trn();
       DBUG_ASSERT(trn != NULL);
       llstr(trn->trid, llbuf);
@@ -2995,6 +3018,12 @@ static int run_undo_phase(uint uncommitted)
           DBUG_RETURN(1);
         }
         translog_free_record_header(&rec);
+
+        if (last_undo == end_undo_lsn)
+        {
+          aria_undo_aborted= trn->undo_lsn ? 1 : -1;
+          break;
+        }
       }
 
       /* Force a crash to test recovery of recovery */
@@ -3003,6 +3032,7 @@ static int run_undo_phase(uint uncommitted)
         DBUG_ASSERT(--maria_recovery_force_crash_counter > 0);
       }
 
+      trn->undo_lsn= 0;            /* Avoid abort in trnman_rollback_trn */
       if (trnman_rollback_trn(trn))
         DBUG_RETURN(1);
       /* We could want to span a few threads (4?) instead of 1 */
@@ -3233,7 +3263,10 @@ static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
   }
   DBUG_ASSERT(share->last_version != 0);
   _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
-  tprint(tracef, ", applying record\n");
+  if (in_redo_phase)
+    tprint(tracef, ", remembering undo\n");
+  else
+    tprint(tracef, ", applying record\n");
   return info;
 }
 
@@ -3352,19 +3385,19 @@ static LSN parse_checkpoint_record(LSN lsn)
   /* dirty pages */
   nb_dirty_pages= uint8korr(ptr);
 
-  /* Ensure casts later will not loose significant bits. */
+  /* Ensure casts later will not lose significant bits. */
   DBUG_ASSERT((nb_dirty_pages <= SIZE_T_MAX/sizeof(struct st_dirty_page)) &&
               (nb_dirty_pages <= ULONG_MAX));
 
   ptr+= 8;
   tprint(tracef, "%lu dirty pages\n", (ulong) nb_dirty_pages);
-  if (my_hash_init(&all_dirty_pages, &my_charset_bin, (ulong)nb_dirty_pages,
-                   offsetof(struct st_dirty_page, file_and_page_id),
+  if (my_hash_init(PSI_INSTRUMENT_ME, &all_dirty_pages, &my_charset_bin,
+                   (ulong)nb_dirty_pages, offsetof(struct st_dirty_page, file_and_page_id),
                    sizeof(((struct st_dirty_page *)NULL)->file_and_page_id),
                    NULL, NULL, 0))
     return LSN_ERROR;
   dirty_pages_pool=
-    (struct st_dirty_page *)my_malloc((size_t)nb_dirty_pages *
+    (struct st_dirty_page *)my_malloc(PSI_INSTRUMENT_ME, (size_t)nb_dirty_pages *
                                       sizeof(struct st_dirty_page),
                                       MYF(MY_WME));
   if (unlikely(dirty_pages_pool == NULL))
diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h
index 0a75479365f..4373ef52983 100644
--- a/storage/maria/ma_recovery.h
+++ b/storage/maria/ma_recovery.h
@@ -26,10 +26,11 @@ C_MODE_START
 enum maria_apply_log_way
 { MARIA_LOG_APPLY, MARIA_LOG_DISPLAY_HEADER, MARIA_LOG_CHECK };
 int maria_recovery_from_log(void);
-int maria_apply_log(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply,
+int maria_apply_log(LSN lsn, LSN lsn_end, LSN lsn_undo_end,
+                    enum maria_apply_log_way apply,
                     FILE *trace_file,
-                    my_bool execute_undo_phase, my_bool skip_DDLs,
-                    my_bool take_checkpoints, uint *warnings_count);
+                    my_bool skip_DDLs, my_bool take_checkpoints,
+                    uint *warnings_count);
 /* Table of tables to recover */
 extern HASH tables_to_redo;
 extern ulong maria_recovery_force_crash_counter;
diff --git a/storage/maria/ma_recovery_util.h b/storage/maria/ma_recovery_util.h
index 0b02f8e51cb..39c16bc5dff 100644
--- a/storage/maria/ma_recovery_util.h
+++ b/storage/maria/ma_recovery_util.h
@@ -31,7 +31,12 @@ extern FILE *tracef;
 my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn,
                                      pgcache_page_no_t page,
                                      my_bool index);
+#ifdef WAITING_FOR_BUGFIX_TO_VSPRINTF
 void tprint(FILE *trace_file, const char *format, ...)
   ATTRIBUTE_FORMAT(printf, 2, 3);
 void eprint(FILE *trace_file, const char *format, ...)
   ATTRIBUTE_FORMAT(printf, 2, 3);
+#else
+void tprint(FILE *trace_file, const char *format, ...);
+void eprint(FILE *trace_file, const char *format, ...);
+#endif
diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c
index db5a718dbdd..a4388596f6b 100644
--- a/storage/maria/ma_rename.c
+++ b/storage/maria/ma_rename.c
@@ -48,7 +48,7 @@ int maria_rename(const char *old_name, const char *new_name)
   _ma_check_table_is_closed(new_name,"rename new table2");
 #endif
   /** @todo LOCK take X-lock on table */
-  if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR)))
+  if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR, 0)))
     DBUG_RETURN(my_errno);
   share= info->s;
 #ifdef USE_RAID
diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c
index a90efc4ca38..6fddc8955c4 100644
--- a/storage/maria/ma_rt_index.c
+++ b/storage/maria/ma_rt_index.c
@@ -66,12 +66,16 @@ static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   int key_data_length;
   uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
   MARIA_PAGE page;
+  my_bool buff_alloced;
 
-  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+  alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced,
+                 keyinfo->block_length);
+  if (!page_buf)
   {
     my_errno= HA_ERR_OUT_OF_MEM;
-    return -1;
+    return(-1);
   }
+
   if (_ma_fetch_keypage(&page, info, keyinfo, page_pos,
                         PAGECACHE_LOCK_LEFT_UNLOCKED,
                         DFLT_INIT_HITS, page_buf, 0))
@@ -165,11 +169,11 @@ static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   res= 1;
 
 ok:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   return res;
 
 err:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   info->cur_row.lastpos= HA_OFFSET_ERROR;
   return -1;
 }
@@ -329,10 +333,17 @@ static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   uint nod_flag, key_data_length;
   int res;
   uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+  my_bool buff_alloced;
   MARIA_PAGE page;
 
-  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
-    return -1;
+  alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced,
+                 keyinfo->block_length);
+  if (!page_buf)
+  {
+    my_errno= HA_ERR_OUT_OF_MEM;
+    return(-1);
+  }
+
   if (_ma_fetch_keypage(&page, info, keyinfo, page_pos,
                         PAGECACHE_LOCK_LEFT_UNLOCKED,
                          DFLT_INIT_HITS, page_buf, 0))
@@ -422,11 +433,11 @@ static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
   res= 1;
 
 ok:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   return res;
 
 err:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   info->cur_row.lastpos= HA_OFFSET_ERROR;
   return -1;
 }
@@ -603,18 +614,21 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEY *key,
   uint nod_flag;
   uint key_length= key->data_length;
   int res;
+  my_bool buff_alloced;
   uchar *page_buf, *k;
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
   MARIA_PAGE page;
   DBUG_ENTER("maria_rtree_insert_req");
 
-  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length +
-                                     MARIA_MAX_KEY_BUFF)))
+  alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced,
+                 keyinfo->block_length + keyinfo->max_store_length);
+  if (!page_buf)
   {
     my_errno= HA_ERR_OUT_OF_MEM;
     DBUG_RETURN(-1); /* purecov: inspected */
   }
+
   if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
                         DFLT_INIT_HITS, page_buf, 0))
     goto err;
@@ -695,7 +709,7 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEY *key,
   }
 
 ok:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   DBUG_RETURN(res);
 
 err:
@@ -765,6 +779,7 @@ int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level,
     case 1: /* root was split, grow a new root; very rare */
     {
       uchar *new_root_buf, *new_key_buff;
+      my_bool new_root_buf_alloced;
       my_off_t new_root;
       uint nod_flag= share->base.key_reflength;
       MARIA_PINNED_PAGE tmp_page_link, *page_link;
@@ -773,14 +788,16 @@ int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level,
       page_link= &tmp_page_link;
 
       DBUG_PRINT("rtree", ("root was split, grow a new root"));
-      if (!(new_root_buf= (uchar*) my_alloca((uint) keyinfo->block_length +
-                                             MARIA_MAX_KEY_BUFF)))
+
+      alloc_on_stack(*info->stack_end_ptr, new_root_buf, new_root_buf_alloced,
+                     keyinfo->block_length + keyinfo->max_store_length);
+      if (!new_root_buf)
       {
         my_errno= HA_ERR_OUT_OF_MEM;
         DBUG_RETURN(-1); /* purecov: inspected */
       }
 
-      bzero(new_root_buf, share->block_size);
+      bzero(new_root_buf, keyinfo->block_length);
       _ma_store_keypage_flag(share, new_root_buf, KEYPAGE_FLAG_ISNOD);
       _ma_store_keynr(share, new_root_buf, keyinfo->key_nr);
       _ma_store_page_used(share, new_root_buf, share->keypage_header);
@@ -805,14 +822,12 @@ int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level,
       _ma_kpointer(info, new_key_buff - nod_flag, old_root);
       if (maria_rtree_set_key_mbr(info, &new_key, old_root))
         goto err;
-      if (maria_rtree_add_key(&new_key, &page, NULL)
-          == -1)
+      if (maria_rtree_add_key(&new_key, &page, NULL) == -1)
         goto err;
       _ma_kpointer(info, new_key_buff - nod_flag, new_page);
       if (maria_rtree_set_key_mbr(info, &new_key, new_page))
         goto err;
-      if (maria_rtree_add_key(&new_key, &page, NULL)
-          == -1)
+      if (maria_rtree_add_key(&new_key, &page, NULL) == -1)
         goto err;
       if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS))
         goto err;
@@ -820,10 +835,10 @@ int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level,
       DBUG_PRINT("rtree", ("new root page: %lu  level: %d  nod_flag: %u",
                            (ulong) new_root, 0, page.node));
 
-      my_afree(new_root_buf);
+      stack_alloc_free(new_root_buf, new_root_buf_alloced);
       break;
 err:
-      my_afree(new_root_buf);
+      stack_alloc_free(new_root_buf, new_root_buf_alloced);
       DBUG_RETURN(-1); /* purecov: inspected */
     }
     default:
@@ -890,7 +905,7 @@ static my_bool maria_rtree_fill_reinsert_list(stPageList *ReinsertList,
   if (ReinsertList->n_pages == ReinsertList->m_pages)
   {
     ReinsertList->m_pages += REINSERT_BUFFER_INC;
-    if (!(ReinsertList->pages= (stPageLevel*)my_realloc((uchar*)ReinsertList->pages,
+    if (!(ReinsertList->pages= (stPageLevel*)my_realloc(PSI_INSTRUMENT_ME, (uchar*)ReinsertList->pages,
       ReinsertList->m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR))))
       goto err;
   }
@@ -922,17 +937,21 @@ static int maria_rtree_delete_req(MARIA_HA *info, const MARIA_KEY *key,
   ulong i;
   uint nod_flag;
   int res;
+  my_bool buff_alloced;
   uchar *page_buf, *last, *k;
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
   MARIA_PAGE page;
   DBUG_ENTER("maria_rtree_delete_req");
 
-  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+  alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced,
+                 keyinfo->block_length);
+  if (!page_buf)
   {
     my_errno= HA_ERR_OUT_OF_MEM;
-    DBUG_RETURN(-1); /* purecov: inspected */
+    DBUG_RETURN(-1);
   }
+
   if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
                         DFLT_INIT_HITS, page_buf, 0))
     goto err;
@@ -1072,11 +1091,11 @@ static int maria_rtree_delete_req(MARIA_HA *info, const MARIA_KEY *key,
   res= 1;
 
 ok:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   DBUG_RETURN(res);
 
 err:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   DBUG_RETURN(-1); /* purecov: inspected */
 }
 
@@ -1121,6 +1140,8 @@ my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
   uint key_data_length= key->data_length;
+  my_bool buff_alloced= 0;
+  uchar *page_buf= 0;
   DBUG_ENTER("maria_rtree_real_delete");
 
   if ((old_root= share->state.key_root[keyinfo->key_nr]) ==
@@ -1147,9 +1168,9 @@ my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
   {
     uint nod_flag;
     ulong i;
-    uchar *page_buf;
     MARIA_PAGE page;
     MARIA_KEY tmp_key;
+
     tmp_key.keyinfo=     key->keyinfo;
     tmp_key.data_length= key->data_length;
     tmp_key.ref_length=  key->ref_length;
@@ -1157,7 +1178,9 @@ my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
 
     if (ReinsertList.n_pages)
     {
-      if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+      alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced,
+                     keyinfo->block_length);
+      if (!page_buf)
       {
         my_errno= HA_ERR_OUT_OF_MEM;
         goto err;
@@ -1186,10 +1209,7 @@ my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
           if ((res= maria_rtree_insert_level(info, &tmp_key,
                                              ReinsertList.pages[i].level,
                                              root)) == -1)
-          {
-            my_afree(page_buf);
             goto err;
-          }
           if (res)
           {
             uint j;
@@ -1205,13 +1225,8 @@ my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
         }
         page_mark_changed(info, &page);
         if (_ma_dispose(info, page.pos, 0))
-        {
-          my_afree(page_buf);
           goto err;
-        }
       }
-      my_afree(page_buf);
-      my_free(ReinsertList.pages);
     }
 
     /* check for redundant root (not leaf, 1 child) and eliminate */
@@ -1243,9 +1258,13 @@ my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
   default:
     goto err;                                 /* purecov: inspected */
   }
+  my_free(ReinsertList.pages);
+  stack_alloc_free(page_buf, buff_alloced);
   DBUG_RETURN(0);
 
 err:
+  my_free(ReinsertList.pages);
+  stack_alloc_free(page_buf, buff_alloced);
   DBUG_RETURN(1);
 }
 
@@ -1268,14 +1287,19 @@ ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag)
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
   MARIA_PAGE page;
+  my_bool buff_alloced;
 
   if (flag & MBR_DISJOINT)
     return HA_POS_ERROR;
 
   if ((root= share->state.key_root[key->keyinfo->key_nr]) == HA_OFFSET_ERROR)
     return HA_POS_ERROR;
-  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
-    return HA_POS_ERROR;
+
+  alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced,
+                 keyinfo->block_length);
+  if (!page_buf)
+    return(HA_POS_ERROR);
+
   if (_ma_fetch_keypage(&page, info, keyinfo, root,
                         PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, page_buf,
                         0))
@@ -1343,11 +1367,11 @@ ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag)
       res= HA_POS_ERROR;
   }
 
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   return res;
 
 err:
-  my_afree(page_buf);
+  stack_alloc_free(page_buf, buff_alloced);
   return HA_POS_ERROR;
 }
 
diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c
index 1eb0ffb5b89..a0acb9ce34d 100644
--- a/storage/maria/ma_rt_split.c
+++ b/storage/maria/ma_rt_split.c
@@ -378,7 +378,7 @@ int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
   double *next_coord;
   int n_dim;
   uchar *source_cur, *cur1, *cur2;
-  uchar *new_page_buff, *log_internal_copy, *log_internal_copy_ptr,
+  uchar *new_page_buff= 0, *log_internal_copy, *log_internal_copy_ptr,
     *log_key_copy= NULL;
   int err_code= 0;
   uint new_page_length;
@@ -390,15 +390,17 @@ int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
   int max_keys= ((org_length - share->keypage_header) / (full_length));
   MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
+  my_bool new_page_buff_alloced= 0, coord_buf_alloced= 0;
   DBUG_ENTER("maria_rtree_split_page");
   DBUG_PRINT("rtree", ("splitting block"));
 
   n_dim= keyinfo->keysegs / 2;
 
-  if (!(coord_buf= (double*) my_alloca(n_dim * 2 * sizeof(double) *
-                                       (max_keys + 1 + 4) +
-                                       sizeof(SplitStruct) * (max_keys + 1))))
-    DBUG_RETURN(-1); /* purecov: inspected */
+  alloc_on_stack(*info->stack_end_ptr, coord_buf, coord_buf_alloced,
+                 (n_dim * 2 * sizeof(double) * (max_keys + 1 + 4) +
+                  sizeof(SplitStruct) * (max_keys + 1)));
+  if (!coord_buf)
+    DBUG_RETURN(-1);
 
   task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4));
 
@@ -433,14 +435,15 @@ int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
   }
 
   /* Allocate buffer for new page and piece of log record */
-  if (!(new_page_buff= (uchar*) my_alloca((uint)keyinfo->block_length +
-                                          (transactional ?
-                                           (max_keys * (2 + 2) +
-                                            1 + 2 + 1 + 2) : 0))))
+  alloc_on_stack(*info->stack_end_ptr, new_page_buff, new_page_buff_alloced,
+                  (keyinfo->block_length +
+                    (transactional ? max_keys * (2 + 2) + 1 + 2 + 1 + 2 : 0)));
+  if (!new_page_buff)
   {
     err_code= -1;
     goto split_err;
   }
+
   log_internal_copy= log_internal_copy_ptr= new_page_buff +
     keyinfo->block_length;
   bzero(new_page_buff, share->block_size);
@@ -538,9 +541,9 @@ int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
   }
   DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs));
 
-  my_afree(new_page_buff);
 split_err:
-  my_afree(coord_buf);
+  stack_alloc_free(new_page_buff, new_page_buff_alloced);
+  stack_alloc_free(coord_buf, coord_buf_alloced);
   DBUG_RETURN(err_code);
 }
 
diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c
index f7e38af3dce..3af7d93879e 100644
--- a/storage/maria/ma_rt_test.c
+++ b/storage/maria/ma_rt_test.c
@@ -95,13 +95,13 @@ int main(int argc, char *argv[])
 {
   char buff[FN_REFLEN];  
   MY_INIT(argv[0]);
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
   get_options(argc, argv);
   /* Maria requires that we always have a page cache */
   if (maria_init() ||
       (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0,
                       maria_block_size, 0, MY_WME) == 0) ||
-      ma_control_file_open(TRUE, TRUE) ||
+      ma_control_file_open(TRUE, TRUE, TRUE) ||
       (init_pagecache(maria_log_pagecache,
                       TRANSLOG_PAGECACHE_SIZE, 0, 0,
                       TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) ||
@@ -141,6 +141,7 @@ static int run_test(const char *filename)
   uchar read_record[MAX_REC_LENGTH];
   int upd= 10;
   ha_rows hrows;
+  page_range pages;
 
   bzero(&uniquedef, sizeof(uniquedef));
   bzero(&create_info, sizeof(create_info));
@@ -196,7 +197,7 @@ static int run_test(const char *filename)
   if (!silent)
     printf("- Open isam-file\n");
 
-  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED,0)))
     goto err;
   maria_begin(file);
   if (opt_versioning)
@@ -424,7 +425,7 @@ static int run_test(const char *filename)
   range.key= record+1;
   range.length= 1000;                           /* Big enough */
   range.flag= HA_READ_MBR_INTERSECT;
-  hrows= maria_records_in_range(file,0, &range, (key_range*) 0);
+  hrows= maria_records_in_range(file,0, &range, (key_range*) 0, &pages);
   if (!silent)
     printf("     %ld rows\n", (long) hrows);
 
@@ -612,8 +613,8 @@ static struct my_option my_long_options[] =
 #endif
   {"help", '?', "Display help and exit",
    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
-  {"datadir", 'h', "Path to the database root.", &maria_data_root,
-   &maria_data_root, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"datadir", 'h', "Path to the database root.", (char**) &maria_data_root,
+   (char**) &maria_data_root, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"row-fixed-size", 'S', "Fixed size records",
    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
   {"rows-in-block", 'M', "Store rows in block format",
@@ -640,10 +641,11 @@ static struct my_option my_long_options[] =
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+	       const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch(optid) {
+  switch(opt->id) {
   case 'c':
     create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
     break;
diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c
index 63035925653..a57db7d2a2d 100644
--- a/storage/maria/ma_search.c
+++ b/storage/maria/ma_search.c
@@ -114,11 +114,11 @@ static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
                               MARIA_PINNED_PAGE **res_page_link,
                               uchar **res_page_buff)
 {
-  my_bool last_key_not_used;
+  my_bool last_key_not_used, buff_alloced;
   int error,flag;
   uint page_flag, nod_flag, used_length;
   uchar *keypos,*maxpos;
-  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  uchar *lastkey;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
   MARIA_PAGE page;
   MARIA_PINNED_PAGE *page_link;
@@ -138,6 +138,11 @@ static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
     DBUG_RETURN(1);                             /* Search at upper levels */
   }
 
+  alloc_on_stack(*info->stack_end_ptr, lastkey, buff_alloced,
+                 keyinfo->max_store_length);
+  if (!lastkey)
+    DBUG_RETURN(1);
+
   if (_ma_fetch_keypage(&page, info, keyinfo, pos,
                         PAGECACHE_LOCK_READ, DFLT_INIT_HITS, 0, 0))
     goto err;
@@ -164,16 +169,17 @@ static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
     if ((error= _ma_search_no_save(info, key, nextflag,
                                    _ma_kpos(nod_flag,keypos),
                                    res_page_link, res_page_buff)) <= 0)
-      DBUG_RETURN(error);
+      goto ret_error;
 
+    error= 1;                                    /* Default return value */
     if (flag >0)
     {
       if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) &&
           keypos == page.buff + info->s->keypage_header + nod_flag)
-        DBUG_RETURN(1);                                 /* Bigger than key */
+        goto ret_error;                         /* Bigger than key */
     }
     else if (nextflag & SEARCH_BIGGER && keypos >= maxpos)
-      DBUG_RETURN(1);                                   /* Smaller than key */
+      goto ret_error;                           /* Smaller than key */
   }
   else
   {
@@ -188,7 +194,7 @@ static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
                                      _ma_kpos(nod_flag,keypos),
                                      res_page_link, res_page_buff)) >= 0 ||
           my_errno != HA_ERR_KEY_NOT_FOUND)
-        DBUG_RETURN(error);
+        goto ret_error;
     }
   }
 
@@ -233,6 +239,7 @@ static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
   *res_page_link= page_link;
   *res_page_buff= page.buff;
   
+  stack_alloc_free(lastkey, buff_alloced);
   DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
   DBUG_RETURN(0);
 
@@ -240,7 +247,11 @@ err:
   DBUG_PRINT("exit",("Error: %d",my_errno));
   info->cur_row.lastpos= HA_OFFSET_ERROR;
   info->page_changed=1;
-  DBUG_RETURN (-1);
+  error= -1;
+
+ret_error:
+  stack_alloc_free(lastkey, buff_alloced);
+  DBUG_RETURN(error);
 }
 
 
diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c
index 4dc6472bd15..8153ec701a0 100644
--- a/storage/maria/ma_sort.c
+++ b/storage/maria/ma_sort.c
@@ -194,12 +194,12 @@ int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
     }
 
     if ((sort_keys= ((uchar**)
-                     my_malloc((size_t) (keys*(sort_length+sizeof(char*))+
+                     my_malloc(PSI_INSTRUMENT_ME, (size_t) (keys*(sort_length+sizeof(char*))+
                                          HA_FT_MAXBYTELEN),
                                MYF(0)))))
     {
-      if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer,
-                                MY_MIN(maxbuffer/2, 1000), MYF(0)))
+      if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &buffpek, sizeof(BUFFPEK),
+                                maxbuffer, MY_MIN(maxbuffer/2, 1000), MYF(0)))
       {
 	my_free(sort_keys);
         sort_keys= 0;
@@ -428,11 +428,11 @@ static my_bool _ma_thr_find_all_keys_exec(MARIA_SORT_PARAM* sort_param)
       while ((maxbuffer= (uint) (idx/(keys-1)+1)) != maxbuffer_org);
     }
     if ((sort_keys= (uchar **)
-         my_malloc((size_t)(keys*(sort_length+sizeof(char*))+
+         my_malloc(PSI_INSTRUMENT_ME, (size_t)(keys*(sort_length+sizeof(char*))+
                    ((sort_param->keyinfo->flag & HA_FULLTEXT) ?
                     HA_FT_MAXBYTELEN : 0)), MYF(0))))
     {
-      if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK),
+      if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &sort_param->buffpek, sizeof(BUFFPEK),
                              maxbuffer, MY_MIN(maxbuffer / 2, 1000), MYF(0)))
       {
         my_free(sort_keys);
@@ -628,7 +628,7 @@ int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param)
         length=(size_t)param->sort_buffer_length;
         while (length >= MIN_SORT_MEMORY)
         {
-          if ((mergebuf= my_malloc((size_t) length, MYF(0))))
+          if ((mergebuf= my_malloc(PSI_INSTRUMENT_ME, (size_t) length, MYF(0))))
               break;
           length=length*3/4;
         }
diff --git a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c
index 702b1b04d43..ae8f3575438 100644
--- a/storage/maria/ma_sp_test.c
+++ b/storage/maria/ma_sp_test.c
@@ -17,7 +17,7 @@
 /* Written by Alex Barkov, who has a shared copyright to this code */
 
 #include <my_global.h>
-#include "maria.h"
+#include "maria_def.h"
 
 #ifdef HAVE_SPATIAL
 #include "ma_sp_defs.h"
@@ -71,6 +71,7 @@ int run_test(const char *filename)
   uchar read_record[MAX_REC_LENGTH];
   int upd=10;
   ha_rows hrows;
+  page_range pages;
 
   /* Define a column for NULLs and DEL markers*/
 
@@ -119,7 +120,7 @@ int run_test(const char *filename)
   if (!silent)
     printf("- Open isam-file\n");
 
-  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0)))
     goto err;
 
   if (!silent)
@@ -258,7 +259,7 @@ int run_test(const char *filename)
   max_range.key= record+1;
   max_range.length= 1000;                       /* Big enough */
   max_range.flag= HA_READ_KEY_EXACT;
-  hrows= maria_records_in_range(file,0, &min_range, &max_range);
+  hrows= maria_records_in_range(file,0, &min_range, &max_range, &pages);
   printf("     %ld rows\n", (long) hrows);
 
   if (maria_close(file)) goto err;
diff --git a/storage/maria/ma_state.c b/storage/maria/ma_state.c
index 2eff64ec7cd..c781f996f04 100644
--- a/storage/maria/ma_state.c
+++ b/storage/maria/ma_state.c
@@ -79,8 +79,8 @@ my_bool _ma_setup_live_state(MARIA_HA *info)
   }
 
   /* Table was not used before, create new table state entry */
-  if (!(tables= (MARIA_USED_TABLES*) my_malloc(sizeof(*tables),
-                                               MYF(MY_WME | MY_ZEROFILL))))
+  if (!(tables= (MARIA_USED_TABLES*) my_malloc(PSI_INSTRUMENT_ME,
+                                 sizeof(*tables), MYF(MY_WME | MY_ZEROFILL))))
     DBUG_RETURN(1);
   tables->next= trn->used_tables;
   trn->used_tables= tables;
@@ -282,7 +282,7 @@ void _ma_reset_state(MARIA_HA *info)
 			(THR_WRITE_CONCURRENT_INSERT was used)
 */
 
-void _ma_get_status(void* param, my_bool concurrent_insert)
+my_bool _ma_get_status(void* param, my_bool concurrent_insert)
 {
   MARIA_HA *info=(MARIA_HA*) param;
   DBUG_ENTER("_ma_get_status");
@@ -301,7 +301,7 @@ void _ma_get_status(void* param, my_bool concurrent_insert)
   info->state= &info->state_save;
   info->state->changed= 0;
   info->append_insert_at_end= concurrent_insert;
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(0);
 }
 
 
@@ -359,7 +359,7 @@ void _ma_update_status_with_lock(MARIA_HA *info)
     locked= 1;
     mysql_mutex_lock(&info->s->lock.mutex);
   }
-  (*info->s->lock.update_status)(info);
+  (*info->s->lock.update_status)(info->lock.status_param);
   if (locked)
     mysql_mutex_unlock(&info->s->lock.mutex);
 }
@@ -379,11 +379,12 @@ void _ma_copy_status(void* to, void *from)
 }
 
 
-void _ma_reset_update_flag(void *param,
-                           my_bool concurrent_insert __attribute__((unused)))
+my_bool _ma_reset_update_flag(void *param,
+                              my_bool concurrent_insert __attribute__((unused)))
 {
   MARIA_HA *info=(MARIA_HA*) param;
   info->state->changed= 0;
+  return 0;
 }
 
 my_bool _ma_start_trans(void* param)
@@ -477,7 +478,7 @@ my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
           /*
             The change was done without using transid on rows (like in
             bulk insert). In this case this thread is the only one
-            that is using the table and all rows will be visble
+            that is using the table and all rows will be visible
             for all transactions.
           */
           _ma_reset_history(share);
@@ -495,7 +496,8 @@ my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
               ensures that all history items are stored in the list in
               decresing trid order.
             */
-            if (!(history= my_malloc(sizeof(*history), MYF(MY_WME))))
+            if (!(history= my_malloc(PSI_INSTRUMENT_ME, sizeof(*history),
+                                     MYF(MY_WME))))
             {
               /* purecov: begin inspected */
               error= 1;
@@ -534,20 +536,17 @@ my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
                               share, share->in_trans));
         }
       }
-      share->in_trans--;
-      mysql_mutex_unlock(&share->intern_lock);
+      /* The following call frees &share->intern_lock */
+      decrement_share_in_trans(share);
     }
     else
     {
-#ifdef DBUG_ASSERT_EXISTS
       /*
-        We need to keep share->in_trans correct in the debug library
-        because of the assert in maria_close()
+        We need to keep share->in_trans correct because of the check
+        in free_maria_share()
       */
       mysql_mutex_lock(&share->intern_lock);
-      share->in_trans--;
-      mysql_mutex_unlock(&share->intern_lock);
-#endif
+      decrement_share_in_trans(share);
     }
     my_free(tables);
   }
@@ -589,6 +588,10 @@ void _ma_remove_table_from_trnman(MARIA_HA *info)
     if (tables->share == share)
     {
       *prev= tables->next;
+      /*
+        We don't have to and can't call decrement_share_in_trans(share) here
+        as we know there is an active MARIA_HA handler around.
+      */
       share->in_trans--;
       my_free(tables);
       break;
@@ -626,7 +629,7 @@ void _ma_remove_table_from_trnman(MARIA_HA *info)
 			(THR_WRITE_CONCURRENT_INSERT was used)
 */
 
-void _ma_block_get_status(void* param, my_bool concurrent_insert)
+my_bool _ma_block_get_status(void* param, my_bool concurrent_insert)
 {
   MARIA_HA *info=(MARIA_HA*) param;
   DBUG_ENTER("_ma_block_get_status");
@@ -634,9 +637,10 @@ void _ma_block_get_status(void* param, my_bool concurrent_insert)
 
   info->row_base_length= info->s->base_length;
   info->row_flag= info->s->base.default_row_flag;
-  if (concurrent_insert)
+  DBUG_ASSERT(!concurrent_insert ||
+              info->lock.type == TL_WRITE_CONCURRENT_INSERT);
+  if (concurrent_insert || !info->autocommit)
   {
-    DBUG_ASSERT(info->lock.type == TL_WRITE_CONCURRENT_INSERT);
     info->row_flag|= ROW_FLAG_TRANSID;
     info->row_base_length+= TRANSID_SIZE;
   }
@@ -644,7 +648,7 @@ void _ma_block_get_status(void* param, my_bool concurrent_insert)
   {
     DBUG_ASSERT(info->lock.type != TL_WRITE_CONCURRENT_INSERT);
   }
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(0);
 }
 
 
diff --git a/storage/maria/ma_state.h b/storage/maria/ma_state.h
index 4f099a9105c..b27b75f54ab 100644
--- a/storage/maria/ma_state.h
+++ b/storage/maria/ma_state.h
@@ -13,6 +13,10 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
+#ifndef MA_STATE_INCLUDED
+#define MA_STATE_INCLUDED
+C_MODE_START
+
 /* Struct to store tables in use by one transaction */
 
 typedef struct st_maria_status_info
@@ -61,18 +65,14 @@ MARIA_STATE_HISTORY *_ma_remove_not_visible_states(MARIA_STATE_HISTORY
                                                    my_bool all,
                                                    my_bool trman_is_locked);
 void _ma_reset_state(MARIA_HA *info);
-void _ma_get_status(void* param, my_bool concurrent_insert);
+my_bool _ma_get_status(void* param, my_bool concurrent_insert);
 void _ma_update_status(void* param);
 void _ma_update_status_with_lock(MARIA_HA *info);
 void _ma_restore_status(void *param);
 void _ma_copy_status(void* to, void *from);
-void _ma_reset_update_flag(void *param, my_bool concurrent_insert);
+my_bool _ma_reset_update_flag(void *param, my_bool concurrent_insert);
 my_bool _ma_start_trans(void* param);
 my_bool _ma_check_status(void *param);
-void _ma_block_get_status(void* param, my_bool concurrent_insert);
-void _ma_block_update_status(void *param);
-void _ma_block_restore_status(void *param);
-my_bool _ma_block_check_status(void *param);
 void maria_versioning(MARIA_HA *info, my_bool versioning);
 void _ma_set_share_data_file_length(struct st_maria_share *share,
                                     ulonglong new_length);
@@ -86,3 +86,6 @@ void _ma_remove_not_visible_states_with_lock(struct st_maria_share *share,
                                              my_bool all);
 void _ma_remove_table_from_trnman(MARIA_HA *info);
 void _ma_reset_history(struct st_maria_share *share);
+
+C_MODE_END
+#endif
diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c
index a903ee23a31..309069e2feb 100644
--- a/storage/maria/ma_static.c
+++ b/storage/maria/ma_static.c
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2010, 2020, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -32,7 +33,7 @@ uchar	maria_pack_file_magic[]=
 /* Unique number for this maria instance */
 uchar   maria_uuid[MY_UUID_SIZE];
 uint	maria_quick_table_bits=9;
-ulong	maria_block_size= MARIA_KEY_BLOCK_LENGTH;
+ulong	__attribute__((visibility("default"))) maria_block_size= MARIA_KEY_BLOCK_LENGTH;
 my_bool maria_flush= 0, maria_single_user= 0;
 my_bool maria_delay_key_write= 0, maria_page_checksums= 1;
 my_bool maria_inited= FALSE;
@@ -41,6 +42,7 @@ my_bool maria_recovery_changed_data= 0, maria_recovery_verbose= 0;
 my_bool maria_assert_if_crashed_table= 0;
 my_bool maria_checkpoint_disabled= 0;
 my_bool maria_encrypt_tables= 0;
+my_bool aria_readonly= 0;
 
 mysql_mutex_t THR_LOCK_maria;
 #ifdef DONT_USE_RW_LOCKS
@@ -60,7 +62,7 @@ PAGECACHE *maria_pagecache= &maria_pagecache_var;
 PAGECACHE maria_log_pagecache_var;
 PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var;
 MY_TMPDIR *maria_tmpdir;                        /* Tempdir for redo */
-char *maria_data_root;
+const char *maria_data_root;
 HASH maria_stored_state;
 int (*maria_create_trn_hook)(MARIA_HA *);
 
@@ -145,3 +147,8 @@ PSI_file_key key_file_translog, key_file_kfile, key_file_dfile,
 
 /* Note that PSI_stage_info globals must always be declared. */
 PSI_stage_info stage_waiting_for_a_resource= { 0, "Waiting for a resource", 0};
+
+#ifdef WITH_S3_STORAGE_ENGINE
+#include "s3_func.h"
+struct s3_func __attribute__((visibility("default"))) s3f;
+#endif
diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c
index 9d739580470..22f80ca2d9e 100644
--- a/storage/maria/ma_test1.c
+++ b/storage/maria/ma_test1.c
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -24,7 +25,7 @@
 #include "trnman.h"
 
 extern PAGECACHE *maria_log_pagecache;
-extern char *maria_data_root;
+extern const char *maria_data_root;
 
 #define MAX_REC_LENGTH 1024
 
@@ -74,13 +75,13 @@ int main(int argc,char *argv[])
   safe_mutex_deadlock_detector= 1;
 #endif
   MY_INIT(argv[0]);
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
   get_options(argc,argv);
   /* Maria requires that we always have a page cache */
   if (maria_init() ||
       (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0,
                       maria_block_size, 0, MY_WME) == 0) ||
-      ma_control_file_open(TRUE, TRUE) ||
+      ma_control_file_open(TRUE, TRUE, TRUE) ||
       (init_pagecache(maria_log_pagecache,
                       TRANSLOG_PAGECACHE_SIZE, 0, 0,
                       TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) ||
@@ -209,7 +210,7 @@ static int run_test(const char *filename)
 		uniques, &uniquedef, &create_info,
 		create_flag))
     goto err;
-  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0)))
     goto err;
   if (!silent)
     printf("- Writing key:s\n");
@@ -348,7 +349,7 @@ static int run_test(const char *filename)
     goto err;
   if (maria_close(file))
     goto err;
-  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0)))
     goto err;
   if (maria_begin(file))
     goto err;
@@ -675,8 +676,7 @@ static void update_record(uchar *record)
     ptr=blob_key;
     memcpy(pos+4,&ptr,sizeof(char*));	/* Store pointer to new key */
     if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
-      default_charset_info->cset->casedn(default_charset_info,
-                                         (char*) blob_key, length,
+      my_ci_casedn(default_charset_info, (char*) blob_key, length,
                                          (char*) blob_key, length);
     pos+=recinfo[0].length;
   }
@@ -684,16 +684,14 @@ static void update_record(uchar *record)
   {
     uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
     uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
-    default_charset_info->cset->casedn(default_charset_info,
-                                       (char*) pos + pack_length, length,
+    my_ci_casedn(default_charset_info, (char*) pos + pack_length, length,
                                        (char*) pos + pack_length, length);
     pos+=recinfo[0].length;
   }
   else
   {
     if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
-      default_charset_info->cset->casedn(default_charset_info,
-                                         (char*) pos, keyinfo[0].seg[0].length,
+      my_ci_casedn(default_charset_info, (char*) pos, keyinfo[0].seg[0].length,
                                          (char*) pos, keyinfo[0].seg[0].length);
     pos+=recinfo[0].length;
   }
@@ -741,8 +739,8 @@ static struct my_option my_long_options[] =
   {"debug", '#', "Undocumented",
    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
 #endif
-  {"datadir", 'h', "Path to the database root.", &maria_data_root,
-   &maria_data_root, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"datadir", 'h', "Path to the database root.", (char**) &maria_data_root,
+   (char**) &maria_data_root, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"delete-rows", 'd', "Abort after this many rows has been deleted",
    (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG,
    1000, 0, 0, 0, 0, 0},
@@ -814,10 +812,11 @@ static struct my_option my_long_options[] =
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+	       const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch(optid) {
+  switch(opt->id) {
   case 'a':
     key_type= HA_KEYTYPE_TEXT;
     break;
@@ -870,7 +869,7 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
       record_type= DYNAMIC_RECORD;
     break;
   case 'k':
-    if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH)
+    if (key_length < 4 || key_length > MARIA_MAX_KEY_LENGTH)
     {
       fprintf(stderr,"Wrong key length\n");
       exit(1);
diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c
index 6a25ac8a363..6628465365f 100644
--- a/storage/maria/ma_test2.c
+++ b/storage/maria/ma_test2.c
@@ -69,13 +69,14 @@ int main(int argc, char *argv[])
   char *blob_buffer;
   MARIA_CREATE_INFO create_info;
   char filename[FN_REFLEN];
+  page_range pages;
 
 #ifdef SAFE_MUTEX
   safe_mutex_deadlock_detector= 1;
 #endif
   MY_INIT(argv[0]);
 
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
   get_options(argc,argv);
   fn_format(filename, "test2", maria_data_root, "", MYF(0));
 
@@ -89,7 +90,7 @@ int main(int argc, char *argv[])
   if (maria_init() ||
       (init_pagecache(maria_pagecache, pagecache_size, 0, 0,
 		      maria_block_size, 0, MY_WME) == 0) ||
-      ma_control_file_open(TRUE, TRUE) ||
+      ma_control_file_open(TRUE, TRUE, TRUE) ||
       (init_pagecache(maria_log_pagecache,
 		      TRANSLOG_PAGECACHE_SIZE, 0, 0,
 		      TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) ||
@@ -235,7 +236,7 @@ int main(int argc, char *argv[])
 		0,(MARIA_UNIQUEDEF*) 0,
 		&create_info,create_flag))
     goto err;
-  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0)))
     goto err;
   maria_begin(file);
   if (opt_versioning)
@@ -722,7 +723,8 @@ int main(int argc, char *argv[])
     max_key.keypart_map= HA_WHOLE_KEY;
     max_key.flag= HA_READ_AFTER_KEY;
 
-    range_records= maria_records_in_range(file,(int) i, &min_key, &max_key);
+    range_records= maria_records_in_range(file,(int) i, &min_key, &max_key,
+                                          &pages);
     if (range_records < info.records*8/10 ||
 	range_records > info.records*12/10)
     {
@@ -756,7 +758,8 @@ int main(int argc, char *argv[])
       max_key.key= key2;
       max_key.keypart_map= HA_WHOLE_KEY;
       max_key.flag= HA_READ_BEFORE_KEY;
-      range_records= maria_records_in_range(file, 0, &min_key, &max_key);
+      range_records= maria_records_in_range(file, 0, &min_key, &max_key,
+                                            &pages);
       records=0;
       for (j++ ; j < k ; j++)
 	records+=key1[j];
@@ -1215,7 +1218,7 @@ static void put_blob_in_record(uchar *blob_pos, char **blob_buffer,
   if (use_blob)
   {
     if (! *blob_buffer &&
-        !(*blob_buffer=my_malloc((uint) use_blob,MYF(MY_WME))))
+        !(*blob_buffer=my_malloc(PSI_NOT_INSTRUMENTED, (uint) use_blob,MYF(MY_WME))))
     {
       use_blob= 0;
       return;
diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c
index bd80a0e8ab4..400016829e2 100644
--- a/storage/maria/ma_test3.c
+++ b/storage/maria/ma_test3.c
@@ -171,8 +171,8 @@ void start_test(int id)
   MARIA_INFO isam_info;
   MARIA_HA *file,*file1,*file2=0,*lock;
 
-  if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)) ||
-      !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)))
+  if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED,0)) ||
+      !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED,0)))
   {
     fprintf(stderr,"Can't open isam-file: %s\n",filename);
     exit(1);
diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c
index ce7967608b9..c9ad27c2a6c 100644
--- a/storage/maria/ma_unique.c
+++ b/storage/maria/ma_unique.c
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -145,9 +146,9 @@ ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record)
     if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
         type == HA_KEYTYPE_VARTEXT2)
     {
-      keyseg->charset->coll->hash_sort(keyseg->charset,
-                                       (const uchar*) pos, length, &seed1,
-                                       &seed2);
+      my_ci_hash_sort(keyseg->charset,
+                      (const uchar*) pos, length,
+                      &seed1, &seed2);
       crc+= seed1;
     }
     else
diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c
index 86770749a45..58e6d5e083b 100644
--- a/storage/maria/ma_write.c
+++ b/storage/maria/ma_write.c
@@ -623,9 +623,8 @@ static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key,
 		    my_bool insert_last)
 {
   int error,flag;
-  uchar *temp_buff,*keypos;
-  uchar keybuff[MARIA_MAX_KEY_BUFF];
-  my_bool was_last_key;
+  uchar *temp_buff,*keypos,*keybuff;
+  my_bool was_last_key, buff_alloced;
   my_off_t next_page, dup_key_pos;
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo= key->keyinfo;
@@ -633,9 +632,13 @@ static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key,
   DBUG_ENTER("w_search");
   DBUG_PRINT("enter", ("page: %lu", (ulong) (page_pos/keyinfo->block_length)));
 
-  if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-				      MARIA_MAX_KEY_BUFF*2)))
-    DBUG_RETURN(-1);
+  alloc_on_stack(*info->stack_end_ptr, temp_buff, buff_alloced,
+                 (keyinfo->block_length + keyinfo->max_store_length*3));
+  if (!temp_buff)
+    DBUG_RETURN(1);
+
+  keybuff= temp_buff + (keyinfo->block_length + keyinfo->max_store_length*2);
+
   if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
                         DFLT_INIT_HITS, temp_buff, 0))
     goto err;
@@ -695,7 +698,7 @@ static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key,
                                 DFLT_INIT_HITS))
             goto err;
         }
-        my_afree(temp_buff);
+        stack_alloc_free(temp_buff, buff_alloced);
         DBUG_RETURN(error);
       }
     }
@@ -745,10 +748,10 @@ static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key,
                           DFLT_INIT_HITS))
       goto err;
   }
-  my_afree(temp_buff);
+  stack_alloc_free(temp_buff, buff_alloced);
   DBUG_RETURN(error);
 err:
-  my_afree(temp_buff);
+  stack_alloc_free(temp_buff, buff_alloced);
   DBUG_PRINT("exit",("Error: %d",my_errno));
   DBUG_RETURN(-1);
 } /* w_search */
@@ -892,8 +895,9 @@ ChangeSet@1.2562, 2008-04-09 07:41:40+02:00, serg@janus.mylan +9 -0
       {
         /* Yup. converting */
         info->ft1_to_ft2=(DYNAMIC_ARRAY *)
-          my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
-        my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50, MYF(0));
+          my_malloc(PSI_INSTRUMENT_ME, sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
+        my_init_dynamic_array(PSI_INSTRUMENT_ME, info->ft1_to_ft2, ft2len, 300,
+                              50, MYF(0));
 
         /*
           Now, adding all keys from the page to dynarray
@@ -1246,15 +1250,20 @@ static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
 {
   MARIA_PINNED_PAGE tmp_page_link, *new_page_link= &tmp_page_link;
   MARIA_SHARE *share= info->s;
-  my_bool right;
+  my_bool right, buff_alloced;
   uint k_length,father_length,father_keylength,nod_flag,curr_keylength;
   uint right_length,left_length,new_right_length,new_left_length,extra_length;
   uint keys, tmp_length, extra_buff_length;
   uchar *pos, *extra_buff, *parting_key;
-  uchar tmp_part_key[MARIA_MAX_KEY_BUFF];
+  uchar *tmp_part_key;
   MARIA_PAGE next_page, extra_page, *left_page, *right_page;
   DBUG_ENTER("_ma_balance_page");
 
+  alloc_on_stack(*info->stack_end_ptr, tmp_part_key, buff_alloced,
+                 keyinfo->max_store_length);
+  if (!tmp_part_key)
+    DBUG_RETURN(-1);
+
   k_length= keyinfo->keylength;
   father_length= father_page->size;
   father_keylength= k_length + share->base.key_reflength;
@@ -1466,6 +1475,7 @@ static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
         _ma_write_keypage(father_page,
                           PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
       goto err;
+    stack_alloc_free(tmp_part_key, buff_alloced);
     DBUG_RETURN(0);
   }
 
@@ -1636,9 +1646,11 @@ static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
                         DFLT_INIT_HITS))
     goto err;
 
+  stack_alloc_free(tmp_part_key, buff_alloced);
   DBUG_RETURN(1);				/* Middle key up */
 
 err:
+  stack_alloc_free(tmp_part_key, buff_alloced);
   DBUG_RETURN(-1);
 } /* _ma_balance_page */
 
@@ -1760,7 +1772,7 @@ int maria_init_bulk_insert(MARIA_HA *info, size_t cache_size, ha_rows rows)
     cache_size/=total_keylength*16;
 
   info->bulk_insert=(TREE *)
-    my_malloc((sizeof(TREE)*share->base.keys+
+    my_malloc(PSI_INSTRUMENT_ME, (sizeof(TREE)*share->base.keys+
                sizeof(bulk_insert_param)*num_keys),MYF(0));
 
   if (!info->bulk_insert)
diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h
index c51ae2b95f7..2107444940d 100644
--- a/storage/maria/maria_def.h
+++ b/storage/maria/maria_def.h
@@ -1,4 +1,5 @@
 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (c) 2009, 2020, MariaDB Corporation Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -15,7 +16,15 @@
 
 /* This file is included by all internal maria files */
 
+#ifndef MARIA_DEF_INCLUDED
+#define MARIA_DEF_INCLUDED
+
 #include <my_global.h>
+
+#ifdef EMBEDDED_LIBRARY
+#undef WITH_S3_STORAGE_ENGINE
+#endif
+
 #include "maria.h"				/* Structs & some defines */
 #include "ma_pagecache.h"
 #include <myisampack.h>				/* packing of keys */
@@ -30,16 +39,294 @@
 #include <waiting_threads.h>
 #include <mysql/psi/mysql_file.h>
 
-/* For testing recovery */
-#ifdef TO_BE_REMOVED
-#define IDENTICAL_PAGES_AFTER_RECOVERY 1
+#define MARIA_CANNOT_ROLLBACK
+
+C_MODE_START
+
+/*
+  Limit max keys according to HA_MAX_POSSIBLE_KEY; See myisamchk.h for details
+*/
+
+#if MAX_INDEXES > HA_MAX_POSSIBLE_KEY
+#define MARIA_MAX_KEY    HA_MAX_POSSIBLE_KEY    /* Max allowed keys */
+#else
+#define MARIA_MAX_KEY    MAX_INDEXES            /* Max allowed keys */
 #endif
+
+#define MARIA_NAME_IEXT ".MAI"
+#define MARIA_NAME_DEXT ".MAD"
+/* Max extra space to use when sorting keys */
+#define MARIA_MAX_TEMP_LENGTH   (2*1024L*1024L*1024L)
+/* Possible values for maria_block_size (must be power of 2) */
+#define MARIA_KEY_BLOCK_LENGTH  8192            /* default key block length */
+#define MARIA_MIN_KEY_BLOCK_LENGTH      1024    /* Min key block length */
+#define MARIA_MAX_KEY_BLOCK_LENGTH      32768
+/* Minimal page cache when we only want to be able to scan a table */
+#define MARIA_MIN_PAGE_CACHE_SIZE       (8192L*16L)
+
+/*
+  In the following macros '_keyno_' is 0 .. keys-1.
+  If there can be more keys than bits in the key_map, the highest bit
+  is for all upper keys. They cannot be switched individually.
+  This means that clearing of high keys is ignored, setting one high key
+  sets all high keys.
+*/
+#define MARIA_KEYMAP_BITS      (8 * SIZEOF_LONG_LONG)
+#define MARIA_KEYMAP_HIGH_MASK (1ULL << (MARIA_KEYMAP_BITS - 1))
+#define maria_get_mask_all_keys_active(_keys_) \
+                            (((_keys_) < MARIA_KEYMAP_BITS) ? \
+                             ((1ULL << (_keys_)) - 1ULL) : \
+                             (~ 0ULL))
+#if MARIA_MAX_KEY > MARIA_KEYMAP_BITS
+#define maria_is_key_active(_keymap_,_keyno_) \
+                            (((_keyno_) < MARIA_KEYMAP_BITS) ? \
+                             MY_TEST((_keymap_) & (1ULL << (_keyno_))) : \
+                             MY_TEST((_keymap_) & MARIA_KEYMAP_HIGH_MASK))
+#define maria_set_key_active(_keymap_,_keyno_) \
+                            (_keymap_)|= (((_keyno_) < MARIA_KEYMAP_BITS) ? \
+                                          (1ULL << (_keyno_)) : \
+                                          MARIA_KEYMAP_HIGH_MASK)
+#define maria_clear_key_active(_keymap_,_keyno_) \
+                            (_keymap_)&= (((_keyno_) < MARIA_KEYMAP_BITS) ? \
+                                          (~ (1ULL << (_keyno_))) : \
+                                          (~ (0ULL)) /*ignore*/ )
+#else
+#define maria_is_key_active(_keymap_,_keyno_) \
+                            MY_TEST((_keymap_) & (1ULL << (_keyno_)))
+#define maria_set_key_active(_keymap_,_keyno_) \
+                            (_keymap_)|= (1ULL << (_keyno_))
+#define maria_clear_key_active(_keymap_,_keyno_) \
+                            (_keymap_)&= (~ (1ULL << (_keyno_)))
+#endif
+#define maria_is_any_key_active(_keymap_) \
+                            MY_TEST((_keymap_))
+#define maria_is_all_keys_active(_keymap_,_keys_) \
+                            ((_keymap_) == maria_get_mask_all_keys_active(_keys_))
+#define maria_set_all_keys_active(_keymap_,_keys_) \
+                            (_keymap_)= maria_get_mask_all_keys_active(_keys_)
+#define maria_clear_all_keys_active(_keymap_) \
+                            (_keymap_)= 0
+#define maria_intersect_keys_active(_to_,_from_) \
+                            (_to_)&= (_from_)
+#define maria_is_any_intersect_keys_active(_keymap1_,_keys_,_keymap2_) \
+                            ((_keymap1_) & (_keymap2_) & \
+                             maria_get_mask_all_keys_active(_keys_))
+#define maria_copy_keys_active(_to_,_maxkeys_,_from_) \
+                            (_to_)= (maria_get_mask_all_keys_active(_maxkeys_) & \
+                                     (_from_))
+
+        /* Param to/from maria_info */
+
+typedef struct st_maria_info
+{
+  ha_rows records;                      /* Records in database */
+  ha_rows deleted;                      /* Deleted records in database */
+  MARIA_RECORD_POS recpos;              /* Pos for last used record */
+  MARIA_RECORD_POS newrecpos;           /* Pos if we write new record */
+  MARIA_RECORD_POS dup_key_pos;         /* Position of record with dup key */
+  my_off_t data_file_length;            /* Length of data file */
+  my_off_t max_data_file_length, index_file_length;
+  my_off_t max_index_file_length, delete_length;
+  ulonglong auto_increment;
+  ulonglong key_map;                    /* Which keys are used */
+  time_t create_time;                   /* When table was created */
+  time_t check_time;
+  time_t update_time;
+  ulong record_offset;
+  double *rec_per_key;                   /* for sql optimizing */
+  ulong reclength;                      /* Record length */
+  ulong mean_reclength;                 /* Mean record length (if packed) */
+  char *data_file_name, *index_file_name;
+  enum data_file_type data_file_type;
+  uint keys;                            /* Number of keys in use */
+  uint options;                         /* HA_OPTION_... used */
+  uint reflength;
+  int errkey,                           /* Which key was duplicated on err */
+    sortkey;                            /* clustered by this key */
+  File filenr;                          /* (unique) filenr for datafile */
+} MARIA_INFO;
+
+struct st_maria_share;
+struct st_maria_handler;                        /* For reference */
+struct st_maria_keydef;
+
+struct st_maria_key                 /* Internal info about a key */
+{
+  uchar *data;                              /* Data for key */
+  struct st_maria_keydef *keyinfo;          /* Definition for key */
+  uint data_length;                         /* Length of key data */
+  uint ref_length;                          /* record ref + transid */
+  uint32 flag;                               /* 0 or SEARCH_PART_KEY */
+};
+
+struct st_maria_decode_tree     /* Decode huff-table */
+{
+  uint16 *table;
+  uint quick_table_bits;
+  uchar *intervalls;
+};
+
+
+typedef struct s3_info S3_INFO;
+
+extern ulong maria_block_size, maria_checkpoint_frequency;
+extern ulong maria_concurrent_insert;
+extern my_bool maria_flush, maria_single_user, maria_page_checksums;
+extern my_off_t maria_max_temp_length;
+extern ulong maria_bulk_insert_tree_size, maria_data_pointer_size;
+extern MY_TMPDIR *maria_tmpdir;
+extern my_bool maria_encrypt_tables;
+
+/*
+  This is used to check if a symlink points into the mysql data home,
+  which is normally forbidden as it can be used to get access to
+  data the user lacks privileges for
+*/
+extern int (*maria_test_invalid_symlink)(const char *filename);
+
+        /* Prototypes for maria-functions */
+
+extern int maria_init(void);
+extern void maria_end(void);
+extern my_bool maria_upgrade(void);
+extern int maria_close(MARIA_HA *file);
+extern int maria_delete(MARIA_HA *file, const uchar *buff);
+extern MARIA_HA *maria_open(const char *name, int mode,
+                            uint wait_if_locked, S3_INFO *s3);
+extern int maria_panic(enum ha_panic_function function);
+extern int maria_rfirst(MARIA_HA *file, uchar *buf, int inx);
+extern int maria_rkey(MARIA_HA *file, uchar *buf, int inx,
+                      const uchar *key, key_part_map keypart_map,
+                      enum ha_rkey_function search_flag);
+extern int maria_rlast(MARIA_HA *file, uchar *buf, int inx);
+extern int maria_rnext(MARIA_HA *file, uchar *buf, int inx);
+extern int maria_rnext_same(MARIA_HA *info, uchar *buf);
+extern int maria_rprev(MARIA_HA *file, uchar *buf, int inx);
+extern int maria_rrnd(MARIA_HA *file, uchar *buf,
+                      MARIA_RECORD_POS pos);
+extern int maria_scan_init(MARIA_HA *file);
+extern int maria_scan(MARIA_HA *file, uchar *buf);
+extern void maria_scan_end(MARIA_HA *file);
+extern int maria_rsame(MARIA_HA *file, uchar *record, int inx);
+extern int maria_rsame_with_pos(MARIA_HA *file, uchar *record,
+                                int inx, MARIA_RECORD_POS pos);
+extern int maria_update(MARIA_HA *file, const uchar *old,
+                        const uchar *new_record);
+extern int maria_write(MARIA_HA *file, const uchar *buff);
+extern MARIA_RECORD_POS maria_position(MARIA_HA *file);
+extern int maria_status(MARIA_HA *info, MARIA_INFO *x, uint flag);
+extern int maria_lock_database(MARIA_HA *file, int lock_type);
+extern int maria_delete_table(const char *name);
+extern int maria_rename(const char *from, const char *to);
+extern int maria_extra(MARIA_HA *file,
+                       enum ha_extra_function function, void *extra_arg);
+extern int maria_reset(MARIA_HA *file);
+extern ha_rows maria_records_in_range(MARIA_HA *info, int inx,
+                                      const key_range *min_key,
+                                      const key_range *max_key,
+                                      page_range *page);
+extern int maria_is_changed(MARIA_HA *info);
+extern int maria_delete_all_rows(MARIA_HA *info);
+extern uint maria_get_pointer_length(ulonglong file_length, uint def);
+extern int maria_commit(MARIA_HA *info);
+extern int maria_begin(MARIA_HA *info);
+extern void maria_disable_logging(MARIA_HA *info);
+extern void maria_enable_logging(MARIA_HA *info);
+
+#define HA_RECOVER_NONE         0       /* No automatic recovery */
+#define HA_RECOVER_DEFAULT      1       /* Automatic recovery active */
+#define HA_RECOVER_BACKUP       2       /* Make a backup file on recover */
+#define HA_RECOVER_FORCE        4       /* Recover even if we lose rows */
+#define HA_RECOVER_QUICK        8       /* Don't check rows in data file */
+
+#define HA_RECOVER_ANY (HA_RECOVER_DEFAULT | HA_RECOVER_BACKUP | HA_RECOVER_FORCE | HA_RECOVER_QUICK)
+
+/* this is used to pass to mysql_mariachk_table */
+
+#define MARIA_CHK_REPAIR 1              /* equivalent to mariachk -r */
+#define MARIA_CHK_VERIFY 2              /* Verify, run repair if failure */
+
+typedef uint maria_bit_type;
+
+typedef struct st_maria_bit_buff
+{                                       /* Used for packing of record */
+  maria_bit_type current_byte;
+  uint bits;
+  uchar *pos, *end, *blob_pos, *blob_end;
+  uint error;
+} MARIA_BIT_BUFF;
+
+/* functions in maria_check */
+void maria_chk_init(HA_CHECK *param);
+void maria_chk_init_for_check(HA_CHECK *param, MARIA_HA *info);
+int maria_chk_status(HA_CHECK *param, MARIA_HA *info);
+int maria_chk_del(HA_CHECK *param, MARIA_HA *info, ulonglong test_flag);
+int maria_chk_size(HA_CHECK *param, MARIA_HA *info);
+int maria_chk_key(HA_CHECK *param, MARIA_HA *info);
+int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend);
+int maria_repair(HA_CHECK *param, MARIA_HA *info, char * name, my_bool);
+int maria_sort_index(HA_CHECK *param, MARIA_HA *info, char * name);
+int maria_zerofill(HA_CHECK *param, MARIA_HA *info, const char *name);
+int maria_repair_by_sort(HA_CHECK *param, MARIA_HA *info,
+                         const char *name, my_bool rep_quick);
+int maria_repair_parallel(HA_CHECK *param, MARIA_HA *info,
+                          const char *name, my_bool rep_quick);
+int maria_change_to_newfile(const char *filename, const char *old_ext,
+                            const char *new_ext, time_t backup_time,
+                            myf myflags);
+void maria_lock_memory(HA_CHECK *param);
+int maria_update_state_info(HA_CHECK *param, MARIA_HA *info, uint update);
+void maria_update_key_parts(MARIA_KEYDEF *keyinfo, double *rec_per_key_part,
+                            ulonglong *unique, ulonglong *notnull,
+                            ulonglong records);
+int maria_filecopy(HA_CHECK *param, File to, File from, my_off_t start,
+                   my_off_t length, const char *type);
+int maria_movepoint(MARIA_HA *info, uchar *record, my_off_t oldpos,
+                    my_off_t newpos, uint prot_key);
+int maria_test_if_almost_full(MARIA_HA *info);
+int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename);
+int maria_disable_indexes(MARIA_HA *info);
+int maria_enable_indexes(MARIA_HA *info);
+int maria_indexes_are_disabled(MARIA_HA *info);
+void maria_disable_indexes_for_rebuild(MARIA_HA *info, ha_rows rows,
+                                       my_bool all_keys);
+my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows, ulonglong key_map,
+                               my_bool force);
+
+int maria_init_bulk_insert(MARIA_HA *info, size_t cache_size, ha_rows rows);
+void maria_flush_bulk_insert(MARIA_HA *info, uint inx);
+int maria_end_bulk_insert(MARIA_HA *info, my_bool abort);
+int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves);
+void maria_ignore_trids(MARIA_HA *info);
+my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows);
+
+/* fulltext functions */
+FT_INFO *maria_ft_init_search(uint,void *, uint, uchar *, size_t,
+                              CHARSET_INFO *, uchar *);
+
+/* 'Almost-internal' Maria functions */
+
+void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
+                                  my_bool repair);
+
+
 /* Do extra sanity checking */
 #define SANITY_CHECKS 1
 #ifdef EXTRA_DEBUG
 #define EXTRA_DEBUG_KEY_CHANGES
+#endif
+/*
+  The following defines can be used when one has problems with redo logging
+  Setting this will log the full key page which can be compared with the
+  redo-changed key page. This will however make the aria log files MUCH bigger.
+*/
+#if defined(EXTRA_ARIA_DEBUG)
 #define EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES
 #endif
+/* For testing recovery */
+#ifdef TO_BE_REMOVED
+#define IDENTICAL_PAGES_AFTER_RECOVERY 1
+#endif
 
 #define MAX_NONMAPPED_INSERTS 1000
 #define MARIA_MAX_TREE_LEVELS 32
@@ -48,6 +335,9 @@
 /* maria_open() flag, specific for maria_pack */
 #define HA_OPEN_IGNORE_MOVED_STATE (1U << 30)
 
+typedef struct st_sort_key_blocks MA_SORT_KEY_BLOCKS;
+typedef struct st_sort_ftbuf MA_SORT_FT_BUF;
+
 extern PAGECACHE maria_pagecache_var, *maria_pagecache;
 int maria_assign_to_pagecache(MARIA_HA *info, ulonglong key_map,
 			      PAGECACHE *key_cache);
@@ -62,8 +352,8 @@ typedef struct st_maria_sort_info
   MARIA_HA *info, *new_info;
   HA_CHECK *param;
   char *buff;
-  SORT_KEY_BLOCKS *key_block, *key_block_end;
-  SORT_FT_BUF *ft_buf;
+  MA_SORT_KEY_BLOCKS *key_block, *key_block_end;
+  MA_SORT_FT_BUF *ft_buf;
   my_off_t filelength, dupp, buff_length;
   pgcache_page_no_t page;
   ha_rows max_records;
@@ -221,6 +511,10 @@ typedef struct st_maria_state_info
 #define MARIA_FILE_CREATE_RENAME_LSN_OFFSET 4
 #define MARIA_FILE_CREATE_TRID_OFFSET (4 + LSN_STORE_SIZE*3 + 11*8)
 
+#define MARIA_MAX_KEY_LENGTH    2000
+#define MARIA_MAX_KEY_BUFF      (MARIA_MAX_KEY_LENGTH+HA_MAX_KEY_SEG*6+8+8 + \
+                                 MARIA_MAX_PACK_TRANSID_SIZE)
+#define MARIA_MAX_POSSIBLE_KEY_BUFF  (MARIA_MAX_KEY_LENGTH + 24+ 6+6)
 #define MARIA_STATE_KEY_SIZE	(8 + 4)
 #define MARIA_STATE_KEYBLOCK_SIZE  8
 #define MARIA_STATE_KEYSEG_SIZE	12
@@ -228,7 +522,6 @@ typedef struct st_maria_state_info
 #define MARIA_KEYDEF_SIZE	(2+ 5*2)
 #define MARIA_UNIQUEDEF_SIZE	(2+1+1)
 #define HA_KEYSEG_SIZE		(6+ 2*2 + 4*2)
-#define MARIA_MAX_KEY_BUFF	(HA_MAX_KEY_BUFF + MARIA_MAX_PACK_TRANSID_SIZE)
 #define MARIA_COLUMNDEF_SIZE	(2*7+1+1+4)
 #define MARIA_BASE_INFO_SIZE	(MY_UUID_SIZE + 5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16)
 #define MARIA_INDEX_BLOCK_MARGIN 16	/* Safety margin for .MYI tables */
@@ -245,6 +538,8 @@ typedef struct st_maria_state_info
 #define MA_EXTRA_OPTIONS_ENCRYPTED (1 << 0)
 #define MA_EXTRA_OPTIONS_INSERT_ORDER (1 << 1)
 
+#include "ma_check.h"
+
 /*
   Basic information of the Maria table. This is stored on disk
   and not changed (unless we do DLL changes).
@@ -263,6 +558,7 @@ typedef struct st_ma_base_info
   ulong min_pack_length;
   ulong max_pack_length;                /* Max possibly length of packed rec */
   ulong min_block_length;
+  ulong s3_block_size;                  /* Block length for S3 files */
   uint fields;                          /* fields in table */
   uint fixed_not_null_fields;
   uint fixed_not_null_fields_length;
@@ -298,6 +594,8 @@ typedef struct st_ma_base_info
   uint extra_options;
   /* default language, not really used but displayed by maria_chk */
   uint language;
+  /* Compression library used. 0 for no compression */
+  uint compression_algorithm;
 
   /* The following are from the header */
   uint key_parts, all_key_parts;
@@ -309,6 +607,7 @@ typedef struct st_ma_base_info
   my_bool born_transactional;
 } MARIA_BASE_INFO;
 
+uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
 
 /* Structs used intern in database */
 
@@ -362,6 +661,7 @@ typedef struct st_maria_file_bitmap
 #define MARIA_CHECKPOINT_SEEN_IN_LOOP 4
 
 typedef struct st_maria_crypt_data MARIA_CRYPT_DATA;
+struct ms3_st;
 
 typedef struct st_maria_share
 {					/* Shared between opens */
@@ -456,6 +756,7 @@ typedef struct st_maria_share
   uint32 ftkeys;			/* Number of distinct full-text keys
 						   + 1 */
   PAGECACHE_FILE kfile;			/* Shared keyfile */
+  S3_INFO *s3_path;                     /* Connection and path in s3 */
   File data_file;			/* Shared data file */
   int mode;				/* mode of file on open */
   uint reopen;				/* How many times opened */
@@ -611,6 +912,8 @@ struct st_maria_handler
   MARIA_STATUS_INFO *state, state_save;
   MARIA_STATUS_INFO *state_start;       /* State at start of transaction */
   MARIA_USED_TABLES *used_tables;
+  struct ms3_st *s3;
+  void **stack_end_ptr;
   MARIA_ROW cur_row;                    /* The active row that we just read */
   MARIA_ROW new_row;			/* Storage for a row during update */
   MARIA_KEY last_key;                   /* Last found key */
@@ -704,6 +1007,8 @@ struct st_maria_handler
   my_bool once_flags;			/* For MARIA_MRG */
   /* For bulk insert enable/disable transactions control */
   my_bool switched_transactional;
+  /* If transaction will autocommit */
+  my_bool autocommit;
 #ifdef _WIN32
   my_bool owned_by_merge;               /* This Maria table is part of a merge union */
 #endif
@@ -717,6 +1022,14 @@ struct st_maria_handler
   void *index_cond_func_arg;           /* parameter for the func */
 };
 
+/* Table options for the Aria and S3 storage engines */
+
+struct ha_table_option_struct
+{
+  ulonglong s3_block_size;
+  uint compression_algorithm;
+};
+
 /* Some defines used by maria-functions */
 
 #define USE_WHOLE_KEY	65535         /* Use whole key in _search() */
@@ -926,11 +1239,11 @@ extern uchar maria_file_magic[], maria_pack_file_magic[];
 extern uchar maria_uuid[MY_UUID_SIZE];
 extern uint32 maria_read_vec[], maria_readnext_vec[];
 extern uint maria_quick_table_bits;
-extern char *maria_data_root;
+extern const char *maria_data_root;
 extern uchar maria_zero_string[];
 extern my_bool maria_inited, maria_in_ha_maria, maria_recovery_changed_data;
 extern my_bool maria_recovery_verbose, maria_checkpoint_disabled;
-extern my_bool maria_assert_if_crashed_table;
+extern my_bool maria_assert_if_crashed_table, aria_readonly;
 extern ulong maria_checkpoint_min_log_activity;
 extern HASH maria_stored_state;
 extern int (*maria_create_trn_hook)(MARIA_HA *);
@@ -1306,7 +1619,7 @@ extern size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
 #define MA_STATE_INFO_WRITE_FULL_INFO        2
 /* intern_lock taking is needed */
 #define MA_STATE_INFO_WRITE_LOCK             4
-uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite);
+uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite)__attribute__((visibility("default"))) ;
 uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite);
 uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state);
 uint _ma_base_info_write(File file, MARIA_BASE_INFO *base);
@@ -1333,12 +1646,6 @@ my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
                                const uchar *record, MARIA_RECORD_POS pos);
 my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
                         my_bool null_are_equal);
-void _ma_get_status(void *param, my_bool concurrent_insert);
-void _ma_update_status(void *param);
-void _ma_restore_status(void *param);
-void _ma_copy_status(void *to, void *from);
-my_bool _ma_check_status(void *param);
-void _ma_restore_status(void *param);
 void _ma_reset_status(MARIA_HA *maria);
 int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos);
 int _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos);
@@ -1356,7 +1663,8 @@ void _ma_remap_file(MARIA_HA *info, my_off_t size);
 MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record);
 my_bool _ma_write_abort_default(MARIA_HA *info);
 int maria_delete_table_files(const char *name, my_bool temporary,
-                             myf sync_dir);
+                             myf flags)__attribute__((visibility("default"))) ;
+
 
 /*
   This cannot be in my_base.h as it clashes with HA_SPATIAL.
@@ -1365,7 +1673,6 @@ int maria_delete_table_files(const char *name, my_bool temporary,
 */
 #define HA_RTREE_INDEX	        16384	/* For RTREE search */
 
-C_MODE_START
 #define MARIA_FLUSH_DATA  1
 #define MARIA_FLUSH_INDEX 2
 int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
@@ -1385,7 +1692,6 @@ void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...)
 void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...)
   ATTRIBUTE_FORMAT(printf, 2, 3);
 my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info);
-C_MODE_END
 
 int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param);
 int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param);
@@ -1440,6 +1746,7 @@ extern my_bool ma_yield_and_check_if_killed(MARIA_HA *info, int inx);
 extern my_bool ma_killed_standalone(MARIA_HA *);
 
 extern uint _ma_file_callback_to_id(void *callback_data);
+extern void free_maria_share(MARIA_SHARE *share);
 
 static inline void unmap_file(MARIA_HA *info __attribute__((unused)))
 {
@@ -1448,3 +1755,15 @@ static inline void unmap_file(MARIA_HA *info __attribute__((unused)))
     _ma_unmap_file(info);
 #endif
 }
+
+static inline void decrement_share_in_trans(MARIA_SHARE *share)
+{
+  /* Internal tables don't have transactions */
+  DBUG_ASSERT(!share->internal_table);
+  if (!--share->in_trans)               /* in_trans dropped to zero */
+    free_maria_share(share);
+  else
+    mysql_mutex_unlock(&share->intern_lock);
+}
+C_MODE_END
+#endif
diff --git a/storage/maria/s3.cnf b/storage/maria/s3.cnf
new file mode 100644
index 00000000000..345bddd1cb1
--- /dev/null
+++ b/storage/maria/s3.cnf
@@ -0,0 +1,22 @@
+[mariadbd]
+#
+# Uncomment line to enable
+#
+#plugin-maturity = alpha
+
+[mariadb]
+#
+# Uncomment line to enable
+#
+#plugin-load-add = ha_s3
+
+#
+# Uncomment to configure the S3 engine
+# See all options at https://mariadb.com/kb/en/s3-storage-engine/
+#
+#s3-host-name = s3.amazonaws.com
+#s3-protocol-version = Amazon
+#s3-bucket = ...
+#s3-access-key = ...
+#s3-secret-key = ...
+#s3-region = eu-north-1
diff --git a/storage/maria/s3_func.c b/storage/maria/s3_func.c
new file mode 100644
index 00000000000..491a8e0a323
--- /dev/null
+++ b/storage/maria/s3_func.c
@@ -0,0 +1,1625 @@
+/* Copyright (C) 2019 MariaDB Corporation Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
+
+/*
+  Interface functions used by the S3 storage engine and aria_copy_for_s3
+*/
+
+#include "maria_def.h"
+#include "s3_func.h"
+#include <aria_backup.h>
+#include <mysqld_error.h>
+#include <sql_const.h>
+#include <mysys_err.h>
+#include <mysql_com.h>
+#include <zlib.h>
+
+/* number of '.' to print during a copy in verbose mode */
+#define DISPLAY_WITH 79
+
+static void convert_index_to_s3_format(uchar *header, ulong block_size,
+                                       int compression);
+static void convert_index_to_disk_format(uchar *header);
+static void convert_frm_to_s3_format(uchar *header);
+static void convert_frm_to_disk_format(uchar *header);
+static int s3_read_file_from_disk(const char *filename, uchar **to,
+                                  size_t *to_size, my_bool print_error);
+
+/* Protocol names for ha_s3.cc and tools. NullS terminator is not counted */
+
+static const char *protocol_types[]= {"Auto", "Original", "Amazon", NullS};
+TYPELIB s3_protocol_typelib= {array_elements(protocol_types)-1,"",
+                              protocol_types, NULL};
+
+/******************************************************************************
+ Allocations handler for libmarias3
+ To be removed when we do the init allocation in mysqld.cc
+******************************************************************************/
+
+static void *s3_wrap_malloc(size_t size)        /* malloc() for libmarias3 */
+{
+  return my_malloc(PSI_NOT_INSTRUMENTED, size, MYF(MY_WME));
+}
+
+static void *s3_wrap_calloc(size_t nmemb, size_t size)
+{
+  return (size && nmemb > ~(size_t) 0 / size) ? NULL :  /* Overflow */
+    my_malloc(PSI_NOT_INSTRUMENTED, nmemb*size, MYF(MY_WME | MY_ZEROFILL));
+}
+
+static void *s3_wrap_realloc(void *ptr, size_t size) /* realloc() hook */
+{
+  return my_realloc(PSI_NOT_INSTRUMENTED, ptr, size,
+                    MYF(MY_WME | MY_ALLOW_ZERO_PTR)); /* ptr may be NULL */
+}
+
+static char *s3_wrap_strdup(const char *str)    /* strdup() for libmarias3 */
+{
+  return my_strdup(PSI_NOT_INSTRUMENTED, str, MYF(MY_WME));
+}
+
+static void s3_wrap_free(void *ptr)             /* free() for libmarias3 */
+{
+  if (ptr)                                      /* Avoid tracing of null */
+    my_free(ptr);
+}
+
+void s3_init_library()          /* Register the s3_wrap_* allocation hooks */
+{
+  ms3_library_init_malloc(s3_wrap_malloc, s3_wrap_free, s3_wrap_realloc,
+                          s3_wrap_strdup, s3_wrap_calloc);
+}
+
+void s3_deinit_library()        /* Counterpart of s3_init_library() */
+{
+  ms3_library_deinit();
+}
+
+/******************************************************************************
+ Functions on S3_INFO and S3_BLOCK
+******************************************************************************/
+
+/*
+  Free data->alloc_ptr (set by s3_get_object()); str and length are kept
+*/
+
+void s3_free(S3_BLOCK *data)
+{
+  my_free(data->alloc_ptr);
+  data->alloc_ptr= 0;                           /* Avoid double free */
+}
+
+
+/*
+  Copy a S3_INFO structure and its strings. Returns 0 if allocation fails
+*/
+
+S3_INFO *s3_info_copy(S3_INFO *old)
+{
+  S3_INFO *to, tmp;
+
+  /* Copy all fields; the string pointers listed below are replaced */
+  memcpy(&tmp, old, sizeof(tmp));
+  /* Allocate new buffers (host_name.str is not duplicated here) */
+  if (!my_multi_malloc(PSI_NOT_INSTRUMENTED, MY_WME, &to, sizeof(S3_INFO),
+                       &tmp.access_key.str, old->access_key.length+1,
+                       &tmp.secret_key.str, old->secret_key.length+1,
+                       &tmp.region.str,     old->region.length+1,
+                       &tmp.bucket.str,     old->bucket.length+1,
+                       &tmp.database.str,   old->database.length+1,
+                       &tmp.table.str,      old->table.length+1,
+                       &tmp.base_table.str, old->base_table.length+1,
+                       NullS))
+    return 0;
+  /* Copy lengths and the new pointers into the result */
+  memcpy(to, &tmp, sizeof(tmp));
+  /* Copy string data into the new buffers */
+  strmov((char*) to->access_key.str, old->access_key.str);
+  strmov((char*) to->secret_key.str, old->secret_key.str);
+  strmov((char*) to->region.str,     old->region.str);
+  strmov((char*) to->bucket.str,     old->bucket.str);
+  /* Database may not be null terminated */
+  strmake((char*) to->database.str,  old->database.str, old->database.length);
+  strmov((char*) to->table.str,      old->table.str);
+  strmov((char*) to->base_table.str, old->base_table.str);
+  return to;
+}
+
+/**
+   Open a connection to S3. Returns 0 (error printed, my_errno set) on failure
+*/
+
+ms3_st *s3_open_connection(S3_INFO *s3)
+{
+  ms3_st *s3_client;
+  if (!(s3_client= ms3_init(s3->access_key.str,
+                            s3->secret_key.str,
+                            s3->region.str,
+                            s3->host_name.str)))
+  {
+    my_printf_error(HA_ERR_NO_SUCH_TABLE,
+                    "Can't open connection to S3, error: %d %s", MYF(0),
+                    errno, ms3_error(errno));
+    my_errno= HA_ERR_NO_SUCH_TABLE;
+    return 0;                                   /* No client to configure */
+  }
+  if (s3->protocol_version)
+    ms3_set_option(s3_client, MS3_OPT_FORCE_PROTOCOL_VERSION,
+                   &s3->protocol_version);
+  if (s3->port)
+    ms3_set_option(s3_client, MS3_OPT_PORT_NUMBER, &s3->port);
+  if (s3->use_http)
+    ms3_set_option(s3_client, MS3_OPT_USE_HTTP, NULL);
+
+  return s3_client;
+}
+
+/**
+   Close a connection to S3 (wrapper around ms3_deinit())
+*/
+
+void s3_deinit(ms3_st *s3_client)
+{
+  DBUG_PUSH("");                                /* Avoid tracing free calls */
+  ms3_deinit(s3_client);
+  DBUG_POP();                                   /* Undo the DBUG_PUSH above */
+}
+
+
+/******************************************************************************
+ High level functions to copy tables to and from S3
+******************************************************************************/
+
+/**
+   Create suffix for object name
+   @param to_end end of suffix (from previous call or 000000 at start)
+   @param nr     block number to write, right aligned against to_end
+   The suffix is a 6 length '0' prefixed number. If the number
+   gets longer than 6, then it's extended to 7 and more digits.
+*/
+
+static void fix_suffix(char *to_end, ulong nr)
+{
+  char buff[22];                                /* 64 bit ulong: 20 digits + \0 */
+  uint length= (uint) (int10_to_str(nr, buff, 10) - buff);
+  set_if_smaller(length, 6);
+  strmov(to_end - length, buff);
+}
+
+/**
+   Copy file to 'aws_path' in blocks of block_size
+   aws_path must end with the 000000 suffix that fix_suffix() updates
+   @return 0   ok
+   @return 1   error. Error message is printed to stderr
+
+   Notes:
+   file is always closed before return
+*/
+
+static my_bool copy_from_file(ms3_st *s3_client, const char *aws_bucket,
+                              char *aws_path,
+                              File file, my_off_t start, my_off_t file_end,
+                              uchar *block, size_t block_size,
+                              my_bool compression, my_bool display)
+{
+  my_off_t pos;
+  char *path_end= strend(aws_path);
+  ulong bnr;                                    /* Block number, starts at 1 */
+  my_bool print_done= 0;
+  size_t length;
+
+  for (pos= start, bnr=1 ; pos < file_end ; pos+= length, bnr++)
+  {
+    if ((length= my_pread(file, block, block_size, pos, MYF(MY_WME))) ==
+        MY_FILE_ERROR)
+      goto err;
+    if (length == 0)                            /* File ended before file_end */
+    {
+      my_error(EE_EOFERR, MYF(0), my_filename(file), my_errno);
+      goto err;
+    }
+
+    fix_suffix(path_end, bnr);
+    if (s3_put_object(s3_client, aws_bucket, aws_path, block, length,
+                      compression))
+      goto err;
+
+    /* Write up to DISPLAY_WITH number of '.' during copy */
+    if (display &&
+        ((pos + block_size) * DISPLAY_WITH / file_end) >
+        (pos * DISPLAY_WITH/file_end))
+    {
+      fputc('.', stdout); fflush(stdout);
+      print_done= 1;
+    }
+  }
+  if (print_done)
+  {
+    fputc('\n', stdout); fflush(stdout);
+  }
+  my_close(file, MYF(MY_WME));
+  return 0;
+
+err:
+  my_close(file, MYF(MY_WME));
+  if (print_done)
+  {
+    fputc('\n', stdout); fflush(stdout);
+  }
+  return 1;
+}
+
+
+/**
+   Copy an Aria table to S3
+   @param s3_client    connection to S3
+   @param aws_bucket   Aws bucket
+   @param path         Path for Aria table (can be temp table)
+   @param database     database name
+   @param table_name   table name
+   @param block_size   Block size in s3. If 0 then use the block size and
+                       compression stored in the .MAI file.
+   @param compression  Compression algorithm (0 = none, 1 = zip)
+   @param force        If set, replace a table that already exists in S3
+   @param display      If set, print progress to stdout
+   @param copy_frm     If set, also copy the .frm file (if it can be read)
+   @return 0 ok, != 0 error
+
+   The table will be copied in S3 into the following locations:
+
+   frm file (for discovery):
+   aws_bucket/database/table/frm
+
+   First index block (contains description of the Aria file):
+   aws_bucket/database/table/aria
+
+   Rest of the index file:
+   aws_bucket/database/table/index/block_number
+
+   Data file:
+   aws_bucket/database/table/data/block_number
+
+   block_number is 6 digits decimal number, prefixed with 0
+   (Can be larger than 6 numbers, the prefix is just for nice output)
+
+   frm and base blocks are small (just the needed data).
+   index and data blocks are of size 's3_block_size'
+
+   If compression is used, then original block size is s3_block_size
+   but the stored block will be the size of the compressed block.
+*/
+
+int aria_copy_to_s3(ms3_st *s3_client, const char *aws_bucket,
+                    const char *path,
+                    const char *database, const char *table_name,
+                    ulong block_size, my_bool compression,
+                    my_bool force, my_bool display, my_bool copy_frm)
+{
+  ARIA_TABLE_CAPABILITIES cap;
+  char aws_path[FN_REFLEN+100];
+  char filename[FN_REFLEN];
+  char *aws_path_end, *end;
+  uchar *alloc_block= 0, *block;
+  ms3_status_st status;
+  File file= -1;
+  my_off_t file_size;
+  size_t frm_length;
+  int error;
+  my_bool frm_created= 0;
+  DBUG_ENTER("aria_copy_to_s3");
+  DBUG_PRINT("enter",("from: %s  database: %s  table: %s",
+                      path, database, table_name));
+
+  aws_path_end= strxmov(aws_path, database, "/", table_name, NullS);
+  strmov(aws_path_end, "/aria");
+
+  if (!ms3_status(s3_client, aws_bucket, aws_path, &status))
+  {
+    if (!force)
+    {
+      my_printf_error(EE_CANTCREATEFILE, "File %s exists in s3", MYF(0),
+                      aws_path);
+      DBUG_RETURN(EE_CANTCREATEFILE);
+    }
+    if ((error= aria_delete_from_s3(s3_client, aws_bucket, database,
+                                    table_name, display)))
+      DBUG_RETURN(error);
+  }
+
+  if (copy_frm)
+  {
+    /*
+      Copy frm file if it exists
+      We do this first to ensure that .frm always exists. This is needed to
+      ensure that discovery of the table will work.
+    */
+    fn_format(filename, path, "", ".frm", MY_REPLACE_EXT);
+    if (!s3_read_file_from_disk(filename, &alloc_block, &frm_length,0))
+    {
+      if (display)
+        printf("Copying frm file %s\n", filename);
+
+      end= strmov(aws_path_end,"/frm");
+      convert_frm_to_s3_format(alloc_block);
+
+      /* Note that frm is not compressed! */
+      if (s3_put_object(s3_client, aws_bucket, aws_path, alloc_block, frm_length,
+                        0))
+        goto err;
+
+      frm_created= 1;
+      my_free(alloc_block);
+      alloc_block= 0;
+    }
+  }
+
+  if (display)
+    printf("Copying aria table: %s.%s to s3\n", database, table_name);
+
+  /* Index file name */
+  fn_format(filename, path, "", ".MAI", MY_REPLACE_EXT);
+  if ((file= my_open(filename,
+                     O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
+                     MYF(MY_WME))) < 0)
+    goto err;                                   /* Removes frm copied above */
+  if ((error= aria_get_capabilities(file, &cap)))
+  {
+    fprintf(stderr, "Got error %d when reading Aria header from %s\n",
+            error, path);
+    goto err;
+  }
+  if (cap.transactional || cap.data_file_type != BLOCK_RECORD ||
+      cap.encrypted)
+  {
+    fprintf(stderr,
+            "Aria table %s doesn't match criteria to be copied to S3.\n"
+            "It should be non-transactional and should have row_format page\n",
+            path);
+    goto err;
+  }
+  /*
+    If block size is not specified, use the values specified as part of
+    create
+  */
+  if (block_size == 0)
+  {
+    block_size=  cap.s3_block_size;
+    compression= cap.compression;
+  }
+
+  /* Align S3_BLOCK size with table block size */
+  block_size= (block_size/cap.block_size)*cap.block_size;
+
+  /* Allocate block for data + flag for compress header */
+  if (!(alloc_block= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED,
+                                        block_size+ALIGN_SIZE(1),
+                                        MYF(MY_WME))))
+    goto err;
+  /* Read/write data here, but with prefix space for compression flag */
+  block= alloc_block+ ALIGN_SIZE(1);
+
+  if (my_pread(file, block, cap.header_size, 0, MYF(MY_WME | MY_FNABP)))
+    goto err;
+
+  strmov(aws_path_end, "/aria");
+
+  if (display)
+    printf("Creating aria table information %s\n", aws_path);
+
+  convert_index_to_s3_format(block, block_size, compression);
+
+  /*
+    The first page is not compressed as we need it to know if the rest is
+    compressed
+  */
+  if (s3_put_object(s3_client, aws_bucket, aws_path, block, cap.header_size,
+                    0 /* no compression */ ))
+    goto err;
+
+  file_size= my_seek(file, 0L, MY_SEEK_END, MYF(0));
+
+  end= strmov(aws_path_end,"/index");
+
+  if (display)
+    printf("Copying index information %s\n", aws_path);
+
+  /* The 000000 will be update with block number by fix_suffix() */
+  end= strmov(end, "/000000");
+
+  error= copy_from_file(s3_client, aws_bucket, aws_path, file, cap.header_size,
+                        file_size, block, block_size, compression, display);
+  file= -1;                                     /* Closed by copy_from_file() */
+  if (error)
+    goto err;
+
+  /* Copy data file */
+  fn_format(filename, path, "", ".MAD", MY_REPLACE_EXT);
+  if ((file= my_open(filename,
+                           O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
+                           MYF(MY_WME))) < 0)
+    goto err;                                   /* Frees alloc_block, drops frm */
+
+  file_size= my_seek(file, 0L, MY_SEEK_END, MYF(0));
+
+  end= strmov(aws_path_end, "/data");
+
+  if (display)
+    printf("Copying data information %s\n", aws_path);
+
+  /* The 000000 will be update with block number by fix_suffix() */
+  end= strmov(end, "/000000");
+
+  error= copy_from_file(s3_client, aws_bucket, aws_path, file, 0, file_size,
+                        block, block_size, compression, display);
+  file= -1;                                     /* Closed by copy_from_file() */
+  if (error)
+    goto err;
+
+  my_free(alloc_block);
+  DBUG_RETURN(0);
+
+err:
+  if (frm_created)
+  {
+    end= strmov(aws_path_end,"/frm");
+    (void) s3_delete_object(s3_client, aws_bucket, aws_path, MYF(ME_NOTE));
+  }
+  if (file >= 0)
+    my_close(file, MYF(0));
+  my_free(alloc_block);
+  DBUG_RETURN(1);
+}
+
+
+/**
+   Copy the S3 objects named by 'aws_path' + block number to file
+
+   @return 0   ok
+   @return 1   error. Error message is printed to stderr
+
+   Notes:
+   file is always closed before return
+*/
+
+static my_bool copy_to_file(ms3_st *s3_client, const char *aws_bucket,
+                            char *aws_path, File file, my_off_t start,
+                            my_off_t file_end, my_bool compression,
+                            my_bool display)
+{
+  my_off_t pos;
+  char *path_end= strend(aws_path);
+  size_t error;
+  ulong bnr;                                    /* Block number, starts at 1 */
+  my_bool print_done= 0;
+  S3_BLOCK block;
+  DBUG_ENTER("copy_to_file");
+  DBUG_PRINT("enter", ("path: %s  start: %llu  end: %llu",
+                       aws_path, (ulonglong) start, (ulonglong) file_end));
+
+  for (pos= start, bnr=1 ; pos < file_end ; pos+= block.length, bnr++)
+  {
+    fix_suffix(path_end, bnr);
+    if (s3_get_object(s3_client, aws_bucket, aws_path, &block, compression, 1))
+      goto err;
+
+    error= my_write(file, block.str, block.length, MYF(MY_WME | MY_FNABP));
+    s3_free(&block);                            /* block.length is kept */
+    if (error == MY_FILE_ERROR)
+      goto err;
+
+    /* Write up to DISPLAY_WITH number of '.' during copy */
+    if (display &&
+        ((pos + block.length) * DISPLAY_WITH /file_end) >
+        (pos * DISPLAY_WITH/file_end))
+    {
+      fputc('.', stdout); fflush(stdout);
+      print_done= 1;
+    }
+  }
+  if (print_done)
+  {
+    fputc('\n', stdout); fflush(stdout);
+  }
+  my_close(file, MYF(MY_WME));
+  DBUG_RETURN(0);
+
+err:
+  my_close(file, MYF(MY_WME));
+  if (print_done)
+  {
+    fputc('\n', stdout); fflush(stdout);
+  }
+  DBUG_RETURN(1);
+}
+
+
+/**
+   Copy a table from S3 to disk at 'path'. Returns 0 if ok, != 0 on error
+*/
+
+int aria_copy_from_s3(ms3_st *s3_client, const char *aws_bucket,
+                      const char *path, const char *database,
+                      my_bool compression, my_bool force, my_bool display)
+
+{
+  MARIA_STATE_INFO state;
+  MY_STAT stat_info;
+  char table_name[FN_REFLEN], aws_path[FN_REFLEN+100];
+  char filename[FN_REFLEN];
+  char *aws_path_end, *end;
+  File file= -1;
+  S3_BLOCK block;
+  my_off_t index_file_size, data_file_size;
+  uint offset;
+  int error;
+  DBUG_ENTER("aria_copy_from_s3");
+
+  /* Check if index file exists */
+  fn_format(filename, path, "", ".MAI", MY_REPLACE_EXT);
+  if (!force && my_stat(filename, &stat_info, MYF(0)))
+  {
+    my_printf_error(EE_CANTCREATEFILE, "Table %s already exists on disk",
+                    MYF(0), filename);
+    DBUG_RETURN(EE_CANTCREATEFILE);
+  }
+
+  fn_format(table_name, path, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT);
+  block.str= 0;
+
+  aws_path_end= strxmov(aws_path, database, "/", table_name, NullS);
+  strmov(aws_path_end, "/aria");
+
+  if (s3_get_object(s3_client, aws_bucket, aws_path, &block, 0, 0))
+  {
+    my_printf_error(EE_FILENOTFOUND, "File %s/%s doesn't exist in s3", MYF(0),
+                    database,filename);
+    goto err;
+  }
+  if (block.length < MARIA_STATE_INFO_SIZE)
+  {
+    fprintf(stderr, "Wrong block length for first block: %lu\n",
+            (ulong) block.length);
+    goto err_with_free;
+  }
+
+  if (display)
+    printf("Copying aria table: %s.%s from s3\n", database, table_name);
+
+  /* For offset positions, check _ma_state_info_readlength() */
+  offset= sizeof(state.header) + 4+ LSN_STORE_SIZE*3 + 8*5;
+  index_file_size= mi_sizekorr(block.str + offset);
+  data_file_size=  mi_sizekorr(block.str + offset+8);
+
+  if ((file= my_create(filename, 0,
+                       O_WRONLY | O_TRUNC | O_NOFOLLOW, MYF(MY_WME))) < 0)
+    goto err_with_free;
+
+  convert_index_to_disk_format(block.str);
+
+  if (my_write(file, block.str, block.length, MYF(MY_WME | MY_FNABP)))
+    goto err_with_free;
+
+  if (display)
+    printf("Copying index information %s\n", aws_path);
+
+  end= strmov(aws_path_end,"/index/000000");
+
+  error= copy_to_file(s3_client, aws_bucket, aws_path, file, block.length,
+                      index_file_size, compression, display);
+  file= -1;                                     /* Closed by copy_to_file() */
+  if (error)
+    goto err_with_free;
+
+  /* Copy data file */
+  fn_format(filename, path, "", ".MAD", MY_REPLACE_EXT);
+  if ((file= my_create(filename, 0,
+                       O_WRONLY | O_TRUNC | O_NOFOLLOW, MYF(MY_WME))) < 0)
+    goto err_with_free;                         /* Don't leak first block */
+
+  end= strmov(aws_path_end, "/data");
+
+  if (display)
+    printf("Copying data information %s\n", aws_path);
+
+  /* The 000000 will be update with block number by fix_suffix() */
+  strmov(end, "/000000");
+
+  error= copy_to_file(s3_client, aws_bucket, aws_path, file, 0, data_file_size,
+                      compression, display);
+  file= -1;                                     /* Closed by copy_to_file() */
+  s3_free(&block);
+  block.str= 0;
+  if (error)
+    goto err;
+
+  /* Copy frm file if it exists */
+  strmov(aws_path_end, "/frm");
+  if (!s3_get_object(s3_client, aws_bucket, aws_path, &block, 0, 0))
+  {
+    fn_format(filename, path, "", ".frm", MY_REPLACE_EXT);
+    if ((file= my_create(filename, 0,
+                         O_WRONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
+                         MYF(0))) >= 0)
+    {
+      if (display)
+        printf("Copying frm file %s\n", filename);
+
+      convert_frm_to_disk_format(block.str);
+
+      if (my_write(file, block.str, block.length, MYF(MY_WME | MY_FNABP)))
+        goto err_with_free;
+      my_close(file, MYF(MY_WME));              /* Only if create succeeded */
+    }
+    s3_free(&block);
+    file= -1;
+  }
+
+  DBUG_RETURN(0);
+
+err_with_free:
+  s3_free(&block);
+err:
+  if (file >= 0)
+    my_close(file, MYF(0));
+  DBUG_RETURN(1);
+}
+
+
+/**
+   Drop all files related to a table from S3. Returns 0 if ok, != 0 on error
+*/
+
+int aria_delete_from_s3(ms3_st *s3_client, const char *aws_bucket,
+                        const char *database, const char *table,
+                        my_bool display)
+{
+  ms3_status_st status;
+  char aws_path[FN_REFLEN+100];
+  char *aws_path_end;
+  int error;
+  DBUG_ENTER("aria_delete_from_s3");
+
+  aws_path_end= strxmov(aws_path, database, "/", table, NullS);
+  strmov(aws_path_end, "/aria");
+
+  /* Check if either /aria or /frm exists */
+
+  if (ms3_status(s3_client, aws_bucket, aws_path, &status))
+  {
+    strmov(aws_path_end, "/frm");
+    if (ms3_status(s3_client, aws_bucket, aws_path, &status))
+    {
+      my_printf_error(HA_ERR_NO_SUCH_TABLE,
+                      "Table %s.%s doesn't exist in s3", MYF(0),
+                      database, table);
+      my_errno= HA_ERR_NO_SUCH_TABLE;
+      DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+    }
+  }
+
+  if (display)
+    printf("Delete of aria table: %s.%s\n", database, table);
+
+  strmov(aws_path_end,"/index");
+
+  if (display)
+    printf("Delete of index information %s\n", aws_path);
+
+  error= s3_delete_directory(s3_client, aws_bucket, aws_path);
+
+  strmov(aws_path_end,"/data");
+  if (display)
+    printf("Delete of data information %s\n", aws_path);
+
+  error|= s3_delete_directory(s3_client, aws_bucket, aws_path);
+
+  if (display)
+    printf("Delete of base information and frm\n");
+
+  strmov(aws_path_end,"/aria");
+  if (s3_delete_object(s3_client, aws_bucket, aws_path, MYF(MY_WME)))
+    error= 1;
+
+  /*
+    Delete .frm last as this is used by discovery to check if a s3 table
+    exists
+  */
+  strmov(aws_path_end,"/frm");
+  /* Ignore error if .frm file doesn't exist */
+  s3_delete_object(s3_client, aws_bucket, aws_path, MYF(ME_NOTE));
+
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Rename a table in S3. Returns 0 if ok, != 0 on error
+*/
+
+int aria_rename_s3(ms3_st *s3_client, const char *aws_bucket,
+                   const char *from_database, const char *from_table,
+                   const char *to_database, const char *to_table,
+                   my_bool rename_frm)
+{
+  ms3_status_st status;
+  char to_aws_path[FN_REFLEN+100], from_aws_path[FN_REFLEN+100];
+  char *to_aws_path_end, *from_aws_path_end;
+  int error;
+  DBUG_ENTER("aria_rename_s3");
+
+  from_aws_path_end= strxmov(from_aws_path, from_database, "/", from_table,
+                             NullS);
+  to_aws_path_end= strxmov(to_aws_path, to_database, "/", to_table, NullS);
+  strmov(from_aws_path_end, "/aria");
+
+  if (ms3_status(s3_client, aws_bucket, from_aws_path, &status))
+  {
+    my_printf_error(HA_ERR_NO_SUCH_TABLE,
+                    "Table %s.%s doesn't exist in s3", MYF(0), from_database,
+                    from_table);
+    my_errno= HA_ERR_NO_SUCH_TABLE;
+    DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+  }
+
+  strmov(from_aws_path_end,"/index");
+  strmov(to_aws_path_end,"/index");
+
+  error= s3_rename_directory(s3_client, aws_bucket, from_aws_path, to_aws_path,
+                             MYF(MY_WME));
+
+  strmov(from_aws_path_end,"/data");
+  strmov(to_aws_path_end,"/data");
+
+  error|= s3_rename_directory(s3_client, aws_bucket, from_aws_path,
+                              to_aws_path, MYF(MY_WME));
+
+  if (rename_frm) {                     /* Result of frm rename is ignored */
+    strmov(from_aws_path_end, "/frm");
+    strmov(to_aws_path_end, "/frm");
+
+    s3_rename_object(s3_client, aws_bucket, from_aws_path, to_aws_path,
+                     MYF(MY_WME));
+  }
+
+  strmov(from_aws_path_end,"/aria");            /* Renamed last */
+  strmov(to_aws_path_end,"/aria");
+  if (s3_rename_object(s3_client, aws_bucket, from_aws_path, to_aws_path,
+                       MYF(MY_WME)))
+    error= 1;
+  DBUG_RETURN(error);
+}
+
+/**
+   Copy all partition files related to a table to S3 (.frm and .par)
+
+   @param s3_client   s3 client connection
+   @param aws_bucket  bucket to use
+   @param path        The path to the partitioned table files (no extension)
+   @param old_path    In some cases the partitioned files are not yet renamed.
+                      This points to the temporary files that will later
+                      be renamed to the partitioned table
+   @param database    Database for the partitioned table
+   @param table_name  table name for the partitioned table
+*/
+
+int partition_copy_to_s3(ms3_st *s3_client, const char *aws_bucket,
+                         const char *path, const char *old_path,
+                         const char *database, const char *table_name)
+{
+  char aws_path[FN_REFLEN+100];
+  char filename[FN_REFLEN];
+  char *aws_path_end;
+  uchar *alloc_block= 0;
+  ms3_status_st status;
+  size_t frm_length;
+  int error;
+  DBUG_ENTER("partition_copy_to_s3");
+  DBUG_PRINT("enter",("from: %s  database: %s  table: %s",
+                      path, database, table_name));
+
+  if (!old_path)
+    old_path= path;
+
+  aws_path_end= strxmov(aws_path, database, "/", table_name, "/", NullS);
+  strmov(aws_path_end, "frm");
+  fn_format(filename, old_path, "", ".frm", MY_REPLACE_EXT);
+
+  /* Just to be safe, delete any conflicting object */
+  if (!ms3_status(s3_client, aws_bucket, aws_path, &status))
+  {
+    if ((error= s3_delete_object(s3_client, aws_bucket, aws_path,
+                                 MYF(ME_FATAL))))
+      DBUG_RETURN(error);
+  }
+  if ((error= s3_read_file_from_disk(filename, &alloc_block, &frm_length, 0)))
+  {
+    /*
+      In case of ADD PARTITION the .frm file is already renamed.
+      Copy the renamed file if it exists.
+    */
+    fn_format(filename, path, "", ".frm", MY_REPLACE_EXT);
+    if ((error= s3_read_file_from_disk(filename, &alloc_block, &frm_length,
+                                       1)))
+      goto err;
+  }
+  if ((error= s3_put_object(s3_client, aws_bucket, aws_path, alloc_block,
+                            frm_length, 0)))
+    goto err;
+
+  /*
+    Note that because ha_partition::rename_table() is called before
+    this function, the .par table already has its final name!
+  */
+  fn_format(filename, path, "", ".par", MY_REPLACE_EXT);
+  strmov(aws_path_end, "par");
+  if (!ms3_status(s3_client, aws_bucket, aws_path, &status))
+  {
+    if ((error= s3_delete_object(s3_client, aws_bucket, aws_path,
+                                 MYF(ME_FATAL))))
+      goto err;
+  }
+
+  my_free(alloc_block);
+  alloc_block= 0;
+  if ((error=s3_read_file_from_disk(filename, &alloc_block, &frm_length, 1)))
+    goto err;
+  if ((error= s3_put_object(s3_client, aws_bucket, aws_path, alloc_block,
+                            frm_length, 0)))
+  {
+    /* Delete the .frm file created above */
+    strmov(aws_path_end, "frm");
+    (void) s3_delete_object(s3_client, aws_bucket, aws_path,
+                            MYF(ME_FATAL));
+    goto err;
+  }
+  error= 0;
+
+err:
+  my_free(alloc_block);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   Drop all partition files (par and frm) related to a table from S3
+*/
+
+int partition_delete_from_s3(ms3_st *s3_client, const char *aws_bucket,
+                             const char *database, const char *table,
+                             myf error_flags)
+{
+  char aws_path[FN_REFLEN+100];
+  char *aws_path_end;
+  int error=0, res;                             /* error: last failure seen */
+  DBUG_ENTER("partition_delete_from_s3");
+
+  aws_path_end= strxmov(aws_path, database, "/", table, NullS);
+  strmov(aws_path_end, "/par");
+
+  if ((res= s3_delete_object(s3_client, aws_bucket, aws_path, error_flags)))
+    error= res;
+  /*
+    Delete .frm last as this is used by discovery to check if a s3 table
+    exists
+  */
+  strmov(aws_path_end, "/frm");
+  if ((res= s3_delete_object(s3_client, aws_bucket, aws_path, error_flags)))
+    error= res;
+
+  DBUG_RETURN(error);
+}
+
+/******************************************************************************
+ Low level functions interfacing with libmarias3
+******************************************************************************/
+
+/**
+   Create an object for index or data information
+
+   Note that if compression is used, the data may be overwritten and
+   there must be COMPRESS_HEADER length of free space before the data!
+   @return 0 ok, EE_WRITE on error (an error message is printed)
+*/
+
+int s3_put_object(ms3_st *s3_client, const char *aws_bucket,
+                   const char *name, uchar *data, size_t length,
+                   my_bool compression)
+{
+  uint8_t error;
+  const char *errmsg;
+  DBUG_ENTER("s3_put_object");
+  DBUG_PRINT("enter", ("name: %s", name));
+
+  if (compression)
+  {
+    size_t comp_len;
+
+    data[-COMPRESS_HEADER]= 0;                  // No compression
+    if (!my_compress(data, &length, &comp_len))
+      data[-COMPRESS_HEADER]= 1;                // Compressed package
+    data-=   COMPRESS_HEADER;                   // Send header + data
+    length+= COMPRESS_HEADER;
+    int3store(data+1, comp_len);               // Original length or 0
+  }
+
+  if (likely(!(error= ms3_put(s3_client, aws_bucket, name, data, length))))
+    DBUG_RETURN(0);
+
+  if (!(errmsg= ms3_server_error(s3_client)))   // Prefer the server's text
+    errmsg= ms3_error(error);
+
+  my_printf_error(EE_WRITE, "Got error from put_object(%s): %d %s", MYF(0),
+                  name, error, errmsg);
+  DBUG_RETURN(EE_WRITE);
+}
+
+
+/**
+   Read an object for index or data information
+   On success block->str and block->length hold the data; free with s3_free()
+   @param print_error 0  Don't print error
+   @param print_error 1  Print error that object doesn't exist
+   @param print_error 2  Print error that table doesn't exist
+*/
+
+int s3_get_object(ms3_st *s3_client, const char *aws_bucket,
+                  const char *name, S3_BLOCK *block,
+                  my_bool compression, int print_error)
+{
+  uint8_t error;
+  int result= 0;
+  uchar *data;
+  DBUG_ENTER("s3_get_object");
+  DBUG_PRINT("enter", ("name: %s  compression: %d", name, compression));
+
+  block->str= block->alloc_ptr= 0;
+  if (likely(!(error= ms3_get(s3_client, aws_bucket, name,
+                              (uint8_t**) &block->alloc_ptr,
+                              &block->length))))
+  {
+    block->str= block->alloc_ptr;
+    if (compression)                    /* Assumes length >= COMPRESS_HEADER */
+    {
+      ulong length;
+
+      /* If not compressed */
+      if (!block->str[0])
+      {
+        block->length-= COMPRESS_HEADER;
+        block->str+=    COMPRESS_HEADER;
+
+        /* Simple check to ensure that it's a correct block */
+        if (block->length % 1024)
+        {
+          s3_free(block);
+          my_printf_error(HA_ERR_NOT_A_TABLE,
+                          "Block '%s' is not compressed", MYF(0), name);
+          DBUG_RETURN(HA_ERR_NOT_A_TABLE);
+        }
+        DBUG_RETURN(0);
+      }
+
+      if (((uchar*)block->str)[0] > 1)          /* Flag must be 0 or 1 */
+      {
+        s3_free(block);
+        my_printf_error(HA_ERR_NOT_A_TABLE,
+                        "Block '%s' is not compressed", MYF(0), name);
+        DBUG_RETURN(HA_ERR_NOT_A_TABLE);
+      }
+
+      length= uint3korr(block->str+1);          /* Length before compression */
+
+      if (!(data= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED,
+                                     length, MYF(MY_WME | MY_THREAD_SPECIFIC))))
+      {
+        s3_free(block);
+        DBUG_RETURN(EE_OUTOFMEMORY);
+      }
+      if (uncompress(data, &length, block->str + COMPRESS_HEADER,
+                     block->length - COMPRESS_HEADER))
+      {
+        my_printf_error(ER_NET_UNCOMPRESS_ERROR,
+                        "Got error uncompressing s3 packet", MYF(0));
+        s3_free(block);
+        my_free(data);
+        DBUG_RETURN(ER_NET_UNCOMPRESS_ERROR);
+      }
+      s3_free(block);                           /* Replace by unpacked data */
+      block->str= block->alloc_ptr= data;
+      block->length= length;
+    }
+    DBUG_RETURN(0);
+  }
+
+  if (error == 9)                               /* Object was not found */
+  {
+    result= my_errno= (print_error == 1 ? EE_FILENOTFOUND :
+                       HA_ERR_NO_SUCH_TABLE);
+    if (print_error)
+      my_printf_error(my_errno, "Expected object '%s' didn't exist",
+                      MYF(0), name);
+  }
+  else
+  {
+    result= my_errno= EE_READ;
+    if (print_error)
+    {
+      const char *errmsg;
+      if (!(errmsg= ms3_server_error(s3_client)))
+        errmsg= ms3_error(error);
+
+      my_printf_error(EE_READ, "Got error from get_object(%s): %d %s", MYF(0),
+                      name, error, errmsg);
+    }
+  }
+  s3_free(block);
+  DBUG_RETURN(result);
+}
+
+/* Delete object 'name' from 'aws_bucket'; returns 0 or the code in 'result' */
+int s3_delete_object(ms3_st *s3_client, const char *aws_bucket,
+                         const char *name, myf error_flags)
+{
+  uint8_t error;
+  int result= 0;                 /* Stays 0 on failure when error_flags == 0 */
+  DBUG_ENTER("s3_delete_object");
+  DBUG_PRINT("enter", ("name: %s", name));
+
+  if (likely(!(error= ms3_delete(s3_client, aws_bucket, name))))
+    DBUG_RETURN(0);
+
+  if (error_flags)
+  {
+    error_flags&= ~MY_WME;       /* MY_WME is stripped before printing */
+    if (error == 9)              /* Reported below as a missing object */
+      my_printf_error(result= EE_FILENOTFOUND,
+                      "Expected object '%s' didn't exist",
+                      error_flags, name);
+    else
+    {
+      const char *errmsg;
+      if (!(errmsg= ms3_server_error(s3_client))) /* Prefer server's message */
+        errmsg= ms3_error(error);
+
+      my_printf_error(result= EE_READ,
+                      "Got error from delete_object(%s): %d %s",
+                      error_flags, name, error, errmsg);
+    }
+  }
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Drop all files in a 'directory' in S3 (every key ms3_list() gives for path)
+*/
+
+int s3_delete_directory(ms3_st *s3_client, const char *aws_bucket,
+                        const char *path)
+{
+  ms3_list_st *list, *org_list= 0;
+  my_bool error;
+  DBUG_ENTER("s3_delete_directory");
+  DBUG_PRINT("enter", ("path: %s", path));
+
+  if ((error= ms3_list(s3_client, aws_bucket, path, &org_list)))
+  {
+    const char *errmsg;
+    if (!(errmsg= ms3_server_error(s3_client)))
+      errmsg= ms3_error(error);
+
+    my_printf_error(EE_FILENOTFOUND,
+                    "Can't get list of files from %s. Error: %d %s", MYF(0),
+                    path, error, errmsg);
+    DBUG_RETURN(EE_FILENOTFOUND);             /* Nothing was deleted */
+  }
+
+  for (list= org_list ; list ; list= list->next)
+    if (s3_delete_object(s3_client, aws_bucket, list->key, MYF(MY_WME)))
+      error= 1;                               /* Continue with other keys */
+  if (org_list)
+    ms3_list_free(org_list);
+  DBUG_RETURN(error);                         /* 0 ok, 1 some delete failed */
+}
+
+/* Move from_name to to_name inside aws_bucket. Returns FALSE ok, TRUE error */
+my_bool s3_rename_object(ms3_st *s3_client, const char *aws_bucket,
+                         const char *from_name, const char *to_name,
+                         myf error_flags)
+{
+  uint8_t error;
+  DBUG_ENTER("s3_rename_object");
+  DBUG_PRINT("enter", ("from: %s  to: %s", from_name, to_name));
+
+  if (likely(!(error= ms3_move(s3_client,
+                               aws_bucket, from_name,
+                               aws_bucket, to_name))))
+    DBUG_RETURN(FALSE);
+
+  if (error_flags)                      /* Errors are only printed if set */
+  {
+    error_flags&= ~MY_WME;
+    if (error == 9)                     /* Reported as a missing object */
+    {
+      my_printf_error(EE_FILENOTFOUND, "Expected object '%s' didn't exist",
+                      error_flags, from_name);
+    }
+    else
+    {
+      const char *errmsg;
+      if (!(errmsg= ms3_server_error(s3_client)))
+        errmsg= ms3_error(error);
+
+      my_printf_error(EE_READ, "Got error from move_object(%s -> %s): %d %s",
+                      error_flags,
+                      from_name, to_name, error, errmsg);
+    }
+  }
+  DBUG_RETURN(TRUE);
+}
+
+/* Move every object listed for from_name to to_name + "/<last key part>" */
+int s3_rename_directory(ms3_st *s3_client, const char *aws_bucket,
+                        const char *from_name, const char *to_name,
+                        myf error_flags)
+{
+  ms3_list_st *list, *org_list= 0;
+  my_bool error= 0;
+  char name[AWS_PATH_LENGTH], *end;
+  DBUG_ENTER("s3_rename_directory");
+
+  if ((error= ms3_list(s3_client, aws_bucket, from_name, &org_list)))
+  {
+    const char *errmsg;
+    if (!(errmsg= ms3_server_error(s3_client)))
+      errmsg= ms3_error(error);
+
+    my_printf_error(EE_FILENOTFOUND,
+                    "Can't get list of files from %s. Error: %d %s",
+                    MYF(error_flags & ~MY_WME),
+                    from_name, error, errmsg);
+    DBUG_RETURN(EE_FILENOTFOUND);
+  }
+
+  end= strmake(name, to_name, sizeof(name)-1);  /* Bounded; end is at the \0 */
+  for (list= org_list ; list ; list= list->next)
+  {
+    const char *sep= strrchr(list->key, '/');
+    if (sep)                                    /* Safety */
+    {
+      strmake(end, sep, (sizeof(name) - (end-name) - 1));
+      if (s3_rename_object(s3_client, aws_bucket, list->key, name,
+                           error_flags))
+        error= 1;                               /* Continue with other keys */
+    }
+  }
+  if (org_list)
+    ms3_list_free(org_list);
+  DBUG_RETURN(error);                           /* 0 ok, 1 a rename failed */
+}
+
+
+/******************************************************************************
+ Converting index and frm files to and from the S3 storage engine
+******************************************************************************/
+
+/**
+  Change index information to be of type s3
+
+  @param header      Copy of header in index file (updated in place)
+  @param block_size  S3 block size
+  @param compression Compression algorithm to use
+
+  The positions (107 and 119) are from _ma_base_info_write()
+*/
+
+static void convert_index_to_s3_format(uchar *header, ulong block_size,
+                                       int compression)
+{
+  MARIA_STATE_INFO state;
+  uchar *base_pos;
+  uint  base_offset;
+
+  /* Copy the fixed header only to read base_pos out of it */
+  memcpy(state.header.file_version, header, sizeof(state.header));
+  base_offset= mi_uint2korr(state.header.base_pos);
+  base_pos= header + base_offset;
+  base_pos[107]= (uchar) compression;
+  mi_int3store(base_pos+119, block_size);
+}
+
+
+/**
+   Change index header (in place) to be a normal disk based table
+*/
+
+static void convert_index_to_disk_format(uchar *header)
+{
+  MARIA_STATE_INFO state;
+  uchar *base_pos;
+  uint  base_offset;
+
+  memcpy(state.header.file_version, header, sizeof(state.header));
+  base_offset= mi_uint2korr(state.header.base_pos);
+  base_pos= header + base_offset;
+
+  base_pos[107]= 0;              /* Byte set to 'compression' for s3 format */
+  mi_int3store(base_pos+119, 0); /* Field set to block_size for s3 format */
+}
+
+/**
+  Change storage engine in the .frm file from Aria to s3 (header[3] is
+  updated in place; a header that is already S3 is accepted)
+  For information about engine types, see legacy_db_type
+*/
+
+static void convert_frm_to_s3_format(uchar *header)
+{
+  DBUG_ASSERT(header[3] == 42 || header[3] == 41); /* Aria or S3 */
+  header[3]= 41;                                   /* S3 */
+}
+
+/**
+  Change storage engine in the .frm file from S3 to Aria (header[3] is
+  updated in place; the header must currently be S3)
+  For information about engine types, see legacy_db_type
+*/
+
+static void convert_frm_to_disk_format(uchar *header)
+{
+  DBUG_ASSERT(header[3] == 41);                 /* S3 */
+  header[3]= 42;                                /* Aria */
+}
+
+
+/******************************************************************************
+ Helper functions
+******************************************************************************/
+
+/**
+  Set database and table name from path
+
+  s3->database and s3->table will point into path (nothing is copied).
+  Note that s3->database will not be null terminated! Returns 0 ok, 1 error
+*/
+
+my_bool set_database_and_table_from_path(S3_INFO *s3, const char *path)
+{
+  size_t org_length= dirname_length(path);
+  size_t length= 0;
+
+  if (!org_length)
+    return 1;                           /* dirname_length() found no dir */
+
+  s3->table.str= path+org_length;       /* Table is what follows the dir */
+  s3->table.length= strlen(s3->table.str);
+  /* Step over the last char of the dir part, scan back to a separator */
+  for (length= --org_length; length > 0 ; length --)
+  {
+    if (path[length-1] == FN_LIBCHAR || path[length-1] == '/')
+      break;
+#ifdef FN_DEVCHAR
+    if (path[length-1] == FN_DEVCHAR)
+      break;
+#endif
+  }
+  if (length &&                         /* A one char FN_CURLIB is rejected */
+      (path[length] != FN_CURLIB || org_length - length != 1))
+  {
+    s3->database.str= path + length;
+    s3->database.length= org_length - length;
+    return 0;
+  }
+  return 1;                                     /* Can't find database */
+}
+
+
+/**
+   Read a whole file into a my_malloc() block (*to, *to_size); 0 or my_errno
+*/
+
+static int s3_read_file_from_disk(const char *filename, uchar **to,
+                                  size_t *to_size, my_bool print_error)
+{
+  File file;
+  uchar *alloc_block;
+  size_t file_size;
+  int error;
+
+  *to= 0;                               /* Stays 0 on every error return */
+  if ((file= my_open(filename,
+                     O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC,
+                     MYF(print_error ? MY_WME: 0))) < 0)
+    return(my_errno);
+
+  file_size= (size_t) my_seek(file, 0L, MY_SEEK_END, MYF(0));
+  if (!(alloc_block= my_malloc(PSI_NOT_INSTRUMENTED, file_size, MYF(MY_WME))))
+    goto err;
+
+  if (my_pread(file, alloc_block, file_size, 0, MYF(MY_WME | MY_FNABP)))
+    goto err;
+
+  *to=      alloc_block;
+  *to_size= file_size;
+  my_close(file, MYF(0));
+  return 0;
+
+err:
+  error= my_errno;                      /* Saved before my_free()/my_close() */
+  my_free(alloc_block);
+  my_close(file, MYF(0));
+  return error;
+}
+
+
+/**
+   Get object <database>/<table>/<ext> (.frm or par) from S3 into block
+
+   @return 0 ok
+   @return 1 error (s3_get_object() is called with print_error= 0)
+*/
+
+my_bool s3_get_def(ms3_st *s3_client, S3_INFO *s3_info, S3_BLOCK *block,
+                   const char *ext)
+{
+  char aws_path[AWS_PATH_LENGTH];
+
+  strxnmov(aws_path, sizeof(aws_path)-1, s3_info->database.str, "/",
+           s3_info->table.str, "/", ext, NullS);
+
+  return s3_get_object(s3_client, s3_info->bucket.str, aws_path, block,
+                       0, 0);           /* compression= 0, print_error= 0 */
+}
+
+/**
+   Check if .frm exists in S3 (object <database>/<table>/frm)
+
+   @return 0 frm exists
+   @return 1 error
+*/
+
+my_bool s3_frm_exists(ms3_st *s3_client, S3_INFO *s3_info)
+{
+  char aws_path[AWS_PATH_LENGTH];
+  ms3_status_st status;                 /* Filled by ms3_status(), not used */
+
+  strxnmov(aws_path, sizeof(aws_path)-1, s3_info->database.str, "/",
+           s3_info->table.str, "/frm", NullS);
+
+  return ms3_status(s3_client, s3_info->bucket.str, aws_path, &status);
+}
+
+
+/**
+   Get version from frm file
+
+   @param out        Store the table version here. It's of size MY_UUID_SIZE
+   @param frm_image  Frm image
+   @param frm_length size of image
+
+   @return 0  Was able to read table version
+   @return 1  Wrong information in frm file
+*/
+
+#define FRM_HEADER_SIZE 64
+#define EXTRA2_TABLEDEF_VERSION 0
+
+static inline my_bool is_binary_frm_header(const uchar *head)
+{
+  return head[0] == 254                 /* Reads head[0..2]; caller must */
+      && head[1] == 1                   /* supply at least 3 bytes       */
+      && head[2] >= FRM_VER             /* Version byte within the range */
+      && head[2] <= FRM_VER_CURRENT;    /* FRM_VER .. FRM_VER_CURRENT    */
+}
+/* Copy the EXTRA2_TABLEDEF_VERSION entry of an frm image to 'out'; 0 if ok */
+static my_bool get_tabledef_version_from_frm(char *out, const uchar *frm_image,
+                                             size_t frm_length)
+{
+  uint segment_len;
+  const uchar *extra, *extra_end;
+  if (frm_length <= FRM_HEADER_SIZE || !is_binary_frm_header(frm_image))
+    return 1;                           /* Length is tested before any read */
+
+  /* Length of the MariaDB extra2 segment in the form file. */
+  segment_len= uint2korr(frm_image + 4);
+  if (frm_length < FRM_HEADER_SIZE + segment_len)
+    return 1;
+
+  extra= frm_image + FRM_HEADER_SIZE;
+  if (*extra == '/')   // old frm had '/' there
+    return 1;
+
+  extra_end= extra + segment_len;
+  while (extra + 4 < extra_end)
+  {
+    uchar type= *extra++;
+    size_t length= *extra++;            /* 0 means a 2 byte length follows */
+    if (!length)
+    {
+      length= uint2korr(extra);
+      extra+= 2;
+      if (length < 256)
+        return 1;                               /* Something is wrong */
+    }
+    if (extra + length > extra_end)
+      return 1;                         /* Entry runs past the segment */
+    if (type == EXTRA2_TABLEDEF_VERSION)
+    {
+      if (length != MY_UUID_SIZE)
+        return 1;
+      memcpy(out, extra, length);
+      return 0;                                 /* Found it */
+    }
+    extra+= length;                     /* Skip entries of other types */
+  }
+  return 1;
+}
+
+
+/**
+   Check if version in the S3 frm matches s3_info->tabledef_version
+
+   @return 0 table definitions match
+   @return 1 table definitions don't match
+   @return 2 Can't get the frm object from S3
+   @return 3 Can't read the frm version or tabledef_version has wrong length
+*/
+
+int s3_check_frm_version(ms3_st *s3_client, S3_INFO *s3_info)
+{
+  my_bool res= 0;
+  char aws_path[AWS_PATH_LENGTH];
+  char uuid[MY_UUID_SIZE];
+  S3_BLOCK block;
+  DBUG_ENTER("s3_check_frm_version");
+
+  /* Note: uses base_table, not table, to build the frm object name */
+  strxnmov(aws_path, sizeof(aws_path)-1, s3_info->database.str, "/",
+           s3_info->base_table.str, "/frm", NullS);
+  if (s3_get_object(s3_client, s3_info->bucket.str, aws_path, &block, 0, 0))
+  {
+    DBUG_PRINT("exit", ("No object found"));
+    DBUG_RETURN(2);                    /* Ignore check, use old frm */
+  }
+
+  if (get_tabledef_version_from_frm(uuid, (uchar*) block.str, block.length) ||
+      s3_info->tabledef_version.length != MY_UUID_SIZE)
+  {
+    s3_free(&block);
+    DBUG_PRINT("error", ("Wrong definition"));
+    DBUG_RETURN(3);                                   /* Wrong definition */
+  }
+  /* res is set to 1 if the version strings don't match */
+  res= bcmp(s3_info->tabledef_version.str, uuid, MY_UUID_SIZE) != 0;
+  s3_free(&block);
+  if (res)
+    DBUG_PRINT("error", ("Wrong table version"));
+  else
+    DBUG_PRINT("exit", ("Version strings matches"));
+  DBUG_RETURN(res);
+}
+
+
+/******************************************************************************
+ Reading blocks from index or data from S3
+******************************************************************************/
+
+/*
+  Read the index header (first page) from the index file, which is the
+  S3 object <database>/<table>/aria. Returns what s3_get_object() returns.
+  In case of error, my_error() is called
+*/
+
+my_bool read_index_header(ms3_st *client, S3_INFO *s3, S3_BLOCK *block)
+{
+  char aws_path[AWS_PATH_LENGTH];
+  DBUG_ENTER("read_index_header");
+  strxnmov(aws_path, sizeof(aws_path)-1, s3->database.str, "/", s3->table.str,
+           "/aria", NullS);
+  DBUG_RETURN(s3_get_object(client, s3->bucket.str, aws_path, block, 0, 2));
+}
+
+
+#ifdef FOR_FUTURE_IF_NEEDED_FOR_DEBUGGING_WITHOUT_S3
+/**
+   Read a big block from disk (disabled variant for debugging without S3)
+*/
+
+my_bool s3_block_read(struct st_pagecache *pagecache,
+                             PAGECACHE_IO_HOOK_ARGS *args,
+                             struct st_pagecache_file *file,
+                             LEX_STRING *data)
+{
+  MARIA_SHARE *share= (MARIA_SHARE*) file->callback_data;
+  my_bool datafile= file != &share->kfile;
+
+  DBUG_ASSERT(file->big_block_size > 0);
+  DBUG_ASSERT(((((my_off_t) args->pageno - file->head_blocks) <<
+                pagecache->shift) %
+               file->big_block_size) == 0);
+
+  if (!(data->str= (char *) my_malloc(PSI_NOT_INSTRUMENTED,
+                                      file->big_block_size, MYF(MY_WME))))
+    return TRUE;
+
+  data->length= mysql_file_pread(file->file,
+                                 (unsigned char *)data->str,
+                                 file->big_block_size,
+                                 ((my_off_t) args->pageno << pagecache->shift),
+                                 MYF(MY_WME));
+  if (data->length == 0 || data->length == MY_FILE_ERROR)
+  {
+    if (data->length == 0)              /* Nothing read: report as EOF */
+    {
+      LEX_STRING *file_name= (datafile ? &share->data_file_name :
+                              &share->index_file_name);
+      my_error(EE_EOFERR, MYF(0), file_name->str, my_errno);
+    }
+    my_free(data->str);
+    data->length= 0;
+    data->str= 0;
+    return TRUE;
+  }
+  return FALSE;
+}
+#endif
+
+
+/**
+   Read a block (one data or index object) from S3 to page cache
+*/
+
+my_bool s3_block_read(struct st_pagecache *pagecache,
+                      PAGECACHE_IO_HOOK_ARGS *args,
+                      struct st_pagecache_file *file,
+                      S3_BLOCK *block)
+{
+  char aws_path[AWS_PATH_LENGTH];
+  MARIA_SHARE *share= (MARIA_SHARE*) file->callback_data;
+  my_bool datafile= file->file != share->kfile.file;
+  MARIA_HA *info= (MARIA_HA*) my_thread_var->keycache_file;
+  ms3_st *client= info->s3;             /* Connection of the MARIA_HA above */
+  const char *path_suffix= datafile ? "/data/" : "/index/";
+  char *end;
+  S3_INFO *s3= share->s3_path;
+  ulong block_number;
+  DBUG_ENTER("s3_block_read");
+
+  DBUG_ASSERT(file->big_block_size > 0);
+  DBUG_ASSERT(((((my_off_t) args->pageno - file->head_blocks) <<
+                pagecache->shift) %
+               file->big_block_size) == 0);
+
+  /* Pages after the head blocks are grouped in big blocks, numbered from 1 */
+  block_number= (((args->pageno - file->head_blocks) << pagecache->shift) /
+                 file->big_block_size) + 1;
+  end= strxnmov(aws_path, sizeof(aws_path)-12, s3->database.str, "/",
+                s3->table.str, path_suffix, "000000", NullS);
+  fix_suffix(end, block_number);  /* presumably replaces "000000"; verify */
+
+  DBUG_RETURN(s3_get_object(client, s3->bucket.str, aws_path, block,
+                            share->base.compression_algorithm, 1));
+}
+
+/*
+  Start file numbers from 1000 to more easily find bugs when the file number
+  could be mistaken for a real file
+*/
+static volatile int32 unique_file_number= 1000;
+/* Add 1 to the counter with a relaxed atomic add and return its result */
+int32 s3_unique_file_number()
+{
+  return my_atomic_add32_explicit(&unique_file_number, 1,
+                                  MY_MEMORY_ORDER_RELAXED);
+}
diff --git a/storage/maria/s3_func.h b/storage/maria/s3_func.h
new file mode 100644
index 00000000000..f73a95dea24
--- /dev/null
+++ b/storage/maria/s3_func.h
@@ -0,0 +1,147 @@
+#ifndef S3_FUNC_INCLUDED
+#define S3_FUNC_INCLUDED
+/* Copyright (C) 2019, 2022, MariaDB Corporation Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
+
+/*
+  Interface functions used by the S3 storage engine and aria_copy_for_s3
+*/
+
+#ifdef WITH_S3_STORAGE_ENGINE
+#include <libmarias3/marias3.h>
+
+C_MODE_START
+#define DEFAULT_AWS_HOST_NAME "s3.amazonaws.com"
+
+extern struct s3_func {
+  uint8_t (*set_option)(ms3_st *, ms3_set_option_t, void *);
+  void (*free)(S3_BLOCK *);
+  void (*deinit)(ms3_st *);
+  int32 (*unique_file_number)(void);
+  my_bool (*read_index_header)(ms3_st *, S3_INFO *, S3_BLOCK *);
+  int (*check_frm_version)(ms3_st *, S3_INFO *);
+  S3_INFO *(*info_copy)(S3_INFO *);
+  my_bool (*set_database_and_table_from_path)(S3_INFO *, const char *);
+  ms3_st *(*open_connection)(S3_INFO *);
+} s3f;
+
+extern TYPELIB s3_protocol_typelib;
+
+/* Store information about a s3 connection */
+
+struct s3_info
+{
+  /* Connection strings */
+  LEX_CSTRING access_key, secret_key, region, bucket, host_name;
+  int port; // 0 means 'Use default'
+  my_bool use_http;
+
+  /* Will be set by caller or by ma_open() */
+  LEX_CSTRING database, table;
+
+  /*
+    Name of the partition table if the table is partitioned. If not, it's set
+    to be same as table. This is used to know which frm file to read to
+    check table version.
+  */
+  LEX_CSTRING base_table;
+
+  /* Sent to open to verify version */
+  LEX_CUSTRING tabledef_version;
+
+  /* Protocol for the list bucket API call. 1 for Amazon, 2 for some others */
+  uint8_t protocol_version;
+};
+
+
+/* flag + length is stored in this header */
+#define COMPRESS_HEADER 4
+
+/* Max length of an AWS PATH */
+#define AWS_PATH_LENGTH ((NAME_LEN)*3+3+10+6+11)
+
+void s3_init_library(void);
+void s3_deinit_library(void);
+int aria_copy_to_s3(ms3_st *s3_client, const char *aws_bucket,
+                    const char *path,
+                    const char *database, const char *table_name,
+                    ulong block_size, my_bool compression,
+                    my_bool force, my_bool display, my_bool copy_frm);
+int aria_copy_from_s3(ms3_st *s3_client, const char *aws_bucket,
+                      const char *path,const char *database,
+                      my_bool compression, my_bool force, my_bool display);
+int aria_delete_from_s3(ms3_st *s3_client, const char *aws_bucket,
+                        const char *database, const char *table,
+                        my_bool display);
+int aria_rename_s3(ms3_st *s3_client, const char *aws_bucket,
+                   const char *from_database, const char *from_table,
+                   const char *to_database, const char *to_table,
+                   my_bool rename_frm);
+ms3_st *s3_open_connection(S3_INFO *s3);
+void s3_deinit(ms3_st *s3_client);
+int s3_put_object(ms3_st *s3_client, const char *aws_bucket,
+                  const char *name, uchar *data, size_t length,
+                  my_bool compression);
+int s3_get_object(ms3_st *s3_client, const char *aws_bucket,
+                  const char *name, S3_BLOCK *block, my_bool compression,
+                  int print_error);
+int s3_delete_object(ms3_st *s3_client, const char *aws_bucket,
+                     const char *name, myf error_flags);
+my_bool s3_rename_object(ms3_st *s3_client, const char *aws_bucket,
+                         const char *from_name, const char *to_name,
+                         myf error_flags);
+void s3_free(S3_BLOCK *data);
+my_bool s3_copy_from_file(ms3_st *s3_client, const char *aws_bucket,
+                          char *aws_path, File file, my_off_t start,
+                          my_off_t file_end, uchar *block, size_t block_size,
+                          my_bool compression, my_bool display);
+my_bool s3_copy_to_file(ms3_st *s3_client, const char *aws_bucket,
+                        char *aws_path, File file, my_off_t start,
+                        my_off_t file_end, my_bool compression,
+                        my_bool display);
+int s3_delete_directory(ms3_st *s3_client, const char *aws_bucket,
+                        const char *path);
+int s3_rename_directory(ms3_st *s3_client, const char *aws_bucket,
+                        const char *from_name, const char *to_name,
+                        myf error_flags);
+int partition_delete_from_s3(ms3_st *s3_client, const char *aws_bucket,
+                             const char *database, const char *table,
+                             myf error_flags);
+int partition_copy_to_s3(ms3_st *s3_client, const char *aws_bucket,
+                         const char *path, const char *old_path,
+                         const char *database, const char *table_name);
+
+S3_INFO *s3_info_copy(S3_INFO *old);
+my_bool set_database_and_table_from_path(S3_INFO *s3, const char *path);
+my_bool s3_get_def(ms3_st *s3_client, S3_INFO *S3_info, S3_BLOCK *block,
+                   const char *ext);
+my_bool s3_frm_exists(ms3_st *s3_client, S3_INFO *s3_info);
+int s3_check_frm_version(ms3_st *s3_client, S3_INFO *s3_info);
+my_bool read_index_header(ms3_st *client, S3_INFO *s3, S3_BLOCK *block);
+int32 s3_unique_file_number(void);
+my_bool s3_block_read(struct st_pagecache *pagecache,
+                      PAGECACHE_IO_HOOK_ARGS *args,
+                      struct st_pagecache_file *file,
+                      S3_BLOCK *block);
+C_MODE_END
+#else
+
+C_MODE_START
+/* Dummy structures and interfaces to be used when compiling without S3 */
+struct s3_info;
+struct ms3_st;
+C_MODE_END
+#endif /* WITH_S3_STORAGE_ENGINE */
+#endif /* S3_FUNC_INCLUDED */
diff --git a/storage/maria/test_aria_s3_copy.sh b/storage/maria/test_aria_s3_copy.sh
new file mode 100755
index 00000000000..ad39df69de2
--- /dev/null
+++ b/storage/maria/test_aria_s3_copy.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+#
+# Note that this test expects that there are tables test1 and test2 in
+# the current directory where test2 also has a .frm file
+#
+
+TMPDIR=tmpdir
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/  # for aria_s3_copy
+
+my_cmp()                      # $1: file to compare with its copy in $TMPDIR
+{
+    if ! cmp $1 $TMPDIR/$1
+    then
+        echo "aborting"       # Files differ (or cmp failed): stop the test
+        exit 1;
+    fi
+}
+
+run_test()              # $1: extra options passed to every aria_s3_copy call
+{
+    OPT=$1;
+    echo "******* Running test with options '$OPT' **********"
+    rm -rf $TMPDIR
+    mkdir $TMPDIR
+    cp test?.* $TMPDIR  # Keep pristine copies to compare against later
+    ./aria_s3_copy --op=to --force $OPT test1 test2 || {
+        rc=$?           # Read here: after 'if ! cmd' $? would always be 0
+        echo Got error $rc
+        exit 1;
+    }
+    rm test?.*
+    ./aria_s3_copy --op=from $OPT test1 test2 || {
+        rc=$?
+        echo Got error $rc
+        exit 1;
+    }
+    ./aria_s3_copy --op=delete $OPT test1 test2 || {
+        rc=$?
+        echo Got error $rc
+        exit 1;
+    }
+    my_cmp test1.MAI
+    my_cmp test1.MAD
+    my_cmp test2.MAI
+    my_cmp test2.MAD
+    my_cmp test2.frm
+    rm test?.*
+    cp $TMPDIR/* .
+    rm -r $TMPDIR
+}
+
+run_test ""
+run_test "--s3_block_size=64K --compress"
+run_test "--s3_block_size=4M"
+echo "ok"
diff --git a/storage/maria/test_ma_backup.c b/storage/maria/test_ma_backup.c
index 241763ecd32..c57ec6ece0d 100644
--- a/storage/maria/test_ma_backup.c
+++ b/storage/maria/test_ma_backup.c
@@ -41,13 +41,13 @@ int main(int argc __attribute__((unused)), char *argv[])
   safe_mutex_deadlock_detector= 1;
 #endif
   MY_INIT(argv[0]);
-  maria_data_root= (char *)".";
+  maria_data_root= ".";
 
   /* Maria requires that we always have a page cache */
   if (maria_init() ||
       (init_pagecache(maria_pagecache, maria_block_size * 2000, 0, 0,
                       maria_block_size, 0, MY_WME) == 0) ||
-      ma_control_file_open(TRUE, TRUE) ||
+      ma_control_file_open(TRUE, TRUE, TRUE) ||
       (init_pagecache(maria_log_pagecache,
                       TRANSLOG_PAGECACHE_SIZE, 0, 0,
                       TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) ||
@@ -113,7 +113,7 @@ static int copy_table(const char *table_name, int stage)
          cap.online_backup_safe);
   printf("- Copying index file\n");
 
-  copy_buffer= my_malloc(cap.block_size, MYF(0));
+  copy_buffer= my_malloc(PSI_NOT_INSTRUMENTED, cap.block_size, MYF(0));
   for (block= 0 ; ; block++)
   {
     if ((error= aria_read_index(org_file, &cap, block, copy_buffer) ==
@@ -310,7 +310,7 @@ static int create_test_table(const char *table_name, int type_of_table)
 		uniques, &uniquedef, &create_info,
 		create_flag))
     goto err;
-  if (!(file=maria_open(table_name,2,HA_OPEN_ABORT_IF_LOCKED)))
+  if (!(file=maria_open(table_name,2,HA_OPEN_ABORT_IF_LOCKED, 0)))
     goto err;
   if (!silent)
     printf("- Writing key:s\n");
diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c
index 6bc83a9ba88..56f6c52b2f5 100644
--- a/storage/maria/trnman.c
+++ b/storage/maria/trnman.c
@@ -149,7 +149,7 @@ int trnman_init(TrID initial_trid)
   DBUG_ENTER("trnman_init");
   DBUG_PRINT("enter", ("initial_trid: %lu", (ulong) initial_trid));
 
-  short_trid_to_active_trn= (TRN **)my_malloc(SHORT_TRID_MAX*sizeof(TRN*),
+  short_trid_to_active_trn= (TRN **)my_malloc(PSI_INSTRUMENT_ME, SHORT_TRID_MAX*sizeof(TRN*),
                                      MYF(MY_WME|MY_ZEROFILL));
   if (unlikely(!short_trid_to_active_trn))
     DBUG_RETURN(1);
@@ -238,7 +238,7 @@ void trnman_destroy()
 static TrID new_trid()
 {
   DBUG_ENTER("new_trid");
-  DBUG_ASSERT(global_trid_generator < 0xffffffffffffLL);
+  DBUG_ASSERT(global_trid_generator < MAX_INTERNAL_TRID);
   DBUG_PRINT("info", ("mysql_mutex_assert_owner LOCK_trn_list"));
   mysql_mutex_assert_owner(&LOCK_trn_list);
   DBUG_RETURN(++global_trid_generator);
@@ -312,7 +312,7 @@ TRN *trnman_new_trn(WT_THD *wt)
       (Like redo_lns, which is assumed to be 0 at start of row handling
       and reset to zero before end of row handling)
     */
-    trn= (TRN *)my_malloc(sizeof(TRN), MYF(MY_WME | MY_ZEROFILL));
+    trn= (TRN *)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRN), MYF(MY_WME | MY_ZEROFILL));
     if (unlikely(!trn))
     {
       DBUG_PRINT("info", ("mysql_mutex_unlock LOCK_trn_list"));
@@ -700,8 +700,8 @@ my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
 #endif
      LSN_STORE_SIZE /* first_undo_lsn */
      ) * trnman_committed_transactions;
-  if ((NULL == (str_act->str= my_malloc(str_act->length, MYF(MY_WME)))) ||
-      (NULL == (str_com->str= my_malloc(str_com->length, MYF(MY_WME)))))
+  if ((NULL == (str_act->str= my_malloc(PSI_INSTRUMENT_ME, str_act->length, MYF(MY_WME)))) ||
+      (NULL == (str_com->str= my_malloc(PSI_INSTRUMENT_ME, str_com->length, MYF(MY_WME)))))
     goto err;
   /* First, the active transactions */
   ptr= str_act->str + 2 + LSN_STORE_SIZE;
diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h
index 37ef8ceeee9..588bcdf6461 100644
--- a/storage/maria/trnman.h
+++ b/storage/maria/trnman.h
@@ -59,6 +59,7 @@ struct st_ma_transaction
 
 #define TRANSACTION_LOGGED_LONG_ID 0x8000000000000000ULL
 #define MAX_TRID (~(TrID)0)
+#define MAX_INTERNAL_TRID  0xffffffffffffLL
 
 extern WT_RESOURCE_TYPE ma_rc_dup_unique;
 
diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c
index 7f2bd6768a6..859d5514ffa 100644
--- a/storage/maria/unittest/ma_control_file-t.c
+++ b/storage/maria/unittest/ma_control_file-t.c
@@ -114,7 +114,7 @@ static CONTROL_FILE_ERROR local_ma_control_file_open(void)
 {
   CONTROL_FILE_ERROR error;
   error_handler_hook= my_ignore_message;
-  error= ma_control_file_open(TRUE, TRUE);
+  error= ma_control_file_open(TRUE, TRUE, TRUE);
   error_handler_hook= default_error_handler_hook;
   return error;
 }
@@ -579,10 +579,11 @@ static void version(void)
 }
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+	       const char *argument __attribute__((unused)),
+	       const char *filename __attribute__((unused)))
 {
-  switch(optid) {
+  switch(opt->id) {
   case 'V':
     version();
     exit(0);
diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c
index 00e6efad0e6..4cd62c52d86 100644
--- a/storage/maria/unittest/ma_pagecache_single.c
+++ b/storage/maria/unittest/ma_pagecache_single.c
@@ -576,11 +576,11 @@ int simple_delete_flush_test()
 
 int simple_big_test()
 {
-  unsigned char *buffw= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME));
-  unsigned char *buffr= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME));
+  unsigned char *buffw= (unsigned char *) my_malloc(PSI_NOT_INSTRUMENTED, TEST_PAGE_SIZE, MYF(MY_WME));
+  unsigned char *buffr= (unsigned char *) my_malloc(PSI_NOT_INSTRUMENTED, TEST_PAGE_SIZE, MYF(MY_WME));
   struct file_desc *desc= ((struct file_desc *)
-                           my_malloc((PCACHE_SIZE/(TEST_PAGE_SIZE/2) + 1) *
-                                     sizeof(struct file_desc), MYF(MY_WME)));
+                           my_malloc(PSI_NOT_INSTRUMENTED,
+              (PCACHE_SIZE/(TEST_PAGE_SIZE/2) + 1) * sizeof(struct file_desc), MYF(MY_WME)));
   int res, i;
   DBUG_ENTER("simple_big_test");
 
diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c
index 112be3c66f1..198ea5b2afb 100644
--- a/storage/maria/unittest/ma_test_loghandler-t.c
+++ b/storage/maria/unittest/ma_test_loghandler-t.c
@@ -197,7 +197,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
index 7a8ee720ded..8806571cabf 100644
--- a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
@@ -66,7 +66,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE,TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
index b76bf30748e..65b926376ae 100644
--- a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
@@ -64,7 +64,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
index 6ba0d00f884..e8e114dd155 100644
--- a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
@@ -189,11 +189,11 @@ static void usage(void)
 
 
 static my_bool
-get_one_option(int optid __attribute__((unused)),
-               const struct my_option *opt __attribute__((unused)),
-               char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+               const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch (optid) {
+  switch (opt->id) {
   case '?':
     usage();
     exit(0);
@@ -280,7 +280,7 @@ int main(int argc __attribute__((unused)), char *argv[])
 
   bzero(long_tr_id, 6);
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
@@ -443,7 +443,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   end_pagecache(&pagecache, 1);
   ma_control_file_end();
 
-  if (ma_control_file_open(TRUE,TRUE))
+  if (ma_control_file_open(TRUE,TRUE,TRUE))
   {
     fprintf(stderr, "pass2: Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
index 68d1edb9385..cb4d2bc70ba 100644
--- a/storage/maria/unittest/ma_test_loghandler_multithread-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
@@ -335,7 +335,7 @@ int main(int argc __attribute__((unused)),
   thr_setconcurrency(2);
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
index f6c214cc827..3aafe5db9b4 100644
--- a/storage/maria/unittest/ma_test_loghandler_noflush-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
@@ -65,7 +65,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_nologs-t.c b/storage/maria/unittest/ma_test_loghandler_nologs-t.c
index 06096d642f5..913bd4ef5b6 100644
--- a/storage/maria/unittest/ma_test_loghandler_nologs-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_nologs-t.c
@@ -66,7 +66,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
@@ -139,7 +139,7 @@ int main(int argc __attribute__((unused)), char *argv[])
     }
   }
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
index 7956d53186f..f09a78e5fa8 100644
--- a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
@@ -69,7 +69,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c
index d28b16209ce..e1eeca2fc9b 100644
--- a/storage/maria/unittest/ma_test_loghandler_purge-t.c
+++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c
@@ -67,7 +67,7 @@ int main(int argc __attribute__((unused)), char *argv[])
   }
 #endif
 
-  if (ma_control_file_open(TRUE, TRUE))
+  if (ma_control_file_open(TRUE, TRUE, TRUE))
   {
     fprintf(stderr, "Can't init control file (%d)\n", errno);
     exit(1);
diff --git a/storage/maria/unittest/sequence_storage.c b/storage/maria/unittest/sequence_storage.c
index c6c8caefca1..1e6b3fcb239 100644
--- a/storage/maria/unittest/sequence_storage.c
+++ b/storage/maria/unittest/sequence_storage.c
@@ -33,7 +33,8 @@ my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file)
   seq->pos= 0;
   if ((fd= my_fopen(file, O_RDONLY, MYF(MY_WME))) == NULL)
     return 1;
-  if (my_init_dynamic_array(&seq->seq, sizeof(ulong), 10, 10, MYF(0)))
+  if (my_init_dynamic_array(PSI_NOT_INSTRUMENTED, &seq->seq, sizeof(ulong), 10,
+                            10, MYF(0)))
     return 1;
 
   for(;;)
diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c
index 354f691aaa9..8c9a5f66a2f 100644
--- a/storage/maria/unittest/test_file.c
+++ b/storage/maria/unittest/test_file.c
@@ -38,7 +38,7 @@
 int test_file(PAGECACHE_FILE file, char *file_name,
               off_t size, size_t buff_size, struct file_desc *desc)
 {
-  unsigned char *buffr= my_malloc(buff_size, MYF(0));
+  unsigned char *buffr= my_malloc(PSI_NOT_INSTRUMENTED, buff_size, MYF(0));
   off_t pos= 0;
   size_t byte;
   int step= 0;
diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c
index 3d19116dbbc..43bca725a8a 100644
--- a/storage/maria/unittest/trnman-t.c
+++ b/storage/maria/unittest/trnman-t.c
@@ -79,7 +79,7 @@ void run_test(const char *test, pthread_handler handler, int n, int m)
 
   litmus= 0;
 
-  threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+  threads= (pthread_t *)my_malloc(PSI_NOT_INSTRUMENTED, sizeof(void *)*n, MYF(0));
   if (!threads)
   {
     diag("Out of memory");
diff --git a/storage/mroonga/CMakeLists.txt b/storage/mroonga/CMakeLists.txt
index af4a9daa4c6..c048b214658 100644
--- a/storage/mroonga/CMakeLists.txt
+++ b/storage/mroonga/CMakeLists.txt
@@ -189,11 +189,7 @@ else()
   set(MYSQL_VARIANT "MySQL")
 endif()
 
-if(EXISTS "${MYSQL_SOURCE_DIR}/pcre")
-  set(MYSQL_REGEX_INCLUDE_DIR "${MYSQL_SOURCE_DIR}/pcre")
-else()
-  set(MYSQL_REGEX_INCLUDE_DIR "${MYSQL_SOURCE_DIR}/regex")
-endif()
+set(MYSQL_REGEX_INCLUDE_DIR "${MYSQL_SOURCE_DIR}/regex")
 
 if(EXISTS "${MYSQL_SOURCE_DIR}/extra/rapidjson")
   set(MYSQL_RAPIDJSON_INCLUDE_DIR "${MYSQL_SOURCE_DIR}/extra/rapidjson/include")
diff --git a/storage/mroonga/configure.ac b/storage/mroonga/configure.ac
index b1e66904f75..3ef31bdc32e 100644
--- a/storage/mroonga/configure.ac
+++ b/storage/mroonga/configure.ac
@@ -186,11 +186,7 @@ AC_DEFUN([CONFIG_OPTION_MYSQL],[
     mysql_regex_include_dir="$ac_mysql_source_dir/extra/regex"
     MYSQL_INCLUDES="$MYSQL_INCLUDES -I$mysql_regex_include_dir"
   else
-    if test -d "$ac_mysql_source_dir/pcre"; then
-      mysql_regex_include_dir="$ac_mysql_source_dir/pcre"
-    else
-      mysql_regex_include_dir="$ac_mysql_source_dir/regex"
-    fi
+    mysql_regex_include_dir="$ac_mysql_source_dir/regex"
     MYSQL_INCLUDES="$MYSQL_INCLUDES -I$mysql_regex_include_dir"
   fi
   if test -d "$ac_mysql_source_dir/libbinlogevents"; then
diff --git a/storage/mroonga/ha_mroonga.cpp b/storage/mroonga/ha_mroonga.cpp
index fdca803ad96..d0608433745 100644
--- a/storage/mroonga/ha_mroonga.cpp
+++ b/storage/mroonga/ha_mroonga.cpp
@@ -228,9 +228,9 @@ Time_zone *mrn_my_tz_UTC;
 HASH *mrn_table_def_cache;
 #endif
 
-#ifdef MRN_HAVE_PSI_MEMORY_KEY
 PSI_memory_key mrn_memory_key;
 
+#ifdef MRN_HAVE_PSI_MEMORY_KEY
 static PSI_memory_info mrn_all_memory_keys[]=
 {
   {&mrn_memory_key, "Mroonga", 0}
@@ -962,7 +962,7 @@ static MYSQL_SYSVAR_STR(default_parser, mrn_default_tokenizer,
                         "(Deprecated. Use mroonga_default_tokenizer instead.)",
                         NULL,
                         mrn_default_tokenizer_update,
-                        MRN_DEFAULT_TOKENIZER);
+                        MRN_DEFAULT_TOKENIZER); // since 10.1.6
 
 static MYSQL_SYSVAR_STR(default_tokenizer, mrn_default_tokenizer,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
@@ -1268,37 +1268,15 @@ static struct st_mysql_information_schema i_s_info =
   MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
 };
 
+namespace Show {  // holds the field list that i_s_mrn_stats_init assigns to schema->fields_info
 static ST_FIELD_INFO i_s_mrn_stats_fields_info[] =
 {
-  {
-    "VERSION",
-    40,
-    MYSQL_TYPE_STRING,
-    0,
-    0,
-    "",
-    SKIP_OPEN_TABLE
-  },
-  {
-    "rows_written",
-    MY_INT32_NUM_DECIMAL_DIGITS,
-    MYSQL_TYPE_LONG,
-    0,
-    0,
-    "Rows written to Groonga",
-    SKIP_OPEN_TABLE
-  },
-  {
-    "rows_read",
-    MY_INT32_NUM_DECIMAL_DIGITS,
-    MYSQL_TYPE_LONG,
-    0,
-    0,
-    "Rows read from Groonga",
-    SKIP_OPEN_TABLE
-  },
-  { 0, 0, MYSQL_TYPE_NULL, 0, 0, 0, 0}
+  Column("VERSION",      Varchar(40), NOT_NULL),
+  Column("rows_written", SLong(),     NOT_NULL, "Rows written to Groonga"),
+  Column("rows_read",    SLong(),     NOT_NULL, "Rows read from Groonga"),
+  CEnd()  // last entry; takes the place of the former all-zero sentinel row
 };
+} // namespace Show
 
 static int i_s_mrn_stats_deinit(void* p)
 {
@@ -1327,7 +1305,7 @@ static int i_s_mrn_stats_init(void* p)
 {
   MRN_DBUG_ENTER_FUNCTION();
   ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
-  schema->fields_info = i_s_mrn_stats_fields_info;
+  schema->fields_info = Show::i_s_mrn_stats_fields_info;
   schema->fill_table = i_s_mrn_stats_fill;
   DBUG_RETURN(0);
 }
@@ -1374,11 +1352,10 @@ static void mrn_drop_database(handlerton *hton, char *path)
 static int mrn_close_connection(handlerton *hton, THD *thd)
 {
   MRN_DBUG_ENTER_FUNCTION();
-  void *p = *thd_ha_data(thd, mrn_hton_ptr);
+  void *p = thd_get_ha_data(thd, mrn_hton_ptr);
   if (p) {
     mrn_clear_slot_data(thd);
     free(p);
-    *thd_ha_data(thd, mrn_hton_ptr) = (void *) NULL;
     {
       mrn::Lock lock(&mrn_allocated_thds_mutex);
       my_hash_delete(&mrn_allocated_thds, (uchar*) thd);
@@ -1804,7 +1781,6 @@ static int mrn_init(void *p)
   // init handlerton
   grn_ctx *ctx = NULL;
   handlerton *hton = static_cast<handlerton *>(p);
-  hton->state = SHOW_OPTION_YES;
   hton->create = mrn_handler_create;
   hton->flags = HTON_NO_FLAGS;
 #ifndef MRN_SUPPORT_PARTITION
@@ -2037,7 +2013,7 @@ static int mrn_deinit(void *p)
       mrn_clear_slot_data(tmp_thd);
       void *slot_ptr = mrn_get_slot_data(tmp_thd, false);
       if (slot_ptr) free(slot_ptr);
-      *thd_ha_data(tmp_thd, mrn_hton_ptr) = (void *) NULL;
+      thd_set_ha_data(tmp_thd, mrn_hton_ptr, 0);
       my_hash_delete(&mrn_allocated_thds, (uchar *) tmp_thd);
     }
   }
@@ -5052,19 +5028,8 @@ int ha_mroonga::wrapper_delete_table(const char *name,
                                      handlerton *wrap_handlerton,
                                      const char *table_name)
 {
-  int error = 0;
   MRN_DBUG_ENTER_METHOD();
-
-  handler *hnd = get_new_handler(NULL, current_thd->mem_root, wrap_handlerton);
-  if (!hnd)
-  {
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-  }
-
-  error = hnd->ha_delete_table(name);
-  delete hnd;
-
-  DBUG_RETURN(error);
+  DBUG_RETURN(wrap_handlerton->drop_table(wrap_handlerton, name));
 }
 
 int ha_mroonga::generic_delete_table(const char *name, const char *table_name)
@@ -6110,7 +6075,7 @@ int ha_mroonga::storage_write_row(const uchar *buf)
 #ifdef MRN_HAVE_SPATIAL
     bool is_null_geometry_value =
       field->real_type() == MYSQL_TYPE_GEOMETRY &&
-      static_cast<Field_geom *>(field)->get_length() == 0;
+      static_cast<Field_blob *>(field)->get_length() == 0;
     if (is_null_geometry_value) {
       continue;
     }
@@ -6551,17 +6516,10 @@ int ha_mroonga::wrapper_update_row_index(const uchar *old_data,
 
   grn_id old_record_id;
   my_ptrdiff_t ptr_diff = PTR_BYTE_DIFF(old_data, table->record[0]);
-  for (uint j = 0; j < KEY_N_KEY_PARTS(key_info); j++) {
-    Field *field = key_info->key_part[j].field;
-    field->move_field_offset(ptr_diff);
-  }
+
   error = wrapper_get_record_id((uchar *)old_data, &old_record_id,
                                 "failed to get old record ID "
                                 "for updating from groonga");
-  for (uint j = 0; j < KEY_N_KEY_PARTS(key_info); j++) {
-    Field *field = key_info->key_part[j].field;
-    field->move_field_offset(-ptr_diff);
-  }
   if (error) {
     DBUG_RETURN(0);
   }
@@ -6873,8 +6831,6 @@ int ha_mroonga::storage_update_row_index(const uchar *old_data,
   GRN_TEXT_INIT(&new_key, 0);
   GRN_TEXT_INIT(&new_encoded_key, 0);
 
-  my_ptrdiff_t ptr_diff = PTR_BYTE_DIFF(old_data, table->record[0]);
-
   mrn::DebugColumnAccess debug_column_access(table, &table->read_set);
   uint i;
   uint n_keys = table->s->keys;
@@ -6898,18 +6854,10 @@ int ha_mroonga::storage_update_row_index(const uchar *old_data,
 
     GRN_BULK_REWIND(&old_key);
     grn_bulk_space(ctx, &old_key, key_info->key_length);
-    for (uint j = 0; j < KEY_N_KEY_PARTS(key_info); j++) {
-      Field *field = key_info->key_part[j].field;
-      field->move_field_offset(ptr_diff);
-    }
     key_copy((uchar *)(GRN_TEXT_VALUE(&old_key)),
              (uchar *)old_data,
              key_info,
              key_info->key_length);
-    for (uint j = 0; j < KEY_N_KEY_PARTS(key_info); j++) {
-      Field *field = key_info->key_part[j].field;
-      field->move_field_offset(-ptr_diff);
-    }
     GRN_BULK_REWIND(&old_encoded_key);
     grn_bulk_reserve(ctx, &old_encoded_key, MRN_MAX_KEY_SIZE);
     uint old_encoded_key_length;
@@ -7458,8 +7406,10 @@ uint ha_mroonga::max_supported_key_parts() const
   DBUG_RETURN(parts);
 }
 
-ha_rows ha_mroonga::wrapper_records_in_range(uint key_nr, key_range *range_min,
-                                             key_range *range_max)
+ha_rows ha_mroonga::wrapper_records_in_range(uint key_nr,
+                                             const key_range *range_min,
+                                             const key_range *range_max,
+                                             page_range *pages)
 {
   ha_rows row_count;
   MRN_DBUG_ENTER_METHOD();
@@ -7469,15 +7419,18 @@ ha_rows ha_mroonga::wrapper_records_in_range(uint key_nr, key_range *range_min,
   } else {
     MRN_SET_WRAP_SHARE_KEY(share, table->s);
     MRN_SET_WRAP_TABLE_KEY(this, table);
-    row_count = wrap_handler->records_in_range(key_nr, range_min, range_max);
+    row_count = wrap_handler->records_in_range(key_nr, range_min, range_max,
+                                               pages);
     MRN_SET_BASE_SHARE_KEY(share, table->s);
     MRN_SET_BASE_TABLE_KEY(this, table);
   }
   DBUG_RETURN(row_count);
 }
 
-ha_rows ha_mroonga::storage_records_in_range(uint key_nr, key_range *range_min,
-                                             key_range *range_max)
+ha_rows ha_mroonga::storage_records_in_range(uint key_nr,
+                                             const key_range *range_min,
+                                             const key_range *range_max,
+                                             page_range *pages)
 {
   MRN_DBUG_ENTER_METHOD();
   int flags = 0;
@@ -7590,8 +7543,8 @@ ha_rows ha_mroonga::storage_records_in_range(uint key_nr, key_range *range_min,
 }
 
 ha_rows ha_mroonga::generic_records_in_range_geo(uint key_nr,
-                                                 key_range *range_min,
-                                                 key_range *range_max)
+                                                 const key_range *range_min,
+                                                 const key_range *range_max)
 {
   MRN_DBUG_ENTER_METHOD();
   ha_rows row_count;
@@ -7625,15 +7578,17 @@ ha_rows ha_mroonga::generic_records_in_range_geo(uint key_nr,
   DBUG_RETURN(row_count);
 }
 
-ha_rows ha_mroonga::records_in_range(uint key_nr, key_range *range_min, key_range *range_max)
+ha_rows ha_mroonga::records_in_range(uint key_nr, const key_range *range_min,
+                                     const key_range *range_max,
+                                     page_range *pages)
 {
   MRN_DBUG_ENTER_METHOD();
   ha_rows row_count = 0;
   if (share->wrapper_mode)
   {
-    row_count = wrapper_records_in_range(key_nr, range_min, range_max);
+    row_count = wrapper_records_in_range(key_nr, range_min, range_max, pages);
   } else {
-    row_count = storage_records_in_range(key_nr, range_min, range_max);
+    row_count = storage_records_in_range(key_nr, range_min, range_max, pages);
   }
   DBUG_PRINT("info", ("mroonga: row_count=%" MRN_HA_ROWS_FORMAT, row_count));
   DBUG_RETURN(row_count);
@@ -10753,7 +10708,7 @@ int ha_mroonga::generic_store_bulk_geometry(Field *field, grn_obj *buf)
   int error = 0;
 #ifdef MRN_HAVE_SPATIAL
   String buffer;
-  Field_geom *geometry = (Field_geom *)field;
+  Field_blob *geometry = (Field_blob *)field;
   String *value = geometry->val_str(0, &buffer);
   const char *wkb = value->ptr();
   int len = value->length();
@@ -11223,7 +11178,7 @@ void ha_mroonga::storage_store_field_geometry(Field *field,
   String *geometry_buffer = &blob_buffers[field->field_index];
   geometry_buffer->length(0);
   uint wkb_length = sizeof(wkb) / sizeof(*wkb);
-  Field_geom *geometry = (Field_geom *)field;
+  Field_blob *geometry= (Field_blob *)field;
   geometry_buffer->reserve(wkb_length);
   geometry_buffer->q_append((const char *) wkb, wkb_length);
   geometry->set_ptr((uint32) wkb_length, (uchar *) geometry_buffer->ptr());
@@ -14855,9 +14810,7 @@ bool ha_mroonga::wrapper_inplace_alter_table(
     need_fill_index = true;
   }
   if (!error && need_fill_index) {
-    my_ptrdiff_t diff =
-      PTR_BYTE_DIFF(table->record[0], altered_table->record[0]);
-    mrn::TableFieldsOffsetMover mover(altered_table, diff);
+    mrn::FieldTableChanger changer(altered_table, table);
     error = wrapper_fill_indexes(ha_thd(), altered_table->key_info,
                                  index_columns, ha_alter_info->key_count);
   }
@@ -15010,9 +14963,7 @@ bool ha_mroonga::storage_inplace_alter_table_add_index(
     }
   }
   if (!error && have_multiple_column_index) {
-    my_ptrdiff_t diff =
-      PTR_BYTE_DIFF(table->record[0], altered_table->record[0]);
-    mrn::TableFieldsOffsetMover mover(altered_table, diff);
+    mrn::FieldTableChanger changer(altered_table, table);
     error = storage_add_index_multiple_columns(altered_table->key_info,
                                                ha_alter_info->key_count,
                                                index_tables,
@@ -15195,9 +15146,7 @@ bool ha_mroonga::storage_inplace_alter_table_add_column(
       bitmap_set_bit(&generated_column_bitmap, field->field_index);
 #  endif
 
-      my_ptrdiff_t diff =
-        PTR_BYTE_DIFF(table->record[0], altered_table->record[0]);
-      mrn::TableFieldsOffsetMover mover(altered_table, diff);
+      mrn::FieldTableChanger changer(altered_table, table);
 
       error = storage_rnd_init(true);
       if (error) {
@@ -15515,34 +15464,6 @@ bool ha_mroonga::commit_inplace_alter_table(
   }
   DBUG_RETURN(result);
 }
-
-void ha_mroonga::wrapper_notify_table_changed()
-{
-  MRN_DBUG_ENTER_METHOD();
-  MRN_SET_WRAP_SHARE_KEY(share, table->s);
-  MRN_SET_WRAP_TABLE_KEY(this, table);
-  wrap_handler->ha_notify_table_changed();
-  MRN_SET_BASE_SHARE_KEY(share, table->s);
-  MRN_SET_BASE_TABLE_KEY(this, table);
-  DBUG_VOID_RETURN;
-}
-
-void ha_mroonga::storage_notify_table_changed()
-{
-  MRN_DBUG_ENTER_METHOD();
-  DBUG_VOID_RETURN;
-}
-
-void ha_mroonga::notify_table_changed()
-{
-  MRN_DBUG_ENTER_METHOD();
-  if (share->wrapper_mode) {
-    wrapper_notify_table_changed();
-  } else {
-    storage_notify_table_changed();
-  }
-  DBUG_VOID_RETURN;
-}
 #else
 alter_table_operations ha_mroonga::wrapper_alter_table_flags(alter_table_operations flags)
 {
@@ -16460,38 +16381,6 @@ void ha_mroonga::change_table_ptr(TABLE *table_arg, TABLE_SHARE *share_arg)
   DBUG_VOID_RETURN;
 }
 
-bool ha_mroonga::wrapper_primary_key_is_clustered()
-{
-  MRN_DBUG_ENTER_METHOD();
-  bool is_clustered;
-  MRN_SET_WRAP_SHARE_KEY(share, table->s);
-  MRN_SET_WRAP_TABLE_KEY(this, table);
-  is_clustered = wrap_handler->primary_key_is_clustered();
-  MRN_SET_BASE_SHARE_KEY(share, table->s);
-  MRN_SET_BASE_TABLE_KEY(this, table);
-  DBUG_RETURN(is_clustered);
-}
-
-bool ha_mroonga::storage_primary_key_is_clustered()
-{
-  MRN_DBUG_ENTER_METHOD();
-  bool is_clustered = handler::primary_key_is_clustered();
-  DBUG_RETURN(is_clustered);
-}
-
-bool ha_mroonga::primary_key_is_clustered()
-{
-  MRN_DBUG_ENTER_METHOD();
-  bool is_clustered;
-  if (share && share->wrapper_mode)
-  {
-    is_clustered = wrapper_primary_key_is_clustered();
-  } else {
-    is_clustered = storage_primary_key_is_clustered();
-  }
-  DBUG_RETURN(is_clustered);
-}
-
 bool ha_mroonga::wrapper_is_fk_defined_on_table_or_index(uint index)
 {
   MRN_DBUG_ENTER_METHOD();
@@ -17072,7 +16961,7 @@ void ha_mroonga::unbind_psi()
   DBUG_VOID_RETURN;
 }
 
-void ha_mroonga::wrapper_rebind_psi()
+void ha_mroonga::wrapper_rebind()
 {
   MRN_DBUG_ENTER_METHOD();
   MRN_SET_WRAP_SHARE_KEY(share, table->s);
@@ -17083,7 +16972,7 @@ void ha_mroonga::wrapper_rebind_psi()
   DBUG_VOID_RETURN;
 }
 
-void ha_mroonga::storage_rebind_psi()
+void ha_mroonga::storage_rebind()
 {
   MRN_DBUG_ENTER_METHOD();
   DBUG_VOID_RETURN;
@@ -17095,9 +16984,9 @@ void ha_mroonga::rebind_psi()
   handler::rebind_psi();
   if (share->wrapper_mode)
   {
-    wrapper_rebind_psi();
+    wrapper_rebind();
   } else {
-    storage_rebind_psi();
+    storage_rebind();
   }
   DBUG_VOID_RETURN;
 }
diff --git a/storage/mroonga/ha_mroonga.hpp b/storage/mroonga/ha_mroonga.hpp
index 7736ac52dba..66767899e21 100644
--- a/storage/mroonga/ha_mroonga.hpp
+++ b/storage/mroonga/ha_mroonga.hpp
@@ -461,7 +461,8 @@ public:
   uint max_supported_key_length()      const mrn_override;
   uint max_supported_key_part_length() const mrn_override;
 
-  ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key) mrn_override;
+  ha_rows records_in_range(uint inx, const key_range *min_key,
+                           const key_range *max_key, page_range *pages) mrn_override;
   int index_init(uint idx, bool sorted) mrn_override;
   int index_end() mrn_override;
 #ifndef MRN_HANDLER_HAVE_HA_INDEX_READ_MAP
@@ -612,7 +613,6 @@ protected:
   int index_last(uchar *buf) mrn_override;
 #endif
   void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share_arg) mrn_override;
-  bool primary_key_is_clustered() mrn_override;
   bool is_fk_defined_on_table_or_index(uint index) mrn_override;
   char *get_foreign_key_create_info() mrn_override;
 #ifdef MRN_HANDLER_HAVE_GET_TABLESPACE_NAME
@@ -641,7 +641,6 @@ protected:
   bool commit_inplace_alter_table(TABLE *altered_table,
                                   Alter_inplace_info *ha_alter_info,
                                   bool commit) mrn_override;
-  void notify_table_changed() mrn_override;
 #endif
 
 private:
@@ -981,12 +980,14 @@ private:
   int storage_rnd_pos(uchar *buf, uchar *pos);
   void wrapper_position(const uchar *record);
   void storage_position(const uchar *record);
-  ha_rows wrapper_records_in_range(uint key_nr, key_range *range_min,
-                                   key_range *range_max);
-  ha_rows storage_records_in_range(uint key_nr, key_range *range_min,
-                                   key_range *range_max);
-  ha_rows generic_records_in_range_geo(uint key_nr, key_range *range_min,
-                                       key_range *range_max);
+  ha_rows wrapper_records_in_range(uint key_nr, const key_range *range_min,
+                                   const key_range *range_max,
+                                   page_range *pages);
+  ha_rows storage_records_in_range(uint key_nr, const key_range *range_min,
+                                   const key_range *range_max,
+                                   page_range *pages);
+  ha_rows generic_records_in_range_geo(uint key_nr, const key_range *range_min,
+                                       const key_range *range_max);
   int wrapper_index_init(uint idx, bool sorted);
   int storage_index_init(uint idx, bool sorted);
   int wrapper_index_end();
@@ -1201,8 +1202,6 @@ private:
   bool storage_commit_inplace_alter_table(TABLE *altered_table,
                                           Alter_inplace_info *ha_alter_info,
                                           bool commit);
-  void wrapper_notify_table_changed();
-  void storage_notify_table_changed();
 #else
   alter_table_operations wrapper_alter_table_flags(alter_table_operations flags);
   alter_table_operations storage_alter_table_flags(alter_table_operations flags);
@@ -1258,8 +1257,6 @@ private:
   int storage_start_stmt(THD *thd, thr_lock_type lock_type);
   void wrapper_change_table_ptr(TABLE *table_arg, TABLE_SHARE *share_arg);
   void storage_change_table_ptr(TABLE *table_arg, TABLE_SHARE *share_arg);
-  bool wrapper_primary_key_is_clustered();
-  bool storage_primary_key_is_clustered();
   bool wrapper_is_fk_defined_on_table_or_index(uint index);
   bool storage_is_fk_defined_on_table_or_index(uint index);
   char *wrapper_get_foreign_key_create_info();
@@ -1288,8 +1285,8 @@ private:
 #ifdef MRN_HAVE_HA_REBIND_PSI
   void wrapper_unbind_psi();
   void storage_unbind_psi();
-  void wrapper_rebind_psi();
-  void storage_rebind_psi();
+  void wrapper_rebind();
+  void storage_rebind();
 #endif
   my_bool wrapper_register_query_cache_table(THD *thd,
                                              const char *table_key,
diff --git a/storage/mroonga/lib/mrn_external_lock.cpp b/storage/mroonga/lib/mrn_external_lock.cpp
index 512a20a00af..762a96d0455 100644
--- a/storage/mroonga/lib/mrn_external_lock.cpp
+++ b/storage/mroonga/lib/mrn_external_lock.cpp
@@ -33,7 +33,7 @@ namespace mrn {
 
   ExternalLock::~ExternalLock() {
     if (lock_type_ != F_UNLCK) {
-      handler_->ha_external_lock(thd_, F_UNLCK);
+      handler_->ha_external_unlock(thd_);  // not called when lock_type_ is already F_UNLCK
     }
   }
 
diff --git a/storage/mroonga/lib/mrn_table_fields_offset_mover.cpp b/storage/mroonga/lib/mrn_table_fields_offset_mover.cpp
index f230900dd65..bc2d2f258ed 100644
--- a/storage/mroonga/lib/mrn_table_fields_offset_mover.cpp
+++ b/storage/mroonga/lib/mrn_table_fields_offset_mover.cpp
@@ -20,22 +20,28 @@
 #include "mrn_table_fields_offset_mover.hpp"
 
 namespace mrn {
-  TableFieldsOffsetMover::TableFieldsOffsetMover(TABLE *table,
-                                                 my_ptrdiff_t diff)
-    : table_(table),
-      diff_(diff) {
-    uint n_columns = table_->s->fields;
+  FieldTableChanger::FieldTableChanger(TABLE *table,  // table whose Field objects get modified
+                                       TABLE *new_table)  // table those fields are pointed at
+    : old_table_(table),
+      new_table_(new_table) {
+    my_ptrdiff_t diff =  // offset between the two tables' record[0] buffers
+            PTR_BYTE_DIFF(new_table_->record[0], old_table_->record[0]);
+    uint n_columns = old_table_->s->fields;
     for (uint i = 0; i < n_columns; ++i) {
-      Field *field = table_->field[i];
-      field->move_field_offset(diff_);
+      Field *field = old_table_->field[i];
+      field->move_field_offset(diff);
+      field->table = new_table;  // same pointer as new_table_; the destructor sets old_table_ back
     }
   }
 
-  TableFieldsOffsetMover::~TableFieldsOffsetMover() {
-    uint n_columns = table_->s->fields;
+  FieldTableChanger::~FieldTableChanger() {  // applies the constructor's changes in reverse
+    my_ptrdiff_t diff =  // recomputed here; assumes neither record[0] changed since construction
+            PTR_BYTE_DIFF(new_table_->record[0], old_table_->record[0]);
+    uint n_columns = old_table_->s->fields;
     for (uint i = 0; i < n_columns; ++i) {
-      Field *field = table_->field[i];
-      field->move_field_offset(-diff_);
+      Field *field = old_table_->field[i];
+      field->move_field_offset(-diff);  // negated offset passed to the same call the constructor made
+      field->table = old_table_;
     }
   }
 }
diff --git a/storage/mroonga/lib/mrn_table_fields_offset_mover.hpp b/storage/mroonga/lib/mrn_table_fields_offset_mover.hpp
index 49311b8df47..94f967728c0 100644
--- a/storage/mroonga/lib/mrn_table_fields_offset_mover.hpp
+++ b/storage/mroonga/lib/mrn_table_fields_offset_mover.hpp
@@ -22,12 +22,12 @@
 #include <mrn_mysql.h>
 
 namespace mrn {
-  class TableFieldsOffsetMover {
+  class FieldTableChanger {  // scoped helper: constructor modifies fields, destructor reverts them
   public:
-    TableFieldsOffsetMover(TABLE *table, my_ptrdiff_t diff);
-    ~TableFieldsOffsetMover();
+    FieldTableChanger(TABLE *table, TABLE *new_table);  // table: fields to modify; new_table: target
+    ~FieldTableChanger();  // moves the fields back and resets Field::table to old_table_
   private:
-    TABLE *table_;
-    my_ptrdiff_t diff_;
+    TABLE *old_table_;  // the constructor's first argument
+    TABLE *new_table_;  // the constructor's second argument
   };
 }
diff --git a/storage/mroonga/mrn_mysql_compat.h b/storage/mroonga/mrn_mysql_compat.h
index bdb15637e31..08d874fabaf 100644
--- a/storage/mroonga/mrn_mysql_compat.h
+++ b/storage/mroonga/mrn_mysql_compat.h
@@ -60,19 +60,8 @@
 #  define KEY_N_KEY_PARTS(key) (key)->key_parts
 #endif
 
-#if defined(MRN_MARIADB_P) && MYSQL_VERSION_ID >= 100213
 #  define mrn_init_alloc_root(PTR, SZ1, SZ2, FLAG) \
-  init_alloc_root(PTR, "mroonga", SZ1, SZ2, FLAG)
-#elif defined(MRN_MARIADB_P) && MYSQL_VERSION_ID >= 100000
-#  define mrn_init_alloc_root(PTR, SZ1, SZ2, FLAG) \
-  init_alloc_root(PTR, SZ1, SZ2, FLAG)
-#elif MYSQL_VERSION_ID >= 50706
-#  define mrn_init_alloc_root(PTR, SZ1, SZ2, FLAG) \
-  init_alloc_root(mrn_memory_key, PTR, SZ1, SZ2)
-#else
-#  define mrn_init_alloc_root(PTR, SZ1, SZ2, FLAG) \
-  init_alloc_root(PTR, SZ1, SZ2)
-#endif
+  init_alloc_root(mrn_memory_key, PTR, SZ1, SZ2, FLAG)
 
 #if MYSQL_VERSION_ID < 100002 || !defined(MRN_MARIADB_P)
 #  define GTS_TABLE 0
@@ -144,11 +133,10 @@
 #  define MRN_SEVERITY_WARNING Sql_condition::WARN_LEVEL_WARN
 #endif
 
-#if MYSQL_VERSION_ID >= 50706 && !defined(MRN_MARIADB_P)
-#  define MRN_HAVE_PSI_MEMORY_KEY
+#ifdef HAVE_PSI_MEMORY_INTERFACE
+#define MRN_HAVE_PSI_MEMORY_KEY
 #endif
 
-#ifdef MRN_HAVE_PSI_MEMORY_KEY
 #  define mrn_my_malloc(size, flags) \
   my_malloc(mrn_memory_key, size, flags)
 #  define mrn_my_strdup(string, flags) \
@@ -157,14 +145,6 @@
   my_strndup(mrn_memory_key, string, size, flags)
 #  define mrn_my_multi_malloc(flags, ...) \
   my_multi_malloc(mrn_memory_key, flags, __VA_ARGS__)
-#else
-#  define mrn_my_malloc(size, flags) my_malloc(size, flags)
-#  define mrn_my_strdup(string, flags) my_strdup(string, flags)
-#  define mrn_my_strndup(string, size, flags) \
-  my_strndup(string, size, flags)
-#  define mrn_my_multi_malloc(flags, ...) \
-  my_multi_malloc(flags, __VA_ARGS__)
-#endif
 
 #if MYSQL_VERSION_ID >= 50706 && !defined(MRN_MARIADB_P)
 #  define MRN_STRING_FREE(string) string.mem_free();
@@ -240,40 +220,11 @@
   ((select_lex)->options)
 #endif
 
-#if defined(MRN_MARIADB_P) && MYSQL_VERSION_ID >= 100000
-#  if MYSQL_VERSION_ID >= 100213
-#    define mrn_init_sql_alloc(thd, mem_root)                           \
-  init_sql_alloc(mem_root, "Mroonga",                                   \
-                 TABLE_ALLOC_BLOCK_SIZE,                                \
-                 0,                                                     \
-                 MYF(thd->slave_thread ? 0 : MY_THREAD_SPECIFIC))
-#elif MYSQL_VERSION_ID >= 100104
 #    define mrn_init_sql_alloc(thd, mem_root)                           \
-  init_sql_alloc(mem_root,                                              \
+  init_sql_alloc(mrn_memory_key, mem_root,                              \
                  TABLE_ALLOC_BLOCK_SIZE,                                \
                  0,                                                     \
                  MYF(thd->slave_thread ? 0 : MY_THREAD_SPECIFIC))
-#  else
-#    define mrn_init_sql_alloc(thd, mem_root)           \
-  init_sql_alloc(mem_root,                              \
-                 TABLE_ALLOC_BLOCK_SIZE,                \
-                 0,                                     \
-                 MYF(0))
-#  endif
-#else
-#  if MYSQL_VERSION_ID >= 50709
-#    define mrn_init_sql_alloc(thd, mem_root)           \
-  init_sql_alloc(mrn_memory_key,                        \
-                 mem_root,                              \
-                 TABLE_ALLOC_BLOCK_SIZE,                \
-                 0)
-#  else
-#    define mrn_init_sql_alloc(thd, mem_root)           \
-  init_sql_alloc(mem_root,                              \
-                 TABLE_ALLOC_BLOCK_SIZE,                \
-                 0)
-#  endif
-#endif
 
 #ifdef MRN_MARIADB_P
 #  define MRN_ABORT_ON_WARNING(thd) thd->abort_on_warning
@@ -288,7 +239,6 @@
 #define MRN_ERROR_CODE_DATA_TRUNCATE(thd)                               \
   (MRN_ABORT_ON_WARNING(thd) ? ER_WARN_DATA_OUT_OF_RANGE : WARN_DATA_TRUNCATED)
 
-#if MYSQL_VERSION_ID >= 50709 && !defined(MRN_MARIADB_P)
 #  define mrn_my_hash_init(hash,                        \
                            charset,                     \
                            default_array_elements,      \
@@ -297,25 +247,7 @@
                            get_key,                     \
                            free_element,                \
                            flags)                       \
-  my_hash_init(hash,                                    \
-               charset,                                 \
-               default_array_elements,                  \
-               key_offset,                              \
-               key_length,                              \
-               get_key,                                 \
-               free_element,                            \
-               flags,                                   \
-               mrn_memory_key)
-#else
-#  define mrn_my_hash_init(hash,                        \
-                           charset,                     \
-                           default_array_elements,      \
-                           key_offset,                  \
-                           key_length,                  \
-                           get_key,                     \
-                           free_element,                \
-                           flags)                       \
-  my_hash_init(hash,                                    \
+  my_hash_init(mrn_memory_key, hash,                    \
                charset,                                 \
                default_array_elements,                  \
                key_offset,                              \
@@ -323,7 +255,6 @@
                get_key,                                 \
                free_element,                            \
                flags)
-#endif
 
 #if defined(MRN_MARIADB_P) && MYSQL_VERSION_ID >= 100000
 #  define mrn_strconvert(from_cs,               \
diff --git a/storage/mroonga/mrn_table.cpp b/storage/mroonga/mrn_table.cpp
index eedb03544d0..037a6a59487 100644
--- a/storage/mroonga/mrn_table.cpp
+++ b/storage/mroonga/mrn_table.cpp
@@ -1080,6 +1080,7 @@ TABLE_SHARE *mrn_create_tmp_table_share(TABLE_LIST *table_list, const char *path
   if (open_table_def(thd, share, GTS_TABLE))
   {
     *error = ER_CANT_OPEN_FILE;
+    mrn_free_tmp_table_share(share);
     DBUG_RETURN(NULL);
   }
   DBUG_RETURN(share);
@@ -1140,7 +1141,7 @@ st_mrn_slot_data *mrn_get_slot_data(THD *thd, bool can_create)
 {
   MRN_DBUG_ENTER_FUNCTION();
   st_mrn_slot_data *slot_data =
-    (st_mrn_slot_data*) *thd_ha_data(thd, mrn_hton_ptr);
+    (st_mrn_slot_data*) thd_get_ha_data(thd, mrn_hton_ptr);
   if (slot_data == NULL) {
     slot_data = (st_mrn_slot_data*) malloc(sizeof(st_mrn_slot_data));
     slot_data->last_insert_record_id = GRN_ID_NIL;
@@ -1149,7 +1150,7 @@ st_mrn_slot_data *mrn_get_slot_data(THD *thd, bool can_create)
     slot_data->disable_keys_create_info = NULL;
     slot_data->alter_connect_string = NULL;
     slot_data->alter_comment = NULL;
-    *thd_ha_data(thd, mrn_hton_ptr) = (void *) slot_data;
+    thd_set_ha_data(thd, mrn_hton_ptr, slot_data);
     {
       mrn::Lock lock(&mrn_allocated_thds_mutex);
       if (my_hash_insert(&mrn_allocated_thds, (uchar*) thd))
diff --git a/storage/mroonga/mrn_variables.hpp b/storage/mroonga/mrn_variables.hpp
index f55b1fd35a0..8a0113c5e14 100644
--- a/storage/mroonga/mrn_variables.hpp
+++ b/storage/mroonga/mrn_variables.hpp
@@ -22,9 +22,7 @@
 
 #include "mrn_mysql_compat.h"
 
-#ifdef MRN_HAVE_PSI_MEMORY_KEY
 extern PSI_memory_key mrn_memory_key;
-#endif
 
 namespace mrn {
   namespace variables {
diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_default.result b/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_default.result
index 7f9ddd50e92..34545ecc30a 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_default.result
+++ b/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_default.result
@@ -1,6 +1,6 @@
 drop table if exists diaries;
-set @mroonga_default_parser_backup=@@mroonga_default_parser;
-set global mroonga_default_parser=TokenBigramSplitSymbolAlphaDigit;
+set @mroonga_default_tokenizer_backup=@@mroonga_default_tokenizer;
+set global mroonga_default_tokenizer=TokenBigramSplitSymbolAlphaDigit;
 create table diaries (
 id int primary key auto_increment,
 body text,
@@ -22,4 +22,4 @@ id	body
 2	starting Groonga...
 3	started Groonga.
 drop table diaries;
-set global mroonga_default_parser=@mroonga_default_parser_backup;
+set global mroonga_default_tokenizer=@mroonga_default_tokenizer_backup;
diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_off.result b/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_off.result
index 320fb9a5635..f827c0ad529 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_off.result
+++ b/storage/mroonga/mysql-test/mroonga/storage/r/create_table_index_parser_off.result
@@ -9,7 +9,7 @@ Warning	1287	'parser' is deprecated and will be removed in a future release. Ple
 INSERT INTO variables (name) VALUES ("mroonga_database_path_prefix");
 Warnings:
 Warning	1287	'parser' is deprecated and will be removed in a future release. Please use tokenizer instead
-INSERT INTO variables (name) VALUES ("mroonga_default_parser");
+INSERT INTO variables (name) VALUES ("mroonga_default_tokenizer");
 INSERT INTO variables (name) VALUES ("mroonga_default_wrapper_engine");
 INSERT INTO variables (name) VALUES ("mroonga_dry_write");
 INSERT INTO variables (name) VALUES ("mroonga_enable_optimization");
@@ -21,7 +21,7 @@ INSERT INTO variables (name) VALUES ("mroonga_version");
 SELECT * FROM variables;
 id	name
 1	mroonga_database_path_prefix
-2	mroonga_default_parser
+2	mroonga_default_tokenizer
 3	mroonga_default_wrapper_engine
 4	mroonga_dry_write
 5	mroonga_enable_optimization
@@ -34,7 +34,7 @@ SELECT * FROM variables
 WHERE MATCH (name) AGAINST ("mroonga_default*" IN BOOLEAN MODE);
 id	name
 3	mroonga_default_wrapper_engine
-2	mroonga_default_parser
+2	mroonga_default_tokenizer
 DROP TABLE variables;
 Warnings:
 Warning	1287	'parser' is deprecated and will be removed in a future release. Please use tokenizer instead
diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/function_escape_query_match_against.result b/storage/mroonga/mysql-test/mroonga/storage/r/function_escape_query_match_against.result
index 8b92ec4137e..adf6b3a4770 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/r/function_escape_query_match_against.result
+++ b/storage/mroonga/mysql-test/mroonga/storage/r/function_escape_query_match_against.result
@@ -1,5 +1,5 @@
 DROP TABLE IF EXISTS memos;
-SET GLOBAL mroonga_default_parser = TokenDelimit;
+SET GLOBAL mroonga_default_tokenizer = TokenDelimit;
 SET NAMES utf8mb4;
 CREATE TABLE memos (
 id INT PRIMARY KEY,
@@ -15,4 +15,4 @@ id	content
 1	(Groonga) Installed!
 3	(Groonga) Upgraded!
 DROP TABLE memos;
-SET GLOBAL mroonga_default_parser = TokenBigram;
+SET GLOBAL mroonga_default_tokenizer = TokenBigram;
diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/i_s.result b/storage/mroonga/mysql-test/mroonga/storage/r/i_s.result
new file mode 100644
index 00000000000..80a26a1c74b
--- /dev/null
+++ b/storage/mroonga/mysql-test/mroonga/storage/r/i_s.result
@@ -0,0 +1,7 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.MROONGA_STATS;
+Table	Create Table
+Mroonga_stats	CREATE TEMPORARY TABLE `Mroonga_stats` (
+  `VERSION` varchar(40) NOT NULL,
+  `rows_written` int(11) NOT NULL,
+  `rows_read` int(11) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/mroonga/mysql-test/mroonga/storage/r/optimization_count_skip_index_not_equal.result b/storage/mroonga/mysql-test/mroonga/storage/r/optimization_count_skip_index_not_equal.result
index 199edf9d758..a1a123e7d5f 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/r/optimization_count_skip_index_not_equal.result
+++ b/storage/mroonga/mysql-test/mroonga/storage/r/optimization_count_skip_index_not_equal.result
@@ -14,5 +14,5 @@ COUNT(*)
 2
 SHOW STATUS LIKE 'mroonga_count_skip';
 Variable_name	Value
-Mroonga_count_skip	0
+Mroonga_count_skip	2
 DROP TABLE users;
diff --git a/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_default.test b/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_default.test
index 75ee1882271..773c740b733 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_default.test
+++ b/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_default.test
@@ -21,8 +21,8 @@
 drop table if exists diaries;
 --enable_warnings
 
-set @mroonga_default_parser_backup=@@mroonga_default_parser;
-set global mroonga_default_parser=TokenBigramSplitSymbolAlphaDigit;
+set @mroonga_default_tokenizer_backup=@@mroonga_default_tokenizer;
+set global mroonga_default_tokenizer=TokenBigramSplitSymbolAlphaDigit;
 create table diaries (
   id int primary key auto_increment,
   body text,
@@ -35,6 +35,6 @@ insert into diaries (body) values ("finished Groonga.");
 select * from diaries;
 select * from diaries where match(body) against("+start" IN BOOLEAN MODE) order by id;
 drop table diaries;
-set global mroonga_default_parser=@mroonga_default_parser_backup;
+set global mroonga_default_tokenizer=@mroonga_default_tokenizer_backup;
 
 --source ../../include/mroonga/have_mroonga_deinit.inc
diff --git a/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_off.test b/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_off.test
index 7888cb33861..f62a6a0fe6c 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_off.test
+++ b/storage/mroonga/mysql-test/mroonga/storage/t/create_table_index_parser_off.test
@@ -28,7 +28,7 @@ CREATE TABLE variables (
 ) DEFAULT CHARSET=utf8;
 
 INSERT INTO variables (name) VALUES ("mroonga_database_path_prefix");
-INSERT INTO variables (name) VALUES ("mroonga_default_parser");
+INSERT INTO variables (name) VALUES ("mroonga_default_tokenizer");
 INSERT INTO variables (name) VALUES ("mroonga_default_wrapper_engine");
 INSERT INTO variables (name) VALUES ("mroonga_dry_write");
 INSERT INTO variables (name) VALUES ("mroonga_enable_optimization");
diff --git a/storage/mroonga/mysql-test/mroonga/storage/t/function_escape_query_match_against.test b/storage/mroonga/mysql-test/mroonga/storage/t/function_escape_query_match_against.test
index 442aceccd2e..bab9f9179c1 100644
--- a/storage/mroonga/mysql-test/mroonga/storage/t/function_escape_query_match_against.test
+++ b/storage/mroonga/mysql-test/mroonga/storage/t/function_escape_query_match_against.test
@@ -22,7 +22,7 @@
 DROP TABLE IF EXISTS memos;
 --enable_warnings
 
-SET GLOBAL mroonga_default_parser = TokenDelimit;
+SET GLOBAL mroonga_default_tokenizer = TokenDelimit;
 
 SET NAMES utf8mb4;
 CREATE TABLE memos (
@@ -40,7 +40,7 @@ SELECT * FROM memos
 
 DROP TABLE memos;
 
-SET GLOBAL mroonga_default_parser = TokenBigram;
+SET GLOBAL mroonga_default_tokenizer = TokenBigram;
 
 --source ../../include/mroonga/unload_mroonga_functions.inc
 --source ../../include/mroonga/have_mroonga_deinit.inc
diff --git a/storage/mroonga/mysql-test/mroonga/storage/t/i_s.test b/storage/mroonga/mysql-test/mroonga/storage/t/i_s.test
new file mode 100644
index 00000000000..fdb8e205b38
--- /dev/null
+++ b/storage/mroonga/mysql-test/mroonga/storage/t/i_s.test
@@ -0,0 +1,23 @@
+# Copyright (c) 2019, MariaDB
+# Copyright(C) 2014 Naoya Murakami <naoya@createfield.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
+
+--source include/not_embedded.inc
+--source ../../include/mroonga/have_mroonga.inc
+
+SHOW CREATE TABLE INFORMATION_SCHEMA.MROONGA_STATS;
+
+--source ../../include/mroonga/have_mroonga_deinit.inc
diff --git a/storage/mroonga/mysql-test/mroonga/wrapper/r/transaction_rollback_delete_delete.result b/storage/mroonga/mysql-test/mroonga/wrapper/r/transaction_rollback_delete_delete.result
index 9082032ff6e..8c61d9e865a 100644
--- a/storage/mroonga/mysql-test/mroonga/wrapper/r/transaction_rollback_delete_delete.result
+++ b/storage/mroonga/mysql-test/mroonga/wrapper/r/transaction_rollback_delete_delete.result
@@ -28,7 +28,7 @@ MATCH(body) AGAINST("groonga");
 id	title	body
 DELETE FROM diaries WHERE id = 1;
 Warnings:
-Warning	1026	failed to get record ID for deleting from groonga: key=<>
+Warning	1026	failed to get record ID for deleting from groonga: key=<\0001>
 SELECT * FROM diaries;
 id	title	body
 2	groonga (1)	starting groonga...
diff --git a/storage/mroonga/vendor/groonga/CMakeLists.txt b/storage/mroonga/vendor/groonga/CMakeLists.txt
index 8afa53be1e0..27afb935467 100644
--- a/storage/mroonga/vendor/groonga/CMakeLists.txt
+++ b/storage/mroonga/vendor/groonga/CMakeLists.txt
@@ -407,7 +407,7 @@ if(NOT ${GRN_WITH_MECAB} STREQUAL "no")
     set(MECAB_LIBRARIES libmecab)
   else()
     set(GRN_MECAB_CONFIG "mecab-config" CACHE FILEPATH "mecab-config path")
-    if(NOT CMAKE_CROSSCOMPILING)
+    if(NOT CMAKE_CROSSCOMPILING OR DEFINED CMAKE_CROSSCOMPILING_EMULATOR)
       find_program(GRN_MECAB_CONFIG_ABSOLUTE_PATH "${GRN_MECAB_CONFIG}")
     endif()
     if(EXISTS "${GRN_MECAB_CONFIG_ABSOLUTE_PATH}")
diff --git a/storage/mroonga/vendor/groonga/configure.ac b/storage/mroonga/vendor/groonga/configure.ac
index 414876c6a26..cab122ad3a5 100644
--- a/storage/mroonga/vendor/groonga/configure.ac
+++ b/storage/mroonga/vendor/groonga/configure.ac
@@ -1613,30 +1613,6 @@ AC_SUBST(ONIGMO_CFLAGS)
 AC_SUBST(ONIGMO_LIBS)
 AM_CONDITIONAL(WITH_BUNDLED_ONIGMO, test "$with_onigmo" != "no" -a "x$have_onigmo" != "xyes")
 
-# PCRE
-GRN_WITH_PCRE=no
-AC_ARG_WITH(pcre,
-  [AS_HELP_STRING([--without-pcre],
-    [Don't use PCRE for groonga-httpd. [default=auto-detect]])],
-  [with_pcre="$withval"],
-  [with_pcre="auto"])
-if test "x$with_pcre" != "xno"; then
-  m4_ifdef([PKG_CHECK_MODULES], [
-    PKG_CHECK_MODULES([PCRE], [libpcre],
-      [_PKG_CONFIG(PCRE_LIBS_ONLY_L, [libs-only-L], [libpcre])
-       PCRE_LIBS_ONLY_L="$pkg_cv_PCRE_LIBS_ONLY_L"
-       GRN_WITH_PCRE=yes],
-      [GRN_WITH_PCRE=no])
-    ],
-    [GRN_WITH_PCRE=no])
-  if test "x$with_pcre" = "xyes" -a "$GRN_WITH_PCRE" != "yes"; then
-    AC_MSG_ERROR("No PCRE found")
-  fi
-fi
-AC_SUBST(GRN_WITH_PCRE)
-AC_SUBST(PCRE_CFLAGS)
-AC_SUBST(PCRE_LIBS_ONLY_L)
-
 # SSL
 GRN_WITH_SSL=no
 AC_ARG_WITH(ssl,
@@ -1788,11 +1764,6 @@ echo "groonga-httpd:"
 echo "  enable:                $enable_groonga_httpd"
 if test "$enable_groonga_httpd" = "yes"; then
   echo "  default database path: $GROONGA_HTTPD_DEFAULT_DATABASE_PATH"
-  echo "  PCRE:                  $GRN_WITH_PCRE"
-  if test "$GRN_WITH_PCRE" = "yes"; then
-    echo "    CFLAGS:              $PCRE_CFLAGS"
-    echo "    LIBS only -L:        $PCRE_LIBS_ONLY_L"
-  fi
   echo "   SSL:                  $GRN_WITH_SSL"
   if test "$GRN_WITH_SSL" = "yes"; then
     echo "    CFLAGS:              $SSL_CFLAGS"
diff --git a/storage/mroonga/vendor/groonga/lib/com.c b/storage/mroonga/vendor/groonga/lib/com.c
index cc03d6462c3..7761f4838e2 100644
--- a/storage/mroonga/vendor/groonga/lib/com.c
+++ b/storage/mroonga/vendor/groonga/lib/com.c
@@ -343,7 +343,7 @@ grn_com_event_add(grn_ctx *ctx, grn_com_event *ev, grn_sock fd, int events, grn_
 {
   grn_com *c;
   /* todo : expand events */
-  if (!ev || *ev->hash->n_entries == ev->max_nevents) {
+  if (!ev || *ev->hash->n_entries == (uint32_t) ev->max_nevents) {
     if (ev) { GRN_LOG(ctx, GRN_LOG_ERROR, "too many connections (%d)", ev->max_nevents); }
     return GRN_INVALID_ARGUMENT;
   }
@@ -757,7 +757,7 @@ grn_com_send(grn_ctx *ctx, grn_com *cs,
       rc = ctx->rc;
     }
   }
-  if (ret != whole_size) {
+  if ((size_t) ret != whole_size) {
     GRN_LOG(ctx, GRN_LOG_ERROR,
             "sendmsg(%" GRN_FMT_SOCKET "): %" GRN_FMT_LLD " < %" GRN_FMT_LLU,
             cs->fd, (long long int)ret, (unsigned long long int)whole_size);
diff --git a/storage/mroonga/vendor/groonga/lib/config.c b/storage/mroonga/vendor/groonga/lib/config.c
index 8a0e3a5268c..6664e1270ad 100644
--- a/storage/mroonga/vendor/groonga/lib/config.c
+++ b/storage/mroonga/vendor/groonga/lib/config.c
@@ -51,7 +51,7 @@ grn_config_set(grn_ctx *ctx,
   if (value_size == -1) {
     value_size = strlen(value);
   }
-  if (value_size > GRN_CONFIG_MAX_VALUE_SIZE) {
+  if ((size_t) value_size > GRN_CONFIG_MAX_VALUE_SIZE) {
     ERR(GRN_INVALID_ARGUMENT,
         "[config][set] too large value: max=<%" GRN_FMT_SIZE ">: <%d>",
         GRN_CONFIG_MAX_VALUE_SIZE, value_size);
diff --git a/storage/mroonga/vendor/groonga/lib/db.c b/storage/mroonga/vendor/groonga/lib/db.c
index 418335aaf00..7749d4c0165 100644
--- a/storage/mroonga/vendor/groonga/lib/db.c
+++ b/storage/mroonga/vendor/groonga/lib/db.c
@@ -1302,7 +1302,7 @@ grn_table_get_subrecs(grn_ctx *ctx, grn_obj *table, grn_id id,
       byte *psubrec = (byte *)ri->subrecs;
       uint32_t n_subrecs = (uint32_t)GRN_RSET_N_SUBRECS(ri);
       uint32_t limit = value_size / (GRN_RSET_SCORE_SIZE + subrec_size);
-      if (limit > buf_size) {
+      if ((int) limit > buf_size) {
         limit = buf_size;
       }
       if (limit > n_subrecs) {
@@ -1525,7 +1525,7 @@ grn_table_add(grn_ctx *ctx, grn_obj *table, const void *key, unsigned int key_si
       if (hooks) {
         // todo : grn_proc_ctx_open()
         grn_obj id_, flags_, oldvalue_, value_;
-        grn_proc_ctx pctx = {{0}, hooks->proc, NULL, hooks, hooks, PROC_INIT, 4, 4};
+        grn_proc_ctx pctx = {{0}, hooks->proc, NULL, hooks, hooks, PROC_INIT, 4, 4, {{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}}};
         GRN_UINT32_INIT(&id_, 0);
         GRN_UINT32_INIT(&flags_, 0);
         GRN_TEXT_INIT(&oldvalue_, 0);
@@ -1751,7 +1751,7 @@ grn_table_get_key(grn_ctx *ctx, grn_obj *table, grn_id id, void *keybuf, int buf
       {
         grn_array *a = (grn_array *)table;
         if (a->obj.header.domain) {
-          if (buf_size >= a->value_size) {
+          if ((unsigned int) buf_size >= a->value_size) {
             r = grn_array_get_value(ctx, a, id, keybuf);
           } else {
             r = a->value_size;
@@ -1826,7 +1826,7 @@ call_delete_hook(grn_ctx *ctx, grn_obj *table, grn_id rid, const void *key, unsi
     if (hooks) {
       // todo : grn_proc_ctx_open()
       grn_obj id_, flags_, oldvalue_, value_;
-      grn_proc_ctx pctx = {{0}, hooks->proc, NULL, hooks, hooks, PROC_INIT, 4, 4};
+      grn_proc_ctx pctx = {{0}, hooks->proc, NULL, hooks, hooks, PROC_INIT, 4, 4,  {{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}}};
       GRN_UINT32_INIT(&id_, 0);
       GRN_UINT32_INIT(&flags_, 0);
       GRN_TEXT_INIT(&oldvalue_, GRN_OBJ_DO_SHALLOW_COPY);
@@ -2466,7 +2466,7 @@ grn_table_cursor_open(grn_ctx *ctx, grn_obj *table,
     if (offset < 0) {
       ERR(GRN_TOO_SMALL_OFFSET,
           "can't use negative offset with GRN_CURSOR_PREFIX: %d", offset);
-    } else if (offset != 0 && offset >= table_size) {
+    } else if (offset != 0 && offset >= (int) table_size) {
       ERR(GRN_TOO_LARGE_OFFSET,
           "offset is not less than table size: offset:%d, table_size:%d",
           offset, table_size);
@@ -5237,7 +5237,7 @@ grn_vector_get_element(grn_ctx *ctx, grn_obj *vector,
     ERR(GRN_INVALID_ARGUMENT, "invalid vector");
     goto exit;
   }
-  if (vector->u.v.n_sections <= offset) {
+  if ((unsigned int) vector->u.v.n_sections <= offset) {
     ERR(GRN_RANGE_ERROR, "offset out of range");
     goto exit;
   }
@@ -7143,7 +7143,7 @@ call_hook(grn_ctx *ctx, grn_obj *obj, grn_id id, grn_obj *value, int flags)
     if (hooks) {
       // todo : grn_proc_ctx_open()
       grn_obj id_, flags_;
-      grn_proc_ctx pctx = {{0}, hooks->proc, NULL, hooks, hooks, PROC_INIT, 4, 4};
+      grn_proc_ctx pctx = {{0}, hooks->proc, NULL, hooks, hooks, PROC_INIT, 4, 4, {{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}}};
       GRN_UINT32_INIT(&id_, 0);
       GRN_UINT32_INIT(&flags_, 0);
       GRN_UINT32_SET(ctx, &id_, id);
@@ -7976,11 +7976,11 @@ grn_obj_get_values(grn_ctx *ctx, grn_obj *obj, grn_id offset, void **values)
     grn_obj *domain = grn_column_table(ctx, obj);
     if (domain) {
       int table_size = (int)grn_table_size(ctx, domain);
-      if (0 < offset && offset <= table_size) {
+      if (0 < offset && offset <= (grn_id) table_size) {
         grn_ra *ra = (grn_ra *)obj;
         void *p = grn_ra_ref(ctx, ra, offset);
         if (p) {
-          if ((offset >> ra->element_width) == (table_size >> ra->element_width)) {
+          if ((offset >> ra->element_width) == ((unsigned int) table_size >> ra->element_width)) {
             nrecords = (table_size & ra->element_mask) + 1 - (offset & ra->element_mask);
           } else {
             nrecords = ra->element_mask + 1 - (offset & ra->element_mask);
@@ -12581,7 +12581,7 @@ grn_column_find_index_data_column_equal(grn_ctx *ctx, grn_obj *obj,
     if (n < buf_size) {
       *ip++ = target;
     }
-    if (n < n_index_data) {
+    if ((unsigned int) n < n_index_data) {
       index_data[n].index = target;
       index_data[n].section = section;
     }
@@ -12643,7 +12643,7 @@ grn_column_find_index_data_column_match(grn_ctx *ctx, grn_obj *obj,
       if (n < buf_size) {
         *ip++ = target;
       }
-      if (n < n_index_data) {
+      if ((unsigned int) n < n_index_data) {
         index_data[n].index = target;
         index_data[n].section = section;
       }
@@ -12670,7 +12670,7 @@ grn_column_find_index_data_column_match(grn_ctx *ctx, grn_obj *obj,
     if (n < buf_size) {
       *ip++ = target;
     }
-    if (n < n_index_data) {
+    if ((unsigned int) n < n_index_data) {
       index_data[n].index = target;
       index_data[n].section = section;
     }
@@ -12724,7 +12724,7 @@ grn_column_find_index_data_column_range(grn_ctx *ctx, grn_obj *obj,
     if (n < buf_size) {
       *ip++ = target;
     }
-    if (n < n_index_data) {
+    if ((unsigned int) n < n_index_data) {
       index_data[n].index = target;
       index_data[n].section = section;
     }
@@ -12959,7 +12959,7 @@ grn_column_find_index_data_accessor_match(grn_ctx *ctx, grn_obj *obj,
         if (n < buf_size) {
           *ip++ = target;
         }
-        if (n < n_index_data) {
+        if ((unsigned int) n < n_index_data) {
           index_data[n].index = target;
           index_data[n].section = section;
         }
@@ -12985,7 +12985,7 @@ grn_column_find_index_data_accessor_match(grn_ctx *ctx, grn_obj *obj,
       if (n < buf_size) {
         *ip++ = index;
       }
-      if (n < n_index_data) {
+      if ((unsigned int) n < n_index_data) {
         index_data[n].index = index;
         index_data[n].section = section;
       }
@@ -13006,7 +13006,7 @@ grn_column_find_index_data_accessor_match(grn_ctx *ctx, grn_obj *obj,
       if (n < buf_size) {
         *ip++ = index;
       }
-      if (n < n_index_data) {
+      if ((unsigned int) n < n_index_data) {
         index_data[n].index = index;
         index_data[n].section = section;
       }
@@ -13617,7 +13617,7 @@ grn_table_sort_key_from_str(grn_ctx *ctx, const char *str, unsigned int str_size
 grn_rc
 grn_table_sort_key_close(grn_ctx *ctx, grn_table_sort_key *keys, unsigned int nkeys)
 {
-  int i;
+  unsigned int i;
   if (keys) {
     for (i = 0; i < nkeys; i++) {
       grn_obj *key = keys[i].key;
@@ -14022,7 +14022,7 @@ grn_ctx_merge_temporary_open_space(grn_ctx *ctx)
   GRN_API_ENTER;
 
   stack = &(ctx->impl->temporary_open_spaces.stack);
-  if (GRN_BULK_VSIZE(stack) < sizeof(grn_obj) * 2) {
+  if ((unsigned long) GRN_BULK_VSIZE(stack) < (unsigned long) sizeof(grn_obj) * 2) {
     ERR(GRN_INVALID_ARGUMENT,
         "[ctx][temporary-open-spaces][merge] "
         "merge requires at least two spaces");
diff --git a/storage/mroonga/vendor/groonga/lib/proc.c b/storage/mroonga/vendor/groonga/lib/proc.c
index 4897364146e..8ed39961488 100644
--- a/storage/mroonga/vendor/groonga/lib/proc.c
+++ b/storage/mroonga/vendor/groonga/lib/proc.c
@@ -989,7 +989,7 @@ parse_normalize_flags(grn_ctx *ctx, grn_obj *flag_names)
     }
 
 #define CHECK_FLAG(name)\
-    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
+    if (((unsigned long) (names_end - names) >= (unsigned long) (sizeof(#name) - 1)) && \
         (!memcmp(names, #name, sizeof(#name) - 1))) {\
       flags |= GRN_STRING_ ## name;\
       names += sizeof(#name) - 1;\
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_column.c b/storage/mroonga/vendor/groonga/lib/proc/proc_column.c
index 2e92f35fbbc..74d0d7a9e76 100644
--- a/storage/mroonga/vendor/groonga/lib/proc/proc_column.c
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_column.c
@@ -41,7 +41,7 @@ grn_proc_column_parse_flags(grn_ctx *ctx,
 
 #define CHECK_FLAG(name)                                                \
     name_size = strlen(#name);                                          \
-    if ((end - text) >= name_size &&                                    \
+    if ((unsigned long) (end - text) >= (unsigned long) name_size &&    \
         memcmp(text, #name, name_size) == 0) {                          \
       flags |= GRN_OBJ_ ## name;                                        \
       text += name_size;                                                \
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_dump.c b/storage/mroonga/vendor/groonga/lib/proc/proc_dump.c
index 7915ddfd329..391925d800b 100644
--- a/storage/mroonga/vendor/groonga/lib/proc/proc_dump.c
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_dump.c
@@ -495,7 +495,7 @@ dump_record(grn_ctx *ctx, grn_dumper *dumper,
     }
   }
   GRN_TEXT_PUTC(ctx, dumper->output, ']');
-  if (GRN_TEXT_LEN(dumper->output) >= DUMP_FLUSH_THRESHOLD_SIZE) {
+  if ((size_t) GRN_TEXT_LEN(dumper->output) >= DUMP_FLUSH_THRESHOLD_SIZE) {
     grn_ctx_output_flush(ctx, 0);
   }
 }
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_fuzzy_search.c b/storage/mroonga/vendor/groonga/lib/proc/proc_fuzzy_search.c
index 943a8fdf74d..952fdbb170e 100644
--- a/storage/mroonga/vendor/groonga/lib/proc/proc_fuzzy_search.c
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_fuzzy_search.c
@@ -232,7 +232,7 @@ sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *
           key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
 
           if (!prefix_match_size ||
-              (prefix_match_size > 0 && key_length >= prefix_match_size &&
+              (prefix_match_size > 0 && key_length >= (int) prefix_match_size &&
                !memcmp(sx, key_name, prefix_match_size))) {
             distance = calc_edit_distance(ctx, sx, ex,
                                           key_name, key_name + key_length, flags);
@@ -250,7 +250,7 @@ sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *
           rid = GRN_RECORD_VALUE(&value);
           key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
           if (!prefix_match_size ||
-              (prefix_match_size > 0 && key_length >= prefix_match_size &&
+              (prefix_match_size > 0 && key_length >= (int) prefix_match_size &&
                !memcmp(sx, key_name, prefix_match_size))) {
             distance = calc_edit_distance(ctx, sx, ex,
                                           key_name, key_name + key_length, flags);
@@ -277,7 +277,7 @@ sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *
     grn_obj_unlink(ctx, &value);
 
     for (i = 0; i < heap->n_entries; i++) {
-      if (max_expansion > 0 && i >= max_expansion) {
+      if (max_expansion > 0 && (uint32_t) i >= max_expansion) {
         break;
       }
       {
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_select.c b/storage/mroonga/vendor/groonga/lib/proc/proc_select.c
index 1f2a5005401..a665b1cc898 100644
--- a/storage/mroonga/vendor/groonga/lib/proc/proc_select.c
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_select.c
@@ -297,7 +297,7 @@ grn_parse_table_group_calc_types(grn_ctx *ctx,
     }
 
 #define CHECK_TABLE_GROUP_CALC_TYPE(name)\
-  if (((calc_types_end - calc_types) >= (sizeof(#name) - 1)) &&\
+    if (((unsigned long) (calc_types_end - calc_types) >= (unsigned long) (sizeof(#name) - 1)) && \
       (!memcmp(calc_types, #name, sizeof(#name) - 1))) {\
     flags |= GRN_TABLE_GROUP_CALC_ ## name;\
     calc_types += sizeof(#name) - 1;\
@@ -577,7 +577,7 @@ grn_columns_collect(grn_ctx *ctx,
   while (grn_table_cursor_next(ctx, cursor)) {
     void *key;
     char *variable_name;
-    int variable_name_len;
+    unsigned int variable_name_len;
     char *column_name;
     size_t column_name_len;
     void *value_raw;
@@ -1057,7 +1057,7 @@ grn_proc_expr_query_flags_parse(grn_ctx *ctx,
     }
 
 #define CHECK_EXPR_FLAG(name)                                           \
-    if (((query_flags_end - query_flags) >= (sizeof(#name) - 1)) &&     \
+    if (((unsigned long) (query_flags_end - query_flags) >= (unsigned long) (sizeof(#name) - 1)) &&     \
         (memcmp(query_flags, #name, sizeof(#name) - 1) == 0) &&         \
         (((query_flags_end - query_flags) == (sizeof(#name) - 1)) ||    \
          (query_flags[sizeof(#name) - 1] == '|') ||                     \
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_table.c b/storage/mroonga/vendor/groonga/lib/proc/proc_table.c
index c3cf2b21499..3c40992de49 100644
--- a/storage/mroonga/vendor/groonga/lib/proc/proc_table.c
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_table.c
@@ -40,7 +40,7 @@ command_table_create_parse_flags(grn_ctx *ctx,
 
 #define CHECK_FLAG(name)                                                \
     name_size = strlen(#name);                                          \
-    if ((end - nptr) >= name_size &&                                    \
+    if ((unsigned long) (end - nptr) >= (unsigned long) name_size &&    \
         memcmp(nptr, #name, name_size) == 0) {                          \
       flags |= GRN_OBJ_ ## name;                                        \
       nptr += name_size;                                                \
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c b/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c
index d69b044c5ea..206ebf58afb 100644
--- a/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c
@@ -39,7 +39,7 @@ parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
     }
 
 #define CHECK_FLAG(name)\
-    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
+    if (((unsigned long) (names_end - names) >= (unsigned long) (sizeof(#name) - 1)) &&\
         (!memcmp(names, #name, sizeof(#name) - 1))) {\
       flags |= GRN_TOKEN_CURSOR_ ## name;\
       names += sizeof(#name) - 1;\
diff --git a/storage/mroonga/vendor/groonga/lib/store.c b/storage/mroonga/vendor/groonga/lib/store.c
index d0fe974eddb..f579bc9ede6 100644
--- a/storage/mroonga/vendor/groonga/lib/store.c
+++ b/storage/mroonga/vendor/groonga/lib/store.c
@@ -349,7 +349,7 @@ static grn_ja *
 _grn_ja_create(grn_ctx *ctx, grn_ja *ja, const char *path,
                unsigned int max_element_size, uint32_t flags)
 {
-  int i;
+  unsigned int i;
   grn_io *io;
   struct grn_ja_header *header;
   struct grn_ja_header_v2 *header_v2;
@@ -689,7 +689,7 @@ grn_ja_replace(grn_ctx *ctx, grn_ja *ja, grn_id id,
     return ctx->rc;
   }
   if (*pseg == JA_ESEG_VOID) {
-    int i = 0;
+    unsigned int i = 0;
     while (SEGMENTS_AT(ja, i)) {
       if (++i >= JA_N_DSEGMENTS) {
         ERR(GRN_NOT_ENOUGH_SPACE, "grn_ja file (%s) is full", ja->io->path);
@@ -750,8 +750,9 @@ grn_ja_alloc(grn_ctx *ctx, grn_ja *ja, grn_id id,
   iw->tiny_p = 0;
   if (grn_io_lock(ctx, ja->io, grn_lock_timeout)) { return ctx->rc; }
   if (element_size + sizeof(grn_id) > JA_SEGMENT_SIZE) {
-    int i, j, n = (element_size + JA_SEGMENT_SIZE - 1) >> GRN_JA_W_SEGMENT;
-    for (i = 0, j = -1; i < JA_N_DSEGMENTS; i++) {
+    uint i;
+    int j, n = (element_size + JA_SEGMENT_SIZE - 1) >> GRN_JA_W_SEGMENT;
+    for (i = 0, j = -1;  i < JA_N_DSEGMENTS; i++) {
       if (SEGMENTS_AT(ja, i)) {
         j = i;
       } else {
diff --git a/storage/mroonga/vendor/groonga/plugins/functions/string.c b/storage/mroonga/vendor/groonga/plugins/functions/string.c
index 6cd7d953df9..0af2d6ab86c 100644
--- a/storage/mroonga/vendor/groonga/plugins/functions/string.c
+++ b/storage/mroonga/vendor/groonga/plugins/functions/string.c
@@ -238,7 +238,7 @@ func_string_substring(grn_ctx *ctx, int n_args, grn_obj **args,
       start = p;
     } else {
       unsigned int char_length = 0;
-      size_t n_chars = 0;
+      int64_t n_chars = 0;
 
       for (;
            p < end && (char_length = grn_charlen(ctx, p, end));
@@ -252,7 +252,7 @@ func_string_substring(grn_ctx *ctx, int n_args, grn_obj **args,
 
     if (start && length > 0) {
       unsigned int char_length = 0;
-      size_t n_chars = 0;
+      int64_t n_chars = 0;
 
       for (;
            p < end && (char_length = grn_charlen(ctx, p, end));
diff --git a/storage/mroonga/vendor/groonga/tools/travis-install.sh b/storage/mroonga/vendor/groonga/tools/travis-install.sh
index 72240ec1580..d7ac400c1a9 100755
--- a/storage/mroonga/vendor/groonga/tools/travis-install.sh
+++ b/storage/mroonga/vendor/groonga/tools/travis-install.sh
@@ -23,7 +23,6 @@ case "${TRAVIS_OS_NAME}" in
     brew outdated pkg-config || brew upgrade pkg-config
     brew reinstall libtool
     brew outdated libevent || brew upgrade libevent
-    brew outdated pcre || brew upgrade pcre
     brew install \
          autoconf-archive \
          msgpack \
diff --git a/storage/myisam/CMakeLists.txt b/storage/myisam/CMakeLists.txt
index 52485043e8f..2f5d6211e36 100644
--- a/storage/myisam/CMakeLists.txt
+++ b/storage/myisam/CMakeLists.txt
@@ -27,6 +27,11 @@ SET(MYISAM_SOURCES  ft_boolean_search.c ft_nlq_search.c ft_parser.c ft_static.c
 				rt_split.c sort.c sp_key.c mi_extrafunc.h myisamdef.h
 				rt_index.h mi_rkey.c)
 
+IF(CMAKE_SYSTEM_NAME MATCHES AIX)
+  # Work around a linker bug on AIX
+  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-berok")
+ENDIF()
+
 MYSQL_ADD_PLUGIN(myisam ${MYISAM_SOURCES} 
   STORAGE_ENGINE 
   MANDATORY 
diff --git a/storage/myisam/ft_boolean_search.c b/storage/myisam/ft_boolean_search.c
index 406a9bbc951..a91467c5b8d 100644
--- a/storage/myisam/ft_boolean_search.c
+++ b/storage/myisam/ft_boolean_search.c
@@ -566,7 +566,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
   FTB_EXPR  *ftbe;
   FTB_WORD  *ftbw;
 
-  if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
+  if (!(ftb=(FTB *)my_malloc(mi_key_memory_FTB, sizeof(FTB), MYF(MY_WME))))
     return 0;
   ftb->please= (struct _ft_vft *) & _ft_vft_boolean;
   ftb->state=UNINITIALIZED;
@@ -579,7 +579,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
   bzero(& ftb->no_dupes, sizeof(TREE));
   ftb->last_word= 0;
 
-  init_alloc_root(&ftb->mem_root, "fulltext", 1024, 1024, MYF(0));
+  init_alloc_root(mi_key_memory_FTB, &ftb->mem_root, 1024, 1024, MYF(0));
   ftb->queue.max_elements= 0;
   if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
     goto err;
diff --git a/storage/myisam/ft_nlq_search.c b/storage/myisam/ft_nlq_search.c
index 3e433b71761..eb95d1e0b94 100644
--- a/storage/myisam/ft_nlq_search.c
+++ b/storage/myisam/ft_nlq_search.c
@@ -287,7 +287,7 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, uchar *query,
     If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
     so if ndocs == 0, FT_INFO.doc[] must not be accessed.
    */
-  dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
+  dlist=(FT_INFO *)my_malloc(mi_key_memory_FT_INFO, sizeof(FT_INFO)+
 			     sizeof(FT_DOC)*
 			     (int)(aio.dtree.elements_in_tree-1),
 			     MYF(0));
diff --git a/storage/myisam/ft_parser.c b/storage/myisam/ft_parser.c
index bc50301fab2..61b1915d5e0 100644
--- a/storage/myisam/ft_parser.c
+++ b/storage/myisam/ft_parser.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -129,7 +130,7 @@ uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end,
   {
     for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      mbl= my_ci_ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
       if (true_word_char(ctype, *doc))
         break;
       if (*doc == FTB_RQUOT && param->quot)
@@ -168,7 +169,7 @@ uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end,
     for (word->pos= doc; doc < end; length++,
          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      mbl= my_ci_ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
       if (true_word_char(ctype, *doc))
         mwc=0;
       else if (!misc_word_char(*doc) || mwc)
@@ -221,7 +222,7 @@ uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
     {
       if (doc >= end)
         DBUG_RETURN(0);
-      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      mbl= my_ci_ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
       if (true_word_char(ctype, *doc))
         break;
     }
@@ -230,7 +231,7 @@ uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
     for (word->pos= doc; doc < end; length++,
          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      mbl= my_ci_ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
       if (true_word_char(ctype, *doc))
         mwc= 0;
       else if (!misc_word_char(*doc) || mwc)
@@ -347,9 +348,10 @@ MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
       (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
     */
     info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
-      my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
-                info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
-    init_alloc_root(&info->ft_memroot, "fulltext_parser",
+      my_malloc(mi_key_memory_FTPARSER_PARAM,
+                MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * info->s->ftkeys,
+                MYF(MY_WME | MY_ZEROFILL));
+    init_alloc_root(mi_key_memory_ft_memroot, &info->ft_memroot,
                     FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0));
   }
   return info->ftparser_param;
diff --git a/storage/myisam/ft_stopwords.c b/storage/myisam/ft_stopwords.c
index 3422a82a111..34c445cc163 100644
--- a/storage/myisam/ft_stopwords.c
+++ b/storage/myisam/ft_stopwords.c
@@ -59,7 +59,8 @@ int ft_init_stopwords()
   DBUG_ENTER("ft_init_stopwords");
   if (!stopwords3)
   {
-    if (!(stopwords3=(TREE *)my_malloc(sizeof(TREE),MYF(0))))
+    if (!(stopwords3=(TREE *)my_malloc(mi_key_memory_ft_stopwords,
+                                       sizeof(TREE), MYF(0))))
       DBUG_RETURN(-1);
     init_tree(stopwords3,0,0,sizeof(FT_STOPWORD),(qsort_cmp2)&FT_STOPWORD_cmp,
               (ft_stopword_file ? (tree_element_free)&FT_STOPWORD_free : 0),
@@ -89,13 +90,15 @@ int ft_init_stopwords()
       DBUG_RETURN(-1);
     len=(size_t)my_seek(fd, 0L, MY_SEEK_END, MYF(0));
     my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
-    if (!(start=buffer=my_malloc(len+1, MYF(MY_WME))))
+    if (!(start= buffer= my_malloc(mi_key_memory_ft_stopwords, len+1,
+                                   MYF(MY_WME))))
       goto err0;
     len=my_read(fd, buffer, len, MYF(MY_WME));
     end=start+len;
     while (ft_simple_get_word(ft_stopword_cs, &start, end, &w, TRUE))
     {
-      if (ft_add_stopword(my_strndup((char*) w.pos, w.len, MYF(0))))
+      if (ft_add_stopword(my_strndup(mi_key_memory_ft_stopwords,
+                                     (char*) w.pos, w.len, MYF(0))))
         goto err1;
     }
     error=0;
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index 231165adace..23a0adcaf2a 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -254,7 +254,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
   TABLE_SHARE *share= table_arg->s;
   uint options= share->db_options_in_use;
   DBUG_ENTER("table2myisam");
-  if (!(my_multi_malloc(MYF(MY_WME),
+  if (!(my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME),
           recinfo_out, (share->fields * 2 + 2) * sizeof(MI_COLUMNDEF),
           keydef_out, share->keys * sizeof(MI_KEYDEF),
           &keyseg,
@@ -1735,7 +1735,7 @@ void ha_myisam::start_bulk_insert(ha_rows rows, uint flags)
                      (ulong) rows, size));
 
   /* don't enable row cache if too few rows */
-  if (! rows || (rows > MI_MIN_ROWS_TO_USE_WRITE_CACHE))
+  if ((!rows || rows > MI_MIN_ROWS_TO_USE_WRITE_CACHE) && !has_long_unique())
     mi_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &size);
 
   can_enable_indexes= mi_is_all_keys_active(file->s->state.key_map,
@@ -2103,7 +2103,8 @@ int ha_myisam::info(uint flag)
 
     ref_length= misam_info.reflength;
     share->db_options_in_use= misam_info.options;
-    stats.block_size= myisam_block_size;        /* record block size */
+    /* Record block size; use at least IO_SIZE so it is never too small */
+    stats.block_size= MY_MAX(myisam_block_size, IO_SIZE);
 
     if (table_share->tmp_table == NO_TMP_TABLE)
       mysql_mutex_lock(&table_share->LOCK_share);
@@ -2134,7 +2135,8 @@ int ha_myisam::info(uint flag)
 
 int ha_myisam::extra(enum ha_extra_function operation)
 {
-  if (operation == HA_EXTRA_MMAP && !opt_myisam_use_mmap)
+  if ((operation == HA_EXTRA_MMAP && !opt_myisam_use_mmap) ||
+      (operation == HA_EXTRA_WRITE_CACHE && has_long_unique()))
     return 0;
   return mi_extra(file, operation, 0);
 }
@@ -2350,6 +2352,8 @@ void ha_myisam::get_auto_increment(ulonglong offset, ulonglong increment,
     inx			Index to use
     min_key		Start of range.  Null pointer if from first key
     max_key		End of range. Null pointer if to last key
+    pages               Store first and last page for the range in case of
+                        b-trees. In other cases it's not touched.
 
   NOTES
     min_key.flag can have one of the following values:
@@ -2367,10 +2371,12 @@ void ha_myisam::get_auto_increment(ulonglong offset, ulonglong increment,
 			the range.
 */
 
-ha_rows ha_myisam::records_in_range(uint inx, key_range *min_key,
-                                    key_range *max_key)
+ha_rows ha_myisam::records_in_range(uint inx, const key_range *min_key,
+                                    const key_range *max_key,
+                                    page_range *pages)
 {
-  return (ha_rows) mi_records_in_range(file, (int) inx, min_key, max_key);
+  return (ha_rows) mi_records_in_range(file, (int) inx, min_key, max_key,
+                                       pages);
 }
 
 
@@ -2520,13 +2526,16 @@ int myisam_panic(handlerton *hton, ha_panic_function flag)
   return mi_panic(flag);
 }
 
+static int myisam_drop_table(handlerton *hton, const char *path)
+{
+  return mi_delete_table(path);
+}
+
 static int myisam_init(void *p)
 {
   handlerton *hton;
 
-#ifdef HAVE_PSI_INTERFACE
   init_myisam_psi_keys();
-#endif
 
   /* Set global variables based on startup options */
   if (myisam_recover_options && myisam_recover_options != HA_RECOVER_OFF)
@@ -2534,12 +2543,12 @@ static int myisam_init(void *p)
   else
     myisam_recover_options= HA_RECOVER_OFF;
 
-  myisam_block_size=(uint) 1 << my_bit_log2(opt_myisam_block_size);
+  myisam_block_size=(uint) 1 << my_bit_log2_uint64(opt_myisam_block_size);
 
   hton= (handlerton *)p;
-  hton->state= SHOW_OPTION_YES;
   hton->db_type= DB_TYPE_MYISAM;
   hton->create= myisam_create_handler;
+  hton->drop_table= myisam_drop_table;
   hton->panic= myisam_panic;
   hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES;
   hton->tablefile_extensions= ha_myisam_exts;
@@ -2653,23 +2662,6 @@ bool ha_myisam::rowid_filter_push(Rowid_filter* rowid_filter)
 struct st_mysql_storage_engine myisam_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
-mysql_declare_plugin(myisam)
-{
-  MYSQL_STORAGE_ENGINE_PLUGIN,
-  &myisam_storage_engine,
-  "MyISAM",
-  "MySQL AB",
-  "MyISAM storage engine",
-  PLUGIN_LICENSE_GPL,
-  myisam_init, /* Plugin Init */
-  NULL, /* Plugin Deinit */
-  0x0100, /* 1.0 */
-  NULL,                       /* status variables                */
-  myisam_sysvars,             /* system variables                */
-  NULL,
-  0,
-}
-mysql_declare_plugin_end;
 maria_declare_plugin(myisam)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index 70c99a617f6..3843004cc6e 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -41,7 +41,7 @@ C_MODE_START
 check_result_t index_cond_func_myisam(void *arg);
 C_MODE_END
 
-class ha_myisam: public handler
+class ha_myisam final : public handler
 {
   MI_INFO *file;
   ulonglong int_table_flags;
@@ -114,7 +114,8 @@ class ha_myisam: public handler
   int indexes_are_disabled(void);
   void start_bulk_insert(ha_rows rows, uint flags);
   int end_bulk_insert();
-  ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key);
+  ha_rows records_in_range(uint inx, const key_range *min_key,
+                           const key_range *max_key, page_range *pages);
   void update_create_info(HA_CREATE_INFO *create_info);
   int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);
   THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
diff --git a/storage/myisam/mi_check.c b/storage/myisam/mi_check.c
index 3822a929fcf..16184a1b166 100644
--- a/storage/myisam/mi_check.c
+++ b/storage/myisam/mi_check.c
@@ -1175,6 +1175,7 @@ int chk_data_link(HA_CHECK *param, MI_INFO *info, my_bool extend)
       if (_mi_read_cache(&param->read_cache,(uchar*) info->rec_buff,
 			block_info.filepos, block_info.rec_len, READING_NEXT))
 	goto err;
+      info->rec_buff[block_info.rec_len]= 0;  /* Keep valgrind happy */
       if (_mi_pack_rec_unpack(info, &info->bit_buff, record,
                               info->rec_buff, block_info.rec_len))
       {
@@ -2141,7 +2142,7 @@ int filecopy(HA_CHECK *param, File to,File from,my_off_t start,
   DBUG_ENTER("filecopy");
 
   buff_length=(ulong) MY_MIN(param->write_buffer_length,length);
-  if (!(buff=my_malloc(buff_length,MYF(0))))
+  if (!(buff=my_malloc(mi_key_memory_filecopy, buff_length, MYF(0))))
   {
     buff=tmp_buff; buff_length=IO_SIZE;
   }
@@ -2298,8 +2299,8 @@ int mi_repair_by_sort(HA_CHECK *param, register MI_INFO *info,
     info->state->data_file_length= sort_info.filelength;
 
   sort_param.wordlist=NULL;
-  init_alloc_root(&sort_param.wordroot, "sort", FTPARSER_MEMROOT_ALLOC_SIZE, 0,
-                  MYF(param->malloc_flags));
+  init_alloc_root(mi_key_memory_MI_SORT_PARAM_wordroot, &sort_param.wordroot,
+                  FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(param->malloc_flags));
 
   if (share->data_file_type == DYNAMIC_RECORD)
     length=MY_MAX(share->base.min_pack_length+1,share->base.min_block_length);
@@ -2795,7 +2796,7 @@ int mi_repair_parallel(HA_CHECK *param, register MI_INFO *info,
   if (share->options & HA_OPTION_COMPRESS_RECORD)
     set_if_bigger(max_pack_reclength, share->max_pack_length);
   if (!(sort_param=(MI_SORT_PARAM *)
-        my_malloc((uint) share->base.keys *
+        my_malloc(mi_key_memory_MI_SORT_PARAM, (uint) share->base.keys *
 		  (sizeof(MI_SORT_PARAM) + max_pack_reclength),
 		  MYF(MY_ZEROFILL))))
   {
@@ -2882,8 +2883,8 @@ int mi_repair_parallel(HA_CHECK *param, register MI_INFO *info,
       uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
                                     sort_param[i].keyinfo->seg->charset->mbmaxlen;
       sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
-      init_alloc_root(&sort_param[i].wordroot, "sort",
-                      FTPARSER_MEMROOT_ALLOC_SIZE, 0,
+      init_alloc_root(mi_key_memory_MI_SORT_PARAM_wordroot,
+                      &sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0,
                       MYF(param->malloc_flags));
     }
   }
@@ -3636,6 +3637,7 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
 			      llstr(sort_param->pos,llbuff));
 	continue;
       }
+      sort_param->rec_buff[block_info.rec_len]= 0;  /* Keep valgrind happy */
       if (_mi_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record,
                               sort_param->rec_buff, block_info.rec_len))
       {
@@ -3727,7 +3729,8 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
 	  MI_DYN_DELETE_BLOCK_HEADER;
 	if (sort_info->buff_length < reclength)
 	{
-	  if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength,
+	  if (!(sort_info->buff=my_realloc(mi_key_memory_SORT_INFO_buffer,
+                                           sort_info->buff, (uint) reclength,
 					   MYF(MY_FREE_ON_ERROR | MY_WME |
 					       MY_ALLOW_ZERO_PTR))))
 	    DBUG_RETURN(1);
@@ -3944,7 +3947,8 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
          sort_info->info->s->rec_reflength) &&
         (sort_info->info->s->options &
           (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)))
-      ft_buf=(SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length +
+      ft_buf=(SORT_FT_BUF *)my_malloc(mi_key_memory_SORT_FT_BUF,
+                                      sort_param->keyinfo->block_length +
                                       sizeof(SORT_FT_BUF), MYF(MY_WME));
 
     if (!ft_buf)
@@ -4215,7 +4219,8 @@ static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
   SORT_KEY_BLOCKS *block;
   DBUG_ENTER("alloc_key_blocks");
 
-  if (!(block=(SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+
+  if (!(block=(SORT_KEY_BLOCKS*) my_malloc(mi_key_memory_SORT_KEY_BLOCKS,
+                                           (sizeof(SORT_KEY_BLOCKS)+
 					    buffer_length+IO_SIZE)*blocks,
 					   MYF(0))))
   {
diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c
index ebe139bb342..30537cef3e7 100644
--- a/storage/myisam/mi_create.c
+++ b/storage/myisam/mi_create.c
@@ -94,7 +94,8 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
     ci->reloc_rows=ci->max_rows;		/* Check if wrong parameter */
 
   if (!(rec_per_key_part=
-	(ulong*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(long),
+	(ulong*) my_malloc(mi_key_memory_MYISAM_SHARE,
+                           (keys + uniques) * HA_MAX_KEY_SEG * sizeof(long),
 			   MYF(MY_WME | MY_ZEROFILL))))
     DBUG_RETURN(my_errno);
 
@@ -622,6 +623,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
     fn_format(kfilename, name, "", MI_NAME_IEXT, MY_UNPACK_FILENAME |
               (internal_table ? 0 : MY_RETURN_REAL_PATH) |
               (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+    klinkname_ptr= 0;
     /* Replace the current file */
     create_flag=(flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD;
   }
diff --git a/storage/myisam/mi_delete_table.c b/storage/myisam/mi_delete_table.c
index 7990c3e8a80..d318b44720a 100644
--- a/storage/myisam/mi_delete_table.c
+++ b/storage/myisam/mi_delete_table.c
@@ -28,19 +28,23 @@
 
 int mi_delete_table(const char *name)
 {
+  int error= 0;
   DBUG_ENTER("mi_delete_table");
 
 #ifdef EXTRA_DEBUG
   check_table_is_closed(name,"delete");
 #endif
 
-  if (mysql_file_delete_with_symlink(mi_key_file_kfile, name, MI_NAME_IEXT, MYF(MY_WME)) ||
-      mysql_file_delete_with_symlink(mi_key_file_dfile, name, MI_NAME_DEXT, MYF(MY_WME)))
-    DBUG_RETURN(my_errno);
+  if (mysql_file_delete_with_symlink(mi_key_file_kfile, name, MI_NAME_IEXT,
+                                     MYF(MY_WME)))
+    error= my_errno;
+  if (mysql_file_delete_with_symlink(mi_key_file_dfile, name, MI_NAME_DEXT,
+                                     MYF(MY_WME)))
+    error= my_errno;
 
   // optionally present:
   mysql_file_delete_with_symlink(mi_key_file_dfile, name, ".OLD", MYF(0));
   mysql_file_delete_with_symlink(mi_key_file_dfile, name, ".TMD", MYF(0));
 
-  DBUG_RETURN(0);
+  DBUG_RETURN(error);
 }
diff --git a/storage/myisam/mi_dynrec.c b/storage/myisam/mi_dynrec.c
index 69c13ab96cf..09c10040f9c 100644
--- a/storage/myisam/mi_dynrec.c
+++ b/storage/myisam/mi_dynrec.c
@@ -43,7 +43,7 @@ static int _mi_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
 /* Play it safe; We have a small stack when using threads */
 #undef my_alloca
 #undef my_afree
-#define my_alloca(A) my_malloc((A),MYF(0))
+#define my_alloca(A) my_malloc(PSI_NOT_INSTRUMENTED, (A),MYF(0))
 #define my_afree(A) my_free((A))
 
 	/* Interface function from MI_INFO */
diff --git a/storage/myisam/mi_info.c b/storage/myisam/mi_info.c
index 50cb5439472..eec5c857786 100644
--- a/storage/myisam/mi_info.c
+++ b/storage/myisam/mi_info.c
@@ -87,7 +87,10 @@ int mi_status(MI_INFO *info, register MI_ISAMINFO *x, uint flag)
     x->index_file_name  = share->index_file_name;
   }
   if ((flag & HA_STATUS_TIME) && !mysql_file_fstat(info->dfile, &state, MYF(0)))
+  {
+    MSAN_STAT_WORKAROUND(&state);
     x->update_time=state.st_mtime;
+  }
   else
     x->update_time=0;
   if (flag & HA_STATUS_AUTO)
diff --git a/storage/myisam/mi_key.c b/storage/myisam/mi_key.c
index 7d263d9fc7b..087eb59c7c0 100644
--- a/storage/myisam/mi_key.c
+++ b/storage/myisam/mi_key.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -27,7 +28,9 @@
 #define FIX_LENGTH(cs, pos, length, char_length)                            \
             do {                                                            \
               if (length > char_length)                                     \
-                char_length= my_charpos(cs, pos, pos+length, char_length);  \
+                char_length= my_ci_charpos(cs, (const char *) pos,          \
+                                               (const char *) pos+length,   \
+                                               char_length);                \
               set_if_smaller(char_length,length);                           \
             } while(0)
 
@@ -109,7 +112,7 @@ uint _mi_make_key(register MI_INFO *info, uint keynr, uchar *key,
     {
       if (type != HA_KEYTYPE_NUM)
       {
-        length= cs->cset->lengthsp(cs, (char*) pos, length);
+        length= my_ci_lengthsp(cs, (char*) pos, length);
       }
       else
       {
@@ -186,7 +189,7 @@ uint _mi_make_key(register MI_INFO *info, uint keynr, uchar *key,
     FIX_LENGTH(cs, pos, length, char_length);
     memcpy((uchar*) key, pos, char_length);
     if (length > char_length)
-      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+      my_ci_fill(cs, (char*) key+char_length, length-char_length, ' ');
     key+= length;
   }
   _mi_dpointer(info,key,filepos);
@@ -264,7 +267,7 @@ uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key, uchar *old,
       }
       else if (type != HA_KEYTYPE_BINARY)
       {
-        length= cs->cset->lengthsp(cs, (char*) pos, length);
+        length= my_ci_lengthsp(cs, (char*) pos, length);
       }
       FIX_LENGTH(cs, pos, length, char_length);
       store_key_length_inc(key,char_length);
@@ -295,7 +298,7 @@ uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key, uchar *old,
     FIX_LENGTH(cs, pos, length, char_length);
     memcpy((uchar*) key, pos, char_length);
     if (length > char_length)
-      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+      my_ci_fill(cs, (char*) key+char_length, length-char_length, ' ');
     key+= length;
   }
   if (last_used_keyseg)
@@ -383,8 +386,7 @@ static int _mi_put_key_in_record(register MI_INFO *info, uint keynr,
       if (keyseg->type != (int) HA_KEYTYPE_NUM)
       {
         memcpy(pos,key,(size_t) length);
-        keyseg->charset->cset->fill(keyseg->charset,
-                                    (char*) pos + length,
+        my_ci_fill(keyseg->charset, (char*) pos + length,
                                     keyseg->length - length,
                                     ' ');
       }
diff --git a/storage/myisam/mi_locking.c b/storage/myisam/mi_locking.c
index 713ba0a3851..67b253761bc 100644
--- a/storage/myisam/mi_locking.c
+++ b/storage/myisam/mi_locking.c
@@ -284,7 +284,7 @@ int mi_lock_database(MI_INFO *info, int lock_type)
 			(THR_WRITE_CONCURRENT_INSERT was used)
 */
 
-void mi_get_status(void* param, my_bool concurrent_insert)
+my_bool mi_get_status(void* param, my_bool concurrent_insert)
 {
   MI_INFO *info=(MI_INFO*) param;
   DBUG_ENTER("mi_get_status");
@@ -306,7 +306,7 @@ void mi_get_status(void* param, my_bool concurrent_insert)
   info->append_insert_at_end= concurrent_insert;
   if (concurrent_insert)
     info->s->state.state.uncacheable= TRUE;
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(0);
 }
 
 
diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c
index 4d3c227dcc3..3db424ea997 100644
--- a/storage/myisam/mi_open.c
+++ b/storage/myisam/mi_open.c
@@ -180,7 +180,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
     if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) &&
         ! (open_flags & HA_OPEN_FROM_SQL_LAYER))
     {
-      DBUG_PRINT("error", ("table cannot be openned from non-sql layer"));
+      DBUG_PRINT("error", ("table cannot be opened from non-sql layer"));
       my_errno= HA_ERR_UNSUPPORTED;
       goto err;
     }
@@ -310,7 +310,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
     /* Add space for node pointer */
     share->base.max_key_length+= share->base.key_reflength;
 
-    if (!my_multi_malloc(MY_WME,
+    if (!my_multi_malloc(mi_key_memory_MYISAM_SHARE, MYF(MY_WME),
 			 &share,sizeof(*share),
 			 &share->state.rec_per_key_part,
                          sizeof(long)*base_key_parts,
@@ -599,7 +599,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
   }
 
   /* alloc and set up private structure parts */
-  if (!my_multi_malloc(MY_WME,
+  if (!my_multi_malloc(mi_key_memory_MI_INFO, MYF(MY_WME),
 		       &m_info,sizeof(MI_INFO),
 		       &info.blobs,sizeof(MI_BLOB)*share->base.blobs,
 		       &info.buff,(share->base.max_key_block_length*2+
@@ -759,7 +759,8 @@ uchar *mi_alloc_rec_buff(MI_INFO *info, ulong length, uchar **buf)
 	    MI_REC_BUFF_OFFSET : 0);
     if (extra && newptr)
       newptr-= MI_REC_BUFF_OFFSET;
-    if (!(newptr=(uchar*) my_realloc((uchar*)newptr, length+extra+8,
+    if (!(newptr=(uchar*) my_realloc(mi_key_memory_record_buffer,
+                                     (uchar*)newptr, length + extra + 8,
                                      MYF(MY_ALLOW_ZERO_PTR))))
       return NULL;
     *((uint32 *) newptr)= (uint32) length;
@@ -1004,7 +1005,7 @@ uchar *mi_state_info_read(uchar *ptr, MI_STATE_INFO *state)
 
   if (!state->rec_per_key_part)
   {
-    if (!my_multi_malloc(MY_WME,
+    if (!my_multi_malloc(mi_key_memory_MYISAM_SHARE, MYF(MY_WME),
 			 &state->rec_per_key_part,sizeof(long)*key_parts,
 			 &state->key_root, keys*sizeof(my_off_t),
 			 &state->key_del,  key_blocks*sizeof(my_off_t),
diff --git a/storage/myisam/mi_packrec.c b/storage/myisam/mi_packrec.c
index 6197c083c52..ca8a8ef06c7 100644
--- a/storage/myisam/mi_packrec.c
+++ b/storage/myisam/mi_packrec.c
@@ -196,8 +196,8 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys)
     - Distinct column values
   */
   if (!(share->decode_trees=(MI_DECODE_TREE*)
-	my_malloc((uint) (trees*sizeof(MI_DECODE_TREE)+
-			  intervall_length*sizeof(uchar)),
+	my_malloc(mi_key_memory_MI_DECODE_TREE,
+                  trees*sizeof(MI_DECODE_TREE) + intervall_length*sizeof(uchar),
 		  MYF(MY_WME))))
     goto err0;
   intervall_buff=(uchar*) (share->decode_trees+trees);
@@ -219,7 +219,8 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys)
     data, we add (BITS_SAVED / 8) - 1 bytes to the buffer size.
   */
   if (!(share->decode_tables=(uint16*)
-        my_malloc((length + OFFSET_TABLE_SIZE) * sizeof(uint16) +
+        my_malloc(mi_key_memory_MYISAM_SHARE_decode_tables,
+                  (length + OFFSET_TABLE_SIZE) * sizeof(uint16) +
                   (uint) (share->pack.header_length - sizeof(header) +
                   (BITS_SAVED / 8) - 1), MYF(MY_WME | MY_ZEROFILL))))
     goto err1;
@@ -259,9 +260,10 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys)
       goto err3;
   /* Reallocate the decoding tables to the used size. */
   decode_table=(uint16*)
-    my_realloc((uchar*) share->decode_tables,
+    my_realloc(mi_key_memory_MYISAM_SHARE_decode_tables,
+               (uchar*) share->decode_tables,
 	       (uint) ((uchar*) decode_table - (uchar*) share->decode_tables),
-	       MYF(MY_HOLD_ON_ERROR));
+	       MYF(0));
   /* Fix the table addresses in the tree heads. */
   {
     my_ptrdiff_t diff=PTR_BYTE_DIFF(decode_table,share->decode_tables);
@@ -723,6 +725,8 @@ int _mi_read_pack_record(MI_INFO *info, my_off_t filepos, uchar *buf)
                       block_info.rec_len - block_info.offset, MYF(MY_NABP)))
     goto panic;
   info->update|= HA_STATE_AKTIV;
+
+  info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */
   DBUG_RETURN(_mi_pack_rec_unpack(info, &info->bit_buff, buf,
                                   info->rec_buff, block_info.rec_len));
 panic:
@@ -1350,8 +1354,9 @@ int _mi_read_rnd_pack_record(MI_INFO *info, uchar *buf,
   info->nextpos=block_info.filepos+block_info.rec_len;
   info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
 
-  DBUG_RETURN (_mi_pack_rec_unpack(info, &info->bit_buff, buf,
-                                   info->rec_buff, block_info.rec_len));
+  info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */
+  DBUG_RETURN(_mi_pack_rec_unpack(info, &info->bit_buff, buf,
+                                  info->rec_buff, block_info.rec_len));
  err:
   DBUG_RETURN(my_errno);
 }
@@ -1370,8 +1375,8 @@ uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
   {
     ref_length=myisam->s->pack.ref_length;
     /*
-      We can't use mysql_file_pread() here because mi_read_rnd_pack_record assumes
-      position is ok
+      We can't use mysql_file_pread() here because mi_read_rnd_pack_record
+      assumes position is ok
     */
     mysql_file_seek(file, filepos, MY_SEEK_SET, MYF(0));
     if (mysql_file_read(file, header, ref_length, MYF(MY_NABP)))
@@ -1406,15 +1411,17 @@ uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
 }
 
 
-	/* rutines for bit buffer */
-	/* Note buffer must be 6 byte bigger than longest row */
+/*
+  Routines for bit buffer
+  Note: buffer must be 6 bytes bigger than longest row
+*/
 
 static void init_bit_buffer(MI_BIT_BUFF *bit_buff, uchar *buffer, uint length)
 {
   bit_buff->pos=buffer;
   bit_buff->end=buffer+length;
   bit_buff->bits=bit_buff->error=0;
-  bit_buff->current_byte=0;			/* Avoid purify errors */
+  bit_buff->current_byte=0;			/* Avoid valgrind errors */
 }
 
 static uint fill_and_get_bits(MI_BIT_BUFF *bit_buff, uint count)
@@ -1560,9 +1567,11 @@ void _mi_unmap_file(MI_INFO *info)
 }
 
 
-static uchar *_mi_mempack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
-                                         MI_BLOCK_INFO *info, uchar **rec_buff_p,
-					 uchar *header)
+static uchar *_mi_mempack_get_block_info(MI_INFO *myisam,
+                                         MI_BIT_BUFF *bit_buff,
+                                         MI_BLOCK_INFO *info,
+                                         uchar **rec_buff_p,
+                                         uchar *header)
 {
   header+= read_pack_length((uint) myisam->s->pack.version, header,
                             &info->rec_len);
@@ -1571,7 +1580,7 @@ static uchar *_mi_mempack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
     header+= read_pack_length((uint) myisam->s->pack.version, header,
                               &info->blob_len);
     /* mi_alloc_rec_buff sets my_errno on error */
-    if (!(mi_alloc_rec_buff(myisam, info->blob_len,
+    if (!(mi_alloc_rec_buff(myisam, info->blob_len ,
 			    rec_buff_p)))
       return 0;				/* not enough memory */
     bit_buff->blob_pos= (uchar*) *rec_buff_p;
@@ -1596,6 +1605,7 @@ static int _mi_read_mempack_record(MI_INFO *info, my_off_t filepos, uchar *buf)
 						(uchar*) share->file_map+
 						filepos)))
     DBUG_RETURN(-1);
+  /* No need to end-zero pos here for valgrind as data is memory mapped */
   DBUG_RETURN(_mi_pack_rec_unpack(info, &info->bit_buff, buf,
                                   pos, block_info.rec_len));
 }
diff --git a/storage/myisam/mi_preload.c b/storage/myisam/mi_preload.c
index d52a2ea46ea..5f9132abe14 100644
--- a/storage/myisam/mi_preload.c
+++ b/storage/myisam/mi_preload.c
@@ -73,7 +73,8 @@ int mi_preload(MI_INFO *info, ulonglong key_map, my_bool ignore_leaves)
   length= info->preload_buff_size/block_length * block_length;
   set_if_bigger(length, block_length);
 
-  if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME))))
+  if (!(buff= (uchar *) my_malloc(mi_key_memory_preload_buffer, length,
+                                  MYF(MY_WME))))
     DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
 
   if (flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
diff --git a/storage/myisam/mi_range.c b/storage/myisam/mi_range.c
index 7292f3c83b8..54350f7a111 100644
--- a/storage/myisam/mi_range.c
+++ b/storage/myisam/mi_range.c
@@ -24,9 +24,9 @@
 #include "rt_index.h"
 
 static double _mi_record_pos(MI_INFO *, const uchar *, key_part_map,
-                             enum ha_rkey_function);
+                             enum ha_rkey_function, ulonglong *);
 static double _mi_search_pos(MI_INFO *,MI_KEYDEF *,uchar *, uint,uint,
-                             my_off_t,my_bool);
+                             my_off_t,my_bool, ulonglong *);
 static uint _mi_keynr(MI_INFO *info,MI_KEYDEF *,uchar *, uchar *,uint *);
 
 /*
@@ -48,7 +48,8 @@ static uint _mi_keynr(MI_INFO *info,MI_KEYDEF *,uchar *, uchar *,uint *);
 */
   
 ha_rows mi_records_in_range(MI_INFO *info, int inx,
-                            key_range *min_key, key_range *max_key)
+                            const key_range *min_key, const key_range *max_key,
+                            page_range *pages)
 {
   ha_rows res;
   double start_pos,end_pos,diff;
@@ -98,10 +99,12 @@ ha_rows mi_records_in_range(MI_INFO *info, int inx,
   case HA_KEY_ALG_BTREE:
   default:
     start_pos= (min_key ?_mi_record_pos(info, min_key->key,
-                                        min_key->keypart_map, min_key->flag)
+                                        min_key->keypart_map, min_key->flag,
+                                        &pages->first_page)
                         : (double) 0);
     end_pos=   (max_key ?  _mi_record_pos(info, max_key->key,
-                                          max_key->keypart_map, max_key->flag)
+                                          max_key->keypart_map, max_key->flag,
+                                          &pages->last_page)
 		        : (double) info->state->records);
     res= (end_pos < start_pos ? (ha_rows) 0 :
           (end_pos == start_pos ? (ha_rows) 1 : (ha_rows) (end_pos-start_pos)));
@@ -147,7 +150,8 @@ ha_rows mi_records_in_range(MI_INFO *info, int inx,
 
 static double _mi_record_pos(MI_INFO *info, const uchar *key,
                              key_part_map keypart_map,
-                             enum ha_rkey_function search_flag)
+                             enum ha_rkey_function search_flag,
+                             ulonglong *final_page)
 {
   uint inx=(uint) info->lastinx, nextflag, key_len;
   MI_KEYDEF *keyinfo=info->s->keyinfo+inx;
@@ -203,7 +207,8 @@ static double _mi_record_pos(MI_INFO *info, const uchar *key,
   */
   pos=_mi_search_pos(info,keyinfo,key_buff,key_len,
 		     nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE,
-		     info->s->state.key_root[inx], TRUE);
+		     info->s->state.key_root[inx], TRUE,
+                     final_page);
   if (pos >= 0.0)
   {
     DBUG_PRINT("exit",("pos: %g",(pos*info->state->records)));
@@ -219,7 +224,8 @@ static double _mi_record_pos(MI_INFO *info, const uchar *key,
 static double _mi_search_pos(register MI_INFO *info,
 			     register MI_KEYDEF *keyinfo,
 			     uchar *key, uint key_len, uint nextflag,
-			     register my_off_t pos, my_bool last_in_level)
+			     register my_off_t pos, my_bool last_in_level,
+                             ulonglong *final_page)
 {
   int flag;
   uint nod_flag,keynr,UNINIT_VAR(max_keynr);
@@ -233,6 +239,7 @@ static double _mi_search_pos(register MI_INFO *info,
 
   if (!(buff=_mi_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,info->buff,1)))
     goto err;
+  *final_page= pos;
   flag=(*keyinfo->bin_search)(info,keyinfo,buff,key,key_len,nextflag,
 			      &keypos,info->lastkey, &after_key);
   nod_flag=mi_test_if_nod(buff);
@@ -251,7 +258,8 @@ static double _mi_search_pos(register MI_INFO *info,
       offset= 1.0;
     else if ((offset=_mi_search_pos(info,keyinfo,key,key_len,nextflag,
 				    _mi_kpos(nod_flag,keypos),
-                                    last_in_level && after_key)) < 0)
+                                    last_in_level && after_key,
+                                    final_page)) < 0)
       DBUG_RETURN(offset);
   }
   else
@@ -271,7 +279,8 @@ static double _mi_search_pos(register MI_INFO *info,
       */
       if ((offset=_mi_search_pos(info,keyinfo,key,key_len,SEARCH_FIND,
 				 _mi_kpos(nod_flag,keypos),
-                                 last_in_level && after_key)) < 0)
+                                 last_in_level && after_key,
+                                 final_page)) < 0)
 	DBUG_RETURN(offset);			/* Read error */
     }
   }
diff --git a/storage/myisam/mi_static.c b/storage/myisam/mi_static.c
index 3679ea4a329..d0c3995d9de 100644
--- a/storage/myisam/mi_static.c
+++ b/storage/myisam/mi_static.c
@@ -61,6 +61,28 @@ uint myisam_readnext_vec[]=
   SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER
 };
 
+PSI_memory_key mi_key_memory_MYISAM_SHARE;
+PSI_memory_key mi_key_memory_MI_INFO;
+PSI_memory_key mi_key_memory_MI_INFO_ft1_to_ft2;
+PSI_memory_key mi_key_memory_MI_INFO_bulk_insert;
+PSI_memory_key mi_key_memory_record_buffer;
+PSI_memory_key mi_key_memory_FTB;
+PSI_memory_key mi_key_memory_FT_INFO;
+PSI_memory_key mi_key_memory_FTPARSER_PARAM;
+PSI_memory_key mi_key_memory_ft_memroot;
+PSI_memory_key mi_key_memory_ft_stopwords;
+PSI_memory_key mi_key_memory_MI_SORT_PARAM;
+PSI_memory_key mi_key_memory_MI_SORT_PARAM_wordroot;
+PSI_memory_key mi_key_memory_SORT_FT_BUF;
+PSI_memory_key mi_key_memory_SORT_KEY_BLOCKS;
+PSI_memory_key mi_key_memory_filecopy;
+PSI_memory_key mi_key_memory_SORT_INFO_buffer;
+PSI_memory_key mi_key_memory_MI_DECODE_TREE;
+PSI_memory_key mi_key_memory_MYISAM_SHARE_decode_tables;
+PSI_memory_key mi_key_memory_preload_buffer;
+PSI_memory_key mi_key_memory_stPageList_pages;
+PSI_memory_key mi_key_memory_keycache_thread_var;
+
 #ifdef HAVE_PSI_INTERFACE
 PSI_mutex_key mi_key_mutex_MYISAM_SHARE_intern_lock,
   mi_key_mutex_MI_SORT_INFO_mutex, mi_key_mutex_MI_CHECK_print_msg;
@@ -106,6 +128,31 @@ static PSI_thread_info all_myisam_threads[]=
   { &mi_key_thread_find_all_keys, "find_all_keys", 0},
 };
 
+static PSI_memory_info all_myisam_memory[]=
+{
+  { &mi_key_memory_MYISAM_SHARE, "MYISAM_SHARE", 0},
+  { &mi_key_memory_MI_INFO, "MI_INFO", 0},
+  { &mi_key_memory_MI_INFO_ft1_to_ft2, "MI_INFO::ft1_to_ft2", 0},
+  { &mi_key_memory_MI_INFO_bulk_insert, "MI_INFO::bulk_insert", 0},
+  { &mi_key_memory_record_buffer, "record_buffer", 0},
+  { &mi_key_memory_FTB, "FTB", 0},
+  { &mi_key_memory_FT_INFO, "FT_INFO", 0},
+  { &mi_key_memory_FTPARSER_PARAM, "FTPARSER_PARAM", 0},
+  { &mi_key_memory_ft_memroot, "ft_memroot", 0},
+  { &mi_key_memory_ft_stopwords, "ft_stopwords", 0},
+  { &mi_key_memory_MI_SORT_PARAM, "MI_SORT_PARAM", 0},
+  { &mi_key_memory_MI_SORT_PARAM_wordroot, "MI_SORT_PARAM::wordroot", 0},
+  { &mi_key_memory_SORT_FT_BUF, "SORT_FT_BUF", 0},
+  { &mi_key_memory_SORT_KEY_BLOCKS, "SORT_KEY_BLOCKS", 0},
+  { &mi_key_memory_filecopy, "filecopy", 0},
+  { &mi_key_memory_SORT_INFO_buffer, "SORT_INFO::buffer", 0},
+  { &mi_key_memory_MI_DECODE_TREE, "MI_DECODE_TREE", 0},
+  { &mi_key_memory_MYISAM_SHARE_decode_tables, "MYISAM_SHARE::decode_tables", 0},
+  { &mi_key_memory_preload_buffer, "preload_buffer", 0},
+  { &mi_key_memory_stPageList_pages, "stPageList::pages", 0},
+  { &mi_key_memory_keycache_thread_var, "keycache_thread_var", 0}
+};
+
 void init_myisam_psi_keys()
 {
   const char* category= "myisam";
@@ -125,6 +172,9 @@ void init_myisam_psi_keys()
 
   count= array_elements(all_myisam_threads);
   mysql_thread_register(category, all_myisam_threads, count);
+
+  count= array_elements(all_myisam_memory);
+  mysql_memory_register(category, all_myisam_memory, count);
 }
 #endif /* HAVE_PSI_INTERFACE */
 
diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c
index 6e1cbb716ba..5a614edb563 100644
--- a/storage/myisam/mi_test1.c
+++ b/storage/myisam/mi_test1.c
@@ -1,5 +1,6 @@
 /*
    Copyright (c) 2000, 2011, Oracle and/or its affiliates
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -479,8 +480,7 @@ static void update_record(uchar *record)
     ptr=blob_key;
     memcpy(pos+4, &ptr, sizeof(char*));	/* Store pointer to new key */
     if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
-      default_charset_info->cset->casedn(default_charset_info,
-                                         (char*) blob_key, length,
+      my_ci_casedn(default_charset_info, (char*) blob_key, length,
                                          (char*) blob_key, length);
     pos+=recinfo[1].length;
   }
@@ -488,16 +488,14 @@ static void update_record(uchar *record)
   {
     uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
     uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
-    default_charset_info->cset->casedn(default_charset_info,
-                                       (char*) pos + pack_length, length,
+    my_ci_casedn(default_charset_info, (char*) pos + pack_length, length,
                                        (char*) pos + pack_length, length);
     pos+=recinfo[1].length;
   }
   else
   {
     if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
-      default_charset_info->cset->casedn(default_charset_info,
-                                         (char*) pos, keyinfo[0].seg[0].length,
+      my_ci_casedn(default_charset_info, (char*) pos, keyinfo[0].seg[0].length,
                                          (char*) pos, keyinfo[0].seg[0].length);
     pos+=recinfo[1].length;
   }
@@ -590,10 +588,11 @@ static struct my_option my_long_options[] =
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+	       const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch(optid) {
+  switch(opt->id) {
   case 'a':
     key_type= HA_KEYTYPE_TEXT;
     break;
diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c
index 48a091e80de..4b5039eea5b 100644
--- a/storage/myisam/mi_test2.c
+++ b/storage/myisam/mi_test2.c
@@ -67,6 +67,7 @@ int main(int argc, char *argv[])
   const char *filename;
   char *blob_buffer;
   MI_CREATE_INFO create_info;
+  page_range pages;
   MY_INIT(argv[0]);
 
   filename= "test2";
@@ -622,7 +623,8 @@ int main(int argc, char *argv[])
     max_key.keypart_map= HA_WHOLE_KEY;
     max_key.flag= HA_READ_AFTER_KEY;
 
-    range_records= mi_records_in_range(file,(int) i, &min_key, &max_key);
+    range_records= mi_records_in_range(file,(int) i, &min_key, &max_key,
+                                       &pages);
     if (range_records < info.records*8/10 ||
 	range_records > info.records*12/10)
     {
@@ -645,6 +647,7 @@ int main(int argc, char *argv[])
     if (j != 0 && k != 0)
     {
       key_range min_key, max_key;
+      page_range pages;
       if (j > k)
 	swap_variables(int, j, k);
       sprintf((char*) key,"%6d",j);
@@ -656,7 +659,7 @@ int main(int argc, char *argv[])
       max_key.key= key2;
       max_key.keypart_map= HA_WHOLE_KEY;
       max_key.flag= HA_READ_BEFORE_KEY;
-      range_records= mi_records_in_range(file, 0, &min_key, &max_key);
+      range_records= mi_records_in_range(file, 0, &min_key, &max_key, &pages);
       records=0;
       for (j++ ; j < k ; j++)
 	records+=key1[j];
@@ -1021,7 +1024,7 @@ static void put_blob_in_record(uchar *blob_pos, char **blob_buffer)
     if (rnd(10) == 0)
     {
       if (! *blob_buffer &&
-	  !(*blob_buffer=my_malloc((uint) use_blob,MYF(MY_WME))))
+	  !(*blob_buffer=my_malloc(PSI_NOT_INSTRUMENTED, use_blob,MYF(MY_WME))))
       {
 	use_blob=0;
 	return;
diff --git a/storage/myisam/mi_unique.c b/storage/myisam/mi_unique.c
index 5d16efb96a1..e1d7aeaa711 100644
--- a/storage/myisam/mi_unique.c
+++ b/storage/myisam/mi_unique.c
@@ -1,5 +1,6 @@
 /*
    Copyright (c) 2000, 2010, Oracle and/or its affiliates
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -121,9 +122,9 @@ ha_checksum mi_unique_hash(MI_UNIQUEDEF *def, const uchar *record)
     if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
         type == HA_KEYTYPE_VARTEXT2)
     {
-      keyseg->charset->coll->hash_sort(keyseg->charset,
-                                       (const uchar*) pos, length, &seed1,
-                                       &seed2);
+      my_ci_hash_sort(keyseg->charset,
+                      (const uchar*) pos, length,
+                      &seed1, &seed2);
       crc^= seed1;
     }
     else
diff --git a/storage/myisam/mi_write.c b/storage/myisam/mi_write.c
index 7345ab1604d..7d489908725 100644
--- a/storage/myisam/mi_write.c
+++ b/storage/myisam/mi_write.c
@@ -546,8 +546,10 @@ int _mi_insert(register MI_INFO *info, register MI_KEYDEF *keyinfo,
       {
         /* yup. converting */
         info->ft1_to_ft2=(DYNAMIC_ARRAY *)
-          my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
-        my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50, MYF(0));
+          my_malloc(mi_key_memory_MI_INFO_ft1_to_ft2,
+                    sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
+        my_init_dynamic_array(mi_key_memory_MI_INFO_ft1_to_ft2,
+                              info->ft1_to_ft2, ft2len, 300, 50, MYF(0));
 
         /*
           now, adding all keys from the page to dynarray
@@ -998,7 +1000,8 @@ int mi_init_bulk_insert(MI_INFO *info, size_t cache_size, ha_rows rows)
     cache_size/=total_keylength*16;
 
   info->bulk_insert=(TREE *)
-    my_malloc((sizeof(TREE)*share->base.keys+
+    my_malloc(mi_key_memory_MI_INFO_bulk_insert,
+              (sizeof(TREE)*share->base.keys+
                sizeof(bulk_insert_param)*num_keys),MYF(0));
 
   if (!info->bulk_insert)
diff --git a/storage/myisam/myisam_ftdump.c b/storage/myisam/myisam_ftdump.c
index 7ced701ed5b..cc28d4a59e5 100644
--- a/storage/myisam/myisam_ftdump.c
+++ b/storage/myisam/myisam_ftdump.c
@@ -22,7 +22,8 @@
 
 static void usage();
 static void complain(int val);
-static my_bool get_one_option(int, const struct my_option *, char *);
+static my_bool get_one_option(const struct my_option *, const char *,
+                              const char *);
 
 static int count=0, stats=0, dump=0, lstats=0;
 static my_bool verbose;
@@ -228,10 +229,11 @@ err:
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument __attribute__((unused)))
+get_one_option(const struct my_option *opt,
+	       const char *argument __attribute__((unused)),
+               const char *filename __attribute__((unused)))
 {
-  switch(optid) {
+  switch(opt->id) {
   case 'd':
     dump=1;
     complain(count || query);
diff --git a/storage/myisam/myisamchk.c b/storage/myisam/myisamchk.c
index f616a9f9b1a..4344ce11420 100644
--- a/storage/myisam/myisamchk.c
+++ b/storage/myisam/myisamchk.c
@@ -429,7 +429,7 @@ static void usage(void)
   -q, --quick         Faster repair by not modifying the data file.\n\
                       One can give a second '-q' to force myisamchk to\n\
 		      modify the original datafile in case of duplicate keys.\n\
-		      NOTE: Tables where the data file is currupted can't be\n\
+		      NOTE: Tables where the data file is corrupted can't be\n\
 		      fixed with this option.\n\
   -u, --unpack        Unpack file packed with myisampack.\n\
 ");
@@ -470,11 +470,10 @@ TYPELIB myisam_stats_method_typelib= {
 	 /* Read options */
 
 static my_bool
-get_one_option(int optid,
-	       const struct my_option *opt __attribute__((unused)),
-	       char *argument)
+get_one_option(const struct my_option *opt,
+	       const char *argument, const char *filename __attribute__((unused)))
 {
-  switch (optid) {
+  switch (opt->id) {
   case 'a':
     if (argument == disabled_my_option)
       check_param.testflag&= ~T_STATISTICS;
@@ -798,7 +797,7 @@ static void get_options(register int *argc,register char ***argv)
                                              MYF(MY_WME))))
       exit(1);
 
-  myisam_block_size=(uint) 1 << my_bit_log2(opt_myisam_block_size);
+  myisam_block_size=(uint) 1 << my_bit_log2_uint64(opt_myisam_block_size);
   return;
 } /* get options */
 
@@ -1428,20 +1427,25 @@ static void descript(HA_CHECK *param, register MI_INFO *info, char * name)
       else
 	type=(enum en_fieldtype) share->rec[field].type;
       end=strmov(buff,field_pack[type]);
+      if (end != buff)
+      {
+        *(end++)=',';
+        *(end++)=' ';
+      }
       if (share->options & HA_OPTION_COMPRESS_RECORD)
       {
 	if (share->rec[field].pack_type & PACK_TYPE_SELECTED)
-	  end=strmov(end,", not_always");
+	  end=strmov(end,"not_always, ");
 	if (share->rec[field].pack_type & PACK_TYPE_SPACE_FIELDS)
-	  end=strmov(end,", no empty");
+	  end=strmov(end,"no empty, ");
 	if (share->rec[field].pack_type & PACK_TYPE_ZERO_FILL)
 	{
-	  sprintf(end,", zerofill(%d)",share->rec[field].space_length_bits);
+	  sprintf(end,"zerofill(%d), ",share->rec[field].space_length_bits);
 	  end=strend(end);
 	}
       }
-      if (buff[0] == ',')
-	strmov(buff,buff+2);
+      if (end != buff)
+        end[-2]= 0;                               /* Remove ", " */
       int10_to_str((long) share->rec[field].length,length,10);
       null_bit[0]=null_pos[0]=0;
       if (share->rec[field].null_bit)
diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h
index 06de992d6de..c90d989c975 100644
--- a/storage/myisam/myisamdef.h
+++ b/storage/myisam/myisamdef.h
@@ -1,6 +1,6 @@
 /*
    Copyright (c) 2000, 2012, Oracle and/or its affiliates.
-   Copyright (c) 2017, MariaDB Corporation.
+   Copyright (c) 2017, 2022, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -234,11 +234,11 @@ typedef struct st_mi_isam_share
 
 struct st_myisam_info
 {
-  MYISAM_SHARE *s;                      /* Shared between open:s */
+  MYISAM_SHARE *s;                      /* Shared between opens */
   MI_STATUS_INFO *state, save_state;
   MI_BLOB *blobs;                       /* Pointer to blobs */
   MI_BIT_BUFF bit_buff;
-  /* accumulate indexfile changes between write's */
+  /* accumulate indexfile changes between writes */
   TREE *bulk_insert;
   DYNAMIC_ARRAY *ft1_to_ft2;            /* used only in ft1->ft2 conversion */
   MEM_ROOT      ft_memroot;             /* used by the parser               */
@@ -256,7 +256,7 @@ struct st_myisam_info
   uint32 int_keytree_version;		/*  -""-  */
   int (*read_record)(struct st_myisam_info*, my_off_t, uchar*);
   invalidator_by_filename invalidator;  /* query cache invalidator */
-  ulong this_unique;                    /* uniq filenumber or thread */
+  ulong this_unique;                    /* unique filenumber or thread */
   ulong last_unique;                    /* last unique number */
   ulong this_loop;                      /* counter for this open */
   ulong last_loop;                      /* last used counter */
@@ -269,7 +269,7 @@ struct st_myisam_info
   my_off_t dupp_key_pos;
   ha_checksum checksum;                 /* Temp storage for row checksum */
   /*
-    QQ: the folloing two xxx_length fields should be removed,
+    QQ: the following two xxx_length fields should be removed,
      as they are not compatible with parallel repair
   */
   ulong packed_length, blob_length;     /* Length of found, packed record */
@@ -612,7 +612,24 @@ typedef struct st_mi_block_info         /* Parameter to _mi_get_block_info */
   uint offset;
 } MI_BLOCK_INFO;
 
-        /* bits in return from _mi_get_block_info */
+
+struct st_sort_key_blocks		/* Used when sorting */
+{
+  uchar *buff, *end_pos;
+  uchar lastkey[HA_MAX_POSSIBLE_KEY_BUFF];
+  uint last_length;
+  int inited;
+};
+
+
+struct st_sort_ftbuf
+{
+  uchar *buf, *end;
+  int count;
+  uchar lastkey[HA_MAX_KEY_BUFF];
+};
+
+/* bits in return from _mi_get_block_info */
 
 #define BLOCK_FIRST     1U
 #define BLOCK_LAST      2U
@@ -709,7 +726,7 @@ int _mi_cmp_dynamic_unique(MI_INFO *info, MI_UNIQUEDEF *def,
                            const uchar *record, my_off_t pos);
 int mi_unique_comp(MI_UNIQUEDEF *def, const uchar *a, const uchar *b,
                    my_bool null_are_equal);
-void mi_get_status(void *param, my_bool concurrent_insert);
+my_bool mi_get_status(void *param, my_bool concurrent_insert);
 void mi_update_status(void *param);
 void mi_restore_status(void *param);
 void mi_copy_status(void *to, void *from);
@@ -757,6 +774,30 @@ extern PSI_file_key mi_key_file_datatmp, mi_key_file_dfile, mi_key_file_kfile,
 extern PSI_thread_key mi_key_thread_find_all_keys;
 
 void init_myisam_psi_keys();
+#else
+#define init_myisam_psi_keys() do { } while(0)
 #endif /* HAVE_PSI_INTERFACE */
 
+extern PSI_memory_key mi_key_memory_MYISAM_SHARE;
+extern PSI_memory_key mi_key_memory_MI_INFO;
+extern PSI_memory_key mi_key_memory_MI_INFO_ft1_to_ft2;
+extern PSI_memory_key mi_key_memory_MI_INFO_bulk_insert;
+extern PSI_memory_key mi_key_memory_record_buffer;
+extern PSI_memory_key mi_key_memory_FTB;
+extern PSI_memory_key mi_key_memory_FT_INFO;
+extern PSI_memory_key mi_key_memory_FTPARSER_PARAM;
+extern PSI_memory_key mi_key_memory_ft_memroot;
+extern PSI_memory_key mi_key_memory_ft_stopwords;
+extern PSI_memory_key mi_key_memory_MI_SORT_PARAM;
+extern PSI_memory_key mi_key_memory_MI_SORT_PARAM_wordroot;
+extern PSI_memory_key mi_key_memory_SORT_FT_BUF;
+extern PSI_memory_key mi_key_memory_SORT_KEY_BLOCKS;
+extern PSI_memory_key mi_key_memory_filecopy;
+extern PSI_memory_key mi_key_memory_SORT_INFO_buffer;
+extern PSI_memory_key mi_key_memory_MI_DECODE_TREE;
+extern PSI_memory_key mi_key_memory_MYISAM_SHARE_decode_tables;
+extern PSI_memory_key mi_key_memory_preload_buffer;
+extern PSI_memory_key mi_key_memory_stPageList_pages;
+extern PSI_memory_key mi_key_memory_keycache_thread_var;
+
 C_MODE_END
diff --git a/storage/myisam/myisamlog.c b/storage/myisam/myisamlog.c
index 9bef2be929f..40d473dc532 100644
--- a/storage/myisam/myisamlog.c
+++ b/storage/myisam/myisamlog.c
@@ -422,7 +422,7 @@ static int examine_log(char * file_name, char **table_names)
        * The additional space is needed for the sprintf commands two lines
        * below.
        */ 
-      file_info.show_name=my_memdup(isam_file_name,
+      file_info.show_name=my_memdup(PSI_NOT_INSTRUMENTED, isam_file_name,
 				    (uint) strlen(isam_file_name)+10,
 				    MYF(MY_WME));
       if (file_info.id > 1)
@@ -451,8 +451,8 @@ static int examine_log(char * file_name, char **table_names)
 	if (!(file_info.isam= mi_open(isam_file_name,O_RDWR,
 				      HA_OPEN_WAIT_IF_LOCKED)))
 	  goto com_err;
-	if (!(file_info.record=my_malloc(file_info.isam->s->base.reclength,
-					 MYF(MY_WME))))
+	if (!(file_info.record=my_malloc(PSI_NOT_INSTRUMENTED,
+                              file_info.isam->s->base.reclength, MYF(MY_WME))))
 	  goto end;
 	files_open++;
 	file_info.closed=0;
@@ -683,7 +683,7 @@ static int read_string(IO_CACHE *file, register uchar* *to, register uint length
 
   if (*to)
     my_free(*to);
-  if (!(*to= (uchar*) my_malloc(length+1,MYF(MY_WME))) ||
+  if (!(*to= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, length+1,MYF(MY_WME))) ||
       my_b_read(file,(uchar*) *to,length))
   {
     if (*to)
diff --git a/storage/myisam/myisampack.c b/storage/myisam/myisampack.c
index 8a0ca759871..d6cd9334a55 100644
--- a/storage/myisam/myisampack.c
+++ b/storage/myisam/myisampack.c
@@ -316,12 +316,13 @@ static void usage(void)
 
 
 static my_bool
-get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
-	       char *argument)
+get_one_option(const struct my_option *opt,
+	       const char *argument,
+               const char *filename __attribute__((unused)))
 {
   uint length;
 
-  switch(optid) {
+  switch(opt->id) {
   case 'f':
     force_pack= 1;
     tmpfile_createflag= O_RDWR | O_TRUNC;
@@ -439,7 +440,7 @@ static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
   uint i,j;
   mrg->count=0;
   mrg->current=0;
-  mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
+  mrg->file=(MI_INFO**) my_malloc(PSI_NOT_INSTRUMENTED, sizeof(MI_INFO*)*count,MYF(MY_FAE));
   mrg->free_file=1;
   mrg->src_file_has_indexes_disabled= 0;
   for (i=0; i < count ; i++)
@@ -518,7 +519,7 @@ static int compress(PACK_MRG_INFO *mrg,char *result_table)
 	< 0)
       goto err;
     length=(uint) share->base.keystart;
-    if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
+    if (!(buff= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, length,MYF(MY_WME))))
       goto err;
     if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
 	my_write(join_isam_file,buff,length,
@@ -798,7 +799,7 @@ static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
 {
   reg2 uint i;
   reg1 HUFF_COUNTS *count;
-  if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
+  if ((count = (HUFF_COUNTS*) my_malloc(PSI_NOT_INSTRUMENTED, info->s->base.fields*
 					sizeof(HUFF_COUNTS),
 					MYF(MY_ZEROFILL | MY_WME))))
   {
@@ -825,7 +826,7 @@ static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
 		NULL, MYF(0));
       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
 	count[i].tree_pos=count[i].tree_buff =
-	  my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
+	  my_malloc(PSI_NOT_INSTRUMENTED, count[i].field_length > 1 ? tree_buff_length : 2,
 		    MYF(MY_WME));
     }
   }
@@ -1476,7 +1477,7 @@ static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
   HUFF_TREE *huff_tree;
   DBUG_ENTER("make_huff_trees");
 
-  if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
+  if (!(huff_tree=(HUFF_TREE*) my_malloc(PSI_NOT_INSTRUMENTED, trees*sizeof(HUFF_TREE),
 					 MYF(MY_WME | MY_ZEROFILL))))
     DBUG_RETURN(0);
 
@@ -1554,14 +1555,14 @@ static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
   if (!huff_tree->element_buffer)
   {
     if (!(huff_tree->element_buffer=
-	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
+	 (HUFF_ELEMENT*) my_malloc(PSI_NOT_INSTRUMENTED, found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
       return 1;
   }
   else
   {
     HUFF_ELEMENT *temp;
     if (!(temp=
-	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
+	  (HUFF_ELEMENT*) my_realloc(PSI_NOT_INSTRUMENTED, (uchar*) huff_tree->element_buffer,
 				     found*2*sizeof(HUFF_ELEMENT),
 				     MYF(MY_WME))))
       return 1;
@@ -1930,7 +1931,7 @@ static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
     {
       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
       if (!(huff_tree->code =
-            (ulonglong*) my_malloc(elements*
+            (ulonglong*) my_malloc(PSI_NOT_INSTRUMENTED, elements*
                                    (sizeof(ulonglong) + sizeof(uchar)),
                                    MYF(MY_WME | MY_ZEROFILL))))
 	return 1;
@@ -2823,7 +2824,7 @@ static char *make_old_name(char *new_name, char *old_name)
 static void init_file_buffer(File file, pbool read_buffer)
 {
   file_buffer.file=file;
-  file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
+  file_buffer.buffer= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, ALIGN_SIZE(RECORD_CACHE_SIZE),
 					 MYF(MY_WME));
   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
   file_buffer.pos_in_file=0;
@@ -2880,7 +2881,7 @@ static int flush_buffer(ulong neaded_length)
   {
     char *tmp;
     neaded_length+=256;				/* some margin */
-    tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
+    tmp= my_realloc(PSI_NOT_INSTRUMENTED, (char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
     if (!tmp)
       return 1;
     file_buffer.pos= ((uchar*) tmp +
diff --git a/storage/myisam/rt_index.c b/storage/myisam/rt_index.c
index 08543ec2b22..651e2e79478 100644
--- a/storage/myisam/rt_index.c
+++ b/storage/myisam/rt_index.c
@@ -733,8 +733,11 @@ static int rtree_fill_reinsert_list(stPageList *ReinsertList, my_off_t page,
   if (ReinsertList->n_pages == ReinsertList->m_pages)
   {
     ReinsertList->m_pages += REINSERT_BUFFER_INC;
-    if (!(ReinsertList->pages = (stPageLevel*)my_realloc((uchar*)ReinsertList->pages, 
-      ReinsertList->m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR))))
+    if (!(ReinsertList->pages = (stPageLevel*)
+          my_realloc(mi_key_memory_stPageList_pages,
+                     (uchar*)ReinsertList->pages,
+                     ReinsertList->m_pages * sizeof(stPageLevel),
+                     MYF(MY_ALLOW_ZERO_PTR))))
       goto err1;
   }
   /* save page to ReinsertList */
diff --git a/storage/myisam/rt_test.c b/storage/myisam/rt_test.c
index 62c9539eb66..a35d41a0025 100644
--- a/storage/myisam/rt_test.c
+++ b/storage/myisam/rt_test.c
@@ -112,6 +112,7 @@ static int run_test(const char *filename)
   uchar read_record[MAX_REC_LENGTH];
   int upd= 10;
   ha_rows hrows;
+  page_range pages;
 
   bzero(&uniquedef, sizeof(uniquedef));
   bzero(&create_info, sizeof(create_info));
@@ -332,7 +333,7 @@ static int run_test(const char *filename)
   range.key= record+1;
   range.length= 1000;                           /* Big enough */
   range.flag= HA_READ_MBR_INTERSECT;
-  hrows= mi_records_in_range(file, 0, &range, (key_range*) 0);
+  hrows= mi_records_in_range(file, 0, &range, (key_range*) 0, &pages);
   printf("     %ld rows\n", (long) hrows);
 
   if (mi_close(file)) goto err;
diff --git a/storage/myisam/sort.c b/storage/myisam/sort.c
index e586543363b..6fe38a3fd04 100644
--- a/storage/myisam/sort.c
+++ b/storage/myisam/sort.c
@@ -190,11 +190,12 @@ int _create_index_by_sort(MI_SORT_PARAM *info,my_bool no_messages,
     }
 
     if ((sort_keys= ((uchar **)
-                     my_malloc((size_t) (keys*(sort_length+sizeof(char*))+
+                     my_malloc(PSI_INSTRUMENT_ME,
+                               (size_t) (keys*(sort_length+sizeof(char*))+
                                          HA_FT_MAXBYTELEN), MYF(0)))))
     {
-      if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer,
-                                MY_MIN(maxbuffer/2, 1000), MYF(0)))
+      if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &buffpek, sizeof(BUFFPEK),
+                                maxbuffer, MY_MIN(maxbuffer/2, 1000), MYF(0)))
       {
 	my_free(sort_keys);
         sort_keys= 0;
@@ -406,12 +407,14 @@ static my_bool thr_find_all_keys_exec(MI_SORT_PARAM *sort_param)
       }
       while ((maxbuffer= (uint) (idx/(keys-1)+1)) != maxbuffer_org);
     }
-    if ((sort_keys= (uchar**) my_malloc((size_t)(keys * (sort_length + sizeof(char*)) +
+    if ((sort_keys= (uchar**) my_malloc(PSI_INSTRUMENT_ME,
+                    (size_t)(keys * (sort_length + sizeof(char*)) +
                    ((sort_param->keyinfo->flag & HA_FULLTEXT) ?
                     HA_FT_MAXBYTELEN : 0)), MYF(0))))
     {
-      if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK),
-                                maxbuffer, MY_MIN(maxbuffer / 2, 1000), MYF(0)))
+      if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &sort_param->buffpek,
+                                sizeof(BUFFPEK), maxbuffer,
+                                MY_MIN(maxbuffer / 2, 1000), MYF(0)))
       {
         my_free(sort_keys);
         sort_keys= NULL;          /* Safety against double free on error. */
@@ -607,7 +610,8 @@ int thr_write_keys(MI_SORT_PARAM *sort_param)
         length=param->sort_buffer_length;
         while (length >= MIN_SORT_BUFFER)
         {
-          if ((mergebuf= my_malloc((size_t) length, MYF(0))))
+          if ((mergebuf= my_malloc(PSI_INSTRUMENT_ME,
+                                   (size_t) length, MYF(0))))
               break;
           length=length*3/4;
         }
diff --git a/storage/myisam/sp_test.c b/storage/myisam/sp_test.c
index 368c9dc2d5a..a33a6c8e743 100644
--- a/storage/myisam/sp_test.c
+++ b/storage/myisam/sp_test.c
@@ -70,6 +70,7 @@ int run_test(const char *filename)
   uchar read_record[MAX_REC_LENGTH];
   int upd=10;
   ha_rows hrows;
+  page_range pages;
   
   /* Define a column for NULLs and DEL markers*/
   
@@ -256,7 +257,7 @@ int run_test(const char *filename)
   max_range.key= record+1;
   max_range.length= 1000;                       /* Big enough */
   max_range.flag= HA_READ_KEY_EXACT;
-  hrows= mi_records_in_range(file, 0, &min_range, &max_range);
+  hrows= mi_records_in_range(file, 0, &min_range, &max_range, &pages);
   printf("     %ld rows\n", (long) hrows);
 
   if (mi_close(file)) goto err;
diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc
index 14036a31b8c..2a7ca8d51f8 100644
--- a/storage/myisammrg/ha_myisammrg.cc
+++ b/storage/myisammrg/ha_myisammrg.cc
@@ -120,7 +120,7 @@ static handler *myisammrg_create_handler(handlerton *hton,
 ha_myisammrg::ha_myisammrg(handlerton *hton, TABLE_SHARE *table_arg)
   :handler(hton, table_arg), file(0), is_cloned(0)
 {
-  init_sql_alloc(&children_mem_root, "ha_myisammrg",
+  init_sql_alloc(rg_key_memory_children, &children_mem_root,
                  FN_REFLEN + ALLOC_ROOT_MIN_BLOCK_SIZE, 0, MYF(0));
 }
 
@@ -1216,11 +1216,14 @@ void ha_myisammrg::position(const uchar *record)
 }
 
 
-ha_rows ha_myisammrg::records_in_range(uint inx, key_range *min_key,
-                                       key_range *max_key)
+ha_rows ha_myisammrg::records_in_range(uint inx,
+                                       const key_range *min_key,
+                                       const key_range *max_key,
+                                       page_range *pages)
 {
   DBUG_ASSERT(this->file->children_attached);
-  return (ha_rows) myrg_records_in_range(file, (int) inx, min_key, max_key);
+  return (ha_rows) myrg_records_in_range(file, (int) inx, min_key, max_key,
+                                         pages);
 }
 
 
diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h
index b7cbd6c7d12..6da327ec84b 100644
--- a/storage/myisammrg/ha_myisammrg.h
+++ b/storage/myisammrg/ha_myisammrg.h
@@ -68,7 +68,7 @@ public:
 };
 
 
-class ha_myisammrg: public handler
+class ha_myisammrg final : public handler
 {
   MYRG_INFO *file;
   my_bool is_cloned;                    /* This instance has been cloned */
@@ -129,7 +129,8 @@ public:
   int rnd_next(uchar *buf);
   int rnd_pos(uchar * buf, uchar *pos);
   void position(const uchar *record);
-  ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key);
+  ha_rows records_in_range(uint inx, const key_range *start_key,
+                           const key_range *end_key, page_range *pages);
   int delete_all_rows();
   int info(uint);
   int reset(void);
diff --git a/storage/myisammrg/myrg_def.h b/storage/myisammrg/myrg_def.h
index 9ef65f220be..8bb79a73127 100644
--- a/storage/myisammrg/myrg_def.h
+++ b/storage/myisammrg/myrg_def.h
@@ -32,12 +32,13 @@ extern "C"
 #endif
 void myrg_print_wrong_table(const char *table_name);
 
-#ifdef HAVE_PSI_INTERFACE
+/* Always defined */
+extern PSI_memory_key rg_key_memory_MYRG_INFO;
 
 C_MODE_START
 extern PSI_mutex_key rg_key_mutex_MYRG_INFO_mutex;
+extern PSI_memory_key rg_key_memory_children;
 extern PSI_file_key rg_key_file_MRG;
 void init_myisammrg_psi_keys();
 C_MODE_END
-#endif /* HAVE_PSI_INTERFACE */
 
diff --git a/storage/myisammrg/myrg_open.c b/storage/myisammrg/myrg_open.c
index 903b4796e69..d9ea4b754f2 100644
--- a/storage/myisammrg/myrg_open.c
+++ b/storage/myisammrg/myrg_open.c
@@ -111,7 +111,8 @@ MYRG_INFO *myrg_open(const char *name, int mode, int handle_locking)
     if (!m_info)                                /* First file */
     {
       key_parts=isam->s->base.key_parts;
-      if (!(m_info= (MYRG_INFO*) my_malloc(sizeof(MYRG_INFO) +
+      if (!(m_info= (MYRG_INFO*) my_malloc(rg_key_memory_MYRG_INFO,
+                                           sizeof(MYRG_INFO) +
                                            files*sizeof(MYRG_TABLE) +
                                            key_parts*sizeof(long),
                                            MYF(MY_WME|MY_ZEROFILL))))
@@ -152,7 +153,8 @@ MYRG_INFO *myrg_open(const char *name, int mode, int handle_locking)
 
   if (bad_children)
     goto bad_children;
-  if (!m_info && !(m_info= (MYRG_INFO*) my_malloc(sizeof(MYRG_INFO),
+  if (!m_info && !(m_info= (MYRG_INFO*) my_malloc(rg_key_memory_MYRG_INFO,
+                                                  sizeof(MYRG_INFO),
                                                   MYF(MY_WME | MY_ZEROFILL))))
     goto err;
   /* Don't mark table readonly, for ALTER TABLE ... UNION=(...) to work */
@@ -288,7 +290,8 @@ MYRG_INFO *myrg_parent_open(const char *parent_name,
   }
 
   /* Allocate MERGE parent table structure. */
-  if (!(m_info= (MYRG_INFO*) my_malloc(sizeof(MYRG_INFO) +
+  if (!(m_info= (MYRG_INFO*) my_malloc(rg_key_memory_MYRG_INFO,
+                                       sizeof(MYRG_INFO) +
                                        child_count * sizeof(MYRG_TABLE),
                                        MYF(MY_WME | MY_ZEROFILL))))
     goto err; /* purecov: inspected */
@@ -438,7 +441,8 @@ int myrg_attach_children(MYRG_INFO *m_info, int handle_locking,
       if (!m_info->rec_per_key_part)
       {
         if(!(m_info->rec_per_key_part= (ulong*)
-             my_malloc(key_parts * sizeof(long), MYF(MY_WME))))
+             my_malloc(rg_key_memory_MYRG_INFO,
+                       key_parts * sizeof(long), MYF(MY_WME))))
           goto err; /* purecov: inspected */
         errpos= 1;
       }
diff --git a/storage/myisammrg/myrg_range.c b/storage/myisammrg/myrg_range.c
index 893bda20833..da5e2c38d68 100644
--- a/storage/myisammrg/myrg_range.c
+++ b/storage/myisammrg/myrg_range.c
@@ -17,14 +17,21 @@
 #include "myrg_def.h"
 
 ha_rows myrg_records_in_range(MYRG_INFO *info, int inx,
-                              key_range *min_key, key_range *max_key)
+                              const key_range *min_key,
+                              const key_range *max_key,
+                              page_range *pages)
 {
   ha_rows records=0, res;
   MYRG_TABLE *table;
+  page_range ignore_pages;
+
+  /* Report pages only when exactly one table is open; otherwise discard them */
+  if (info->open_tables +1 != info->end_table)
+    pages= &ignore_pages;
 
   for (table=info->open_tables ; table != info->end_table ; table++)
   {
-    res= mi_records_in_range(table->table, inx, min_key, max_key);
+    res= mi_records_in_range(table->table, inx, min_key, max_key, pages);
     if (res == HA_POS_ERROR)
       return HA_POS_ERROR; 
     if (records > HA_POS_ERROR - res)
diff --git a/storage/myisammrg/myrg_static.c b/storage/myisammrg/myrg_static.c
index a2f5d074c9e..36ec25cb7d9 100644
--- a/storage/myisammrg/myrg_static.c
+++ b/storage/myisammrg/myrg_static.c
@@ -29,6 +29,9 @@ static const char *merge_insert_methods[] =
 TYPELIB merge_insert_method= { array_elements(merge_insert_methods)-1,"",
 			       merge_insert_methods, 0};
 
+PSI_memory_key rg_key_memory_MYRG_INFO;
+PSI_memory_key rg_key_memory_children;
+
 #ifdef HAVE_PSI_INTERFACE
 PSI_mutex_key rg_key_mutex_MYRG_INFO_mutex;
 
@@ -44,6 +47,12 @@ static PSI_file_info all_myisammrg_files[]=
   { &rg_key_file_MRG, "MRG", 0}
 };
 
+static PSI_memory_info all_myisammrg_memory[]=
+{
+  { &rg_key_memory_MYRG_INFO, "MYRG_INFO", 0},
+  { &rg_key_memory_children, "children", 0}
+};
+
 void init_myisammrg_psi_keys()
 {
   const char* category= "myisammrg";
@@ -54,6 +63,9 @@ void init_myisammrg_psi_keys()
 
   count= array_elements(all_myisammrg_files);
   mysql_file_register(category, all_myisammrg_files, count);
+
+  count= array_elements(all_myisammrg_memory);
+  mysql_memory_register(category, all_myisammrg_memory, count);
 }
 #endif /* HAVE_PSI_INTERFACE */
 
diff --git a/storage/oqgraph/ha_oqgraph.cc b/storage/oqgraph/ha_oqgraph.cc
index e0e81f7cddc..11bb139fd55 100644
--- a/storage/oqgraph/ha_oqgraph.cc
+++ b/storage/oqgraph/ha_oqgraph.cc
@@ -179,7 +179,6 @@ static int oqgraph_init(void *p)
   handlerton *hton= (handlerton *)p;
   DBUG_PRINT( "oq-debug", ("oqgraph_init"));
 
-  hton->state= SHOW_OPTION_YES;
   hton->db_type= DB_TYPE_AUTOASSIGN;
   hton->create= oqgraph_create_handler;
   hton->flags= HTON_ALTER_NOT_SUPPORTED;
@@ -193,6 +192,7 @@ static int oqgraph_init(void *p)
   hton->discover_table_structure= oqgraph_discover_table_structure;
 
   hton->close_connection = oqgraph_close_connection;
+  hton->drop_table= [](handlerton *, const char*) { return -1; };
 
   oqgraph_init_done= TRUE;
   return 0;
@@ -563,11 +563,11 @@ int ha_oqgraph::open(const char *name, int mode, uint test_if_locked)
   init_tmp_table_share( thd, share, table->s->db.str, table->s->db.length, options->table_name, "");
   // because of that, we need to reinitialize the memroot (to reset MY_THREAD_SPECIFIC flag)
   DBUG_ASSERT(share->mem_root.used == NULL); // it's still empty
-  init_sql_alloc(&share->mem_root, "share", TABLE_ALLOC_BLOCK_SIZE, 0, MYF(0));
+  init_sql_alloc(PSI_INSTRUMENT_ME, &share->mem_root, TABLE_ALLOC_BLOCK_SIZE, 0, MYF(0));
 
   // What I think this code is doing:
   // * Our OQGRAPH table is `database_blah/name`
-  // * We point p --> /name (or if table happened to be simply `name`, to `name`, dont know if this is possible)
+  // * We point p --> /name (or if table happened to be simply `name`, to `name`, don't know if this is possible)
   // * plen seems to be then set to length of `database_blah/options_data_table_name`
   // * then we set share->normalized_path.str and share->path.str to `database_blah/options_data_table_name`
   // * I assume that this verbiage is needed so  the memory used by share->path.str is set in the share mem root
@@ -1186,8 +1186,10 @@ int ha_oqgraph::rename_table(const char *, const char *)
 }
 
 
-ha_rows ha_oqgraph::records_in_range(uint inx, key_range *min_key,
-                                  key_range *max_key)
+ha_rows ha_oqgraph::records_in_range(uint inx,
+                                     const key_range *min_key,
+                                     const key_range *max_key,
+                                     page_range *pages)
 {
   if (graph->get_thd() != current_thd) {
     DBUG_PRINT( "oq-debug", ("g->table->in_use: 0x%lx <-- current_thd 0x%lx", (long) graph->get_thd(), (long) current_thd));
diff --git a/storage/oqgraph/ha_oqgraph.h b/storage/oqgraph/ha_oqgraph.h
index 0c0af6def97..c8e175df616 100644
--- a/storage/oqgraph/ha_oqgraph.h
+++ b/storage/oqgraph/ha_oqgraph.h
@@ -99,7 +99,8 @@ public:
   int extra(enum ha_extra_function operation);
   int external_lock(THD *thd, int lock_type);
   int delete_all_rows(void);
-  ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key);
+  ha_rows records_in_range(uint inx, const key_range *min_key,
+                           const key_range *max_key, page_range *pages);
   int delete_table(const char *from);
   int rename_table(const char * from, const char * to);
   int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);
diff --git a/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.result b/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.result
index 7cb65bc07ea..de8362c16c4 100644
--- a/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.result
+++ b/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.result
@@ -142,7 +142,7 @@ SELECT * FROM graph WHERE latch='-1' and origid is NULL;
 latch	origid	destid	weight	seq	linkid
 Warnings:
 Warning	1210	Incorrect arguments to OQGRAPH latch
-# Make sure we dont crash if someone passed in a UTF string
+# Make sure we don't crash if someone passed in a UTF string
 SELECT * FROM graph WHERE latch='Ω Ohms Tennis Ball 〄';
 latch	origid	destid	weight	seq	linkid
 SELECT * FROM graph WHERE latch='Ω Ohms Tennis Ball 〄' and destid=2 and origid=1;
diff --git a/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.test b/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.test
index a6dae0e2678..9eea290c6b9 100644
--- a/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.test
+++ b/storage/oqgraph/mysql-test/oqgraph/boundary_conditions.test
@@ -91,7 +91,7 @@ SELECT * FROM graph WHERE latch='-1' and destid=1;
 SELECT * FROM graph WHERE latch='-1' and origid=666;
 SELECT * FROM graph WHERE latch='-1' and origid is NULL;
 
---echo # Make sure we dont crash if someone passed in a UTF string
+--echo # Make sure we don't crash if someone passed in a UTF string
 #-- Note the next line couter-intuitively produces no warning
 SELECT * FROM graph WHERE latch='Ω Ohms Tennis Ball 〄';
 SELECT * FROM graph WHERE latch='Ω Ohms Tennis Ball 〄' and destid=2 and origid=1;
@@ -125,7 +125,7 @@ FLUSH TABLES;
 
 TRUNCATE TABLE graph_base;
 #-- Uncomment the following after fixing https://bugs.launchpad.net/oqgraph/+bug/xxxxxxx - Causes the later select to not fail!
-#-- For now dont report a separate bug as it may be a manifestation of https://bugs.launchpad.net/oqgraph/+bug/1195735
+#-- For now don't report a separate bug as it may be a manifestation of https://bugs.launchpad.net/oqgraph/+bug/1195735
 SELECT * FROM graph;
 
 #-- Expect error if we pull the table out from under
diff --git a/storage/oqgraph/mysql-test/oqgraph/create_attr_legacy.test b/storage/oqgraph/mysql-test/oqgraph/create_attr_legacy.test
index ba1d9791367..7fe58d3e307 100644
--- a/storage/oqgraph/mysql-test/oqgraph/create_attr_legacy.test
+++ b/storage/oqgraph/mysql-test/oqgraph/create_attr_legacy.test
@@ -26,7 +26,7 @@ CREATE TABLE backing (
 # Here we enable scaffolding to let us create a deprecated table
 # so we can check that the new code will still allow queries to be performed
 # on a legacy database
-# It should still generate a warning (1287) - but I dont know how to test for that
+# It should still generate a warning (1287) - but I don't know how to test for that
 #
 #   latch SMALLINT UNSIGNED NULL' is deprecated and will be removed in a future
 #   release. Please use 'latch VARCHAR(32) NULL' instead
diff --git a/storage/oqgraph/mysql-test/oqgraph/invalid_operations.result b/storage/oqgraph/mysql-test/oqgraph/invalid_operations.result
index 49639c278d0..930d65afc53 100644
--- a/storage/oqgraph/mysql-test/oqgraph/invalid_operations.result
+++ b/storage/oqgraph/mysql-test/oqgraph/invalid_operations.result
@@ -30,3 +30,14 @@ update graph set origid=123;
 ERROR HY000: Table 'graph' is read only
 DROP TABLE graph_base;
 DROP TABLE graph;
+#
+# End of 10.0 tests
+#
+#
+# MDEV-25373 DROP TABLE doesn't raise error while dropping non-existing table in MariaDB 10.5.9 when OQGraph SE is loaded to the server
+#
+drop table foobar;
+ERROR 42S02: Unknown table 'test.foobar'
+#
+# End of 10.5 tests
+#
diff --git a/storage/oqgraph/mysql-test/oqgraph/invalid_operations.test b/storage/oqgraph/mysql-test/oqgraph/invalid_operations.test
index cab99ec5018..24351cc9998 100644
--- a/storage/oqgraph/mysql-test/oqgraph/invalid_operations.test
+++ b/storage/oqgraph/mysql-test/oqgraph/invalid_operations.test
@@ -48,3 +48,16 @@ update graph set origid=123;
 DROP TABLE graph_base;
 DROP TABLE graph;
 
+--echo #
+--echo # End of 10.0 tests
+--echo #
+
+--echo #
+--echo # MDEV-25373 DROP TABLE doesn't raise error while dropping non-existing table in MariaDB 10.5.9 when OQGraph SE is loaded to the server
+--echo #
+--error 1051
+drop table foobar;
+
+--echo #
+--echo # End of 10.5 tests
+--echo #
diff --git a/storage/oqgraph/mysql-test/oqgraph/legacy_upgrade.test b/storage/oqgraph/mysql-test/oqgraph/legacy_upgrade.test
index f7fc79340ce..ae548b5e440 100644
--- a/storage/oqgraph/mysql-test/oqgraph/legacy_upgrade.test
+++ b/storage/oqgraph/mysql-test/oqgraph/legacy_upgrade.test
@@ -13,7 +13,7 @@ CREATE TABLE graph_base (
 # Backwards compatibility test
 # First we ensure the scaffolding is disabled (default situation)
 # and check we cant create a table with an integer latch
-# Assume this is the default, so dont explicity set false yet:
+# Assume this is the default, so don't explicitly set false yet:
 # SET GLOBAL oqgraph_allow_create_integer_latch=false;
 --echo The next error 140 + 1005 is expected
 --error 140
@@ -32,7 +32,7 @@ CREATE TABLE graph (
 # Here we enable scaffolding to let us create a deprecated table
 # so we can check that the new code will still allow queries to be performed
 # on a legacy database
-# It should still generate a warning (1287) - but I dont know how to test for that
+# It should still generate a warning (1287) - but I don't know how to test for that
 #
 #   latch SMALLINT UNSIGNED NULL' is deprecated and will be removed in a future
 #   release. Please use 'latch VARCHAR(32) NULL' instead
diff --git a/storage/perfschema/CMakeLists.txt b/storage/perfschema/CMakeLists.txt
index 7ed365ea175..59a110103b4 100644
--- a/storage/perfschema/CMakeLists.txt
+++ b/storage/perfschema/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2009, 2021, Oracle and/or its affiliates.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License, version 2.0,
@@ -23,10 +23,19 @@
 INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include
                     ${CMAKE_SOURCE_DIR}/sql
                     ${CMAKE_BINARY_DIR}/sql
+                    ${CMAKE_CURRENT_BINARY_DIR}
                     ${PCRE_INCLUDES}
                     ${SSL_INCLUDE_DIRS})
 
 ADD_DEFINITIONS(-DMYSQL_SERVER)
+IF (SSL_DEFINES)
+  ADD_DEFINITIONS(${SSL_DEFINES})
+ENDIF()
+
+IF(CMAKE_SYSTEM_NAME MATCHES AIX)
+  # Workaround linker bug on AIX
+  SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-berok")
+ENDIF()
 
 #
 # Maintainer: keep this list sorted, to avoid merge collisions.
@@ -41,27 +50,35 @@ cursor_by_user.h
 pfs.h
 pfs_account.h
 pfs_atomic.h
+pfs_buffer_container.h
+pfs_builtin_memory.h
 pfs_column_types.h
 pfs_column_values.h
 pfs_con_slice.h
 pfs_defaults.h
 pfs_digest.h
+pfs_program.h
+pfs_prepared_stmt.h
 pfs_engine_table.h
 pfs_events.h
 pfs_events_stages.h
 pfs_events_statements.h
+pfs_events_transactions.h
 pfs_events_waits.h
 pfs_global.h
 pfs_host.h
 pfs_instr.h
 pfs_instr_class.h
 pfs_lock.h
+pfs_memory.h
 pfs_server.h
 pfs_setup_actor.h
 pfs_setup_object.h
 pfs_stat.h
+pfs_status.h
 pfs_timer.h
 pfs_user.h
+pfs_variable.h
 pfs_visitor.h
 table_accounts.h
 table_all_instr.h
@@ -73,11 +90,19 @@ table_esgs_global_by_event_name.h
 table_esms_by_account_by_event_name.h
 table_esms_by_host_by_event_name.h
 table_esms_by_digest.h
+table_esms_by_program.h
+table_prepared_stmt_instances.h
 table_esms_by_thread_by_event_name.h
 table_esms_by_user_by_event_name.h
 table_esms_global_by_event_name.h
+table_ets_by_account_by_event_name.h
+table_ets_by_host_by_event_name.h
+table_ets_by_thread_by_event_name.h
+table_ets_by_user_by_event_name.h
+table_ets_global_by_event_name.h
 table_events_stages.h
 table_events_statements.h
+table_events_transactions.h
 table_events_waits.h
 table_events_waits_summary.h
 table_ews_by_account_by_event_name.h
@@ -86,6 +111,12 @@ table_ews_by_thread_by_event_name.h
 table_ews_by_user_by_event_name.h
 table_ews_global_by_event_name.h
 table_file_instances.h
+table_md_locks.h
+table_mems_global_by_event_name.h
+table_mems_by_account_by_event_name.h
+table_mems_by_host_by_event_name.h
+table_mems_by_thread_by_event_name.h
+table_mems_by_user_by_event_name.h
 table_file_summary_by_instance.h
 table_file_summary_by_event_name.h
 table_socket_instances.h
@@ -102,40 +133,68 @@ table_setup_instruments.h
 table_setup_objects.h
 table_setup_timers.h
 table_sync_instances.h
+table_status_by_account.h
+table_status_by_host.h
+table_status_by_thread.h
+table_status_by_user.h
+table_global_status.h
+table_session_status.h
+table_variables_by_thread.h
+table_global_variables.h
+table_session_variables.h
+table_table_handles.h
 table_threads.h
 table_tiws_by_index_usage.h
 table_tiws_by_table.h
 table_tlws_by_table.h
 table_users.h
+table_uvar_by_thread.h
 cursor_by_thread_connect_attr.h
 table_session_connect.h
 table_session_connect_attrs.h
 table_session_account_connect_attrs.h
+table_replication_connection_configuration.h
+table_replication_group_members.h
+table_replication_connection_status.h
+table_replication_applier_configuration.h
+table_replication_applier_status.h
+table_replication_applier_status_by_coordinator.h
+table_replication_applier_status_by_worker.h
+table_replication_group_member_stats.h
 cursor_by_account.cc
 cursor_by_host.cc
 cursor_by_thread.cc
 cursor_by_user.cc
 ha_perfschema.cc
+mysqld_thd_manager.cc
 pfs.cc
 pfs_account.cc
 pfs_autosize.cc
+pfs_buffer_container.cc
+pfs_builtin_memory.cc
 pfs_column_values.cc
 pfs_con_slice.cc
 pfs_defaults.cc
 pfs_digest.cc
+pfs_program.cc
+pfs_prepared_stmt.cc
 pfs_engine_table.cc
 pfs_events_stages.cc
 pfs_events_statements.cc
+pfs_events_transactions.cc
 pfs_events_waits.cc
 pfs_global.cc
 pfs_host.cc
 pfs_instr.cc
 pfs_instr_class.cc
+pfs_memory.cc
 pfs_server.cc
 pfs_setup_actor.cc
 pfs_setup_object.cc
+pfs_status.cc
 pfs_timer.cc
 pfs_user.cc
+pfs_variable.cc
 pfs_visitor.cc
 table_accounts.cc
 table_all_instr.cc
@@ -147,11 +206,19 @@ table_esgs_global_by_event_name.cc
 table_esms_by_account_by_event_name.cc
 table_esms_by_host_by_event_name.cc
 table_esms_by_digest.cc
+table_esms_by_program.cc
+table_prepared_stmt_instances.cc
 table_esms_by_thread_by_event_name.cc
 table_esms_by_user_by_event_name.cc
 table_esms_global_by_event_name.cc
+table_ets_by_account_by_event_name.cc
+table_ets_by_host_by_event_name.cc
+table_ets_by_thread_by_event_name.cc
+table_ets_by_user_by_event_name.cc
+table_ets_global_by_event_name.cc
 table_events_stages.cc
 table_events_statements.cc
+table_events_transactions.cc
 table_events_waits.cc
 table_events_waits_summary.cc
 table_ews_by_account_by_event_name.cc
@@ -160,6 +227,12 @@ table_ews_by_thread_by_event_name.cc
 table_ews_by_user_by_event_name.cc
 table_ews_global_by_event_name.cc
 table_file_instances.cc
+table_md_locks.cc
+table_mems_global_by_event_name.cc
+table_mems_by_account_by_event_name.cc
+table_mems_by_host_by_event_name.cc
+table_mems_by_thread_by_event_name.cc
+table_mems_by_user_by_event_name.cc
 table_file_summary_by_instance.cc
 table_file_summary_by_event_name.cc
 table_socket_instances.cc
@@ -176,22 +249,139 @@ table_setup_instruments.cc
 table_setup_objects.cc
 table_setup_timers.cc
 table_sync_instances.cc
+table_status_by_account.cc
+table_status_by_host.cc
+table_status_by_thread.cc
+table_status_by_user.cc
+table_global_status.cc
+table_session_status.cc
+#table_variables_by_thread.cc
+#table_global_variables.cc
+#table_session_variables.cc
+table_table_handles.cc
 table_threads.cc
 table_tiws_by_index_usage.cc
 table_tiws_by_table.cc
 table_tlws_by_table.cc
 table_users.cc
+table_uvar_by_thread.cc
 cursor_by_thread_connect_attr.cc
 table_session_connect.cc
 table_session_connect_attrs.cc
 table_session_account_connect_attrs.cc
+table_replication_connection_configuration.cc
+#table_replication_group_members.cc
+#table_replication_connection_status.cc
+table_replication_applier_configuration.cc
+table_replication_applier_status.cc
+table_replication_applier_status_by_coordinator.cc
+#table_replication_applier_status_by_worker.cc
+#table_replication_group_member_stats.cc
 )
 
-MYSQL_ADD_PLUGIN(perfschema ${PERFSCHEMA_SOURCES} STORAGE_ENGINE DEFAULT
-  STATIC_ONLY DEPENDS GenServerSource)
+# Check for pthread_threadid_np()
+CHECK_C_SOURCE_COMPILES("
+#include <pthread.h>
+int main(int ac, char **av)
+{
+  unsigned long long tid64;
+  pthread_threadid_np(NULL, &tid64);
+  return (tid64 != 0 ? 0 : 1);
+}"
+HAVE_PTHREAD_THREADID_NP)
+
+# gettid() library function (glibc-2.30+)
+CHECK_SYMBOL_EXISTS(gettid unistd.h HAVE_GETTID)
+
+# Check for gettid() system call
+CHECK_C_SOURCE_COMPILES("
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+int main(int ac, char **av)
+{
+  unsigned long long tid = syscall(SYS_gettid);
+  return (tid != 0 ? 0 : 1);
+}"
+HAVE_SYS_GETTID)
+
+# Check for getthrid()
+CHECK_C_SOURCE_COMPILES("
+#include <unistd.h>
+int main(int ac, char **av)
+{
+  unsigned long long tid = getthrid();
+  return (tid != 0 ? 0 : 1);
+}"
+HAVE_GETTHRID)
+
+# Check for pthread_getthreadid_np()
+CHECK_C_SOURCE_COMPILES("
+#include <pthread_np.h>
+int main(int ac, char **av)
+{
+  unsigned long long tid = pthread_getthreadid_np();
+  return (tid != 0 ? 0 : 1);
+}"
+HAVE_PTHREAD_GETTHREADID_NP)
+
+# Check for pthread_self() returning an integer type
+CHECK_C_SOURCE_COMPILES("
+#include <sys/types.h>
+#include <pthread.h>
+int main(int ac, char **av)
+{
+  unsigned long long tid = pthread_self();
+  return (tid != 0 ? 0 : 1);
+}"
+HAVE_INTEGER_PTHREAD_SELF
+FAIL_REGEX "warning: incompatible pointer to integer conversion"
+)
+
+CONFIGURE_FILE(pfs_config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/pfs_config.h)
 
+MYSQL_ADD_PLUGIN(perfschema ${PERFSCHEMA_SOURCES} STORAGE_ENGINE DEFAULT
+  STATIC_ONLY RECOMPILE_FOR_EMBEDDED DEPENDS GenServerSource)
 IF (TARGET perfschema)
   IF(WITH_UNIT_TESTS)
     ADD_SUBDIRECTORY(unittest)
   ENDIF(WITH_UNIT_TESTS)
 ENDIF(TARGET perfschema)
+
+# Only disable threads when building without *any* instrumentation,
+# as other instrumentations have a dependency on threads.
+OPTION(DISABLE_PSI_THREAD "Exclude the performance schema thread instrumentation" OFF)
+
+OPTION(DISABLE_PSI_MUTEX "Exclude the performance schema mutex instrumentation" OFF)
+OPTION(DISABLE_PSI_RWLOCK "Exclude the performance schema rwlock instrumentation" OFF)
+OPTION(DISABLE_PSI_COND "Exclude the performance schema condition instrumentation" OFF)
+OPTION(DISABLE_PSI_FILE "Exclude the performance schema file instrumentation" OFF)
+OPTION(DISABLE_PSI_TABLE "Exclude the performance schema table instrumentation" OFF)
+OPTION(DISABLE_PSI_SOCKET "Exclude the performance schema socket instrumentation" OFF)
+OPTION(DISABLE_PSI_STAGE "Exclude the performance schema stage instrumentation" OFF)
+OPTION(DISABLE_PSI_STATEMENT "Exclude the performance schema statement instrumentation" OFF)
+OPTION(DISABLE_PSI_SP "Exclude the performance schema stored procedure instrumentation" OFF)
+OPTION(DISABLE_PSI_PS "Exclude the performance schema prepared statements instances instrumentation" OFF)
+OPTION(DISABLE_PSI_IDLE "Exclude the performance schema idle instrumentation" OFF)
+OPTION(DISABLE_PSI_STATEMENT_DIGEST "Exclude the performance schema statement digest instrumentation" OFF)
+OPTION(DISABLE_PSI_METADATA "Exclude the performance schema metadata instrumentation" OFF)
+OPTION(DISABLE_PSI_MEMORY "Exclude the performance schema memory instrumentation" OFF)
+OPTION(DISABLE_PSI_TRANSACTION "Exclude the performance schema transaction instrumentation" OFF)
+
+MARK_AS_ADVANCED(DISABLE_PSI_THREAD)
+
+MARK_AS_ADVANCED(DISABLE_PSI_MUTEX)
+MARK_AS_ADVANCED(DISABLE_PSI_RWLOCK)
+MARK_AS_ADVANCED(DISABLE_PSI_COND)
+MARK_AS_ADVANCED(DISABLE_PSI_FILE)
+MARK_AS_ADVANCED(DISABLE_PSI_TABLE)
+MARK_AS_ADVANCED(DISABLE_PSI_SOCKET)
+MARK_AS_ADVANCED(DISABLE_PSI_STAGE)
+MARK_AS_ADVANCED(DISABLE_PSI_STATEMENT)
+MARK_AS_ADVANCED(DISABLE_PSI_SP)
+MARK_AS_ADVANCED(DISABLE_PSI_PS)
+MARK_AS_ADVANCED(DISABLE_PSI_IDLE)
+MARK_AS_ADVANCED(DISABLE_PSI_STATEMENT_DIGEST)
+MARK_AS_ADVANCED(DISABLE_PSI_METADATA)
+MARK_AS_ADVANCED(DISABLE_PSI_MEMORY)
+MARK_AS_ADVANCED(DISABLE_PSI_TRANSACTION)
diff --git a/storage/perfschema/cursor_by_account.cc b/storage/perfschema/cursor_by_account.cc
index f26318c42fc..38359cb24aa 100644
--- a/storage/perfschema/cursor_by_account.cc
+++ b/storage/perfschema/cursor_by_account.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -27,7 +27,13 @@
 
 #include "my_global.h"
 #include "cursor_by_account.h"
-#include "pfs_user.h"
+#include "pfs_buffer_container.h"
+
+ha_rows
+cursor_by_account::get_row_count(void)
+{
+  return global_account_container.get_row_count();
+}
 
 cursor_by_account::cursor_by_account(const PFS_engine_table_share *share)
   : PFS_engine_table(share, &m_pos),
@@ -44,17 +50,14 @@ int cursor_by_account::rnd_next(void)
 {
   PFS_account *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < account_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_account_iterator it= global_account_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &account_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -66,9 +69,9 @@ cursor_by_account::rnd_pos(const void *pos)
   PFS_account *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < account_max);
-  pfs= &account_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_account_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
diff --git a/storage/perfschema/cursor_by_account.h b/storage/perfschema/cursor_by_account.h
index c14a563b712..1f6d65adc98 100644
--- a/storage/perfschema/cursor_by_account.h
+++ b/storage/perfschema/cursor_by_account.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,6 +41,8 @@
 class cursor_by_account : public PFS_engine_table
 {
 public:
+  static ha_rows get_row_count();
+
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
   virtual void reset_position(void);
diff --git a/storage/perfschema/cursor_by_host.cc b/storage/perfschema/cursor_by_host.cc
index c3397234e2e..26bdbc457f8 100644
--- a/storage/perfschema/cursor_by_host.cc
+++ b/storage/perfschema/cursor_by_host.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -27,7 +27,13 @@
 
 #include "my_global.h"
 #include "cursor_by_host.h"
-#include "pfs_host.h"
+#include "pfs_buffer_container.h"
+
+/** Row count for this cursor, as reported by global_host_container. */
+ha_rows cursor_by_host::get_row_count(void)
+{
+  return global_host_container.get_row_count();
+}
 
 cursor_by_host::cursor_by_host(const PFS_engine_table_share *share)
   : PFS_engine_table(share, &m_pos),
@@ -42,19 +48,16 @@ void cursor_by_host::reset_position(void)
 
 int cursor_by_host::rnd_next(void)
 {
-  PFS_host *host;
+  PFS_host *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < host_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_host_iterator it= global_host_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    host= & host_array[m_pos.m_index];
-    if (host->m_lock.is_populated())
-    {
-      make_row(host);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -66,9 +69,9 @@ cursor_by_host::rnd_pos(const void *pos)
   PFS_host *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < host_max);
-  pfs= &host_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_host_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
diff --git a/storage/perfschema/cursor_by_host.h b/storage/perfschema/cursor_by_host.h
index ac68acf3945..45a2fb894c1 100644
--- a/storage/perfschema/cursor_by_host.h
+++ b/storage/perfschema/cursor_by_host.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,6 +41,8 @@
 class cursor_by_host : public PFS_engine_table
 {
 public:
+  static ha_rows get_row_count();
+
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
   virtual void reset_position(void);
diff --git a/storage/perfschema/cursor_by_thread.cc b/storage/perfschema/cursor_by_thread.cc
index afdc9010b1f..f8a4fa243ce 100644
--- a/storage/perfschema/cursor_by_thread.cc
+++ b/storage/perfschema/cursor_by_thread.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,6 +28,13 @@
 #include "my_global.h"
 #include "cursor_by_thread.h"
 #include "pfs_instr.h"
+#include "pfs_buffer_container.h"
+
+/** Row count for this cursor, as reported by global_thread_container. */
+ha_rows cursor_by_thread::get_row_count(void)
+{
+  return global_thread_container.get_row_count();
+}
 
 cursor_by_thread::cursor_by_thread(const PFS_engine_table_share *share)
   : PFS_engine_table(share, &m_pos),
@@ -44,17 +51,14 @@ int cursor_by_thread::rnd_next(void)
 {
   PFS_thread *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < thread_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_thread_iterator it= global_thread_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &thread_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -66,9 +70,9 @@ cursor_by_thread::rnd_pos(const void *pos)
   PFS_thread *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < thread_max);
-  pfs= &thread_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_thread_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
diff --git a/storage/perfschema/cursor_by_thread.h b/storage/perfschema/cursor_by_thread.h
index db130088920..24578fd44f1 100644
--- a/storage/perfschema/cursor_by_thread.h
+++ b/storage/perfschema/cursor_by_thread.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,6 +41,8 @@
 class cursor_by_thread : public PFS_engine_table
 {
 public:
+  static ha_rows get_row_count();
+
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
   virtual void reset_position(void);
diff --git a/storage/perfschema/cursor_by_thread_connect_attr.cc b/storage/perfschema/cursor_by_thread_connect_attr.cc
index 90a200b809a..f87ce7059fb 100644
--- a/storage/perfschema/cursor_by_thread_connect_attr.cc
+++ b/storage/perfschema/cursor_by_thread_connect_attr.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -22,6 +22,20 @@
 
 #include "my_global.h"
 #include "cursor_by_thread_connect_attr.h"
+#include "pfs_buffer_container.h"
+
+ha_rows cursor_by_thread_connect_attr::get_row_count(void)
+{
+  /*
+    This value is only an optimizer hint: the exact number of attributes
+    per thread is irrelevant, it just has to indicate "many per thread".
+    session_connect_attrs_size_per_thread counts bytes, not attributes,
+    and is deliberately reused here as the per-thread multiplier.
+  */
+  const auto thread_rows= global_thread_container.get_row_count();
+
+  return thread_rows * session_connect_attrs_size_per_thread;
+}
 
 cursor_by_thread_connect_attr::cursor_by_thread_connect_attr(
   const PFS_engine_table_share *share) :
@@ -31,14 +45,14 @@ cursor_by_thread_connect_attr::cursor_by_thread_connect_attr(
 int cursor_by_thread_connect_attr::rnd_next(void)
 {
   PFS_thread *thread;
+  bool has_more_thread= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_thread();
+       has_more_thread;
        m_pos.next_thread())
   {
-    thread= &thread_array[m_pos.m_index_1];
-
-    if (thread->m_lock.is_populated())
+    thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (thread != NULL)
     {
       make_row(thread, m_pos.m_index_2);
       if (m_row_exists)
@@ -48,6 +62,7 @@ int cursor_by_thread_connect_attr::rnd_next(void)
       }
     }
   }
+
   return HA_ERR_END_OF_FILE;
 }
 
@@ -57,15 +72,14 @@ int cursor_by_thread_connect_attr::rnd_pos(const void *pos)
   PFS_thread *thread;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-
-  thread= &thread_array[m_pos.m_index_1];
-  if (!thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
 
-  make_row(thread, m_pos.m_index_2);
-  if (m_row_exists)
-    return 0;
+  thread= global_thread_container.get(m_pos.m_index_1);
+  if (thread != NULL)
+  {
+    make_row(thread, m_pos.m_index_2);
+    if (m_row_exists)
+      return 0;
+  }
 
   return HA_ERR_RECORD_DELETED;
 }
diff --git a/storage/perfschema/cursor_by_thread_connect_attr.h b/storage/perfschema/cursor_by_thread_connect_attr.h
index 69d1b5ec0c0..4a18bdc5b7a 100644
--- a/storage/perfschema/cursor_by_thread_connect_attr.h
+++ b/storage/perfschema/cursor_by_thread_connect_attr.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -32,6 +32,10 @@
   @{
 */
 
+/**
+  Position of a cursor on abstract table
+  PERFORMANCE_SCHEMA.SESSION_CONNECT_ATTRS.
+*/
 struct pos_connect_attr_by_thread_by_attr
 : public PFS_double_index
 {
@@ -39,11 +43,6 @@ struct pos_connect_attr_by_thread_by_attr
     : PFS_double_index(0, 0)
   {}
 
-  inline bool has_more_thread(void)
-  {
-    return (m_index_1 < thread_max);
-  }
-
   inline void next_thread(void)
   {
     m_index_1++;
@@ -61,6 +60,8 @@ struct pos_connect_attr_by_thread_by_attr
 class cursor_by_thread_connect_attr : public PFS_engine_table
 {
 public:
+  static ha_rows get_row_count();
+
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
   virtual void reset_position(void);
diff --git a/storage/perfschema/cursor_by_user.cc b/storage/perfschema/cursor_by_user.cc
index 273d186b01c..4300f61f04f 100644
--- a/storage/perfschema/cursor_by_user.cc
+++ b/storage/perfschema/cursor_by_user.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -27,7 +27,13 @@
 
 #include "my_global.h"
 #include "cursor_by_user.h"
-#include "pfs_user.h"
+#include "pfs_buffer_container.h"
+
+/** Row count for this cursor, as reported by global_user_container. */
+ha_rows cursor_by_user::get_row_count(void)
+{
+  return global_user_container.get_row_count();
+}
 
 cursor_by_user::cursor_by_user(const PFS_engine_table_share *share)
   : PFS_engine_table(share, &m_pos),
@@ -44,17 +50,14 @@ int cursor_by_user::rnd_next(void)
 {
   PFS_user *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < user_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_user_iterator it= global_user_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &user_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -66,9 +69,9 @@ cursor_by_user::rnd_pos(const void *pos)
   PFS_user *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < user_max);
-  pfs= &user_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_user_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
diff --git a/storage/perfschema/cursor_by_user.h b/storage/perfschema/cursor_by_user.h
index 06554ebb228..1bbb06d2c18 100644
--- a/storage/perfschema/cursor_by_user.h
+++ b/storage/perfschema/cursor_by_user.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,6 +41,8 @@
 class cursor_by_user : public PFS_engine_table
 {
 public:
+  static ha_rows get_row_count();
+
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
   virtual void reset_position(void);
diff --git a/storage/perfschema/gen_pfs_lex_token.cc b/storage/perfschema/gen_pfs_lex_token.cc
deleted file mode 100644
index 5a51a8aeb2f..00000000000
--- a/storage/perfschema/gen_pfs_lex_token.cc
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
-   Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License, version 2.0,
-   as published by the Free Software Foundation.
-
-   This program is also distributed with certain software (including
-   but not limited to OpenSSL) that is licensed under separate terms,
-   as designated in a particular file or component or in included license
-   documentation.  The authors of MySQL hereby grant you an additional
-   permission to link the program and your derivative works with the
-   separately licensed software that they have included with MySQL.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License, version 2.0, for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software Foundation,
-   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
-
-#include <my_global.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-/* We only need the tokens here */
-#define YYSTYPE_IS_DECLARED
-#include <../sql/sql_yacc.h>
-#include <lex.h>
-
-#include <welcome_copyright_notice.h> /* ORACLE_WELCOME_COPYRIGHT_NOTICE */
-
-/*
-  This is a tool used during build only,
-  so MY_MAX_TOKEN does not need to be exact,
-  only big enough to hold:
-  - 256 character terminal tokens
-  - YYNTOKENS named terminal tokens
-  from bison.
-  See also YYMAXUTOK.
-*/
-#define MY_MAX_TOKEN 1000
-/** Generated token. */
-struct gen_lex_token_string
-{
-  const char *m_token_string;
-  int m_token_length;
-  bool m_append_space;
-  bool m_start_expr;
-};
-
-gen_lex_token_string compiled_token_array[MY_MAX_TOKEN];
-int max_token_seen= 0;
-
-char char_tokens[256];
-
-int tok_pfs_generic_value= 0;
-int tok_pfs_generic_value_list= 0;
-int tok_pfs_row_single_value= 0;
-int tok_pfs_row_single_value_list= 0;
-int tok_pfs_row_multiple_value= 0;
-int tok_pfs_row_multiple_value_list= 0;
-int tok_pfs_unused= 0;
-
-void set_token(int tok, const char *str)
-{
-  if (tok <= 0)
-  {
-    fprintf(stderr, "Bad token found\n");
-    exit(1);
-  }
-
-  if (tok > max_token_seen)
-  {
-    max_token_seen= tok;
-  }
-
-  if (max_token_seen >= MY_MAX_TOKEN)
-  {
-    fprintf(stderr, "Added that many new keywords ? Increase MY_MAX_TOKEN\n");
-    exit(1);
-  }
-
-  compiled_token_array[tok].m_token_string= str;
-  compiled_token_array[tok].m_token_length= strlen(str);
-  compiled_token_array[tok].m_append_space= true;
-  compiled_token_array[tok].m_start_expr= false;
-}
-
-void set_start_expr_token(int tok)
-{
-  compiled_token_array[tok].m_start_expr= true;
-}
-
-void compute_tokens()
-{
-  int tok;
-  unsigned int i;
-  char *str;
-
-  /*
-    Default value.
-  */
-  for (tok= 0; tok < MY_MAX_TOKEN; tok++)
-  {
-    compiled_token_array[tok].m_token_string= "(unknown)";
-    compiled_token_array[tok].m_token_length= 9;
-    compiled_token_array[tok].m_append_space= true;
-    compiled_token_array[tok].m_start_expr= false;
-  }
-
-  /*
-    Tokens made of just one terminal character
-  */
-  for (tok=0; tok < 256; tok++)
-  {
-    str= & char_tokens[tok];
-    str[0]= (char) tok;
-    compiled_token_array[tok].m_token_string= str;
-    compiled_token_array[tok].m_token_length= 1;
-    compiled_token_array[tok].m_append_space= true;
-  }
-
-  max_token_seen= 255;
-
-  /*
-    String terminal tokens, used in sql_yacc.yy
-  */
-  set_token(NEG, "~");
-  set_token(TABLE_REF_PRIORITY, "TABLE_REF_PRIORITY");
-
-  /*
-    Tokens hard coded in sql_lex.cc
-  */
-
-  set_token(WITH_CUBE_SYM, "WITH CUBE");
-  set_token(WITH_ROLLUP_SYM, "WITH ROLLUP");
-  set_token(NOT2_SYM, "!");
-  set_token(OR2_SYM, "|");
-  set_token(PARAM_MARKER, "?");
-  set_token(SET_VAR, ":=");
-  set_token(UNDERSCORE_CHARSET, "(_charset)");
-  set_token(END_OF_INPUT, "");
-
-  /*
-    Values.
-    These tokens are all normalized later,
-    so this strings will never be displayed.
-  */
-  set_token(BIN_NUM, "(bin)");
-  set_token(DECIMAL_NUM, "(decimal)");
-  set_token(FLOAT_NUM, "(float)");
-  set_token(HEX_NUM, "(hex)");
-  set_token(LEX_HOSTNAME, "(hostname)");
-  set_token(LONG_NUM, "(long)");
-  set_token(NUM, "(num)");
-  set_token(TEXT_STRING, "(text)");
-  set_token(NCHAR_STRING, "(nchar)");
-  set_token(ULONGLONG_NUM, "(ulonglong)");
-
-  /*
-    Identifiers.
-  */
-  set_token(IDENT, "(id)");
-  set_token(IDENT_QUOTED, "(id_quoted)");
-
-  /*
-    Unused tokens
-  */
-  set_token(LOCATOR_SYM, "LOCATOR");
-  set_token(SERVER_OPTIONS, "SERVER_OPTIONS");
-  set_token(UDF_RETURNS_SYM, "UDF_RETURNS");
-
-  /*
-    See symbols[] in sql/lex.h
-  */
-  for (i= 0; i< sizeof(symbols)/sizeof(symbols[0]); i++)
-  {
-    set_token(symbols[i].tok, symbols[i].name);
-  }
-
-  /*
-    See sql_functions[] in sql/lex.h
-  */
-  for (i= 0; i< sizeof(sql_functions)/sizeof(sql_functions[0]); i++)
-  {
-    set_token(sql_functions[i].tok, sql_functions[i].name);
-  }
-
-  /*
-    Additional FAKE tokens,
-    used internally to normalize a digest text.
-  */
-
-  max_token_seen++;
-  tok_pfs_generic_value= max_token_seen;
-  set_token(tok_pfs_generic_value, "?");
-
-  max_token_seen++;
-  tok_pfs_generic_value_list= max_token_seen;
-  set_token(tok_pfs_generic_value_list, "?, ...");
-
-  max_token_seen++;
-  tok_pfs_row_single_value= max_token_seen;
-  set_token(tok_pfs_row_single_value, "(?)");
-
-  max_token_seen++;
-  tok_pfs_row_single_value_list= max_token_seen;
-  set_token(tok_pfs_row_single_value_list, "(?) /* , ... */");
-
-  max_token_seen++;
-  tok_pfs_row_multiple_value= max_token_seen;
-  set_token(tok_pfs_row_multiple_value, "(...)");
-
-  max_token_seen++;
-  tok_pfs_row_multiple_value_list= max_token_seen;
-  set_token(tok_pfs_row_multiple_value_list, "(...) /* , ... */");
-
-  max_token_seen++;
-  tok_pfs_unused= max_token_seen;
-  set_token(tok_pfs_unused, "UNUSED");
-
-  /*
-    Fix whitespace for some special tokens.
-  */
-
-  /*
-    The lexer parses "@@variable" as '@', '@', 'variable',
-    returning a token for '@' alone.
-
-    This is incorrect, '@' is not really a token,
-    because the syntax "@ @ variable" (with spaces) is not accepted:
-    The lexer keeps some internal state after the '@' fake token.
-
-    To work around this, digest text are printed as "@@variable".
-  */
-  compiled_token_array[(int) '@'].m_append_space= false;
-
-  /*
-    Define additional properties for tokens.
-
-    List all the token that are followed by an expression.
-    This is needed to differentiate unary from binary
-    '+' and '-' operators, because we want to:
-    - reduce <unary +> <NUM> to <?>,
-    - preserve <...> <binary +> <NUM> as is.
-  */
-  set_start_expr_token('(');
-  set_start_expr_token(',');
-  set_start_expr_token(EVERY_SYM);
-  set_start_expr_token(AT_SYM);
-  set_start_expr_token(STARTS_SYM);
-  set_start_expr_token(ENDS_SYM);
-  set_start_expr_token(DEFAULT);
-  set_start_expr_token(RETURN_SYM);
-  set_start_expr_token(IF_SYM);
-  set_start_expr_token(ELSEIF_SYM);
-  set_start_expr_token(CASE_SYM);
-  set_start_expr_token(WHEN_SYM);
-  set_start_expr_token(WHILE_SYM);
-  set_start_expr_token(UNTIL_SYM);
-  set_start_expr_token(SELECT_SYM);
-
-  set_start_expr_token(OR_SYM);
-  set_start_expr_token(OR2_SYM);
-  set_start_expr_token(XOR);
-  set_start_expr_token(AND_SYM);
-  set_start_expr_token(AND_AND_SYM);
-  set_start_expr_token(NOT_SYM);
-  set_start_expr_token(BETWEEN_SYM);
-  set_start_expr_token(LIKE);
-  set_start_expr_token(REGEXP);
-
-  set_start_expr_token('|');
-  set_start_expr_token('&');
-  set_start_expr_token(SHIFT_LEFT);
-  set_start_expr_token(SHIFT_RIGHT);
-  set_start_expr_token('+');
-  set_start_expr_token('-');
-  set_start_expr_token(INTERVAL_SYM);
-  set_start_expr_token('*');
-  set_start_expr_token('/');
-  set_start_expr_token('%');
-  set_start_expr_token(DIV_SYM);
-  set_start_expr_token(MOD_SYM);
-  set_start_expr_token('^');
-}
-
-void print_tokens()
-{
-  int tok;
-
-  printf("lex_token_string lex_token_array[]=\n");
-  printf("{\n");
-  printf("/* PART 1: character tokens. */\n");
-
-  for (tok= 0; tok<256; tok++)
-  {
-    printf("/* %03d */  { \"\\x%02x\", 1, %s, %s},\n",
-           tok,
-           tok,
-           compiled_token_array[tok].m_append_space ? "true" : "false",
-           compiled_token_array[tok].m_start_expr ? "true" : "false");
-  }
-
-  printf("/* PART 2: named tokens. */\n");
-
-  for (tok= 256; tok<= max_token_seen; tok++)
-  {
-    printf("/* %03d */  { \"%s\", %d, %s, %s},\n",
-           tok,
-           compiled_token_array[tok].m_token_string,
-           compiled_token_array[tok].m_token_length,
-           compiled_token_array[tok].m_append_space ? "true" : "false",
-           compiled_token_array[tok].m_start_expr ? "true" : "false");
-  }
-
-  printf("/* DUMMY */ { \"\", 0, false, false}\n");
-  printf("};\n");
-
-  printf("/* PFS specific tokens. */\n");
-  printf("#define TOK_PFS_GENERIC_VALUE %d\n", tok_pfs_generic_value);
-  printf("#define TOK_PFS_GENERIC_VALUE_LIST %d\n", tok_pfs_generic_value_list);
-  printf("#define TOK_PFS_ROW_SINGLE_VALUE %d\n", tok_pfs_row_single_value);
-  printf("#define TOK_PFS_ROW_SINGLE_VALUE_LIST %d\n", tok_pfs_row_single_value_list);
-  printf("#define TOK_PFS_ROW_MULTIPLE_VALUE %d\n", tok_pfs_row_multiple_value);
-  printf("#define TOK_PFS_ROW_MULTIPLE_VALUE_LIST %d\n", tok_pfs_row_multiple_value_list);
-  printf("#define TOK_PFS_UNUSED %d\n", tok_pfs_unused);
-}
-
-int main(int argc,char **argv)
-{
-  puts("/*");
-  puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2011"));
-  puts("*/");
-
-  printf("/*\n");
-  printf("  This file is generated, do not edit.\n");
-  printf("  See file storage/perfschema/gen_pfs_lex_token.cc.\n");
-  printf("*/\n");
-  printf("struct lex_token_string\n");
-  printf("{\n");
-  printf("  const char *m_token_string;\n");
-  printf("  int m_token_length;\n");
-  printf("  bool m_append_space;\n");
-  printf("  bool m_start_expr;\n");
-  printf("};\n");
-  printf("typedef struct lex_token_string lex_token_string;\n");
-
-  compute_tokens();
-  print_tokens();
-
-  return 0;
-}
-
diff --git a/storage/perfschema/ha_perfschema.cc b/storage/perfschema/ha_perfschema.cc
index 8860046fa26..c0fa80b8e0d 100644
--- a/storage/perfschema/ha_perfschema.cc
+++ b/storage/perfschema/ha_perfschema.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -35,10 +35,14 @@
 #include "pfs_account.h"
 #include "pfs_host.h"
 #include "pfs_user.h"
-#include "pfs_account.h"
+#include "pfs_program.h"
+#include "pfs_prepared_stmt.h"
+#include "pfs_buffer_container.h"
 
 handlerton *pfs_hton= NULL;
 
+#define PFS_ENABLED() (pfs_initialized && (pfs_enabled || m_table_share->m_perpetual))
+
 static handler* pfs_create_handler(handlerton *hton,
                                    TABLE_SHARE *table,
                                    MEM_ROOT *mem_root)
@@ -89,13 +93,11 @@ static int pfs_init_func(void *p)
 
   pfs_hton= reinterpret_cast<handlerton *> (p);
 
-  pfs_hton->state= SHOW_OPTION_YES;
   pfs_hton->create= pfs_create_handler;
+  pfs_hton->drop_table= [](handlerton *, const char*) { return -1; };
   pfs_hton->show_status= pfs_show_status;
-  pfs_hton->flags= HTON_ALTER_NOT_SUPPORTED |
-    HTON_TEMPORARY_NOT_SUPPORTED |
-    HTON_NO_PARTITION |
-    HTON_NO_BINLOG_ROW_OPT;
+  pfs_hton->flags= HTON_ALTER_NOT_SUPPORTED | HTON_TEMPORARY_NOT_SUPPORTED |
+                   HTON_NO_PARTITION | HTON_NO_BINLOG_ROW_OPT;
 
   /*
     As long as the server implementation keeps using legacy_db_type,
@@ -131,6 +133,15 @@ static int pfs_done_func(void *p)
   DBUG_RETURN(0);
 }
 
+/* Status callback: stores the mutex container's lost counter in buff. */
+static int show_func_mutex_instances_lost(THD *thd, SHOW_VAR *var, char *buff)
+{
+  *reinterpret_cast<long*>(buff)= global_mutex_container.get_lost_counter();
+  var->value= buff;
+  var->type= SHOW_LONG;
+  return 0;
+}
+
 static struct st_mysql_show_var pfs_status_vars[]=
 {
   {"Performance_schema_mutex_classes_lost",
@@ -145,34 +156,42 @@ static struct st_mysql_show_var pfs_status_vars[]=
     (char*) &file_class_lost, SHOW_LONG_NOFLUSH},
   {"Performance_schema_socket_classes_lost",
     (char*) &socket_class_lost, SHOW_LONG_NOFLUSH},
+  {"Performance_schema_memory_classes_lost",
+    (char*) &memory_class_lost, SHOW_LONG_NOFLUSH},
   {"Performance_schema_mutex_instances_lost",
-    (char*) &mutex_lost, SHOW_LONG},
+    (char*) &show_func_mutex_instances_lost, SHOW_FUNC},
   {"Performance_schema_rwlock_instances_lost",
-    (char*) &rwlock_lost, SHOW_LONG},
+    (char*) &global_rwlock_container.m_lost, SHOW_LONG},
   {"Performance_schema_cond_instances_lost",
-    (char*) &cond_lost, SHOW_LONG},
+    (char*) &global_cond_container.m_lost, SHOW_LONG},
   {"Performance_schema_thread_instances_lost",
-    (char*) &thread_lost, SHOW_LONG},
+    (char*) &global_thread_container.m_lost, SHOW_LONG},
   {"Performance_schema_file_instances_lost",
-    (char*) &file_lost, SHOW_LONG},
+    (char*) &global_file_container.m_lost, SHOW_LONG},
   {"Performance_schema_file_handles_lost",
     (char*) &file_handle_lost, SHOW_LONG},
   {"Performance_schema_socket_instances_lost",
-    (char*) &socket_lost, SHOW_LONG},
+    (char*) &global_socket_container.m_lost, SHOW_LONG},
   {"Performance_schema_locker_lost",
     (char*) &locker_lost, SHOW_LONG},
   /* table shares, can be flushed */
   {"Performance_schema_table_instances_lost",
-    (char*) &table_share_lost, SHOW_LONG},
+    (char*) &global_table_share_container.m_lost, SHOW_LONG},
   /* table handles, can be flushed */
   {"Performance_schema_table_handles_lost",
-    (char*) &table_lost, SHOW_LONG},
+    (char*) &global_table_container.m_lost, SHOW_LONG},
+  /* table lock stats, can be flushed */
+  {"Performance_schema_table_lock_stat_lost",
+    (char*) &global_table_share_lock_container.m_lost, SHOW_LONG},
+  /* table index stats, can be flushed */
+  {"Performance_schema_index_stat_lost",
+    (char*) &global_table_share_index_container.m_lost, SHOW_LONG},
   {"Performance_schema_hosts_lost",
-    (char*) &host_lost, SHOW_LONG},
+    (char*) &global_host_container.m_lost, SHOW_LONG},
   {"Performance_schema_users_lost",
-    (char*) &user_lost, SHOW_LONG},
+    (char*) &global_user_container.m_lost, SHOW_LONG},
   {"Performance_schema_accounts_lost",
-    (char*) &account_lost, SHOW_LONG},
+    (char*) &global_account_container.m_lost, SHOW_LONG},
   {"Performance_schema_stage_classes_lost",
     (char*) &stage_class_lost, SHOW_LONG},
   {"Performance_schema_statement_classes_lost",
@@ -181,6 +200,14 @@ static struct st_mysql_show_var pfs_status_vars[]=
     (char*) &digest_lost, SHOW_LONG},
   {"Performance_schema_session_connect_attrs_lost",
     (char*) &session_connect_attrs_lost, SHOW_LONG},
+  {"Performance_schema_program_lost",
+    (char*) &global_program_container.m_lost, SHOW_LONG},
+  {"Performance_schema_nested_statement_lost",
+    (char*) &nested_statement_lost, SHOW_LONG},
+  {"Performance_schema_prepared_statements_lost",
+    (char*) &global_prepared_stmt_container.m_lost, SHOW_LONG},
+  {"Performance_schema_metadata_lock_lost",
+    (char*) &global_mdl_container.m_lost, SHOW_LONG},
   {NullS, NullS, SHOW_LONG}
 };
 
@@ -189,24 +216,6 @@ struct st_mysql_storage_engine pfs_storage_engine=
 
 const char* pfs_engine_name= "PERFORMANCE_SCHEMA";
 
-mysql_declare_plugin(perfschema)
-{
-  MYSQL_STORAGE_ENGINE_PLUGIN,
-  &pfs_storage_engine,
-  pfs_engine_name,
-  "Marc Alff, Oracle", /* Formerly Sun Microsystems, formerly MySQL */
-  "Performance Schema",
-  PLUGIN_LICENSE_GPL,
-  pfs_init_func,                                /* Plugin Init */
-  pfs_done_func,                                /* Plugin Deinit */
-  0x0001 /* 0.1 */,
-  pfs_status_vars,                              /* status variables */
-  NULL,                                         /* system variables */
-  NULL,                                         /* config options */
-  0,                                            /* flags */
-}
-mysql_declare_plugin_end;
-
 maria_declare_plugin(perfschema)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
@@ -220,7 +229,7 @@ maria_declare_plugin(perfschema)
   0x0001,
   pfs_status_vars,
   NULL,
-  "5.6.40",
+  "5.7.31",
   MariaDB_PLUGIN_MATURITY_STABLE
 }
 maria_declare_plugin_end;
@@ -262,7 +271,7 @@ int ha_perfschema::write_row(const uchar *buf)
   int result;
 
   DBUG_ENTER("ha_perfschema::write_row");
-  if (!pfs_initialized)
+  if (!PFS_ENABLED())
     DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 
   DBUG_ASSERT(m_table_share);
@@ -284,7 +293,7 @@ void ha_perfschema::use_hidden_primary_key(void)
 int ha_perfschema::update_row(const uchar *old_data, const uchar *new_data)
 {
   DBUG_ENTER("ha_perfschema::update_row");
-  if (!pfs_initialized)
+  if (!PFS_ENABLED())
     DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 
   if (is_executed_by_slave())
@@ -298,7 +307,7 @@ int ha_perfschema::update_row(const uchar *old_data, const uchar *new_data)
 int ha_perfschema::delete_row(const uchar *buf)
 {
   DBUG_ENTER("ha_perfschema::delete_row");
-  if (!pfs_initialized)
+  if (!PFS_ENABLED())
     DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 
   DBUG_ASSERT(m_table);
@@ -311,8 +320,8 @@ int ha_perfschema::rnd_init(bool scan)
   int result;
   DBUG_ENTER("ha_perfschema::rnd_init");
 
-  DBUG_ASSERT(m_table_share);
-  DBUG_ASSERT(m_table_share->m_open_table != NULL);
+  assert(m_table_share);
+  assert(m_table_share->m_open_table != NULL);
 
   stats.records= 0;
   if (m_table == NULL)
@@ -330,7 +339,7 @@ int ha_perfschema::rnd_init(bool scan)
 int ha_perfschema::rnd_end(void)
 {
   DBUG_ENTER("ha_perfschema::rnd_end");
-  DBUG_ASSERT(m_table);
+  assert(m_table);
   delete m_table;
   m_table= NULL;
   DBUG_RETURN(0);
@@ -339,7 +348,7 @@ int ha_perfschema::rnd_end(void)
 int ha_perfschema::rnd_next(uchar *buf)
 {
   DBUG_ENTER("ha_perfschema::rnd_next");
-  if (!pfs_initialized)
+  if (!PFS_ENABLED())
   {
     table->status= STATUS_NOT_FOUND;
     DBUG_RETURN(HA_ERR_END_OF_FILE);
@@ -362,7 +371,7 @@ void ha_perfschema::position(const uchar *record)
 {
   DBUG_ENTER("ha_perfschema::position");
 
-  DBUG_ASSERT(m_table);
+  assert(m_table);
   m_table->get_position(ref);
   DBUG_VOID_RETURN;
 }
@@ -370,7 +379,7 @@ void ha_perfschema::position(const uchar *record)
 int ha_perfschema::rnd_pos(uchar *buf, uchar *pos)
 {
   DBUG_ENTER("ha_perfschema::rnd_pos");
-  if (!pfs_initialized)
+  if (!PFS_ENABLED())
   {
     table->status= STATUS_NOT_FOUND;
     DBUG_RETURN(HA_ERR_END_OF_FILE);
@@ -387,7 +396,7 @@ int ha_perfschema::rnd_pos(uchar *buf, uchar *pos)
 int ha_perfschema::info(uint flag)
 {
   DBUG_ENTER("ha_perfschema::info");
-  DBUG_ASSERT(m_table_share);
+  assert(m_table_share);
   if (flag & HA_STATUS_VARIABLE)
     stats.records= m_table_share->get_row_count();
   if (flag & HA_STATUS_CONST)
@@ -400,13 +409,13 @@ int ha_perfschema::delete_all_rows(void)
   int result;
 
   DBUG_ENTER("ha_perfschema::delete_all_rows");
-  if (!pfs_initialized)
+  if (!PFS_ENABLED())
     DBUG_RETURN(0);
 
   if (is_executed_by_slave())
     DBUG_RETURN(0);
 
-  DBUG_ASSERT(m_table_share);
+  assert(m_table_share);
   if (m_table_share->m_delete_all_rows)
     result= m_table_share->m_delete_all_rows();
   else
diff --git a/storage/perfschema/ha_perfschema.h b/storage/perfschema/ha_perfschema.h
index 18ac035831d..f3d84a3e264 100644
--- a/storage/perfschema/ha_perfschema.h
+++ b/storage/perfschema/ha_perfschema.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,7 +41,7 @@ class PFS_engine_table;
 extern const char *pfs_engine_name;
 
 /** A handler for a PERFORMANCE_SCHEMA table. */
-class ha_perfschema : public handler
+class ha_perfschema final : public handler
 {
 public:
   /**
@@ -232,8 +232,8 @@ private:
   */
   bool is_executed_by_slave() const
   {
-    DBUG_ASSERT(table != NULL);
-    DBUG_ASSERT(table->in_use != NULL);
+    assert(table != NULL);
+    assert(table->in_use != NULL);
     return table->in_use->slave_thread;
 
   }
diff --git a/storage/perfschema/my_thread.h b/storage/perfschema/my_thread.h
new file mode 100644
index 00000000000..c1a079ce34d
--- /dev/null
+++ b/storage/perfschema/my_thread.h
@@ -0,0 +1,137 @@
+#ifndef STORAGE_PERFSCHEMA_MY_THREAD_INCLUDED
+#define STORAGE_PERFSCHEMA_MY_THREAD_INCLUDED
+
+#include <my_pthread.h>
+#include <m_string.h>
+#include "pfs_config.h"
+
+#ifdef HAVE_SYS_GETTID
+#include <sys/types.h>
+#include <sys/syscall.h>
+#endif
+
+#ifdef HAVE_PTHREAD_GETTHREADID_NP
+#include <pthread_np.h>
+#endif
+
+#if defined(HAVE_INTEGER_PTHREAD_SELF)
+#include <cstdint>
+#endif
+
+typedef pthread_key_t thread_local_key_t;   /* key type used by the my_*_thread_local() wrappers */
+typedef pthread_t my_thread_handle;         /* handle filled in by my_thread_create() */
+typedef pthread_attr_t my_thread_attr_t;    /* attributes accepted by my_thread_create() */
+#if defined(HAVE_PTHREAD_THREADID_NP) || defined(HAVE_GETTID) || defined(HAVE_SYS_GETTID) || defined(HAVE_GETTHRID)
+typedef pid_t my_thread_os_id_t;            /* pthread_threadid_np / gettid / SYS_gettid / getthrid */
+#elif defined(_WIN32)
+typedef uint32 my_thread_os_id_t;           /* GetCurrentThreadId() */
+#elif defined(HAVE_PTHREAD_GETTHREADID_NP)
+typedef int my_thread_os_id_t;              /* pthread_getthreadid_np() */
+#elif defined(HAVE_INTEGER_PTHREAD_SELF)
+typedef uintptr_t my_thread_os_id_t;        /* pthread_self() cast to an integer */
+#else
+typedef unsigned long long my_thread_os_id_t; /* no OS id source: my_thread_os_id() returns 0 */
+#endif
+
+#define LOCK_plugin_delete LOCK_plugin
+
+static inline int my_create_thread_local_key(thread_local_key_t *key, void (*destructor)(void*))
+{ return pthread_key_create(key, destructor); }  /* returns pthread_key_create()'s result unchanged */
+
+static inline int my_delete_thread_local_key(thread_local_key_t key)
+{ return pthread_key_delete(key); }  /* returns pthread_key_delete()'s result unchanged */
+
+static inline void *my_get_thread_local(thread_local_key_t key)
+{ return pthread_getspecific(key); }  /* value stored under key for the calling thread */
+
+static inline int my_set_thread_local(thread_local_key_t key, const void *ptr)
+{ return pthread_setspecific(key, ptr); }  /* returns pthread_setspecific()'s result unchanged */
+
+static inline int my_thread_create(my_thread_handle *thread,
+        const my_thread_attr_t *attr, void *(*start_routine)(void *), void *arg)
+{ return pthread_create(thread, attr, start_routine, arg); }  /* all arguments are forwarded as given */
+
+static inline my_thread_os_id_t my_thread_os_id()  /* OS-level id of the calling thread, 0 if no source exists */
+{
+#ifdef HAVE_PTHREAD_THREADID_NP
+  /*
+    macOS. Be careful to use this version first, and to not use SYS_gettid on
+    macOS, as SYS_gettid has a different meaning compared to linux gettid().
+    The 64-bit id is narrowed to pid_t by the cast below, and the return
+    value of pthread_threadid_np() is not checked.
+  */
+  uint64_t tid64;
+  pthread_threadid_np(nullptr, &tid64);
+  return (pid_t)tid64;
+#else
+#ifdef HAVE_GETTID
+  /* Linux glibc-2.30+: gettid() can be called directly. */
+  return gettid();
+#else
+#ifdef HAVE_SYS_GETTID
+  /*
+    Linux before glibc-2.30: go through syscall() with SYS_gettid instead.
+    See man gettid
+  */
+  return syscall(SYS_gettid);
+#else
+#ifdef _WIN32
+  /* Windows: thread id as reported by GetCurrentThreadId(). */
+  return GetCurrentThreadId();
+#else
+#ifdef HAVE_PTHREAD_GETTHREADID_NP
+  /* FreeBSD 10.2 */
+  return pthread_getthreadid_np();
+#else
+#ifdef HAVE_GETTHRID
+  /* OpenBSD */
+  return getthrid();
+#else
+#ifdef HAVE_INTEGER_PTHREAD_SELF
+  /* NetBSD, and perhaps something else, fallback: pthread_self() as an integer. */
+  return (my_thread_os_id_t) pthread_self();
+#else
+  /* Feature not available: the id is reported as 0. */
+  return 0;
+#endif /* HAVE_INTEGER_PTHREAD_SELF */
+#endif /* HAVE_GETTHRID */
+#endif /* HAVE_PTHREAD_GETTHREADID_NP */
+#endif /* _WIN32 */
+#endif /* HAVE_SYS_GETTID */
+#endif /* HAVE_GETTID */
+#endif /* HAVE_PTHREAD_THREADID_NP */
+}
+
+#define CHANNEL_NAME_LENGTH MAX_CONNECTION_NAME
+
+enum enum_mysql_show_scope
+{
+  SHOW_SCOPE_UNDEF,
+  SHOW_SCOPE_GLOBAL,
+  SHOW_SCOPE_SESSION,
+  SHOW_SCOPE_ALL
+};
+typedef enum enum_mysql_show_scope SHOW_SCOPE;
+
+#define SHOW_VAR_MAX_NAME_LEN NAME_LEN
+
+static inline char *my_stpnmov(char *dst, const char *src, size_t n)
+{ return strnmov(dst, src, n); }  /* alias: same arguments and result as strnmov() */
+
+static inline size_t bin_to_hex_str(char *to, size_t to_len,
+                                    const char *from, size_t from_len)
+{
+  const size_t needed= from_len * 2 + 1;  /* two hex digits per byte + NUL */
+  if (to_len < needed) return 0;          /* too small: nothing is written */
+  for (size_t i= 0; i < from_len; i++)
+  {
+    to[2 * i]= _dig_vec_upper[(unsigned char) from[i] >> 4];
+    to[2 * i + 1]= _dig_vec_upper[(unsigned char) from[i] & 0xF];
+  }
+  to[2 * from_len]= '\0';
+  return needed;                          /* the count includes the NUL */
+}
+
+#define thd_get_psi(X) ((X)->get_psi())
+
+#endif
diff --git a/storage/perfschema/mysqld_thd_manager.cc b/storage/perfschema/mysqld_thd_manager.cc
new file mode 100644
index 00000000000..61282b7e024
--- /dev/null
+++ b/storage/perfschema/mysqld_thd_manager.cc
@@ -0,0 +1,39 @@
+#include "mysqld_thd_manager.h"
+#include "sql_class.h"
+
+static Global_THD_manager manager;  /* the one file-local instance */
+Global_THD_manager* Global_THD_manager::get_instance()
+{
+  return &manager;                  /* every call yields this same object */
+}
+
+/* Iteration state: the caller's predicate and the THD visited most recently. */
+struct find_thd_arg
+{
+  Find_THD_Impl *func;
+  THD *cur;
+};
+
+/* Records the THD being visited, then hands the predicate's verdict back. */
+static my_bool find_thd_cb(THD *tmp, find_thd_arg *arg)
+{
+  arg->cur= tmp;
+  return arg->func->operator()(tmp);
+}
+
+THD* Global_THD_manager::find_thd(Find_THD_Impl *func)
+{
+  find_thd_arg search= {func, NULL};
+  return THD_list_iterator::iterator()->iterate(find_thd_cb, &search) ? search.cur : NULL;
+}
+
+static my_bool do_for_all_cb(THD *tmp, Do_THD_Impl *arg)
+{
+  (*arg)(tmp);   /* run the caller's action on this THD */
+  return 0;      /* always 0, whatever the action did */
+}
+
+void Global_THD_manager::do_for_all_thd(Do_THD_Impl *arg)
+{
+  THD_list_iterator::iterator()->iterate(do_for_all_cb, arg);  /* iterate()'s result is not used */
+}
diff --git a/storage/perfschema/mysqld_thd_manager.h b/storage/perfschema/mysqld_thd_manager.h
new file mode 100644
index 00000000000..fbb6f86a8c4
--- /dev/null
+++ b/storage/perfschema/mysqld_thd_manager.h
@@ -0,0 +1,29 @@
+#ifndef STORAGE_PERFSCHEMA_MYSQL_THD_MANAGER_INCLUDED
+#define STORAGE_PERFSCHEMA_MYSQL_THD_MANAGER_INCLUDED
+#include "my_global.h"
+#include "my_thread.h"
+
+class Find_THD_Impl                       /* predicate interface taken by Global_THD_manager::find_thd() */
+{
+  public:
+  virtual ~Find_THD_Impl() {}             /* virtual, as implementations are used through this base */
+  virtual bool operator()(THD *thd) = 0;  /* asked once per visited THD */
+};
+
+class Do_THD_Impl                         /* action interface taken by Global_THD_manager::do_for_all_thd() */
+{
+  public:
+  virtual ~Do_THD_Impl() {}               /* virtual, as implementations are used through this base */
+  virtual void operator()(THD*) = 0;      /* invoked once per visited THD */
+};
+
+class Global_THD_manager                     /* small facade over the server's THD list iteration */
+{
+  public:
+  static Global_THD_manager* get_instance(); /* accessor for the manager object */
+  THD* find_thd(Find_THD_Impl *func);        /* a THD* or NULL, depending on what func reports */
+  void do_for_all_thd(Do_THD_Impl *arg);     /* calls *arg on the THDs that are visited */
+};
+
+ulong get_system_variable_hash_records(void);
+#endif
diff --git a/storage/perfschema/pfs.cc b/storage/perfschema/pfs.cc
index 44bcbad87d7..4e3d4f551a0 100644
--- a/storage/perfschema/pfs.cc
+++ b/storage/perfschema/pfs.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,6 +26,19 @@
 */
 #include "my_global.h"
 #include "thr_lock.h"
+
+/* Make sure exported prototypes match the implementation. */
+#include "pfs_file_provider.h"
+#include "pfs_idle_provider.h"
+#include "pfs_memory_provider.h"
+#include "pfs_metadata_provider.h"
+#include "pfs_socket_provider.h"
+#include "pfs_stage_provider.h"
+#include "pfs_statement_provider.h"
+#include "pfs_table_provider.h"
+#include "pfs_thread_provider.h"
+#include "pfs_transaction_provider.h"
+
 #include "mysql/psi/psi.h"
 #include "mysql/psi/mysql_thread.h"
 #include "my_pthread.h"
@@ -42,11 +55,48 @@
 #include "pfs_events_waits.h"
 #include "pfs_events_stages.h"
 #include "pfs_events_statements.h"
+#include "pfs_events_transactions.h"
 #include "pfs_setup_actor.h"
 #include "pfs_setup_object.h"
 #include "sql_error.h"
 #include "sp_head.h"
+#include "mdl.h" /* mdl_key_init */
 #include "pfs_digest.h"
+#include "pfs_program.h"
+#include "pfs_prepared_stmt.h"
+
+using std::min;
+
+/*
+  This is a development tool to investigate memory statistics,
+  do not use in production.
+*/
+#undef PFS_PARANOID
+
+#ifdef PFS_PARANOID
+/* Development aid (compiled under PFS_PARANOID only): print that 'new_thread'
+   reported 'size' bytes of class 'klass' which 'old_thread' had allocated. */
+static void report_memory_accounting_error(
+  const char *api_name, PFS_thread *new_thread, size_t size,
+  PFS_memory_class *klass, PFS_thread *old_thread)
+{
+  pfs_print_error("%s "
+                  "thread <%d> of class <%s> "
+                  "not owner of <%lu> bytes in class <%s> "
+                  "allocated by thread <%d> of class <%s>\n",
+                  api_name,
+                  new_thread->m_thread_internal_id,  /* NOTE(review): confirm this type matches %d */
+                  new_thread->m_class->m_name,
+                  (unsigned long) size, klass->m_name,  /* size_t must not be passed for %d */
+                  old_thread->m_thread_internal_id,  /* NOTE(review): confirm this type matches %d */
+                  old_thread->m_class->m_name);
+
+  /* With assertions enabled, stop when the reporting thread is one of these classes. */
+  assert(strcmp(new_thread->m_class->m_name, "thread/sql/event_worker") != 0);
+  assert(strcmp(new_thread->m_class->m_name, "thread/sql/event_scheduler") != 0);
+  assert(strcmp(new_thread->m_class->m_name, "thread/sql/one_connection") != 0);
+}
+#endif /* PFS_PARANOID */
 
 /**
   @page PAGE_PERFORMANCE_SCHEMA The Performance Schema main page
@@ -393,14 +443,14 @@ static inline int mysql_mutex_lock(
   struct PSI_mutex_locker *locker= NULL;
 
   ............... (a)
-  locker= PSI_server->start_mutex_wait(&state, that->p_psi,
-                                       PSI_MUTEX_LOCK, locker, src_file, src_line);
+  locker= PSI_MUTEX_CALL(start_mutex_wait)(&state, that->p_psi, PSI_MUTEX_LOCK,
+                                           locker, src_file, src_line);
 
   ............... (b)
   result= pthread_mutex_lock(&that->m_mutex);
 
   ............... (c)
-  PSI_server->end_mutex_wait(locker, result);
+  PSI_MUTEX_CALL(end_mutex_wait)(locker, result);
 
   return result;
 }
@@ -420,6 +470,62 @@ static inline int mysql_mutex_lock(...)
   return result;
 }
 @endverbatim
+
+  When the performance schema instrumentation is compiled in,
+  and when the code compiled is internal to the server implementation,
+  PSI_MUTEX_CALL expands directly to function calls in the performance schema,
+  to make (a) and (c) calls as efficient as possible.
+
+@verbatim
+static inline int mysql_mutex_lock(...)
+{
+  int result;
+  struct PSI_mutex_locker_state state;
+  struct PSI_mutex_locker *locker= NULL;
+
+  ............... (a)
+  locker= pfs_start_mutex_wait_v1(&state, that->p_psi, PSI_MUTEX_LOCK,
+                                  locker, src_file, src_line);
+
+  ............... (b)
+  result= pthread_mutex_lock(&that->m_mutex);
+
+  ............... (c)
+  pfs_end_mutex_wait_v1(locker, result);
+
+  return result;
+}
+@endverbatim
+
+  When the performance schema instrumentation is compiled in,
+  and when the code compiled is external to the server implementation
+  (typically, a dynamic plugin),
+  PSI_MUTEX_CALL expands to dynamic calls to the underlying implementation,
+  using the PSI_server entry point.
+  This makes (a) and (c) slower, as a function pointer is used instead of a static call,
+  but also independent of the implementation, for binary compatibility.
+
+@verbatim
+static inline int mysql_mutex_lock(...)
+{
+  int result;
+  struct PSI_mutex_locker_state state;
+  struct PSI_mutex_locker *locker= NULL;
+
+  ............... (a)
+  locker= PSI_server->start_mutex_wait(&state, that->p_psi, PSI_MUTEX_LOCK,
+                                       locker, src_file, src_line);
+
+  ............... (b)
+  result= pthread_mutex_lock(&that->m_mutex);
+
+  ............... (c)
+  PSI_server->end_mutex_wait(locker, result);
+
+  return result;
+}
+@endverbatim
+
 */
 
 /**
@@ -1102,6 +1208,134 @@ static inline int mysql_mutex_lock(...)
         @c table_events_statements_common::make_row()
   - [I] EVENTS_STATEMENTS_SUMMARY_BY_DIGEST
         @c table_esms_by_digest::make_row()
+
+@section IMPL_TRANSACTION Implementation for transactions consumers
+
+  For transactions, the tables that contain individual event data are:
+  - EVENTS_TRANSACTIONS_CURRENT
+  - EVENTS_TRANSACTIONS_HISTORY
+  - EVENTS_TRANSACTIONS_HISTORY_LONG
+
+  For transactions, the tables that contain aggregated data are:
+  - EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+  - EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME
+  - EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+  - EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME
+  - EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME
+
+@verbatim
+  transaction_locker(T, TX)
+   |
+   | [1]
+   |
+1a |-> pfs_thread(T).event_name(TX)              =====>> [A], [B], [C], [D], [E]
+   |    |
+   |    | [2]
+   |    |
+   | 2a |-> pfs_account(U, H).event_name(TX)     =====>> [B], [C], [D], [E]
+   |    .    |
+   |    .    | [3-RESET]
+   |    .    |
+   | 2b .....+-> pfs_user(U).event_name(TX)      =====>> [C]
+   |    .    |
+   | 2c .....+-> pfs_host(H).event_name(TX)      =====>> [D], [E]
+   |    .    .    |
+   |    .    .    | [4-RESET]
+   | 2d .    .    |
+1b |----+----+----+-> pfs_transaction_class(TX)  =====>> [E]
+   |
+1c |-> pfs_thread(T).transaction_current(TX)     =====>> [F]
+   |
+1d |-> pfs_thread(T).transaction_history(TX)     =====>> [G]
+   |
+1e |-> transaction_history_long(TX)              =====>> [H]
+
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_transaction_v1(), @c end_transaction_v1()
+        (1a, 1b) is an aggregation by EVENT_NAME,
+        (1c, 1d, 1e) is an aggregation by TIME,
+        all of these are orthogonal,
+        and implemented in end_transaction_v1().
+  - [2] @c delete_thread_v1(), @c aggregate_thread_transactions()
+  - [3] @c PFS_account::aggregate_transactions()
+  - [4] @c PFS_host::aggregate_transactions()
+
+  - [A] EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ets_by_thread_by_event_name::make_row()
+  - [B] EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME,
+        @c table_ets_by_account_by_event_name::make_row()
+  - [C] EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME,
+        @c table_ets_by_user_by_event_name::make_row()
+  - [D] EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME,
+        @c table_ets_by_host_by_event_name::make_row()
+  - [E] EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ets_global_by_event_name::make_row()
+  - [F] EVENTS_TRANSACTIONS_CURRENT,
+        @c table_events_transactions_current::rnd_next(),
+        @c table_events_transactions_common::make_row()
+  - [G] EVENTS_TRANSACTIONS_HISTORY,
+        @c table_events_transactions_history::rnd_next(),
+        @c table_events_transactions_common::make_row()
+  - [H] EVENTS_TRANSACTIONS_HISTORY_LONG,
+        @c table_events_transactions_history_long::rnd_next(),
+        @c table_events_transactions_common::make_row()
+
+@section IMPL_MEMORY Implementation for memory instruments
+
+  For memory, there are no tables that contain individual event data.
+
+  For memory, the tables that contain aggregated data are:
+  - MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+  - MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME
+  - MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME
+  - MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME
+  - MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME
+
+@verbatim
+  memory_event(T, S)
+   |
+   | [1]
+   |
+1a |-> pfs_thread(T).event_name(S)            =====>> [A], [B], [C], [D], [E]
+   |    |
+   |    | [2]
+   |    |
+1+ | 2a |-> pfs_account(U, H).event_name(S)   =====>> [B], [C], [D], [E]
+   |    .    |
+   |    .    | [3-RESET]
+   |    .    |
+1+ | 2b .....+-> pfs_user(U).event_name(S)    =====>> [C]
+   |    .    |
+1+ | 2c .....+-> pfs_host(H).event_name(S)    =====>> [D], [E]
+   |    .    .    |
+   |    .    .    | [4-RESET]
+   | 2d .    .    |
+1b |----+----+----+-> global.event_name(S)    =====>> [E]
+
+@endverbatim
+
+  Implemented as:
+  - [1] @c pfs_memory_alloc_v1(),
+        @c pfs_memory_realloc_v1(),
+        @c pfs_memory_free_v1().
+  - [1+] are overflows that can happen during [1a],
+        implemented with @c carry_memory_stat_delta()
+  - [2] @c delete_thread_v1(), @c aggregate_thread_memory()
+  - [3] @c PFS_account::aggregate_memory()
+  - [4] @c PFS_host::aggregate_memory()
+  - [A] MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_mems_by_thread_by_event_name::make_row()
+  - [B] MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME,
+        @c table_mems_by_account_by_event_name::make_row()
+  - [C] MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME,
+        @c table_mems_by_user_by_event_name::make_row()
+  - [D] MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME,
+        @c table_mems_by_host_by_event_name::make_row()
+  - [E] MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_mems_global_by_event_name::make_row()
+
 */
 
 /**
@@ -1117,9 +1351,35 @@ static inline int mysql_mutex_lock(...)
   @ingroup Performance_schema_implementation
 */
 
-pthread_key(PFS_thread*, THR_PFS);
+thread_local_key_t THR_PFS;
+thread_local_key_t THR_PFS_VG;   // global_variables
+thread_local_key_t THR_PFS_SV;   // session_variables
+thread_local_key_t THR_PFS_VBT;  // variables_by_thread
+thread_local_key_t THR_PFS_SG;   // global_status
+thread_local_key_t THR_PFS_SS;   // session_status
+thread_local_key_t THR_PFS_SBT;  // status_by_thread
+thread_local_key_t THR_PFS_SBU;  // status_by_user
+thread_local_key_t THR_PFS_SBH;  // status_by_host
+thread_local_key_t THR_PFS_SBA;  // status_by_account
+
 bool THR_PFS_initialized= false;
 
+/* Reads the calling thread's PFS_thread from the THR_PFS key; may be NULL. */
+static inline PFS_thread* my_thread_get_THR_PFS()
+{
+  assert(THR_PFS_initialized);
+  PFS_thread *pfs= static_cast<PFS_thread*>(my_get_thread_local(THR_PFS));
+  assert(!pfs || sanitize_thread(pfs));
+  return pfs;
+}
+
+static inline void
+my_thread_set_THR_PFS(PFS_thread *pfs)
+{
+  assert(THR_PFS_initialized);        /* only valid once THR_PFS_initialized is true */
+  my_set_thread_local(THR_PFS, pfs);  /* the setter's return code is not checked */
+}
+
 /**
   Conversion map from PSI_mutex_operation to enum_operation_type.
   Indexed by enum PSI_mutex_operation.
@@ -1139,7 +1399,14 @@ static enum_operation_type rwlock_operation_map[]=
   OPERATION_TYPE_READLOCK,
   OPERATION_TYPE_WRITELOCK,
   OPERATION_TYPE_TRYREADLOCK,
-  OPERATION_TYPE_TRYWRITELOCK
+  OPERATION_TYPE_TRYWRITELOCK,
+
+  OPERATION_TYPE_SHAREDLOCK,
+  OPERATION_TYPE_SHAREDEXCLUSIVELOCK,
+  OPERATION_TYPE_EXCLUSIVELOCK,
+  OPERATION_TYPE_TRYSHAREDLOCK,
+  OPERATION_TYPE_TRYSHAREDEXCLUSIVELOCK,
+  OPERATION_TYPE_TRYEXCLUSIVELOCK,
 };
 
 /**
@@ -1244,7 +1511,7 @@ static enum_operation_type socket_operation_map[]=
   @return 0 for success, non zero for errors
 */
 static int build_prefix(const LEX_CSTRING *prefix, const char *category,
-                        char *output, int *output_length)
+                        char *output, size_t *output_length)
 {
   size_t len= strlen(category);
   char *out_ptr= output;
@@ -1268,52 +1535,56 @@ static int build_prefix(const LEX_CSTRING *prefix, const char *category,
   /* output = prefix + category + '/' */
   memcpy(out_ptr, prefix->str, prefix_length);
   out_ptr+= prefix_length;
-  memcpy(out_ptr, category, len);
-  out_ptr+= len;
-  *out_ptr= '/';
-  out_ptr++;
-  *output_length= (int)(out_ptr - output);
+  if (len > 0)
+  {
+    memcpy(out_ptr, category, len);
+    out_ptr+= len;
+    *out_ptr= '/';
+    out_ptr++;
+  }
+  *output_length= int(out_ptr - output);
 
   return 0;
 }
 
-#define REGISTER_BODY_V1(KEY_T, PREFIX, REGISTER_FUNC)                \
-  KEY_T key;                                                          \
-  char formatted_name[PFS_MAX_INFO_NAME_LENGTH];                      \
-  int prefix_length;                                                  \
-  int len;                                                            \
-  int full_length;                                                    \
-                                                                      \
-  DBUG_ASSERT(category != NULL);                                      \
-  DBUG_ASSERT(info != NULL);                                          \
-  if (unlikely(build_prefix(&PREFIX, category,                        \
-                   formatted_name, &prefix_length)))                  \
-  {                                                                   \
-    for (; count>0; count--, info++)                                  \
-      *(info->m_key)= 0;                                              \
-    return ;                                                          \
-  }                                                                   \
-                                                                      \
-  for (; count>0; count--, info++)                                    \
-  {                                                                   \
-    DBUG_ASSERT(info->m_key != NULL);                                 \
-    DBUG_ASSERT(info->m_name != NULL);                                \
-    len= (int)strlen(info->m_name);                                        \
-    full_length= prefix_length + len;                                 \
-    if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))              \
-    {                                                                 \
-      memcpy(formatted_name + prefix_length, info->m_name, len);      \
-      key= REGISTER_FUNC(formatted_name, full_length, info->m_flags); \
-    }                                                                 \
-    else                                                              \
-    {                                                                 \
-      pfs_print_error("REGISTER_BODY_V1: name too long <%s> <%s>\n",  \
-                      category, info->m_name);                        \
-      key= 0;                                                         \
-    }                                                                 \
-                                                                      \
-    *(info->m_key)= key;                                              \
-  }                                                                   \
+#define REGISTER_BODY_V1(KEY_T, PREFIX, REGISTER_FUNC)                      \
+  KEY_T key;                                                                \
+  char formatted_name[PFS_MAX_INFO_NAME_LENGTH];                            \
+  size_t prefix_length;                                                     \
+  size_t len;                                                               \
+  size_t full_length;                                                       \
+  /* Uses category, info and count from the expanding function's scope. */  \
+  assert(category != NULL);                                             \
+  assert(info != NULL);                                                 \
+  if (unlikely(build_prefix(&PREFIX, category,                              \
+                   formatted_name, &prefix_length)) ||                      \
+      ! pfs_initialized)                                                    \
+  {                                                                         \
+    for (; count>0; count--, info++)                                        \
+      *(info->m_key)= 0;                                                    \
+    return ;                                                                \
+  }                                                                         \
+  /* Register every entry as <prefix built above> + info->m_name. */        \
+  for (; count>0; count--, info++)                                          \
+  {                                                                         \
+    assert(info->m_key != NULL);                                        \
+    assert(info->m_name != NULL);                                       \
+    len= strlen(info->m_name);                                              \
+    full_length= prefix_length + len;                                       \
+    if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))                    \
+    {                                                                       \
+      memcpy(formatted_name + prefix_length, info->m_name, len);            \
+      key= REGISTER_FUNC(formatted_name, (uint)full_length, info->m_flags); \
+    }                                                                       \
+    else                                                                    \
+    {                                                                       \
+      pfs_print_error("REGISTER_BODY_V1: name too long <%s> <%s>\n",        \
+                      category, info->m_name);                              \
+      key= 0;                                                               \
+    }                                                                       \
+    /* key is 0 here when the name was too long. */                         \
+    *(info->m_key)= key;                                                    \
+  }                                                                         \
   return;
 
 /* Use C linkage for the interface functions. */
@@ -1324,9 +1595,9 @@ C_MODE_START
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::register_mutex.
 */
-static void register_mutex_v1(const char *category,
-                              PSI_mutex_info_v1 *info,
-                              int count)
+void pfs_register_mutex_v1(const char *category,
+                           PSI_mutex_info_v1 *info,
+                           int count)
 {
   REGISTER_BODY_V1(PSI_mutex_key,
                    mutex_instrument_prefix,
@@ -1337,22 +1608,80 @@ static void register_mutex_v1(const char *category,
   Implementation of the rwlock instrumentation interface.
   @sa PSI_v1::register_rwlock.
 */
-static void register_rwlock_v1(const char *category,
-                               PSI_rwlock_info_v1 *info,
-                               int count)
+void pfs_register_rwlock_v1(const char *category,
+                            PSI_rwlock_info_v1 *info,
+                            int count)
 {
-  REGISTER_BODY_V1(PSI_rwlock_key,
-                   rwlock_instrument_prefix,
-                   register_rwlock_class)
+  /*
+    Instruments flagged PSI_RWLOCK_FLAG_SX are named with
+    sxlock_instrument_prefix, all others with rwlock_instrument_prefix.
+    Each info[i].m_key receives the registered key, or 0 when the name
+    is too long, a prefix cannot be built, or pfs_initialized is false.
+  */
+  char rw_formatted_name[PFS_MAX_INFO_NAME_LENGTH];
+  char sx_formatted_name[PFS_MAX_INFO_NAME_LENGTH];
+  size_t rw_prefix_length;
+  size_t sx_prefix_length;
+
+  assert(category != NULL);
+  assert(info != NULL);
+  const bool cannot_register=
+    build_prefix(&rwlock_instrument_prefix, category,
+                 rw_formatted_name, &rw_prefix_length) ||
+    build_prefix(&sxlock_instrument_prefix, category,
+                 sx_formatted_name, &sx_prefix_length) ||
+    ! pfs_initialized;
+  if (cannot_register)
+  {
+    while (count > 0)
+    {
+      *(info->m_key)= 0;
+      info++;
+      count--;
+    }
+    return;
+  }
+
+  for (; count > 0; count--, info++)
+  {
+    assert(info->m_key != NULL);
+    assert(info->m_name != NULL);
+
+    /* Pick the name buffer and prefix length matching the lock flavour. */
+    const bool is_sx= (info->m_flags & PSI_RWLOCK_FLAG_SX) != 0;
+    char *name= is_sx ? sx_formatted_name : rw_formatted_name;
+    const size_t prefix_length= is_sx ? sx_prefix_length : rw_prefix_length;
+    const size_t len= strlen(info->m_name);
+    const size_t full_length= prefix_length + len;
+    PSI_rwlock_key key= 0;
+
+    if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))
+    {
+      memcpy(name + prefix_length, info->m_name, len);
+      key= register_rwlock_class(name, (uint)full_length, info->m_flags);
+    }
+    else if (is_sx)
+    {
+      pfs_print_error("REGISTER_BODY_V1: (sx) name too long <%s> <%s>\n",
+                      category, info->m_name);
+    }
+    else
+    {
+      pfs_print_error("REGISTER_BODY_V1: (rw) name too long <%s> <%s>\n",
+                      category, info->m_name);
+    }
+    *(info->m_key)= key;
+  }
+  return;
 }
 
 /**
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::register_cond.
 */
-static void register_cond_v1(const char *category,
-                             PSI_cond_info_v1 *info,
-                             int count)
+void pfs_register_cond_v1(const char *category,
+                          PSI_cond_info_v1 *info,
+                          int count)
 {
   REGISTER_BODY_V1(PSI_cond_key,
                    cond_instrument_prefix,
@@ -1363,9 +1692,9 @@ static void register_cond_v1(const char *category,
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::register_thread.
 */
-static void register_thread_v1(const char *category,
-                               PSI_thread_info_v1 *info,
-                               int count)
+void pfs_register_thread_v1(const char *category,
+                            PSI_thread_info_v1 *info,
+                            int count)
 {
   REGISTER_BODY_V1(PSI_thread_key,
                    thread_instrument_prefix,
@@ -1376,29 +1705,30 @@ static void register_thread_v1(const char *category,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::register_file.
 */
-static void register_file_v1(const char *category,
-                             PSI_file_info_v1 *info,
-                             int count)
+void pfs_register_file_v1(const char *category,
+                          PSI_file_info_v1 *info,
+                          int count)
 {
   REGISTER_BODY_V1(PSI_file_key,
                    file_instrument_prefix,
                    register_file_class)
 }
 
-static void register_stage_v1(const char *category,
-                              PSI_stage_info_v1 **info_array,
-                              int count)
+void pfs_register_stage_v1(const char *category,
+                           PSI_stage_info_v1 **info_array,
+                           int count)
 {
   char formatted_name[PFS_MAX_INFO_NAME_LENGTH];
-  int prefix_length;
-  int len;
-  int full_length;
+  size_t prefix_length;
+  size_t len;
+  size_t full_length;
   PSI_stage_info_v1 *info;
 
-  DBUG_ASSERT(category != NULL);
-  DBUG_ASSERT(info_array != NULL);
+  assert(category != NULL);
+  assert(info_array != NULL);
   if (unlikely(build_prefix(&stage_instrument_prefix, category,
-               formatted_name, &prefix_length)))
+               formatted_name, &prefix_length)) ||
+      ! pfs_initialized)
   {
     for (; count>0; count--, info_array++)
       (*info_array)->m_key= 0;
@@ -1411,13 +1741,14 @@ static void register_stage_v1(const char *category,
     DBUG_ASSERT(info != NULL);
     DBUG_ASSERT(info->m_name != NULL);
     len= (int)strlen(info->m_name);
+    DBUG_ASSERT(len <= 64); // see table_threads.cc near PROCESSLIST_STATE
     full_length= prefix_length + len;
     if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))
     {
       memcpy(formatted_name + prefix_length, info->m_name, len);
       info->m_key= register_stage_class(formatted_name,
-                                        prefix_length,
-                                        full_length,
+                                        (uint)prefix_length,
+                                        (uint)full_length,
                                         info->m_flags);
     }
     else
@@ -1430,19 +1761,20 @@ static void register_stage_v1(const char *category,
   return;
 }
 
-static void register_statement_v1(const char *category,
-                                  PSI_statement_info_v1 *info,
-                                  int count)
+void pfs_register_statement_v1(const char *category,
+                               PSI_statement_info_v1 *info,
+                               int count)
 {
   char formatted_name[PFS_MAX_INFO_NAME_LENGTH];
-  int prefix_length;
-  int len;
-  int full_length;
+  size_t prefix_length;
+  size_t len;
+  size_t full_length;
 
-  DBUG_ASSERT(category != NULL);
-  DBUG_ASSERT(info != NULL);
+  assert(category != NULL);
+  assert(info != NULL);
   if (unlikely(build_prefix(&statement_instrument_prefix,
-                            category, formatted_name, &prefix_length)))
+                            category, formatted_name, &prefix_length)) ||
+      ! pfs_initialized)
   {
     for (; count>0; count--, info++)
       info->m_key= 0;
@@ -1459,7 +1791,7 @@ static void register_statement_v1(const char *category,
     if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))
     {
       memcpy(formatted_name + prefix_length, info->m_name, len);
-      info->m_key= register_statement_class(formatted_name, full_length, info->m_flags);
+      info->m_key= register_statement_class(formatted_name, (uint)full_length, info->m_flags);
     }
     else
     {
@@ -1471,9 +1803,9 @@ static void register_statement_v1(const char *category,
   return;
 }
 
-static void register_socket_v1(const char *category,
-                             PSI_socket_info_v1 *info,
-                             int count)
+void pfs_register_socket_v1(const char *category,
+                            PSI_socket_info_v1 *info,
+                            int count)
 {
   REGISTER_BODY_V1(PSI_socket_key,
                    socket_instrument_prefix,
@@ -1486,8 +1818,6 @@ static void register_socket_v1(const char *category,
   klass= find_##T##_class(KEY);                                             \
   if (unlikely(klass == NULL))                                              \
     return NULL;                                                            \
-  if (! klass->m_enabled)                                                   \
-    return NULL;                                                            \
   pfs= create_##T(klass, ID);                                               \
   return reinterpret_cast<PSI_##T *> (pfs)
 
@@ -1495,8 +1825,8 @@ static void register_socket_v1(const char *category,
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::init_mutex.
 */
-static PSI_mutex*
-init_mutex_v1(PSI_mutex_key key, void *identity)
+PSI_mutex*
+pfs_init_mutex_v1(PSI_mutex_key key, void *identity)
 {
   INIT_BODY_V1(mutex, key, identity);
 }
@@ -1505,11 +1835,11 @@ init_mutex_v1(PSI_mutex_key key, void *identity)
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::destroy_mutex.
 */
-static void destroy_mutex_v1(PSI_mutex* mutex)
+void pfs_destroy_mutex_v1(PSI_mutex* mutex)
 {
   PFS_mutex *pfs= reinterpret_cast<PFS_mutex*> (mutex);
 
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
 
   destroy_mutex(pfs);
 }
@@ -1518,8 +1848,8 @@ static void destroy_mutex_v1(PSI_mutex* mutex)
   Implementation of the rwlock instrumentation interface.
   @sa PSI_v1::init_rwlock.
 */
-static PSI_rwlock*
-init_rwlock_v1(PSI_rwlock_key key, void *identity)
+PSI_rwlock*
+pfs_init_rwlock_v1(PSI_rwlock_key key, void *identity)
 {
   INIT_BODY_V1(rwlock, key, identity);
 }
@@ -1528,11 +1858,11 @@ init_rwlock_v1(PSI_rwlock_key key, void *identity)
   Implementation of the rwlock instrumentation interface.
   @sa PSI_v1::destroy_rwlock.
 */
-static void destroy_rwlock_v1(PSI_rwlock* rwlock)
+void pfs_destroy_rwlock_v1(PSI_rwlock* rwlock)
 {
   PFS_rwlock *pfs= reinterpret_cast<PFS_rwlock*> (rwlock);
 
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
 
   destroy_rwlock(pfs);
 }
@@ -1541,8 +1871,8 @@ static void destroy_rwlock_v1(PSI_rwlock* rwlock)
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::init_cond.
 */
-static PSI_cond*
-init_cond_v1(PSI_cond_key key, void *identity)
+PSI_cond*
+pfs_init_cond_v1(PSI_cond_key key, void *identity)
 {
   INIT_BODY_V1(cond, key, identity);
 }
@@ -1551,11 +1881,11 @@ init_cond_v1(PSI_cond_key key, void *identity)
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::destroy_cond.
 */
-static void destroy_cond_v1(PSI_cond* cond)
+void pfs_destroy_cond_v1(PSI_cond* cond)
 {
   PFS_cond *pfs= reinterpret_cast<PFS_cond*> (cond);
 
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
 
   destroy_cond(pfs);
 }
@@ -1564,14 +1894,14 @@ static void destroy_cond_v1(PSI_cond* cond)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::get_table_share.
 */
-static PSI_table_share*
-get_table_share_v1(my_bool temporary, TABLE_SHARE *share)
+PSI_table_share*
+pfs_get_table_share_v1(my_bool temporary, TABLE_SHARE *share)
 {
   /* Ignore temporary tables and views. */
   if (temporary || share->is_view)
     return NULL;
   /* An instrumented thread is required, for LF_PINS. */
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
   if (unlikely(pfs_thread == NULL))
     return NULL;
   PFS_table_share* pfs_share;
@@ -1583,7 +1913,7 @@ get_table_share_v1(my_bool temporary, TABLE_SHARE *share)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::release_table_share.
 */
-static void release_table_share_v1(PSI_table_share* share)
+void pfs_release_table_share_v1(PSI_table_share* share)
 {
   PFS_table_share* pfs= reinterpret_cast<PFS_table_share*> (share);
 
@@ -1597,15 +1927,15 @@ static void release_table_share_v1(PSI_table_share* share)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::drop_table_share.
 */
-static void
-drop_table_share_v1(my_bool temporary,
-                    const char *schema_name, int schema_name_length,
-                    const char *table_name, int table_name_length)
+void
+pfs_drop_table_share_v1(my_bool temporary,
+                        const char *schema_name, int schema_name_length,
+                        const char *table_name, int table_name_length)
 {
   /* Ignore temporary tables. */
   if (temporary)
     return;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
   if (unlikely(pfs_thread == NULL))
     return;
   /* TODO: temporary tables */
@@ -1617,8 +1947,8 @@ drop_table_share_v1(my_bool temporary,
   Implementation of the table instrumentation interface.
   @sa PSI_v1::open_table.
 */
-static PSI_table*
-open_table_v1(PSI_table_share *share, const void *identity)
+PSI_table*
+pfs_open_table_v1(PSI_table_share *share, const void *identity)
 {
   PFS_table_share *pfs_table_share= reinterpret_cast<PFS_table_share*> (share);
 
@@ -1641,7 +1971,8 @@ open_table_v1(PSI_table_share *share, const void *identity)
   if (! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled)
     return NULL;
 
-  PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *thread= my_thread_get_THR_PFS();
+
   if (unlikely(thread == NULL))
     return NULL;
 
@@ -1653,12 +1984,13 @@ open_table_v1(PSI_table_share *share, const void *identity)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::unbind_table.
 */
-static void unbind_table_v1(PSI_table *table)
+void pfs_unbind_table_v1(PSI_table *table)
 {
   PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
   if (likely(pfs != NULL))
   {
     pfs->m_thread_owner= NULL;
+    pfs->m_owner_event_id= 0;
   }
 }
 
@@ -1666,43 +1998,42 @@ static void unbind_table_v1(PSI_table *table)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::rebind_table.
 */
-static PSI_table *
-rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table)
+PSI_table *
+pfs_rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table)
 {
   PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
   if (likely(pfs != NULL))
   {
-    PFS_thread *thread;
-    DBUG_ASSERT(pfs->m_thread_owner == NULL);
+    assert(pfs->m_thread_owner == NULL);
 
-    if (psi_unlikely(! flag_global_instrumentation))
+    if (unlikely(! pfs->m_share->m_enabled))
     {
       destroy_table(pfs);
       return NULL;
     }
 
-    /* The table handle was already instrumented, reuse it for this thread. */
-    thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-
-    if (unlikely(! pfs->m_share->m_enabled))
+    if (unlikely(! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled))
     {
       destroy_table(pfs);
       return NULL;
     }
 
-    if (unlikely(! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled))
+    if (psi_unlikely(! flag_global_instrumentation))
     {
       destroy_table(pfs);
       return NULL;
     }
 
+    /* The table handle was already instrumented, reuse it for this thread. */
+    PFS_thread *thread= my_thread_get_THR_PFS();
     pfs->m_thread_owner= thread;
+    if (thread != NULL)
+      pfs->m_owner_event_id= thread->m_event_id;
+    else
+      pfs->m_owner_event_id= 0;
     return table;
   }
 
-  if (psi_unlikely(! flag_global_instrumentation))
-    return NULL;
-
   /* See open_table_v1() */
 
   PFS_table_share *pfs_table_share= reinterpret_cast<PFS_table_share*> (share);
@@ -1716,7 +2047,10 @@ rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table)
   if (! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled)
     return NULL;
 
-  PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (! flag_global_instrumentation)
+    return NULL;
+
+  PFS_thread *thread= my_thread_get_THR_PFS();
   if (unlikely(thread == NULL))
     return NULL;
 
@@ -1728,35 +2062,33 @@ rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::close_table.
 */
-static void close_table_v1(PSI_table *table)
+void pfs_close_table_v1(TABLE_SHARE *server_share, PSI_table *table)
 {
   PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
   if (unlikely(pfs == NULL))
     return;
-  pfs->aggregate();
+  pfs->aggregate(server_share);
   destroy_table(pfs);
 }
 
-static PSI_socket*
-init_socket_v1(PSI_socket_key key, const my_socket *fd,
-               const struct sockaddr *addr, socklen_t addr_len)
+PSI_socket*
+pfs_init_socket_v1(PSI_socket_key key, const my_socket *fd,
+                   const struct sockaddr *addr, socklen_t addr_len)
 {
   PFS_socket_class *klass;
   PFS_socket *pfs;
   klass= find_socket_class(key);
   if (unlikely(klass == NULL))
     return NULL;
-  if (! klass->m_enabled)
-    return NULL;
   pfs= create_socket(klass, fd, addr, addr_len);
   return reinterpret_cast<PSI_socket *> (pfs);
 }
 
-static void destroy_socket_v1(PSI_socket *socket)
+void pfs_destroy_socket_v1(PSI_socket *socket)
 {
   PFS_socket *pfs= reinterpret_cast<PFS_socket*> (socket);
 
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
 
   destroy_socket(pfs);
 }
@@ -1765,7 +2097,7 @@ static void destroy_socket_v1(PSI_socket *socket)
   Implementation of the file instrumentation interface.
   @sa PSI_v1::create_file.
 */
-static void create_file_v1(PSI_file_key key, const char *name, File file)
+void pfs_create_file_v1(PSI_file_key key, const char *name, File file)
 {
   if (psi_unlikely(! flag_global_instrumentation))
     return;
@@ -1779,7 +2111,7 @@ static void create_file_v1(PSI_file_key key, const char *name, File file)
     return;
 
   /* A thread is needed for LF_PINS */
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
   if (unlikely(pfs_thread == NULL))
     return;
 
@@ -1821,7 +2153,7 @@ struct PFS_spawn_thread_arg
   void *m_user_arg;
 };
 
-void* pfs_spawn_thread(void *arg)
+extern "C" void* pfs_spawn_thread(void *arg)
 {
   PFS_spawn_thread_arg *typed_arg= (PFS_spawn_thread_arg*) arg;
   void *user_arg;
@@ -1853,7 +2185,7 @@ void* pfs_spawn_thread(void *arg)
   {
     pfs= NULL;
   }
-  my_pthread_setspecific_ptr(THR_PFS, pfs);
+  my_thread_set_THR_PFS(pfs);
 
   /*
     Secondly, free the memory allocated in spawn_thread_v1().
@@ -1875,15 +2207,16 @@ void* pfs_spawn_thread(void *arg)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::spawn_thread.
 */
-static int spawn_thread_v1(PSI_thread_key key,
-                           pthread_t *thread, const pthread_attr_t *attr,
-                           void *(*start_routine)(void*), void *arg)
+int pfs_spawn_thread_v1(PSI_thread_key key,
+                        my_thread_handle *thread, const my_thread_attr_t *attr,
+                        void *(*start_routine)(void*), void *arg)
 {
   PFS_spawn_thread_arg *psi_arg;
   PFS_thread *parent;
 
   /* psi_arg can not be global, and can not be a local variable. */
-  psi_arg= (PFS_spawn_thread_arg*) my_malloc(sizeof(PFS_spawn_thread_arg),
+  psi_arg= (PFS_spawn_thread_arg*) my_malloc(PSI_NOT_INSTRUMENTED,
+                                             sizeof(PFS_spawn_thread_arg),
                                              MYF(MY_WME));
   if (unlikely(psi_arg == NULL))
     return EAGAIN;
@@ -1893,7 +2226,7 @@ static int spawn_thread_v1(PSI_thread_key key,
   psi_arg->m_user_start_routine= start_routine;
   psi_arg->m_user_arg= arg;
 
-  parent= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  parent= my_thread_get_THR_PFS();
   if (parent != NULL)
   {
     /*
@@ -1916,7 +2249,7 @@ static int spawn_thread_v1(PSI_thread_key key,
     psi_arg->m_hostname_length= 0;
   }
 
-  int result= pthread_create(thread, attr, pfs_spawn_thread, psi_arg);
+  int result= my_thread_create(thread, attr, pfs_spawn_thread, psi_arg);
   if (unlikely(result != 0))
     my_free(psi_arg);
   return result;
@@ -1926,14 +2259,22 @@ static int spawn_thread_v1(PSI_thread_key key,
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::new_thread.
 */
-static PSI_thread*
-new_thread_v1(PSI_thread_key key, const void *identity, ulonglong processlist_id)
+PSI_thread*
+pfs_new_thread_v1(PSI_thread_key key, const void *identity, ulonglong processlist_id)
 {
   PFS_thread *pfs;
 
   PFS_thread_class *klass= find_thread_class(key);
   if (likely(klass != NULL))
+  {
     pfs= create_thread(klass, identity, processlist_id);
+    if (pfs != NULL)
+    {
+      PFS_thread *parent= my_thread_get_THR_PFS();
+      if (parent != NULL)
+        pfs->m_parent_thread_internal_id= parent->m_parent_thread_internal_id;
+    }
+  }
   else
     pfs= NULL;
 
@@ -1944,7 +2285,7 @@ new_thread_v1(PSI_thread_key key, const void *identity, ulonglong processlist_id
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_id.
 */
-static void set_thread_id_v1(PSI_thread *thread, ulonglong processlist_id)
+void pfs_set_thread_id_v1(PSI_thread *thread, ulonglong processlist_id)
 {
   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
   if (unlikely(pfs == NULL))
@@ -1954,12 +2295,36 @@ static void set_thread_id_v1(PSI_thread *thread, ulonglong processlist_id)
 
 /**
   Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_THD.
+*/
+void pfs_set_thread_THD_v1(PSI_thread *thread, THD *thd)
+{
+  PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
+  if (unlikely(pfs == NULL))
+    return;
+  pfs->m_thd= thd;
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_os_thread_id.
+*/
+void pfs_set_thread_os_id_v1(PSI_thread *thread)
+{
+  PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
+  if (unlikely(pfs == NULL))
+    return;
+  pfs->m_thread_os_id= my_thread_os_id();
+}
+
+/**
+  Implementation of the thread instrumentation interface.
   @sa PSI_v1::get_thread_id.
 */
-static PSI_thread*
-get_thread_v1(void)
+PSI_thread*
+pfs_get_thread_v1(void)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs= my_thread_get_THR_PFS();
   return reinterpret_cast<PSI_thread*> (pfs);
 }
 
@@ -1967,20 +2332,21 @@ get_thread_v1(void)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_user.
 */
-static void set_thread_user_v1(const char *user, int user_len)
+void pfs_set_thread_user_v1(const char *user, int user_len)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  pfs_dirty_state dirty_state;
+  PFS_thread *pfs= my_thread_get_THR_PFS();
 
-  DBUG_ASSERT((user != NULL) || (user_len == 0));
-  DBUG_ASSERT(user_len >= 0);
-  DBUG_ASSERT((uint) user_len <= sizeof(pfs->m_username));
+  assert((user != NULL) || (user_len == 0));
+  assert(user_len >= 0);
+  assert((uint) user_len <= sizeof(pfs->m_username));
 
   if (unlikely(pfs == NULL))
     return;
 
   aggregate_thread(pfs, pfs->m_account, pfs->m_user, pfs->m_host);
 
-  pfs->m_session_lock.allocated_to_dirty();
+  pfs->m_session_lock.allocated_to_dirty(& dirty_state);
 
   clear_thread_account(pfs);
 
@@ -1990,50 +2356,57 @@ static void set_thread_user_v1(const char *user, int user_len)
 
   set_thread_account(pfs);
 
-  bool enabled= true;
-  if (flag_thread_instrumentation)
+  bool enabled;
+  bool history;
+  if (pfs->m_account != NULL)
+  {
+    enabled= pfs->m_account->m_enabled;
+    history= pfs->m_account->m_history;
+  }
+  else
   {
     if ((pfs->m_username_length > 0) && (pfs->m_hostname_length > 0))
     {
-      /*
-        TODO: performance improvement.
-        Once performance_schema.USERS is exposed,
-        we can use PFS_user::m_enabled instead of looking up
-        SETUP_ACTORS every time.
-      */
       lookup_setup_actor(pfs,
                          pfs->m_username, pfs->m_username_length,
                          pfs->m_hostname, pfs->m_hostname_length,
-                         &enabled);
+                         &enabled, &history);
+    }
+    else
+    {
+      /* There is no setting for background threads */
+      enabled= true;
+      history= true;
     }
   }
+  pfs->set_enabled(enabled);
+  pfs->set_history(history);
 
-  pfs->m_enabled= enabled;
-
-  pfs->m_session_lock.dirty_to_allocated();
+  pfs->m_session_lock.dirty_to_allocated(& dirty_state);
 }
 
 /**
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_account.
 */
-static void set_thread_account_v1(const char *user, int user_len,
-                                    const char *host, int host_len)
+void pfs_set_thread_account_v1(const char *user, int user_len,
+                               const char *host, int host_len)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  pfs_dirty_state dirty_state;
+  PFS_thread *pfs= my_thread_get_THR_PFS();
 
-  DBUG_ASSERT((user != NULL) || (user_len == 0));
-  DBUG_ASSERT(user_len >= 0);
-  DBUG_ASSERT((uint) user_len <= sizeof(pfs->m_username));
-  DBUG_ASSERT((host != NULL) || (host_len == 0));
-  DBUG_ASSERT(host_len >= 0);
+  assert((user != NULL) || (user_len == 0));
+  assert(user_len >= 0);
+  assert((uint) user_len <= sizeof(pfs->m_username));
+  assert((host != NULL) || (host_len == 0));
+  assert(host_len >= 0);
 
   host_len= MY_MIN(host_len, static_cast<int>(sizeof(pfs->m_hostname)));
 
   if (unlikely(pfs == NULL))
     return;
 
-  pfs->m_session_lock.allocated_to_dirty();
+  pfs->m_session_lock.allocated_to_dirty(& dirty_state);
 
   clear_thread_account(pfs);
 
@@ -2047,47 +2420,55 @@ static void set_thread_account_v1(const char *user, int user_len,
 
   set_thread_account(pfs);
 
-  bool enabled= true;
-  if (flag_thread_instrumentation)
+  bool enabled;
+  bool history;
+  if (pfs->m_account != NULL)
+  {
+    enabled= pfs->m_account->m_enabled;
+    history= pfs->m_account->m_history;
+  }
+  else
   {
     if ((pfs->m_username_length > 0) && (pfs->m_hostname_length > 0))
     {
-      /*
-        TODO: performance improvement.
-        Once performance_schema.USERS is exposed,
-        we can use PFS_user::m_enabled instead of looking up
-        SETUP_ACTORS every time.
-      */
       lookup_setup_actor(pfs,
                          pfs->m_username, pfs->m_username_length,
                          pfs->m_hostname, pfs->m_hostname_length,
-                         &enabled);
+                         &enabled, &history);
+    }
+    else
+    {
+      /* There is no setting for background threads */
+      enabled= true;
+      history= true;
     }
   }
-  pfs->m_enabled= enabled;
+  pfs->set_enabled(enabled);
+  pfs->set_history(history);
 
-  pfs->m_session_lock.dirty_to_allocated();
+  pfs->m_session_lock.dirty_to_allocated(& dirty_state);
 }
 
 /**
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_db.
 */
-static void set_thread_db_v1(const char* db, int db_len)
+void pfs_set_thread_db_v1(const char* db, int db_len)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs= my_thread_get_THR_PFS();
 
-  DBUG_ASSERT((db != NULL) || (db_len == 0));
-  DBUG_ASSERT(db_len >= 0);
-  DBUG_ASSERT((uint) db_len <= sizeof(pfs->m_dbname));
+  assert((db != NULL) || (db_len == 0));
+  assert(db_len >= 0);
+  assert((uint) db_len <= sizeof(pfs->m_dbname));
 
   if (likely(pfs != NULL))
   {
-    pfs->m_stmt_lock.allocated_to_dirty();
+    pfs_dirty_state dirty_state;
+    pfs->m_stmt_lock.allocated_to_dirty(& dirty_state);
     if (db_len > 0)
       memcpy(pfs->m_dbname, db, db_len);
     pfs->m_dbname_length= db_len;
-    pfs->m_stmt_lock.dirty_to_allocated();
+    pfs->m_stmt_lock.dirty_to_allocated(& dirty_state);
   }
 }
 
@@ -2095,12 +2476,12 @@ static void set_thread_db_v1(const char* db, int db_len)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_command.
 */
-static void set_thread_command_v1(int command)
+void pfs_set_thread_command_v1(int command)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs= my_thread_get_THR_PFS();
 
-  DBUG_ASSERT(command >= 0);
-  DBUG_ASSERT(command <= (int) COM_END);
+  assert(command >= 0);
+  assert(command <= (int) COM_END);
 
   if (likely(pfs != NULL))
   {
@@ -2109,12 +2490,27 @@ static void set_thread_command_v1(int command)
 }
 
 /**
+Implementation of the thread instrumentation interface.
+@sa PSI_v1::set_thread_connection_type.
+*/
+void pfs_set_connection_type_v1(opaque_vio_type conn_type)
+{
+  PFS_thread *pfs= my_thread_get_THR_PFS();
+
+  if (likely(pfs != NULL))
+  {
+    pfs->m_connection_type= static_cast<enum_vio_type> (conn_type);
+  }
+}
+
+
+/**
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_start_time.
 */
-static void set_thread_start_time_v1(time_t start_time)
+void pfs_set_thread_start_time_v1(time_t start_time)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs= my_thread_get_THR_PFS();
 
   if (likely(pfs != NULL))
   {
@@ -2126,7 +2522,7 @@ static void set_thread_start_time_v1(time_t start_time)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_state.
 */
-static void set_thread_state_v1(const char* state)
+void pfs_set_thread_state_v1(const char* state)
 {
   /* DEPRECATED. */
 }
@@ -2135,11 +2531,12 @@ static void set_thread_state_v1(const char* state)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread_info.
 */
-static void set_thread_info_v1(const char* info, uint info_len)
+void pfs_set_thread_info_v1(const char* info, uint info_len)
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  pfs_dirty_state dirty_state;
+  PFS_thread *pfs= my_thread_get_THR_PFS();
 
-  DBUG_ASSERT((info != NULL) || (info_len == 0));
+  assert((info != NULL) || (info_len == 0));
 
   if (likely(pfs != NULL))
   {
@@ -2148,16 +2545,16 @@ static void set_thread_info_v1(const char* info, uint info_len)
       if (info_len > sizeof(pfs->m_processlist_info))
         info_len= sizeof(pfs->m_processlist_info);
 
-      pfs->m_stmt_lock.allocated_to_dirty();
+      pfs->m_stmt_lock.allocated_to_dirty(& dirty_state);
       memcpy(pfs->m_processlist_info, info, info_len);
       pfs->m_processlist_info_length= info_len;
-      pfs->m_stmt_lock.dirty_to_allocated();
+      pfs->m_stmt_lock.dirty_to_allocated(& dirty_state);
     }
     else
     {
-      pfs->m_stmt_lock.allocated_to_dirty();
+      pfs->m_stmt_lock.allocated_to_dirty(& dirty_state);
       pfs->m_processlist_info_length= 0;
-      pfs->m_stmt_lock.dirty_to_allocated();
+      pfs->m_stmt_lock.dirty_to_allocated(& dirty_state);
     }
   }
 }
@@ -2166,23 +2563,23 @@ static void set_thread_info_v1(const char* info, uint info_len)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::set_thread.
 */
-static void set_thread_v1(PSI_thread* thread)
+void pfs_set_thread_v1(PSI_thread* thread)
 {
   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
-  my_pthread_setspecific_ptr(THR_PFS, pfs);
+  my_thread_set_THR_PFS(pfs);
 }
 
 /**
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::delete_current_thread.
 */
-static void delete_current_thread_v1(void)
+void pfs_delete_current_thread_v1(void)
 {
-  PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *thread= my_thread_get_THR_PFS();
   if (thread != NULL)
   {
     aggregate_thread(thread, thread->m_account, thread->m_user, thread->m_host);
-    my_pthread_setspecific_ptr(THR_PFS, NULL);
+    my_thread_set_THR_PFS(NULL);
     destroy_thread(thread);
   }
 }
@@ -2191,7 +2588,7 @@ static void delete_current_thread_v1(void)
   Implementation of the thread instrumentation interface.
   @sa PSI_v1::delete_thread.
 */
-static void delete_thread_v1(PSI_thread *thread)
+void pfs_delete_thread_v1(PSI_thread *thread)
 {
   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
 
@@ -2206,18 +2603,18 @@ static void delete_thread_v1(PSI_thread *thread)
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::start_mutex_wait.
 */
-static PSI_mutex_locker*
-start_mutex_wait_v1(PSI_mutex_locker_state *state,
-                    PSI_mutex *mutex, PSI_mutex_operation op,
-                    const char *src_file, uint src_line)
+PSI_mutex_locker*
+pfs_start_mutex_wait_v1(PSI_mutex_locker_state *state,
+                        PSI_mutex *mutex, PSI_mutex_operation op,
+                        const char *src_file, uint src_line)
 {
   PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex);
-  DBUG_ASSERT((int) op >= 0);
-  DBUG_ASSERT((uint) op < array_elements(mutex_operation_map));
-  DBUG_ASSERT(state != NULL);
+  assert((int) op >= 0);
+  assert((uint) op < array_elements(mutex_operation_map));
+  assert(state != NULL);
 
-  DBUG_ASSERT(pfs_mutex != NULL);
-  DBUG_ASSERT(pfs_mutex->m_class != NULL);
+  assert(pfs_mutex != NULL);
+  assert(pfs_mutex->m_class != NULL);
 
   if (! pfs_mutex->m_enabled)
     return NULL;
@@ -2227,7 +2624,7 @@ start_mutex_wait_v1(PSI_mutex_locker_state *state,
 
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
     if (unlikely(pfs_thread == NULL))
       return NULL;
     if (! pfs_thread->m_enabled)
@@ -2259,7 +2656,7 @@ start_mutex_wait_v1(PSI_mutex_locker_state *state,
       wait->m_nesting_event_id= parent_event->m_event_id;
       wait->m_nesting_event_type= parent_event->m_event_type;
 
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= pfs_mutex->m_class;
       wait->m_timer_start= timer_start;
       wait->m_timer_end= 0;
@@ -2304,18 +2701,38 @@ start_mutex_wait_v1(PSI_mutex_locker_state *state,
   @sa PSI_v1::start_rwlock_rdwait
   @sa PSI_v1::start_rwlock_wrwait
 */
-static PSI_rwlock_locker*
-start_rwlock_wait_v1(PSI_rwlock_locker_state *state,
-                     PSI_rwlock *rwlock,
-                     PSI_rwlock_operation op,
-                     const char *src_file, uint src_line)
+PSI_rwlock_locker*
+pfs_start_rwlock_wait_v1(PSI_rwlock_locker_state *state,
+                         PSI_rwlock *rwlock,
+                         PSI_rwlock_operation op,
+                         const char *src_file, uint src_line)
 {
   PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock);
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(rwlock_operation_map));
-  DBUG_ASSERT(state != NULL);
-  DBUG_ASSERT(pfs_rwlock != NULL);
-  DBUG_ASSERT(pfs_rwlock->m_class != NULL);
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(rwlock_operation_map));
+  assert(state != NULL);
+  assert(pfs_rwlock != NULL);
+  assert(pfs_rwlock->m_class != NULL);
+
+  /* Operations supported for READ WRITE LOCK */
+
+  assert(   pfs_rwlock->m_class->is_shared_exclusive()
+            || (op == PSI_RWLOCK_READLOCK)
+            || (op == PSI_RWLOCK_WRITELOCK)
+            || (op == PSI_RWLOCK_TRYREADLOCK)
+            || (op == PSI_RWLOCK_TRYWRITELOCK)
+            );
+
+  /* Operations supported for SHARED EXCLUSIVE LOCK */
+
+  assert(   ! pfs_rwlock->m_class->is_shared_exclusive()
+            || (op == PSI_RWLOCK_SHAREDLOCK)
+            || (op == PSI_RWLOCK_SHAREDEXCLUSIVELOCK)
+            || (op == PSI_RWLOCK_EXCLUSIVELOCK)
+            || (op == PSI_RWLOCK_TRYSHAREDLOCK)
+            || (op == PSI_RWLOCK_TRYSHAREDEXCLUSIVELOCK)
+            || (op == PSI_RWLOCK_TRYEXCLUSIVELOCK)
+            );
 
   if (! pfs_rwlock->m_enabled)
     return NULL;
@@ -2325,7 +2742,7 @@ start_rwlock_wait_v1(PSI_rwlock_locker_state *state,
 
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
     if (unlikely(pfs_thread == NULL))
       return NULL;
     if (! pfs_thread->m_enabled)
@@ -2357,7 +2774,7 @@ start_rwlock_wait_v1(PSI_rwlock_locker_state *state,
       wait->m_nesting_event_id= parent_event->m_event_id;
       wait->m_nesting_event_type= parent_event->m_event_type;
 
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= pfs_rwlock->m_class;
       wait->m_timer_start= timer_start;
       wait->m_timer_end= 0;
@@ -2394,18 +2811,49 @@ start_rwlock_wait_v1(PSI_rwlock_locker_state *state,
 
   state->m_flags= flags;
   state->m_rwlock= rwlock;
+  state->m_operation= op;
   return reinterpret_cast<PSI_rwlock_locker*> (state);
 }
 
+PSI_rwlock_locker*
+pfs_start_rwlock_rdwait_v1(PSI_rwlock_locker_state *state,
+                           PSI_rwlock *rwlock,
+                           PSI_rwlock_operation op,
+                           const char *src_file, uint src_line)
+{
+  assert((op == PSI_RWLOCK_READLOCK) ||
+         (op == PSI_RWLOCK_TRYREADLOCK) ||
+         (op == PSI_RWLOCK_SHAREDLOCK) ||
+         (op == PSI_RWLOCK_TRYSHAREDLOCK));
+
+  return pfs_start_rwlock_wait_v1(state, rwlock, op, src_file, src_line);
+}
+
+PSI_rwlock_locker*
+pfs_start_rwlock_wrwait_v1(PSI_rwlock_locker_state *state,
+                           PSI_rwlock *rwlock,
+                           PSI_rwlock_operation op,
+                           const char *src_file, uint src_line)
+{
+  assert((op == PSI_RWLOCK_WRITELOCK) ||
+         (op == PSI_RWLOCK_TRYWRITELOCK) ||
+         (op == PSI_RWLOCK_SHAREDEXCLUSIVELOCK) ||
+         (op == PSI_RWLOCK_TRYSHAREDEXCLUSIVELOCK) ||
+         (op == PSI_RWLOCK_EXCLUSIVELOCK) ||
+         (op == PSI_RWLOCK_TRYEXCLUSIVELOCK));
+
+  return pfs_start_rwlock_wait_v1(state, rwlock, op, src_file, src_line);
+}
+
 /**
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::start_cond_wait.
 */
-static PSI_cond_locker*
-start_cond_wait_v1(PSI_cond_locker_state *state,
-                   PSI_cond *cond, PSI_mutex *mutex,
-                   PSI_cond_operation op,
-                   const char *src_file, uint src_line)
+PSI_cond_locker*
+pfs_start_cond_wait_v1(PSI_cond_locker_state *state,
+                       PSI_cond *cond, PSI_mutex *mutex,
+                       PSI_cond_operation op,
+                       const char *src_file, uint src_line)
 {
   /*
     Note about the unused PSI_mutex *mutex parameter:
@@ -2419,11 +2867,11 @@ start_cond_wait_v1(PSI_cond_locker_state *state,
     in start_cond_wait_v1() and end_cond_wait_v1().
   */
   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(cond_operation_map));
-  DBUG_ASSERT(state != NULL);
-  DBUG_ASSERT(pfs_cond != NULL);
-  DBUG_ASSERT(pfs_cond->m_class != NULL);
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(cond_operation_map));
+  assert(state != NULL);
+  assert(pfs_cond != NULL);
+  assert(pfs_cond->m_class != NULL);
 
   if (! pfs_cond->m_enabled)
     return NULL;
@@ -2433,7 +2881,7 @@ start_cond_wait_v1(PSI_cond_locker_state *state,
 
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
     if (unlikely(pfs_thread == NULL))
       return NULL;
     if (! pfs_thread->m_enabled)
@@ -2465,7 +2913,7 @@ start_cond_wait_v1(PSI_cond_locker_state *state,
       wait->m_nesting_event_id= parent_event->m_event_id;
       wait->m_nesting_event_type= parent_event->m_event_type;
 
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= pfs_cond->m_class;
       wait->m_timer_start= timer_start;
       wait->m_timer_end= 0;
@@ -2536,7 +2984,7 @@ static inline PFS_TL_LOCK_TYPE lock_flags_to_lock_type(uint flags)
     case TL_READ_DEFAULT:
     case TL_WRITE_DEFAULT:
     default:
-      DBUG_ASSERT(false);
+      assert(false);
   }
 
   /* Dead code */
@@ -2545,7 +2993,7 @@ static inline PFS_TL_LOCK_TYPE lock_flags_to_lock_type(uint flags)
 
 static inline PFS_TL_LOCK_TYPE external_lock_flags_to_lock_type(uint flags)
 {
-  DBUG_ASSERT(flags == F_RDLCK || flags == F_WRLCK);
+  assert(flags == F_RDLCK || flags == F_WRLCK);
   return (flags == F_RDLCK ? PFS_TL_READ_EXTERNAL : PFS_TL_WRITE_EXTERNAL);
 }
 
@@ -2553,24 +3001,24 @@ static inline PFS_TL_LOCK_TYPE external_lock_flags_to_lock_type(uint flags)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::start_table_io_wait_v1
 */
-static PSI_table_locker*
-start_table_io_wait_v1(PSI_table_locker_state *state,
-                       PSI_table *table,
-                       PSI_table_io_operation op,
-                       uint index,
-                       const char *src_file, uint src_line)
+PSI_table_locker*
+pfs_start_table_io_wait_v1(PSI_table_locker_state *state,
+                           PSI_table *table,
+                           PSI_table_io_operation op,
+                           uint index,
+                           const char *src_file, uint src_line)
 {
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(table_io_operation_map));
-  DBUG_ASSERT(state != NULL);
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(table_io_operation_map));
+  assert(state != NULL);
   PFS_table *pfs_table= reinterpret_cast<PFS_table*> (table);
-  DBUG_ASSERT(pfs_table != NULL);
-  DBUG_ASSERT(pfs_table->m_share != NULL);
+  assert(pfs_table != NULL);
+  assert(pfs_table->m_share != NULL);
 
   if (! pfs_table->m_io_enabled)
     return NULL;
 
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
 
   uint flags;
   ulonglong timer_start= 0;
@@ -2609,7 +3057,7 @@ start_table_io_wait_v1(PSI_table_locker_state *state,
       wait->m_nesting_event_type= parent_event->m_event_type;
 
       PFS_table_share *share= pfs_table->m_share;
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= &global_table_io_class;
       wait->m_timer_start= timer_start;
       wait->m_timer_end= 0;
@@ -2655,25 +3103,25 @@ start_table_io_wait_v1(PSI_table_locker_state *state,
   Implementation of the table instrumentation interface.
   @sa PSI_v1::start_table_lock_wait.
 */
-static PSI_table_locker*
-start_table_lock_wait_v1(PSI_table_locker_state *state,
-                         PSI_table *table,
-                         PSI_table_lock_operation op,
-                         ulong op_flags,
-                         const char *src_file, uint src_line)
+PSI_table_locker*
+pfs_start_table_lock_wait_v1(PSI_table_locker_state *state,
+                             PSI_table *table,
+                             PSI_table_lock_operation op,
+                             ulong op_flags,
+                             const char *src_file, uint src_line)
 {
-  DBUG_ASSERT(state != NULL);
-  DBUG_ASSERT((op == PSI_TABLE_LOCK) || (op == PSI_TABLE_EXTERNAL_LOCK));
+  assert(state != NULL);
+  assert((op == PSI_TABLE_LOCK) || (op == PSI_TABLE_EXTERNAL_LOCK));
 
   PFS_table *pfs_table= reinterpret_cast<PFS_table*> (table);
 
-  DBUG_ASSERT(pfs_table != NULL);
-  DBUG_ASSERT(pfs_table->m_share != NULL);
+  assert(pfs_table != NULL);
+  assert(pfs_table->m_share != NULL);
 
   if (! pfs_table->m_lock_enabled)
     return NULL;
 
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
 
   PFS_TL_LOCK_TYPE lock_type;
 
@@ -2681,6 +3129,7 @@ start_table_lock_wait_v1(PSI_table_locker_state *state,
   {
     case PSI_TABLE_LOCK:
       lock_type= lock_flags_to_lock_type(op_flags);
+      pfs_table->m_internal_lock= lock_type;
       break;
     case PSI_TABLE_EXTERNAL_LOCK:
       /*
@@ -2688,15 +3137,19 @@ start_table_lock_wait_v1(PSI_table_locker_state *state,
         there is no handler::external_unlock().
       */
       if (op_flags == F_UNLCK)
+      {
+        pfs_table->m_external_lock= PFS_TL_NONE;
         return NULL;
+      }
       lock_type= external_lock_flags_to_lock_type(op_flags);
+      pfs_table->m_external_lock= lock_type;
       break;
     default:
       lock_type= PFS_TL_READ;
-      DBUG_ASSERT(false);
+      assert(false);
   }
 
-  DBUG_ASSERT((uint) lock_type < array_elements(table_lock_operation_map));
+  assert((uint) lock_type < array_elements(table_lock_operation_map));
 
   uint flags;
   ulonglong timer_start= 0;
@@ -2735,7 +3188,7 @@ start_table_lock_wait_v1(PSI_table_locker_state *state,
       wait->m_nesting_event_type= parent_event->m_event_type;
 
       PFS_table_share *share= pfs_table->m_share;
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= &global_table_lock_class;
       wait->m_timer_start= timer_start;
       wait->m_timer_end= 0;
@@ -2780,15 +3233,15 @@ start_table_lock_wait_v1(PSI_table_locker_state *state,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::get_thread_file_name_locker.
 */
-static PSI_file_locker*
-get_thread_file_name_locker_v1(PSI_file_locker_state *state,
-                               PSI_file_key key,
-                               PSI_file_operation op,
-                               const char *name, const void *identity)
-{
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
-  DBUG_ASSERT(state != NULL);
+PSI_file_locker*
+pfs_get_thread_file_name_locker_v1(PSI_file_locker_state *state,
+                                   PSI_file_key key,
+                                   PSI_file_operation op,
+                                   const char *name, const void *identity)
+{
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(file_operation_map));
+  assert(state != NULL);
 
   if (psi_unlikely(! flag_global_instrumentation))
     return NULL;
@@ -2799,7 +3252,7 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state,
     return NULL;
 
   /* Needed for the LF_HASH */
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
   if (unlikely(pfs_thread == NULL))
     return NULL;
 
@@ -2831,7 +3284,7 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state,
     wait->m_nesting_event_id= parent_event->m_event_id;
     wait->m_nesting_event_type= parent_event->m_event_type;
 
-    wait->m_thread= pfs_thread;
+    wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
     wait->m_class= klass;
     wait->m_timer_start= 0;
     wait->m_timer_end= 0;
@@ -2858,33 +3311,37 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::get_thread_file_stream_locker.
 */
-static PSI_file_locker*
-get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
-                                 PSI_file *file, PSI_file_operation op)
+PSI_file_locker*
+pfs_get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
+                                     PSI_file *file, PSI_file_operation op)
 {
   PFS_file *pfs_file= reinterpret_cast<PFS_file*> (file);
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
-  DBUG_ASSERT(state != NULL);
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(file_operation_map));
+  assert(state != NULL);
 
   if (unlikely(pfs_file == NULL))
     return NULL;
-  DBUG_ASSERT(pfs_file->m_class != NULL);
+  assert(pfs_file->m_class != NULL);
   PFS_file_class *klass= pfs_file->m_class;
 
   if (! pfs_file->m_enabled)
     return NULL;
 
+  /* Needed for the LF_HASH */
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+  if (unlikely(pfs_thread == NULL))
+    return NULL;
+
   uint flags;
 
+  /* Always populated */
+  state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-    if (unlikely(pfs_thread == NULL))
-      return NULL;
     if (! pfs_thread->m_enabled)
       return NULL;
-    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
     flags= STATE_FLAG_THREAD;
 
     if (pfs_file->m_timed)
@@ -2907,7 +3364,7 @@ get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
       wait->m_nesting_event_id= parent_event->m_event_id;
       wait->m_nesting_event_type= parent_event->m_event_type;
 
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= klass;
       wait->m_timer_start= 0;
       wait->m_timer_end= 0;
@@ -2924,7 +3381,6 @@ get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
   }
   else
   {
-    state->m_thread= NULL;
     if (pfs_file->m_timed)
     {
       flags= STATE_FLAG_TIMED;
@@ -2948,14 +3404,14 @@ get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::get_thread_file_descriptor_locker.
 */
-static PSI_file_locker*
-get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
-                                     File file, PSI_file_operation op)
+PSI_file_locker*
+pfs_get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
+                                         File file, PSI_file_operation op)
 {
   int index= static_cast<int> (file);
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
-  DBUG_ASSERT(state != NULL);
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(file_operation_map));
+  assert(state != NULL);
 
   if (unlikely((index < 0) || (index >= file_handle_max)))
     return NULL;
@@ -2978,19 +3434,23 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
   if (! pfs_file->m_enabled)
     return NULL;
 
-  DBUG_ASSERT(pfs_file->m_class != NULL);
+  assert(pfs_file->m_class != NULL);
   PFS_file_class *klass= pfs_file->m_class;
 
+  /* Needed for the LF_HASH */
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+  if (unlikely(pfs_thread == NULL))
+    return NULL;
+
   uint flags;
 
+  /* Always populated */
+  state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-    if (unlikely(pfs_thread == NULL))
-      return NULL;
     if (! pfs_thread->m_enabled)
       return NULL;
-    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
     flags= STATE_FLAG_THREAD;
 
     if (pfs_file->m_timed)
@@ -3013,7 +3473,7 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
       wait->m_nesting_event_id= parent_event->m_event_id;
       wait->m_nesting_event_type= parent_event->m_event_type;
 
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= klass;
       wait->m_timer_start= 0;
       wait->m_timer_end= 0;
@@ -3030,7 +3490,6 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
   }
   else
   {
-    state->m_thread= NULL;
     if (pfs_file->m_timed)
     {
       flags= STATE_FLAG_TIMED;
@@ -3052,20 +3511,20 @@ get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
 
 /** Socket locker */
 
-static PSI_socket_locker*
-start_socket_wait_v1(PSI_socket_locker_state *state,
-                     PSI_socket *socket,
-                     PSI_socket_operation op,
-                     size_t count,
-                     const char *src_file, uint src_line)
+PSI_socket_locker*
+pfs_start_socket_wait_v1(PSI_socket_locker_state *state,
+                         PSI_socket *socket,
+                         PSI_socket_operation op,
+                         size_t count,
+                         const char *src_file, uint src_line)
 {
-  DBUG_ASSERT(static_cast<int> (op) >= 0);
-  DBUG_ASSERT(static_cast<uint> (op) < array_elements(socket_operation_map));
-  DBUG_ASSERT(state != NULL);
+  assert(static_cast<int> (op) >= 0);
+  assert(static_cast<uint> (op) < array_elements(socket_operation_map));
+  assert(state != NULL);
   PFS_socket *pfs_socket= reinterpret_cast<PFS_socket*> (socket);
 
-  DBUG_ASSERT(pfs_socket != NULL);
-  DBUG_ASSERT(pfs_socket->m_class != NULL);
+  assert(pfs_socket != NULL);
+  assert(pfs_socket->m_class != NULL);
 
   if (!pfs_socket->m_enabled || pfs_socket->m_idle)
     return NULL;
@@ -3080,7 +3539,7 @@ start_socket_wait_v1(PSI_socket_locker_state *state,
        as different threads may use concurrently the same socket,
        for example during a KILL.
     */
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
 
     if (unlikely(pfs_thread == NULL))
       return NULL;
@@ -3114,7 +3573,7 @@ start_socket_wait_v1(PSI_socket_locker_state *state,
       wait->m_event_type= EVENT_TYPE_WAIT;
       wait->m_nesting_event_id=   parent_event->m_event_id;
       wait->m_nesting_event_type= parent_event->m_event_type;
-      wait->m_thread=       pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class=        pfs_socket->m_class;
       wait->m_timer_start=  timer_start;
       wait->m_timer_end=    0;
@@ -3177,11 +3636,11 @@ start_socket_wait_v1(PSI_socket_locker_state *state,
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::unlock_mutex.
 */
-static void unlock_mutex_v1(PSI_mutex *mutex)
+void pfs_unlock_mutex_v1(PSI_mutex *mutex)
 {
   PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex);
 
-  DBUG_ASSERT(pfs_mutex != NULL);
+  assert(pfs_mutex != NULL);
 
   /*
     Note that this code is still protected by the instrumented mutex,
@@ -3214,13 +3673,13 @@ static void unlock_mutex_v1(PSI_mutex *mutex)
   Implementation of the rwlock instrumentation interface.
   @sa PSI_v1::unlock_rwlock.
 */
-static void unlock_rwlock_v1(PSI_rwlock *rwlock)
+void pfs_unlock_rwlock_v1(PSI_rwlock *rwlock)
 {
   PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock);
-  DBUG_ASSERT(pfs_rwlock != NULL);
-  DBUG_ASSERT(pfs_rwlock == sanitize_rwlock(pfs_rwlock));
-  DBUG_ASSERT(pfs_rwlock->m_class != NULL);
-  DBUG_ASSERT(pfs_rwlock->m_lock.is_populated());
+  assert(pfs_rwlock != NULL);
+  assert(pfs_rwlock == sanitize_rwlock(pfs_rwlock));
+  assert(pfs_rwlock->m_class != NULL);
+  assert(pfs_rwlock->m_lock.is_populated());
 
   bool last_writer= false;
   bool last_reader= false;
@@ -3292,36 +3751,40 @@ static void unlock_rwlock_v1(PSI_rwlock *rwlock)
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::signal_cond.
 */
-static void signal_cond_v1(PSI_cond* cond)
+void pfs_signal_cond_v1(PSI_cond* cond)
 {
+#ifdef PFS_LATER /* without PFS_LATER this function body is empty */
   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
 
-  DBUG_ASSERT(pfs_cond != NULL);
+  assert(pfs_cond != NULL);
 
   pfs_cond->m_cond_stat.m_signal_count++;
+#endif /* PFS_LATER */
 }
 
 /**
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::broadcast_cond.
 */
-static void broadcast_cond_v1(PSI_cond* cond)
+void pfs_broadcast_cond_v1(PSI_cond* cond)
 {
+#ifdef PFS_LATER /* without PFS_LATER this function body is empty */
   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
 
-  DBUG_ASSERT(pfs_cond != NULL);
+  assert(pfs_cond != NULL);
 
   pfs_cond->m_cond_stat.m_broadcast_count++;
+#endif /* PFS_LATER */
 }
 
 /**
   Implementation of the idle instrumentation interface.
   @sa PSI_v1::start_idle_wait.
 */
-static PSI_idle_locker*
-start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_line)
+PSI_idle_locker*
+pfs_start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_line)
 {
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   if (psi_unlikely(! flag_global_instrumentation))
     return NULL;
@@ -3334,7 +3797,7 @@ start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_
 
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
     if (unlikely(pfs_thread == NULL))
       return NULL;
     if (!pfs_thread->m_enabled)
@@ -3342,7 +3805,7 @@ start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_
     state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
     flags= STATE_FLAG_THREAD;
 
-    DBUG_ASSERT(pfs_thread->m_events_statements_count == 0);
+    assert(pfs_thread->m_events_statements_count == 0);
 
     if (global_idle_class.m_timed)
     {
@@ -3372,7 +3835,7 @@ start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_
       wait->m_nesting_event_id= 0;
       /* no need to set wait->m_nesting_event_type */
 
-      wait->m_thread= pfs_thread;
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
       wait->m_class= &global_idle_class;
       wait->m_timer_start= timer_start;
       wait->m_timer_end= 0;
@@ -3404,10 +3867,10 @@ start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::end_idle_wait.
 */
-static void end_idle_wait_v1(PSI_idle_locker* locker)
+void pfs_end_idle_wait_v1(PSI_idle_locker* locker)
 {
   PSI_idle_locker_state *state= reinterpret_cast<PSI_idle_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
 
@@ -3423,7 +3886,7 @@ static void end_idle_wait_v1(PSI_idle_locker* locker)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
 
     if (flags & STATE_FLAG_TIMED)
     {
@@ -3439,17 +3902,17 @@ static void end_idle_wait_v1(PSI_idle_locker* locker)
     if (flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 
@@ -3469,16 +3932,16 @@ static void end_idle_wait_v1(PSI_idle_locker* locker)
   Implementation of the mutex instrumentation interface.
   @sa PSI_v1::end_mutex_wait.
 */
-static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
+void pfs_end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
 {
   PSI_mutex_locker_state *state= reinterpret_cast<PSI_mutex_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
 
   PFS_mutex *mutex= reinterpret_cast<PFS_mutex *> (state->m_mutex);
-  DBUG_ASSERT(mutex != NULL);
+  assert(mutex != NULL);
   PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
 
   uint flags= state->m_flags;
@@ -3505,9 +3968,12 @@ static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
   if (flags & STATE_FLAG_THREAD)
   {
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
     uint index= mutex->m_class->m_event_name_index;
 
+    assert(index <= wait_class_max);
+    assert(sanitize_thread(thread) != NULL);
+
     if (flags & STATE_FLAG_TIMED)
     {
       /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
@@ -3522,17 +3988,17 @@ static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
     if (flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 }
@@ -3541,16 +4007,16 @@ static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
   Implementation of the rwlock instrumentation interface.
   @sa PSI_v1::end_rwlock_rdwait.
 */
-static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
+void pfs_end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
 {
   PSI_rwlock_locker_state *state= reinterpret_cast<PSI_rwlock_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
 
   PFS_rwlock *rwlock= reinterpret_cast<PFS_rwlock *> (state->m_rwlock);
-  DBUG_ASSERT(rwlock != NULL);
+  assert(rwlock != NULL);
 
   if (state->m_flags & STATE_FLAG_TIMED)
   {
@@ -3583,10 +4049,10 @@ static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
   if (state->m_flags & STATE_FLAG_THREAD)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
-    DBUG_ASSERT(thread != NULL);
+    assert(thread != NULL);
 
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
     uint index= rwlock->m_class->m_event_name_index;
 
     if (state->m_flags & STATE_FLAG_TIMED)
@@ -3603,17 +4069,17 @@ static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
     if (state->m_flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 }
@@ -3622,16 +4088,16 @@ static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
   Implementation of the rwlock instrumentation interface.
   @sa PSI_v1::end_rwlock_wrwait.
 */
-static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
+void pfs_end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
 {
   PSI_rwlock_locker_state *state= reinterpret_cast<PSI_rwlock_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
 
   PFS_rwlock *rwlock= reinterpret_cast<PFS_rwlock *> (state->m_rwlock);
-  DBUG_ASSERT(rwlock != NULL);
+  assert(rwlock != NULL);
   PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
 
   if (state->m_flags & STATE_FLAG_TIMED)
@@ -3652,15 +4118,20 @@ static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
     /* Thread safe : we are protected by the instrumented rwlock */
     rwlock->m_writer= thread;
     rwlock->m_last_written= timer_end;
-    /* Reset the readers stats, they could be off */
-    rwlock->m_readers= 0;
-    rwlock->m_last_read= 0;
+
+    if ((state->m_operation != PSI_RWLOCK_SHAREDEXCLUSIVELOCK) &&
+        (state->m_operation != PSI_RWLOCK_TRYSHAREDEXCLUSIVELOCK))
+    {
+      /* Reset the readers stats, they could be off */
+      rwlock->m_readers= 0;
+      rwlock->m_last_read= 0;
+    }
   }
 
   if (state->m_flags & STATE_FLAG_THREAD)
   {
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
     uint index= rwlock->m_class->m_event_name_index;
 
     if (state->m_flags & STATE_FLAG_TIMED)
@@ -3677,17 +4148,17 @@ static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
     if (state->m_flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 }
@@ -3696,10 +4167,10 @@ static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
   Implementation of the cond instrumentation interface.
   @sa PSI_v1::end_cond_wait.
 */
-static void end_cond_wait_v1(PSI_cond_locker* locker, int rc)
+void pfs_end_cond_wait_v1(PSI_cond_locker* locker, int rc)
 {
   PSI_cond_locker_state *state= reinterpret_cast<PSI_cond_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
@@ -3723,10 +4194,10 @@ static void end_cond_wait_v1(PSI_cond_locker* locker, int rc)
   if (state->m_flags & STATE_FLAG_THREAD)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
-    DBUG_ASSERT(thread != NULL);
+    assert(thread != NULL);
 
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
     uint index= cond->m_class->m_event_name_index;
 
     if (state->m_flags & STATE_FLAG_TIMED)
@@ -3743,17 +4214,17 @@ static void end_cond_wait_v1(PSI_cond_locker* locker, int rc)
     if (state->m_flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 }
@@ -3762,22 +4233,22 @@ static void end_cond_wait_v1(PSI_cond_locker* locker, int rc)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::end_table_io_wait.
 */
-static void end_table_io_wait_v1(PSI_table_locker* locker)
+void pfs_end_table_io_wait_v1(PSI_table_locker* locker, ulonglong numrows)
 {
   PSI_table_locker_state *state= reinterpret_cast<PSI_table_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
 
   PFS_table *table= reinterpret_cast<PFS_table *> (state->m_table);
-  DBUG_ASSERT(table != NULL);
+  assert(table != NULL);
 
   PFS_single_stat *stat;
   PFS_table_io_stat *table_io_stat;
 
-  DBUG_ASSERT((state->m_index < table->m_share->m_key_count) ||
-              (state->m_index == MAX_INDEXES));
+  assert((state->m_index < table->m_share->m_key_count) ||
+         (state->m_index == MAX_INDEXES));
 
   table_io_stat= & table->m_table_stat.m_index_stat[state->m_index];
   table_io_stat->m_has_data= true;
@@ -3797,7 +4268,7 @@ static void end_table_io_wait_v1(PSI_table_locker* locker)
     stat= & table_io_stat->m_delete;
     break;
   default:
-    DBUG_ASSERT(false);
+    assert(false);
     stat= NULL;
     break;
   }
@@ -3808,20 +4279,20 @@ static void end_table_io_wait_v1(PSI_table_locker* locker)
   {
     timer_end= state->m_timer();
     wait_time= timer_end - state->m_timer_start;
-    stat->aggregate_value(wait_time);
+    stat->aggregate_many_value(wait_time, numrows);
   }
   else
   {
-    stat->aggregate_counted();
+    stat->aggregate_counted(numrows);
   }
 
   if (flags & STATE_FLAG_THREAD)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
-    DBUG_ASSERT(thread != NULL);
+    assert(thread != NULL);
 
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
 
     /*
       Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
@@ -3829,27 +4300,28 @@ static void end_table_io_wait_v1(PSI_table_locker* locker)
     */
     if (flags & STATE_FLAG_TIMED)
     {
-      event_name_array[GLOBAL_TABLE_IO_EVENT_INDEX].aggregate_value(wait_time);
+      event_name_array[GLOBAL_TABLE_IO_EVENT_INDEX].aggregate_many_value(wait_time, numrows);
     }
     else
     {
-      event_name_array[GLOBAL_TABLE_IO_EVENT_INDEX].aggregate_counted();
+      event_name_array[GLOBAL_TABLE_IO_EVENT_INDEX].aggregate_counted(numrows);
     }
 
     if (flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      wait->m_number_of_bytes= static_cast<size_t>(numrows);
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 
@@ -3860,16 +4332,16 @@ static void end_table_io_wait_v1(PSI_table_locker* locker)
   Implementation of the table instrumentation interface.
   @sa PSI_v1::end_table_lock_wait.
 */
-static void end_table_lock_wait_v1(PSI_table_locker* locker)
+void pfs_end_table_lock_wait_v1(PSI_table_locker* locker)
 {
   PSI_table_locker_state *state= reinterpret_cast<PSI_table_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
 
   PFS_table *table= reinterpret_cast<PFS_table *> (state->m_table);
-  DBUG_ASSERT(table != NULL);
+  assert(table != NULL);
 
   PFS_single_stat *stat= & table->m_table_stat.m_lock_stat.m_stat[state->m_index];
 
@@ -3889,10 +4361,10 @@ static void end_table_lock_wait_v1(PSI_table_locker* locker)
   if (flags & STATE_FLAG_THREAD)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
-    DBUG_ASSERT(thread != NULL);
+    assert(thread != NULL);
 
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
 
     /*
       Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
@@ -3910,40 +4382,40 @@ static void end_table_lock_wait_v1(PSI_table_locker* locker)
     if (flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_end_event_id= thread->m_event_id;
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 
   table->m_has_lock_stats= true;
 }
 
-static void start_file_wait_v1(PSI_file_locker *locker,
-                               size_t count,
-                               const char *src_file,
-                               uint src_line);
+void pfs_start_file_wait_v1(PSI_file_locker *locker,
+                            size_t count,
+                            const char *src_file,
+                            uint src_line);
 
-static void end_file_wait_v1(PSI_file_locker *locker,
-                             size_t count);
+void pfs_end_file_wait_v1(PSI_file_locker *locker,
+                          size_t count);
 
 /**
   Implementation of the file instrumentation interface.
   @sa PSI_v1::start_file_open_wait.
 */
-static void start_file_open_wait_v1(PSI_file_locker *locker,
-                                    const char *src_file,
-                                    uint src_line)
+void pfs_start_file_open_wait_v1(PSI_file_locker *locker,
+                                 const char *src_file,
+                                 uint src_line)
 {
-  start_file_wait_v1(locker, 0, src_file, src_line);
+  pfs_start_file_wait_v1(locker, 0, src_file, src_line);
 
   return;
 }
@@ -3952,11 +4424,12 @@ static void start_file_open_wait_v1(PSI_file_locker *locker,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::end_file_open_wait.
 */
-static PSI_file* end_file_open_wait_v1(PSI_file_locker *locker,
-                                       void *result)
+PSI_file*
+pfs_end_file_open_wait_v1(PSI_file_locker *locker,
+                          void *result)
 {
   PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   switch (state->m_operation)
   {
@@ -3977,11 +4450,11 @@ static PSI_file* end_file_open_wait_v1(PSI_file_locker *locker,
     }
     break;
   default:
-    DBUG_ASSERT(false);
+    assert(false);
     break;
   }
 
-  end_file_wait_v1(locker, 0);
+  pfs_end_file_wait_v1(locker, 0);
 
   return state->m_file;
 }
@@ -3990,13 +4463,13 @@ static PSI_file* end_file_open_wait_v1(PSI_file_locker *locker,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::end_file_open_wait_and_bind_to_descriptor.
 */
-static void end_file_open_wait_and_bind_to_descriptor_v1
+void pfs_end_file_open_wait_and_bind_to_descriptor_v1
   (PSI_file_locker *locker, File file)
 {
   PFS_file *pfs_file= NULL;
   int index= (int) file;
   PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   if (index >= 0)
   {
@@ -4008,7 +4481,7 @@ static void end_file_open_wait_and_bind_to_descriptor_v1
     state->m_file= reinterpret_cast<PSI_file*> (pfs_file);
   }
 
-  end_file_wait_v1(locker, 0);
+  pfs_end_file_wait_v1(locker, 0);
 
   if (likely(index >= 0))
   {
@@ -4025,16 +4498,39 @@ static void end_file_open_wait_and_bind_to_descriptor_v1
 
 /**
   Implementation of the file instrumentation interface.
+  @sa PSI_v1::end_temp_file_open_wait_and_bind_to_descriptor.
+*/
+void pfs_end_temp_file_open_wait_and_bind_to_descriptor_v1
+  (PSI_file_locker *locker, File file, const char *filename)
+{
+  assert(filename != NULL);
+  PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
+  assert(state != NULL);
+
+  /* Set filename that was generated during creation of temporary file. */
+  state->m_name= filename;
+  pfs_end_file_open_wait_and_bind_to_descriptor_v1(locker, file);
+
+  PFS_file *pfs_file= reinterpret_cast<PFS_file *> (state->m_file);
+  if (pfs_file != NULL)
+  {
+    pfs_file->m_temporary= true;
+  }
+}
+
+
+/**
+  Implementation of the file instrumentation interface.
   @sa PSI_v1::start_file_wait.
 */
-static void start_file_wait_v1(PSI_file_locker *locker,
-                               size_t count,
-                               const char *src_file,
-                               uint src_line)
+void pfs_start_file_wait_v1(PSI_file_locker *locker,
+                            size_t count,
+                            const char *src_file,
+                            uint src_line)
 {
   ulonglong timer_start= 0;
   PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   uint flags= state->m_flags;
 
@@ -4047,7 +4543,7 @@ static void start_file_wait_v1(PSI_file_locker *locker,
   if (flags & STATE_FLAG_EVENT)
   {
     PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-    DBUG_ASSERT(wait != NULL);
+    assert(wait != NULL);
 
     wait->m_timer_start= timer_start;
     wait->m_source_file= src_file;
@@ -4060,11 +4556,11 @@ static void start_file_wait_v1(PSI_file_locker *locker,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::end_file_wait.
 */
-static void end_file_wait_v1(PSI_file_locker *locker,
-                             size_t byte_count)
+void pfs_end_file_wait_v1(PSI_file_locker *locker,
+                          size_t byte_count)
 {
   PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
   PFS_file *file= reinterpret_cast<PFS_file *> (state->m_file);
   PFS_file_class *klass= reinterpret_cast<PFS_file_class *> (state->m_class);
   PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
@@ -4115,7 +4611,7 @@ static void end_file_wait_v1(PSI_file_locker *locker,
       byte_stat= &file_stat->m_io_stat.m_misc;
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       byte_stat= NULL;
       break;
   }
@@ -4136,10 +4632,10 @@ static void end_file_wait_v1(PSI_file_locker *locker,
 
   if (flags & STATE_FLAG_THREAD)
   {
-    DBUG_ASSERT(thread != NULL);
+    assert(thread != NULL);
 
     PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
+    event_name_array= thread->write_instr_class_waits_stats();
     uint index= klass->m_event_name_index;
 
     if (flags & STATE_FLAG_TIMED)
@@ -4156,7 +4652,7 @@ static void end_file_wait_v1(PSI_file_locker *locker,
     if (state->m_flags & STATE_FLAG_EVENT)
     {
       PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-      DBUG_ASSERT(wait != NULL);
+      assert(wait != NULL);
 
       wait->m_timer_end= timer_end;
       wait->m_number_of_bytes= bytes;
@@ -4165,13 +4661,13 @@ static void end_file_wait_v1(PSI_file_locker *locker,
       wait->m_weak_file= file;
       wait->m_weak_version= (file ? file->get_version() : 0);
 
-      if (flag_events_waits_history)
+      if (thread->m_flag_events_waits_history)
         insert_events_waits_history(thread, wait);
-      if (flag_events_waits_history_long)
+      if (thread->m_flag_events_waits_history_long)
         insert_events_waits_history_long(wait);
       thread->m_events_waits_current--;
 
-      DBUG_ASSERT(wait == thread->m_events_waits_current);
+      assert(wait == thread->m_events_waits_current);
     }
   }
 }
@@ -4180,16 +4676,16 @@ static void end_file_wait_v1(PSI_file_locker *locker,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::start_file_close_wait.
 */
-static void start_file_close_wait_v1(PSI_file_locker *locker,
-                                     const char *src_file,
-                                     uint src_line)
+void pfs_start_file_close_wait_v1(PSI_file_locker *locker,
+                                  const char *src_file,
+                                  uint src_line)
 {
   PFS_thread *thread;
   const char *name;
   uint len;
   PFS_file *pfs_file;
   PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   switch (state->m_operation)
   {
@@ -4204,11 +4700,11 @@ static void start_file_close_wait_v1(PSI_file_locker *locker,
   case PSI_FILE_CLOSE:
     break;
   default:
-    DBUG_ASSERT(false);
+    assert(false);
     break;
   }
 
-  start_file_wait_v1(locker, 0, src_file, src_line);
+  pfs_start_file_wait_v1(locker, 0, src_file, src_line);
 
   return;
 }
@@ -4217,12 +4713,12 @@ static void start_file_close_wait_v1(PSI_file_locker *locker,
   Implementation of the file instrumentation interface.
   @sa PSI_v1::end_file_close_wait.
 */
-static void end_file_close_wait_v1(PSI_file_locker *locker, int rc)
+void pfs_end_file_close_wait_v1(PSI_file_locker *locker, int rc)
 {
   PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
-  end_file_wait_v1(locker, 0);
+  pfs_end_file_wait_v1(locker, 0);
 
   if (rc == 0)
   {
@@ -4233,6 +4729,17 @@ static void end_file_close_wait_v1(PSI_file_locker *locker, int rc)
     switch(state->m_operation)
     {
     case PSI_FILE_CLOSE:
+      if (file != NULL)
+      {
+        if (file->m_temporary)
+        {
+          assert(file->m_file_stat.m_open_count <= 1);
+          destroy_file(thread, file);
+        }
+        else
+          release_file(file);
+      }
+      break;
     case PSI_FILE_STREAM_CLOSE:
       if (file != NULL)
         release_file(file);
@@ -4242,29 +4749,57 @@ static void end_file_close_wait_v1(PSI_file_locker *locker, int rc)
         destroy_file(thread, file);
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       break;
     }
   }
   return;
 }
 
-static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line)
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::end_file_rename_wait.
+*/
+void pfs_end_file_rename_wait_v1(PSI_file_locker *locker, const char *old_name,
+                                 const char *new_name, int rc)
+{
+  PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
+  assert(state != NULL);
+  assert(state->m_operation == PSI_FILE_RENAME);
+
+  if (rc == 0)
+  {
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+
+    uint old_len= (uint)strlen(old_name);
+    uint new_len= (uint)strlen(new_name);
+
+    find_and_rename_file(thread, old_name, old_len, new_name, new_len);
+  }
+
+  pfs_end_file_wait_v1(locker, 0);
+  return;
+}
+
+PSI_stage_progress*
+pfs_start_stage_v1(PSI_stage_key key, const char *src_file, int src_line)
 {
   ulonglong timer_value= 0;
 
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
   if (unlikely(pfs_thread == NULL))
-    return;
+    return NULL;
 
   /* Always update column threads.processlist_state. */
   pfs_thread->m_stage= key;
+  /* Default value when the stage is not instrumented for progress */
+  pfs_thread->m_stage_progress= NULL;
 
   if (psi_unlikely(! flag_global_instrumentation))
-    return;
+    return NULL;
 
   if (flag_thread_instrumentation && ! pfs_thread->m_enabled)
-    return;
+    return NULL;
 
   PFS_events_stages *pfs= & pfs_thread->m_stage_current;
   PFS_events_waits *child_wait= & pfs_thread->m_events_waits_stack[0];
@@ -4274,7 +4809,7 @@ static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line
   if (old_class != NULL)
   {
     PFS_stage_stat *event_name_array;
-    event_name_array= pfs_thread->m_instr_class_stages_stats;
+    event_name_array= pfs_thread->write_instr_class_stages_stats();
     uint index= old_class->m_event_name_index;
 
     /* Finish old event */
@@ -4296,9 +4831,9 @@ static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line
     if (flag_events_stages_current)
     {
       pfs->m_end_event_id= pfs_thread->m_event_id;
-      if (flag_events_stages_history)
+      if (pfs_thread->m_flag_events_stages_history)
         insert_events_stages_history(pfs_thread, pfs);
-      if (flag_events_stages_history_long)
+      if (pfs_thread->m_flag_events_stages_history_long)
         insert_events_stages_history_long(pfs);
     }
 
@@ -4306,8 +4841,8 @@ static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line
     pfs->m_class= NULL;
 
     /* New waits will now be attached directly to the parent statement. */
-    child_wait->m_event_id= parent_statement->m_event_id;
-    child_wait->m_event_type= parent_statement->m_event_type;
+    child_wait->m_event_id= parent_statement->m_event.m_event_id;
+    child_wait->m_event_type= parent_statement->m_event.m_event_type;
     /* See below for new stages, that may overwrite this. */
   }
 
@@ -4315,10 +4850,10 @@ static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line
 
   PFS_stage_class *new_klass= find_stage_class(key);
   if (unlikely(new_klass == NULL))
-    return;
+    return NULL;
 
   if (! new_klass->m_enabled)
-    return;
+    return NULL;
 
   pfs->m_class= new_klass;
   if (new_klass->m_timed)
@@ -4337,8 +4872,7 @@ static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line
 
   if (flag_events_stages_current)
   {
-    /* m_thread_internal_id is immutable and already set */
-    DBUG_ASSERT(pfs->m_thread_internal_id == pfs_thread->m_thread_internal_id);
+    pfs->m_thread_internal_id= pfs_thread->m_thread_internal_id;
     pfs->m_event_id= pfs_thread->m_event_id++;
     pfs->m_end_event_id= 0;
     pfs->m_source_file= src_file;
@@ -4348,17 +4882,37 @@ static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line
     child_wait->m_event_id= pfs->m_event_id;
     child_wait->m_event_type= EVENT_TYPE_STAGE;
   }
+
+  if (new_klass->is_progress())
+  {
+    pfs_thread->m_stage_progress= & pfs->m_progress;
+    pfs->m_progress.m_work_completed= 0;
+    pfs->m_progress.m_work_estimated= 0;
+  }
+
+  return pfs_thread->m_stage_progress;
 }
 
-static void end_stage_v1()
+PSI_stage_progress*
+pfs_get_current_stage_progress_v1(void)
+{
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+  if (unlikely(pfs_thread == NULL))
+    return NULL;
+
+  return pfs_thread->m_stage_progress;
+}
+
+void pfs_end_stage_v1()
 {
   ulonglong timer_value= 0;
 
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
   if (unlikely(pfs_thread == NULL))
     return;
 
   pfs_thread->m_stage= 0;
+  pfs_thread->m_stage_progress= NULL;
 
   if (psi_unlikely(! flag_global_instrumentation))
     return;
@@ -4372,7 +4926,7 @@ static void end_stage_v1()
   if (old_class != NULL)
   {
     PFS_stage_stat *event_name_array;
-    event_name_array= pfs_thread->m_instr_class_stages_stats;
+    event_name_array= pfs_thread->write_instr_class_stages_stats();
     uint index= old_class->m_event_name_index;
 
     /* Finish old event */
@@ -4394,27 +4948,27 @@ static void end_stage_v1()
     if (flag_events_stages_current)
     {
       pfs->m_end_event_id= pfs_thread->m_event_id;
-      if (flag_events_stages_history)
+      if (pfs_thread->m_flag_events_stages_history)
         insert_events_stages_history(pfs_thread, pfs);
-      if (flag_events_stages_history_long)
+      if (pfs_thread->m_flag_events_stages_history_long)
         insert_events_stages_history_long(pfs);
     }
 
     /* New waits will now be attached directly to the parent statement. */
     PFS_events_waits *child_wait= & pfs_thread->m_events_waits_stack[0];
     PFS_events_statements *parent_statement= & pfs_thread->m_statement_stack[0];
-    child_wait->m_event_id= parent_statement->m_event_id;
-    child_wait->m_event_type= parent_statement->m_event_type;
+    child_wait->m_event_id= parent_statement->m_event.m_event_id;
+    child_wait->m_event_type= parent_statement->m_event.m_event_type;
 
     /* This stage is completed */
     pfs->m_class= NULL;
   }
 }
 
-static PSI_statement_locker*
-get_thread_statement_locker_v1(PSI_statement_locker_state *state,
-                               PSI_statement_key key,
-                               const void *charset)
+PSI_statement_locker*
+pfs_get_thread_statement_locker_v1(PSI_statement_locker_state *state,
+                                   PSI_statement_key key,
+                                   const void *charset, PSI_sp_share *sp_share)
 {
   DBUG_ASSERT(state != NULL);
   DBUG_ASSERT(charset != NULL);
@@ -4431,7 +4985,7 @@ get_thread_statement_locker_v1(PSI_statement_locker_state *state,
 
   if (flag_thread_instrumentation)
   {
-    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
     if (unlikely(pfs_thread == NULL))
       return NULL;
     if (! pfs_thread->m_enabled)
@@ -4448,18 +5002,20 @@ get_thread_statement_locker_v1(PSI_statement_locker_state *state,
 
       if (pfs_thread->m_events_statements_count >= statement_stack_max)
       {
+        nested_statement_lost++;
         return NULL;
       }
 
-      pfs_thread->m_stmt_lock.allocated_to_dirty();
+      pfs_dirty_state dirty_state;
+      pfs_thread->m_stmt_lock.allocated_to_dirty(& dirty_state);
       PFS_events_statements *pfs= & pfs_thread->m_statement_stack[pfs_thread->m_events_statements_count];
-      /* m_thread_internal_id is immutable and already set */
-      DBUG_ASSERT(pfs->m_thread_internal_id == pfs_thread->m_thread_internal_id);
-      pfs->m_event_id= event_id;
-      pfs->m_end_event_id= 0;
-      pfs->m_class= klass;
-      pfs->m_timer_start= 0;
-      pfs->m_timer_end= 0;
+      pfs->m_event.m_thread_internal_id= pfs_thread->m_thread_internal_id;
+      pfs->m_event.m_event_id= event_id;
+      pfs->m_event.m_event_type= EVENT_TYPE_STATEMENT;
+      pfs->m_event.m_end_event_id= 0;
+      pfs->m_event.m_class= klass;
+      pfs->m_event.m_timer_start= 0;
+      pfs->m_event.m_timer_end= 0;
       pfs->m_lock_time= 0;
       pfs->m_current_schema_name_length= 0;
       pfs->m_sqltext_length= 0;
@@ -4497,18 +5053,68 @@ get_thread_statement_locker_v1(PSI_statement_locker_state *state,
 
       /* New waits will have this statement as parent, if no stage is instrumented */
       PFS_events_waits *child_wait= & pfs_thread->m_events_waits_stack[0];
-      child_wait->m_nesting_event_id= event_id;
-      child_wait->m_nesting_event_type= EVENT_TYPE_STATEMENT;
+      child_wait->m_event_id= event_id;
+      child_wait->m_event_type= EVENT_TYPE_STATEMENT;
+
+      PFS_events_statements *parent_statement= NULL;
+      PFS_events_transactions *parent_transaction= &pfs_thread->m_transaction_current;
+      ulonglong parent_event= 0;
+      enum_event_type parent_type= EVENT_TYPE_STATEMENT;
+      uint parent_level= 0;
+
+      if (pfs_thread->m_events_statements_count > 0)
+      {
+        parent_statement= pfs - 1;
+        parent_event= parent_statement->m_event.m_event_id;
+        parent_type=  parent_statement->m_event.m_event_type;
+        parent_level= parent_statement->m_event.m_nesting_event_level + 1;
+      }
+
+      if (parent_transaction->m_state == TRANS_STATE_ACTIVE &&
+          parent_transaction->m_event_id > parent_event)
+      {
+        parent_event= parent_transaction->m_event_id;
+        parent_type=  parent_transaction->m_event_type;
+      }
+
+      pfs->m_event.m_nesting_event_id= parent_event;
+      pfs->m_event.m_nesting_event_type= parent_type;
+      pfs->m_event.m_nesting_event_level= parent_level;
+
+      /* Set parent Stored Procedure information for this statement. */
+      if(sp_share)
+      {
+        PFS_program *parent_sp= reinterpret_cast<PFS_program*>(sp_share);
+        pfs->m_sp_type= parent_sp->m_type;
+        memcpy(pfs->m_schema_name, parent_sp->m_schema_name,
+               parent_sp->m_schema_name_length);
+        pfs->m_schema_name_length= parent_sp->m_schema_name_length;
+        memcpy(pfs->m_object_name, parent_sp->m_object_name,
+               parent_sp->m_object_name_length);
+        pfs->m_object_name_length= parent_sp->m_object_name_length;
+      }
+      else
+      {
+        pfs->m_sp_type= NO_OBJECT_TYPE;
+        pfs->m_schema_name_length= 0;
+        pfs->m_object_name_length= 0;
+      }
 
       state->m_statement= pfs;
       flags|= STATE_FLAG_EVENT;
 
       pfs_thread->m_events_statements_count++;
-      pfs_thread->m_stmt_lock.dirty_to_allocated();
+      pfs_thread->m_stmt_lock.dirty_to_allocated(& dirty_state);
+    }
+    else
+    {
+      state->m_statement= NULL;
     }
   }
   else
   {
+    state->m_statement= NULL;
+
     if (klass->m_timed)
       flags= STATE_FLAG_TIMED;
     else
@@ -4542,25 +5148,27 @@ get_thread_statement_locker_v1(PSI_statement_locker_state *state,
   state->m_no_good_index_used= 0;
 
   state->m_digest= NULL;
+  state->m_cs_number= ((CHARSET_INFO *)charset)->number;
 
   state->m_schema_name_length= 0;
-  state->m_cs_number= ((CHARSET_INFO *)charset)->number;
+  state->m_parent_sp_share= sp_share;
+  state->m_parent_prepared_stmt= NULL;
 
   return reinterpret_cast<PSI_statement_locker*> (state);
 }
 
-static PSI_statement_locker*
-refine_statement_v1(PSI_statement_locker *locker,
-                    PSI_statement_key key)
+PSI_statement_locker*
+pfs_refine_statement_v1(PSI_statement_locker *locker,
+                        PSI_statement_key key)
 {
   PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
   if (state == NULL)
     return NULL;
-  DBUG_ASSERT(state->m_class != NULL);
+  assert(state->m_class != NULL);
   PFS_statement_class *klass;
   /* Only refine statements for mutable instrumentation */
   klass= reinterpret_cast<PFS_statement_class*> (state->m_class);
-  DBUG_ASSERT(klass->is_mutable());
+  assert(klass->is_mutable());
   klass= find_statement_class(key);
 
   uint flags= state->m_flags;
@@ -4571,7 +5179,7 @@ refine_statement_v1(PSI_statement_locker *locker,
     if (flags & STATE_FLAG_THREAD)
     {
       PFS_thread *pfs_thread= reinterpret_cast<PFS_thread *> (state->m_thread);
-      DBUG_ASSERT(pfs_thread != NULL);
+      assert(pfs_thread != NULL);
       if (pfs_thread->m_events_statements_count > 0)
         pfs_thread->m_events_statements_count--;
     }
@@ -4586,10 +5194,10 @@ refine_statement_v1(PSI_statement_locker *locker,
   if (flags & STATE_FLAG_EVENT)
   {
     PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
-    DBUG_ASSERT(pfs != NULL);
+    assert(pfs != NULL);
 
     /* mutate EVENTS_STATEMENTS_CURRENT.EVENT_NAME */
-    pfs->m_class= klass;
+    pfs->m_event.m_class= klass;
   }
 
   state->m_class= klass;
@@ -4597,12 +5205,12 @@ refine_statement_v1(PSI_statement_locker *locker,
   return reinterpret_cast<PSI_statement_locker*> (state);
 }
 
-static void start_statement_v1(PSI_statement_locker *locker,
-                               const char *db, uint db_len,
-                               const char *src_file, uint src_line)
+void pfs_start_statement_v1(PSI_statement_locker *locker,
+                            const char *db, uint db_len,
+                            const char *src_file, uint src_line)
 {
   PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   uint flags= state->m_flags;
   ulonglong timer_start= 0;
@@ -4614,7 +5222,7 @@ static void start_statement_v1(PSI_statement_locker *locker,
   }
 
   compile_time_assert(PSI_SCHEMA_NAME_LEN == NAME_LEN);
-  DBUG_ASSERT(db_len <= sizeof(state->m_schema_name));
+  assert(db_len <= sizeof(state->m_schema_name));
 
   if (db_len > 0)
     memcpy(state->m_schema_name, db, db_len);
@@ -4623,24 +5231,24 @@ static void start_statement_v1(PSI_statement_locker *locker,
   if (flags & STATE_FLAG_EVENT)
   {
     PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
-    DBUG_ASSERT(pfs != NULL);
+    assert(pfs != NULL);
 
-    pfs->m_timer_start= timer_start;
-    pfs->m_source_file= src_file;
-    pfs->m_source_line= src_line;
+    pfs->m_event.m_timer_start= timer_start;
+    pfs->m_event.m_source_file= src_file;
+    pfs->m_event.m_source_line= src_line;
 
-    DBUG_ASSERT(db_len <= sizeof(pfs->m_current_schema_name));
+    assert(db_len <= sizeof(pfs->m_current_schema_name));
     if (db_len > 0)
       memcpy(pfs->m_current_schema_name, db, db_len);
     pfs->m_current_schema_name_length= db_len;
   }
 }
 
-static void set_statement_text_v1(PSI_statement_locker *locker,
-                                  const char *text, uint text_len)
+void pfs_set_statement_text_v1(PSI_statement_locker *locker,
+                               const char *text, uint text_len)
 {
   PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   if (state->m_discarded)
     return;
@@ -4648,10 +5256,10 @@ static void set_statement_text_v1(PSI_statement_locker *locker,
   if (state->m_flags & STATE_FLAG_EVENT)
   {
     PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
-    DBUG_ASSERT(pfs != NULL);
-    if (text_len > sizeof (pfs->m_sqltext))
+    assert(pfs != NULL);
+    if (text_len > pfs_max_sqltext)
     {
-      text_len= sizeof(pfs->m_sqltext);
+      text_len= (uint)pfs_max_sqltext;
       pfs->m_sqltext_truncated= true;
     }
     if (text_len)
@@ -4675,7 +5283,7 @@ static void set_statement_text_v1(PSI_statement_locker *locker,
   {                                                                     \
     PFS_events_statements *pfs;                                         \
     pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement); \
-    DBUG_ASSERT(pfs != NULL);                                           \
+    assert(pfs != NULL);                                                \
     pfs->ATTR= VALUE;                                                   \
   }                                                                     \
   return;
@@ -4692,117 +5300,117 @@ static void set_statement_text_v1(PSI_statement_locker *locker,
   {                                                                     \
     PFS_events_statements *pfs;                                         \
     pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement); \
-    DBUG_ASSERT(pfs != NULL);                                           \
+    assert(pfs != NULL);                                                \
     pfs->ATTR+= VALUE;                                                  \
   }                                                                     \
   return;
 
-static void set_statement_lock_time_v1(PSI_statement_locker *locker,
-                                       ulonglong count)
+void pfs_set_statement_lock_time_v1(PSI_statement_locker *locker,
+                                    ulonglong count)
 {
   SET_STATEMENT_ATTR_BODY(locker, m_lock_time, count);
 }
 
-static void set_statement_rows_sent_v1(PSI_statement_locker *locker,
-                                       ulonglong count)
+void pfs_set_statement_rows_sent_v1(PSI_statement_locker *locker,
+                                    ulonglong count)
 {
   SET_STATEMENT_ATTR_BODY(locker, m_rows_sent, count);
 }
 
-static void set_statement_rows_examined_v1(PSI_statement_locker *locker,
-                                           ulonglong count)
+void pfs_set_statement_rows_examined_v1(PSI_statement_locker *locker,
+                                        ulonglong count)
 {
   SET_STATEMENT_ATTR_BODY(locker, m_rows_examined, count);
 }
 
-static void inc_statement_created_tmp_disk_tables_v1(PSI_statement_locker *locker,
-                                                    ulong count)
+void pfs_inc_statement_created_tmp_disk_tables_v1(PSI_statement_locker *locker,
+                                                  ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_created_tmp_disk_tables, count);
 }
 
-static void inc_statement_created_tmp_tables_v1(PSI_statement_locker *locker,
-                                                ulong count)
+void pfs_inc_statement_created_tmp_tables_v1(PSI_statement_locker *locker,
+                                             ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_created_tmp_tables, count);
 }
 
-static void inc_statement_select_full_join_v1(PSI_statement_locker *locker,
-                                              ulong count)
+void pfs_inc_statement_select_full_join_v1(PSI_statement_locker *locker,
+                                           ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_select_full_join, count);
 }
 
-static void inc_statement_select_full_range_join_v1(PSI_statement_locker *locker,
-                                                    ulong count)
+void pfs_inc_statement_select_full_range_join_v1(PSI_statement_locker *locker,
+                                                 ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_select_full_range_join, count);
 }
 
-static void inc_statement_select_range_v1(PSI_statement_locker *locker,
-                                          ulong count)
+void pfs_inc_statement_select_range_v1(PSI_statement_locker *locker,
+                                       ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_select_range, count);
 }
 
-static void inc_statement_select_range_check_v1(PSI_statement_locker *locker,
-                                                ulong count)
+void pfs_inc_statement_select_range_check_v1(PSI_statement_locker *locker,
+                                             ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_select_range_check, count);
 }
 
-static void inc_statement_select_scan_v1(PSI_statement_locker *locker,
-                                         ulong count)
+void pfs_inc_statement_select_scan_v1(PSI_statement_locker *locker,
+                                      ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_select_scan, count);
 }
 
-static void inc_statement_sort_merge_passes_v1(PSI_statement_locker *locker,
-                                               ulong count)
+void pfs_inc_statement_sort_merge_passes_v1(PSI_statement_locker *locker,
+                                            ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_sort_merge_passes, count);
 }
 
-static void inc_statement_sort_range_v1(PSI_statement_locker *locker,
-                                        ulong count)
+void pfs_inc_statement_sort_range_v1(PSI_statement_locker *locker,
+                                     ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_sort_range, count);
 }
 
-static void inc_statement_sort_rows_v1(PSI_statement_locker *locker,
-                                       ulong count)
+void pfs_inc_statement_sort_rows_v1(PSI_statement_locker *locker,
+                                    ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_sort_rows, count);
 }
 
-static void inc_statement_sort_scan_v1(PSI_statement_locker *locker,
-                                       ulong count)
+void pfs_inc_statement_sort_scan_v1(PSI_statement_locker *locker,
+                                    ulong count)
 {
   INC_STATEMENT_ATTR_BODY(locker, m_sort_scan, count);
 }
 
-static void set_statement_no_index_used_v1(PSI_statement_locker *locker)
+void pfs_set_statement_no_index_used_v1(PSI_statement_locker *locker)
 {
   SET_STATEMENT_ATTR_BODY(locker, m_no_index_used, 1);
 }
 
-static void set_statement_no_good_index_used_v1(PSI_statement_locker *locker)
+void pfs_set_statement_no_good_index_used_v1(PSI_statement_locker *locker)
 {
   SET_STATEMENT_ATTR_BODY(locker, m_no_good_index_used, 1);
 }
 
-static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
+void pfs_end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
 {
   PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
   Diagnostics_area *da= reinterpret_cast<Diagnostics_area*> (stmt_da);
-  DBUG_ASSERT(state != NULL);
-  DBUG_ASSERT(da != NULL);
+  assert(state != NULL);
+  assert(da != NULL);
 
   if (state->m_discarded)
     return;
 
   PFS_statement_class *klass= reinterpret_cast<PFS_statement_class *> (state->m_class);
-  DBUG_ASSERT(klass != NULL);
+  assert(klass != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
@@ -4823,12 +5431,14 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
   */
   const sql_digest_storage *digest_storage= NULL;
   PFS_statement_stat *digest_stat= NULL;
+  PFS_program *pfs_program= NULL;
+  PFS_prepared_stmt *pfs_prepared_stmt= NULL;
 
   if (flags & STATE_FLAG_THREAD)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
-    DBUG_ASSERT(thread != NULL);
-    event_name_array= thread->m_instr_class_statements_stats;
+    assert(thread != NULL);
+    event_name_array= thread->write_instr_class_statements_stats();
     /* Aggregate to EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME */
     stat= & event_name_array[index];
 
@@ -4848,17 +5458,20 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
     if (flags & STATE_FLAG_EVENT)
     {
       PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
-      DBUG_ASSERT(pfs != NULL);
+      assert(pfs != NULL);
 
-      thread->m_stmt_lock.allocated_to_dirty();
+      pfs_dirty_state dirty_state;
+      thread->m_stmt_lock.allocated_to_dirty(& dirty_state);
 
       switch(da->status())
       {
         case Diagnostics_area::DA_OK_BULK:
+        case Diagnostics_area::DA_EOF_BULK:
         case Diagnostics_area::DA_EMPTY:
           break;
         case Diagnostics_area::DA_OK:
-          memcpy(pfs->m_message_text, da->message(), MYSQL_ERRMSG_SIZE);
+          memcpy(pfs->m_message_text, da->message(),
+                 MYSQL_ERRMSG_SIZE);
           pfs->m_message_text[MYSQL_ERRMSG_SIZE]= 0;
           pfs->m_rows_affected= da->affected_rows();
           pfs->m_warning_count= da->statement_warn_count();
@@ -4868,18 +5481,19 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
           pfs->m_warning_count= da->statement_warn_count();
           break;
         case Diagnostics_area::DA_ERROR:
-          memcpy(pfs->m_message_text, da->message(), MYSQL_ERRMSG_SIZE);
+          memcpy(pfs->m_message_text, da->message(),
+                 MYSQL_ERRMSG_SIZE);
           pfs->m_message_text[MYSQL_ERRMSG_SIZE]= 0;
           pfs->m_sql_errno= da->sql_errno();
-          pfs->m_error_count++;
           memcpy(pfs->m_sqlstate, da->get_sqlstate(), SQLSTATE_LENGTH);
+          pfs->m_error_count++;
           break;
         case Diagnostics_area::DA_DISABLED:
           break;
       }
 
-      pfs->m_timer_end= timer_end;
-      pfs->m_end_event_id= thread->m_event_id;
+      pfs->m_event.m_timer_end= timer_end;
+      pfs->m_event.m_end_event_id= thread->m_event_id;
 
       if (digest_storage != NULL)
       {
@@ -4892,21 +5506,24 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
         pfs->m_digest_storage.copy(digest_storage);
       }
 
-      if (flag_events_statements_history)
+      pfs_program= reinterpret_cast<PFS_program*>(state->m_parent_sp_share);
+      pfs_prepared_stmt= reinterpret_cast<PFS_prepared_stmt*>(state->m_parent_prepared_stmt);
+
+      if (thread->m_flag_events_statements_history)
         insert_events_statements_history(thread, pfs);
-      if (flag_events_statements_history_long)
+      if (thread->m_flag_events_statements_history_long)
         insert_events_statements_history_long(pfs);
 
-      DBUG_ASSERT(thread->m_events_statements_count > 0);
+      assert(thread->m_events_statements_count > 0);
       thread->m_events_statements_count--;
-      thread->m_stmt_lock.dirty_to_allocated();
+      thread->m_stmt_lock.dirty_to_allocated(& dirty_state);
     }
   }
   else
   {
     if (flags & STATE_FLAG_DIGEST)
     {
-      PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+      PFS_thread *thread= my_thread_get_THR_PFS();
 
       /* An instrumented thread is required, for LF_PINS. */
       if (thread != NULL)
@@ -4929,6 +5546,8 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
     stat= & event_name_array[index];
   }
 
+  stat->mark_used();
+
   if (flags & STATE_FLAG_TIMED)
   {
     /* Aggregate to EVENTS_STATEMENTS_SUMMARY_..._BY_EVENT_NAME (timed) */
@@ -4959,6 +5578,8 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
 
   if (digest_stat != NULL)
   {
+    digest_stat->mark_used();
+
     if (flags & STATE_FLAG_TIMED)
     {
       digest_stat->aggregate_value(wait_time);
@@ -4986,9 +5607,107 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
     digest_stat->m_no_good_index_used+= state->m_no_good_index_used;
   }
 
+  if(pfs_program != NULL)
+  {
+    PFS_statement_stat *sub_stmt_stat= NULL;
+    sub_stmt_stat= &pfs_program->m_stmt_stat;
+    if(sub_stmt_stat != NULL)
+    {
+      sub_stmt_stat->mark_used();
+
+      if (flags & STATE_FLAG_TIMED)
+      {
+        sub_stmt_stat->aggregate_value(wait_time);
+      }
+      else
+      {
+        sub_stmt_stat->aggregate_counted();
+      }
+
+      sub_stmt_stat->m_lock_time+= state->m_lock_time;
+      sub_stmt_stat->m_rows_sent+= state->m_rows_sent;
+      sub_stmt_stat->m_rows_examined+= state->m_rows_examined;
+      sub_stmt_stat->m_created_tmp_disk_tables+= state->m_created_tmp_disk_tables;
+      sub_stmt_stat->m_created_tmp_tables+= state->m_created_tmp_tables;
+      sub_stmt_stat->m_select_full_join+= state->m_select_full_join;
+      sub_stmt_stat->m_select_full_range_join+= state->m_select_full_range_join;
+      sub_stmt_stat->m_select_range+= state->m_select_range;
+      sub_stmt_stat->m_select_range_check+= state->m_select_range_check;
+      sub_stmt_stat->m_select_scan+= state->m_select_scan;
+      sub_stmt_stat->m_sort_merge_passes+= state->m_sort_merge_passes;
+      sub_stmt_stat->m_sort_range+= state->m_sort_range;
+      sub_stmt_stat->m_sort_rows+= state->m_sort_rows;
+      sub_stmt_stat->m_sort_scan+= state->m_sort_scan;
+      sub_stmt_stat->m_no_index_used+= state->m_no_index_used;
+      sub_stmt_stat->m_no_good_index_used+= state->m_no_good_index_used;
+    }
+  }
+
+  if (pfs_prepared_stmt != NULL)
+  {
+    if(state->m_in_prepare)
+    {
+      PFS_single_stat *prepared_stmt_stat= NULL;
+      prepared_stmt_stat= &pfs_prepared_stmt->m_prepare_stat;
+      if(prepared_stmt_stat != NULL)
+      {
+        if (flags & STATE_FLAG_TIMED)
+        {
+          prepared_stmt_stat->aggregate_value(wait_time);
+        }
+        else
+        {
+          prepared_stmt_stat->aggregate_counted();
+        }
+      }
+    }
+    else
+    {
+      PFS_statement_stat *prepared_stmt_stat= NULL;
+      prepared_stmt_stat= &pfs_prepared_stmt->m_execute_stat;
+      if(prepared_stmt_stat != NULL)
+      {
+        if (flags & STATE_FLAG_TIMED)
+        {
+          prepared_stmt_stat->aggregate_value(wait_time);
+        }
+        else
+        {
+          prepared_stmt_stat->aggregate_counted();
+        }
+
+        prepared_stmt_stat->m_lock_time+= state->m_lock_time;
+        prepared_stmt_stat->m_rows_sent+= state->m_rows_sent;
+        prepared_stmt_stat->m_rows_examined+= state->m_rows_examined;
+        prepared_stmt_stat->m_created_tmp_disk_tables+= state->m_created_tmp_disk_tables;
+        prepared_stmt_stat->m_created_tmp_tables+= state->m_created_tmp_tables;
+        prepared_stmt_stat->m_select_full_join+= state->m_select_full_join;
+        prepared_stmt_stat->m_select_full_range_join+= state->m_select_full_range_join;
+        prepared_stmt_stat->m_select_range+= state->m_select_range;
+        prepared_stmt_stat->m_select_range_check+= state->m_select_range_check;
+        prepared_stmt_stat->m_select_scan+= state->m_select_scan;
+        prepared_stmt_stat->m_sort_merge_passes+= state->m_sort_merge_passes;
+        prepared_stmt_stat->m_sort_range+= state->m_sort_range;
+        prepared_stmt_stat->m_sort_rows+= state->m_sort_rows;
+        prepared_stmt_stat->m_sort_scan+= state->m_sort_scan;
+        prepared_stmt_stat->m_no_index_used+= state->m_no_index_used;
+        prepared_stmt_stat->m_no_good_index_used+= state->m_no_good_index_used;
+      }
+    }
+  }
+
+  PFS_statement_stat *sub_stmt_stat= NULL;
+  if (pfs_program != NULL)
+    sub_stmt_stat= &pfs_program->m_stmt_stat;
+
+  PFS_statement_stat *prepared_stmt_stat= NULL;
+  if (pfs_prepared_stmt != NULL && !state->m_in_prepare)
+    prepared_stmt_stat= &pfs_prepared_stmt->m_execute_stat;
+
   switch (da->status())
   {
     case Diagnostics_area::DA_OK_BULK:
+    case Diagnostics_area::DA_EOF_BULK:
     case Diagnostics_area::DA_EMPTY:
       break;
     case Diagnostics_area::DA_OK:
@@ -4999,6 +5718,16 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
         digest_stat->m_rows_affected+= da->affected_rows();
         digest_stat->m_warning_count+= da->statement_warn_count();
       }
+      if(sub_stmt_stat != NULL)
+      {
+        sub_stmt_stat->m_rows_affected+= da->affected_rows();
+        sub_stmt_stat->m_warning_count+= da->statement_warn_count();
+      }
+      if (prepared_stmt_stat != NULL)
+      {
+        prepared_stmt_stat->m_rows_affected+= da->affected_rows();
+        prepared_stmt_stat->m_warning_count+= da->statement_warn_count();
+      }
       break;
     case Diagnostics_area::DA_EOF:
       stat->m_warning_count+= da->statement_warn_count();
@@ -5006,6 +5735,14 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
       {
         digest_stat->m_warning_count+= da->statement_warn_count();
       }
+      if(sub_stmt_stat != NULL)
+      {
+        sub_stmt_stat->m_warning_count+= da->statement_warn_count();
+      }
+      if (prepared_stmt_stat != NULL)
+      {
+        prepared_stmt_stat->m_warning_count+= da->statement_warn_count();
+      }
       break;
     case Diagnostics_area::DA_ERROR:
       stat->m_error_count++;
@@ -5013,23 +5750,482 @@ static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
       {
         digest_stat->m_error_count++;
       }
+      if (sub_stmt_stat != NULL)
+      {
+        sub_stmt_stat->m_error_count++;
+      }
+      if (prepared_stmt_stat != NULL)
+      {
+        prepared_stmt_stat->m_error_count++;
+      }
       break;
     case Diagnostics_area::DA_DISABLED:
       break;
   }
 }
 
+static inline enum_object_type sp_type_to_object_type(uint sp_type)
+{
+  enum enum_sp_type value= static_cast<enum enum_sp_type> (sp_type);
+  /* Map the stored program type (an enum_sp_type passed as uint) to the matching enum_object_type. */
+  switch (value)
+  {
+    case SP_TYPE_FUNCTION:
+      return OBJECT_TYPE_FUNCTION;
+    case SP_TYPE_PROCEDURE:
+      return OBJECT_TYPE_PROCEDURE;
+    case SP_TYPE_PACKAGE:
+      return OBJECT_TYPE_PACKAGE;
+    case SP_TYPE_PACKAGE_BODY:
+      return OBJECT_TYPE_PACKAGE_BODY;
+    case SP_TYPE_TRIGGER:
+      return OBJECT_TYPE_TRIGGER;
+    case SP_TYPE_EVENT:
+      return OBJECT_TYPE_EVENT;
+    default:
+      assert(false);
+      /* Only reached when assert() is compiled out: unknown types map to NO_OBJECT_TYPE. */
+      return NO_OBJECT_TYPE;
+  }
+}
+
+/**
+  Implementation of the stored program instrumentation interface: find or create the PFS_program share of a stored program; returns NULL when the calling thread has no PFS_thread.
+  @sa PSI_v1::get_sp_share.
+*/
+PSI_sp_share *pfs_get_sp_share_v1(uint sp_type,
+                                  const char* schema_name,
+                                  uint schema_name_length,
+                                  const char* object_name,
+                                  uint object_name_length)
+{
+  /* The lookup below needs the PFS_thread of the calling thread. */
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+  if (unlikely(pfs_thread == NULL))
+    return NULL;
+  /* Clamp the name lengths to COL_OBJECT_NAME_SIZE / COL_OBJECT_SCHEMA_SIZE before the lookup. */
+  if (object_name_length > COL_OBJECT_NAME_SIZE)
+    object_name_length= COL_OBJECT_NAME_SIZE;
+  if (schema_name_length > COL_OBJECT_SCHEMA_SIZE)
+    schema_name_length= COL_OBJECT_SCHEMA_SIZE;
+
+  PFS_program *pfs_program;
+  pfs_program= find_or_create_program(pfs_thread,
+                                      sp_type_to_object_type(sp_type),
+                                      object_name,
+                                      object_name_length,
+                                      schema_name,
+                                      schema_name_length);
+
+  return reinterpret_cast<PSI_sp_share *>(pfs_program); /* returned as an opaque PSI_sp_share handle */
+}
+
+void pfs_release_sp_share_v1(PSI_sp_share* sp_share)
+{
+  /* Unused: this implementation is a no-op and never reads sp_share. */
+  return;
+}
+/** Start instrumenting a stored program call; returns NULL when global, thread, or program instrumentation is off, otherwise state cast to a locker. */
+PSI_sp_locker* pfs_start_sp_v1(PSI_sp_locker_state *state,
+                               PSI_sp_share *sp_share)
+{
+  assert(state != NULL);
+  if (! flag_global_instrumentation)
+    return NULL;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+  }
+
+  /*
+    sp_share may be NULL when the stat array is full and no new
+    stored program stats are being inserted into it.
+  */
+  PFS_program *pfs_program= reinterpret_cast<PFS_program*>(sp_share);
+  if (pfs_program == NULL || !pfs_program->m_enabled)
+    return NULL;
+
+  state->m_flags= 0;
+
+  if(pfs_program->m_timed)
+  {
+    state->m_flags|= STATE_FLAG_TIMED;
+    state->m_timer_start= get_timer_raw_value_and_function(statement_timer,
+                                                  & state->m_timer); /* presumably also stores the timer function in m_timer; pfs_end_sp_v1 calls it */
+  }
+
+  state->m_sp_share= sp_share;
+
+  return reinterpret_cast<PSI_sp_locker*> (state);
+}
+/** End a stored program call: aggregate into the program's m_sp_stat, with the elapsed time when STATE_FLAG_TIMED is set, as a plain count otherwise. */
+void pfs_end_sp_v1(PSI_sp_locker *locker)
+{
+  PSI_sp_locker_state *state= reinterpret_cast<PSI_sp_locker_state*> (locker);
+  assert(state != NULL);
+
+  ulonglong timer_end; /* only assigned and read in the timed branch */
+  ulonglong wait_time; /* only assigned and read in the timed branch */
+
+  PFS_program *pfs_program= reinterpret_cast<PFS_program *>(state->m_sp_share); /* not NULL-checked here; pfs_start_sp_v1 returns no locker for a NULL share */
+  PFS_sp_stat *stat= &pfs_program->m_sp_stat;
+
+  if (state->m_flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+
+    /* Now use this timer_end and wait_time for timing information. */
+    stat->aggregate_value(wait_time);
+  }
+  else
+  {
+    stat->aggregate_counted();
+  }
+}
+/** Drop the instrumentation of a stored program via drop_program(); does nothing when the calling thread has no PFS_thread. */
+void pfs_drop_sp_v1(uint sp_type,
+                    const char* schema_name,
+                    uint schema_name_length,
+                    const char* object_name,
+                    uint object_name_length)
+{
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+  if (unlikely(pfs_thread == NULL))
+    return;
+
+  if (object_name_length > COL_OBJECT_NAME_SIZE) /* clamp both lengths, as pfs_get_sp_share_v1 does */
+    object_name_length= COL_OBJECT_NAME_SIZE;
+  if (schema_name_length > COL_OBJECT_SCHEMA_SIZE)
+    schema_name_length= COL_OBJECT_SCHEMA_SIZE;
+
+  drop_program(pfs_thread,
+               sp_type_to_object_type(sp_type),
+               object_name, object_name_length,
+               schema_name, schema_name_length);
+}
+/** Prepare a transaction locker in state (and, when flag_events_transactions_current is set, the thread's current transaction event); returns NULL when global, transaction-class, or thread instrumentation is off. */
+PSI_transaction_locker*
+pfs_get_thread_transaction_locker_v1(PSI_transaction_locker_state *state,
+                                     const void *xid,
+                                     ulonglong trxid,
+                                     int isolation_level,
+                                     my_bool read_only,
+                                     my_bool autocommit)
+{
+  assert(state != NULL);
+
+  if (!flag_global_instrumentation)
+    return NULL;
+
+  if (!global_transaction_class.m_enabled)
+    return NULL;
+
+  uint flags; /* assigned on every path that reaches state->m_flags below */
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (!pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (global_transaction_class.m_timed)
+      flags|= STATE_FLAG_TIMED;
+
+    if (flag_events_transactions_current)
+    {
+      ulonglong event_id= pfs_thread->m_event_id++; /* post-increment: this event takes the current id */
+
+      PFS_events_transactions *pfs= &pfs_thread->m_transaction_current;
+      pfs->m_thread_internal_id = pfs_thread->m_thread_internal_id;
+      pfs->m_event_id= event_id;
+      pfs->m_event_type= EVENT_TYPE_TRANSACTION;
+      pfs->m_end_event_id= 0;
+      pfs->m_class= &global_transaction_class;
+      pfs->m_timer_start= 0;
+      pfs->m_timer_end= 0;
+      if (xid != NULL) /* m_xid is left unchanged when no xid is supplied */
+        pfs->m_xid= *(PSI_xid *)xid;
+      pfs->m_xa= false;
+      pfs->m_xa_state= TRANS_STATE_XA_NOTR;
+      pfs->m_trxid= trxid;
+      pfs->m_isolation_level= (enum_isolation_level)isolation_level;
+      pfs->m_read_only= read_only;
+      pfs->m_autocommit= autocommit;
+      pfs->m_savepoint_count= 0;
+      pfs->m_rollback_to_savepoint_count= 0;
+      pfs->m_release_savepoint_count= 0;
+
+      uint statements_count= pfs_thread->m_events_statements_count;
+      if (statements_count > 0)
+      {
+        PFS_events_statements *pfs_statement=
+          &pfs_thread->m_statement_stack[statements_count - 1]; /* last entry of the thread's statement stack */
+        pfs->m_nesting_event_id= pfs_statement->m_event.m_event_id;
+        pfs->m_nesting_event_type= pfs_statement->m_event.m_event_type;
+      }
+      else
+      {
+        pfs->m_nesting_event_id= 0;
+        /* pfs->m_nesting_event_type not used when m_nesting_event_id is 0 */
+      }
+
+      state->m_transaction= pfs;
+      flags|= STATE_FLAG_EVENT;
+    }
+  }
+  else
+  {
+    if (global_transaction_class.m_timed) /* no thread instrumentation: only the TIMED flag can be set */
+      flags= STATE_FLAG_TIMED;
+    else
+      flags= 0;
+  }
+
+  state->m_class= &global_transaction_class;
+  state->m_flags= flags;
+  state->m_autocommit= autocommit;
+  state->m_read_only= read_only;
+  state->m_savepoint_count= 0;
+  state->m_rollback_to_savepoint_count= 0;
+  state->m_release_savepoint_count= 0;
+
+  return reinterpret_cast<PSI_transaction_locker*> (state);
+}
+/** Start a transaction: capture the timer start when STATE_FLAG_TIMED is set and, when STATE_FLAG_EVENT is set, record start time, source location and ACTIVE state on the event. */
+void pfs_start_transaction_v1(PSI_transaction_locker *locker,
+                              const char *src_file, uint src_line)
+{
+  PSI_transaction_locker_state *state= reinterpret_cast<PSI_transaction_locker_state*> (locker);
+  assert(state != NULL);
+
+  uint flags= state->m_flags;
+  ulonglong timer_start= 0; /* stays 0 when the transaction is not timed */
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_start= get_timer_raw_value_and_function(transaction_timer, &state->m_timer);
+    state->m_timer_start= timer_start;
+  }
+
+  if (flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_transactions *pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction);
+    assert(pfs != NULL);
+
+    pfs->m_timer_start= timer_start;
+    pfs->m_source_file= src_file;
+    pfs->m_source_line= src_line;
+    pfs->m_state= TRANS_STATE_ACTIVE;
+    //pfs->m_sid.clear();
+    bzero(&pfs->m_gtid_spec, sizeof(pfs->m_gtid_spec)); /* zero-fill the event's GTID specification */
+  }
+}
+/** Copy the GTID specification onto the current transaction event when STATE_FLAG_EVENT is set; sid is only asserted non-NULL because its copy is commented out. */
+void pfs_set_transaction_gtid_v1(PSI_transaction_locker *locker,
+                                 const void *sid,
+                                 const void *gtid_spec)
+{
+  PSI_transaction_locker_state *state= reinterpret_cast<PSI_transaction_locker_state*> (locker);
+  assert(state != NULL);
+  assert(sid != NULL);
+  assert(gtid_spec != NULL);
+
+  if (state->m_flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_transactions *pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction);
+    assert(pfs != NULL); /* plain assert, matching the other transaction setters in this file */
+    //pfs->m_sid= *(rpl_sid *)sid;
+    pfs->m_gtid_spec= *(Gtid_specification*)gtid_spec;
+  }
+}
+/** Record the XID and XA state on the current transaction event and mark it as XA; does nothing unless STATE_FLAG_EVENT is set. */
+void pfs_set_transaction_xid_v1(PSI_transaction_locker *locker,
+                                const void *xid,
+                                int xa_state)
+{
+  PSI_transaction_locker_state *state= reinterpret_cast<PSI_transaction_locker_state*> (locker);
+  assert(state != NULL);
+
+  if (state->m_flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_transactions *pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction);
+    assert(pfs != NULL);
+    assert(xid != NULL); /* xid is only required (and only read) when an event is attached */
+
+    pfs->m_xid= *(PSI_xid *)xid;
+    pfs->m_xa_state= (enum_xa_transaction_state)xa_state;
+    pfs->m_xa= true;
+  }
+  return;
+}
+/** Update the XA state of the current transaction event and mark it as XA; does nothing unless STATE_FLAG_EVENT is set. */
+void pfs_set_transaction_xa_state_v1(PSI_transaction_locker *locker,
+                                     int xa_state)
+{
+  PSI_transaction_locker_state *state= reinterpret_cast<PSI_transaction_locker_state*> (locker);
+  assert(state != NULL);
+
+  if (state->m_flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_transactions *pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction);
+    assert(pfs != NULL);
+
+    pfs->m_xa_state= (enum_xa_transaction_state)xa_state; /* xa_state arrives as int and is cast to the enum */
+    pfs->m_xa= true;
+  }
+  return;
+}
+/** Set the transaction id of the current transaction event, but only if the event does not have one yet (m_trxid == 0). */
+void pfs_set_transaction_trxid_v1(PSI_transaction_locker *locker,
+                                  const ulonglong *trxid)
+{
+  assert(trxid != NULL);
+
+  PSI_transaction_locker_state *state= reinterpret_cast<PSI_transaction_locker_state*> (locker);
+  assert(state != NULL);
+
+  if (state->m_flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_transactions *pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction);
+    assert(pfs != NULL);
+
+    if (pfs->m_trxid == 0) /* an id that is already set is never overwritten */
+      pfs->m_trxid= *trxid;
+  }
+}
+/** Function body shared by the pfs_inc_transaction_*_v1 functions: adds VALUE to state->ATTR and, when STATE_FLAG_EVENT is set, to the same ATTR of the transaction event; a NULL LOCKER is ignored. */
+#define INC_TRANSACTION_ATTR_BODY(LOCKER, ATTR, VALUE)                  \
+  PSI_transaction_locker_state *state;                                  \
+  state= reinterpret_cast<PSI_transaction_locker_state*> (LOCKER);      \
+  if (unlikely(state == NULL))                                          \
+    return;                                                             \
+  state->ATTR+= VALUE;                                                  \
+  if (state->m_flags & STATE_FLAG_EVENT)                                \
+  {                                                                     \
+    PFS_events_transactions *pfs;                                       \
+    pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction); \
+    assert(pfs != NULL);                                                \
+    pfs->ATTR+= VALUE;                                                  \
+  }                                                                     \
+  return;
+
+/** Add count to m_savepoint_count of the locker state and, when an event is attached, of the transaction event. */
+void pfs_inc_transaction_savepoints_v1(PSI_transaction_locker *locker,
+                                       ulong count)
+{
+  INC_TRANSACTION_ATTR_BODY(locker, m_savepoint_count, count);
+}
+/** Add count to m_rollback_to_savepoint_count of the locker state and, when an event is attached, of the transaction event. */
+void pfs_inc_transaction_rollback_to_savepoint_v1(PSI_transaction_locker *locker,
+                                                  ulong count)
+{
+  INC_TRANSACTION_ATTR_BODY(locker, m_rollback_to_savepoint_count, count);
+}
+/** Add count to m_release_savepoint_count of the locker state and, when an event is attached, of the transaction event. */
+void pfs_inc_transaction_release_savepoint_v1(PSI_transaction_locker *locker,
+                                              ulong count)
+{
+  INC_TRANSACTION_ATTR_BODY(locker, m_release_savepoint_count, count);
+}
+/** End a transaction: finalize the attached transaction event (if any), then aggregate timing and savepoint counters into the per-thread or the global transaction stat. */
+void pfs_end_transaction_v1(PSI_transaction_locker *locker, my_bool commit)
+{
+  PSI_transaction_locker_state *state= reinterpret_cast<PSI_transaction_locker_state*> (locker);
+  assert(state != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+  uint flags= state->m_flags;
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+  }
+
+  PFS_transaction_stat *stat; /* set in both branches of the STATE_FLAG_THREAD test below */
+
+  if (flags & STATE_FLAG_THREAD)
+  {
+    PFS_thread *pfs_thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    assert(pfs_thread != NULL);
+
+    /* Aggregate to EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME */
+    stat= &pfs_thread->write_instr_class_transactions_stats()[GLOBAL_TRANSACTION_INDEX];
+
+    if (flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_transactions *pfs= reinterpret_cast<PFS_events_transactions*> (state->m_transaction);
+      assert(pfs != NULL);
+
+      /* events_transactions_current may have been cleared while the transaction was active */
+      if (unlikely(pfs->m_class == NULL))
+        return; /* note: this also skips the stat aggregation at the end of the function */
+
+      pfs->m_timer_end= timer_end;
+      pfs->m_end_event_id= pfs_thread->m_event_id;
+
+      pfs->m_state= (commit ? TRANS_STATE_COMMITTED : TRANS_STATE_ROLLED_BACK);
+
+      if (pfs->m_xa)
+          pfs->m_xa_state= (commit ? TRANS_STATE_XA_COMMITTED : TRANS_STATE_XA_ROLLBACK_ONLY);
+
+      if (pfs_thread->m_flag_events_transactions_history)
+        insert_events_transactions_history(pfs_thread, pfs);
+      if (pfs_thread->m_flag_events_transactions_history_long)
+        insert_events_transactions_history_long(pfs);
+    }
+  }
+  else
+  {
+    /* Aggregate to EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME */
+    stat= &global_transaction_stat;
+  }
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    /* Aggregate to EVENTS_TRANSACTIONS_SUMMARY_..._BY_EVENT_NAME (timed) */
+    if(state->m_read_only)
+      stat->m_read_only_stat.aggregate_value(wait_time);
+    else
+      stat->m_read_write_stat.aggregate_value(wait_time);
+  }
+  else
+  {
+    /* Aggregate to EVENTS_TRANSACTIONS_SUMMARY_..._BY_EVENT_NAME (counted) */
+    if(state->m_read_only)
+      stat->m_read_only_stat.aggregate_counted();
+    else
+      stat->m_read_write_stat.aggregate_counted();
+  }
+
+  stat->m_savepoint_count+= state->m_savepoint_count; /* savepoint counters come from the locker state */
+  stat->m_rollback_to_savepoint_count+= state->m_rollback_to_savepoint_count;
+  stat->m_release_savepoint_count+= state->m_release_savepoint_count;
+}
+
+
 /**
   Implementation of the socket instrumentation interface.
   @sa PSI_v1::end_socket_wait.
 */
-static void end_socket_wait_v1(PSI_socket_locker *locker, size_t byte_count)
+void pfs_end_socket_wait_v1(PSI_socket_locker *locker, size_t byte_count)
 {
   PSI_socket_locker_state *state= reinterpret_cast<PSI_socket_locker_state*> (locker);
-  DBUG_ASSERT(state != NULL);
+  assert(state != NULL);
 
   PFS_socket *socket= reinterpret_cast<PFS_socket *>(state->m_socket);
-  DBUG_ASSERT(socket != NULL);
+  assert(socket != NULL);
 
   ulonglong timer_end= 0;
   ulonglong wait_time= 0;
@@ -5064,7 +6260,7 @@ static void end_socket_wait_v1(PSI_socket_locker *locker, size_t byte_count)
       byte_stat= &socket->m_socket_stat.m_io_stat.m_misc;
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       byte_stat= NULL;
       break;
   }
@@ -5088,44 +6284,44 @@ static void end_socket_wait_v1(PSI_socket_locker *locker, size_t byte_count)
   if (flags & STATE_FLAG_EVENT)
   {
     PFS_thread *thread= reinterpret_cast<PFS_thread *>(state->m_thread);
-    DBUG_ASSERT(thread != NULL);
+    assert(thread != NULL);
     PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
-    DBUG_ASSERT(wait != NULL);
+    assert(wait != NULL);
 
     wait->m_timer_end= timer_end;
     wait->m_end_event_id= thread->m_event_id;
     wait->m_number_of_bytes= bytes;
 
-    if (flag_events_waits_history)
+    if (thread->m_flag_events_waits_history)
       insert_events_waits_history(thread, wait);
-    if (flag_events_waits_history_long)
+    if (thread->m_flag_events_waits_history_long)
       insert_events_waits_history_long(wait);
     thread->m_events_waits_current--;
 
-    DBUG_ASSERT(wait == thread->m_events_waits_current);
+    assert(wait == thread->m_events_waits_current);
   }
 }
 
-static void set_socket_state_v1(PSI_socket *socket, PSI_socket_state state)
+void pfs_set_socket_state_v1(PSI_socket *socket, PSI_socket_state state)
 {
-  DBUG_ASSERT((state == PSI_SOCKET_STATE_IDLE) || (state == PSI_SOCKET_STATE_ACTIVE));
+  assert((state == PSI_SOCKET_STATE_IDLE) || (state == PSI_SOCKET_STATE_ACTIVE));
   PFS_socket *pfs= reinterpret_cast<PFS_socket*>(socket);
-  DBUG_ASSERT(pfs != NULL);
-  DBUG_ASSERT(pfs->m_idle || (state == PSI_SOCKET_STATE_IDLE));
-  DBUG_ASSERT(!pfs->m_idle || (state == PSI_SOCKET_STATE_ACTIVE));
+  assert(pfs != NULL);
+  assert(pfs->m_idle || (state == PSI_SOCKET_STATE_IDLE));
+  assert(!pfs->m_idle || (state == PSI_SOCKET_STATE_ACTIVE));
   pfs->m_idle= (state == PSI_SOCKET_STATE_IDLE);
 }
 
 /**
   Set socket descriptor and address info.
 */
-static void set_socket_info_v1(PSI_socket *socket,
-                               const my_socket *fd,
-                               const struct sockaddr *addr,
-                               socklen_t addr_len)
+void pfs_set_socket_info_v1(PSI_socket *socket,
+                            const my_socket *fd,
+                            const struct sockaddr *addr,
+                            socklen_t addr_len)
 {
   PFS_socket *pfs= reinterpret_cast<PFS_socket*>(socket);
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
 
   /** Set socket descriptor */
   if (fd != NULL)
@@ -5148,11 +6344,11 @@ static void set_socket_info_v1(PSI_socket *socket,
   Implementation of the socket instrumentation interface.
   @sa PSI_v1::set_socket_info.
 */
-static void set_socket_thread_owner_v1(PSI_socket *socket)
+void pfs_set_socket_thread_owner_v1(PSI_socket *socket)
 {
   PFS_socket *pfs_socket= reinterpret_cast<PFS_socket*>(socket);
-  DBUG_ASSERT(pfs_socket != NULL);
-  pfs_socket->m_thread_owner= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  assert(pfs_socket != NULL);
+  pfs_socket->m_thread_owner= my_thread_get_THR_PFS();
 }
 
 struct PSI_digest_locker*
@@ -5160,7 +6356,7 @@ pfs_digest_start_v1(PSI_statement_locker *locker)
 {
   PSI_statement_locker_state *statement_state;
   statement_state= reinterpret_cast<PSI_statement_locker_state*> (locker);
-  DBUG_ASSERT(statement_state != NULL);
+  assert(statement_state != NULL);
 
   if (statement_state->m_discarded)
     return NULL;
@@ -5177,8 +6373,8 @@ void pfs_digest_end_v1(PSI_digest_locker *locker, const sql_digest_storage *dige
 {
   PSI_statement_locker_state *statement_state;
   statement_state= reinterpret_cast<PSI_statement_locker_state*> (locker);
-  DBUG_ASSERT(statement_state != NULL);
-  DBUG_ASSERT(digest != NULL);
+  assert(statement_state != NULL);
+  assert(digest != NULL);
 
   if (statement_state->m_discarded)
     return;
@@ -5189,30 +6385,81 @@ void pfs_digest_end_v1(PSI_digest_locker *locker, const sql_digest_storage *dige
   }
 }
 
+PSI_prepared_stmt*
+pfs_create_prepared_stmt_v1(void *identity, uint stmt_id,
+                           PSI_statement_locker *locker,
+                           const char *stmt_name, size_t stmt_name_length)
+{
+  PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  PFS_events_statements *pfs_stmt= reinterpret_cast<PFS_events_statements*> (state->m_statement); /* NOTE(review): state is dereferenced without the assert(state != NULL) that sibling functions use */
+  PFS_program *pfs_program= reinterpret_cast<PFS_program *>(state->m_parent_sp_share);
+
+  PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+  if (unlikely(pfs_thread == NULL))
+    return NULL;
+  /* Create the prepared statement record and remember it in the locker state, flagged as being in the prepare phase. */
+  PFS_prepared_stmt *pfs= create_prepared_stmt(identity,
+                                               pfs_thread, pfs_program,
+                                               pfs_stmt, stmt_id,
+                                               stmt_name, static_cast<uint>(stmt_name_length));
+
+  state->m_parent_prepared_stmt= reinterpret_cast<PSI_prepared_stmt*>(pfs);
+  state->m_in_prepare= true;
+
+  return reinterpret_cast<PSI_prepared_stmt*>(pfs); /* whatever create_prepared_stmt() returned, as an opaque handle */
+}
+/** Attach an existing prepared statement to the statement locker for an execution (m_in_prepare is cleared). */
+void pfs_execute_prepared_stmt_v1 (PSI_statement_locker *locker,
+                                   PSI_prepared_stmt* ps)
+{
+  PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  assert(state != NULL);
+
+  state->m_parent_prepared_stmt= ps;
+  state->m_in_prepare= false; /* pfs_end_statement_v1 reads this to choose between m_prepare_stat and m_execute_stat */
+}
+/** Destroy the instrumentation of a prepared statement by handing it to delete_prepared_stmt(). */
+void pfs_destroy_prepared_stmt_v1(PSI_prepared_stmt* prepared_stmt)
+{
+  PFS_prepared_stmt *pfs_prepared_stmt= reinterpret_cast<PFS_prepared_stmt*>(prepared_stmt);
+  delete_prepared_stmt(pfs_prepared_stmt);
+  return;
+}
+/** Count one re-prepare of a prepared statement in its m_reprepare_stat; a NULL prepared_stmt is ignored. */
+void pfs_reprepare_prepared_stmt_v1(PSI_prepared_stmt* prepared_stmt)
+{
+  PFS_prepared_stmt *pfs_prepared_stmt= reinterpret_cast<PFS_prepared_stmt*>(prepared_stmt);
+
+  /* Check the object pointer itself: the address of a member is never NULL, so testing it guards nothing. */
+  if (pfs_prepared_stmt != NULL)
+    pfs_prepared_stmt->m_reprepare_stat.aggregate_counted();
+  return;
+}
+
 /**
   Implementation of the thread attribute connection interface
   @sa PSI_v1::set_thread_connect_attr.
 */
-static int set_thread_connect_attrs_v1(const char *buffer, uint length,
-                                       const void *from_cs)
+int pfs_set_thread_connect_attrs_v1(const char *buffer, uint length,
+                                    const void *from_cs)
 {
+  PFS_thread *thd= my_thread_get_THR_PFS();
 
-  PFS_thread *thd= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-
-  DBUG_ASSERT(buffer != NULL);
+  assert(buffer != NULL);
 
   if (likely(thd != NULL) && session_connect_attrs_size_per_thread > 0)
   {
+    pfs_dirty_state dirty_state;
     const CHARSET_INFO *cs = static_cast<const CHARSET_INFO *> (from_cs);
 
     /* copy from the input buffer as much as we can fit */
     uint copy_size= (uint)(length < session_connect_attrs_size_per_thread ?
                            length : session_connect_attrs_size_per_thread);
-    thd->m_session_lock.allocated_to_dirty();
+    thd->m_session_lock.allocated_to_dirty(& dirty_state);
     memcpy(thd->m_session_connect_attrs, buffer, copy_size);
     thd->m_session_connect_attrs_length= copy_size;
     thd->m_session_connect_attrs_cs_number= cs->number;
-    thd->m_session_lock.dirty_to_allocated();
+    thd->m_session_lock.dirty_to_allocated(& dirty_state);
 
     if (copy_size == length)
       return 0;
@@ -5223,6 +6470,532 @@ static int set_thread_connect_attrs_v1(const char *buffer, uint length,
   return 0;
 }
 
+void pfs_register_memory_v1(const char *category,
+                               PSI_memory_info_v1 *info,
+                               int count)
+{ /* Whole body comes from REGISTER_BODY_V1 (defined outside this hunk); presumably it reads category/info/count by name -- confirm. */
+  REGISTER_BODY_V1(PSI_memory_key,
+                   memory_instrument_prefix,
+                   register_memory_class)
+}
+
+PSI_memory_key pfs_memory_alloc_v1(PSI_memory_key key, size_t size, PSI_thread **owner)
+{ /* Count an allocation of 'size' bytes for instrument 'key'; returns 'key', or PSI_NOT_INSTRUMENTED when nothing was counted. */
+  PFS_thread ** owner_thread= reinterpret_cast<PFS_thread**>(owner);
+  assert(owner_thread != NULL);
+  /* Every early exit below clears *owner and reports PSI_NOT_INSTRUMENTED. */
+  if (! flag_global_instrumentation)
+  {
+    *owner_thread= NULL;
+    return PSI_NOT_INSTRUMENTED;
+  }
+
+  PFS_memory_class *klass= find_memory_class(key);
+  if (klass == NULL)
+  {
+    *owner_thread= NULL;
+    return PSI_NOT_INSTRUMENTED;
+  }
+
+  if (! klass->m_enabled)
+  {
+    *owner_thread= NULL;
+    return PSI_NOT_INSTRUMENTED;
+  }
+
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  uint index= klass->m_event_name_index;
+  PFS_memory_stat_delta delta_buffer; /* scratch storage handed to count_alloc() */
+  PFS_memory_stat_delta *delta;
+
+  if (flag_thread_instrumentation && ! klass->is_global())
+  {
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+    if (unlikely(pfs_thread == NULL))
+    {
+      *owner_thread= NULL;
+      return PSI_NOT_INSTRUMENTED;
+    }
+    if (! pfs_thread->m_enabled)
+    {
+      *owner_thread= NULL;
+      return PSI_NOT_INSTRUMENTED;
+    }
+
+    /* Aggregate to MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME */
+    event_name_array= pfs_thread->write_instr_class_memory_stats();
+    stat= & event_name_array[index];
+    delta= stat->count_alloc(size, &delta_buffer);
+    /* Only a non-NULL delta is passed on to carry_memory_stat_delta(). */
+    if (delta != NULL)
+    {
+      pfs_thread->carry_memory_stat_delta(delta, index);
+    }
+
+    /* Flag this memory as owned by the current thread. */
+    *owner_thread= pfs_thread;
+  }
+  else
+  {
+    /* Aggregate to MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME */
+    event_name_array= global_instr_class_memory_array;
+    stat= & event_name_array[index];
+    (void) stat->count_alloc(size, &delta_buffer); /* returned delta is discarded on the global path */
+
+    *owner_thread= NULL; /* no owning thread is recorded on the global path */
+  }
+
+  return key;
+}
+
+PSI_memory_key pfs_memory_realloc_v1(PSI_memory_key key, size_t old_size, size_t new_size, PSI_thread **owner)
+{ /* Account a resize from old_size to new_size for 'key'; returns 'key', or PSI_NOT_INSTRUMENTED if the class is unknown or the instrument is now disabled. */
+  PFS_thread ** owner_thread_hdl= reinterpret_cast<PFS_thread**>(owner);
+  assert(owner != NULL);
+
+  PFS_memory_class *klass= find_memory_class(key);
+  if (klass == NULL)
+  {
+    *owner_thread_hdl= NULL;
+    return PSI_NOT_INSTRUMENTED;
+  }
+
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  uint index= klass->m_event_name_index;
+  PFS_memory_stat_delta delta_buffer; /* scratch storage handed to count_realloc()/count_free() */
+  PFS_memory_stat_delta *delta;
+
+  if (flag_thread_instrumentation && ! klass->is_global())
+  {
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+    if (likely(pfs_thread != NULL))
+    {
+#ifdef PFS_PARANOID
+      PFS_thread *owner_thread= *owner_thread_hdl;
+      if (owner_thread != pfs_thread) /* recorded owner differs from the calling thread */
+      {
+        owner_thread= sanitize_thread(owner_thread);
+        if (owner_thread != NULL)
+        {
+          report_memory_accounting_error("pfs_memory_realloc_v1",
+            pfs_thread, old_size, klass, owner_thread);
+        }
+      }
+#endif /* PFS_PARANOID */
+
+      /* Aggregate to MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME */
+      event_name_array= pfs_thread->write_instr_class_memory_stats();
+      stat= & event_name_array[index];
+      /* The enabled flags are consulted only here, after the old size is known to be accountable. */
+      if (flag_global_instrumentation && klass->m_enabled)
+      {
+        delta= stat->count_realloc(old_size, new_size, &delta_buffer);
+        *owner_thread_hdl= pfs_thread;
+      }
+      else
+      { /* instrument disabled: only the old size is counted as freed and the block becomes uninstrumented */
+        delta= stat->count_free(old_size, &delta_buffer);
+        *owner_thread_hdl= NULL;
+        key= PSI_NOT_INSTRUMENTED;
+      }
+
+      if (delta != NULL)
+      {
+        pfs_thread->carry_memory_stat_delta(delta, index);
+      }
+      return key;
+    }
+  }
+  /* Reached when thread instrumentation is off, the class is global, or the caller has no PFS_thread. */
+  /* Aggregate to MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME */
+  event_name_array= global_instr_class_memory_array;
+  stat= & event_name_array[index];
+
+  if (flag_global_instrumentation && klass->m_enabled)
+  {
+    (void) stat->count_realloc(old_size, new_size, &delta_buffer);
+  }
+  else
+  {
+    (void) stat->count_free(old_size, &delta_buffer);
+    key= PSI_NOT_INSTRUMENTED;
+  }
+
+  *owner_thread_hdl= NULL; /* no owning thread is recorded on the global path */
+  return key;
+}
+
+PSI_memory_key pfs_memory_claim_v1(PSI_memory_key key, size_t size, PSI_thread **owner)
+{ /* Move the accounting of 'size' bytes for 'key' from the recorded owner thread to the calling thread; returns 'key' unless the class is unknown. */
+  PFS_thread ** owner_thread= reinterpret_cast<PFS_thread**>(owner);
+  assert(owner_thread != NULL);
+
+  PFS_memory_class *klass= find_memory_class(key);
+  if (klass == NULL)
+  {
+    *owner_thread= NULL;
+    return PSI_NOT_INSTRUMENTED;
+  }
+
+  /*
+    Do not check klass->m_enabled.
+    Do not check flag_global_instrumentation.
+    If a memory alloc was instrumented,
+    the corresponding free must be instrumented.
+  */
+
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  uint index= klass->m_event_name_index;
+  PFS_memory_stat_delta delta_buffer; /* scratch storage handed to count_free()/count_alloc() */
+  PFS_memory_stat_delta *delta;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *old_thread= sanitize_thread(*owner_thread); /* may come back NULL */
+    PFS_thread *new_thread= my_thread_get_THR_PFS();
+    if (old_thread != new_thread) /* nothing is counted when the owner does not change */
+    {
+      if (old_thread != NULL)
+      { /* the previous owner sees the bytes as freed */
+        event_name_array= old_thread->write_instr_class_memory_stats();
+        stat= & event_name_array[index];
+        delta= stat->count_free(size, &delta_buffer);
+
+        if (delta != NULL)
+        {
+          old_thread->carry_memory_stat_delta(delta, index);
+        }
+      }
+
+      if (new_thread != NULL)
+      { /* the calling thread sees the bytes as allocated */
+        event_name_array= new_thread->write_instr_class_memory_stats();
+        stat= & event_name_array[index];
+        delta= stat->count_alloc(size, &delta_buffer);
+
+        if (delta != NULL)
+        {
+          new_thread->carry_memory_stat_delta(delta, index);
+        }
+      }
+
+      *owner_thread= new_thread;
+    }
+
+    return key;
+  }
+  /* Thread instrumentation is off: no statistics are touched and the owner is cleared. */
+  *owner_thread= NULL;
+  return key;
+}
+
+void pfs_memory_free_v1(PSI_memory_key key, size_t size, PSI_thread *owner)
+{ /* Count a release of 'size' bytes for 'key'; 'owner' is examined only in PFS_PARANOID builds. */
+  PFS_memory_class *klass= find_memory_class(key);
+  if (klass == NULL)
+    return;
+
+  /*
+    Do not check klass->m_enabled.
+    Do not check flag_global_instrumentation.
+    If a memory alloc was instrumented,
+    the corresponding free must be instrumented.
+  */
+
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  uint index= klass->m_event_name_index;
+  PFS_memory_stat_delta delta_buffer; /* scratch storage handed to count_free() */
+  PFS_memory_stat_delta *delta;
+
+  if (flag_thread_instrumentation && ! klass->is_global())
+  {
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+    if (likely(pfs_thread != NULL))
+    {
+#ifdef PFS_PARANOID
+      PFS_thread *owner_thread= reinterpret_cast<PFS_thread*>(owner);
+
+      if (owner_thread != pfs_thread) /* recorded owner differs from the calling thread */
+      {
+        owner_thread= sanitize_thread(owner_thread);
+        if (owner_thread != NULL)
+        {
+          report_memory_accounting_error("pfs_memory_free_v1",
+            pfs_thread, size, klass, owner_thread);
+        }
+      }
+#endif /* PFS_PARANOID */
+
+      /*
+        Do not check pfs_thread->m_enabled.
+        If a memory alloc was instrumented,
+        the corresponding free must be instrumented.
+      */
+      /* Aggregate to MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME */
+      event_name_array= pfs_thread->write_instr_class_memory_stats();
+      stat= & event_name_array[index];
+      delta= stat->count_free(size, &delta_buffer);
+
+      if (delta != NULL)
+      {
+        pfs_thread->carry_memory_stat_delta(delta, index);
+      }
+      return;
+    }
+  }
+  /* Reached when thread instrumentation is off, the class is global, or the caller has no PFS_thread. */
+  /* Aggregate to MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME */
+  event_name_array= global_instr_class_memory_array;
+  if (event_name_array) /* the global array may be NULL here -- presumably before init or after cleanup; confirm */
+  {
+    stat= & event_name_array[index];
+    (void) stat->count_free(size, &delta_buffer);
+  }
+  return;
+}
+
+void pfs_unlock_table_v1(PSI_table *table)
+{
+  /* Reset the internal lock recorded on the table to PFS_TL_NONE. */
+  PFS_table *instrumented= reinterpret_cast<PFS_table*> (table);
+
+  assert(instrumented != NULL);
+
+  instrumented->m_internal_lock= PFS_TL_NONE;
+}
+
+PSI_metadata_lock *
+pfs_create_metadata_lock_v1(
+  void *identity,
+  const MDL_key *mdl_key,
+  opaque_mdl_type mdl_type,
+  opaque_mdl_duration mdl_duration,
+  opaque_mdl_status mdl_status,
+  const char *src_file,
+  uint src_line)
+{
+  /*
+    Create the record for a metadata lock and stamp it with the calling
+    thread's internal id and current event id. Yields NULL when global
+    instrumentation or the metadata class is off, when the caller has
+    no PFS_thread, or when create_metadata_lock() returns NULL.
+  */
+  if (! flag_global_instrumentation || ! global_metadata_class.m_enabled)
+    return NULL;
+
+  PFS_thread *owner= my_thread_get_THR_PFS();
+  if (owner == NULL)
+    return NULL;
+
+  PFS_metadata_lock *record=
+    create_metadata_lock(identity, mdl_key, mdl_type, mdl_duration,
+                         mdl_status, src_file, src_line);
+  if (record == NULL)
+    return NULL;
+
+  record->m_owner_thread_id= owner->m_thread_internal_id;
+  record->m_owner_event_id= owner->m_event_id;
+  return reinterpret_cast<PSI_metadata_lock *> (record);
+}
+
+void
+pfs_set_metadata_lock_status_v1(PSI_metadata_lock *lock, opaque_mdl_status mdl_status)
+{
+  /* Store the new status on the record behind the (non-NULL) handle. */
+  assert(lock != NULL);
+  reinterpret_cast<PFS_metadata_lock*> (lock)->m_mdl_status= mdl_status;
+}
+
+void
+pfs_destroy_metadata_lock_v1(PSI_metadata_lock *lock)
+{
+  /* The handle must be non-NULL; its record goes to destroy_metadata_lock(). */
+  assert(lock != NULL);
+  destroy_metadata_lock(reinterpret_cast<PFS_metadata_lock*> (lock));
+}
+
+PSI_metadata_locker *
+pfs_start_metadata_wait_v1(PSI_metadata_locker_state *state,
+                           PSI_metadata_lock *lock,
+                           const char *src_file,
+                           uint src_line)
+{ /* Begin instrumenting a wait on a metadata lock: fills 'state' and returns it as the locker, or returns NULL when the wait is not tracked. */
+  PFS_metadata_lock *pfs_lock= reinterpret_cast<PFS_metadata_lock*> (lock);
+  assert(state != NULL);
+  assert(pfs_lock != NULL);
+
+  if (! pfs_lock->m_enabled)
+    return NULL;
+
+  uint flags; /* assigned on every path that reaches the final store into state->m_flags */
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_thread_get_THR_PFS();
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_lock->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++; /* wait stack is full: count the lost locker and give up */
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1; /* slot just below on the wait stack -- assumes one always exists (TODO confirm) */
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      wait->m_thread_internal_id= pfs_thread->m_thread_internal_id;
+      wait->m_class= &global_metadata_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_lock->m_identity;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_weak_metadata_lock= pfs_lock;
+      wait->m_weak_version= pfs_lock->get_version();
+      wait->m_operation= OPERATION_TYPE_METADATA;
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_METADATA;
+
+      pfs_thread->m_events_waits_current++; /* push: the slot filled above is now in use */
+    }
+  }
+  else
+  {
+    if (pfs_lock->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+      state->m_thread= NULL;
+    }
+    else
+    {
+      /*
+        Complete shortcut.
+      */
+      /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME (counted) */
+      global_metadata_stat.aggregate_counted();
+      return NULL; /* the wait is already counted, so no locker is handed back */
+    }
+  }
+
+  state->m_flags= flags;
+  state->m_metadata_lock= lock;
+  return reinterpret_cast<PSI_metadata_locker*> (state);
+}
+
+void
+pfs_end_metadata_wait_v1(PSI_metadata_locker *locker,
+                         int rc)
+{ /* Finish a metadata wait described by 'locker' (a PSI_metadata_locker_state): aggregate it and close its wait event, if any. 'rc' is not used. */
+  PSI_metadata_locker_state *state= reinterpret_cast<PSI_metadata_locker_state*> (locker);
+  assert(state != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+
+  PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+
+  uint flags= state->m_flags;
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer(); /* timer function stored in the state */
+    wait_time= timer_end - state->m_timer_start;
+  }
+
+  if (flags & STATE_FLAG_THREAD)
+  {
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->write_instr_class_waits_stats();
+
+    if (flags & STATE_FLAG_TIMED)
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[GLOBAL_METADATA_EVENT_INDEX].aggregate_value(wait_time);
+    }
+    else
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[GLOBAL_METADATA_EVENT_INDEX].aggregate_counted();
+    }
+
+    if (flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      assert(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_end_event_id= thread->m_event_id;
+      if (thread->m_flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (thread->m_flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--; /* pop the wait slot */
+
+      assert(wait == thread->m_events_waits_current); /* the popped slot must be this wait */
+    }
+  }
+  else
+  {
+    if (flags & STATE_FLAG_TIMED)
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME (timed) */
+      global_metadata_stat.aggregate_value(wait_time);
+    }
+    else
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME (counted) */
+      global_metadata_stat.aggregate_counted();
+    }
+  }
+}
+
+void pfs_set_prepared_stmt_text_v1(PSI_prepared_stmt *prepared_stmt,
+                                   const char *text,
+                                   uint text_len)
+{
+  /*
+    Record the SQL text of a prepared statement, keeping at most
+    COL_INFO_SIZE bytes; the stored length is the truncated byte count.
+  */
+  PFS_prepared_stmt *pfs_prepared_stmt =
+    reinterpret_cast<PFS_prepared_stmt *>(prepared_stmt);
+  /* Plain assert(), as the neighbouring entry points use. */
+  assert(pfs_prepared_stmt != NULL);
+
+  const uint max_len = COL_INFO_SIZE;
+  const uint copy_len = (text_len > max_len) ? max_len : text_len;
+
+  memcpy(pfs_prepared_stmt->m_sqltext, text, copy_len);
+  pfs_prepared_stmt->m_sqltext_length = copy_len;
+}
 
 /**
   Implementation of the instrumentation interface.
@@ -5230,103 +7003,140 @@ static int set_thread_connect_attrs_v1(const char *buffer, uint length,
 */
 PSI_v1 PFS_v1=
 {
-  register_mutex_v1,
-  register_rwlock_v1,
-  register_cond_v1,
-  register_thread_v1,
-  register_file_v1,
-  register_stage_v1,
-  register_statement_v1,
-  register_socket_v1,
-  init_mutex_v1,
-  destroy_mutex_v1,
-  init_rwlock_v1,
-  destroy_rwlock_v1,
-  init_cond_v1,
-  destroy_cond_v1,
-  init_socket_v1,
-  destroy_socket_v1,
-  get_table_share_v1,
-  release_table_share_v1,
-  drop_table_share_v1,
-  open_table_v1,
-  unbind_table_v1,
-  rebind_table_v1,
-  close_table_v1,
-  create_file_v1,
-  spawn_thread_v1,
-  new_thread_v1,
-  set_thread_id_v1,
-  get_thread_v1,
-  set_thread_user_v1,
-  set_thread_account_v1,
-  set_thread_db_v1,
-  set_thread_command_v1,
-  set_thread_start_time_v1,
-  set_thread_state_v1,
-  set_thread_info_v1,
-  set_thread_v1,
-  delete_current_thread_v1,
-  delete_thread_v1,
-  get_thread_file_name_locker_v1,
-  get_thread_file_stream_locker_v1,
-  get_thread_file_descriptor_locker_v1,
-  unlock_mutex_v1,
-  unlock_rwlock_v1,
-  signal_cond_v1,
-  broadcast_cond_v1,
-  start_idle_wait_v1,
-  end_idle_wait_v1,
-  start_mutex_wait_v1,
-  end_mutex_wait_v1,
-  start_rwlock_wait_v1, /* read */
-  end_rwlock_rdwait_v1,
-  start_rwlock_wait_v1, /* write */
-  end_rwlock_wrwait_v1,
-  start_cond_wait_v1,
-  end_cond_wait_v1,
-  start_table_io_wait_v1,
-  end_table_io_wait_v1,
-  start_table_lock_wait_v1,
-  end_table_lock_wait_v1,
-  start_file_open_wait_v1,
-  end_file_open_wait_v1,
-  end_file_open_wait_and_bind_to_descriptor_v1,
-  start_file_wait_v1,
-  end_file_wait_v1,
-  start_file_close_wait_v1,
-  end_file_close_wait_v1,
-  start_stage_v1,
-  end_stage_v1,
-  get_thread_statement_locker_v1,
-  refine_statement_v1,
-  start_statement_v1,
-  set_statement_text_v1,
-  set_statement_lock_time_v1,
-  set_statement_rows_sent_v1,
-  set_statement_rows_examined_v1,
-  inc_statement_created_tmp_disk_tables_v1,
-  inc_statement_created_tmp_tables_v1,
-  inc_statement_select_full_join_v1,
-  inc_statement_select_full_range_join_v1,
-  inc_statement_select_range_v1,
-  inc_statement_select_range_check_v1,
-  inc_statement_select_scan_v1,
-  inc_statement_sort_merge_passes_v1,
-  inc_statement_sort_range_v1,
-  inc_statement_sort_rows_v1,
-  inc_statement_sort_scan_v1,
-  set_statement_no_index_used_v1,
-  set_statement_no_good_index_used_v1,
-  end_statement_v1,
-  start_socket_wait_v1,
-  end_socket_wait_v1,
-  set_socket_state_v1,
-  set_socket_info_v1,
-  set_socket_thread_owner_v1,
+  pfs_register_mutex_v1,
+  pfs_register_rwlock_v1,
+  pfs_register_cond_v1,
+  pfs_register_thread_v1,
+  pfs_register_file_v1,
+  pfs_register_stage_v1,
+  pfs_register_statement_v1,
+  pfs_register_socket_v1,
+  pfs_init_mutex_v1,
+  pfs_destroy_mutex_v1,
+  pfs_init_rwlock_v1,
+  pfs_destroy_rwlock_v1,
+  pfs_init_cond_v1,
+  pfs_destroy_cond_v1,
+  pfs_init_socket_v1,
+  pfs_destroy_socket_v1,
+  pfs_get_table_share_v1,
+  pfs_release_table_share_v1,
+  pfs_drop_table_share_v1,
+  pfs_open_table_v1,
+  pfs_unbind_table_v1,
+  pfs_rebind_table_v1,
+  pfs_close_table_v1,
+  pfs_create_file_v1,
+  pfs_spawn_thread_v1,
+  pfs_new_thread_v1,
+  pfs_set_thread_id_v1,
+  pfs_set_thread_THD_v1,
+  pfs_set_thread_os_id_v1,
+  pfs_get_thread_v1,
+  pfs_set_thread_user_v1,
+  pfs_set_thread_account_v1,
+  pfs_set_thread_db_v1,
+  pfs_set_thread_command_v1,
+  pfs_set_connection_type_v1,
+  pfs_set_thread_start_time_v1,
+  pfs_set_thread_state_v1,
+  pfs_set_thread_info_v1,
+  pfs_set_thread_v1,
+  pfs_delete_current_thread_v1,
+  pfs_delete_thread_v1,
+  pfs_get_thread_file_name_locker_v1,
+  pfs_get_thread_file_stream_locker_v1,
+  pfs_get_thread_file_descriptor_locker_v1,
+  pfs_unlock_mutex_v1,
+  pfs_unlock_rwlock_v1,
+  pfs_signal_cond_v1,
+  pfs_broadcast_cond_v1,
+  pfs_start_idle_wait_v1,
+  pfs_end_idle_wait_v1,
+  pfs_start_mutex_wait_v1,
+  pfs_end_mutex_wait_v1,
+  pfs_start_rwlock_rdwait_v1,
+  pfs_end_rwlock_rdwait_v1,
+  pfs_start_rwlock_wrwait_v1,
+  pfs_end_rwlock_wrwait_v1,
+  pfs_start_cond_wait_v1,
+  pfs_end_cond_wait_v1,
+  pfs_start_table_io_wait_v1,
+  pfs_end_table_io_wait_v1,
+  pfs_start_table_lock_wait_v1,
+  pfs_end_table_lock_wait_v1,
+  pfs_start_file_open_wait_v1,
+  pfs_end_file_open_wait_v1,
+  pfs_end_file_open_wait_and_bind_to_descriptor_v1,
+  pfs_end_temp_file_open_wait_and_bind_to_descriptor_v1,
+  pfs_start_file_wait_v1,
+  pfs_end_file_wait_v1,
+  pfs_start_file_close_wait_v1,
+  pfs_end_file_close_wait_v1,
+  pfs_end_file_rename_wait_v1,
+  pfs_start_stage_v1,
+  pfs_get_current_stage_progress_v1,
+  pfs_end_stage_v1,
+  pfs_get_thread_statement_locker_v1,
+  pfs_refine_statement_v1,
+  pfs_start_statement_v1,
+  pfs_set_statement_text_v1,
+  pfs_set_statement_lock_time_v1,
+  pfs_set_statement_rows_sent_v1,
+  pfs_set_statement_rows_examined_v1,
+  pfs_inc_statement_created_tmp_disk_tables_v1,
+  pfs_inc_statement_created_tmp_tables_v1,
+  pfs_inc_statement_select_full_join_v1,
+  pfs_inc_statement_select_full_range_join_v1,
+  pfs_inc_statement_select_range_v1,
+  pfs_inc_statement_select_range_check_v1,
+  pfs_inc_statement_select_scan_v1,
+  pfs_inc_statement_sort_merge_passes_v1,
+  pfs_inc_statement_sort_range_v1,
+  pfs_inc_statement_sort_rows_v1,
+  pfs_inc_statement_sort_scan_v1,
+  pfs_set_statement_no_index_used_v1,
+  pfs_set_statement_no_good_index_used_v1,
+  pfs_end_statement_v1,
+  pfs_get_thread_transaction_locker_v1,
+  pfs_start_transaction_v1,
+  pfs_set_transaction_xid_v1,
+  pfs_set_transaction_xa_state_v1,
+  pfs_set_transaction_gtid_v1,
+  pfs_set_transaction_trxid_v1,
+  pfs_inc_transaction_savepoints_v1,
+  pfs_inc_transaction_rollback_to_savepoint_v1,
+  pfs_inc_transaction_release_savepoint_v1,
+  pfs_end_transaction_v1,
+  pfs_start_socket_wait_v1,
+  pfs_end_socket_wait_v1,
+  pfs_set_socket_state_v1,
+  pfs_set_socket_info_v1,
+  pfs_set_socket_thread_owner_v1,
+  pfs_create_prepared_stmt_v1,
+  pfs_destroy_prepared_stmt_v1,
+  pfs_reprepare_prepared_stmt_v1,
+  pfs_execute_prepared_stmt_v1,
+  pfs_set_prepared_stmt_text_v1,
   pfs_digest_start_v1,
   pfs_digest_end_v1,
-  set_thread_connect_attrs_v1,
+  pfs_set_thread_connect_attrs_v1,
+  pfs_start_sp_v1,
+  pfs_end_sp_v1,
+  pfs_drop_sp_v1,
+  pfs_get_sp_share_v1,
+  pfs_release_sp_share_v1,
+  pfs_register_memory_v1,
+  pfs_memory_alloc_v1,
+  pfs_memory_realloc_v1,
+  pfs_memory_claim_v1,
+  pfs_memory_free_v1,
+  pfs_unlock_table_v1,
+  pfs_create_metadata_lock_v1,
+  pfs_set_metadata_lock_status_v1,
+  pfs_destroy_metadata_lock_v1,
+  pfs_start_metadata_wait_v1,
+  pfs_end_metadata_wait_v1
 };
 
 static void* get_interface(int version)
diff --git a/storage/perfschema/pfs.h b/storage/perfschema/pfs.h
index 3649143f8fa..3525f27724f 100644
--- a/storage/perfschema/pfs.h
+++ b/storage/perfschema/pfs.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -30,7 +30,8 @@
 
 #define HAVE_PSI_1
 
-#include <my_pthread.h>
+#include <my_global.h>
+#include "my_thread.h"
 #include <mysql/psi/psi.h>
 
 /**
@@ -39,9 +40,24 @@
 */
 extern struct PSI_bootstrap PFS_bootstrap;
 /** Performance schema Thread Local Storage key.  */
-extern pthread_key(PFS_thread*, THR_PFS);
-/** True when @c THR_PFS is initialized. */
+extern pthread_key_t THR_PFS;
+extern pthread_key_t THR_PFS_VG;  // global_variables
+extern pthread_key_t THR_PFS_SV;  // session_variables
+extern pthread_key_t THR_PFS_VBT; // variables_by_thread
+extern pthread_key_t THR_PFS_SG;  // global_status
+extern pthread_key_t THR_PFS_SS;  // session_status
+extern pthread_key_t THR_PFS_SBT; // status_by_thread
+extern pthread_key_t THR_PFS_SBU; // status_by_user
+extern pthread_key_t THR_PFS_SBA; // status_by_account
+extern pthread_key_t THR_PFS_SBH; // status_by_host
+
+/** True when @c THR_PFS and all other Performance Schema TLS keys are initialized. */
 extern bool THR_PFS_initialized;
 
+#define PSI_VOLATILITY_UNKNOWN 0
+#define PSI_VOLATILITY_SESSION 1
+
+#define PSI_COUNT_VOLATILITY 2 /* presumably the number of PSI_VOLATILITY_* values above -- confirm */
+
 #endif
 
diff --git a/storage/perfschema/pfs_account.cc b/storage/perfschema/pfs_account.cc
index be2153e84ae..abf31a6bc55 100644
--- a/storage/perfschema/pfs_account.cc
+++ b/storage/perfschema/pfs_account.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -22,7 +22,7 @@
 
 /**
   @file storage/perfschema/pfs_account.cc
-  Performance schema user@host (implementation).
+  Performance schema account (implementation).
 */
 
 #include "my_global.h"
@@ -37,21 +37,13 @@
 #include "pfs_account.h"
 #include "pfs_global.h"
 #include "pfs_instr_class.h"
+#include "pfs_buffer_container.h"
 
 /**
   @addtogroup Performance_schema_buffers
   @{
 */
 
-ulong account_max;
-ulong account_lost;
-
-PFS_account *account_array= NULL;
-
-static PFS_single_stat *account_instr_class_waits_array= NULL;
-static PFS_stage_stat *account_instr_class_stages_array= NULL;
-static PFS_statement_stat *account_instr_class_statements_array= NULL;
-
 LF_HASH account_hash;
 static bool account_hash_inited= false;
 
@@ -62,75 +54,16 @@ static bool account_hash_inited= false;
 */
 int init_account(const PFS_global_param *param)
 {
-  uint index;
-
-  account_max= param->m_account_sizing;
-
-  account_array= NULL;
-  account_instr_class_waits_array= NULL;
-  account_instr_class_stages_array= NULL;
-  account_instr_class_statements_array= NULL;
-  uint waits_sizing= account_max * wait_class_max;
-  uint stages_sizing= account_max * stage_class_max;
-  uint statements_sizing= account_max * statement_class_max;
-
-  if (account_max > 0)
-  {
-    account_array= PFS_MALLOC_ARRAY(account_max, sizeof(PFS_account), PFS_account,
-                                    MYF(MY_ZEROFILL));
-    if (unlikely(account_array == NULL))
-      return 1;
-  }
-
-  if (waits_sizing > 0)
-  {
-    account_instr_class_waits_array=
-      PFS_connection_slice::alloc_waits_slice(waits_sizing);
-    if (unlikely(account_instr_class_waits_array == NULL))
-      return 1;
-  }
-
-  if (stages_sizing > 0)
-  {
-    account_instr_class_stages_array=
-      PFS_connection_slice::alloc_stages_slice(stages_sizing);
-    if (unlikely(account_instr_class_stages_array == NULL))
-      return 1;
-  }
-
-  if (statements_sizing > 0)
-  {
-    account_instr_class_statements_array=
-      PFS_connection_slice::alloc_statements_slice(statements_sizing);
-    if (unlikely(account_instr_class_statements_array == NULL))
-      return 1;
-  }
-
-  for (index= 0; index < account_max; index++)
-  {
-    account_array[index].m_instr_class_waits_stats=
-      &account_instr_class_waits_array[index * wait_class_max];
-    account_array[index].m_instr_class_stages_stats=
-      &account_instr_class_stages_array[index * stage_class_max];
-    account_array[index].m_instr_class_statements_stats=
-      &account_instr_class_statements_array[index * statement_class_max];
-  }
+  if (global_account_container.init(param->m_account_sizing))
+    return 1;
 
   return 0;
 }
 
-/** Cleanup all the user buffers. */
+/** Cleanup all the account buffers. */
 void cleanup_account(void)
 {
-  pfs_free(account_array);
-  account_array= NULL;
-  pfs_free(account_instr_class_waits_array);
-  account_instr_class_waits_array= NULL;
-  pfs_free(account_instr_class_stages_array);
-  account_instr_class_stages_array= 0;
-  pfs_free(account_instr_class_statements_array);
-  account_instr_class_statements_array=0;
-  account_max= 0;
+  global_account_container.cleanup();
 }
 
 C_MODE_START
@@ -141,9 +74,9 @@ static uchar *account_hash_get_key(const uchar *entry, size_t *length,
   const PFS_account *account;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_account* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   account= *typed_entry;
-  DBUG_ASSERT(account != NULL);
+  assert(account != NULL);
   *length= account->m_key.m_key_length;
   result= account->m_key.m_hash_key;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -154,13 +87,12 @@ C_MODE_END
   Initialize the user hash.
   @return 0 on success
 */
-int init_account_hash(void)
+int init_account_hash(const PFS_global_param *param)
 {
-  if ((! account_hash_inited) && (account_max > 0))
+  if ((! account_hash_inited) && (param->m_account_sizing != 0))
   {
     lf_hash_init(&account_hash, sizeof(PFS_account*), LF_HASH_UNIQUE,
                  0, 0, account_hash_get_key, &my_charset_bin);
-    /* account_hash.size= account_max; */
     account_hash_inited= true;
   }
   return 0;
@@ -191,8 +123,8 @@ static void set_account_key(PFS_account_key *key,
                               const char *user, uint user_length,
                               const char *host, uint host_length)
 {
-  DBUG_ASSERT(user_length <= USERNAME_LENGTH);
-  DBUG_ASSERT(host_length <= HOSTNAME_LENGTH);
+  assert(user_length <= USERNAME_LENGTH);
+  assert(host_length <= HOSTNAME_LENGTH);
 
   char *ptr= &key->m_hash_key[0];
   if (user_length > 0)
@@ -217,16 +149,10 @@ find_or_create_account(PFS_thread *thread,
                          const char *username, uint username_length,
                          const char *hostname, uint hostname_length)
 {
-  if (account_max == 0)
-  {
-    account_lost++;
-    return NULL;
-  }
-
   LF_PINS *pins= get_account_hash_pins(thread);
   if (unlikely(pins == NULL))
   {
-    account_lost++;
+    global_account_container.m_lost++;
     return NULL;
   }
 
@@ -235,8 +161,10 @@ find_or_create_account(PFS_thread *thread,
                     hostname, hostname_length);
 
   PFS_account **entry;
+  PFS_account *pfs;
   uint retry_count= 0;
   const uint retry_max= 3;
+  pfs_dirty_state dirty_state;
 
 search:
   entry= reinterpret_cast<PFS_account**>
@@ -244,7 +172,6 @@ search:
                     key.m_hash_key, key.m_key_length));
   if (entry && (entry != MY_ERRPTR))
   {
-    PFS_account *pfs;
     pfs= *entry;
     pfs->inc_refcount();
     lf_hash_search_unpin(pins);
@@ -253,93 +180,94 @@ search:
 
   lf_hash_search_unpin(pins);
 
-  PFS_scan scan;
-  uint random= randomized_index(username, account_max);
-
-  for (scan.init(random, account_max);
-       scan.has_pass();
-       scan.next_pass())
+  pfs= global_account_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    PFS_account *pfs= account_array + scan.first();
-    PFS_account *pfs_last= account_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    pfs->m_key= key;
+    if (username_length > 0)
+      pfs->m_username= &pfs->m_key.m_hash_key[0];
+    else
+      pfs->m_username= NULL;
+    pfs->m_username_length= username_length;
+
+    if (hostname_length > 0)
+      pfs->m_hostname= &pfs->m_key.m_hash_key[username_length + 1];
+    else
+      pfs->m_hostname= NULL;
+    pfs->m_hostname_length= hostname_length;
+
+    pfs->m_user= find_or_create_user(thread, username, username_length);
+    pfs->m_host= find_or_create_host(thread, hostname, hostname_length);
+
+    pfs->init_refcount();
+    pfs->reset_stats();
+    pfs->m_disconnected_count= 0;
+
+    if (username_length > 0 && hostname_length > 0)
+    {
+      lookup_setup_actor(thread, username, username_length, hostname, hostname_length,
+                         & pfs->m_enabled, & pfs->m_history);
+    }
+    else
     {
-      if (pfs->m_lock.is_free())
+      pfs->m_enabled= true;
+      pfs->m_history= true;
+    }
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&account_hash, pins, &pfs);
+    if (likely(res == 0))
+    {
+      return pfs;
+    }
+
+    if (pfs->m_user)
+    {
+      pfs->m_user->release();
+      pfs->m_user= NULL;
+    }
+    if (pfs->m_host)
+    {
+      pfs->m_host->release();
+      pfs->m_host= NULL;
+    }
+
+    global_account_container.deallocate(pfs);
+
+    if (res > 0)
+    {
+      if (++retry_count > retry_max)
       {
-        if (pfs->m_lock.free_to_dirty())
-        {
-          pfs->m_key= key;
-          if (username_length > 0)
-            pfs->m_username= &pfs->m_key.m_hash_key[0];
-          else
-            pfs->m_username= NULL;
-          pfs->m_username_length= username_length;
-
-          if (hostname_length > 0)
-            pfs->m_hostname= &pfs->m_key.m_hash_key[username_length + 1];
-          else
-            pfs->m_hostname= NULL;
-          pfs->m_hostname_length= hostname_length;
-
-          pfs->m_user= find_or_create_user(thread, username, username_length);
-          pfs->m_host= find_or_create_host(thread, hostname, hostname_length);
-
-          pfs->init_refcount();
-          pfs->reset_stats();
-          pfs->m_disconnected_count= 0;
-
-          int res;
-          res= lf_hash_insert(&account_hash, pins, &pfs);
-          if (likely(res == 0))
-          {
-            pfs->m_lock.dirty_to_allocated();
-            return pfs;
-          }
-
-          if (pfs->m_user)
-          {
-            pfs->m_user->release();
-            pfs->m_user= NULL;
-          }
-          if (pfs->m_host)
-          {
-            pfs->m_host->release();
-            pfs->m_host= NULL;
-          }
-
-          pfs->m_lock.dirty_to_free();
-
-          if (res > 0)
-          {
-            if (++retry_count > retry_max)
-            {
-              account_lost++;
-              return NULL;
-            }
-            goto search;
-          }
-
-          account_lost++;
-          return NULL;
-        }
+        global_account_container.m_lost++;
+        return NULL;
       }
+      goto search;
     }
+
+    global_account_container.m_lost++;
+    return NULL;
   }
 
-  account_lost++;
   return NULL;
 }
 
-void PFS_account::aggregate(PFS_user *safe_user, PFS_host *safe_host)
+void PFS_account::aggregate(bool alive, PFS_user *safe_user, PFS_host *safe_host)
 {
   aggregate_waits(safe_user, safe_host);
   aggregate_stages(safe_user, safe_host);
   aggregate_statements(safe_user, safe_host);
+  aggregate_transactions(safe_user, safe_host);
+  aggregate_memory(alive, safe_user, safe_host);
+  aggregate_status(safe_user, safe_host);
   aggregate_stats(safe_user, safe_host);
 }
 
 void PFS_account::aggregate_waits(PFS_user *safe_user, PFS_host *safe_host)
 {
+  if (read_instr_class_waits_stats() == NULL)
+    return;
+
   if (likely(safe_user != NULL && safe_host != NULL))
   {
     /*
@@ -348,9 +276,9 @@ void PFS_account::aggregate_waits(PFS_user *safe_user, PFS_host *safe_host)
       -  EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_event_names(m_instr_class_waits_stats,
-                              safe_user->m_instr_class_waits_stats,
-                              safe_host->m_instr_class_waits_stats);
+    aggregate_all_event_names(write_instr_class_waits_stats(),
+                              safe_user->write_instr_class_waits_stats(),
+                              safe_host->write_instr_class_waits_stats());
     return;
   }
 
@@ -360,8 +288,8 @@ void PFS_account::aggregate_waits(PFS_user *safe_user, PFS_host *safe_host)
       Aggregate EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
       -  EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
     */
-    aggregate_all_event_names(m_instr_class_waits_stats,
-                              safe_user->m_instr_class_waits_stats);
+    aggregate_all_event_names(write_instr_class_waits_stats(),
+                              safe_user->write_instr_class_waits_stats());
     return;
   }
 
@@ -371,8 +299,8 @@ void PFS_account::aggregate_waits(PFS_user *safe_user, PFS_host *safe_host)
       Aggregate EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
       -  EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
     */
-    aggregate_all_event_names(m_instr_class_waits_stats,
-                              safe_host->m_instr_class_waits_stats);
+    aggregate_all_event_names(write_instr_class_waits_stats(),
+                              safe_host->write_instr_class_waits_stats());
     return;
   }
 
@@ -383,6 +311,9 @@ void PFS_account::aggregate_waits(PFS_user *safe_user, PFS_host *safe_host)
 
 void PFS_account::aggregate_stages(PFS_user *safe_user, PFS_host *safe_host)
 {
+  if (read_instr_class_stages_stats() == NULL)
+    return;
+
   if (likely(safe_user != NULL && safe_host != NULL))
   {
     /*
@@ -391,9 +322,9 @@ void PFS_account::aggregate_stages(PFS_user *safe_user, PFS_host *safe_host)
       -  EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_stages(m_instr_class_stages_stats,
-                         safe_user->m_instr_class_stages_stats,
-                         safe_host->m_instr_class_stages_stats);
+    aggregate_all_stages(write_instr_class_stages_stats(),
+                         safe_user->write_instr_class_stages_stats(),
+                         safe_host->write_instr_class_stages_stats());
     return;
   }
 
@@ -405,8 +336,8 @@ void PFS_account::aggregate_stages(PFS_user *safe_user, PFS_host *safe_host)
       -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_stages(m_instr_class_stages_stats,
-                         safe_user->m_instr_class_stages_stats,
+    aggregate_all_stages(write_instr_class_stages_stats(),
+                         safe_user->write_instr_class_stages_stats(),
                          global_instr_class_stages_array);
     return;
   }
@@ -417,8 +348,8 @@ void PFS_account::aggregate_stages(PFS_user *safe_user, PFS_host *safe_host)
       Aggregate EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
       -  EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
     */
-    aggregate_all_stages(m_instr_class_stages_stats,
-                         safe_host->m_instr_class_stages_stats);
+    aggregate_all_stages(write_instr_class_stages_stats(),
+                         safe_host->write_instr_class_stages_stats());
     return;
   }
 
@@ -426,13 +357,16 @@ void PFS_account::aggregate_stages(PFS_user *safe_user, PFS_host *safe_host)
     Aggregate EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
     -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
   */
-  aggregate_all_stages(m_instr_class_stages_stats,
+  aggregate_all_stages(write_instr_class_stages_stats(),
                        global_instr_class_stages_array);
   return;
 }
 
 void PFS_account::aggregate_statements(PFS_user *safe_user, PFS_host *safe_host)
 {
+  if (read_instr_class_statements_stats() == NULL)
+    return;
+
   if (likely(safe_user != NULL && safe_host != NULL))
   {
     /*
@@ -441,9 +375,9 @@ void PFS_account::aggregate_statements(PFS_user *safe_user, PFS_host *safe_host)
       -  EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_statements(m_instr_class_statements_stats,
-                             safe_user->m_instr_class_statements_stats,
-                             safe_host->m_instr_class_statements_stats);
+    aggregate_all_statements(write_instr_class_statements_stats(),
+                             safe_user->write_instr_class_statements_stats(),
+                             safe_host->write_instr_class_statements_stats());
     return;
   }
 
@@ -455,8 +389,8 @@ void PFS_account::aggregate_statements(PFS_user *safe_user, PFS_host *safe_host)
       -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_statements(m_instr_class_statements_stats,
-                             safe_user->m_instr_class_statements_stats,
+    aggregate_all_statements(write_instr_class_statements_stats(),
+                             safe_user->write_instr_class_statements_stats(),
                              global_instr_class_statements_array);
     return;
   }
@@ -467,8 +401,8 @@ void PFS_account::aggregate_statements(PFS_user *safe_user, PFS_host *safe_host)
       Aggregate EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
       -  EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME
     */
-    aggregate_all_statements(m_instr_class_statements_stats,
-                             safe_host->m_instr_class_statements_stats);
+    aggregate_all_statements(write_instr_class_statements_stats(),
+                             safe_host->write_instr_class_statements_stats());
     return;
   }
 
@@ -476,11 +410,169 @@ void PFS_account::aggregate_statements(PFS_user *safe_user, PFS_host *safe_host)
     Aggregate EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
     -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
   */
-  aggregate_all_statements(m_instr_class_statements_stats,
+  aggregate_all_statements(write_instr_class_statements_stats(),
                            global_instr_class_statements_array);
   return;
 }
 
+void PFS_account::aggregate_transactions(PFS_user *safe_user, PFS_host *safe_host)
+{
+  if (read_instr_class_transactions_stats() == NULL)
+    return;
+
+  if (likely(safe_user != NULL && safe_host != NULL))
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_transactions(write_instr_class_transactions_stats(),
+                               safe_user->write_instr_class_transactions_stats(),
+                               safe_host->write_instr_class_transactions_stats());
+    return;
+  }
+
+  if (safe_user != NULL)
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_transactions(write_instr_class_transactions_stats(),
+                               safe_user->write_instr_class_transactions_stats(),
+                               &global_transaction_stat);
+    return;
+  }
+
+  if (safe_host != NULL)
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME
+    */
+    aggregate_all_transactions(write_instr_class_transactions_stats(),
+                               safe_host->write_instr_class_transactions_stats());
+    return;
+  }
+
+  /*
+    Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+    -  EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_transactions(write_instr_class_transactions_stats(),
+                             &global_transaction_stat);
+  return;
+}
+
+void PFS_account::aggregate_memory(bool alive, PFS_user *safe_user, PFS_host *safe_host)
+{
+  if (read_instr_class_memory_stats() == NULL)
+    return;
+
+  if (likely(safe_user != NULL && safe_host != NULL))
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      - MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME
+      - MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_memory(alive,
+                         write_instr_class_memory_stats(),
+                         safe_user->write_instr_class_memory_stats(),
+                         safe_host->write_instr_class_memory_stats());
+    return;
+  }
+
+  if (safe_user != NULL)
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      - MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME
+      - MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_memory(alive,
+                         write_instr_class_memory_stats(),
+                         safe_user->write_instr_class_memory_stats(),
+                         global_instr_class_memory_array);
+    return;
+  }
+
+  if (safe_host != NULL)
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      - MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME
+    */
+    aggregate_all_memory(alive,
+                         write_instr_class_memory_stats(),
+                         safe_host->write_instr_class_memory_stats());
+    return;
+  }
+
+  /*
+    Aggregate MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+    - MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_memory(alive,
+                       write_instr_class_memory_stats(),
+                       global_instr_class_memory_array);
+  return;
+}
+
+void PFS_account::aggregate_status(PFS_user *safe_user, PFS_host *safe_host)
+{
+  if (likely(safe_user != NULL && safe_host != NULL))
+  {
+    /*
+      Aggregate STATUS_BY_ACCOUNT to:
+      - STATUS_BY_USER
+      - STATUS_BY_HOST
+    */
+    safe_user->m_status_stats.aggregate(& m_status_stats);
+    safe_host->m_status_stats.aggregate(& m_status_stats);
+    m_status_stats.reset();
+    return;
+  }
+
+  if (safe_user != NULL)
+  {
+    /*
+      Aggregate STATUS_BY_ACCOUNT to:
+      - STATUS_BY_USER
+      - GLOBAL_STATUS (disabled: the aggregate_to() call below is commented out)
+    */
+    safe_user->m_status_stats.aggregate(& m_status_stats);
+    //m_status_stats.aggregate_to(& global_status_var);
+    m_status_stats.reset();
+    return;
+  }
+
+  if (safe_host != NULL)
+  {
+    /*
+      Aggregate STATUS_BY_ACCOUNT to:
+      - STATUS_BY_HOST
+    */
+    safe_host->m_status_stats.aggregate(& m_status_stats);
+    m_status_stats.reset();
+    return;
+  }
+
+  /*
+    Aggregate STATUS_BY_ACCOUNT to:
+    - GLOBAL_STATUS (disabled: the aggregate_to() call below is commented out)
+  */
+  //m_status_stats.aggregate_to(& global_status_var);
+  m_status_stats.reset();
+  return;
+}
+
 void PFS_account::aggregate_stats(PFS_user *safe_user, PFS_host *safe_host)
 {
   if (likely(safe_user != NULL && safe_host != NULL))
@@ -514,19 +606,42 @@ void PFS_account::release()
   dec_refcount();
 }
 
-PFS_account *sanitize_account(PFS_account *unsafe)
+void PFS_account::carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index)
 {
-  if ((&account_array[0] <= unsafe) &&
-      (unsafe < &account_array[account_max]))
-    return unsafe;
-  return NULL;
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  PFS_memory_stat_delta delta_buffer;
+  PFS_memory_stat_delta *remaining_delta;
+
+  event_name_array= write_instr_class_memory_stats();
+  stat= & event_name_array[index];
+  remaining_delta= stat->apply_delta(delta, &delta_buffer);
+
+  if (remaining_delta == NULL)
+    return;
+
+  if (m_user != NULL)
+  {
+    m_user->carry_memory_stat_delta(remaining_delta, index);
+    /* do not return, need to process m_host below */
+  }
+
+  if (m_host != NULL)
+  {
+    m_host->carry_memory_stat_delta(remaining_delta, index);
+    return;
+  }
+
+  carry_global_memory_stat_delta(remaining_delta, index);
 }
 
-void purge_account(PFS_thread *thread, PFS_account *account,
-                   PFS_user *safe_user, PFS_host *safe_host)
+PFS_account *sanitize_account(PFS_account *unsafe)
 {
-  account->aggregate(safe_user, safe_host);
+  return global_account_container.sanitize(unsafe);
+}
 
+void purge_account(PFS_thread *thread, PFS_account *account)
+{
   LF_PINS *pins= get_account_hash_pins(thread);
   if (unlikely(pins == NULL))
     return;
@@ -538,12 +653,13 @@ void purge_account(PFS_thread *thread, PFS_account *account,
                     account->m_key.m_key_length));
   if (entry && (entry != MY_ERRPTR))
   {
-    DBUG_ASSERT(*entry == account);
+    assert(*entry == account);
     if (account->get_refcount() == 0)
     {
       lf_hash_delete(&account_hash, pins,
                      account->m_key.m_hash_key,
                      account->m_key.m_key_length);
+      account->aggregate(false, account->m_user, account->m_host);
       if (account->m_user != NULL)
       {
         account->m_user->release();
@@ -554,37 +670,78 @@ void purge_account(PFS_thread *thread, PFS_account *account,
         account->m_host->release();
         account->m_host= NULL;
       }
-      account->m_lock.allocated_to_free();
+      global_account_container.deallocate(account);
     }
   }
 
   lf_hash_search_unpin(pins);
 }
 
-/** Purge non connected user@host, reset stats of connected user@host. */
+class Proc_purge_account
+  : public PFS_buffer_processor<PFS_account>
+{
+public:
+  Proc_purge_account(PFS_thread *thread)
+    : m_thread(thread)
+  {}
+
+  virtual void operator()(PFS_account *pfs)
+  {
+    PFS_user *user= sanitize_user(pfs->m_user);
+    PFS_host *host= sanitize_host(pfs->m_host);
+    pfs->aggregate(true, user, host);
+
+    if (pfs->get_refcount() == 0)
+      purge_account(m_thread, pfs);
+  }
+
+private:
+  PFS_thread *m_thread;
+};
+
+/** Purge non-connected accounts, reset stats of connected accounts. */
 void purge_all_account(void)
 {
   PFS_thread *thread= PFS_thread::get_current_thread();
   if (unlikely(thread == NULL))
     return;
 
-  PFS_account *pfs= account_array;
-  PFS_account *pfs_last= account_array + account_max;
-  PFS_user *user;
-  PFS_host *host;
+  Proc_purge_account proc(thread);
+  global_account_container.apply(proc);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+class Proc_update_accounts_derived_flags
+  : public PFS_buffer_processor<PFS_account>
+{
+public:
+  Proc_update_accounts_derived_flags(PFS_thread *thread)
+    : m_thread(thread)
+  {}
+
+  virtual void operator()(PFS_account *pfs)
   {
-    if (pfs->m_lock.is_populated())
+    if (pfs->m_username_length > 0 && pfs->m_hostname_length > 0)
     {
-      user= sanitize_user(pfs->m_user);
-      host= sanitize_host(pfs->m_host);
-      pfs->aggregate_stats(user, host);
-
-      if (pfs->get_refcount() == 0)
-        purge_account(thread, pfs, user, host);
+      lookup_setup_actor(m_thread,
+                         pfs->m_username, pfs->m_username_length,
+                         pfs->m_hostname, pfs->m_hostname_length,
+                         & pfs->m_enabled, & pfs->m_history);
+    }
+    else
+    {
+      pfs->m_enabled= true;
+      pfs->m_history= true;
     }
   }
+
+private:
+  PFS_thread *m_thread;
+};
+
+void update_accounts_derived_flags(PFS_thread *thread)
+{
+  Proc_update_accounts_derived_flags proc(thread);
+  global_account_container.apply(proc);
 }
 
 /** @} */
diff --git a/storage/perfschema/pfs_account.h b/storage/perfschema/pfs_account.h
index cd7b1520df5..c98ff5d2740 100644
--- a/storage/perfschema/pfs_account.h
+++ b/storage/perfschema/pfs_account.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -25,23 +25,26 @@
 
 /**
   @file storage/perfschema/pfs_account.h
-  Performance schema user@host (declarations).
+  Performance schema account (declarations).
 */
 
 #include "pfs_lock.h"
 #include "lf.h"
 #include "pfs_con_slice.h"
+#include "mysql_com.h" /* USERNAME_LENGTH */
 
 struct PFS_global_param;
 struct PFS_user;
 struct PFS_host;
 struct PFS_thread;
+struct PFS_memory_stat_delta;
 
 /**
   @addtogroup Performance_schema_buffers
   @{
 */
 
+/** Hash key for an account. */
 struct PFS_account_key
 {
   /**
@@ -53,6 +56,7 @@ struct PFS_account_key
   uint m_key_length;
 };
 
+/** Per account statistics. */
 struct PFS_ALIGNED PFS_account : PFS_connection_slice
 {
 public:
@@ -76,16 +80,25 @@ public:
     PFS_atomic::add_32(& m_refcount, -1);
   }
 
-  void aggregate(PFS_user *safe_user, PFS_host *safe_host);
+  void aggregate(bool alive, PFS_user *safe_user, PFS_host *safe_host);
   void aggregate_waits(PFS_user *safe_user, PFS_host *safe_host);
   void aggregate_stages(PFS_user *safe_user, PFS_host *safe_host);
   void aggregate_statements(PFS_user *safe_user, PFS_host *safe_host);
+  void aggregate_transactions(PFS_user *safe_user, PFS_host *safe_host);
+  void aggregate_memory(bool alive, PFS_user *safe_user, PFS_host *safe_host);
+  void aggregate_status(PFS_user *safe_user, PFS_host *safe_host);
   void aggregate_stats(PFS_user *safe_user, PFS_host *safe_host);
   void release(void);
 
+  void carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index);
+
   /** Internal lock. */
   pfs_lock m_lock;
   PFS_account_key m_key;
+  /** True if this account is enabled, per rules in table SETUP_ACTORS. */
+  bool m_enabled;
+  /** True if this account has history enabled, per rules in table SETUP_ACTORS. */
+  bool m_history;
   const char *m_username;
   uint m_username_length;
   const char *m_hostname;
@@ -101,7 +114,7 @@ private:
 
 int init_account(const PFS_global_param *param);
 void cleanup_account(void);
-int init_account_hash(void);
+int init_account_hash(const PFS_global_param *param);
 void cleanup_account_hash(void);
 
 PFS_account *
@@ -112,15 +125,9 @@ find_or_create_account(PFS_thread *thread,
 PFS_account *sanitize_account(PFS_account *unsafe);
 void purge_all_account(void);
 
+void update_accounts_derived_flags(PFS_thread *thread);
 
-/* For iterators and show status. */
-
-extern ulong account_max;
-extern ulong account_lost;
-
-/* Exposing the data directly, for iterators. */
-
-extern PFS_account *account_array;
+/* For show status. */
 
 extern LF_HASH account_hash;
 
diff --git a/storage/perfschema/pfs_atomic.h b/storage/perfschema/pfs_atomic.h
index 00d1197970b..52c96527c3f 100644
--- a/storage/perfschema/pfs_atomic.h
+++ b/storage/perfschema/pfs_atomic.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2009, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -35,101 +35,101 @@ class PFS_atomic
 {
 public:
   /** Atomic load. */
-  static inline int32 load_32(volatile int32 *ptr)
+  static inline int32 load_32(int32 *ptr)
   {
     return my_atomic_load32(ptr);
   }
 
   /** Atomic load. */
-  static inline int64 load_64(volatile int64 *ptr)
+  static inline int64 load_64(int64 *ptr)
   {
     return my_atomic_load64(ptr);
   }
 
   /** Atomic load. */
-  static inline uint32 load_u32(volatile uint32 *ptr)
+  static inline uint32 load_u32(uint32 *ptr)
   {
     return (uint32) my_atomic_load32((int32*) ptr);
   }
 
   /** Atomic load. */
-  static inline uint64 load_u64(volatile uint64 *ptr)
+  static inline uint64 load_u64(uint64 *ptr)
   {
     return (uint64) my_atomic_load64((int64*) ptr);
   }
 
   /** Atomic store. */
-  static inline void store_32(volatile int32 *ptr, int32 value)
+  static inline void store_32(int32 *ptr, int32 value)
   {
     my_atomic_store32(ptr, value);
   }
 
   /** Atomic store. */
-  static inline void store_64(volatile int64 *ptr, int64 value)
+  static inline void store_64(int64 *ptr, int64 value)
   {
     my_atomic_store64(ptr, value);
   }
 
   /** Atomic store. */
-  static inline void store_u32(volatile uint32 *ptr, uint32 value)
+  static inline void store_u32(uint32 *ptr, uint32 value)
   {
     my_atomic_store32((int32*) ptr, (int32) value);
   }
 
   /** Atomic store. */
-  static inline void store_u64(volatile uint64 *ptr, uint64 value)
+  static inline void store_u64(uint64 *ptr, uint64 value)
   {
     my_atomic_store64((int64*) ptr, (int64) value);
   }
 
   /** Atomic add. */
-  static inline int32 add_32(volatile int32 *ptr, int32 value)
+  static inline int32 add_32(int32 *ptr, int32 value)
   {
     return my_atomic_add32(ptr, value);
   }
 
   /** Atomic add. */
-  static inline int64 add_64(volatile int64 *ptr, int64 value)
+  static inline int64 add_64(int64 *ptr, int64 value)
   {
     return my_atomic_add64(ptr, value);
   }
 
   /** Atomic add. */
-  static inline uint32 add_u32(volatile uint32 *ptr, uint32 value)
+  static inline uint32 add_u32(uint32 *ptr, uint32 value)
   {
     return (uint32) my_atomic_add32((int32*) ptr, (int32) value);
   }
 
   /** Atomic add. */
-  static inline uint64 add_u64(volatile uint64 *ptr, uint64 value)
+  static inline uint64 add_u64(uint64 *ptr, uint64 value)
   {
     return (uint64) my_atomic_add64((int64*) ptr, (int64) value);
   }
 
   /** Atomic compare and swap. */
-  static inline bool cas_32(volatile int32 *ptr, int32 *old_value,
+  static inline bool cas_32(int32 *ptr, int32 *old_value,
                             int32 new_value)
   {
     return my_atomic_cas32(ptr, old_value, new_value);
   }
 
   /** Atomic compare and swap. */
-  static inline bool cas_64(volatile int64 *ptr, int64 *old_value,
+  static inline bool cas_64(int64 *ptr, int64 *old_value,
                             int64 new_value)
   {
     return my_atomic_cas64(ptr, old_value, new_value);
   }
 
   /** Atomic compare and swap. */
-  static inline bool cas_u32(volatile uint32 *ptr, uint32 *old_value,
+  static inline bool cas_u32(uint32 *ptr, uint32 *old_value,
                              uint32 new_value)
   {
     return my_atomic_cas32((int32*) ptr, (int32*) old_value,
-                            (uint32) new_value);
+                           (uint32) new_value);
   }
 
   /** Atomic compare and swap. */
-  static inline bool cas_u64(volatile uint64 *ptr, uint64 *old_value,
+  static inline bool cas_u64(uint64 *ptr, uint64 *old_value,
                              uint64 new_value)
   {
     return my_atomic_cas64((int64*) ptr, (int64*) old_value,
diff --git a/storage/perfschema/pfs_autosize.cc b/storage/perfschema/pfs_autosize.cc
index e15a85fe2d6..65d0458220a 100644
--- a/storage/perfschema/pfs_autosize.cc
+++ b/storage/perfschema/pfs_autosize.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -30,47 +30,17 @@
 #include "pfs_server.h"
 #include "set_var.h"
 
+#include "my_thread.h" /* For pthread_t */
+/* Make sure HAVE_PSI_XXX_INTERFACE flags are set */
+#include "mysql/psi/psi.h"
+
 #include <algorithm>
 using std::min;
 using std::max;
 
-static const ulong fixed_mutex_instances= 500;
-static const ulong fixed_rwlock_instances= 200;
-static const ulong fixed_cond_instances= 50;
-static const ulong fixed_file_instances= 200;
-static const ulong fixed_socket_instances= 10;
-static const ulong fixed_thread_instances= 50;
-
-static const ulong mutex_per_connection= 3;
-static const ulong rwlock_per_connection= 1;
-static const ulong cond_per_connection= 2;
-static const ulong file_per_connection= 0;
-static const ulong socket_per_connection= 1;
-static const ulong thread_per_connection= 1;
-
-static const ulong mutex_per_handle= 0;
-static const ulong rwlock_per_handle= 0;
-static const ulong cond_per_handle= 0;
-static const ulong file_per_handle= 0;
-static const ulong socket_per_handle= 0;
-static const ulong thread_per_handle= 0;
-
-static const ulong mutex_per_share= 5;
-static const ulong rwlock_per_share= 3;
-static const ulong cond_per_share= 1;
-static const ulong file_per_share= 3;
-static const ulong socket_per_share= 0;
-static const ulong thread_per_share= 0;
-
+/** Performance schema sizing heuristics. */
 struct PFS_sizing_data
 {
-  /** Default value for @c PFS_param.m_account_sizing. */
-  ulong m_account_sizing;
-  /** Default value for @c PFS_param.m_user_sizing. */
-  ulong m_user_sizing;
-  /** Default value for @c PFS_param.m_host_sizing. */
-  ulong m_host_sizing;
-
   /** Default value for @c PFS_param.m_events_waits_history_sizing. */
   ulong m_events_waits_history_sizing;
   /** Default value for @c PFS_param.m_events_waits_history_long_sizing. */
@@ -83,103 +53,46 @@ struct PFS_sizing_data
   ulong m_events_statements_history_sizing;
   /** Default value for @c PFS_param.m_events_statements_history_long_sizing. */
   ulong m_events_statements_history_long_sizing;
+  /** Default value for @c PFS_param.m_events_transactions_history_sizing. */
+  ulong m_events_transactions_history_sizing;
+  /** Default value for @c PFS_param.m_events_transactions_history_long_sizing. */
+  ulong m_events_transactions_history_long_sizing;
   /** Default value for @c PFS_param.m_digest_sizing. */
   ulong m_digest_sizing;
   /** Default value for @c PFS_param.m_session_connect_attrs_sizing. */
   ulong m_session_connect_attrs_sizing;
-
-  /**
-    Minimum number of tables to keep statistics for.
-    On small deployments, all the tables can fit into the table definition cache,
-    and this value can be 0.
-    On big deployments, the table definition cache is only a subset of all the tables
-    in the database, which are accounted for here.
-  */
-  ulong m_min_number_of_tables;
-
-  /**
-    Load factor for 'volatile' objects (mutexes, table handles, ...).
-    Instrumented objects that:
-    - use little memory
-    - are created/destroyed very frequently
-    should be stored in a low density (mostly empty) memory buffer,
-    to optimize for speed.
-  */
-  float m_load_factor_volatile;
-  /**
-    Load factor for 'normal' objects (files).
-    Instrumented objects that:
-    - use a medium amount of memory
-    - are created/destroyed 
-    should be stored in a medium density memory buffer,
-    as a trade off between space and speed.
-  */
-  float m_load_factor_normal;
-  /**
-    Load factor for 'static' objects (table shares).
-    Instrumented objects that:
-    - use a lot of memory
-    - are created/destroyed very rarely
-    can be stored in a high density (mostly packed) memory buffer,
-    to optimize for space.
-  */
-  float m_load_factor_static;
 };
 
 PFS_sizing_data small_data=
 {
-  /* Account / user / host */
-  10, 5, 20,
   /* History sizes */
-  10, 100, 10, 100, 10, 100,
+  5, 100, 5, 100, 5, 100, 5, 100,
   /* Digests */
   1000,
   /* Session connect attrs. */
-  512,
-  /* Min tables */
-  200,
-  /* Load factors */
-  0.90f, 0.90f, 0.90f
+  512
 };
 
 PFS_sizing_data medium_data=
 {
-  /* Account / user / host */
-  100, 100, 100,
   /* History sizes */
-  20, 1000, 20, 1000, 20, 1000,
+  10, 1000, 10, 1000, 10, 1000, 10, 1000,
   /* Digests */
   5000,
   /* Session connect attrs. */
-  512,
-  /* Min tables */
-  500,
-  /* Load factors */
-  0.70f, 0.80f, 0.90f
+  512
 };
 
 PFS_sizing_data large_data=
 {
-  /* Account / user / host */
-  100, 100, 100,
   /* History sizes */
-  20, 10000, 20, 10000, 20, 10000,
+  10, 10000, 10, 10000, 10, 10000, 10, 10000,
   /* Digests */
   10000,
   /* Session connect attrs. */
-  512,
-  /* Min tables */
-  10000,
-  /* Load factors */
-  0.50f, 0.65f, 0.80f
+  512
 };
 
-static inline ulong apply_load_factor(ulong raw_value, float factor)
-{
-  float value = ((float) raw_value) / factor;
-  return (ulong) ceil(value);
-}
-
 PFS_sizing_data *estimate_hints(PFS_global_param *param)
 {
   if ((param->m_hints.m_max_connections <= MAX_CONNECTIONS_DEFAULT) &&
@@ -204,47 +117,6 @@ PFS_sizing_data *estimate_hints(PFS_global_param *param)
 
 static void apply_heuristic(PFS_global_param *p, PFS_sizing_data *h)
 {
-  ulong count;
-  ulong con = p->m_hints.m_max_connections;
-  ulong handle = p->m_hints.m_table_open_cache;
-  ulong share = p->m_hints.m_table_definition_cache;
-  ulong file = p->m_hints.m_open_files_limit;
-
-  if (p->m_table_sizing < 0)
-  {
-    count= handle;
-
-    SYSVAR_AUTOSIZE(p->m_table_sizing,
-                    apply_load_factor(count, h->m_load_factor_volatile));
-  }
-
-  if (p->m_table_share_sizing < 0)
-  {
-    count= share;
-
-    count= max<ulong>(count, h->m_min_number_of_tables);
-    SYSVAR_AUTOSIZE(p->m_table_share_sizing,
-                    apply_load_factor(count, h->m_load_factor_static));
-  }
-
-  if (p->m_account_sizing < 0)
-  {
-    SYSVAR_AUTOSIZE(p->m_account_sizing,
-                    h->m_account_sizing);
-  }
-
-  if (p->m_user_sizing < 0)
-  {
-    SYSVAR_AUTOSIZE(p->m_user_sizing,
-                    h->m_user_sizing);
-  }
-
-  if (p->m_host_sizing < 0)
-  {
-    SYSVAR_AUTOSIZE(p->m_host_sizing,
-                    h->m_host_sizing);
-  }
-
   if (p->m_events_waits_history_sizing < 0)
   {
     SYSVAR_AUTOSIZE(p->m_events_waits_history_sizing,
@@ -287,107 +159,157 @@ static void apply_heuristic(PFS_global_param *p, PFS_sizing_data *h)
                     h->m_digest_sizing);
   }
 
-  if (p->m_session_connect_attrs_sizing < 0)
+  if (p->m_events_transactions_history_sizing < 0)
   {
-    SYSVAR_AUTOSIZE(p->m_session_connect_attrs_sizing,
-                    h->m_session_connect_attrs_sizing);
+    SYSVAR_AUTOSIZE(p->m_events_transactions_history_sizing,
+                    h->m_events_transactions_history_sizing);
   }
 
-  if (p->m_mutex_sizing < 0)
+  if (p->m_events_transactions_history_long_sizing < 0)
   {
-    count= fixed_mutex_instances
-      + con * mutex_per_connection
-      + handle * mutex_per_handle
-      + share * mutex_per_share;
-
-    SYSVAR_AUTOSIZE(p->m_mutex_sizing,
-                    apply_load_factor(count, h->m_load_factor_volatile));
+    SYSVAR_AUTOSIZE(p->m_events_transactions_history_long_sizing,
+                    h->m_events_transactions_history_long_sizing);
   }
 
-  if (p->m_rwlock_sizing < 0)
-  {
-    count= fixed_rwlock_instances
-      + con * rwlock_per_connection
-      + handle * rwlock_per_handle
-      + share * rwlock_per_share;
-
-    SYSVAR_AUTOSIZE(p->m_rwlock_sizing,
-                    apply_load_factor(count, h->m_load_factor_volatile));
-  }
-
-  if (p->m_cond_sizing < 0)
-  {
-    ulong count;
-    count= fixed_cond_instances
-      + con * cond_per_connection
-      + handle * cond_per_handle
-      + share * cond_per_share;
-
-    SYSVAR_AUTOSIZE(p->m_cond_sizing,
-                    apply_load_factor(count, h->m_load_factor_volatile));
-  }
-
-  if (p->m_file_sizing < 0)
+  if (p->m_session_connect_attrs_sizing < 0)
   {
-    count= fixed_file_instances
-      + con * file_per_connection
-      + handle * file_per_handle
-      + share * file_per_share;
-
-    count= max<ulong>(count, file);
-    SYSVAR_AUTOSIZE(p->m_file_sizing,
-                    apply_load_factor(count, h->m_load_factor_normal));
+    SYSVAR_AUTOSIZE(p->m_session_connect_attrs_sizing,
+                    h->m_session_connect_attrs_sizing);
   }
+}
 
-  if (p->m_socket_sizing < 0)
+void pfs_automated_sizing(PFS_global_param *param)
+{ /* Enabled: zero the sizings of compiled-out instrumentations, then apply the sizing heuristic. Disabled: autosize every sizing to 0. */
+  if (param->m_enabled)
   {
-    count= fixed_socket_instances
-      + con * socket_per_connection
-      + handle * socket_per_handle
-      + share * socket_per_share;
-
-    SYSVAR_AUTOSIZE(p->m_socket_sizing,
-                    apply_load_factor(count, h->m_load_factor_volatile));
+#ifndef HAVE_PSI_MUTEX_INTERFACE
+    param->m_mutex_class_sizing= 0; /* interface not compiled in: plain assignment, unlike the SYSVAR_AUTOSIZE calls in the disabled branch */
+    param->m_mutex_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_RWLOCK_INTERFACE
+    param->m_rwlock_class_sizing= 0;
+    param->m_rwlock_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_COND_INTERFACE
+    param->m_cond_class_sizing= 0;
+    param->m_cond_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_FILE_INTERFACE
+    param->m_file_class_sizing= 0;
+    param->m_file_sizing= 0;
+    param->m_file_handle_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_TABLE_INTERFACE
+    param->m_table_share_sizing= 0;
+    param->m_table_sizing= 0;
+    param->m_table_lock_stat_sizing= 0;
+    param->m_index_stat_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_SOCKET_INTERFACE
+    param->m_socket_class_sizing= 0;
+    param->m_socket_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_STAGE_INTERFACE
+    param->m_stage_class_sizing= 0;
+    param->m_events_stages_history_sizing= 0;
+    param->m_events_stages_history_long_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_STATEMENT_INTERFACE
+    param->m_statement_class_sizing= 0;
+    param->m_events_statements_history_sizing= 0;
+    param->m_events_statements_history_long_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_SP_INTERFACE
+    param->m_program_sizing= 0;
+    if (param->m_statement_stack_sizing > 1) /* without the SP interface the statement stack is capped at 1 entry */
+      param->m_statement_stack_sizing= 1;
+#endif
+
+#ifndef HAVE_PSI_PS_INTERFACE
+    param->m_prepared_stmt_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_STATEMENT_DIGEST_INTERFACE
+    param->m_digest_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_METADATA_INTERFACE
+    param->m_metadata_lock_sizing= 0;
+#endif
+
+#ifndef HAVE_PSI_MEMORY_INTERFACE
+    param->m_memory_class_sizing= 0;
+#endif
+
+    PFS_sizing_data *heuristic;
+    heuristic= estimate_hints(param); /* selects a PFS_sizing_data profile from param->m_hints */
+    apply_heuristic(param, heuristic); /* autosizes the sizings that are still negative */
+
+    assert(param->m_events_waits_history_sizing >= 0); /* history and connect-attrs sizings must be non-negative from here on */
+    assert(param->m_events_waits_history_long_sizing >= 0);
+    assert(param->m_events_stages_history_sizing >= 0);
+    assert(param->m_events_stages_history_long_sizing >= 0);
+    assert(param->m_events_statements_history_sizing >= 0);
+    assert(param->m_events_statements_history_long_sizing >= 0);
+    assert(param->m_events_transactions_history_sizing >= 0);
+    assert(param->m_events_transactions_history_long_sizing >= 0);
+    assert(param->m_session_connect_attrs_sizing >= 0);
   }
-
-  if (p->m_thread_sizing < 0)
+  else
   {
-    count= fixed_thread_instances
-      + con * thread_per_connection
-      + handle * thread_per_handle
-      + share * thread_per_share;
-
-    SYSVAR_AUTOSIZE(p->m_thread_sizing,
-                    apply_load_factor(count, h->m_load_factor_volatile));
+    /*
+      The Performance Schema is disabled. Set the instrument sizings to zero to
+      disable all instrumentation while retaining support for the status and
+      system variable tables, the host cache table and the replication tables.
+    */
+    SYSVAR_AUTOSIZE(param->m_mutex_class_sizing, 0); /* class sizings and instance sizings are both zeroed below */
+    SYSVAR_AUTOSIZE(param->m_rwlock_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_cond_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_thread_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_table_share_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_table_lock_stat_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_index_stat_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_file_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_mutex_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_rwlock_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_cond_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_thread_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_table_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_file_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_file_handle_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_socket_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_socket_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_waits_history_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_waits_history_long_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_setup_actor_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_setup_object_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_host_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_user_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_account_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_stage_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_stages_history_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_stages_history_long_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_statement_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_statements_history_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_statements_history_long_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_digest_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_program_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_prepared_stmt_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_transactions_history_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_events_transactions_history_long_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_session_connect_attrs_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_statement_stack_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_memory_class_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_metadata_lock_sizing, 0);
+    SYSVAR_AUTOSIZE(param->m_max_digest_length, 0); /* the two text/digest length limits are zeroed along with the sizings */
+    SYSVAR_AUTOSIZE(param->m_max_sql_text_length, 0);
   }
 }
-
-void pfs_automated_sizing(PFS_global_param *param)
-{
-  PFS_sizing_data *heuristic;
-  heuristic= estimate_hints(param);
-  apply_heuristic(param, heuristic);
-
-  DBUG_ASSERT(param->m_account_sizing >= 0);
-  DBUG_ASSERT(param->m_digest_sizing >= 0);
-  DBUG_ASSERT(param->m_host_sizing >= 0);
-  DBUG_ASSERT(param->m_user_sizing >= 0);
-
-  DBUG_ASSERT(param->m_events_waits_history_sizing >= 0);
-  DBUG_ASSERT(param->m_events_waits_history_long_sizing >= 0);
-  DBUG_ASSERT(param->m_events_stages_history_sizing >= 0);
-  DBUG_ASSERT(param->m_events_stages_history_long_sizing >= 0);
-  DBUG_ASSERT(param->m_events_statements_history_sizing >= 0);
-  DBUG_ASSERT(param->m_events_statements_history_long_sizing >= 0);
-  DBUG_ASSERT(param->m_session_connect_attrs_sizing >= 0);
-
-  DBUG_ASSERT(param->m_mutex_sizing >= 0);
-  DBUG_ASSERT(param->m_rwlock_sizing >= 0);
-  DBUG_ASSERT(param->m_cond_sizing >= 0);
-  DBUG_ASSERT(param->m_file_sizing >= 0);
-  DBUG_ASSERT(param->m_socket_sizing >= 0);
-  DBUG_ASSERT(param->m_thread_sizing >= 0);
-  DBUG_ASSERT(param->m_table_sizing >= 0);
-  DBUG_ASSERT(param->m_table_share_sizing >= 0);
-}
-
diff --git a/storage/perfschema/pfs_buffer_container.cc b/storage/perfschema/pfs_buffer_container.cc
new file mode 100644
index 00000000000..7d1d74541d1
--- /dev/null
+++ b/storage/perfschema/pfs_buffer_container.cc
@@ -0,0 +1,883 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include "my_global.h"
+#include "pfs_global.h"
+#include "pfs_lock.h"
+#include "pfs_account.h"
+#include "pfs_user.h"
+#include "pfs_host.h"
+#include "pfs_buffer_container.h"
+#include "pfs_builtin_memory.h"
+/* Global containers on default allocators: each PFS_buffer_default_allocator<T> is built from a builtin_memory_* object and handed to the matching container. */
+PFS_buffer_default_allocator<PFS_mutex> default_mutex_allocator(& builtin_memory_mutex);
+PFS_mutex_container global_mutex_container(& default_mutex_allocator);
+
+PFS_buffer_default_allocator<PFS_rwlock> default_rwlock_allocator(& builtin_memory_rwlock);
+PFS_rwlock_container global_rwlock_container(& default_rwlock_allocator);
+
+PFS_buffer_default_allocator<PFS_cond> default_cond_allocator(& builtin_memory_cond);
+PFS_cond_container global_cond_container(& default_cond_allocator);
+
+PFS_buffer_default_allocator<PFS_file> default_file_allocator(& builtin_memory_file);
+PFS_file_container global_file_container(& default_file_allocator);
+
+PFS_buffer_default_allocator<PFS_socket> default_socket_allocator(& builtin_memory_socket);
+PFS_socket_container global_socket_container(& default_socket_allocator);
+
+PFS_buffer_default_allocator<PFS_metadata_lock> default_mdl_allocator(& builtin_memory_mdl);
+PFS_mdl_container global_mdl_container(& default_mdl_allocator);
+
+PFS_buffer_default_allocator<PFS_setup_actor> default_setup_actor_allocator(& builtin_memory_setup_actor);
+PFS_setup_actor_container global_setup_actor_container(& default_setup_actor_allocator);
+
+PFS_buffer_default_allocator<PFS_setup_object> default_setup_object_allocator(& builtin_memory_setup_object);
+PFS_setup_object_container global_setup_object_container(& default_setup_object_allocator);
+
+PFS_buffer_default_allocator<PFS_table> default_table_allocator(& builtin_memory_table);
+PFS_table_container global_table_container(& default_table_allocator);
+
+PFS_buffer_default_allocator<PFS_table_share> default_table_share_allocator(& builtin_memory_table_share);
+PFS_table_share_container global_table_share_container(& default_table_share_allocator);
+
+PFS_buffer_default_allocator<PFS_table_share_index> default_table_share_index_allocator(& builtin_memory_table_share_index);
+PFS_table_share_index_container global_table_share_index_container(& default_table_share_index_allocator);
+
+PFS_buffer_default_allocator<PFS_table_share_lock> default_table_share_lock_allocator(& builtin_memory_table_share_lock);
+PFS_table_share_lock_container global_table_share_lock_container(& default_table_share_lock_allocator);
+
+PFS_buffer_default_allocator<PFS_program> default_program_allocator(& builtin_memory_program);
+PFS_program_container global_program_container(& default_program_allocator);
+
+PFS_buffer_default_allocator<PFS_prepared_stmt> default_prepared_stmt_allocator(& builtin_memory_prepared_stmt);
+PFS_prepared_stmt_container global_prepared_stmt_container(& default_prepared_stmt_allocator);
+
+int PFS_account_allocator::alloc_array(PFS_account_array *array)
+{ /* Allocates array->m_max PFS_account entries plus their per-class stat arrays; returns 0 on success, 1 if any allocation fails. */
+  size_t size= array->m_max;
+  size_t index;
+  size_t waits_sizing= size * wait_class_max; /* each account gets a slice of wait_class_max stats (see the wiring loop below) */
+  size_t stages_sizing= size * stage_class_max;
+  size_t statements_sizing= size * statement_class_max;
+  size_t transactions_sizing= size * transaction_class_max;
+  size_t memory_sizing= size * memory_class_max;
+
+  array->m_ptr= NULL; /* start from a known state: every pointer NULL */
+  array->m_full= true; /* stays true on every failure return; cleared only on success */
+  array->m_instr_class_waits_array= NULL;
+  array->m_instr_class_stages_array= NULL;
+  array->m_instr_class_statements_array= NULL;
+  array->m_instr_class_transactions_array= NULL;
+  array->m_instr_class_memory_array= NULL;
+
+  if (size > 0)
+  {
+    array->m_ptr=
+      PFS_MALLOC_ARRAY(& builtin_memory_account,
+                       size, sizeof(PFS_account), PFS_account, MYF(MY_ZEROFILL));
+    if (array->m_ptr == NULL)
+      return 1;
+  }
+
+  if (waits_sizing > 0)
+  {
+    array->m_instr_class_waits_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_account_waits,
+                       waits_sizing, sizeof(PFS_single_stat), PFS_single_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_waits_array == NULL)
+      return 1; /* NOTE(review): earlier allocations are not released here; presumably the caller runs free_array() on failure -- confirm */
+
+    for (index=0; index < waits_sizing; index++)
+      array->m_instr_class_waits_array[index].reset(); /* explicit reset() on top of MY_ZEROFILL */
+  }
+
+  if (stages_sizing > 0)
+  {
+    array->m_instr_class_stages_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_account_stages,
+                       stages_sizing, sizeof(PFS_stage_stat), PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_stages_array == NULL)
+      return 1;
+
+    for (index=0; index < stages_sizing; index++)
+      array->m_instr_class_stages_array[index].reset();
+  }
+
+  if (statements_sizing > 0)
+  {
+    array->m_instr_class_statements_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_account_statements,
+                       statements_sizing, sizeof(PFS_statement_stat), PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_statements_array == NULL)
+      return 1;
+
+    for (index=0; index < statements_sizing; index++)
+      array->m_instr_class_statements_array[index].reset();
+  }
+
+  if (transactions_sizing > 0)
+  {
+    array->m_instr_class_transactions_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_account_transactions,
+                       transactions_sizing, sizeof(PFS_transaction_stat), PFS_transaction_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_transactions_array == NULL)
+      return 1;
+
+    for (index=0; index < transactions_sizing; index++)
+      array->m_instr_class_transactions_array[index].reset();
+  }
+
+  if (memory_sizing > 0)
+  {
+    array->m_instr_class_memory_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_account_memory,
+                       memory_sizing, sizeof(PFS_memory_stat), PFS_memory_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_memory_array == NULL)
+      return 1;
+
+    for (index=0; index < memory_sizing; index++)
+      array->m_instr_class_memory_array[index].reset();
+  }
+
+  for (index= 0; index < size; index++) /* point each account at its own slice of the shared stat arrays */
+  {
+    array->m_ptr[index].set_instr_class_waits_stats(
+      & array->m_instr_class_waits_array[index * wait_class_max]); /* NOTE(review): if a *_class_max is 0 the matching array is still NULL here */
+    array->m_ptr[index].set_instr_class_stages_stats(
+      & array->m_instr_class_stages_array[index * stage_class_max]);
+    array->m_ptr[index].set_instr_class_statements_stats(
+      & array->m_instr_class_statements_array[index * statement_class_max]);
+    array->m_ptr[index].set_instr_class_transactions_stats(
+      & array->m_instr_class_transactions_array[index * transaction_class_max]);
+    array->m_ptr[index].set_instr_class_memory_stats(
+      & array->m_instr_class_memory_array[index * memory_class_max]);
+  }
+
+  array->m_full= false; /* all allocations succeeded */
+  return 0;
+}
+
+void PFS_account_allocator::free_array(PFS_account_array *array)
+{ /* Releases the account array and its per-class stat arrays, then NULLs each pointer. */
+  size_t size= array->m_max; /* element counts are recomputed with the same formulas as alloc_array */
+  size_t waits_sizing= size * wait_class_max;
+  size_t stages_sizing= size * stage_class_max;
+  size_t statements_sizing= size * statement_class_max;
+  size_t transactions_sizing= size * transaction_class_max;
+  size_t memory_sizing= size * memory_class_max;
+
+  PFS_FREE_ARRAY(& builtin_memory_account,
+                 size, sizeof(PFS_account), array->m_ptr); /* same builtin_memory_* object as the matching allocation */
+  array->m_ptr= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_account_waits,
+                 waits_sizing, sizeof(PFS_single_stat),
+                 array->m_instr_class_waits_array);
+  array->m_instr_class_waits_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_account_stages,
+                 stages_sizing, sizeof(PFS_stage_stat),
+                 array->m_instr_class_stages_array);
+  array->m_instr_class_stages_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_account_statements,
+                 statements_sizing, sizeof(PFS_statement_stat),
+                 array->m_instr_class_statements_array);
+  array->m_instr_class_statements_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_account_transactions,
+                 transactions_sizing, sizeof(PFS_transaction_stat),
+                 array->m_instr_class_transactions_array);
+  array->m_instr_class_transactions_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_account_memory,
+                 memory_sizing, sizeof(PFS_memory_stat),
+                 array->m_instr_class_memory_array);
+  array->m_instr_class_memory_array= NULL;
+}
+
+PFS_account_allocator account_allocator; /* allocator instance whose alloc_array/free_array are defined above */
+PFS_account_container global_account_container(& account_allocator); /* global account container, built on account_allocator */
+
+int PFS_host_allocator::alloc_array(PFS_host_array *array)
+{ /* Allocates array->m_max PFS_host entries plus their per-class stat arrays; returns 0 on success, 1 if any allocation fails. */
+  size_t size= array->m_max;
+  PFS_host *pfs;
+  size_t index;
+  size_t waits_sizing= size * wait_class_max; /* each host gets a slice of wait_class_max stats (see the wiring loop below) */
+  size_t stages_sizing= size * stage_class_max;
+  size_t statements_sizing= size * statement_class_max;
+  size_t transactions_sizing= size * transaction_class_max;
+  size_t memory_sizing= size * memory_class_max;
+
+  array->m_ptr= NULL; /* start from a known state: every pointer NULL */
+  array->m_full= true; /* stays true on every failure return; cleared only on success */
+  array->m_instr_class_waits_array= NULL;
+  array->m_instr_class_stages_array= NULL;
+  array->m_instr_class_statements_array= NULL;
+  array->m_instr_class_transactions_array= NULL;
+  array->m_instr_class_memory_array= NULL;
+
+  if (size > 0)
+  {
+    array->m_ptr=
+      PFS_MALLOC_ARRAY(& builtin_memory_host,
+                       size, sizeof(PFS_host), PFS_host, MYF(MY_ZEROFILL));
+    if (array->m_ptr == NULL)
+      return 1;
+  }
+
+  if (waits_sizing > 0)
+  {
+    array->m_instr_class_waits_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_host_waits,
+                       waits_sizing, sizeof(PFS_single_stat), PFS_single_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_waits_array == NULL)
+      return 1; /* NOTE(review): earlier allocations are not released here; presumably the caller runs free_array() on failure -- confirm */
+
+    for (index=0; index < waits_sizing; index++)
+      array->m_instr_class_waits_array[index].reset(); /* explicit reset() on top of MY_ZEROFILL */
+  }
+
+  if (stages_sizing > 0)
+  {
+    array->m_instr_class_stages_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_host_stages,
+                       stages_sizing, sizeof(PFS_stage_stat), PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_stages_array == NULL)
+      return 1;
+
+    for (index=0; index < stages_sizing; index++)
+      array->m_instr_class_stages_array[index].reset();
+  }
+
+  if (statements_sizing > 0)
+  {
+    array->m_instr_class_statements_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_host_statements,
+                       statements_sizing, sizeof(PFS_statement_stat), PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_statements_array == NULL)
+      return 1;
+
+    for (index=0; index < statements_sizing; index++)
+      array->m_instr_class_statements_array[index].reset();
+  }
+
+  if (transactions_sizing > 0)
+  {
+    array->m_instr_class_transactions_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_host_transactions,
+                       transactions_sizing, sizeof(PFS_transaction_stat), PFS_transaction_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_transactions_array == NULL)
+      return 1;
+
+    for (index=0; index < transactions_sizing; index++)
+      array->m_instr_class_transactions_array[index].reset();
+  }
+
+  if (memory_sizing > 0)
+  {
+    array->m_instr_class_memory_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_host_memory,
+                       memory_sizing, sizeof(PFS_memory_stat), PFS_memory_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_memory_array == NULL)
+      return 1;
+
+    for (index=0; index < memory_sizing; index++)
+      array->m_instr_class_memory_array[index].reset();
+  }
+
+  for (index= 0; index < size; index++) /* point each host at its own slice of the shared stat arrays */
+  {
+    pfs= & array->m_ptr[index];
+
+    pfs->set_instr_class_waits_stats(
+      & array->m_instr_class_waits_array[index * wait_class_max]); /* NOTE(review): if a *_class_max is 0 the matching array is still NULL here */
+    pfs->set_instr_class_stages_stats(
+      & array->m_instr_class_stages_array[index * stage_class_max]);
+    pfs->set_instr_class_statements_stats(
+      & array->m_instr_class_statements_array[index * statement_class_max]);
+    pfs->set_instr_class_transactions_stats(
+      & array->m_instr_class_transactions_array[index * transaction_class_max]);
+    pfs->set_instr_class_memory_stats(
+      & array->m_instr_class_memory_array[index * memory_class_max]);
+  }
+
+  array->m_full= false; /* all allocations succeeded */
+  return 0;
+}
+
+void PFS_host_allocator::free_array(PFS_host_array *array)
+{ /* Releases the host array and its per-class stat arrays, then NULLs each pointer. */
+  size_t size= array->m_max; /* element counts are recomputed with the same formulas as alloc_array */
+  size_t waits_sizing= size * wait_class_max;
+  size_t stages_sizing= size * stage_class_max;
+  size_t statements_sizing= size * statement_class_max;
+  size_t transactions_sizing= size * transaction_class_max;
+  size_t memory_sizing= size * memory_class_max;
+
+  PFS_FREE_ARRAY(& builtin_memory_host,
+                 size, sizeof(PFS_host), array->m_ptr); /* same builtin_memory_* object as the matching allocation */
+  array->m_ptr= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_host_waits,
+                 waits_sizing, sizeof(PFS_single_stat),
+                 array->m_instr_class_waits_array);
+  array->m_instr_class_waits_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_host_stages,
+                 stages_sizing, sizeof(PFS_stage_stat),
+                 array->m_instr_class_stages_array);
+  array->m_instr_class_stages_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_host_statements,
+                 statements_sizing, sizeof(PFS_statement_stat),
+                 array->m_instr_class_statements_array);
+  array->m_instr_class_statements_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_host_transactions,
+                 transactions_sizing, sizeof(PFS_transaction_stat),
+                 array->m_instr_class_transactions_array);
+  array->m_instr_class_transactions_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_host_memory,
+                 memory_sizing, sizeof(PFS_memory_stat),
+                 array->m_instr_class_memory_array);
+  array->m_instr_class_memory_array= NULL;
+}
+
+PFS_host_allocator host_allocator; /* allocator instance whose alloc_array/free_array are defined above */
+PFS_host_container global_host_container(& host_allocator); /* global host container, built on host_allocator */
+
+int PFS_thread_allocator::alloc_array(PFS_thread_array *array)
+{ /* Allocates array->m_max PFS_thread entries plus their stat, history, connect-attrs, SQL-text and digest-token buffers; returns 0 on success, 1 on failure. */
+  size_t size= array->m_max;
+  PFS_thread *pfs;
+  PFS_events_statements *pfs_stmt;
+  unsigned char *pfs_tokens;
+
+  size_t index;
+  size_t waits_sizing= size * wait_class_max; /* per-class stats: one slice of *_class_max entries per thread */
+  size_t stages_sizing= size * stage_class_max;
+  size_t statements_sizing= size * statement_class_max;
+  size_t transactions_sizing= size * transaction_class_max;
+  size_t memory_sizing= size * memory_class_max;
+
+  size_t waits_history_sizing= size * events_waits_history_per_thread; /* event histories: a fixed number of entries per thread */
+  size_t stages_history_sizing= size * events_stages_history_per_thread;
+  size_t statements_history_sizing= size * events_statements_history_per_thread;
+  size_t statements_stack_sizing= size * statement_stack_max;
+  size_t transactions_history_sizing= size * events_transactions_history_per_thread;
+  size_t session_connect_attrs_sizing= size * session_connect_attrs_size_per_thread;
+
+  size_t current_sqltext_sizing= size * pfs_max_sqltext * statement_stack_max; /* pfs_max_sqltext chars per statement-stack entry */
+  size_t history_sqltext_sizing= size * pfs_max_sqltext * events_statements_history_per_thread;
+  size_t current_digest_tokens_sizing= size * pfs_max_digest_length * statement_stack_max; /* pfs_max_digest_length token bytes per statement-stack entry */
+  size_t history_digest_tokens_sizing= size * pfs_max_digest_length * events_statements_history_per_thread;
+
+  array->m_ptr= NULL; /* start from a known state: every pointer NULL */
+  array->m_full= true; /* stays true on every failure return; cleared only on success */
+  array->m_instr_class_waits_array= NULL;
+  array->m_instr_class_stages_array= NULL;
+  array->m_instr_class_statements_array= NULL;
+  array->m_instr_class_transactions_array= NULL;
+  array->m_instr_class_memory_array= NULL;
+
+  array->m_waits_history_array= NULL;
+  array->m_stages_history_array= NULL;
+  array->m_statements_history_array= NULL;
+  array->m_statements_stack_array= NULL;
+  array->m_transactions_history_array= NULL;
+  array->m_session_connect_attrs_array= NULL;
+
+  array->m_current_stmts_text_array= NULL;
+  array->m_current_stmts_digest_token_array= NULL;
+  array->m_history_stmts_text_array= NULL;
+  array->m_history_stmts_digest_token_array= NULL;
+
+  if (size > 0)
+  {
+    array->m_ptr=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread,
+                       size, sizeof(PFS_thread), PFS_thread, MYF(MY_ZEROFILL));
+    if (array->m_ptr == NULL)
+      return 1;
+  }
+
+  if (waits_sizing > 0)
+  {
+    array->m_instr_class_waits_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_waits,
+                       waits_sizing, sizeof(PFS_single_stat), PFS_single_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_waits_array == NULL)
+      return 1; /* NOTE(review): earlier allocations are not released here; presumably the caller runs free_array() on failure -- confirm */
+
+    for (index=0; index < waits_sizing; index++)
+      array->m_instr_class_waits_array[index].reset(); /* explicit reset() on top of MY_ZEROFILL */
+  }
+
+  if (stages_sizing > 0)
+  {
+    array->m_instr_class_stages_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_stages,
+                       stages_sizing, sizeof(PFS_stage_stat), PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_stages_array == NULL)
+      return 1;
+
+    for (index=0; index < stages_sizing; index++)
+      array->m_instr_class_stages_array[index].reset();
+  }
+
+  if (statements_sizing > 0)
+  {
+    array->m_instr_class_statements_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_statements,
+                       statements_sizing, sizeof(PFS_statement_stat), PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_statements_array == NULL)
+      return 1;
+
+    for (index=0; index < statements_sizing; index++)
+      array->m_instr_class_statements_array[index].reset();
+  }
+
+  if (transactions_sizing > 0)
+  {
+    array->m_instr_class_transactions_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_transactions,
+                       transactions_sizing, sizeof(PFS_transaction_stat), PFS_transaction_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_transactions_array == NULL)
+      return 1;
+
+    for (index=0; index < transactions_sizing; index++)
+      array->m_instr_class_transactions_array[index].reset();
+  }
+
+  if (memory_sizing > 0)
+  {
+    array->m_instr_class_memory_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_memory,
+                       memory_sizing, sizeof(PFS_memory_stat), PFS_memory_stat, MYF(MY_ZEROFILL));
+    if (array->m_instr_class_memory_array == NULL)
+      return 1;
+
+    for (index=0; index < memory_sizing; index++)
+      array->m_instr_class_memory_array[index].reset();
+  }
+
+  if (waits_history_sizing > 0) /* history arrays are zero-filled only; no per-element reset() */
+  {
+    array->m_waits_history_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_waits_history,
+                       waits_history_sizing, sizeof(PFS_events_waits), PFS_events_waits, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_waits_history_array == NULL))
+      return 1;
+  }
+
+  if (stages_history_sizing > 0)
+  {
+    array->m_stages_history_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_stages_history,
+                       stages_history_sizing, sizeof(PFS_events_stages), PFS_events_stages, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_stages_history_array == NULL))
+      return 1;
+  }
+
+  if (statements_history_sizing > 0)
+  {
+    array->m_statements_history_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_statements_history,
+                       statements_history_sizing, sizeof(PFS_events_statements), PFS_events_statements, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_statements_history_array == NULL))
+      return 1;
+  }
+
+  if (statements_stack_sizing > 0)
+  {
+    array->m_statements_stack_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_statements_stack,
+                       statements_stack_sizing, sizeof(PFS_events_statements), PFS_events_statements, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_statements_stack_array == NULL))
+      return 1;
+  }
+
+  if (transactions_history_sizing > 0)
+  {
+    array->m_transactions_history_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_thread_transaction_history,
+                       transactions_history_sizing, sizeof(PFS_events_transactions), PFS_events_transactions, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_transactions_history_array == NULL))
+      return 1;
+  }
+
+  if (session_connect_attrs_sizing > 0) /* from here on: char / unsigned char buffers obtained with pfs_malloc, sized as a single count */
+  {
+    array->m_session_connect_attrs_array=
+      (char *)pfs_malloc(& builtin_memory_thread_session_connect_attrs,
+                         session_connect_attrs_sizing, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_session_connect_attrs_array == NULL))
+      return 1;
+  }
+
+  if (current_sqltext_sizing > 0)
+  {
+    array->m_current_stmts_text_array=
+      (char *)pfs_malloc(& builtin_memory_thread_statements_stack_sqltext,
+                         current_sqltext_sizing, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_current_stmts_text_array == NULL))
+      return 1;
+  }
+
+  if (history_sqltext_sizing > 0)
+  {
+    array->m_history_stmts_text_array=
+      (char *)pfs_malloc(& builtin_memory_thread_statements_history_sqltext,
+                         history_sqltext_sizing, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_history_stmts_text_array == NULL))
+      return 1;
+  }
+
+  if (current_digest_tokens_sizing > 0)
+  {
+    array->m_current_stmts_digest_token_array=
+      (unsigned char *)pfs_malloc(& builtin_memory_thread_statements_stack_tokens,
+                                  current_digest_tokens_sizing, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_current_stmts_digest_token_array == NULL))
+      return 1;
+  }
+
+  if (history_digest_tokens_sizing > 0)
+  {
+    array->m_history_stmts_digest_token_array=
+      (unsigned char *)pfs_malloc(& builtin_memory_thread_statements_history_tokens,
+                                  history_digest_tokens_sizing, MYF(MY_ZEROFILL));
+    if (unlikely(array->m_history_stmts_digest_token_array == NULL))
+      return 1;
+  }
+
+  for (index= 0; index < size; index++) /* point each thread at its own slice of every shared array */
+  {
+    pfs= & array->m_ptr[index];
+
+    pfs->set_instr_class_waits_stats(
+      & array->m_instr_class_waits_array[index * wait_class_max]); /* NOTE(review): any array whose sizing was 0 is still NULL here */
+    pfs->set_instr_class_stages_stats(
+      & array->m_instr_class_stages_array[index * stage_class_max]);
+    pfs->set_instr_class_statements_stats(
+      & array->m_instr_class_statements_array[index * statement_class_max]);
+    pfs->set_instr_class_transactions_stats(
+      & array->m_instr_class_transactions_array[index * transaction_class_max]);
+    pfs->set_instr_class_memory_stats(
+      & array->m_instr_class_memory_array[index * memory_class_max]);
+
+    pfs->m_waits_history=
+      & array->m_waits_history_array[index * events_waits_history_per_thread];
+    pfs->m_stages_history=
+      & array->m_stages_history_array[index * events_stages_history_per_thread];
+    pfs->m_statements_history=
+      & array->m_statements_history_array[index * events_statements_history_per_thread];
+    pfs->m_statement_stack=
+      & array->m_statements_stack_array[index * statement_stack_max];
+    pfs->m_transactions_history=
+      & array->m_transactions_history_array[index * events_transactions_history_per_thread];
+    pfs->m_session_connect_attrs=
+      & array->m_session_connect_attrs_array[index * session_connect_attrs_size_per_thread]; /* offset into a char buffer */
+  }
+
+  for (index= 0; index < statements_stack_sizing; index++) /* index spans the stack entries of all threads (size * statement_stack_max) */
+  {
+    pfs_stmt= & array->m_statements_stack_array[index];
+
+    pfs_stmt->m_sqltext= & array->m_current_stmts_text_array[index * pfs_max_sqltext]; /* each entry gets its own pfs_max_sqltext-char text slot */
+
+    pfs_tokens= & array->m_current_stmts_digest_token_array[index * pfs_max_digest_length];
+    pfs_stmt->m_digest_storage.reset(pfs_tokens, pfs_max_digest_length); /* digest storage is given this entry's token slot and its length */
+  }
+
+  for (index= 0; index < statements_history_sizing; index++) /* same wiring for the statement history entries of all threads */
+  {
+    pfs_stmt= & array->m_statements_history_array[index];
+
+    pfs_stmt->m_sqltext= & array->m_history_stmts_text_array[index * pfs_max_sqltext];
+
+    pfs_tokens= & array->m_history_stmts_digest_token_array[index * pfs_max_digest_length];
+    pfs_stmt->m_digest_storage.reset(pfs_tokens, pfs_max_digest_length);
+  }
+
+  array->m_full= false; /* all allocations succeeded */
+  return 0;
+}
+
+/** Release every buffer owned by a page of PFS_thread records and reset its pointers. */
+void PFS_thread_allocator::free_array(PFS_thread_array *array)
+{
+  const size_t n_threads= array->m_max;
+  const size_t n_wait_stats= n_threads * wait_class_max;
+  const size_t n_stage_stats= n_threads * stage_class_max;
+  const size_t n_statement_stats= n_threads * statement_class_max;
+  const size_t n_transaction_stats= n_threads * transaction_class_max;
+  const size_t n_memory_stats= n_threads * memory_class_max;
+  /* Element counts of the history, statement stack and connection attribute buffers. */
+  const size_t n_wait_history= n_threads * events_waits_history_per_thread;
+  const size_t n_stage_history= n_threads * events_stages_history_per_thread;
+  const size_t n_statement_history= n_threads * events_statements_history_per_thread;
+  const size_t n_statement_stack= n_threads * statement_stack_max;
+  const size_t n_transaction_history= n_threads * events_transactions_history_per_thread;
+  const size_t n_connect_attrs= n_threads * session_connect_attrs_size_per_thread;
+  /* Sizes of the SQL text and digest token buffers. */
+  const size_t n_stack_sqltext= n_threads * pfs_max_sqltext * statement_stack_max;
+  const size_t n_history_sqltext= n_threads * pfs_max_sqltext * events_statements_history_per_thread;
+  const size_t n_stack_tokens= n_threads * pfs_max_digest_length * statement_stack_max;
+  const size_t n_history_tokens= n_threads * pfs_max_digest_length * events_statements_history_per_thread;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread, n_threads,
+                 sizeof(PFS_thread), array->m_ptr);
+  array->m_ptr= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_waits, n_wait_stats,
+                 sizeof(PFS_single_stat),
+                 array->m_instr_class_waits_array);
+  array->m_instr_class_waits_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_stages, n_stage_stats,
+                 sizeof(PFS_stage_stat),
+                 array->m_instr_class_stages_array);
+  array->m_instr_class_stages_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_statements, n_statement_stats,
+                 sizeof(PFS_statement_stat),
+                 array->m_instr_class_statements_array);
+  array->m_instr_class_statements_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_transactions, n_transaction_stats,
+                 sizeof(PFS_transaction_stat),
+                 array->m_instr_class_transactions_array);
+  array->m_instr_class_transactions_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_memory, n_memory_stats,
+                 sizeof(PFS_memory_stat),
+                 array->m_instr_class_memory_array);
+  array->m_instr_class_memory_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_waits_history, n_wait_history,
+                 sizeof(PFS_events_waits),
+                 array->m_waits_history_array);
+  array->m_waits_history_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_stages_history, n_stage_history,
+                 sizeof(PFS_events_stages),
+                 array->m_stages_history_array);
+  array->m_stages_history_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_statements_history, n_statement_history,
+                 sizeof(PFS_events_statements),
+                 array->m_statements_history_array);
+  array->m_statements_history_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_statements_stack, n_statement_stack,
+                 sizeof(PFS_events_statements),
+                 array->m_statements_stack_array);
+  array->m_statements_stack_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_thread_transaction_history, n_transaction_history,
+                 sizeof(PFS_events_transactions),
+                 array->m_transactions_history_array);
+  array->m_transactions_history_array= NULL;
+
+  pfs_free(& builtin_memory_thread_session_connect_attrs,
+           n_connect_attrs,
+           array->m_session_connect_attrs_array);
+  array->m_session_connect_attrs_array= NULL;
+
+  pfs_free(& builtin_memory_thread_statements_stack_sqltext,
+           n_stack_sqltext,
+           array->m_current_stmts_text_array);
+  array->m_current_stmts_text_array= NULL;
+
+  pfs_free(& builtin_memory_thread_statements_history_sqltext,
+           n_history_sqltext,
+           array->m_history_stmts_text_array);
+  array->m_history_stmts_text_array= NULL;
+
+  pfs_free(& builtin_memory_thread_statements_stack_tokens,
+           n_stack_tokens,
+           array->m_current_stmts_digest_token_array);
+  array->m_current_stmts_digest_token_array= NULL;
+
+  pfs_free(& builtin_memory_thread_statements_history_tokens,
+           n_history_tokens,
+           array->m_history_stmts_digest_token_array);
+  array->m_history_stmts_digest_token_array= NULL;
+}
+
+PFS_thread_allocator thread_allocator; /* Allocator handed to global_thread_container below. */
+PFS_thread_container global_thread_container(& thread_allocator); /* Global buffer of PFS_thread records. */
+
+/** Allocate a page of PFS_user records plus the per-class statistics they point to. */
+int PFS_user_allocator::alloc_array(PFS_user_array *array)
+{
+  const size_t n_users= array->m_max;
+  size_t i;
+  const size_t n_wait_stats= n_users * wait_class_max;
+  const size_t n_stage_stats= n_users * stage_class_max;
+  const size_t n_statement_stats= n_users * statement_class_max;
+  const size_t n_transaction_stats= n_users * transaction_class_max;
+  const size_t n_memory_stats= n_users * memory_class_max;
+  /* The page stays flagged full until every allocation below has succeeded. */
+  array->m_ptr= NULL;
+  array->m_full= true;
+  array->m_instr_class_waits_array= NULL;
+  array->m_instr_class_stages_array= NULL;
+  array->m_instr_class_statements_array= NULL;
+  array->m_instr_class_transactions_array= NULL;
+  array->m_instr_class_memory_array= NULL;
+
+  if (n_users > 0)
+  {
+    array->m_ptr=
+      PFS_MALLOC_ARRAY(& builtin_memory_user, n_users,
+                       sizeof(PFS_user), PFS_user, MYF(MY_ZEROFILL));
+    if (! array->m_ptr)
+      return 1;
+  }
+
+  if (n_wait_stats > 0)
+  {
+    array->m_instr_class_waits_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_user_waits, n_wait_stats,
+                       sizeof(PFS_single_stat), PFS_single_stat, MYF(MY_ZEROFILL));
+    if (! array->m_instr_class_waits_array)
+      return 1;
+
+    for (i= 0; i < n_wait_stats; ++i)
+      array->m_instr_class_waits_array[i].reset();
+  }
+
+  if (n_stage_stats > 0)
+  {
+    array->m_instr_class_stages_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_user_stages, n_stage_stats,
+                       sizeof(PFS_stage_stat), PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (! array->m_instr_class_stages_array)
+      return 1;
+
+    for (i= 0; i < n_stage_stats; ++i)
+      array->m_instr_class_stages_array[i].reset();
+  }
+
+  if (n_statement_stats > 0)
+  {
+    array->m_instr_class_statements_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_user_statements, n_statement_stats,
+                       sizeof(PFS_statement_stat), PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (! array->m_instr_class_statements_array)
+      return 1;
+
+    for (i= 0; i < n_statement_stats; ++i)
+      array->m_instr_class_statements_array[i].reset();
+  }
+
+  if (n_transaction_stats > 0)
+  {
+    array->m_instr_class_transactions_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_user_transactions, n_transaction_stats,
+                       sizeof(PFS_transaction_stat), PFS_transaction_stat, MYF(MY_ZEROFILL));
+    if (! array->m_instr_class_transactions_array)
+      return 1;
+
+    for (i= 0; i < n_transaction_stats; ++i)
+      array->m_instr_class_transactions_array[i].reset();
+  }
+
+  if (n_memory_stats > 0)
+  {
+    array->m_instr_class_memory_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_user_memory, n_memory_stats,
+                       sizeof(PFS_memory_stat), PFS_memory_stat, MYF(MY_ZEROFILL));
+    if (! array->m_instr_class_memory_array)
+      return 1;
+
+    for (i= 0; i < n_memory_stats; ++i)
+      array->m_instr_class_memory_array[i].reset();
+  }
+
+  for (i= 0; i < n_users; ++i)
+  {
+    PFS_user *user= & array->m_ptr[i];
+
+    user->set_instr_class_waits_stats(
+      & array->m_instr_class_waits_array[i * wait_class_max]);
+    user->set_instr_class_stages_stats(
+      & array->m_instr_class_stages_array[i * stage_class_max]);
+    user->set_instr_class_statements_stats(
+      & array->m_instr_class_statements_array[i * statement_class_max]);
+    user->set_instr_class_transactions_stats(
+      & array->m_instr_class_transactions_array[i * transaction_class_max]);
+    user->set_instr_class_memory_stats(
+      & array->m_instr_class_memory_array[i * memory_class_max]);
+  }
+
+  array->m_full= false;
+  return 0;
+}
+
+/** Release every buffer owned by a page of PFS_user records and reset its pointers. */
+void PFS_user_allocator::free_array(PFS_user_array *array)
+{
+  const size_t n_users= array->m_max;
+  const size_t n_wait_stats= n_users * wait_class_max;
+  const size_t n_stage_stats= n_users * stage_class_max;
+  const size_t n_statement_stats= n_users * statement_class_max;
+  const size_t n_transaction_stats= n_users * transaction_class_max;
+  const size_t n_memory_stats= n_users * memory_class_max;
+
+  PFS_FREE_ARRAY(& builtin_memory_user, n_users, sizeof(PFS_user), array->m_ptr);
+  array->m_ptr= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_user_waits, n_wait_stats,
+                 sizeof(PFS_single_stat),
+                 array->m_instr_class_waits_array);
+  array->m_instr_class_waits_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_user_stages, n_stage_stats,
+                 sizeof(PFS_stage_stat),
+                 array->m_instr_class_stages_array);
+  array->m_instr_class_stages_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_user_statements, n_statement_stats,
+                 sizeof(PFS_statement_stat),
+                 array->m_instr_class_statements_array);
+  array->m_instr_class_statements_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_user_transactions, n_transaction_stats,
+                 sizeof(PFS_transaction_stat),
+                 array->m_instr_class_transactions_array);
+  array->m_instr_class_transactions_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_user_memory, n_memory_stats,
+                 sizeof(PFS_memory_stat),
+                 array->m_instr_class_memory_array);
+  array->m_instr_class_memory_array= NULL;
+}
+
+PFS_user_allocator user_allocator; /* Allocator handed to global_user_container below. */
+PFS_user_container global_user_container(& user_allocator); /* Global buffer of PFS_user records. */
+
diff --git a/storage/perfschema/pfs_buffer_container.h b/storage/perfschema/pfs_buffer_container.h
new file mode 100644
index 00000000000..5baed2f872c
--- /dev/null
+++ b/storage/perfschema/pfs_buffer_container.h
@@ -0,0 +1,1626 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_BUFFER_CONTAINER_H
+#define PFS_BUFFER_CONTAINER_H
+
+#include "my_global.h"
+#include "pfs.h" // PSI_COUNT_VOLATILITY
+#include "pfs_lock.h"
+#include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_setup_object.h"
+#include "pfs_program.h"
+#include "pfs_prepared_stmt.h"
+#include "pfs_builtin_memory.h"
+
+#define USE_SCALABLE
+
+class PFS_opaque_container_page;
+class PFS_opaque_container;
+
+struct PFS_builtin_memory_class;
+
+template <class T>
+class PFS_buffer_const_iterator;
+
+template <class T>
+class PFS_buffer_processor;
+
+template <class T, class U, class V>
+class PFS_buffer_iterator;
+
+template <class T, int PFS_PAGE_SIZE, int PFS_PAGE_COUNT, class U, class V>
+class PFS_buffer_scalable_iterator;
+
+template <class T>
+class PFS_buffer_default_array;
+
+template <class T>
+class PFS_buffer_default_allocator;
+
+template <class T, class U, class V>
+class PFS_buffer_container;
+
+template <class T, int PFS_PAGE_SIZE, int PFS_PAGE_COUNT, class U, class V>
+class PFS_buffer_scalable_container;
+
+template <class B, int COUNT>
+class PFS_partitioned_buffer_scalable_iterator;
+
+template <class B, int COUNT>
+class PFS_partitioned_buffer_scalable_container;
+
+
+template <class T>
+class PFS_buffer_default_array
+{
+public:
+  typedef T value_type;
+  /** Claim a free record, scanning at most m_max slots. @return NULL if none was found. */
+  value_type *allocate(pfs_dirty_state *dirty_state)
+  {
+    uint index;
+    uint monotonic;
+    uint monotonic_start;
+    value_type *pfs;
+
+    if (m_full)
+      return NULL;
+
+    monotonic= PFS_atomic::add_u32(& m_monotonic.m_u32, 1);
+    monotonic_start= monotonic;
+    /* Unsigned distance from the scan start stays correct when the 32-bit counter wraps. */
+    while (monotonic - monotonic_start < static_cast<uint>(m_max))
+    {
+      index= monotonic % m_max;
+      pfs= m_ptr + index;
+
+      if (pfs->m_lock.free_to_dirty(dirty_state))
+      {
+        return pfs;
+      }
+      monotonic= PFS_atomic::add_u32(& m_monotonic.m_u32, 1);
+    }
+
+    m_full= true;
+    return NULL;
+  }
+  /** Mark a record free again and clear the full flag of this array. */
+  void deallocate(value_type *pfs)
+  {
+    pfs->m_lock.allocated_to_free();
+    m_full= false;
+  }
+  /** @return the first record of the array. */
+  T* get_first()
+  {
+    return m_ptr;
+  }
+  /** @return the position one past the last record (m_ptr + m_max). */
+  T* get_last()
+  {
+    return m_ptr + m_max;
+  }
+
+  bool m_full;                      /**< Set when a scan found no free record. */
+  PFS_cacheline_uint32 m_monotonic; /**< Scan counter advanced by allocate(). */
+  T * m_ptr;                        /**< Storage of the records. */
+  size_t m_max;                     /**< Number of records in m_ptr. */
+  /** Container. */
+  PFS_opaque_container *m_container;
+};
+
+template <class T>
+class PFS_buffer_default_allocator
+{
+public:
+  typedef PFS_buffer_default_array<T> array_type;
+  /** Remember the builtin memory class passed to every allocation and release. */
+  PFS_buffer_default_allocator(PFS_builtin_memory_class *klass)
+    : m_builtin_class(klass)
+  {}
+  /** Allocate array->m_max zero-filled records. @return 0 on success, 1 on failure. */
+  int alloc_array(array_type *array)
+  {
+    array->m_ptr= NULL;
+    array->m_full= true;
+    array->m_monotonic.m_u32= 0;
+
+    if (array->m_max == 0)
+      return 0;
+
+    array->m_ptr= PFS_MALLOC_ARRAY(m_builtin_class, array->m_max,
+                                   sizeof(T), T, MYF(MY_ZEROFILL));
+    if (! array->m_ptr)
+      return 1;
+    array->m_full= false;
+    return 0;
+  }
+  /** Release the records of a non empty array and reset its pointer. */
+  void free_array(array_type *array)
+  {
+    assert(array->m_max > 0);
+
+    PFS_FREE_ARRAY(m_builtin_class, array->m_max,
+                   sizeof(T), array->m_ptr);
+    array->m_ptr= NULL;
+  }
+
+private:
+  PFS_builtin_memory_class *m_builtin_class;
+};
+
+template <class T,
+          class U = PFS_buffer_default_array<T>,
+          class V = PFS_buffer_default_allocator<T> >
+class PFS_buffer_container
+{
+public:
+  friend class PFS_buffer_iterator<T, U, V>;
+
+  typedef T value_type;
+  typedef U array_type;
+  typedef V allocator_type;
+  typedef PFS_buffer_const_iterator<T> const_iterator_type;
+  typedef PFS_buffer_iterator<T, U, V> iterator_type;
+  typedef PFS_buffer_processor<T> processor_type;
+  typedef void (*function_type)(value_type *);
+  /** Build an empty container that will obtain its storage from @c allocator. */
+  PFS_buffer_container(allocator_type *allocator)
+  {
+    m_array.m_full= true;
+    m_array.m_ptr= NULL;
+    m_array.m_max= 0;
+    m_array.m_monotonic.m_u32= 0;
+    m_lost= 0;
+    m_max= 0;
+    m_allocator= allocator;
+  }
+  /** Allocate room for max_size records. @return 0 on success, 1 on failure. */
+  int init(ulong max_size)
+  {
+    if (max_size > 0)
+    {
+      m_array.m_max= max_size;
+      int rc= m_allocator->alloc_array(& m_array);
+      if (rc != 0)
+      {
+        m_allocator->free_array(& m_array);
+        return 1;
+      }
+      m_max= max_size;
+      m_array.m_full= false;
+    }
+    return 0;
+  }
+  /** Release the storage; a container that never allocated records is left alone. */
+  void cleanup()
+  {
+    if (m_max > 0) m_allocator->free_array(& m_array);
+  }
+
+  ulong get_row_count() const
+  {
+    return m_max;
+  }
+
+  ulong get_row_size() const
+  {
+    return sizeof(value_type);
+  }
+
+  ulong get_memory() const
+  {
+    return get_row_count() * get_row_size();
+  }
+  /** Reserve a free record; a failed request is counted in m_lost. */
+  value_type *allocate(pfs_dirty_state *dirty_state)
+  {
+    value_type *pfs;
+
+    pfs= m_array.allocate(dirty_state);
+    if (pfs == NULL)
+    {
+      m_lost++;
+    }
+
+    return pfs;
+  }
+  /** Give a record obtained from allocate() back to the array. */
+  void deallocate(value_type *pfs)
+  {
+    m_array.deallocate(pfs);
+  }
+
+  iterator_type iterate()
+  {
+    return PFS_buffer_iterator<T, U, V>(this, 0);
+  }
+
+  iterator_type iterate(uint index)
+  {
+    assert(index <= m_max);
+    return PFS_buffer_iterator<T, U, V>(this, index);
+  }
+  /** Invoke fct on every populated record. */
+  void apply(function_type fct)
+  {
+    value_type *pfs= m_array.get_first();
+    value_type *pfs_last= m_array.get_last();
+
+    while (pfs < pfs_last)
+    {
+      if (pfs->m_lock.is_populated())
+      {
+        fct(pfs);
+      }
+      pfs++;
+    }
+  }
+  /** Invoke fct on every record, populated or not. */
+  void apply_all(function_type fct)
+  {
+    value_type *pfs= m_array.get_first();
+    value_type *pfs_last= m_array.get_last();
+
+    while (pfs < pfs_last)
+    {
+      fct(pfs);
+      pfs++;
+    }
+  }
+  /** Invoke proc on every populated record. */
+  void apply(processor_type & proc)
+  {
+    value_type *pfs= m_array.get_first();
+    value_type *pfs_last= m_array.get_last();
+
+    while (pfs < pfs_last)
+    {
+      if (pfs->m_lock.is_populated())
+      {
+        proc(pfs);
+      }
+      pfs++;
+    }
+  }
+  /** Invoke proc on every record, populated or not. */
+  void apply_all(processor_type & proc)
+  {
+    value_type *pfs= m_array.get_first();
+    value_type *pfs_last= m_array.get_last();
+
+    while (pfs < pfs_last)
+    {
+      proc(pfs);
+      pfs++;
+    }
+  }
+  /** @return the record at index when it is populated, NULL otherwise. */
+  inline value_type* get(uint index)
+  {
+    assert(index < m_max);
+
+    value_type *pfs= m_array.m_ptr + index;
+    if (pfs->m_lock.is_populated())
+    {
+      return pfs;
+    }
+
+    return NULL;
+  }
+  /** Bounds-checked get(): has_more is set to false once index is past the end. */
+  value_type* get(uint index, bool *has_more)
+  {
+    if (index >= m_max)
+    {
+      *has_more= false;
+      return NULL;
+    }
+
+    *has_more= true;
+    return get(index);
+  }
+  /** @return unsafe if it points exactly at a record of this buffer, NULL otherwise. */
+  value_type *sanitize(value_type *unsafe)
+  {
+    intptr offset;
+    value_type *pfs= m_array.get_first();
+    value_type *pfs_last= m_array.get_last();
+
+    if ((pfs <= unsafe) &&
+        (unsafe < pfs_last))
+    {
+      offset= ((intptr) unsafe - (intptr) pfs) % sizeof(value_type);
+      if (offset == 0)
+        return unsafe;
+    }
+
+    return NULL;
+  }
+
+  ulong m_lost; /**< Number of allocate() calls that found no free record. */
+
+private:
+  value_type* scan_next(uint & index, uint * found_index)
+  {
+    assert(index <= m_max);
+    /* Search forward from index; on a hit, index is moved just past the record found. */
+    value_type *pfs_first= m_array.get_first();
+    value_type *pfs= pfs_first + index;
+    value_type *pfs_last= m_array.get_last();
+
+    while (pfs < pfs_last)
+    {
+      if (pfs->m_lock.is_populated())
+      {
+        uint found= pfs - pfs_first;
+        *found_index= found;
+        index= found + 1;
+        return pfs;
+      }
+      pfs++;
+    }
+
+    index= m_max;
+    return NULL;
+  }
+
+  ulong m_max;
+  array_type m_array;
+  allocator_type *m_allocator;
+};
+
+template <class T,
+          int PFS_PAGE_SIZE,
+          int PFS_PAGE_COUNT,
+          class U = PFS_buffer_default_array<T>,
+          class V = PFS_buffer_default_allocator<T> >
+class PFS_buffer_scalable_container
+{
+public:
+  friend class PFS_buffer_scalable_iterator<T, PFS_PAGE_SIZE, PFS_PAGE_COUNT, U, V>;
+
+  /**
+    Type of elements in the buffer.
+    The following attributes are required:
+    - pfs_lock m_lock
+    - PFS_opaque_container_page *m_page
+  */
+  typedef T value_type;
+  /**
+    Type of pages in the buffer.
+    The following attributes are required:
+    - PFS_opaque_container *m_container
+  */
+  typedef U array_type;
+  typedef V allocator_type;
+  /** This container type */
+  typedef PFS_buffer_scalable_container<T, PFS_PAGE_SIZE, PFS_PAGE_COUNT, U, V> container_type;
+  typedef PFS_buffer_const_iterator<T> const_iterator_type;
+  typedef PFS_buffer_scalable_iterator<T, PFS_PAGE_SIZE, PFS_PAGE_COUNT, U, V> iterator_type;
+  typedef PFS_buffer_processor<T> processor_type;
+  typedef void (*function_type)(value_type *);
+
+  static const size_t MAX_SIZE= PFS_PAGE_SIZE*PFS_PAGE_COUNT;
+
+  PFS_buffer_scalable_container(allocator_type *allocator)
+  {
+    m_allocator= allocator;
+    m_initialized= false;
+    m_lost= 0;
+  }
+
+  int init(long max_size)
+  {
+    int i;
+
+    m_initialized= true;
+    m_full= true;
+    m_max= PFS_PAGE_COUNT * PFS_PAGE_SIZE;
+    m_max_page_count= PFS_PAGE_COUNT;
+    m_last_page_size= PFS_PAGE_SIZE;
+    m_lost= 0;
+    m_monotonic.m_u32= 0;
+    m_max_page_index.m_u32= 0;
+
+    for (i=0 ; i < PFS_PAGE_COUNT; i++)
+    {
+      m_pages[i]= NULL;
+    }
+
+    if (max_size == 0)
+    {
+      /* No allocation. */
+      m_max_page_count= 0;
+    }
+    else if (max_size > 0)
+    {
+      if (max_size % PFS_PAGE_SIZE == 0)
+      {
+        m_max_page_count= max_size / PFS_PAGE_SIZE;
+      }
+      else
+      {
+        m_max_page_count= max_size / PFS_PAGE_SIZE + 1;
+        m_last_page_size= max_size % PFS_PAGE_SIZE;
+      }
+      /* Bounded allocation. */
+      m_full= false;
+
+      if (m_max_page_count > PFS_PAGE_COUNT)
+      {
+        m_max_page_count= PFS_PAGE_COUNT;
+        m_last_page_size= PFS_PAGE_SIZE;
+      }
+    }
+    else
+    {
+      /* max_size = -1 means unbounded allocation */
+      m_full= false;
+    }
+
+    assert(m_max_page_count <= PFS_PAGE_COUNT);
+    assert(0 < m_last_page_size);
+    assert(m_last_page_size <= PFS_PAGE_SIZE);
+
+    pthread_mutex_init(& m_critical_section, NULL);
+    return 0;
+  }
+
+  void cleanup()
+  {
+    int i;
+    /* Release every allocated page; does nothing unless init() was called. */
+    if (! m_initialized)
+      return;
+
+    pthread_mutex_lock(& m_critical_section);
+
+    for (i=0 ; i < PFS_PAGE_COUNT; i++)
+    {
+      array_type *page= m_pages[i];
+      if (page != NULL)
+      {
+        m_allocator->free_array(page);
+        delete page;
+        /* Balance the count_alloc() done when allocate() created the page. */
+        builtin_memory_scalable_buffer.count_free(sizeof (array_type));
+        m_pages[i]= NULL;
+      }
+    }
+    pthread_mutex_unlock(& m_critical_section);
+    pthread_mutex_destroy(& m_critical_section);
+
+    m_initialized= false;
+  }
+
+  ulong get_row_count()
+  {
+    ulong page_count= PFS_atomic::load_u32(& m_max_page_index.m_u32);
+
+    return page_count * PFS_PAGE_SIZE;
+  }
+
+  ulong get_row_size() const
+  {
+    return sizeof(value_type);
+  }
+
+  ulong get_memory()
+  {
+    return get_row_count() * get_row_size();
+  }
+
+  value_type *allocate(pfs_dirty_state *dirty_state)
+  {
+    if (m_full)
+    {
+      m_lost++;
+      return NULL;
+    }
+
+    uint index;
+    uint monotonic;
+    uint monotonic_max;
+    uint current_page_count;
+    value_type *pfs;
+    array_type *array;
+
+    void *addr;
+    void * volatile * typed_addr;
+    void *ptr;
+
+    /*
+      1: Try to find an available record within the existing pages
+    */
+    current_page_count= PFS_atomic::load_u32(& m_max_page_index.m_u32);
+
+    if (current_page_count != 0)
+    {
+      monotonic= PFS_atomic::load_u32(& m_monotonic.m_u32);
+      monotonic_max= monotonic + current_page_count;
+
+      while (monotonic < monotonic_max)
+      {
+        /*
+          Scan in the [0 .. current_page_count - 1] range,
+          in parallel with m_monotonic (see below)
+        */
+        index= monotonic % current_page_count;
+
+        /* Atomic Load, array= m_pages[index] */
+        addr= & m_pages[index];
+        typed_addr= static_cast<void * volatile *>(addr);
+        ptr= my_atomic_loadptr(typed_addr);
+        array= static_cast<array_type *>(ptr);
+
+        if (array != NULL)
+        {
+          pfs= array->allocate(dirty_state);
+          if (pfs != NULL)
+          {
+            /* Keep a pointer to the parent page, for deallocate(). */
+            pfs->m_page= reinterpret_cast<PFS_opaque_container_page *> (array);
+            return pfs;
+          }
+        }
+
+        /*
+          Parallel scans collaborate to increase
+          the common monotonic scan counter.
+
+          Note that when all the existing pages are full,
+          one thread will eventually add a new page,
+          and cause m_max_page_index to increase,
+          which fools all the modulo logic for scans already in progress,
+          because the monotonic counter is not folded to the same place
+          (sometimes modulo N, sometimes modulo N+1).
+
+          This is actually ok: since all the pages are full anyway,
+          there is nothing to miss, so better increase the monotonic
+          counter faster and then move on to the detection of new pages,
+          in part 2: below.
+        */
+        monotonic= PFS_atomic::add_u32(& m_monotonic.m_u32, 1);
+      };
+    }
+
+    /*
+      2: Try to add a new page, beyond the m_max_page_index limit
+    */
+    while (current_page_count < m_max_page_count)
+    {
+      /* Peek for pages added by collaborating threads */
+
+      /* (2-a) Atomic Load, array= m_pages[current_page_count] */
+      addr= & m_pages[current_page_count];
+      typed_addr= static_cast<void * volatile *>(addr);
+      ptr= my_atomic_loadptr(typed_addr);
+      array= static_cast<array_type *>(ptr);
+
+      if (array == NULL)
+      {
+        // ==================================================================
+        // BEGIN CRITICAL SECTION -- buffer expand
+        // ==================================================================
+
+        /*
+          On a freshly started server, buffers are typically empty.
+          When a sudden load spike is seen by the server,
+          multiple threads may want to expand the buffer at the same time.
+
+          Using a compare and swap to allow multiple pages to be added,
+          possibly freeing duplicate pages on collisions,
+          does not work well because the amount of code involved
+          when creating a new page can be significant (PFS_thread),
+          causing MANY collisions between (2-b) and (2-d).
+
+          A huge number of collisions (which can happen when thousands
+          of new connections hit the server after a restart)
+          leads to a huge memory consumption, and to OOM.
+
+          To mitigate this, we use here a mutex,
+          to enforce that only ONE page is added at a time,
+          so that scaling the buffer happens in a predictable
+          and controlled manner.
+        */
+        pthread_mutex_lock(& m_critical_section);
+
+        /*
+          Peek again for pages added by collaborating threads,
+          this time as the only thread allowed to expand the buffer
+        */
+
+        /* (2-b) Atomic Load, array= m_pages[current_page_count] */
+
+        ptr= my_atomic_loadptr(typed_addr);
+        array= static_cast<array_type *>(ptr);
+
+        if (array == NULL)
+        {
+          /* (2-c) Found no page, allocate a new one */
+          array= new array_type();
+          builtin_memory_scalable_buffer.count_alloc(sizeof (array_type));
+
+          array->m_max= get_page_logical_size(current_page_count);
+          int rc= m_allocator->alloc_array(array);
+          if (rc != 0)
+          {
+            m_allocator->free_array(array);
+            delete array;
+            builtin_memory_scalable_buffer.count_free(sizeof (array_type));
+            m_lost++;
+            pthread_mutex_unlock(& m_critical_section);
+            return NULL;
+          }
+
+          /* Keep a pointer to this container, for static_deallocate(). */
+          array->m_container= reinterpret_cast<PFS_opaque_container *> (this);
+
+          /* (2-d) Atomic STORE, m_pages[current_page_count] = array  */
+          ptr= array;
+          my_atomic_storeptr(typed_addr, ptr);
+
+          /* Advertise the new page */
+          PFS_atomic::add_u32(& m_max_page_index.m_u32, 1);
+        }
+
+        pthread_mutex_unlock(& m_critical_section);
+
+        // ==================================================================
+        // END CRITICAL SECTION -- buffer expand
+        // ==================================================================
+      }
+
+      assert(array != NULL);
+      pfs= array->allocate(dirty_state);
+      if (pfs != NULL)
+      {
+        /* Keep a pointer to the parent page, for deallocate(). */
+        pfs->m_page= reinterpret_cast<PFS_opaque_container_page *> (array);
+        return pfs;
+      }
+
+      current_page_count++;
+    }
+
+    m_lost++;
+    m_full= true;
+    return NULL;
+  }
+
+  void deallocate(value_type *safe_pfs)
+  {
+    /* Find the containing page */
+    PFS_opaque_container_page *opaque_page= safe_pfs->m_page;
+    array_type *page= reinterpret_cast<array_type *> (opaque_page);
+
+    /* Mark the object free */
+    safe_pfs->m_lock.allocated_to_free();
+
+    /* Flag the containing page as not full. */
+    page->m_full= false;
+
+    /* Flag the overall container as not full. */
+    m_full= false;
+  }
+
+  static void static_deallocate(value_type *safe_pfs)
+  {
+    /* Find the containing page */
+    PFS_opaque_container_page *opaque_page= safe_pfs->m_page;
+    array_type *page= reinterpret_cast<array_type *> (opaque_page);
+
+    /* Mark the object free */
+    safe_pfs->m_lock.allocated_to_free();
+
+    /* Flag the containing page as not full. */
+    page->m_full= false;
+
+    /* Find the containing buffer */
+    PFS_opaque_container *opaque_container= page->m_container;
+    PFS_buffer_scalable_container *container;
+    container= reinterpret_cast<container_type *> (opaque_container);
+
+    /* Flag the overall container as not full. */
+    container->m_full= false;
+  }
+
+  iterator_type iterate()
+  {
+    return PFS_buffer_scalable_iterator<T, PFS_PAGE_SIZE, PFS_PAGE_COUNT, U, V>(this, 0);
+  }
+
+  iterator_type iterate(uint index)
+  {
+    assert(index <= m_max);
+    return PFS_buffer_scalable_iterator<T, PFS_PAGE_SIZE, PFS_PAGE_COUNT, U, V>(this, index);
+  }
+
+  /**
+    Call a function on every populated record.
+    Pages that are not present (NULL entries in m_pages) are skipped.
+    @param fct the function to call, once per populated record
+  */
+  void apply(function_type fct)
+  {
+    for (uint page_index= 0; page_index < PFS_PAGE_COUNT; page_index++)
+    {
+      array_type *current= m_pages[page_index];
+      if (current == NULL)
+        continue;
+
+      value_type *cursor= current->get_first();
+      value_type *end= current->get_last();
+
+      for ( ; cursor < end; cursor++)
+      {
+        if (cursor->m_lock.is_populated())
+          fct(cursor);
+      }
+    }
+  }
+
+  /**
+    Call a function on every record of every present page,
+    whether the record is populated or not.
+    @param fct the function to call, once per record
+  */
+  void apply_all(function_type fct)
+  {
+    for (uint page_index= 0; page_index < PFS_PAGE_COUNT; page_index++)
+    {
+      array_type *current= m_pages[page_index];
+      if (current == NULL)
+        continue;
+
+      value_type *cursor= current->get_first();
+      value_type *end= current->get_last();
+
+      for ( ; cursor < end; cursor++)
+        fct(cursor);
+    }
+  }
+
+  /**
+    Invoke a processor object on every populated record.
+    Pages that are not present (NULL entries in m_pages) are skipped.
+    @param proc the processor to invoke, once per populated record
+  */
+  void apply(processor_type & proc)
+  {
+    for (uint page_index= 0; page_index < PFS_PAGE_COUNT; page_index++)
+    {
+      array_type *current= m_pages[page_index];
+      if (current == NULL)
+        continue;
+
+      value_type *cursor= current->get_first();
+      value_type *end= current->get_last();
+
+      for ( ; cursor < end; cursor++)
+      {
+        if (cursor->m_lock.is_populated())
+          proc(cursor);
+      }
+    }
+  }
+
+  /**
+    Invoke a processor object on every record of every present page,
+    whether the record is populated or not.
+    @param proc the processor to invoke, once per record
+  */
+  void apply_all(processor_type & proc)
+  {
+    for (uint page_index= 0; page_index < PFS_PAGE_COUNT; page_index++)
+    {
+      array_type *current= m_pages[page_index];
+      if (current == NULL)
+        continue;
+
+      value_type *cursor= current->get_first();
+      value_type *end= current->get_last();
+
+      for ( ; cursor < end; cursor++)
+        proc(cursor);
+    }
+  }
+
+  /**
+    Find a populated record by index.
+    @param index the record index, which must be below m_max
+    @return the record, or NULL when its page is not present,
+      the slot is beyond the page size, or the record is not populated
+  */
+  value_type* get(uint index)
+  {
+    assert(index < m_max);
+
+    array_type *target_page= m_pages[index / PFS_PAGE_SIZE];
+    if (target_page == NULL)
+      return NULL;
+
+    uint slot= index % PFS_PAGE_SIZE;
+    if (slot >= target_page->m_max)
+      return NULL;
+
+    value_type *candidate= target_page->m_ptr + slot;
+    if (candidate->m_lock.is_populated())
+      return candidate;
+
+    return NULL;
+  }
+
+  /**
+    Find a populated record by index, reporting whether the index
+    designates an existing slot.
+    @param index the record index
+    @param [out] has_more set to false when the index is at or beyond
+      m_max, when its page is not present, or when the slot is beyond
+      the page size; set to true otherwise
+    @return the record when it is populated, NULL otherwise
+  */
+  value_type* get(uint index, bool *has_more)
+  {
+    array_type *target_page= NULL;
+    uint slot= 0;
+
+    /* Each step is evaluated only if the previous one succeeded. */
+    bool slot_exists= (index < m_max);
+
+    if (slot_exists)
+    {
+      target_page= m_pages[index / PFS_PAGE_SIZE];
+      slot_exists= (target_page != NULL);
+    }
+
+    if (slot_exists)
+    {
+      slot= index % PFS_PAGE_SIZE;
+      slot_exists= (slot < target_page->m_max);
+    }
+
+    *has_more= slot_exists;
+    if (!slot_exists)
+      return NULL;
+
+    value_type *candidate= target_page->m_ptr + slot;
+    if (candidate->m_lock.is_populated())
+      return candidate;
+
+    return NULL;
+  }
+
+  /**
+    Validate a pointer that may or may not designate a record.
+    The pointer is accepted when it falls inside the record range of a
+    present page, at a whole multiple of sizeof(value_type) from the
+    first record of that page.
+    @param unsafe the pointer to validate
+    @return @c unsafe when accepted, NULL otherwise
+  */
+  value_type *sanitize(value_type *unsafe)
+  {
+    for (uint page_index= 0; page_index < PFS_PAGE_COUNT; page_index++)
+    {
+      array_type *current= m_pages[page_index];
+      if (current == NULL)
+        continue;
+
+      value_type *first= current->get_first();
+      value_type *last= current->get_last();
+
+      /* Not inside [first, last): try the next page. */
+      if ((unsafe < first) || (last <= unsafe))
+        continue;
+
+      intptr misalignment= ((intptr) unsafe - (intptr) first) % sizeof(value_type);
+      if (misalignment == 0)
+        return unsafe;
+    }
+
+    return NULL;
+  }
+
+  /*
+    Counter incremented when the record search gives up and returns NULL
+    (the same code path also sets m_full).
+    NOTE(review): presumably the number of failed allocations; confirm
+    against allocate().
+    Public: read directly by
+    PFS_partitioned_buffer_scalable_container::get_lost_counter()
+    when this class is used as a partition.
+  */
+  ulong m_lost;
+
+private:
+
+  /**
+    Number of records a page is meant to hold.
+    @param page_index index of the page, which must be below m_max_page_count
+    @return PFS_PAGE_SIZE for every page but the last one,
+      m_last_page_size for the last page
+  */
+  uint get_page_logical_size(uint page_index)
+  {
+    bool before_last_page= (page_index + 1 < m_max_page_count);
+
+    if (!before_last_page)
+    {
+      assert(page_index + 1 == m_max_page_count);
+      return m_last_page_size;
+    }
+
+    return PFS_PAGE_SIZE;
+  }
+
+  /**
+    Find the next populated record at or after a given index.
+    The scan stops at the first page that is not present.
+    @param [in,out] index where to start; on success, set to the index
+      following the record found; on failure, set to m_max
+    @param [out] found_index index of the record found,
+      left untouched when nothing is found
+    @return the record found, or NULL
+  */
+  value_type* scan_next(uint & index, uint * found_index)
+  {
+    assert(index <= m_max);
+
+    uint page_index= index / PFS_PAGE_SIZE;
+    /* Only the first page visited is entered in the middle. */
+    uint first_slot= index % PFS_PAGE_SIZE;
+
+    for ( ; page_index < PFS_PAGE_COUNT; page_index++, first_slot= 0)
+    {
+      array_type *current= m_pages[page_index];
+      if (current == NULL)
+        break;
+
+      value_type *base= current->get_first();
+      value_type *end= current->get_last();
+
+      for (value_type *cursor= base + first_slot; cursor < end; cursor++)
+      {
+        if (!cursor->m_lock.is_populated())
+          continue;
+
+        uint position= page_index * PFS_PAGE_SIZE + static_cast<uint>(cursor - base);
+        *found_index= position;
+        index= position + 1;
+        return cursor;
+      }
+    }
+
+    /* Nothing found: park the position at the end. */
+    index= static_cast<uint>(m_max);
+    return NULL;
+  }
+
+  bool m_initialized;
+  /*
+    Set when the record search gives up and returns NULL,
+    cleared by deallocate() and static_deallocate().
+  */
+  bool m_full;
+  /*
+    Upper bound for record indexes: get(uint) requires index < m_max,
+    iterate(uint) and scan_next() also accept m_max as the end position.
+  */
+  size_t m_max;
+  PFS_cacheline_uint32 m_monotonic;
+  PFS_cacheline_uint32 m_max_page_index;
+  /*
+    Number of pages and size of the last one, as used by
+    get_page_logical_size(): pages before the last hold PFS_PAGE_SIZE
+    records, the last page holds m_last_page_size records.
+  */
+  ulong m_max_page_count;
+  ulong m_last_page_size;
+  /* Page table. A NULL entry is treated as a page that is not present. */
+  array_type * m_pages[PFS_PAGE_COUNT];
+  allocator_type *m_allocator;
+  pthread_mutex_t m_critical_section;
+};
+
+/**
+  Forward iterator over the populated records of a PFS_buffer_container.
+  Instances can only be built by the container, which is a friend;
+  the scan itself is delegated to the container.
+*/
+template <class T, class U, class V>
+class PFS_buffer_iterator
+{
+  friend class PFS_buffer_container<T, U, V>;
+
+  typedef T value_type;
+  typedef PFS_buffer_container<T, U, V> container_type;
+
+public:
+  /**
+    Advance to the next record.
+    @return the next record, or NULL
+  */
+  value_type* scan_next()
+  {
+    /* The caller does not care about the position found. */
+    uint ignored_index;
+    return scan_next(& ignored_index);
+  }
+
+  /**
+    Advance to the next record, and report its index.
+    @param [out] found_index receives the index reported by the container
+    @return the next record, or NULL
+  */
+  value_type* scan_next(uint * found_index)
+  {
+    return m_container->scan_next(m_index, found_index);
+  }
+
+private:
+  /** Build an iterator over @c container, starting at @c index. */
+  PFS_buffer_iterator(container_type *container, uint index)
+    : m_container(container), m_index(index)
+  {}
+
+  /** The container iterated. */
+  container_type *m_container;
+  /** Current position, updated by the container during scan_next(). */
+  uint m_index;
+};
+
+/**
+  Forward iterator over the populated records of a
+  PFS_buffer_scalable_container.
+  Instances can only be built by the container, which is a friend;
+  the scan itself is delegated to the container.
+*/
+template <class T, int page_size, int page_count, class U, class V>
+class PFS_buffer_scalable_iterator
+{
+  friend class PFS_buffer_scalable_container<T, page_size, page_count, U, V>;
+
+  typedef T value_type;
+  typedef PFS_buffer_scalable_container<T, page_size, page_count, U, V> container_type;
+
+public:
+  /**
+    Advance to the next populated record.
+    @return the next record, or NULL when the scan is over
+  */
+  value_type* scan_next()
+  {
+    /* The caller does not care about the position found. */
+    uint ignored_index;
+    return scan_next(& ignored_index);
+  }
+
+  /**
+    Advance to the next populated record, and report its index.
+    @param [out] found_index index of the record found,
+      left untouched when nothing is found
+    @return the next record, or NULL when the scan is over
+  */
+  value_type* scan_next(uint * found_index)
+  {
+    return m_container->scan_next(m_index, found_index);
+  }
+
+private:
+  /** Build an iterator over @c container, starting at @c index. */
+  PFS_buffer_scalable_iterator(container_type *container, uint index)
+    : m_container(container), m_index(index)
+  {}
+
+  /** The container iterated. */
+  container_type *m_container;
+  /** Current position, updated by the container during scan_next(). */
+  uint m_index;
+};
+
+/**
+  Abstract functor applied to the elements of a buffer.
+  Implementations override operator() to process one element.
+  NOTE(review): presumably this is the processor_type accepted by the
+  apply() / apply_all() overloads of the containers; confirm against
+  the container typedefs.
+  @tparam T the element type
+*/
+template <class T>
+class PFS_buffer_processor
+{
+public:
+  /*
+    Declared with the injected class name: a template-id
+    (~PFS_buffer_processor<T>) is not a valid destructor declarator
+    in C++20, and is diagnosed by recent compilers.
+  */
+  virtual ~PFS_buffer_processor()
+  {}
+  /**
+    Process one element.
+    @param element the element to process
+  */
+  virtual void operator()(T *element) = 0;
+};
+
+/**
+  A container made of PFS_PARTITION_COUNT sub containers of type B.
+  Allocation targets one partition chosen by the caller, while
+  apply, lookup, iteration and sanitize span the partitions.
+  From the outside, a record is designated by a packed "user index":
+  the partition number is stored above bit 23, and the index within
+  the partition in the 24 low bits
+  (see pack_index() and unpack_index()).
+  @tparam B the sub container type
+  @tparam PFS_PARTITION_COUNT the number of partitions
+*/
+template <class B, int PFS_PARTITION_COUNT>
+class PFS_partitioned_buffer_scalable_container
+{
+public:
+  friend class PFS_partitioned_buffer_scalable_iterator<B, PFS_PARTITION_COUNT>;
+
+  typedef typename B::value_type value_type;
+  typedef typename B::allocator_type allocator_type;
+  typedef PFS_partitioned_buffer_scalable_iterator<B, PFS_PARTITION_COUNT> iterator_type;
+  typedef typename B::iterator_type sub_iterator_type;
+  typedef typename B::processor_type processor_type;
+  typedef typename B::function_type function_type;
+
+  /**
+    Create every partition.
+    @param allocator the allocator handed to each partition
+  */
+  PFS_partitioned_buffer_scalable_container(allocator_type *allocator)
+  {
+    for (int i=0 ; i < PFS_PARTITION_COUNT; i++)
+    {
+      m_partitions[i]= new B(allocator);
+    }
+  }
+
+  /** Destroy every partition. */
+  ~PFS_partitioned_buffer_scalable_container()
+  {
+    for (int i=0 ; i < PFS_PARTITION_COUNT; i++)
+    {
+      delete m_partitions[i];
+    }
+  }
+
+  /**
+    Initialize every partition.
+    @param max_size the size given to each partition
+    @return the bitwise OR of the partition init() results
+  */
+  int init(long max_size)
+  {
+    int rc= 0;
+    // FIXME: we have max_size * PFS_PARTITION_COUNT here
+    for (int i=0 ; i < PFS_PARTITION_COUNT; i++)
+    {
+      rc|= m_partitions[i]->init(max_size);
+    }
+    return rc;
+  }
+
+  /** Clean up every partition. */
+  void cleanup()
+  {
+    for (int i=0 ; i < PFS_PARTITION_COUNT; i++)
+    {
+      m_partitions[i]->cleanup();
+    }
+  }
+
+  /** @return the sum of the partition row counts */
+  ulong get_row_count() const
+  {
+    ulong sum= 0;
+
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      sum += m_partitions[i]->get_row_count();
+    }
+
+    return sum;
+  }
+
+  /** @return the size of one record, in bytes */
+  ulong get_row_size() const
+  {
+    return sizeof(value_type);
+  }
+
+  /** @return the sum of the partition get_memory() results */
+  ulong get_memory() const
+  {
+    ulong sum= 0;
+
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      sum += m_partitions[i]->get_memory();
+    }
+
+    return sum;
+  }
+
+  /** @return the sum of the partition m_lost counters */
+  long get_lost_counter()
+  {
+    long sum= 0;
+
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      sum += m_partitions[i]->m_lost;
+    }
+
+    return sum;
+  }
+
+  /**
+    Allocate a record from one partition.
+    @param dirty_state passed through to the partition allocate()
+    @param partition the partition to allocate from,
+      which must be below PFS_PARTITION_COUNT
+    @return the result of the partition allocate()
+  */
+  value_type *allocate(pfs_dirty_state *dirty_state, uint partition)
+  {
+    assert(partition < PFS_PARTITION_COUNT);
+
+    return m_partitions[partition]->allocate(dirty_state);
+  }
+
+  /**
+    Release a record.
+    @param safe_pfs the record to release
+  */
+  void deallocate(value_type *safe_pfs)
+  {
+    /*
+      One issue here is that we do not know which partition
+      the record belongs to.
+      Each record points to the parent page,
+      and each page points to the parent buffer,
+      so using static_deallocate here,
+      which will find the correct partition by itself.
+    */
+    B::static_deallocate(safe_pfs);
+  }
+
+  /** @return an iterator starting at partition 0, sub index 0 */
+  iterator_type iterate()
+  {
+    return iterator_type(this, 0, 0);
+  }
+
+  /**
+    @param user_index a packed index, see pack_index()
+    @return an iterator starting at the position encoded in @c user_index
+  */
+  iterator_type iterate(uint user_index)
+  {
+    uint partition_index;
+    uint sub_index;
+    unpack_index(user_index, &partition_index, &sub_index);
+    return iterator_type(this, partition_index, sub_index);
+  }
+
+  /** Forward apply(fct) to every partition, in partition order. */
+  void apply(function_type fct)
+  {
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      m_partitions[i]->apply(fct);
+    }
+  }
+
+  /** Forward apply_all(fct) to every partition, in partition order. */
+  void apply_all(function_type fct)
+  {
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      m_partitions[i]->apply_all(fct);
+    }
+  }
+
+  /** Forward apply(proc) to every partition, in partition order. */
+  void apply(processor_type & proc)
+  {
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      m_partitions[i]->apply(proc);
+    }
+  }
+
+  /** Forward apply_all(proc) to every partition, in partition order. */
+  void apply_all(processor_type & proc)
+  {
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      m_partitions[i]->apply_all(proc);
+    }
+  }
+
+  /**
+    Find a record by packed index.
+    @param user_index a packed index, see pack_index()
+    @return NULL when the partition number is out of range,
+      otherwise the result of the partition get()
+  */
+  value_type* get(uint user_index)
+  {
+    uint partition_index;
+    uint sub_index;
+    unpack_index(user_index, &partition_index, &sub_index);
+
+    if (partition_index >= PFS_PARTITION_COUNT)
+    {
+      return NULL;
+    }
+
+    return m_partitions[partition_index]->get(sub_index);
+  }
+
+  /**
+    Find a record by packed index.
+    @param user_index a packed index, see pack_index()
+    @param [out] has_more false when the partition number is out of
+      range, true otherwise. Note that this only reflects the partition
+      number: the sub index is looked up with the single argument
+      get() of the partition, which does not report has_more.
+    @return NULL when the partition number is out of range,
+      otherwise the result of the partition get()
+  */
+  value_type* get(uint user_index, bool *has_more)
+  {
+    uint partition_index;
+    uint sub_index;
+    unpack_index(user_index, &partition_index, &sub_index);
+
+    if (partition_index >= PFS_PARTITION_COUNT)
+    {
+      *has_more= false;
+      return NULL;
+    }
+
+    *has_more= true;
+    return m_partitions[partition_index]->get(sub_index);
+  }
+
+  /**
+    Validate a pointer that may or may not designate a record.
+    @param unsafe the pointer to validate
+    @return the result of the first partition sanitize() that is
+      not NULL, or NULL when every partition rejects the pointer
+  */
+  value_type *sanitize(value_type *unsafe)
+  {
+    value_type *safe= NULL;
+
+    for (int i=0; i < PFS_PARTITION_COUNT; i++)
+    {
+      safe= m_partitions[i]->sanitize(unsafe);
+      if (safe != NULL)
+      {
+        return safe;
+      }
+    }
+
+    return safe;
+  }
+
+private:
+  /**
+    Build a packed index.
+    @param partition_index the partition number
+    @param sub_index the index within the partition
+    @param [out] user_index the packed index
+  */
+  static void pack_index(uint partition_index, uint sub_index, uint *user_index)
+  {
+    /* 2^8 = 256 partitions max */
+    compile_time_assert(PFS_PARTITION_COUNT <= (1 << 8));
+    /* 2^24 = 16777216 max per partitioned buffer. */
+    compile_time_assert((B::MAX_SIZE) <= (1 << 24));
+
+    *user_index= (partition_index << 24) + sub_index;
+  }
+
+  /**
+    Split a packed index.
+    @param user_index the packed index
+    @param [out] partition_index the partition number
+    @param [out] sub_index the index within the partition
+  */
+  static void unpack_index(uint user_index, uint *partition_index, uint *sub_index)
+  {
+    *partition_index= user_index >> 24;
+    *sub_index= user_index & 0x00FFFFFF;
+  }
+
+  /**
+    Find the next record at or after a given position,
+    moving on to the following partitions as needed.
+    @param [in,out] partition_index the partition to start from;
+      left at PFS_PARTITION_COUNT when the scan is over
+    @param [in,out] sub_index the index to start from within the
+      partition; on success, set to the index following the record found
+    @param [out] found_partition the partition of the record found,
+      or PFS_PARTITION_COUNT when nothing is found
+    @param [out] found_sub_index the index of the record found within
+      its partition, or 0 when nothing is found
+    @return the record found, or NULL
+  */
+  value_type* scan_next(uint & partition_index, uint & sub_index, uint * found_partition, uint * found_sub_index)
+  {
+    value_type *record= NULL;
+    /*
+      PFS_PARTITION_COUNT itself is a legal position: it is the state
+      this method leaves behind once the scan is over, and also what
+      the packed index reported at the end of a scan decodes to.
+      In that case the loop below is skipped and NULL is returned again.
+    */
+    assert(partition_index <= PFS_PARTITION_COUNT);
+
+    while (partition_index < PFS_PARTITION_COUNT)
+    {
+      sub_iterator_type sub_iterator= m_partitions[partition_index]->iterate(sub_index);
+      record= sub_iterator.scan_next(found_sub_index);
+      if (record != NULL)
+      {
+        *found_partition= partition_index;
+        sub_index= *found_sub_index + 1;
+        return record;
+      }
+
+      /* This partition is exhausted, restart at the top of the next one. */
+      partition_index++;
+      sub_index= 0;
+    }
+
+    *found_partition= PFS_PARTITION_COUNT;
+    *found_sub_index= 0;
+    sub_index= 0;
+    return NULL;
+  }
+
+  /** The partitions, created by the constructor, deleted by the destructor. */
+  B *m_partitions[PFS_PARTITION_COUNT];
+};
+
+/**
+  Forward iterator over a PFS_partitioned_buffer_scalable_container.
+  Instances can only be built by the container, which is a friend.
+  The position is kept as a (partition, sub index) pair, and is
+  advanced by the container during scan_next().
+*/
+template <class B, int PFS_PARTITION_COUNT>
+class PFS_partitioned_buffer_scalable_iterator
+{
+public:
+  friend class PFS_partitioned_buffer_scalable_container<B, PFS_PARTITION_COUNT>;
+
+  typedef typename B::value_type value_type;
+  typedef PFS_partitioned_buffer_scalable_container<B, PFS_PARTITION_COUNT> container_type;
+
+  /**
+    Advance to the next record.
+    @return the next record, or NULL when the scan is over
+  */
+  value_type* scan_next()
+  {
+    /* The caller does not care about the position found. */
+    uint ignored_partition;
+    uint ignored_sub_index;
+    return m_container->scan_next(m_partition, m_sub_index,
+                                  & ignored_partition, & ignored_sub_index);
+  }
+
+  /**
+    Advance to the next record, and report its packed index.
+    @param [out] found_user_index the packed index built from the
+      position reported by the container, set whether or not
+      a record is found
+    @return the next record, or NULL when the scan is over
+  */
+  value_type* scan_next(uint *found_user_index)
+  {
+    uint partition_found;
+    uint sub_index_found;
+    value_type *result= m_container->scan_next(m_partition, m_sub_index,
+                                               &partition_found, &sub_index_found);
+    container_type::pack_index(partition_found, sub_index_found, found_user_index);
+    return result;
+  }
+
+private:
+  /** Build an iterator over @c container, starting at (partition, sub_index). */
+  PFS_partitioned_buffer_scalable_iterator(container_type *container, uint partition, uint sub_index)
+    : m_container(container), m_partition(partition), m_sub_index(sub_index)
+  {}
+
+  /** The container iterated. */
+  container_type *m_container;
+  /** Current partition. */
+  uint m_partition;
+  /** Current index within the current partition. */
+  uint m_sub_index;
+};
+
+/*
+  Container types, iterator types and global container declarations,
+  one group per record type.
+  When USE_SCALABLE is defined, a container type is a
+  PFS_buffer_scalable_container, whose second and third template
+  arguments are the page size and the page count.
+  Otherwise, it is a PFS_buffer_container.
+*/
+
+/*
+  With USE_SCALABLE, the mutex container is partitioned:
+  PSI_COUNT_VOLATILITY partitions, each a PFS_mutex_basic_container.
+*/
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_mutex, 1024, 1024> PFS_mutex_basic_container;
+typedef PFS_partitioned_buffer_scalable_container<PFS_mutex_basic_container, PSI_COUNT_VOLATILITY> PFS_mutex_container;
+#else
+typedef PFS_buffer_container<PFS_mutex> PFS_mutex_container;
+#endif
+typedef PFS_mutex_container::iterator_type PFS_mutex_iterator;
+extern PFS_mutex_container global_mutex_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_rwlock, 1024, 1024> PFS_rwlock_container;
+#else
+typedef PFS_buffer_container<PFS_rwlock> PFS_rwlock_container;
+#endif
+typedef PFS_rwlock_container::iterator_type PFS_rwlock_iterator;
+extern PFS_rwlock_container global_rwlock_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_cond, 256, 256> PFS_cond_container;
+#else
+typedef PFS_buffer_container<PFS_cond> PFS_cond_container;
+#endif
+typedef PFS_cond_container::iterator_type PFS_cond_iterator;
+extern PFS_cond_container global_cond_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_file, 4 * 1024, 4 * 1024> PFS_file_container;
+#else
+typedef PFS_buffer_container<PFS_file> PFS_file_container;
+#endif
+typedef PFS_file_container::iterator_type PFS_file_iterator;
+extern PFS_file_container global_file_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_socket, 256, 256> PFS_socket_container;
+#else
+typedef PFS_buffer_container<PFS_socket> PFS_socket_container;
+#endif
+typedef PFS_socket_container::iterator_type PFS_socket_iterator;
+extern PFS_socket_container global_socket_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_metadata_lock, 1024, 1024> PFS_mdl_container;
+#else
+typedef PFS_buffer_container<PFS_metadata_lock> PFS_mdl_container;
+#endif
+typedef PFS_mdl_container::iterator_type PFS_mdl_iterator;
+extern PFS_mdl_container global_mdl_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_setup_actor, 128, 1024> PFS_setup_actor_container;
+#else
+typedef PFS_buffer_container<PFS_setup_actor> PFS_setup_actor_container;
+#endif
+typedef PFS_setup_actor_container::iterator_type PFS_setup_actor_iterator;
+extern PFS_setup_actor_container global_setup_actor_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_setup_object, 128, 1024> PFS_setup_object_container;
+#else
+typedef PFS_buffer_container<PFS_setup_object> PFS_setup_object_container;
+#endif
+typedef PFS_setup_object_container::iterator_type PFS_setup_object_iterator;
+extern PFS_setup_object_container global_setup_object_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_table, 1024, 1024> PFS_table_container;
+#else
+typedef PFS_buffer_container<PFS_table> PFS_table_container;
+#endif
+typedef PFS_table_container::iterator_type PFS_table_iterator;
+extern PFS_table_container global_table_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_table_share, 4 * 1024, 4 * 1024> PFS_table_share_container;
+#else
+typedef PFS_buffer_container<PFS_table_share> PFS_table_share_container;
+#endif
+typedef PFS_table_share_container::iterator_type PFS_table_share_iterator;
+extern PFS_table_share_container global_table_share_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_table_share_index, 8 * 1024, 8 * 1024> PFS_table_share_index_container;
+#else
+typedef PFS_buffer_container<PFS_table_share_index> PFS_table_share_index_container;
+#endif
+typedef PFS_table_share_index_container::iterator_type PFS_table_share_index_iterator;
+extern PFS_table_share_index_container global_table_share_index_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_table_share_lock, 4 * 1024, 4 * 1024> PFS_table_share_lock_container;
+#else
+typedef PFS_buffer_container<PFS_table_share_lock> PFS_table_share_lock_container;
+#endif
+typedef PFS_table_share_lock_container::iterator_type PFS_table_share_lock_iterator;
+extern PFS_table_share_lock_container global_table_share_lock_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_program, 1024, 1024> PFS_program_container;
+#else
+typedef PFS_buffer_container<PFS_program> PFS_program_container;
+#endif
+typedef PFS_program_container::iterator_type PFS_program_iterator;
+extern PFS_program_container global_program_container;
+
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_prepared_stmt, 1024, 1024> PFS_prepared_stmt_container;
+#else
+typedef PFS_buffer_container<PFS_prepared_stmt> PFS_prepared_stmt_container;
+#endif
+typedef PFS_prepared_stmt_container::iterator_type PFS_prepared_stmt_iterator;
+extern PFS_prepared_stmt_container global_prepared_stmt_container;
+
+/**
+  Array of PFS_account records, extended with pointers to
+  statistics arrays for waits, stages, statements, transactions
+  and memory.
+  Used as the array type of PFS_account_container.
+  NOTE(review): presumably the extra arrays are set up and released by
+  PFS_account_allocator; confirm against its implementation.
+*/
+class PFS_account_array : public PFS_buffer_default_array<PFS_account>
+{
+public:
+  PFS_single_stat *m_instr_class_waits_array;
+  PFS_stage_stat *m_instr_class_stages_array;
+  PFS_statement_stat *m_instr_class_statements_array;
+  PFS_transaction_stat *m_instr_class_transactions_array;
+  PFS_memory_stat *m_instr_class_memory_array;
+};
+
+/**
+  Allocator for PFS_account_array.
+  Used as the allocator type of PFS_account_container.
+  Both methods are only declared here.
+*/
+class PFS_account_allocator
+{
+public:
+  /* NOTE(review): presumably returns 0 on success; confirm against the definition. */
+  int alloc_array(PFS_account_array *array);
+  void free_array(PFS_account_array *array);
+};
+
+/*
+  Account container: with USE_SCALABLE, page size 128 and page count 128,
+  using the dedicated array and allocator types in both cases.
+*/
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_account,
+                                      128,
+                                      128,
+                                      PFS_account_array,
+                                      PFS_account_allocator> PFS_account_container;
+#else
+typedef PFS_buffer_container<PFS_account,
+                             PFS_account_array,
+                             PFS_account_allocator> PFS_account_container;
+#endif
+typedef PFS_account_container::iterator_type PFS_account_iterator;
+extern PFS_account_container global_account_container;
+
+/**
+  Array of PFS_host records, extended with pointers to
+  statistics arrays for waits, stages, statements, transactions
+  and memory.
+  Used as the array type of PFS_host_container.
+  NOTE(review): presumably the extra arrays are set up and released by
+  PFS_host_allocator; confirm against its implementation.
+*/
+class PFS_host_array : public PFS_buffer_default_array<PFS_host>
+{
+public:
+  PFS_single_stat *m_instr_class_waits_array;
+  PFS_stage_stat *m_instr_class_stages_array;
+  PFS_statement_stat *m_instr_class_statements_array;
+  PFS_transaction_stat *m_instr_class_transactions_array;
+  PFS_memory_stat *m_instr_class_memory_array;
+};
+
+/**
+  Allocator for PFS_host_array.
+  Used as the allocator type of PFS_host_container.
+  Both methods are only declared here.
+*/
+class PFS_host_allocator
+{
+public:
+  /* NOTE(review): presumably returns 0 on success; confirm against the definition. */
+  int alloc_array(PFS_host_array *array);
+  void free_array(PFS_host_array *array);
+};
+
+/*
+  Host container: with USE_SCALABLE, page size 128 and page count 128,
+  using the dedicated array and allocator types in both cases.
+*/
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_host,
+                                      128,
+                                      128,
+                                      PFS_host_array,
+                                      PFS_host_allocator> PFS_host_container;
+#else
+typedef PFS_buffer_container<PFS_host,
+                             PFS_host_array,
+                             PFS_host_allocator> PFS_host_container;
+#endif
+typedef PFS_host_container::iterator_type PFS_host_iterator;
+extern PFS_host_container global_host_container;
+
+/**
+  Array of PFS_thread records, extended with pointers to
+  statistics arrays, event history and stack arrays,
+  session connect attributes, and statement text and digest token
+  buffers.
+  Used as the array type of PFS_thread_container.
+  NOTE(review): presumably the extra arrays are set up and released by
+  PFS_thread_allocator; confirm against its implementation.
+*/
+class PFS_thread_array : public PFS_buffer_default_array<PFS_thread>
+{
+public:
+  /* Statistics arrays. */
+  PFS_single_stat *m_instr_class_waits_array;
+  PFS_stage_stat *m_instr_class_stages_array;
+  PFS_statement_stat *m_instr_class_statements_array;
+  PFS_transaction_stat *m_instr_class_transactions_array;
+  PFS_memory_stat *m_instr_class_memory_array;
+
+  /* Event history and statement stack arrays, session connect attributes. */
+  PFS_events_waits *m_waits_history_array;
+  PFS_events_stages *m_stages_history_array;
+  PFS_events_statements *m_statements_history_array;
+  PFS_events_statements *m_statements_stack_array;
+  PFS_events_transactions *m_transactions_history_array;
+  char *m_session_connect_attrs_array;
+
+  /* Statement text and digest token buffers, current and history. */
+  char *m_current_stmts_text_array;
+  char *m_history_stmts_text_array;
+  unsigned char *m_current_stmts_digest_token_array;
+  unsigned char *m_history_stmts_digest_token_array;
+};
+
+/**
+  Allocator for PFS_thread_array.
+  Used as the allocator type of PFS_thread_container.
+  Both methods are only declared here.
+*/
+class PFS_thread_allocator
+{
+public:
+  /* NOTE(review): presumably returns 0 on success; confirm against the definition. */
+  int alloc_array(PFS_thread_array *array);
+  void free_array(PFS_thread_array *array);
+};
+
+/*
+  Thread container: with USE_SCALABLE, page size 256 and page count 256,
+  using the dedicated array and allocator types in both cases.
+*/
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_thread,
+                                      256,
+                                      256,
+                                      PFS_thread_array,
+                                      PFS_thread_allocator> PFS_thread_container;
+#else
+typedef PFS_buffer_container<PFS_thread,
+                             PFS_thread_array,
+                             PFS_thread_allocator> PFS_thread_container;
+#endif
+typedef PFS_thread_container::iterator_type PFS_thread_iterator;
+extern PFS_thread_container global_thread_container;
+
+/**
+  Array of PFS_user records, extended with pointers to
+  statistics arrays for waits, stages, statements, transactions
+  and memory.
+  Used as the array type of PFS_user_container.
+  NOTE(review): presumably the extra arrays are set up and released by
+  PFS_user_allocator; confirm against its implementation.
+*/
+class PFS_user_array : public PFS_buffer_default_array<PFS_user>
+{
+public:
+  PFS_single_stat *m_instr_class_waits_array;
+  PFS_stage_stat *m_instr_class_stages_array;
+  PFS_statement_stat *m_instr_class_statements_array;
+  PFS_transaction_stat *m_instr_class_transactions_array;
+  PFS_memory_stat *m_instr_class_memory_array;
+};
+
+/**
+  Allocator for PFS_user_array.
+  Used as the allocator type of PFS_user_container.
+  Both methods are only declared here.
+*/
+class PFS_user_allocator
+{
+public:
+  /* NOTE(review): presumably returns 0 on success; confirm against the definition. */
+  int alloc_array(PFS_user_array *array);
+  void free_array(PFS_user_array *array);
+};
+
+/*
+  User container: with USE_SCALABLE, page size 128 and page count 128,
+  using the dedicated array and allocator types in both cases.
+*/
+#ifdef USE_SCALABLE
+typedef PFS_buffer_scalable_container<PFS_user,
+                                      128,
+                                      128,
+                                      PFS_user_array,
+                                      PFS_user_allocator> PFS_user_container;
+#else
+typedef PFS_buffer_container<PFS_user,
+                             PFS_user_array,
+                             PFS_user_allocator> PFS_user_container;
+#endif
+typedef PFS_user_container::iterator_type PFS_user_iterator;
+extern PFS_user_container global_user_container;
+
+#endif
+
diff --git a/storage/perfschema/pfs_builtin_memory.cc b/storage/perfschema/pfs_builtin_memory.cc
new file mode 100644
index 00000000000..354de87f05f
--- /dev/null
+++ b/storage/perfschema/pfs_builtin_memory.cc
@@ -0,0 +1,382 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include "my_global.h"
+#include "m_string.h"
+#include "pfs_global.h"
+#include "pfs_builtin_memory.h"
+
+PFS_builtin_memory_class builtin_memory_mutex; /* One object per builtin memory instrument; each is named and reset by init_all_builtin_memory_class(). */
+PFS_builtin_memory_class builtin_memory_rwlock;
+PFS_builtin_memory_class builtin_memory_cond;
+PFS_builtin_memory_class builtin_memory_file;
+PFS_builtin_memory_class builtin_memory_socket;
+PFS_builtin_memory_class builtin_memory_mdl;
+PFS_builtin_memory_class builtin_memory_file_handle;
+
+PFS_builtin_memory_class builtin_memory_account; /* Accounts and their per-event-name summaries. */
+PFS_builtin_memory_class builtin_memory_account_waits;
+PFS_builtin_memory_class builtin_memory_account_stages;
+PFS_builtin_memory_class builtin_memory_account_statements;
+PFS_builtin_memory_class builtin_memory_account_transactions;
+PFS_builtin_memory_class builtin_memory_account_memory;
+
+PFS_builtin_memory_class builtin_memory_global_stages; /* Global per-event-name summaries. */
+PFS_builtin_memory_class builtin_memory_global_statements;
+PFS_builtin_memory_class builtin_memory_global_memory;
+
+PFS_builtin_memory_class builtin_memory_host; /* Hosts and their per-event-name summaries. */
+PFS_builtin_memory_class builtin_memory_host_waits;
+PFS_builtin_memory_class builtin_memory_host_stages;
+PFS_builtin_memory_class builtin_memory_host_statements;
+PFS_builtin_memory_class builtin_memory_host_transactions;
+PFS_builtin_memory_class builtin_memory_host_memory;
+
+PFS_builtin_memory_class builtin_memory_thread; /* Threads and their per-event-name summaries. */
+PFS_builtin_memory_class builtin_memory_thread_waits;
+PFS_builtin_memory_class builtin_memory_thread_stages;
+PFS_builtin_memory_class builtin_memory_thread_statements;
+PFS_builtin_memory_class builtin_memory_thread_transactions;
+PFS_builtin_memory_class builtin_memory_thread_memory;
+
+PFS_builtin_memory_class builtin_memory_thread_waits_history; /* Per-thread history and current-statement buffers, session connect attributes. */
+PFS_builtin_memory_class builtin_memory_thread_stages_history;
+PFS_builtin_memory_class builtin_memory_thread_statements_history;
+PFS_builtin_memory_class builtin_memory_thread_statements_history_tokens;
+PFS_builtin_memory_class builtin_memory_thread_statements_history_sqltext;
+PFS_builtin_memory_class builtin_memory_thread_statements_stack;
+PFS_builtin_memory_class builtin_memory_thread_statements_stack_tokens;
+PFS_builtin_memory_class builtin_memory_thread_statements_stack_sqltext;
+PFS_builtin_memory_class builtin_memory_thread_transaction_history;
+PFS_builtin_memory_class builtin_memory_thread_session_connect_attrs;
+
+PFS_builtin_memory_class builtin_memory_user; /* Users and their per-event-name summaries. */
+PFS_builtin_memory_class builtin_memory_user_waits;
+PFS_builtin_memory_class builtin_memory_user_stages;
+PFS_builtin_memory_class builtin_memory_user_statements;
+PFS_builtin_memory_class builtin_memory_user_transactions;
+PFS_builtin_memory_class builtin_memory_user_memory;
+
+PFS_builtin_memory_class builtin_memory_mutex_class; /* Instrument class tables. */
+PFS_builtin_memory_class builtin_memory_rwlock_class;
+PFS_builtin_memory_class builtin_memory_cond_class;
+PFS_builtin_memory_class builtin_memory_thread_class;
+PFS_builtin_memory_class builtin_memory_file_class;
+PFS_builtin_memory_class builtin_memory_socket_class;
+PFS_builtin_memory_class builtin_memory_stage_class;
+PFS_builtin_memory_class builtin_memory_statement_class;
+PFS_builtin_memory_class builtin_memory_memory_class;
+
+PFS_builtin_memory_class builtin_memory_setup_actor; /* setup_actors and setup_objects. */
+PFS_builtin_memory_class builtin_memory_setup_object;
+
+PFS_builtin_memory_class builtin_memory_digest; /* Statement digest summary and its tokens. */
+PFS_builtin_memory_class builtin_memory_digest_tokens;
+
+PFS_builtin_memory_class builtin_memory_stages_history_long; /* Global *_history_long buffers. */
+PFS_builtin_memory_class builtin_memory_statements_history_long;
+PFS_builtin_memory_class builtin_memory_statements_history_long_tokens;
+PFS_builtin_memory_class builtin_memory_statements_history_long_sqltext;
+PFS_builtin_memory_class builtin_memory_transactions_history_long;
+PFS_builtin_memory_class builtin_memory_waits_history_long;
+
+PFS_builtin_memory_class builtin_memory_table; /* Table handles, table shares and per-share index / lock statistics. */
+PFS_builtin_memory_class builtin_memory_table_share;
+PFS_builtin_memory_class builtin_memory_table_share_index;
+PFS_builtin_memory_class builtin_memory_table_share_lock;
+
+PFS_builtin_memory_class builtin_memory_program; /* Program summary and prepared statement instances. */
+PFS_builtin_memory_class builtin_memory_prepared_stmt;
+
+PFS_builtin_memory_class builtin_memory_scalable_buffer; /* Registered as memory/performance_schema/scalable_buffer. */
+
+/** Set the fixed attributes and the name of one builtin memory class, then reset its statistics. */
+static void init_builtin_memory_class(PFS_builtin_memory_class *klass, const char* name)
+{
+  klass->m_class.m_type= PFS_CLASS_MEMORY;
+  klass->m_class.m_enabled= true; /* Immutable */
+  klass->m_class.m_timed= false; /* Immutable */
+  klass->m_class.m_flags= PSI_FLAG_GLOBAL;
+  klass->m_class.m_event_name_index= 0;
+  my_snprintf(klass->m_class.m_name, sizeof(klass->m_class.m_name), "%.*s",
+              PFS_MAX_INFO_NAME_LENGTH - 1, name);
+  klass->m_class.m_name_length= static_cast<uint>(strlen(klass->m_class.m_name)); /* length of the stored copy, never larger than the buffer */
+  DBUG_ASSERT(strlen(name) < sizeof(klass->m_class.m_name)); /* debug builds still flag a name that had to be truncated */
+  klass->m_class.m_timer= NULL;
+  klass->m_stat.reset();
+}
+
+void init_all_builtin_memory_class() /* Names and resets every builtin memory class, in the same order as all_builtin_memory[]. */
+{
+  init_builtin_memory_class( & builtin_memory_mutex, /* Instrumented object instances. */
+                             "memory/performance_schema/mutex_instances");
+  init_builtin_memory_class( & builtin_memory_rwlock,
+                             "memory/performance_schema/rwlock_instances");
+  init_builtin_memory_class( & builtin_memory_cond,
+                             "memory/performance_schema/cond_instances");
+  init_builtin_memory_class( & builtin_memory_file,
+                             "memory/performance_schema/file_instances");
+  init_builtin_memory_class( & builtin_memory_socket,
+                             "memory/performance_schema/socket_instances");
+  init_builtin_memory_class( & builtin_memory_mdl,
+                             "memory/performance_schema/metadata_locks");
+  init_builtin_memory_class( & builtin_memory_file_handle,
+                             "memory/performance_schema/file_handle");
+
+  init_builtin_memory_class( & builtin_memory_account, /* Accounts and their per-event-name summaries. */
+                             "memory/performance_schema/accounts");
+  init_builtin_memory_class( & builtin_memory_account_waits,
+                             "memory/performance_schema/events_waits_summary_by_account_by_event_name");
+  init_builtin_memory_class( & builtin_memory_account_stages,
+                             "memory/performance_schema/events_stages_summary_by_account_by_event_name");
+  init_builtin_memory_class( & builtin_memory_account_statements,
+                             "memory/performance_schema/events_statements_summary_by_account_by_event_name");
+  init_builtin_memory_class( & builtin_memory_account_transactions,
+                             "memory/performance_schema/events_transactions_summary_by_account_by_event_name");
+  init_builtin_memory_class( & builtin_memory_account_memory,
+                             "memory/performance_schema/memory_summary_by_account_by_event_name");
+
+  init_builtin_memory_class( & builtin_memory_global_stages, /* Global per-event-name summaries. */
+                             "memory/performance_schema/events_stages_summary_global_by_event_name");
+  init_builtin_memory_class( & builtin_memory_global_statements,
+                             "memory/performance_schema/events_statements_summary_global_by_event_name");
+  init_builtin_memory_class( & builtin_memory_global_memory,
+                             "memory/performance_schema/memory_summary_global_by_event_name");
+
+  init_builtin_memory_class( & builtin_memory_host, /* Hosts and their per-event-name summaries. */
+                             "memory/performance_schema/hosts");
+  init_builtin_memory_class( & builtin_memory_host_waits,
+                             "memory/performance_schema/events_waits_summary_by_host_by_event_name");
+  init_builtin_memory_class( & builtin_memory_host_stages,
+                             "memory/performance_schema/events_stages_summary_by_host_by_event_name");
+  init_builtin_memory_class( & builtin_memory_host_statements,
+                             "memory/performance_schema/events_statements_summary_by_host_by_event_name");
+  init_builtin_memory_class( & builtin_memory_host_transactions,
+                             "memory/performance_schema/events_transactions_summary_by_host_by_event_name");
+  init_builtin_memory_class( & builtin_memory_host_memory,
+                             "memory/performance_schema/memory_summary_by_host_by_event_name");
+
+  init_builtin_memory_class( & builtin_memory_thread, /* Threads and their per-event-name summaries. */
+                             "memory/performance_schema/threads");
+  init_builtin_memory_class( & builtin_memory_thread_waits,
+                             "memory/performance_schema/events_waits_summary_by_thread_by_event_name");
+  init_builtin_memory_class( & builtin_memory_thread_stages,
+                             "memory/performance_schema/events_stages_summary_by_thread_by_event_name");
+  init_builtin_memory_class( & builtin_memory_thread_statements,
+                             "memory/performance_schema/events_statements_summary_by_thread_by_event_name");
+  init_builtin_memory_class( & builtin_memory_thread_transactions,
+                             "memory/performance_schema/events_transactions_summary_by_thread_by_event_name");
+  init_builtin_memory_class( & builtin_memory_thread_memory,
+                             "memory/performance_schema/memory_summary_by_thread_by_event_name");
+
+  init_builtin_memory_class( & builtin_memory_thread_waits_history, /* Per-thread history, current statements, connect attributes. */
+                             "memory/performance_schema/events_waits_history");
+  init_builtin_memory_class( & builtin_memory_thread_stages_history,
+                             "memory/performance_schema/events_stages_history");
+  init_builtin_memory_class( & builtin_memory_thread_statements_history,
+                             "memory/performance_schema/events_statements_history");
+  init_builtin_memory_class( & builtin_memory_thread_statements_history_tokens,
+                             "memory/performance_schema/events_statements_history.tokens");
+  init_builtin_memory_class( & builtin_memory_thread_statements_history_sqltext,
+                             "memory/performance_schema/events_statements_history.sqltext");
+  init_builtin_memory_class( & builtin_memory_thread_statements_stack,
+                             "memory/performance_schema/events_statements_current");
+  init_builtin_memory_class( & builtin_memory_thread_statements_stack_tokens,
+                             "memory/performance_schema/events_statements_current.tokens");
+  init_builtin_memory_class( & builtin_memory_thread_statements_stack_sqltext,
+                             "memory/performance_schema/events_statements_current.sqltext");
+  init_builtin_memory_class( & builtin_memory_thread_transaction_history,
+                             "memory/performance_schema/events_transactions_history");
+  init_builtin_memory_class( & builtin_memory_thread_session_connect_attrs,
+                             "memory/performance_schema/session_connect_attrs");
+
+  init_builtin_memory_class( & builtin_memory_user, /* Users and their per-event-name summaries. */
+                             "memory/performance_schema/users");
+  init_builtin_memory_class( & builtin_memory_user_waits,
+                             "memory/performance_schema/events_waits_summary_by_user_by_event_name");
+  init_builtin_memory_class( & builtin_memory_user_stages,
+                             "memory/performance_schema/events_stages_summary_by_user_by_event_name");
+  init_builtin_memory_class( & builtin_memory_user_statements,
+                             "memory/performance_schema/events_statements_summary_by_user_by_event_name");
+  init_builtin_memory_class( & builtin_memory_user_transactions,
+                             "memory/performance_schema/events_transactions_summary_by_user_by_event_name");
+  init_builtin_memory_class( & builtin_memory_user_memory,
+                             "memory/performance_schema/memory_summary_by_user_by_event_name");
+
+  init_builtin_memory_class( & builtin_memory_mutex_class, /* Instrument class tables. */
+                             "memory/performance_schema/mutex_class");
+  init_builtin_memory_class( & builtin_memory_rwlock_class,
+                             "memory/performance_schema/rwlock_class");
+  init_builtin_memory_class( & builtin_memory_cond_class,
+                             "memory/performance_schema/cond_class");
+  init_builtin_memory_class( & builtin_memory_thread_class,
+                             "memory/performance_schema/thread_class");
+  init_builtin_memory_class( & builtin_memory_file_class,
+                             "memory/performance_schema/file_class");
+  init_builtin_memory_class( & builtin_memory_socket_class,
+                             "memory/performance_schema/socket_class");
+  init_builtin_memory_class( & builtin_memory_stage_class,
+                             "memory/performance_schema/stage_class");
+  init_builtin_memory_class( & builtin_memory_statement_class,
+                             "memory/performance_schema/statement_class");
+  init_builtin_memory_class( & builtin_memory_memory_class,
+                             "memory/performance_schema/memory_class");
+
+  init_builtin_memory_class( & builtin_memory_setup_actor, /* setup_actors and setup_objects. */
+                             "memory/performance_schema/setup_actors");
+  init_builtin_memory_class( & builtin_memory_setup_object,
+                             "memory/performance_schema/setup_objects");
+
+  init_builtin_memory_class( & builtin_memory_digest, /* Statement digest summary and its tokens. */
+                             "memory/performance_schema/events_statements_summary_by_digest");
+  init_builtin_memory_class( & builtin_memory_digest_tokens,
+                             "memory/performance_schema/events_statements_summary_by_digest.tokens");
+
+  init_builtin_memory_class( & builtin_memory_stages_history_long, /* Global *_history_long buffers. */
+                             "memory/performance_schema/events_stages_history_long");
+  init_builtin_memory_class( & builtin_memory_statements_history_long,
+                             "memory/performance_schema/events_statements_history_long");
+  init_builtin_memory_class( & builtin_memory_statements_history_long_tokens,
+                             "memory/performance_schema/events_statements_history_long.tokens");
+  init_builtin_memory_class( & builtin_memory_statements_history_long_sqltext,
+                             "memory/performance_schema/events_statements_history_long.sqltext");
+  init_builtin_memory_class( & builtin_memory_transactions_history_long,
+                             "memory/performance_schema/events_transactions_history_long");
+  init_builtin_memory_class( & builtin_memory_waits_history_long,
+                             "memory/performance_schema/events_waits_history_long");
+
+  init_builtin_memory_class( & builtin_memory_table, /* Table handles, shares and per-share index / lock statistics. */
+                             "memory/performance_schema/table_handles");
+  init_builtin_memory_class( & builtin_memory_table_share,
+                             "memory/performance_schema/table_shares");
+  init_builtin_memory_class( & builtin_memory_table_share_index,
+                             "memory/performance_schema/table_io_waits_summary_by_index_usage");
+  init_builtin_memory_class( & builtin_memory_table_share_lock,
+                             "memory/performance_schema/table_lock_waits_summary_by_table");
+
+  init_builtin_memory_class( & builtin_memory_program, /* Program summary and prepared statement instances. */
+                             "memory/performance_schema/events_statements_summary_by_program");
+  init_builtin_memory_class( & builtin_memory_prepared_stmt,
+                             "memory/performance_schema/prepared_statements_instances");
+
+  init_builtin_memory_class( & builtin_memory_scalable_buffer, /* Scalable buffer instrument. */
+                             "memory/performance_schema/scalable_buffer");
+}
+
+static PFS_builtin_memory_class* const all_builtin_memory[]= /* Read-only lookup table for find_builtin_memory_class(); slot i holds the class for key i + 1. */
+{
+  & builtin_memory_mutex,
+  & builtin_memory_rwlock,
+  & builtin_memory_cond,
+  & builtin_memory_file,
+  & builtin_memory_socket,
+  & builtin_memory_mdl,
+  & builtin_memory_file_handle,
+
+  & builtin_memory_account,
+  & builtin_memory_account_waits,
+  & builtin_memory_account_stages,
+  & builtin_memory_account_statements,
+  & builtin_memory_account_transactions,
+  & builtin_memory_account_memory,
+
+  & builtin_memory_global_stages,
+  & builtin_memory_global_statements,
+  & builtin_memory_global_memory,
+
+  & builtin_memory_host,
+  & builtin_memory_host_waits,
+  & builtin_memory_host_stages,
+  & builtin_memory_host_statements,
+  & builtin_memory_host_transactions,
+  & builtin_memory_host_memory,
+
+  & builtin_memory_thread,
+  & builtin_memory_thread_waits,
+  & builtin_memory_thread_stages,
+  & builtin_memory_thread_statements,
+  & builtin_memory_thread_transactions,
+  & builtin_memory_thread_memory,
+
+  & builtin_memory_thread_waits_history,
+  & builtin_memory_thread_stages_history,
+  & builtin_memory_thread_statements_history,
+  & builtin_memory_thread_statements_history_tokens,
+  & builtin_memory_thread_statements_history_sqltext,
+  & builtin_memory_thread_statements_stack,
+  & builtin_memory_thread_statements_stack_tokens,
+  & builtin_memory_thread_statements_stack_sqltext,
+  & builtin_memory_thread_transaction_history,
+  & builtin_memory_thread_session_connect_attrs,
+
+  & builtin_memory_user,
+  & builtin_memory_user_waits,
+  & builtin_memory_user_stages,
+  & builtin_memory_user_statements,
+  & builtin_memory_user_transactions,
+  & builtin_memory_user_memory,
+
+  & builtin_memory_mutex_class,
+  & builtin_memory_rwlock_class,
+  & builtin_memory_cond_class,
+  & builtin_memory_thread_class,
+  & builtin_memory_file_class,
+  & builtin_memory_socket_class,
+  & builtin_memory_stage_class,
+  & builtin_memory_statement_class,
+  & builtin_memory_memory_class,
+
+  & builtin_memory_setup_actor,
+  & builtin_memory_setup_object,
+
+  & builtin_memory_digest,
+  & builtin_memory_digest_tokens,
+
+  & builtin_memory_stages_history_long,
+  & builtin_memory_statements_history_long,
+  & builtin_memory_statements_history_long_tokens,
+  & builtin_memory_statements_history_long_sqltext,
+  & builtin_memory_transactions_history_long,
+  & builtin_memory_waits_history_long,
+
+  & builtin_memory_table,
+  & builtin_memory_table_share,
+  & builtin_memory_table_share_index,
+  & builtin_memory_table_share_lock,
+
+  & builtin_memory_program,
+  & builtin_memory_prepared_stmt,
+
+  & builtin_memory_scalable_buffer,
+
+  NULL /* Terminating sentinel entry. */
+};
+
+
+PFS_builtin_memory_class *find_builtin_memory_class(PFS_builtin_memory_key key)
+{
+  /* Keys are 1-based. Key 0 and keys past the table yield NULL instead of an out-of-bounds read. */
+  if (key == 0 || key > sizeof(all_builtin_memory) / sizeof(all_builtin_memory[0]))
+    return NULL;
+  return all_builtin_memory[key - 1];
+}
+
diff --git a/storage/perfschema/pfs_builtin_memory.h b/storage/perfschema/pfs_builtin_memory.h
new file mode 100644
index 00000000000..2c9da9dd4a0
--- /dev/null
+++ b/storage/perfschema/pfs_builtin_memory.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_BUILTIN_MEMORY_H
+#define PFS_BUILTIN_MEMORY_H
+
+#include "my_global.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+
+/**
+  @file storage/perfschema/pfs_builtin_memory.h
+  Performance schema builtin memory instrument classes (declarations).
+*/
+
+typedef uint PFS_builtin_memory_key; /* 1-based key: find_builtin_memory_class() maps key to table slot key - 1 and returns NULL for 0. */
+
+/**
+  Pairs a memory instrument class with the statistics counted against it.
+*/
+struct PFS_builtin_memory_class
+{
+  /** Instrument class descriptor. */
+  PFS_memory_class m_class;
+  /** Counters updated through count_alloc() and count_free(). */
+  PFS_memory_stat m_stat;
+
+  /** Forwards an allocation of @p size to m_stat.count_builtin_alloc(). */
+  void count_alloc(size_t size) { m_stat.count_builtin_alloc(size); }
+  /** Forwards a release of @p size to m_stat.count_builtin_free(). */
+  void count_free(size_t size) { m_stat.count_builtin_free(size); }
+};
+
+void init_all_builtin_memory_class(); /* Names and resets every builtin memory class declared below. */
+
+PFS_builtin_memory_class *find_builtin_memory_class(PFS_builtin_memory_key); /* Key is 1-based; returns NULL for key 0. */
+
+extern PFS_builtin_memory_class builtin_memory_mutex; /* All classes below are defined in pfs_builtin_memory.cc and set up by init_all_builtin_memory_class(). */
+extern PFS_builtin_memory_class builtin_memory_rwlock;
+extern PFS_builtin_memory_class builtin_memory_cond;
+extern PFS_builtin_memory_class builtin_memory_file;
+extern PFS_builtin_memory_class builtin_memory_socket;
+extern PFS_builtin_memory_class builtin_memory_mdl;
+extern PFS_builtin_memory_class builtin_memory_file_handle;
+
+extern PFS_builtin_memory_class builtin_memory_account; /* Accounts and their per-event-name summaries. */
+extern PFS_builtin_memory_class builtin_memory_account_waits;
+extern PFS_builtin_memory_class builtin_memory_account_stages;
+extern PFS_builtin_memory_class builtin_memory_account_statements;
+extern PFS_builtin_memory_class builtin_memory_account_transactions;
+extern PFS_builtin_memory_class builtin_memory_account_memory;
+
+extern PFS_builtin_memory_class builtin_memory_global_stages; /* Global per-event-name summaries. */
+extern PFS_builtin_memory_class builtin_memory_global_statements;
+extern PFS_builtin_memory_class builtin_memory_global_memory;
+
+extern PFS_builtin_memory_class builtin_memory_host; /* Hosts and their per-event-name summaries. */
+extern PFS_builtin_memory_class builtin_memory_host_waits;
+extern PFS_builtin_memory_class builtin_memory_host_stages;
+extern PFS_builtin_memory_class builtin_memory_host_statements;
+extern PFS_builtin_memory_class builtin_memory_host_transactions;
+extern PFS_builtin_memory_class builtin_memory_host_memory;
+
+extern PFS_builtin_memory_class builtin_memory_thread; /* Threads and their per-event-name summaries. */
+extern PFS_builtin_memory_class builtin_memory_thread_waits;
+extern PFS_builtin_memory_class builtin_memory_thread_stages;
+extern PFS_builtin_memory_class builtin_memory_thread_statements;
+extern PFS_builtin_memory_class builtin_memory_thread_transactions;
+extern PFS_builtin_memory_class builtin_memory_thread_memory;
+
+extern PFS_builtin_memory_class builtin_memory_thread_waits_history; /* Per-thread history, current statements, connect attributes. */
+extern PFS_builtin_memory_class builtin_memory_thread_stages_history;
+extern PFS_builtin_memory_class builtin_memory_thread_statements_history;
+extern PFS_builtin_memory_class builtin_memory_thread_statements_history_tokens;
+extern PFS_builtin_memory_class builtin_memory_thread_statements_history_sqltext;
+extern PFS_builtin_memory_class builtin_memory_thread_statements_stack;
+extern PFS_builtin_memory_class builtin_memory_thread_statements_stack_tokens;
+extern PFS_builtin_memory_class builtin_memory_thread_statements_stack_sqltext;
+extern PFS_builtin_memory_class builtin_memory_thread_transaction_history;
+extern PFS_builtin_memory_class builtin_memory_thread_session_connect_attrs;
+
+extern PFS_builtin_memory_class builtin_memory_user; /* Users and their per-event-name summaries. */
+extern PFS_builtin_memory_class builtin_memory_user_waits;
+extern PFS_builtin_memory_class builtin_memory_user_stages;
+extern PFS_builtin_memory_class builtin_memory_user_statements;
+extern PFS_builtin_memory_class builtin_memory_user_transactions;
+extern PFS_builtin_memory_class builtin_memory_user_memory;
+
+extern PFS_builtin_memory_class builtin_memory_mutex_class; /* Instrument class tables. */
+extern PFS_builtin_memory_class builtin_memory_rwlock_class;
+extern PFS_builtin_memory_class builtin_memory_cond_class;
+extern PFS_builtin_memory_class builtin_memory_thread_class;
+extern PFS_builtin_memory_class builtin_memory_file_class;
+extern PFS_builtin_memory_class builtin_memory_socket_class;
+extern PFS_builtin_memory_class builtin_memory_stage_class;
+extern PFS_builtin_memory_class builtin_memory_statement_class;
+extern PFS_builtin_memory_class builtin_memory_memory_class;
+
+extern PFS_builtin_memory_class builtin_memory_setup_actor; /* setup_actors and setup_objects. */
+extern PFS_builtin_memory_class builtin_memory_setup_object;
+
+extern PFS_builtin_memory_class builtin_memory_digest; /* Statement digest summary and its tokens. */
+extern PFS_builtin_memory_class builtin_memory_digest_tokens;
+
+extern PFS_builtin_memory_class builtin_memory_stages_history_long; /* Global *_history_long buffers. */
+extern PFS_builtin_memory_class builtin_memory_statements_history_long;
+extern PFS_builtin_memory_class builtin_memory_statements_history_long_tokens;
+extern PFS_builtin_memory_class builtin_memory_statements_history_long_sqltext;
+extern PFS_builtin_memory_class builtin_memory_transactions_history_long;
+extern PFS_builtin_memory_class builtin_memory_waits_history_long;
+
+extern PFS_builtin_memory_class builtin_memory_table; /* Table handles, shares and per-share index / lock statistics. */
+extern PFS_builtin_memory_class builtin_memory_table_share;
+extern PFS_builtin_memory_class builtin_memory_table_share_index;
+extern PFS_builtin_memory_class builtin_memory_table_share_lock;
+
+extern PFS_builtin_memory_class builtin_memory_program; /* Program summary and prepared statement instances. */
+extern PFS_builtin_memory_class builtin_memory_prepared_stmt;
+
+extern PFS_builtin_memory_class builtin_memory_scalable_buffer; /* Scalable buffer instrument. */
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_column_types.h b/storage/perfschema/pfs_column_types.h
index 09fce551402..146c9c8054e 100644
--- a/storage/perfschema/pfs_column_types.h
+++ b/storage/perfschema/pfs_column_types.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -63,9 +63,6 @@
 /** Size of the DIGEST columns. */
 #define COL_DIGEST_SIZE 64
 
-/** Size of the DIGEST_TEXT columns. */
-#define COL_DIGEST_TEXT_SIZE 1024
-
 /**
   Enum values for the TIMER_NAME columns.
   This enum is found in the following tables:
@@ -114,78 +111,89 @@ enum enum_operation_type
   OPERATION_TYPE_LOCK= 1,
   OPERATION_TYPE_TRYLOCK= 2,
 
-  /* Rwlock operations */
+  /* Rwlock operations (RW-lock) */
   OPERATION_TYPE_READLOCK= 3,
   OPERATION_TYPE_WRITELOCK= 4,
   OPERATION_TYPE_TRYREADLOCK= 5,
   OPERATION_TYPE_TRYWRITELOCK= 6,
 
+  /* Rwlock operations (SX-lock) */
+  OPERATION_TYPE_SHAREDLOCK= 7,
+  OPERATION_TYPE_SHAREDEXCLUSIVELOCK= 8,
+  OPERATION_TYPE_EXCLUSIVELOCK= 9,
+  OPERATION_TYPE_TRYSHAREDLOCK= 10,
+  OPERATION_TYPE_TRYSHAREDEXCLUSIVELOCK= 11,
+  OPERATION_TYPE_TRYEXCLUSIVELOCK= 12,
+
   /* Cond operations */
-  OPERATION_TYPE_WAIT= 7,
-  OPERATION_TYPE_TIMEDWAIT= 8,
+  OPERATION_TYPE_WAIT= 13,
+  OPERATION_TYPE_TIMEDWAIT= 14,
 
   /* File operations */
-  OPERATION_TYPE_FILECREATE= 9,
-  OPERATION_TYPE_FILECREATETMP= 10,
-  OPERATION_TYPE_FILEOPEN= 11,
-  OPERATION_TYPE_FILESTREAMOPEN= 12,
-  OPERATION_TYPE_FILECLOSE= 13,
-  OPERATION_TYPE_FILESTREAMCLOSE= 14,
-  OPERATION_TYPE_FILEREAD= 15,
-  OPERATION_TYPE_FILEWRITE= 16,
-  OPERATION_TYPE_FILESEEK= 17,
-  OPERATION_TYPE_FILETELL= 18,
-  OPERATION_TYPE_FILEFLUSH= 19,
-  OPERATION_TYPE_FILESTAT= 20,
-  OPERATION_TYPE_FILEFSTAT= 21,
-  OPERATION_TYPE_FILECHSIZE= 22,
-  OPERATION_TYPE_FILEDELETE= 23,
-  OPERATION_TYPE_FILERENAME= 24,
-  OPERATION_TYPE_FILESYNC= 25,
+  OPERATION_TYPE_FILECREATE= 15,
+  OPERATION_TYPE_FILECREATETMP= 16,
+  OPERATION_TYPE_FILEOPEN= 17,
+  OPERATION_TYPE_FILESTREAMOPEN= 18,
+  OPERATION_TYPE_FILECLOSE= 19,
+  OPERATION_TYPE_FILESTREAMCLOSE= 20,
+  OPERATION_TYPE_FILEREAD= 21,
+  OPERATION_TYPE_FILEWRITE= 22,
+  OPERATION_TYPE_FILESEEK= 23,
+  OPERATION_TYPE_FILETELL= 24,
+  OPERATION_TYPE_FILEFLUSH= 25,
+  OPERATION_TYPE_FILESTAT= 26,
+  OPERATION_TYPE_FILEFSTAT= 27,
+  OPERATION_TYPE_FILECHSIZE= 28,
+  OPERATION_TYPE_FILEDELETE= 29,
+  OPERATION_TYPE_FILERENAME= 30,
+  OPERATION_TYPE_FILESYNC= 31,
 
   /* Table io operations */
-  OPERATION_TYPE_TABLE_FETCH= 26,
-  OPERATION_TYPE_TABLE_WRITE_ROW= 27,
-  OPERATION_TYPE_TABLE_UPDATE_ROW= 28,
-  OPERATION_TYPE_TABLE_DELETE_ROW= 29,
+  OPERATION_TYPE_TABLE_FETCH= 32,
+  OPERATION_TYPE_TABLE_WRITE_ROW= 33,
+  OPERATION_TYPE_TABLE_UPDATE_ROW= 34,
+  OPERATION_TYPE_TABLE_DELETE_ROW= 35,
 
   /* Table lock operations */
-  OPERATION_TYPE_TL_READ_NORMAL= 30,
-  OPERATION_TYPE_TL_READ_WITH_SHARED_LOCKS= 31,
-  OPERATION_TYPE_TL_READ_HIGH_PRIORITY= 32,
-  OPERATION_TYPE_TL_READ_NO_INSERTS= 33,
-  OPERATION_TYPE_TL_WRITE_ALLOW_WRITE= 34,
-  OPERATION_TYPE_TL_WRITE_CONCURRENT_INSERT= 35,
-  OPERATION_TYPE_TL_WRITE_DELAYED= 36,
-  OPERATION_TYPE_TL_WRITE_LOW_PRIORITY= 37,
-  OPERATION_TYPE_TL_WRITE_NORMAL= 38,
-  OPERATION_TYPE_TL_READ_EXTERNAL= 39,
-  OPERATION_TYPE_TL_WRITE_EXTERNAL= 40,
+  OPERATION_TYPE_TL_READ_NORMAL= 36,
+  OPERATION_TYPE_TL_READ_WITH_SHARED_LOCKS= 37,
+  OPERATION_TYPE_TL_READ_HIGH_PRIORITY= 38,
+  OPERATION_TYPE_TL_READ_NO_INSERTS= 39,
+  OPERATION_TYPE_TL_WRITE_ALLOW_WRITE= 40,
+  OPERATION_TYPE_TL_WRITE_CONCURRENT_INSERT= 41,
+  OPERATION_TYPE_TL_WRITE_DELAYED= 42,
+  OPERATION_TYPE_TL_WRITE_LOW_PRIORITY= 43,
+  OPERATION_TYPE_TL_WRITE_NORMAL= 44,
+  OPERATION_TYPE_TL_READ_EXTERNAL= 45,
+  OPERATION_TYPE_TL_WRITE_EXTERNAL= 46,
 
   /* Socket operations */
-  OPERATION_TYPE_SOCKETCREATE = 41,
-  OPERATION_TYPE_SOCKETCONNECT = 42,
-  OPERATION_TYPE_SOCKETBIND = 43,
-  OPERATION_TYPE_SOCKETCLOSE = 44,
-  OPERATION_TYPE_SOCKETSEND = 45,
-  OPERATION_TYPE_SOCKETRECV = 46,
-  OPERATION_TYPE_SOCKETSENDTO = 47,
-  OPERATION_TYPE_SOCKETRECVFROM = 48,
-  OPERATION_TYPE_SOCKETSENDMSG = 49,
-  OPERATION_TYPE_SOCKETRECVMSG = 50,
-  OPERATION_TYPE_SOCKETSEEK = 51,
-  OPERATION_TYPE_SOCKETOPT = 52,
-  OPERATION_TYPE_SOCKETSTAT = 53,
-  OPERATION_TYPE_SOCKETSHUTDOWN = 54,
-  OPERATION_TYPE_SOCKETSELECT = 55,
+  OPERATION_TYPE_SOCKETCREATE = 47,
+  OPERATION_TYPE_SOCKETCONNECT = 48,
+  OPERATION_TYPE_SOCKETBIND = 49,
+  OPERATION_TYPE_SOCKETCLOSE = 50,
+  OPERATION_TYPE_SOCKETSEND = 51,
+  OPERATION_TYPE_SOCKETRECV = 52,
+  OPERATION_TYPE_SOCKETSENDTO = 53,
+  OPERATION_TYPE_SOCKETRECVFROM = 54,
+  OPERATION_TYPE_SOCKETSENDMSG = 55,
+  OPERATION_TYPE_SOCKETRECVMSG = 56,
+  OPERATION_TYPE_SOCKETSEEK = 57,
+  OPERATION_TYPE_SOCKETOPT = 58,
+  OPERATION_TYPE_SOCKETSTAT = 59,
+  OPERATION_TYPE_SOCKETSHUTDOWN = 60,
+  OPERATION_TYPE_SOCKETSELECT = 61,
 
   /* Idle operation */
-  OPERATION_TYPE_IDLE= 56
+  OPERATION_TYPE_IDLE= 62,
+
+  /* Metadata lock operation */
+  OPERATION_TYPE_METADATA= 63
 };
 /** Integer, first value of @sa enum_operation_type. */
 #define FIRST_OPERATION_TYPE (static_cast<int> (OPERATION_TYPE_LOCK))
 /** Integer, last value of @sa enum_operation_type. */
-#define LAST_OPERATION_TYPE (static_cast<int> (OPERATION_TYPE_IDLE))
+#define LAST_OPERATION_TYPE (static_cast<int> (OPERATION_TYPE_METADATA))
 /** Integer, number of values of @sa enum_operation_type. */
 #define COUNT_OPERATION_TYPE (LAST_OPERATION_TYPE - FIRST_OPERATION_TYPE + 1)
 
@@ -194,13 +202,29 @@ enum enum_operation_type
 */
 enum enum_object_type
 {
-  OBJECT_TYPE_TABLE= 1,
-  OBJECT_TYPE_TEMPORARY_TABLE= 2
+  NO_OBJECT_TYPE= 0,
+
+  /* Advertised in SQL ENUM (see table_setup_object.cc) */
+
+  OBJECT_TYPE_EVENT= 1,
+  OBJECT_TYPE_FUNCTION= 2,
+  OBJECT_TYPE_PROCEDURE= 3,
+  OBJECT_TYPE_TABLE= 4,
+  OBJECT_TYPE_TRIGGER= 5,
+
+  /* Not advertised in SQL ENUM, only displayed as VARCHAR */
+
+  OBJECT_TYPE_TEMPORARY_TABLE= 6,
+  OBJECT_TYPE_BACKUP= 7,
+  OBJECT_TYPE_SCHEMA= 8,
+  OBJECT_TYPE_PACKAGE= 9,
+  OBJECT_TYPE_PACKAGE_BODY= 10,
+  OBJECT_TYPE_USER_LEVEL_LOCK= 11,
 };
 /** Integer, first value of @sa enum_object_type. */
-#define FIRST_OBJECT_TYPE (static_cast<int> (OBJECT_TYPE_TABLE))
+#define FIRST_OBJECT_TYPE (static_cast<int> (OBJECT_TYPE_EVENT))
 /** Integer, last value of @sa enum_object_type. */
-#define LAST_OBJECT_TYPE (static_cast<int> (OBJECT_TYPE_TEMPORARY_TABLE))
+#define LAST_OBJECT_TYPE (static_cast<int> (OBJECT_TYPE_USER_LEVEL_LOCK))
 /** Integer, number of values of @sa enum_object_type. */
 #define COUNT_OBJECT_TYPE (LAST_OBJECT_TYPE - FIRST_OBJECT_TYPE + 1)
 
@@ -213,17 +237,92 @@ enum enum_object_type
 */
 enum enum_event_type
 {
-  EVENT_TYPE_STATEMENT= 1,
-  EVENT_TYPE_STAGE= 2,
-  EVENT_TYPE_WAIT= 3
+  EVENT_TYPE_TRANSACTION= 1,
+  EVENT_TYPE_STATEMENT= 2,
+  EVENT_TYPE_STAGE= 3,
+  EVENT_TYPE_WAIT= 4
 };
 
 /** Integer, first value of @sa enum_event_type. */
-#define FIRST_EVENT_TYPE (static_cast<int> (EVENT_TYPE_STATEMENT))
+#define FIRST_EVENT_TYPE (static_cast<int> (EVENT_TYPE_TRANSACTION))
 /** Integer, last value of @sa enum_event_type. */
 #define LAST_EVENT_TYPE (static_cast<int> (EVENT_TYPE_WAIT))
 /** Integer, number of values of @sa enum_event_type. */
 #define COUNT_EVENT_TYPE (LAST_EVENT_TYPE - FIRST_EVENT_TYPE + 1)
 
+/**
+  Enum values for transaction state columns.
+*/
+enum enum_transaction_state
+{
+  TRANS_STATE_ACTIVE= 1,
+  TRANS_STATE_COMMITTED= 2,
+  TRANS_STATE_ROLLED_BACK= 3
+};
+
+/** Integer, first value of @sa enum_transaction_state. */
+#define FIRST_TRANS_STATE (static_cast<int> (TRANS_STATE_ACTIVE))
+/** Integer, last value of @sa enum_transaction_state. */
+#define LAST_TRANS_STATE (static_cast<int> (TRANS_STATE_ROLLED_BACK))
+/** Integer, number of values of @sa enum_transaction_state. */
+#define COUNT_TRANS_STATE (LAST_TRANS_STATE - FIRST_TRANS_STATE + 1)
+
+/**
+  Enum values for XA transaction state columns. Enums 0-3 match those used by
+  the server. See XID_STATE::enum xa_states in xa.h.
+*/
+enum enum_xa_transaction_state
+{
+  TRANS_STATE_XA_NOTR=-1,
+  TRANS_STATE_XA_ACTIVE=0,
+  TRANS_STATE_XA_IDLE,
+  TRANS_STATE_XA_PREPARED,
+  TRANS_STATE_XA_ROLLBACK_ONLY,
+  TRANS_STATE_XA_COMMITTED
+};
+
+/** Integer, first value of @sa enum_xa_transaction_state. */
+#define FIRST_TRANS_STATE_XA (static_cast<int> (TRANS_STATE_XA_NOTR))
+/** Integer, last value of @sa enum_xa_transaction_state. */
+#define LAST_TRANS_STATE_XA (static_cast<int> (TRANS_STATE_XA_COMMITTED))
+/** Integer, number of values of @sa enum_xa_transaction_state. */
+#define COUNT_TRANS_STATE_XA (LAST_TRANS_STATE_XA - FIRST_TRANS_STATE_XA + 1)
+
+/**
+  Enum values for transaction isolation level columns.
+  See enum_tx_isolation in handler.h.
+*/
+enum enum_isolation_level
+{
+  TRANS_LEVEL_READ_UNCOMMITTED,
+  TRANS_LEVEL_READ_COMMITTED,
+  TRANS_LEVEL_REPEATABLE_READ,
+  TRANS_LEVEL_SERIALIZABLE
+};
+
+/** Integer, first value of @sa enum_isolation_level. */
+#define FIRST_TRANS_LEVEL (static_cast<int> (TRANS_LEVEL_READ_UNCOMMITTED))
+/** Integer, last value of @sa enum_isolation_level. */
+#define LAST_TRANS_LEVEL (static_cast<int> (TRANS_LEVEL_SERIALIZABLE))
+/** Integer, number of values of @sa enum_isolation_level. */
+#define COUNT_TRANS_LEVEL (LAST_TRANS_LEVEL - FIRST_TRANS_LEVEL + 1)
+
+/**
+  Enum values for transaction access mode columns.
+*/
+enum enum_transaction_mode
+{
+  TRANS_MODE_READ_ONLY= 1,
+  TRANS_MODE_READ_WRITE= 2
+};
+
+/** Integer, first value of @sa enum_transaction_mode. */
+#define FIRST_TRANS_MODE (static_cast<int> (TRANS_MODE_READ_ONLY))
+/** Integer, last value of @sa enum_transaction_mode. */
+#define LAST_TRANS_MODE (static_cast<int> (TRANS_MODE_READ_WRITE))
+/** Integer, number of values of @sa enum_transaction_mode (LAST - FIRST + 1). */
+#define COUNT_TRANS_MODE (LAST_TRANS_MODE - FIRST_TRANS_MODE + 1)
+
+
 #endif
 
diff --git a/storage/perfschema/pfs_column_values.cc b/storage/perfschema/pfs_column_values.cc
index 043c8fd2f38..02c939d4274 100644
--- a/storage/perfschema/pfs_column_values.cc
+++ b/storage/perfschema/pfs_column_values.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -38,8 +38,11 @@ LEX_CSTRING mutex_instrument_prefix=
 LEX_CSTRING rwlock_instrument_prefix=
 { STRING_WITH_LEN("wait/synch/rwlock/") };
 
+LEX_CSTRING sxlock_instrument_prefix=
+{ C_STRING_WITH_LEN("wait/synch/sxlock/") };
+
 LEX_CSTRING cond_instrument_prefix=
-{ STRING_WITH_LEN("wait/synch/cond/") };
+{ C_STRING_WITH_LEN("wait/synch/cond/") };
 
 LEX_CSTRING thread_instrument_prefix=
 { STRING_WITH_LEN("thread/") };
@@ -53,5 +56,12 @@ LEX_CSTRING stage_instrument_prefix=
 LEX_CSTRING statement_instrument_prefix=
 { STRING_WITH_LEN("statement/") };
 
+LEX_CSTRING transaction_instrument_prefix=
+{ C_STRING_WITH_LEN("transaction") };
+
 LEX_CSTRING socket_instrument_prefix=
-{ STRING_WITH_LEN("wait/io/socket/") };
+{ C_STRING_WITH_LEN("wait/io/socket/") };
+
+LEX_CSTRING memory_instrument_prefix=
+{ C_STRING_WITH_LEN("memory/") };
+
diff --git a/storage/perfschema/pfs_column_values.h b/storage/perfschema/pfs_column_values.h
index 44f23527881..b06f7a0130b 100644
--- a/storage/perfschema/pfs_column_values.h
+++ b/storage/perfschema/pfs_column_values.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -23,7 +23,7 @@
 #ifndef PFS_COLUMN_VALUES_H
 #define PFS_COLUMN_VALUES_H
 
-#include "m_string.h"                           /* LEX_STRING */
+#include "m_string.h"                           /* LEX_CSTRING */
 
 /**
   @file storage/perfschema/pfs_column_values.h
@@ -38,6 +38,8 @@ extern LEX_CSTRING PERFORMANCE_SCHEMA_str;
 extern LEX_CSTRING mutex_instrument_prefix;
 /** String prefix for all rwlock instruments. */
 extern LEX_CSTRING rwlock_instrument_prefix;
+/** String prefix for all sxlock instruments. */
+extern LEX_CSTRING sxlock_instrument_prefix;
 /** String prefix for all cond instruments. */
 extern LEX_CSTRING cond_instrument_prefix;
 /** String prefix for all thread instruments. */
@@ -48,7 +50,12 @@ extern LEX_CSTRING file_instrument_prefix;
 extern LEX_CSTRING stage_instrument_prefix;
 /** String prefix for all statement instruments. */
 extern LEX_CSTRING statement_instrument_prefix;
+/** String prefix for all transaction instruments. */
+extern LEX_CSTRING transaction_instrument_prefix;
+/** String prefix for all socket instruments. */
 extern LEX_CSTRING socket_instrument_prefix;
+/** String prefix for all memory instruments. */
+extern LEX_CSTRING memory_instrument_prefix;
 
 #endif
 
diff --git a/storage/perfschema/pfs_con_slice.cc b/storage/perfschema/pfs_con_slice.cc
index 9f9deb0919c..eaa65d524ba 100644
--- a/storage/perfschema/pfs_con_slice.cc
+++ b/storage/perfschema/pfs_con_slice.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -22,7 +22,7 @@
 
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_con_slice.h"
 #include "pfs_stat.h"
 #include "pfs_global.h"
@@ -38,66 +38,6 @@
   @{
 */
 
-PFS_single_stat *
-PFS_connection_slice::alloc_waits_slice(uint sizing)
-{
-  PFS_single_stat *slice= NULL;
-  uint index;
-
-  if (sizing > 0)
-  {
-    slice= PFS_MALLOC_ARRAY(sizing, sizeof(PFS_single_stat), PFS_single_stat,
-                            MYF(MY_ZEROFILL));
-    if (unlikely(slice == NULL))
-      return NULL;
-
-    for (index= 0; index < sizing; index++)
-      slice[index].reset();
-  }
-
-  return slice;
-}
-
-PFS_stage_stat *
-PFS_connection_slice::alloc_stages_slice(uint sizing)
-{
-  PFS_stage_stat *slice= NULL;
-  uint index;
-
-  if (sizing > 0)
-  {
-    slice= PFS_MALLOC_ARRAY(sizing, sizeof(PFS_stage_stat), PFS_stage_stat,
-                            MYF(MY_ZEROFILL));
-    if (unlikely(slice == NULL))
-      return NULL;
-
-    for (index= 0; index < sizing; index++)
-      slice[index].reset();
-  }
-
-  return slice;
-}
-
-PFS_statement_stat *
-PFS_connection_slice::alloc_statements_slice(uint sizing)
-{
-  PFS_statement_stat *slice= NULL;
-  uint index;
-
-  if (sizing > 0)
-  {
-    slice= PFS_MALLOC_ARRAY(sizing, sizeof(PFS_statement_stat), PFS_statement_stat,
-                            MYF(MY_ZEROFILL));
-    if (unlikely(slice == NULL))
-      return NULL;
-
-    for (index= 0; index < sizing; index++)
-      slice[index].reset();
-  }
-
-  return slice;
-}
-
 void PFS_connection_slice::reset_waits_stats()
 {
   PFS_single_stat *stat= m_instr_class_waits_stats;
@@ -122,5 +62,21 @@ void PFS_connection_slice::reset_statements_stats()
     stat->reset();
 }
 
+void PFS_connection_slice::reset_transactions_stats()
+{
+  /* Reset the GLOBAL_TRANSACTION_INDEX slot, provided an array is attached. */
+  /* Test the array pointer: &array[index] is not a reliable NULL check. */
+  if (m_instr_class_transactions_stats != NULL)
+    m_instr_class_transactions_stats[GLOBAL_TRANSACTION_INDEX].reset();
+}
+
+void PFS_connection_slice::rebase_memory_stats()
+{
+  /* Walk the memory_class_max slots of this slice, resetting each one. */
+  PFS_memory_stat *const end= m_instr_class_memory_stats + memory_class_max;
+  for (PFS_memory_stat *cur= m_instr_class_memory_stats; cur < end; ++cur)
+    cur->reset();
+}
+
 /** @} */
 
diff --git a/storage/perfschema/pfs_con_slice.h b/storage/perfschema/pfs_con_slice.h
index ed17de5f2b0..f28c40f884c 100644
--- a/storage/perfschema/pfs_con_slice.h
+++ b/storage/perfschema/pfs_con_slice.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,12 +28,17 @@
   Performance schema connection slice (declarations).
 */
 
+#include "sql_class.h"
 #include "pfs_lock.h"
 #include "lf.h"
+#include "pfs_status.h"
 
 struct PFS_single_stat;
 struct PFS_stage_stat;
 struct PFS_statement_stat;
+struct PFS_transaction_stat;
+struct PFS_memory_stat;
+class PFS_opaque_container_page;
 
 /**
   @addtogroup Performance_schema_buffers
@@ -46,31 +51,15 @@ struct PFS_statement_stat;
 */
 struct PFS_connection_slice
 {
-  /**
-    Allocate memory for waits statistics.
-    @param sizing the number of wait classes.
-    @return wait statistics for this slice.
-  */
-  static PFS_single_stat *alloc_waits_slice(uint sizing);
-  /**
-    Allocate memory for stages statistics.
-    @param sizing the number of stage classes.
-    @return stage statistics for this slice.
-  */
-  static PFS_stage_stat *alloc_stages_slice(uint sizing);
-  /**
-    Allocate memory for statement statistics.
-    @param sizing the number of statement classes.
-    @return statement statistics for this slice.
-  */
-  static PFS_statement_stat *alloc_statements_slice(uint sizing);
-
   /** Reset all statistics. */
   inline void reset_stats()
   {
-    reset_waits_stats();
-    reset_stages_stats();
-    reset_statements_stats();
+    m_has_waits_stats= false;
+    m_has_stages_stats= false;
+    m_has_statements_stats= false;
+    m_has_transactions_stats= false;
+    m_has_memory_stats= false;
+    reset_status_stats();
   }
 
   /** Reset all wait statistics. */
@@ -79,6 +68,137 @@ struct PFS_connection_slice
   void reset_stages_stats();
   /** Reset all statements statistics. */
   void reset_statements_stats();
+  /** Reset all transactions statistics. */
+  void reset_transactions_stats();
+  /** Reset all memory statistics. */
+  void rebase_memory_stats();
+  /** Reset all status variable statistics. */
+  void reset_status_stats()
+  {
+    m_status_stats.reset();
+  }
+
+  void set_instr_class_waits_stats(PFS_single_stat *array)
+  {
+    m_has_waits_stats= false;
+    m_instr_class_waits_stats= array;
+  }
+
+  /** Wait statistics for readers, or NULL when none have been written. */
+  const PFS_single_stat* read_instr_class_waits_stats() const
+  {
+    /* The array content is only meaningful once the flag has been raised. */
+    return m_has_waits_stats ? m_instr_class_waits_stats : NULL;
+  }
+
+  /** Wait statistics for writers; the array is reset before first use. */
+  PFS_single_stat* write_instr_class_waits_stats()
+  {
+    if (m_has_waits_stats)
+      return m_instr_class_waits_stats;
+    reset_waits_stats();
+    m_has_waits_stats= true;
+    return m_instr_class_waits_stats;
+  }
+
+  void set_instr_class_stages_stats(PFS_stage_stat *array)
+  {
+    m_has_stages_stats= false;
+    m_instr_class_stages_stats= array;
+  }
+
+  /** Stage statistics for readers, or NULL when none have been written. */
+  const PFS_stage_stat* read_instr_class_stages_stats() const
+  {
+    /* The array content is only meaningful once the flag has been raised. */
+    return m_has_stages_stats ? m_instr_class_stages_stats : NULL;
+  }
+
+  /** Stage statistics for writers; the array is reset before first use. */
+  PFS_stage_stat* write_instr_class_stages_stats()
+  {
+    if (m_has_stages_stats)
+      return m_instr_class_stages_stats;
+    reset_stages_stats();
+    m_has_stages_stats= true;
+    return m_instr_class_stages_stats;
+  }
+
+  void set_instr_class_statements_stats(PFS_statement_stat *array)
+  {
+    m_has_statements_stats= false;
+    m_instr_class_statements_stats= array;
+  }
+
+  /** Statement statistics for readers, or NULL when none have been written. */
+  const PFS_statement_stat* read_instr_class_statements_stats() const
+  {
+    /* The array content is only meaningful once the flag has been raised. */
+    return m_has_statements_stats ? m_instr_class_statements_stats : NULL;
+  }
+
+  /** Statement statistics for writers; the array is reset before first use. */
+  PFS_statement_stat* write_instr_class_statements_stats()
+  {
+    if (m_has_statements_stats)
+      return m_instr_class_statements_stats;
+    reset_statements_stats();
+    m_has_statements_stats= true;
+    return m_instr_class_statements_stats;
+  }
+
+  void set_instr_class_transactions_stats(PFS_transaction_stat *array)
+  {
+    m_has_transactions_stats= false;
+    m_instr_class_transactions_stats= array;
+  }
+
+  /** Transaction statistics for readers, or NULL when none have been written. */
+  const PFS_transaction_stat* read_instr_class_transactions_stats() const
+  {
+    /* The array content is only meaningful once the flag has been raised. */
+    return m_has_transactions_stats ? m_instr_class_transactions_stats : NULL;
+  }
+
+  /** Transaction statistics for writers; reset happens before first use. */
+  PFS_transaction_stat* write_instr_class_transactions_stats()
+  {
+    if (m_has_transactions_stats)
+      return m_instr_class_transactions_stats;
+    reset_transactions_stats();
+    m_has_transactions_stats= true;
+    return m_instr_class_transactions_stats;
+  }
+
+  void set_instr_class_memory_stats(PFS_memory_stat *array)
+  {
+    m_has_memory_stats= false;
+    m_instr_class_memory_stats= array;
+  }
+
+  /** Memory statistics for readers, or NULL when none have been written. */
+  const PFS_memory_stat* read_instr_class_memory_stats() const
+  {
+    /* The array content is only meaningful once the flag has been raised. */
+    return m_has_memory_stats ? m_instr_class_memory_stats : NULL;
+  }
+
+  /** Memory statistics for writers; the array is rebased before first use. */
+  PFS_memory_stat* write_instr_class_memory_stats()
+  {
+    if (m_has_memory_stats)
+      return m_instr_class_memory_stats;
+    rebase_memory_stats();
+    m_has_memory_stats= true;
+    return m_instr_class_memory_stats;
+  }
+
+private:
+  bool m_has_waits_stats;
+  bool m_has_stages_stats;
+  bool m_has_statements_stats;
+  bool m_has_transactions_stats;
+  bool m_has_memory_stats;
 
   /**
     Per connection slice waits aggregated statistics.
@@ -103,6 +223,37 @@ struct PFS_connection_slice
     Immutable, safe to use without internal lock.
   */
   PFS_statement_stat *m_instr_class_statements_stats;
+
+  /**
+    Per connection slice transactions aggregated statistics.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_*_BY_EVENT_NAME.
+    Immutable, safe to use without internal lock.
+  */
+  PFS_transaction_stat *m_instr_class_transactions_stats;
+
+  /**
+    Per connection slice memory aggregated statistics.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_*_BY_EVENT_NAME.
+    Immutable, safe to use without internal lock.
+  */
+  PFS_memory_stat *m_instr_class_memory_stats;
+
+public:
+
+  void aggregate_status_stats(const STATUS_VAR *status_vars)
+  {
+    m_status_stats.aggregate_from(status_vars);
+  }
+
+  /**
+    Aggregated status variables.
+  */
+  PFS_status_stats m_status_stats;
+
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
 };
 
 /** @} */
diff --git a/storage/perfschema/pfs_config.h.cmake b/storage/perfschema/pfs_config.h.cmake
new file mode 100644
index 00000000000..2b61b7e170e
--- /dev/null
+++ b/storage/perfschema/pfs_config.h.cmake
@@ -0,0 +1,6 @@
+#cmakedefine HAVE_PTHREAD_THREADID_NP 1
+#cmakedefine HAVE_SYS_GETTID 1
+#cmakedefine HAVE_GETTID 1
+#cmakedefine HAVE_GETTHRID 1
+#cmakedefine HAVE_PTHREAD_GETTHREADID_NP 1
+#cmakedefine HAVE_INTEGER_PTHREAD_SELF 1
diff --git a/storage/perfschema/pfs_defaults.cc b/storage/perfschema/pfs_defaults.cc
index 8c56ceba682..522beb890eb 100644
--- a/storage/perfschema/pfs_defaults.cc
+++ b/storage/perfschema/pfs_defaults.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,12 +28,15 @@
 #include <my_global.h>
 #include "pfs.h"
 #include "pfs_defaults.h"
+#include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_setup_actor.h"
 #include "pfs_setup_object.h"
 
-static PSI_thread_key key;
-static PSI_thread_info info= { &key, "setup", PSI_FLAG_GLOBAL };
+static PSI_thread_key thread_key;
+static PSI_thread_info thread_info= { &thread_key, "setup", PSI_FLAG_GLOBAL };
+
+const char* pfs_category= "performance_schema";
 
 void install_default_setup(PSI_bootstrap *boot)
 {
@@ -41,30 +44,62 @@ void install_default_setup(PSI_bootstrap *boot)
   if (psi == NULL)
     return;
 
-  psi->register_thread("performance_schema", &info, 1);
-  PSI_thread *psi_thread= psi->new_thread(key, NULL, 0);
-  if (psi_thread == NULL)
-    return;
-
-  /* LF_HASH needs a thread, for PINS */
-  psi->set_thread(psi_thread);
-
-  String percent("%", 1, &my_charset_utf8_bin);
-  /* Enable all users on all hosts by default */
-  insert_setup_actor(&percent, &percent, &percent);
-
-  /* Disable system tables by default */
-  String mysql_db("mysql", 5, &my_charset_utf8_bin);
-  insert_setup_object(OBJECT_TYPE_TABLE, &mysql_db, &percent, false, false);
-
-  /* Disable performance/information schema tables. */
-  String PS_db("performance_schema", 18, &my_charset_utf8_bin);
-  String IS_db("information_schema", 18, &my_charset_utf8_bin);
-  insert_setup_object(OBJECT_TYPE_TABLE, &PS_db, &percent, false, false);
-  insert_setup_object(OBJECT_TYPE_TABLE, &IS_db, &percent, false, false);
-
-  /* Enable every other tables */
-  insert_setup_object(OBJECT_TYPE_TABLE, &percent, &percent, true, true);
+  psi->register_thread(pfs_category, &thread_info, 1);
+  PSI_thread *psi_thread= psi->new_thread(thread_key, NULL, 0);
+
+  if (psi_thread != NULL)
+  {
+    /* LF_HASH needs a thread, for PINS */
+    psi->set_thread(psi_thread);
+
+    String percent("%", 1, &my_charset_utf8mb3_bin);
+    /* Enable all users on all hosts by default */
+    insert_setup_actor(&percent, &percent, &percent, true, true);
+
+    String mysql_db("mysql", 5, &my_charset_utf8mb3_bin);
+    String PS_db("performance_schema", 18, &my_charset_utf8mb3_bin);
+    String IS_db("information_schema", 18, &my_charset_utf8mb3_bin);
+
+    /* Disable sp by default in mysql. */
+    insert_setup_object(OBJECT_TYPE_EVENT, &mysql_db, &percent, false, false);
+    /* Disable sp in performance/information schema. */
+    insert_setup_object(OBJECT_TYPE_EVENT, &PS_db, &percent, false, false);
+    insert_setup_object(OBJECT_TYPE_EVENT, &IS_db, &percent, false, false);
+    /* Enable every other sp. */
+    insert_setup_object(OBJECT_TYPE_EVENT, &percent, &percent, true, true);
+
+    /* Disable sp by default in mysql. */
+    insert_setup_object(OBJECT_TYPE_FUNCTION, &mysql_db, &percent, false, false);
+    /* Disable sp in performance/information schema. */
+    insert_setup_object(OBJECT_TYPE_FUNCTION, &PS_db, &percent, false, false);
+    insert_setup_object(OBJECT_TYPE_FUNCTION, &IS_db, &percent, false, false);
+    /* Enable every other sp. */
+    insert_setup_object(OBJECT_TYPE_FUNCTION, &percent, &percent, true, true);
+
+    /* Disable sp by default in mysql. */
+    insert_setup_object(OBJECT_TYPE_PROCEDURE, &mysql_db, &percent, false, false);
+    /* Disable sp in performance/information schema. */
+    insert_setup_object(OBJECT_TYPE_PROCEDURE, &PS_db, &percent, false, false);
+    insert_setup_object(OBJECT_TYPE_PROCEDURE, &IS_db, &percent, false, false);
+    /* Enable every other sp. */
+    insert_setup_object(OBJECT_TYPE_PROCEDURE, &percent, &percent, true, true);
+
+    /* Disable system tables by default */
+    insert_setup_object(OBJECT_TYPE_TABLE, &mysql_db, &percent, false, false);
+    /* Disable performance/information schema tables. */
+    insert_setup_object(OBJECT_TYPE_TABLE, &PS_db, &percent, false, false);
+    insert_setup_object(OBJECT_TYPE_TABLE, &IS_db, &percent, false, false);
+    /* Enable all other tables */
+    insert_setup_object(OBJECT_TYPE_TABLE, &percent, &percent, true, true);
+
+    /* Disable sp by default in mysql. */
+    insert_setup_object(OBJECT_TYPE_TRIGGER, &mysql_db, &percent, false, false);
+    /* Disable sp in performance/information schema. */
+    insert_setup_object(OBJECT_TYPE_TRIGGER, &PS_db, &percent, false, false);
+    insert_setup_object(OBJECT_TYPE_TRIGGER, &IS_db, &percent, false, false);
+    /* Enable every other sp. */
+    insert_setup_object(OBJECT_TYPE_TRIGGER, &percent, &percent, true, true);
+  }
 
   psi->delete_current_thread();
 }
diff --git a/storage/perfschema/pfs_defaults.h b/storage/perfschema/pfs_defaults.h
index 7751b55b83b..34ac1b0955a 100644
--- a/storage/perfschema/pfs_defaults.h
+++ b/storage/perfschema/pfs_defaults.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/pfs_digest.cc b/storage/perfschema/pfs_digest.cc
index 0ddb4c90eb3..43375c753dd 100644
--- a/storage/perfschema/pfs_digest.cc
+++ b/storage/perfschema/pfs_digest.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -36,8 +36,10 @@
 #include "pfs_instr.h"
 #include "pfs_digest.h"
 #include "pfs_global.h"
+#include "pfs_builtin_memory.h"
 #include "table_helper.h"
 #include "sql_lex.h"
+#include "sql_signal.h"
 #include "sql_get_diagnostics.h"
 #include "sql_string.h"
 #include <string.h>
@@ -54,7 +56,8 @@ bool flag_statements_digest= true;
   Current index in Stat array where new record is to be inserted.
   index 0 is reserved for "all else" case when entire array is full.
 */
-volatile uint32 PFS_ALIGNED digest_monotonic_index;
+static PFS_ALIGNED PFS_cacheline_uint32 digest_monotonic_index;
+
 bool digest_full= false;
 
 LF_HASH digest_hash;
@@ -72,16 +75,16 @@ int init_digest(const PFS_global_param *param)
   */
   digest_max= param->m_digest_sizing;
   digest_lost= 0;
-  PFS_atomic::store_u32(& digest_monotonic_index, 1);
+  PFS_atomic::store_u32(& digest_monotonic_index.m_u32, 1);
   digest_full= false;
 
   if (digest_max == 0)
     return 0;
 
   statements_digest_stat_array=
-    PFS_MALLOC_ARRAY(digest_max,
-                     sizeof(PFS_statements_digest_stat),
-                     PFS_statements_digest_stat,
+    PFS_MALLOC_ARRAY(& builtin_memory_digest,
+                     digest_max,
+                     sizeof(PFS_statements_digest_stat), PFS_statements_digest_stat,
                      MYF(MY_ZEROFILL));
 
   if (unlikely(statements_digest_stat_array == NULL))
@@ -96,7 +99,8 @@ int init_digest(const PFS_global_param *param)
     size_t digest_memory_size= pfs_max_digest_length * sizeof(unsigned char);
 
     statements_digest_token_array=
-      PFS_MALLOC_ARRAY(digest_max,
+      PFS_MALLOC_ARRAY(& builtin_memory_digest_tokens,
+                       digest_max,
                        digest_memory_size,
                        unsigned char,
                        MYF(MY_ZEROFILL));
@@ -117,15 +121,25 @@ int init_digest(const PFS_global_param *param)
   /* Set record[0] as allocated. */
   statements_digest_stat_array[0].m_lock.set_allocated();
 
+  /* Set record[0] as allocated. */
+  statements_digest_stat_array[0].m_lock.set_allocated();
+
   return 0;
 }
 
 /** Cleanup table EVENTS_STATEMENTS_SUMMARY_BY_DIGEST. */
 void cleanup_digest(void)
 {
-  /*  Free memory allocated to statements_digest_stat_array. */
-  pfs_free(statements_digest_stat_array);
-  pfs_free(statements_digest_token_array);
+  PFS_FREE_ARRAY(& builtin_memory_digest,
+                 digest_max,
+                 sizeof(PFS_statements_digest_stat),
+                 statements_digest_stat_array);
+
+  PFS_FREE_ARRAY(& builtin_memory_digest_tokens,
+                 digest_max,
+                 (pfs_max_digest_length * sizeof(unsigned char)),
+                 statements_digest_token_array);
+
   statements_digest_stat_array= NULL;
   statements_digest_token_array= NULL;
 }
@@ -138,9 +152,9 @@ static uchar *digest_hash_get_key(const uchar *entry, size_t *length,
   const PFS_statements_digest_stat *digest;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_statements_digest_stat*const*>(entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   digest= *typed_entry;
-  DBUG_ASSERT(digest != NULL);
+  assert(digest != NULL);
   *length= sizeof (PFS_digest_key);
   result= & digest->m_digest_key;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -152,14 +166,13 @@ C_MODE_END
   Initialize the digest hash.
   @return 0 on success
 */
-int init_digest_hash(void)
+int init_digest_hash(const PFS_global_param *param)
 {
-  if ((! digest_hash_inited) && (digest_max > 0))
+  if ((! digest_hash_inited) && (param->m_digest_sizing != 0))
   {
     lf_hash_init(&digest_hash, sizeof(PFS_statements_digest_stat*),
                  LF_HASH_UNIQUE, 0, 0, digest_hash_get_key,
                  &my_charset_bin);
-    /* digest_hash.size= digest_max; */
     digest_hash_inited= true;
   }
   return 0;
@@ -191,7 +204,7 @@ find_or_create_digest(PFS_thread *thread,
                       const char *schema_name,
                       uint schema_name_length)
 {
-  DBUG_ASSERT(digest_storage != NULL);
+  assert(digest_storage != NULL);
 
   if (statements_digest_stat_array == NULL)
     return NULL;
@@ -225,6 +238,7 @@ find_or_create_digest(PFS_thread *thread,
   size_t attempts= 0;
   PFS_statements_digest_stat **entry;
   PFS_statements_digest_stat *pfs= NULL;
+  pfs_dirty_state dirty_state;
 
   ulonglong now= my_hrtime().val;
 
@@ -248,7 +262,7 @@ search:
 
   if (digest_full)
   {
-    /*  digest_stat array is full. Add stat at index 0 and return. */
+    /* digest_stat array is full. Add stat at index 0 and return. */
     pfs= &statements_digest_stat_array[0];
     digest_lost++;
 
@@ -260,7 +274,7 @@ search:
 
   while (++attempts <= digest_max)
   {
-    safe_index= PFS_atomic::add_u32(& digest_monotonic_index, 1) % digest_max;
+    safe_index= PFS_atomic::add_u32(& digest_monotonic_index.m_u32, 1) % digest_max;
     if (safe_index == 0)
     {
       /* Record [0] is reserved. */
@@ -268,12 +282,12 @@ search:
     }
 
     /* Add a new record in digest stat array. */
-    DBUG_ASSERT(safe_index < digest_max);
+    assert(safe_index < digest_max);
     pfs= &statements_digest_stat_array[safe_index];
 
     if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.free_to_dirty())
+      if (pfs->m_lock.free_to_dirty(& dirty_state))
       {
         /* Copy digest hash/LF Hash search key. */
         memcpy(& pfs->m_digest_key, &hash_key, sizeof(PFS_digest_key));
@@ -290,11 +304,11 @@ search:
         res= lf_hash_insert(&digest_hash, pins, &pfs);
         if (likely(res == 0))
         {
-          pfs->m_lock.dirty_to_allocated();
+          pfs->m_lock.dirty_to_allocated(& dirty_state);
           return & pfs->m_stat;
         }
 
-        pfs->m_lock.dirty_to_free();
+        pfs->m_lock.dirty_to_free(& dirty_state);
 
         if (res > 0)
         {
@@ -349,12 +363,13 @@ void purge_digest(PFS_thread* thread, PFS_digest_key *hash_key)
 
 void PFS_statements_digest_stat::reset_data(unsigned char *token_array, size_t length)
 {
-  m_lock.set_dirty();
+  pfs_dirty_state dirty_state;
+  m_lock.set_dirty(& dirty_state);
   m_digest_storage.reset(token_array, length);
   m_stat.reset();
   m_first_seen= 0;
   m_last_seen= 0;
-  m_lock.dirty_to_free();
+  m_lock.dirty_to_free(& dirty_state);
 }
 
 void PFS_statements_digest_stat::reset_index(PFS_thread *thread)
@@ -368,6 +383,8 @@ void PFS_statements_digest_stat::reset_index(PFS_thread *thread)
 
 void reset_esms_by_digest()
 {
+  uint index;
+
   if (statements_digest_stat_array == NULL)
     return;
 
@@ -376,7 +393,7 @@ void reset_esms_by_digest()
     return;
 
   /* Reset statements_digest_stat_array. */
-  for (size_t index= 0; index < digest_max; index++)
+  for (index= 0; index < digest_max; index++)
   {
     statements_digest_stat_array[index].reset_index(thread);
     statements_digest_stat_array[index].reset_data(statements_digest_token_array + index * pfs_max_digest_length, pfs_max_digest_length);
@@ -389,7 +406,7 @@ void reset_esms_by_digest()
     Reset index which indicates where the next calculated digest information
     to be inserted in statements_digest_stat_array.
   */
-  PFS_atomic::store_u32(& digest_monotonic_index, 1);
+  PFS_atomic::store_u32(& digest_monotonic_index.m_u32, 1);
   digest_full= false;
 }
 
diff --git a/storage/perfschema/pfs_digest.h b/storage/perfschema/pfs_digest.h
index c11852a1510..e99bdfebf57 100644
--- a/storage/perfschema/pfs_digest.h
+++ b/storage/perfschema/pfs_digest.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -76,7 +76,7 @@ struct PFS_ALIGNED PFS_statements_digest_stat
 int init_digest(const PFS_global_param *param);
 void cleanup_digest();
 
-int init_digest_hash(void);
+int init_digest_hash(const PFS_global_param *param);
 void cleanup_digest_hash(void);
 PFS_statement_stat* find_or_create_digest(PFS_thread *thread,
                                           const sql_digest_storage *digest_storage,
@@ -91,3 +91,4 @@ extern PFS_statements_digest_stat *statements_digest_stat_array;
 extern LF_HASH digest_hash;
 
 #endif
+
diff --git a/storage/perfschema/pfs_engine_table.cc b/storage/perfschema/pfs_engine_table.cc
index acab0e73a3d..afd69ba8920 100644
--- a/storage/perfschema/pfs_engine_table.cc
+++ b/storage/perfschema/pfs_engine_table.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,9 +26,10 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "hostname.h" /* For Host_entry */
 #include "pfs_engine_table.h"
+#include "pfs_buffer_container.h"
 
 #include "table_events_waits.h"
 #include "table_setup_actors.h"
@@ -69,6 +70,14 @@
 #include "table_esms_by_account_by_event_name.h"
 #include "table_esms_global_by_event_name.h"
 #include "table_esms_by_digest.h"
+#include "table_esms_by_program.h"
+
+#include "table_events_transactions.h"
+#include "table_ets_by_thread_by_event_name.h"
+#include "table_ets_by_host_by_event_name.h"
+#include "table_ets_by_user_by_event_name.h"
+#include "table_ets_by_account_by_event_name.h"
+#include "table_ets_global_by_event_name.h"
 
 #include "table_users.h"
 #include "table_accounts.h"
@@ -79,6 +88,39 @@
 #include "table_socket_summary_by_event_name.h"
 #include "table_session_connect_attrs.h"
 #include "table_session_account_connect_attrs.h"
+#include "table_mems_global_by_event_name.h"
+#include "table_mems_by_account_by_event_name.h"
+#include "table_mems_by_host_by_event_name.h"
+#include "table_mems_by_thread_by_event_name.h"
+#include "table_mems_by_user_by_event_name.h"
+
+/* For replication related perfschema tables. */
+#include "table_replication_connection_configuration.h"
+#include "table_replication_group_members.h"
+#include "table_replication_connection_status.h"
+#include "table_replication_applier_configuration.h"
+#include "table_replication_applier_status.h"
+#include "table_replication_applier_status_by_coordinator.h"
+#include "table_replication_applier_status_by_worker.h"
+#include "table_replication_group_member_stats.h"
+
+#include "table_prepared_stmt_instances.h"
+
+#include "table_md_locks.h"
+#include "table_table_handles.h"
+
+#include "table_uvar_by_thread.h"
+
+#include "table_status_by_account.h"
+#include "table_status_by_host.h"
+#include "table_status_by_thread.h"
+#include "table_status_by_user.h"
+#include "table_global_status.h"
+#include "table_session_status.h"
+
+#include "table_variables_by_thread.h"
+#include "table_global_variables.h"
+#include "table_session_variables.h"
 
 /* For show status */
 #include "pfs_column_values.h"
@@ -91,12 +133,103 @@
 
 #include "sql_base.h"                           // close_thread_tables
 #include "lock.h"                               // MYSQL_LOCK_IGNORE_TIMEOUT
+#include "log.h"
 
 /**
   @addtogroup Performance_schema_engine
   @{
 */
 
+bool PFS_table_context::initialize(void)
+{
+  if (m_restore)
+  {
+    /* Restore mode: adopt version, map and map size from the context in TLS. */
+    PFS_table_context *context= static_cast<PFS_table_context *>(my_get_thread_local(m_thr_key));
+    assert(context != NULL);
+    /* When assert() is compiled out, a missing TLS context leaves the fields as constructed. */
+    if(context)
+    {
+      m_last_version= context->m_current_version;
+      m_map= context->m_map;
+      assert(m_map_size == context->m_map_size);
+      m_map_size= context->m_map_size;
+    }
+  }
+  else
+  {
+    /* The TLS value read here is not checked (assert disabled); it is overwritten just below. */
+    PFS_table_context *context= static_cast<PFS_table_context *>(my_get_thread_local(m_thr_key));
+    //assert(context == NULL);
+
+    context= this;
+
+    /* Initialize a new context; it is published to TLS at the end of this branch. */
+    m_last_version= m_current_version;
+    m_map= NULL;
+
+    /* Allocate a bitmap to record which threads are materialized. */
+    if (m_map_size > 0)
+    {
+      THD *thd= current_thd;
+      ulong words= m_map_size / m_word_size + (m_map_size % m_word_size > 0);
+      m_map= (ulong *)thd->calloc(words * m_word_size);
+    }
+
+    /* Write to TLS so that a later context built with restore=true can find this one. */
+    my_set_thread_local(m_thr_key, static_cast<void *>(context));
+  }
+
+  m_initialized= (m_map_size > 0) ? (m_map != NULL) : true;
+
+  return m_initialized;
+}
+
+/* Constructor for global or single thread tables: map size is 0, so initialize() allocates no bitmap. */
+PFS_table_context::PFS_table_context(ulonglong current_version, bool restore, thread_local_key_t key) :
+                   m_thr_key(key), m_current_version(current_version), m_last_version(0),
+                   m_map(NULL), m_map_size(0),
+                   m_restore(restore), m_initialized(false), m_last_item(0)
+{
+  initialize();
+}
+
+/* Constructor for by-thread or aggregate tables: map_size = max thread/user/host/account; calls initialize(). */
+PFS_table_context::PFS_table_context(ulonglong current_version, ulong map_size, bool restore, thread_local_key_t key) :
+                   m_thr_key(key), m_current_version(current_version), m_last_version(0),
+                   m_map(NULL), m_map_size(map_size),
+                   m_restore(restore), m_initialized(false), m_last_item(0)
+{
+  initialize();
+}
+
+PFS_table_context::~PFS_table_context(void)
+{
+  /*
+    Currently a no-op: the TLS slot written by initialize() is left as is.
+    TODO: determine when the final use of a restored context (m_restore)
+    happens, so that my_set_thread_local(m_thr_key, NULL) can be called here.
+  */
+}
+
+void PFS_table_context::set_item(ulong n)
+{
+  /* m_last_item starts at 0, so a "same as last item" shortcut would never */
+  /* record item 0; OR-ing the bit again is harmless. No bitmap: no-op.    */
+  if (m_map == NULL)
+    return;
+  m_map[n / m_word_size] |= (1UL << (n % m_word_size));
+  m_last_item= n;
+}
+
+bool PFS_table_context::is_item_set(ulong n)
+{
+  /* Pick the word that holds item n, then test that item's bit in it. */
+  const ulong mask= 1UL << (n % m_word_size);
+  return (m_map[n / m_word_size] & mask) != 0;
+}
+
+
 static PFS_engine_table_share *all_shares[]=
 {
   &table_cond_instances::m_share,
@@ -145,6 +278,16 @@ static PFS_engine_table_share *all_shares[]=
   &table_esms_by_host_by_event_name::m_share,
   &table_esms_global_by_event_name::m_share,
   &table_esms_by_digest::m_share,
+  &table_esms_by_program::m_share,
+
+  &table_events_transactions_current::m_share,
+  &table_events_transactions_history::m_share,
+  &table_events_transactions_history_long::m_share,
+  &table_ets_by_thread_by_event_name::m_share,
+  &table_ets_by_account_by_event_name::m_share,
+  &table_ets_by_user_by_event_name::m_share,
+  &table_ets_by_host_by_event_name::m_share,
+  &table_ets_global_by_event_name::m_share,
 
   &table_users::m_share,
   &table_accounts::m_share,
@@ -153,8 +296,43 @@ static PFS_engine_table_share *all_shares[]=
   &table_socket_instances::m_share,
   &table_socket_summary_by_instance::m_share,
   &table_socket_summary_by_event_name::m_share,
+
   &table_session_connect_attrs::m_share,
   &table_session_account_connect_attrs::m_share,
+
+  &table_mems_global_by_event_name::m_share,
+  &table_mems_by_account_by_event_name::m_share,
+  &table_mems_by_host_by_event_name::m_share,
+  &table_mems_by_thread_by_event_name::m_share,
+  &table_mems_by_user_by_event_name::m_share,
+  &table_table_handles::m_share,
+  &table_metadata_locks::m_share,
+
+#ifdef HAVE_REPLICATION
+  &table_replication_connection_configuration::m_share,
+  //&table_replication_group_members::m_share,
+  //&table_replication_connection_status::m_share,
+  &table_replication_applier_configuration::m_share,
+  &table_replication_applier_status::m_share,
+  &table_replication_applier_status_by_coordinator::m_share,
+  //&table_replication_applier_status_by_worker::m_share,
+  //&table_replication_group_member_stats::m_share,
+#endif
+
+  &table_prepared_stmt_instances::m_share,
+
+  &table_uvar_by_thread::m_share,
+  &table_status_by_account::m_share,
+  &table_status_by_host::m_share,
+  &table_status_by_thread::m_share,
+  &table_status_by_user::m_share,
+  &table_global_status::m_share,
+  &table_session_status::m_share,
+
+  //&table_variables_by_thread::m_share,
+  //&table_global_variables::m_share,
+  //&table_session_variables::m_share,
+
   NULL
 };
 
@@ -178,11 +356,7 @@ void PFS_engine_table_share::delete_all_locks(void)
 
 ha_rows PFS_engine_table_share::get_row_count(void) const
 {
-  /* If available, count the exact number or records */
-  if (m_get_row_count)
-    return m_get_row_count();
-  /* Otherwise, return an estimate */
-  return m_records;
+  return m_get_row_count();
 }
 
 int PFS_engine_table_share::write_row(TABLE *table, const unsigned char *buf,
@@ -349,16 +523,30 @@ void PFS_engine_table::get_normalizer(PFS_instr_class *instr_class)
   }
 }
 
+void PFS_engine_table::set_field_long(Field *f, long value)
+{
+  /* Counterpart of set_field_ulong(): store() gets false here, not true. */
+  assert(f->real_type() == MYSQL_TYPE_LONG);
+  static_cast<Field_long*>(f)->store(value, false);
+}
+
 void PFS_engine_table::set_field_ulong(Field *f, ulong value)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_LONG);
+  assert(f->real_type() == MYSQL_TYPE_LONG);
   Field_long *f2= (Field_long*) f;
   f2->store(value, true);
 }
 
+void PFS_engine_table::set_field_longlong(Field *f, longlong value)
+{
+  /* Counterpart of set_field_ulonglong(): store() gets false here, not true. */
+  assert(f->real_type() == MYSQL_TYPE_LONGLONG);
+  static_cast<Field_longlong*>(f)->store(value, false);
+}
+
 void PFS_engine_table::set_field_ulonglong(Field *f, ulonglong value)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_LONGLONG);
+  assert(f->real_type() == MYSQL_TYPE_LONGLONG);
   Field_longlong *f2= (Field_longlong*) f;
   f2->store(value, true);
 }
@@ -366,30 +554,48 @@ void PFS_engine_table::set_field_ulonglong(Field *f, ulonglong value)
 void PFS_engine_table::set_field_char_utf8(Field *f, const char* str,
                                            uint len)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_STRING);
+  assert(f->real_type() == MYSQL_TYPE_STRING);
   Field_string *f2= (Field_string*) f;
-  f2->store(str, len, &my_charset_utf8_bin);
+  f2->store(str, len, &my_charset_utf8mb3_bin);
+}
+
+void PFS_engine_table::set_field_varchar(Field *f,
+                                         const CHARSET_INFO *cs,
+                                         const char* str,
+                                         uint len)
+{
+  /* Like set_field_varchar_utf8(), except that the caller supplies the charset. */
+  assert(f->real_type() == MYSQL_TYPE_VARCHAR);
+  static_cast<Field_varstring*>(f)->store(str, len, cs);
 }
 
 void PFS_engine_table::set_field_varchar_utf8(Field *f, const char* str,
                                               uint len)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_VARCHAR);
+  assert(f->real_type() == MYSQL_TYPE_VARCHAR);
   Field_varstring *f2= (Field_varstring*) f;
-  f2->store(str, len, &my_charset_utf8_bin);
+  f2->store(str, len, &my_charset_utf8mb3_bin);
 }
 
 void PFS_engine_table::set_field_longtext_utf8(Field *f, const char* str,
                                                uint len)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_BLOB);
+  assert(f->real_type() == MYSQL_TYPE_BLOB);
   Field_blob *f2= (Field_blob*) f;
-  f2->store(str, len, &my_charset_utf8_bin);
+  f2->store(str, len, &my_charset_utf8mb3_bin);
+}
+
+void PFS_engine_table::set_field_blob(Field *f, const char* val,
+                                      uint len)
+{
+  /* The field must be a BLOB; the bytes are stored with my_charset_utf8mb3_bin. */
+  assert(f->real_type() == MYSQL_TYPE_BLOB);
+  static_cast<Field_blob*>(f)->store(val, len, &my_charset_utf8mb3_bin);
 }
 
 void PFS_engine_table::set_field_enum(Field *f, ulonglong value)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_ENUM);
+  assert(f->real_type() == MYSQL_TYPE_ENUM);
   Field_enum *f2= (Field_enum*) f;
   f2->store_type(value);
 }
@@ -401,9 +607,16 @@ void PFS_engine_table::set_field_timestamp(Field *f, ulonglong value)
   f2->store_TIME((long)(value / 1000000), (value % 1000000));
 }
 
+void PFS_engine_table::set_field_double(Field *f, double value)
+{
+  /* The field must be a DOUBLE; any other real_type() trips the assert. */
+  assert(f->real_type() == MYSQL_TYPE_DOUBLE);
+  static_cast<Field_double*>(f)->store(value);
+}
+
 ulonglong PFS_engine_table::get_field_enum(Field *f)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_ENUM);
+  assert(f->real_type() == MYSQL_TYPE_ENUM);
   Field_enum *f2= (Field_enum*) f;
   return f2->val_int();
 }
@@ -411,7 +624,7 @@ ulonglong PFS_engine_table::get_field_enum(Field *f)
 String*
 PFS_engine_table::get_field_char_utf8(Field *f, String *val)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_STRING);
+  assert(f->real_type() == MYSQL_TYPE_STRING);
   Field_string *f2= (Field_string*) f;
   val= f2->val_str(NULL, val);
   return val;
@@ -420,7 +633,7 @@ PFS_engine_table::get_field_char_utf8(Field *f, String *val)
 String*
 PFS_engine_table::get_field_varchar_utf8(Field *f, String *val)
 {
-  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_VARCHAR);
+  assert(f->real_type() == MYSQL_TYPE_VARCHAR);
   Field_varstring *f2= (Field_varstring*) f;
   val= f2->val_str(NULL, val);
   return val;
@@ -444,22 +657,22 @@ public:
   ~PFS_internal_schema_access()
   {}
 
-  ACL_internal_access_result check(ulong want_access,
-                                   ulong *save_priv) const;
+  ACL_internal_access_result check(privilege_t want_access,
+                                   privilege_t *save_priv) const;
 
   const ACL_internal_table_access *lookup(const char *name) const;
 };
 
 ACL_internal_access_result
-PFS_internal_schema_access::check(ulong want_access,
-                                  ulong *save_priv)  const
+PFS_internal_schema_access::check(privilege_t want_access,
+                                  privilege_t *save_priv)  const
 {
-  const ulong always_forbidden= /* CREATE_ACL | */ REFERENCES_ACL
+  const privilege_t always_forbidden= /* CREATE_ACL | */ REFERENCES_ACL
     | INDEX_ACL | ALTER_ACL | CREATE_TMP_ACL | EXECUTE_ACL
     | CREATE_VIEW_ACL | SHOW_VIEW_ACL | CREATE_PROC_ACL | ALTER_PROC_ACL
     | EVENT_ACL | TRIGGER_ACL ;
 
-  if (unlikely(want_access & always_forbidden))
+  if (unlikely((want_access & always_forbidden) != NO_ACL))
     return ACL_INTERNAL_ACCESS_DENIED;
 
   /*
@@ -501,46 +714,108 @@ void initialize_performance_schema_acl(bool bootstrap)
   }
 }
 
+static bool allow_drop_table_privilege() {
+  /*
+    DROP_ACL is shared by several statements: TRUNCATE TABLE, DROP TABLE
+    and ALTER TABLE. Only TRUNCATE may go through here; DROP and ALTER
+    stay forbidden. GRANT is accepted as well, so that the truncate
+    privilege can be transferred.
+  */
+  THD *session= current_thd;
+  if (session == NULL)
+    return false;                /* no current THD: refuse */
+
+  assert(session->lex != NULL);
+  switch (session->lex->sql_command)
+  {
+  case SQLCOM_TRUNCATE:
+  case SQLCOM_GRANT:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+
 PFS_readonly_acl pfs_readonly_acl;
 
 ACL_internal_access_result
-PFS_readonly_acl::check(ulong want_access, ulong *save_priv) const
+PFS_readonly_acl::check(privilege_t want_access, privilege_t *save_priv) const
 {
-  const ulong always_forbidden= INSERT_ACL | UPDATE_ACL | DELETE_ACL
+  const privilege_t always_forbidden= INSERT_ACL | UPDATE_ACL | DELETE_ACL
     | /* CREATE_ACL | */ REFERENCES_ACL | INDEX_ACL | ALTER_ACL
     | CREATE_VIEW_ACL | SHOW_VIEW_ACL | TRIGGER_ACL | LOCK_TABLES_ACL;
 
-  if (unlikely(want_access & always_forbidden))
+  if (unlikely((want_access & always_forbidden) != NO_ACL))
     return ACL_INTERNAL_ACCESS_DENIED;
 
   return ACL_INTERNAL_ACCESS_CHECK_GRANT;
 }
 
+
+PFS_readonly_world_acl pfs_readonly_world_acl;
+
+ACL_internal_access_result
+PFS_readonly_world_acl::check(privilege_t want_access, privilege_t *save_priv) const
+{
+  /* Start from the PFS_readonly_acl verdict; anything but CHECK_GRANT is final. */
+  const ACL_internal_access_result base= PFS_readonly_acl::check(want_access, save_priv);
+  if (base != ACL_INTERNAL_ACCESS_CHECK_GRANT)
+    return base;
+
+  /* A request for exactly SELECT_ACL is upgraded to GRANTED. */
+  return (want_access == SELECT_ACL) ? ACL_INTERNAL_ACCESS_GRANTED : base;
+}
+
+
 PFS_truncatable_acl pfs_truncatable_acl;
 
 ACL_internal_access_result
-PFS_truncatable_acl::check(ulong want_access, ulong *save_priv) const
+PFS_truncatable_acl::check(privilege_t want_access, privilege_t *save_priv) const
 {
-  const ulong always_forbidden= INSERT_ACL | UPDATE_ACL | DELETE_ACL
+  const privilege_t always_forbidden= INSERT_ACL | UPDATE_ACL | DELETE_ACL
     | /* CREATE_ACL | */ REFERENCES_ACL | INDEX_ACL | ALTER_ACL
     | CREATE_VIEW_ACL | SHOW_VIEW_ACL | TRIGGER_ACL | LOCK_TABLES_ACL;
 
-  if (unlikely(want_access & always_forbidden))
+  if (unlikely((want_access & always_forbidden) != NO_ACL))
     return ACL_INTERNAL_ACCESS_DENIED;
 
   return ACL_INTERNAL_ACCESS_CHECK_GRANT;
 }
 
+
+PFS_truncatable_world_acl pfs_truncatable_world_acl;
+
+ACL_internal_access_result
+PFS_truncatable_world_acl::check(privilege_t want_access, privilege_t *save_priv) const
+{
+  /* Start from the PFS_truncatable_acl verdict; anything but CHECK_GRANT is final. */
+  const ACL_internal_access_result base= PFS_truncatable_acl::check(want_access, save_priv);
+  if (base != ACL_INTERNAL_ACCESS_CHECK_GRANT)
+    return base;
+
+  /* A request for exactly SELECT_ACL is upgraded to GRANTED. */
+  if (want_access == SELECT_ACL)
+    return ACL_INTERNAL_ACCESS_GRANTED;
+
+  /* Exactly DROP_ACL: upgraded only when allow_drop_table_privilege() agrees. */
+  const bool drop_ok= (want_access == DROP_ACL) && allow_drop_table_privilege();
+  return drop_ok ? ACL_INTERNAL_ACCESS_GRANTED : base;
+}
+
+
 PFS_updatable_acl pfs_updatable_acl;
 
 ACL_internal_access_result
-PFS_updatable_acl::check(ulong want_access, ulong *save_priv) const
+PFS_updatable_acl::check(privilege_t want_access, privilege_t *save_priv) const
 {
-  const ulong always_forbidden= INSERT_ACL | DELETE_ACL
+  const privilege_t always_forbidden= INSERT_ACL | DELETE_ACL
     | /* CREATE_ACL | */ REFERENCES_ACL | INDEX_ACL | ALTER_ACL
     | CREATE_VIEW_ACL | SHOW_VIEW_ACL | TRIGGER_ACL;
 
-  if (unlikely(want_access & always_forbidden))
+  if (unlikely((want_access & always_forbidden) != NO_ACL))
     return ACL_INTERNAL_ACCESS_DENIED;
 
   return ACL_INTERNAL_ACCESS_CHECK_GRANT;
@@ -549,12 +824,12 @@ PFS_updatable_acl::check(ulong want_access, ulong *save_priv) const
 PFS_editable_acl pfs_editable_acl;
 
 ACL_internal_access_result
-PFS_editable_acl::check(ulong want_access, ulong *save_priv) const
+PFS_editable_acl::check(privilege_t want_access, privilege_t *save_priv) const
 {
-  const ulong always_forbidden= /* CREATE_ACL | */ REFERENCES_ACL
+  const privilege_t always_forbidden= /* CREATE_ACL | */ REFERENCES_ACL
     | INDEX_ACL | ALTER_ACL | CREATE_VIEW_ACL | SHOW_VIEW_ACL | TRIGGER_ACL;
 
-  if (unlikely(want_access & always_forbidden))
+  if (unlikely((want_access & always_forbidden) != NO_ACL))
     return ACL_INTERNAL_ACCESS_DENIED;
 
   return ACL_INTERNAL_ACCESS_CHECK_GRANT;
@@ -563,13 +838,13 @@ PFS_editable_acl::check(ulong want_access, ulong *save_priv) const
 PFS_unknown_acl pfs_unknown_acl;
 
 ACL_internal_access_result
-PFS_unknown_acl::check(ulong want_access, ulong *save_priv) const
+PFS_unknown_acl::check(privilege_t want_access, privilege_t *save_priv) const
 {
-  const ulong always_forbidden= CREATE_ACL
+  const privilege_t always_forbidden= CREATE_ACL
     | REFERENCES_ACL | INDEX_ACL | ALTER_ACL
     | CREATE_VIEW_ACL | TRIGGER_ACL;
 
-  if (unlikely(want_access & always_forbidden))
+  if (unlikely((want_access & always_forbidden) != NO_ACL))
     return ACL_INTERNAL_ACCESS_DENIED;
 
   /*
@@ -619,33 +894,33 @@ bool pfs_show_status(handlerton *hton, THD *thd,
   {
     switch (i){
     case 0:
-      name= "events_waits_current.row_size";
+      name= "events_waits_current.size";
       size= sizeof(PFS_events_waits);
       break;
     case 1:
-      name= "events_waits_current.row_count";
-      size= WAIT_STACK_SIZE * thread_max;
+      name= "events_waits_current.count";
+      size= WAIT_STACK_SIZE * global_thread_container.get_row_count();
       break;
     case 2:
-      name= "events_waits_history.row_size";
+      name= "events_waits_history.size";
       size= sizeof(PFS_events_waits);
       break;
     case 3:
-      name= "events_waits_history.row_count";
-      size= events_waits_history_per_thread * thread_max;
+      name= "events_waits_history.count";
+      size= events_waits_history_per_thread * global_thread_container.get_row_count();
       break;
     case 4:
       name= "events_waits_history.memory";
-      size= events_waits_history_per_thread * thread_max
+      size= events_waits_history_per_thread * global_thread_container.get_row_count()
         * sizeof(PFS_events_waits);
       total_memory+= size;
       break;
     case 5:
-      name= "events_waits_history_long.row_size";
+      name= "events_waits_history_long.size";
       size= sizeof(PFS_events_waits);
       break;
     case 6:
-      name= "events_waits_history_long.row_count";
+      name= "events_waits_history_long.count";
       size= events_waits_history_long_size;
       break;
     case 7:
@@ -654,11 +929,11 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 8:
-      name= "(pfs_mutex_class).row_size";
+      name= "(pfs_mutex_class).size";
       size= sizeof(PFS_mutex_class);
       break;
     case 9:
-      name= "(pfs_mutex_class).row_count";
+      name= "(pfs_mutex_class).count";
       size= mutex_class_max;
       break;
     case 10:
@@ -667,11 +942,11 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 11:
-      name= "(pfs_rwlock_class).row_size";
+      name= "(pfs_rwlock_class).size";
       size= sizeof(PFS_rwlock_class);
       break;
     case 12:
-      name= "(pfs_rwlock_class).row_count";
+      name= "(pfs_rwlock_class).count";
       size= rwlock_class_max;
       break;
     case 13:
@@ -680,11 +955,11 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 14:
-      name= "(pfs_cond_class).row_size";
+      name= "(pfs_cond_class).size";
       size= sizeof(PFS_cond_class);
       break;
     case 15:
-      name= "(pfs_cond_class).row_count";
+      name= "(pfs_cond_class).count";
       size= cond_class_max;
       break;
     case 16:
@@ -693,11 +968,11 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 17:
-      name= "(pfs_thread_class).row_size";
+      name= "(pfs_thread_class).size";
       size= sizeof(PFS_thread_class);
       break;
     case 18:
-      name= "(pfs_thread_class).row_count";
+      name= "(pfs_thread_class).count";
       size= thread_class_max;
       break;
     case 19:
@@ -706,11 +981,11 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 20:
-      name= "(pfs_file_class).row_size";
+      name= "(pfs_file_class).size";
       size= sizeof(PFS_file_class);
       break;
     case 21:
-      name= "(pfs_file_class).row_count";
+      name= "(pfs_file_class).count";
       size= file_class_max;
       break;
     case 22:
@@ -719,76 +994,76 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 23:
-      name= "mutex_instances.row_size";
-      size= sizeof(PFS_mutex);
+      name= "mutex_instances.size";
+      size= global_mutex_container.get_row_size();
       break;
     case 24:
-      name= "mutex_instances.row_count";
-      size= mutex_max;
+      name= "mutex_instances.count";
+      size= global_mutex_container.get_row_count();
       break;
     case 25:
       name= "mutex_instances.memory";
-      size= mutex_max * sizeof(PFS_mutex);
+      size= global_mutex_container.get_memory();
       total_memory+= size;
       break;
     case 26:
-      name= "rwlock_instances.row_size";
-      size= sizeof(PFS_rwlock);
+      name= "rwlock_instances.size";
+      size= global_rwlock_container.get_row_size();
       break;
     case 27:
-      name= "rwlock_instances.row_count";
-      size= rwlock_max;
+      name= "rwlock_instances.count";
+      size= global_rwlock_container.get_row_count();
       break;
     case 28:
       name= "rwlock_instances.memory";
-      size= rwlock_max * sizeof(PFS_rwlock);
+      size= global_rwlock_container.get_memory();
       total_memory+= size;
       break;
     case 29:
-      name= "cond_instances.row_size";
-      size= sizeof(PFS_cond);
+      name= "cond_instances.size";
+      size= global_cond_container.get_row_size();
       break;
     case 30:
-      name= "cond_instances.row_count";
-      size= cond_max;
+      name= "cond_instances.count";
+      size= global_cond_container.get_row_count();
       break;
     case 31:
       name= "cond_instances.memory";
-      size= cond_max * sizeof(PFS_cond);
+      size= global_cond_container.get_memory();
       total_memory+= size;
       break;
     case 32:
-      name= "threads.row_size";
-      size= sizeof(PFS_thread);
+      name= "threads.size";
+      size= global_thread_container.get_row_size();
       break;
     case 33:
-      name= "threads.row_count";
-      size= thread_max;
+      name= "threads.count";
+      size= global_thread_container.get_row_count();
       break;
     case 34:
       name= "threads.memory";
-      size= thread_max * sizeof(PFS_thread);
+      size= global_thread_container.get_memory();
       total_memory+= size;
       break;
     case 35:
-      name= "file_instances.row_size";
-      size= sizeof(PFS_file);
+      name= "file_instances.size";
+      size= global_file_container.get_row_size();
       break;
     case 36:
-      name= "file_instances.row_count";
-      size= file_max;
+      name= "file_instances.count";
+      size= global_file_container.get_row_count();
       break;
     case 37:
       name= "file_instances.memory";
-      size= file_max * sizeof(PFS_file);
+      size= global_file_container.get_memory();
       total_memory+= size;
       break;
     case 38:
-      name= "(pfs_file_handle).row_size";
+      name= "(pfs_file_handle).size";
       size= sizeof(PFS_file*);
       break;
     case 39:
-      name= "(pfs_file_handle).row_count";
+      name= "(pfs_file_handle).count";
       size= file_handle_max;
       break;
     case 40:
@@ -797,154 +1072,154 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 41:
-      name= "events_waits_summary_by_thread_by_event_name.row_size";
+      name= "events_waits_summary_by_thread_by_event_name.size";
       size= sizeof(PFS_single_stat);
       break;
     case 42:
-      name= "events_waits_summary_by_thread_by_event_name.row_count";
-      size= thread_max * wait_class_max;
+      name= "events_waits_summary_by_thread_by_event_name.count";
+      size= global_thread_container.get_row_count() * wait_class_max;
       break;
     case 43:
       name= "events_waits_summary_by_thread_by_event_name.memory";
-      size= thread_max * wait_class_max * sizeof(PFS_single_stat);
+      size= global_thread_container.get_row_count() * wait_class_max * sizeof(PFS_single_stat);
       total_memory+= size;
       break;
     case 44:
-      name= "(pfs_table_share).row_size";
-      size= sizeof(PFS_table_share);
+      name= "(pfs_table_share).size";
+      size= global_table_share_container.get_row_size();
       break;
     case 45:
-      name= "(pfs_table_share).row_count";
-      size= table_share_max;
+      name= "(pfs_table_share).count";
+      size= global_table_share_container.get_row_count();
       break;
     case 46:
       name= "(pfs_table_share).memory";
-      size= table_share_max * sizeof(PFS_table_share);
+      size= global_table_share_container.get_memory();
       total_memory+= size;
       break;
     case 47:
-      name= "(pfs_table).row_size";
-      size= sizeof(PFS_table);
+      name= "(pfs_table).size";
+      size= global_table_container.get_row_size();
       break;
     case 48:
-      name= "(pfs_table).row_count";
-      size= table_max;
+      name= "(pfs_table).count";
+      size= global_table_container.get_row_count();
       break;
     case 49:
       name= "(pfs_table).memory";
-      size= table_max * sizeof(PFS_table);
+      size= global_table_container.get_memory();
       total_memory+= size;
       break;
     case 50:
-      name= "setup_actors.row_size";
-      size= sizeof(PFS_setup_actor);
+      name= "setup_actors.size";
+      size= global_setup_actor_container.get_row_size();
       break;
     case 51:
-      name= "setup_actors.row_count";
-      size= setup_actor_max;
+      name= "setup_actors.count";
+      size= global_setup_actor_container.get_row_count();
       break;
     case 52:
       name= "setup_actors.memory";
-      size= setup_actor_max * sizeof(PFS_setup_actor);
+      size= global_setup_actor_container.get_memory();
       total_memory+= size;
       break;
     case 53:
-      name= "setup_objects.row_size";
-      size= sizeof(PFS_setup_object);
+      name= "setup_objects.size";
+      size= global_setup_object_container.get_row_size();
       break;
     case 54:
-      name= "setup_objects.row_count";
-      size= setup_object_max;
+      name= "setup_objects.count";
+      size= global_setup_object_container.get_row_count();
       break;
     case 55:
       name= "setup_objects.memory";
-      size= setup_object_max * sizeof(PFS_setup_object);
+      size= global_setup_object_container.get_memory();
       total_memory+= size;
       break;
     case 56:
-      name= "(pfs_account).row_size";
-      size= sizeof(PFS_account);
+      name= "(pfs_account).size";
+      size= global_account_container.get_row_size();
       break;
     case 57:
-      name= "(pfs_account).row_count";
-      size= account_max;
+      name= "(pfs_account).count";
+      size= global_account_container.get_row_count();
       break;
     case 58:
       name= "(pfs_account).memory";
-      size= account_max * sizeof(PFS_account);
+      size= global_account_container.get_memory();
       total_memory+= size;
       break;
     case 59:
-      name= "events_waits_summary_by_account_by_event_name.row_size";
+      name= "events_waits_summary_by_account_by_event_name.size";
       size= sizeof(PFS_single_stat);
       break;
     case 60:
-      name= "events_waits_summary_by_account_by_event_name.row_count";
-      size= account_max * wait_class_max;
+      name= "events_waits_summary_by_account_by_event_name.count";
+      size= global_account_container.get_row_count() * wait_class_max;
       break;
     case 61:
       name= "events_waits_summary_by_account_by_event_name.memory";
-      size= account_max * wait_class_max * sizeof(PFS_single_stat);
+      size= global_account_container.get_row_count() * wait_class_max * sizeof(PFS_single_stat);
       total_memory+= size;
       break;
     case 62:
-      name= "events_waits_summary_by_user_by_event_name.row_size";
+      name= "events_waits_summary_by_user_by_event_name.size";
       size= sizeof(PFS_single_stat);
       break;
     case 63:
-      name= "events_waits_summary_by_user_by_event_name.row_count";
-      size= user_max * wait_class_max;
+      name= "events_waits_summary_by_user_by_event_name.count";
+      size= global_user_container.get_row_count() * wait_class_max;
       break;
     case 64:
       name= "events_waits_summary_by_user_by_event_name.memory";
-      size= user_max * wait_class_max * sizeof(PFS_single_stat);
+      size= global_user_container.get_row_count() * wait_class_max * sizeof(PFS_single_stat);
       total_memory+= size;
       break;
     case 65:
-      name= "events_waits_summary_by_host_by_event_name.row_size";
+      name= "events_waits_summary_by_host_by_event_name.size";
       size= sizeof(PFS_single_stat);
       break;
     case 66:
-      name= "events_waits_summary_by_host_by_event_name.row_count";
-      size= host_max * wait_class_max;
+      name= "events_waits_summary_by_host_by_event_name.count";
+      size= global_host_container.get_row_count() * wait_class_max;
       break;
     case 67:
       name= "events_waits_summary_by_host_by_event_name.memory";
-      size= host_max * wait_class_max * sizeof(PFS_single_stat);
+      size= global_host_container.get_row_count() * wait_class_max * sizeof(PFS_single_stat);
       total_memory+= size;
       break;
     case 68:
-      name= "(pfs_user).row_size";
-      size= sizeof(PFS_user);
+      name= "(pfs_user).size";
+      size= global_user_container.get_row_size();
       break;
     case 69:
-      name= "(pfs_user).row_count";
-      size= user_max;
+      name= "(pfs_user).count";
+      size= global_user_container.get_row_count();
       break;
     case 70:
       name= "(pfs_user).memory";
-      size= user_max * sizeof(PFS_user);
+      size= global_user_container.get_memory();
       total_memory+= size;
       break;
     case 71:
-      name= "(pfs_host).row_size";
-      size= sizeof(PFS_host);
+      name= "(pfs_host).size";
+      size= global_host_container.get_row_size();
       break;
     case 72:
-      name= "(pfs_host).row_count";
-      size= host_max;
+      name= "(pfs_host).count";
+      size= global_host_container.get_row_count();
       break;
     case 73:
       name= "(pfs_host).memory";
-      size= host_max * sizeof(PFS_host);
+      size= global_host_container.get_memory();
       total_memory+= size;
       break;
     case 74:
-      name= "(pfs_stage_class).row_size";
+      name= "(pfs_stage_class).size";
       size= sizeof(PFS_stage_class);
       break;
     case 75:
-      name= "(pfs_stage_class).row_count";
+      name= "(pfs_stage_class).count";
       size= stage_class_max;
       break;
     case 76:
@@ -953,25 +1228,25 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 77:
-      name= "events_stages_history.row_size";
+      name= "events_stages_history.size";
       size= sizeof(PFS_events_stages);
       break;
     case 78:
-      name= "events_stages_history.row_count";
-      size= events_stages_history_per_thread * thread_max;
+      name= "events_stages_history.count";
+      size= events_stages_history_per_thread * global_thread_container.get_row_count();
       break;
     case 79:
       name= "events_stages_history.memory";
-      size= events_stages_history_per_thread * thread_max
+      size= events_stages_history_per_thread * global_thread_container.get_row_count()
         * sizeof(PFS_events_stages);
       total_memory+= size;
       break;
     case 80:
-      name= "events_stages_history_long.row_size";
+      name= "events_stages_history_long.size";
       size= sizeof(PFS_events_stages);
       break;
     case 81:
-      name= "events_stages_history_long.row_count";
+      name= "events_stages_history_long.count";
       size= events_stages_history_long_size;
       break;
     case 82:
@@ -980,24 +1255,24 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 83:
-      name= "events_stages_summary_by_thread_by_event_name.row_size";
+      name= "events_stages_summary_by_thread_by_event_name.size";
       size= sizeof(PFS_stage_stat);
       break;
     case 84:
-      name= "events_stages_summary_by_thread_by_event_name.row_count";
-      size= thread_max * stage_class_max;
+      name= "events_stages_summary_by_thread_by_event_name.count";
+      size= global_thread_container.get_row_count() * stage_class_max;
       break;
     case 85:
       name= "events_stages_summary_by_thread_by_event_name.memory";
-      size= thread_max * stage_class_max * sizeof(PFS_stage_stat);
+      size= global_thread_container.get_row_count() * stage_class_max * sizeof(PFS_stage_stat);
       total_memory+= size;
       break;
     case 86:
-      name= "events_stages_summary_global_by_event_name.row_size";
+      name= "events_stages_summary_global_by_event_name.size";
       size= sizeof(PFS_stage_stat);
       break;
     case 87:
-      name= "events_stages_summary_global_by_event_name.row_count";
+      name= "events_stages_summary_global_by_event_name.count";
       size= stage_class_max;
       break;
     case 88:
@@ -1006,50 +1281,50 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 89:
-      name= "events_stages_summary_by_account_by_event_name.row_size";
+      name= "events_stages_summary_by_account_by_event_name.size";
       size= sizeof(PFS_stage_stat);
       break;
     case 90:
-      name= "events_stages_summary_by_account_by_event_name.row_count";
-      size= account_max * stage_class_max;
+      name= "events_stages_summary_by_account_by_event_name.count";
+      size= global_account_container.get_row_count() * stage_class_max;
       break;
     case 91:
       name= "events_stages_summary_by_account_by_event_name.memory";
-      size= account_max * stage_class_max * sizeof(PFS_stage_stat);
+      size= global_account_container.get_row_count() * stage_class_max * sizeof(PFS_stage_stat);
       total_memory+= size;
       break;
     case 92:
-      name= "events_stages_summary_by_user_by_event_name.row_size";
+      name= "events_stages_summary_by_user_by_event_name.size";
       size= sizeof(PFS_stage_stat);
       break;
     case 93:
-      name= "events_stages_summary_by_user_by_event_name.row_count";
-      size= user_max * stage_class_max;
+      name= "events_stages_summary_by_user_by_event_name.count";
+      size= global_user_container.get_row_count() * stage_class_max;
       break;
     case 94:
       name= "events_stages_summary_by_user_by_event_name.memory";
-      size= user_max * stage_class_max * sizeof(PFS_stage_stat);
+      size= global_user_container.get_row_count() * stage_class_max * sizeof(PFS_stage_stat);
       total_memory+= size;
       break;
     case 95:
-      name= "events_stages_summary_by_host_by_event_name.row_size";
+      name= "events_stages_summary_by_host_by_event_name.size";
       size= sizeof(PFS_stage_stat);
       break;
     case 96:
-      name= "events_stages_summary_by_host_by_event_name.row_count";
-      size= host_max * stage_class_max;
+      name= "events_stages_summary_by_host_by_event_name.count";
+      size= global_host_container.get_row_count() * stage_class_max;
       break;
     case 97:
       name= "events_stages_summary_by_host_by_event_name.memory";
-      size= host_max * stage_class_max * sizeof(PFS_stage_stat);
+      size= global_host_container.get_row_count() * stage_class_max * sizeof(PFS_stage_stat);
       total_memory+= size;
       break;
     case 98:
-      name= "(pfs_statement_class).row_size";
+      name= "(pfs_statement_class).size";
       size= sizeof(PFS_statement_class);
       break;
     case 99:
-      name= "(pfs_statement_class).row_count";
+      name= "(pfs_statement_class).count";
       size= statement_class_max;
       break;
     case 100:
@@ -1058,51 +1333,51 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 101:
-      name= "events_statements_history.row_size";
+      name= "events_statements_history.size";
       size= sizeof(PFS_events_statements);
       break;
     case 102:
-      name= "events_statements_history.row_count";
-      size= events_statements_history_per_thread * thread_max;
+      name= "events_statements_history.count";
+      size= events_statements_history_per_thread * global_thread_container.get_row_count();
       break;
     case 103:
       name= "events_statements_history.memory";
-      size= events_statements_history_per_thread * thread_max
+      size= events_statements_history_per_thread * global_thread_container.get_row_count()
         * sizeof(PFS_events_statements);
       total_memory+= size;
       break;
     case 104:
-      name= "events_statements_history_long.row_size";
+      name= "events_statements_history_long.size";
       size= sizeof(PFS_events_statements);
       break;
     case 105:
-      name= "events_statements_history_long.row_count";
+      name= "events_statements_history_long.count";
       size= events_statements_history_long_size;
       break;
     case 106:
       name= "events_statements_history_long.memory";
-      size= events_statements_history_long_size * sizeof(PFS_events_statements);
+      size= events_statements_history_long_size * (sizeof(PFS_events_statements));
       total_memory+= size;
       break;
     case 107:
-      name= "events_statements_summary_by_thread_by_event_name.row_size";
+      name= "events_statements_summary_by_thread_by_event_name.size";
       size= sizeof(PFS_statement_stat);
       break;
     case 108:
-      name= "events_statements_summary_by_thread_by_event_name.row_count";
-      size= thread_max * statement_class_max;
+      name= "events_statements_summary_by_thread_by_event_name.count";
+      size= global_thread_container.get_row_count() * statement_class_max;
       break;
     case 109:
       name= "events_statements_summary_by_thread_by_event_name.memory";
-      size= thread_max * statement_class_max * sizeof(PFS_statement_stat);
+      size= global_thread_container.get_row_count() * statement_class_max * sizeof(PFS_statement_stat);
       total_memory+= size;
       break;
     case 110:
-      name= "events_statements_summary_global_by_event_name.row_size";
+      name= "events_statements_summary_global_by_event_name.size";
       size= sizeof(PFS_statement_stat);
       break;
     case 111:
-      name= "events_statements_summary_global_by_event_name.row_count";
+      name= "events_statements_summary_global_by_event_name.count";
       size= statement_class_max;
       break;
     case 112:
@@ -1111,63 +1386,63 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 113:
-      name= "events_statements_summary_by_account_by_event_name.row_size";
+      name= "events_statements_summary_by_account_by_event_name.size";
       size= sizeof(PFS_statement_stat);
       break;
     case 114:
-      name= "events_statements_summary_by_account_by_event_name.row_count";
-      size= account_max * statement_class_max;
+      name= "events_statements_summary_by_account_by_event_name.count";
+      size= global_account_container.get_row_count() * statement_class_max;
       break;
     case 115:
       name= "events_statements_summary_by_account_by_event_name.memory";
-      size= account_max * statement_class_max * sizeof(PFS_statement_stat);
+      size= global_account_container.get_row_count() * statement_class_max * sizeof(PFS_statement_stat);
       total_memory+= size;
       break;
     case 116:
-      name= "events_statements_summary_by_user_by_event_name.row_size";
+      name= "events_statements_summary_by_user_by_event_name.size";
       size= sizeof(PFS_statement_stat);
       break;
     case 117:
-      name= "events_statements_summary_by_user_by_event_name.row_count";
-      size= user_max * statement_class_max;
+      name= "events_statements_summary_by_user_by_event_name.count";
+      size= global_user_container.get_row_count() * statement_class_max;
       break;
     case 118:
       name= "events_statements_summary_by_user_by_event_name.memory";
-      size= user_max * statement_class_max * sizeof(PFS_statement_stat);
+      size= global_user_container.get_row_count() * statement_class_max * sizeof(PFS_statement_stat);
       total_memory+= size;
       break;
     case 119:
-      name= "events_statements_summary_by_host_by_event_name.row_size";
+      name= "events_statements_summary_by_host_by_event_name.size";
       size= sizeof(PFS_statement_stat);
       break;
     case 120:
-      name= "events_statements_summary_by_host_by_event_name.row_count";
-      size= host_max * statement_class_max;
+      name= "events_statements_summary_by_host_by_event_name.count";
+      size= global_host_container.get_row_count() * statement_class_max;
       break;
     case 121:
       name= "events_statements_summary_by_host_by_event_name.memory";
-      size= host_max * statement_class_max * sizeof(PFS_statement_stat);
+      size= global_host_container.get_row_count() * statement_class_max * sizeof(PFS_statement_stat);
       total_memory+= size;
       break;
     case 122:
-      name= "events_statements_current.row_size";
+      name= "events_statements_current.size";
       size= sizeof(PFS_events_statements);
       break;
     case 123:
-      name= "events_statements_current.row_count";
-      size= thread_max * statement_stack_max;
+      name= "events_statements_current.count";
+      size= global_thread_container.get_row_count() * statement_stack_max;
       break;
     case 124:
       name= "events_statements_current.memory";
-      size= thread_max * statement_stack_max * sizeof(PFS_events_statements);
+      size= global_thread_container.get_row_count() * statement_stack_max * sizeof(PFS_events_statements);
       total_memory+= size;
       break;
     case 125:
-      name= "(pfs_socket_class).row_size";
+      name= "(pfs_socket_class).size";
       size= sizeof(PFS_socket_class);
       break;
     case 126:
-      name= "(pfs_socket_class).row_count";
+      name= "(pfs_socket_class).count";
       size= socket_class_max;
       break;
     case 127:
@@ -1176,110 +1451,144 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       total_memory+= size;
       break;
     case 128:
-      name= "socket_instances.row_size";
-      size= sizeof(PFS_socket);
+      name= "socket_instances.size";
+      size= global_socket_container.get_row_size();
       break;
     case 129:
-      name= "socket_instances.row_count";
-      size= socket_max;
+      name= "socket_instances.count";
+      size= global_socket_container.get_row_count();
       break;
     case 130:
       name= "socket_instances.memory";
-      size= socket_max * sizeof(PFS_socket);
+      size= global_socket_container.get_memory();
       total_memory+= size;
       break;
     case 131:
-      name= "events_statements_summary_by_digest.row_size";
+      name= "events_statements_summary_by_digest.size";
       size= sizeof(PFS_statements_digest_stat);
       break;
     case 132:
-      name= "events_statements_summary_by_digest.row_count";
+      name= "events_statements_summary_by_digest.count";
       size= digest_max;
       break;
     case 133:
       name= "events_statements_summary_by_digest.memory";
-      size= digest_max * sizeof(PFS_statements_digest_stat);
+      size= digest_max * (sizeof(PFS_statements_digest_stat));
       total_memory+= size;
       break;
     case 134:
-      name= "session_connect_attrs.row_size";
-      size= thread_max;
+      name= "events_statements_summary_by_program.size";
+      size= global_program_container.get_row_size();
       break;
     case 135:
-      name= "session_connect_attrs.row_count";
-      size= session_connect_attrs_size_per_thread;
+      name= "events_statements_summary_by_program.count";
+      size= global_program_container.get_row_count();
       break;
     case 136:
+      name= "events_statements_summary_by_program.memory";
+      size= global_program_container.get_memory();
+      total_memory+= size;
+      break;
+    case 137:
+      name= "session_connect_attrs.size";
+      size= global_thread_container.get_row_count();
+      break;
+    case 138:
+      name= "session_connect_attrs.count";
+      size= session_connect_attrs_size_per_thread;
+      break;
+    case 139:
       name= "session_connect_attrs.memory";
-      size= thread_max * session_connect_attrs_size_per_thread;
+      size= global_thread_container.get_row_count() * session_connect_attrs_size_per_thread;
+      total_memory+= size;
+      break;
+    case 140:
+      name= "prepared_statements_instances.size";
+      size= global_prepared_stmt_container.get_row_size();
+      break;
+    case 141:
+      name= "prepared_statements_instances.count";
+      size= global_prepared_stmt_container.get_row_count();
+      break;
+    case 142:
+      name= "prepared_statements_instances.memory";
+      size= global_prepared_stmt_container.get_memory();
       total_memory+= size;
       break;
 
-    case 137:
+    case 143:
       name= "(account_hash).count";
       size= account_hash.count;
       break;
-    case 138:
+    case 144:
       name= "(account_hash).size";
       size= account_hash.size;
       break;
-    case 139:
+    case 145:
       name= "(digest_hash).count";
       size= digest_hash.count;
       break;
-    case 140:
+    case 146:
       name= "(digest_hash).size";
       size= digest_hash.size;
       break;
-    case 141:
+    case 147:
       name= "(filename_hash).count";
       size= pfs_filename_hash.count;
       break;
-    case 142:
+    case 148:
       name= "(filename_hash).size";
       size= pfs_filename_hash.size;
       break;
-    case 143:
+    case 149:
       name= "(host_hash).count";
       size= host_hash.count;
       break;
-    case 144:
+    case 150:
       name= "(host_hash).size";
       size= host_hash.size;
       break;
-    case 145:
+    case 151:
       name= "(setup_actor_hash).count";
       size= setup_actor_hash.count;
       break;
-    case 146:
+    case 152:
       name= "(setup_actor_hash).size";
       size= setup_actor_hash.size;
       break;
-    case 147:
+    case 153:
       name= "(setup_object_hash).count";
       size= setup_object_hash.count;
       break;
-    case 148:
+    case 154:
       name= "(setup_object_hash).size";
       size= setup_object_hash.size;
       break;
-    case 149:
+    case 155:
       name= "(table_share_hash).count";
       size= table_share_hash.count;
       break;
-    case 150:
+    case 156:
       name= "(table_share_hash).size";
       size= table_share_hash.size;
       break;
-    case 151:
+    case 157:
       name= "(user_hash).count";
       size= user_hash.count;
       break;
-    case 152:
+    case 158:
       name= "(user_hash).size";
       size= user_hash.size;
       break;
-    case 153:
+    case 159:
+      name= "(program_hash).count";
+      size= program_hash.count;
+      break;
+    case 160:
+      name= "(program_hash).size";
+      size= program_hash.size;
+      break;
+    case 161:
       /*
         This is not a performance_schema buffer,
         the data is maintained in the server,
@@ -1291,68 +1600,302 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       name= "host_cache.size";
       size= sizeof(Host_entry);
       break;
-    case 154:
-      name= "(history_long_statements_digest_token_array).row_count";
+
+    case 162:
+      name= "(pfs_memory_class).row_size";
+      size= sizeof(PFS_memory_class);
+      break;
+    case 163:
+      name= "(pfs_memory_class).row_count";
+      size= memory_class_max;
+      break;
+    case 164:
+      name= "(pfs_memory_class).memory";
+      size= memory_class_max * sizeof(PFS_memory_class);
+      total_memory+= size;
+      break;
+
+    case 165:
+      name= "memory_summary_by_thread_by_event_name.row_size";
+      size= sizeof(PFS_memory_stat);
+      break;
+    case 166:
+      name= "memory_summary_by_thread_by_event_name.row_count";
+      size= global_thread_container.get_row_count() * memory_class_max;
+      break;
+    case 167:
+      name= "memory_summary_by_thread_by_event_name.memory";
+      size= global_thread_container.get_row_count() * memory_class_max * sizeof(PFS_memory_stat);
+      total_memory+= size;
+      break;
+    case 168:
+      name= "memory_summary_global_by_event_name.row_size";
+      size= sizeof(PFS_memory_stat);
+      break;
+    case 169:
+      name= "memory_summary_global_by_event_name.row_count";
+      size= memory_class_max;
+      break;
+    case 170:
+      name= "memory_summary_global_by_event_name.memory";
+      size= memory_class_max * sizeof(PFS_memory_stat);
+      total_memory+= size;
+      break;
+    case 171:
+      name= "memory_summary_by_account_by_event_name.row_size";
+      size= sizeof(PFS_memory_stat);
+      break;
+    case 172:
+      name= "memory_summary_by_account_by_event_name.row_count";
+      size= global_account_container.get_row_count() * memory_class_max;
+      break;
+    case 173:
+      name= "memory_summary_by_account_by_event_name.memory";
+      size= global_account_container.get_row_count() * memory_class_max * sizeof(PFS_memory_stat);
+      total_memory+= size;
+      break;
+    case 174:
+      name= "memory_summary_by_user_by_event_name.row_size";
+      size= sizeof(PFS_memory_stat);
+      break;
+    case 175:
+      name= "memory_summary_by_user_by_event_name.row_count";
+      size= global_user_container.get_row_count() * memory_class_max;
+      break;
+    case 176:
+      name= "memory_summary_by_user_by_event_name.memory";
+      size= global_user_container.get_row_count() * memory_class_max * sizeof(PFS_memory_stat);
+      total_memory+= size;
+      break;
+    case 177:
+      name= "memory_summary_by_host_by_event_name.row_size";
+      size= sizeof(PFS_memory_stat);
+      break;
+    case 178:
+      name= "memory_summary_by_host_by_event_name.row_count";
+      size= global_host_container.get_row_count() * memory_class_max;
+      break;
+    case 179:
+      name= "memory_summary_by_host_by_event_name.memory";
+      size= global_host_container.get_row_count() * memory_class_max * sizeof(PFS_memory_stat);
+      total_memory+= size;
+      break;
+    case 180:
+      name= "metadata_locks.row_size";
+      size= global_mdl_container.get_row_size();
+      break;
+    case 181:
+      name= "metadata_locks.row_count";
+      size= global_mdl_container.get_row_count();
+      break;
+    case 182:
+      name= "metadata_locks.memory";
+      size= global_mdl_container.get_memory();
+      total_memory+= size;
+      break;
+    case 183:
+      name= "events_transactions_history.size";
+      size= sizeof(PFS_events_transactions);
+      break;
+    case 184:
+      name= "events_transactions_history.count";
+      size= events_transactions_history_per_thread * global_thread_container.get_row_count();
+      break;
+    case 185:
+      name= "events_transactions_history.memory";
+      size= events_transactions_history_per_thread * global_thread_container.get_row_count()
+        * sizeof(PFS_events_transactions);
+      total_memory+= size;
+      break;
+    case 186:
+      name= "events_transactions_history_long.size";
+      size= sizeof(PFS_events_transactions);
+      break;
+    case 187:
+      name= "events_transactions_history_long.count";
+      size= events_transactions_history_long_size;
+      break;
+    case 188:
+      name= "events_transactions_history_long.memory";
+      size= events_transactions_history_long_size * sizeof(PFS_events_transactions);
+      total_memory+= size;
+      break;
+    case 189:
+      name= "events_transactions_summary_by_thread_by_event_name.size";
+      size= sizeof(PFS_transaction_stat);
+      break;
+    case 190:
+      name= "events_transactions_summary_by_thread_by_event_name.count";
+      size= global_thread_container.get_row_count() * transaction_class_max;
+      break;
+    case 191:
+      name= "events_transactions_summary_by_thread_by_event_name.memory";
+      size= global_thread_container.get_row_count() * transaction_class_max * sizeof(PFS_transaction_stat);
+      total_memory+= size;
+      break;
+    case 192:
+      name= "events_transactions_summary_by_account_by_event_name.size";
+      size= sizeof(PFS_transaction_stat);
+      break;
+    case 193:
+      name= "events_transactions_summary_by_account_by_event_name.count";
+      size= global_account_container.get_row_count() * transaction_class_max;
+      break;
+    case 194:
+      name= "events_transactions_summary_by_account_by_event_name.memory";
+      size= global_account_container.get_row_count() * transaction_class_max * sizeof(PFS_transaction_stat);
+      total_memory+= size;
+      break;
+    case 195:
+      name= "events_transactions_summary_by_user_by_event_name.size";
+      size= sizeof(PFS_transaction_stat);
+      break;
+    case 196:
+      name= "events_transactions_summary_by_user_by_event_name.count";
+      size= global_user_container.get_row_count() * transaction_class_max;
+      break;
+    case 197:
+      name= "events_transactions_summary_by_user_by_event_name.memory";
+      size= global_user_container.get_row_count() * transaction_class_max * sizeof(PFS_transaction_stat);
+      total_memory+= size;
+      break;
+    case 198:
+      name= "events_transactions_summary_by_host_by_event_name.size";
+      size= sizeof(PFS_transaction_stat);
+      break;
+    case 199:
+      name= "events_transactions_summary_by_host_by_event_name.count";
+      size= global_host_container.get_row_count() * transaction_class_max;
+      break;
+    case 200:
+      name= "events_transactions_summary_by_host_by_event_name.memory";
+      size= global_host_container.get_row_count() * transaction_class_max * sizeof(PFS_transaction_stat);
+      total_memory+= size;
+      break;
+    case 201:
+      name= "table_lock_waits_summary_by_table.size";
+      size= global_table_share_lock_container.get_row_size();
+      break;
+    case 202:
+      name= "table_lock_waits_summary_by_table.count";
+      size= global_table_share_lock_container.get_row_count();
+      break;
+    case 203:
+      name= "table_lock_waits_summary_by_table.memory";
+      size= global_table_share_lock_container.get_memory();
+      total_memory+= size;
+      break;
+    case 204:
+      name= "table_io_waits_summary_by_index_usage.size";
+      size= global_table_share_index_container.get_row_size();
+      break;
+    case 205:
+      name= "table_io_waits_summary_by_index_usage.count";
+      size= global_table_share_index_container.get_row_count();
+      break;
+    case 206:
+      name= "table_io_waits_summary_by_index_usage.memory";
+      size= global_table_share_index_container.get_memory();
+      total_memory+= size;
+      break;
+    case 207:
+      name= "(history_long_statements_digest_token_array).count";
       size= events_statements_history_long_size;
       break;
-    case 155:
-      name= "(history_long_statements_digest_token_array).row_size";
+    case 208:
+      name= "(history_long_statements_digest_token_array).size";
       size= pfs_max_digest_length;
       break;
-    case 156:
+    case 209:
       name= "(history_long_statements_digest_token_array).memory";
       size= events_statements_history_long_size * pfs_max_digest_length;
       total_memory+= size;
       break;
-    case 157:
-      name= "(history_statements_digest_token_array).row_count";
-      size= thread_max * events_statements_history_per_thread;
+    case 210:
+      name= "(history_statements_digest_token_array).count";
+      size= global_thread_container.get_row_count() * events_statements_history_per_thread;
       break;
-    case 158:
-      name= "(history_statements_digest_token_array).row_size";
+    case 211:
+      name= "(history_statements_digest_token_array).size";
       size= pfs_max_digest_length;
       break;
-    case 159:
+    case 212:
       name= "(history_statements_digest_token_array).memory";
-      size= thread_max * events_statements_history_per_thread * pfs_max_digest_length;
+      size= global_thread_container.get_row_count() * events_statements_history_per_thread * pfs_max_digest_length;
       total_memory+= size;
       break;
-    case 160:
-      name= "(current_statements_digest_token_array).row_count";
-      size= thread_max * statement_stack_max;
+    case 213:
+      name= "(current_statements_digest_token_array).count";
+      size= global_thread_container.get_row_count() * statement_stack_max;
       break;
-    case 161:
-      name= "(current_statements_digest_token_array).row_size";
+    case 214:
+      name= "(current_statements_digest_token_array).size";
       size= pfs_max_digest_length;
       break;
-    case 162:
+    case 215:
       name= "(current_statements_digest_token_array).memory";
-      size= thread_max * statement_stack_max * pfs_max_digest_length;
+      size= global_thread_container.get_row_count() * statement_stack_max * pfs_max_digest_length;
       total_memory+= size;
       break;
-    case 163:
-      name= "(statements_digest_token_array).row_count";
+    case 216:
+      name= "(history_long_statements_text_array).count";
+      size= events_statements_history_long_size;
+      break;
+    case 217:
+      name= "(history_long_statements_text_array).size";
+      size= pfs_max_sqltext;
+      break;
+    case 218:
+      name= "(history_long_statements_text_array).memory";
+      size= events_statements_history_long_size * pfs_max_sqltext;
+      total_memory+= size;
+      break;
+    case 219:
+      name= "(history_statements_text_array).count";
+      size= global_thread_container.get_row_count() * events_statements_history_per_thread;
+      break;
+    case 220:
+      name= "(history_statements_text_array).size";
+      size= pfs_max_sqltext;
+      break;
+    case 221:
+      name= "(history_statements_text_array).memory";
+      size= global_thread_container.get_row_count() * events_statements_history_per_thread * pfs_max_sqltext;
+      total_memory+= size;
+      break;
+    case 222:
+      name= "(current_statements_text_array).count";
+      size= global_thread_container.get_row_count() * statement_stack_max;
+      break;
+    case 223:
+      name= "(current_statements_text_array).size";
+      size= pfs_max_sqltext;
+      break;
+    case 224:
+      name= "(current_statements_text_array).memory";
+      size= global_thread_container.get_row_count() * statement_stack_max * pfs_max_sqltext;
+      total_memory+= size;
+      break;
+    case 225:
+      name= "(statements_digest_token_array).count";
       size= digest_max;
       break;
-    case 164:
-      name= "(statements_digest_token_array).row_size";
+    case 226:
+      name= "(statements_digest_token_array).size";
       size= pfs_max_digest_length;
       break;
-    case 165:
+    case 227:
       name= "(statements_digest_token_array).memory";
       size= digest_max * pfs_max_digest_length;
       total_memory+= size;
       break;
-
     /*
       This case must be last,
       for aggregation in total_memory.
     */
-    case 166:
+    case 228:
       name= "performance_schema.memory";
       size= total_memory;
-      /* This will fail if something is not advertised here */
-      DBUG_ASSERT(size == pfs_allocated_memory);
       break;
     default:
       goto end;
diff --git a/storage/perfschema/pfs_engine_table.h b/storage/perfschema/pfs_engine_table.h
index 03a2b6a9d28..925a2186a92 100644
--- a/storage/perfschema/pfs_engine_table.h
+++ b/storage/perfschema/pfs_engine_table.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -23,13 +23,24 @@
 #ifndef PFS_ENGINE_TABLE_H
 #define PFS_ENGINE_TABLE_H
 
-#include "sql_acl.h"                            /* struct ACL_* */
+#include "table.h"
+#include "sql_acl.h"
 /**
   @file storage/perfschema/pfs_engine_table.h
   Performance schema tables (declarations).
 */
 
 #include "pfs_instr_class.h"
+extern pthread_key_t THR_PFS_VG;   // global_variables
+extern pthread_key_t THR_PFS_SV;   // session_variables
+extern pthread_key_t THR_PFS_VBT;  // variables_by_thread
+extern pthread_key_t THR_PFS_SG;   // global_status
+extern pthread_key_t THR_PFS_SS;   // session_status
+extern pthread_key_t THR_PFS_SBT;  // status_by_thread
+extern pthread_key_t THR_PFS_SBU;  // status_by_user
+extern pthread_key_t THR_PFS_SBH;  // status_by_host
+extern pthread_key_t THR_PFS_SBA;  // status_by_account
+
 class Field;
 struct PFS_engine_table_share;
 struct time_normalizer;
@@ -40,6 +51,36 @@ struct time_normalizer;
 */
 
 /**
+  Store and retrieve table state information during a query.
+*/
+class PFS_table_context  // non-inline members are only declared here; their definitions are not in this header
+{
+public:
+  PFS_table_context(ulonglong current_version, bool restore, pthread_key_t key);  // NOTE(review): key is presumably one of the THR_PFS_* keys declared above - confirm
+  PFS_table_context(ulonglong current_version, ulong map_size, bool restore, pthread_key_t key);  // as above plus map_size; presumably sizes m_map - confirm in the definition
+~PFS_table_context(void);  // declared only; cleanup behavior is not visible from this header
+
+  bool initialize(void);  // declared only; meaning of the return value is not visible here - confirm in the definition
+  bool is_initialized(void) { return m_initialized; }  // returns the m_initialized flag
+  ulonglong current_version(void) { return m_current_version; }  // returns m_current_version
+  ulonglong last_version(void) { return m_last_version; }  // returns m_last_version
+  bool versions_match(void) { return m_last_version == m_current_version; }  // true when m_last_version equals m_current_version
+  void set_item(ulong n);  // declared only; presumably marks item n in m_map - confirm
+  bool is_item_set(ulong n);  // declared only; presumably tests item n in m_map - confirm
+  pthread_key_t m_thr_key;  // public data member, unlike the private state below
+
+private:
+  ulonglong m_current_version;  // value returned by current_version()
+  ulonglong m_last_version;  // value returned by last_version(); compared with m_current_version in versions_match()
+  ulong *m_map;  // NOTE(review): looks like a bitmap stored as ulong words - confirm; ownership not visible here
+  ulong m_map_size;  // presumably the number of items tracked by m_map - confirm
+  static constexpr ulong m_word_size= 8 * sizeof(ulong);  // bits per ulong word, assuming 8-bit bytes
+  bool m_restore;  // presumably set from the constructors' restore argument - confirm
+  bool m_initialized;  // value returned by is_initialized()
+  ulong m_last_item;  // purpose not visible in this header - confirm in the definition
+};
+
+/**
   An abstract PERFORMANCE_SCHEMA table.
   Every table implemented in the performance schema schema and storage engine
   derives from this class.
@@ -88,12 +129,24 @@ public:
   {}
 
   /**
+    Helper, assign a value to a long field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_long(Field *f, long value);
+  /**
     Helper, assign a value to a ulong field.
     @param f the field to set
     @param value the value to assign
   */
   static void set_field_ulong(Field *f, ulong value);
   /**
+    Helper, assign a value to a longlong field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_longlong(Field *f, longlong value);
+  /**
     Helper, assign a value to a ulonglong field.
     @param f the field to set
     @param value the value to assign
@@ -109,6 +162,14 @@ public:
   /**
     Helper, assign a value to a varchar utf8 field.
     @param f the field to set
+    @param cs the string character set
+    @param str the string to assign
+    @param len the length of the string to assign
+  */
+  static void set_field_varchar(Field *f, const CHARSET_INFO *cs, const char *str, uint len);
+  /**
+    Helper, assign a value to a varchar utf8 field.
+    @param f the field to set
     @param str the string to assign
     @param len the length of the string to assign
   */
@@ -121,6 +182,13 @@ public:
   */
   static void set_field_longtext_utf8(Field *f, const char *str, uint len);
   /**
+    Helper, assign a value to a blob field.
+    @param f the field to set
+    @param val the value to assign
+    @param len the length of the string to assign
+  */
+  static void set_field_blob(Field *f, const char *val, uint len);
+  /**
     Helper, assign a value to an enum field.
     @param f the field to set
     @param value the value to assign
@@ -133,6 +201,12 @@ public:
   */
   static void set_field_timestamp(Field *f, ulonglong value);
   /**
+    Helper, assign a value to a double field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_double(Field *f, double value);
+  /**
     Helper, read a value from an enum field.
     @param f the field to read
     @return the field value
@@ -182,7 +256,6 @@ protected:
   */
   virtual int delete_row_values(TABLE *table, const unsigned char *buf,
                                 Field **fields);
-
   /**
     Constructor.
     @param share            table share
@@ -238,19 +311,14 @@ struct PFS_engine_table_share
   pfs_delete_all_rows_t m_delete_all_rows;
   /** Get rows count function. */
   pfs_get_row_count_t m_get_row_count;
-  /**
-    Number or records.
-    This number does not need to be precise,
-    it is used by the optimizer to decide if the table
-    has 0, 1, or many records.
-  */
-  ha_rows m_records;
   /** Length of the m_pos position structure. */
   uint m_ref_length;
   /** The lock, stored on behalf of the SQL layer. */
   THR_LOCK *m_thr_lock_ptr;
   /** Table definition. */
   LEX_STRING sql;
+  /** Table is available even if the Performance Schema is disabled. */
+  bool m_perpetual;
 };
 
 /**
@@ -266,7 +334,8 @@ public:
   ~PFS_readonly_acl()
   {}
 
-  virtual ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
+  virtual ACL_internal_access_result check(privilege_t want_access,
+                                           privilege_t *save_priv) const;
 };
 
 /** Singleton instance of PFS_readonly_acl. */
@@ -285,7 +354,8 @@ public:
   ~PFS_truncatable_acl()
   {}
 
-  ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
+  ACL_internal_access_result check(privilege_t want_access,
+                                   privilege_t *save_priv) const;
 };
 
 /** Singleton instance of PFS_truncatable_acl. */
@@ -304,7 +374,8 @@ public:
   ~PFS_updatable_acl()
   {}
 
-  ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
+  ACL_internal_access_result check(privilege_t want_access,
+                                   privilege_t *save_priv) const;
 };
 
 /** Singleton instance of PFS_updatable_acl. */
@@ -323,7 +394,8 @@ public:
   ~PFS_editable_acl()
   {}
 
-  ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
+  ACL_internal_access_result check(privilege_t want_access,
+                                   privilege_t *save_priv) const;
 };
 
 /** Singleton instance of PFS_editable_acl. */
@@ -341,12 +413,52 @@ public:
   ~PFS_unknown_acl()
   {}
 
-  ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
+  ACL_internal_access_result check(privilege_t want_access,
+                                   privilege_t *save_priv) const;
 };
 
 /** Singleton instance of PFS_unknown_acl. */
 extern PFS_unknown_acl pfs_unknown_acl;
 
+
+/**
+  Privileges for world readable tables.
+*/
+class PFS_readonly_world_acl : public PFS_readonly_acl
+{
+public:
+  PFS_readonly_world_acl()
+  {}
+
+  ~PFS_readonly_world_acl()
+  {}
+  virtual ACL_internal_access_result check(privilege_t want_access, privilege_t *save_priv) const;
+};
+
+
+/** Singleton instance of PFS_readonly_world_acl */
+extern PFS_readonly_world_acl pfs_readonly_world_acl;
+
+
+/**
+  Privileges for world readable truncatable tables.
+*/
+class PFS_truncatable_world_acl : public PFS_truncatable_acl
+{
+public:
+  PFS_truncatable_world_acl()
+  {}
+
+  ~PFS_truncatable_world_acl()
+  {}
+  virtual ACL_internal_access_result check(privilege_t want_access, privilege_t *save_priv) const;
+};
+
+
+/** Singleton instance of PFS_truncatable_world_acl */
+extern PFS_truncatable_world_acl pfs_truncatable_world_acl;
+
+
 /** Position of a cursor, for simple iterations. */
 struct PFS_simple_index
 {
@@ -363,6 +475,13 @@ struct PFS_simple_index
 
   /**
     Set this index at a given position.
+    @param index an index
+  */
+  void set_at(uint index)
+  { m_index= index; }
+
+  /**
+    Set this index at a given position.
     @param other a position
   */
   void set_at(const struct PFS_simple_index *other)
@@ -399,6 +518,15 @@ struct PFS_double_index
 
   /**
     Set this index at a given position.
+  */
+  void set_at(uint index_1, uint index_2)
+  {
+    m_index_1= index_1;
+    m_index_2= index_2;
+  }
+
+  /**
+    Set this index at a given position.
     @param other a position
   */
   void set_at(const struct PFS_double_index *other)
@@ -440,6 +568,16 @@ struct PFS_triple_index
 
   /**
     Set this index at a given position.
+  */
+  void set_at(uint index_1, uint index_2, uint index_3)
+  {
+    m_index_1= index_1;
+    m_index_2= index_2;
+    m_index_3= index_3;
+  }
+
+  /**
+    Set this index at a given position.
     @param other a position
   */
   void set_at(const struct PFS_triple_index *other)
diff --git a/storage/perfschema/pfs_events.h b/storage/perfschema/pfs_events.h
index ca2fd8582ad..fbe65d6ebad 100644
--- a/storage/perfschema/pfs_events.h
+++ b/storage/perfschema/pfs_events.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,8 +41,16 @@ struct PFS_events
   ulonglong m_event_id;
   /** END_EVENT_ID. */
   ulonglong m_end_event_id;
+  /** (EVENT_TYPE) */
+  enum_event_type m_event_type;
   /** NESTING_EVENT_ID. */
   ulonglong m_nesting_event_id;
+  /** NESTING_EVENT_TYPE */
+  enum_event_type m_nesting_event_type;
+  /** NESTING_EVENT_LEVEL */
+  uint m_nesting_event_level;
+  /** Instrument metadata. */
+  PFS_instr_class *m_class;
   /**
     Timer start.
     This member is populated only if m_class->m_timed is true.
@@ -53,14 +61,8 @@ struct PFS_events
     This member is populated only if m_class->m_timed is true.
   */
   ulonglong m_timer_end;
-  /** Instrument metadata. */
-  PFS_instr_class *m_class;
   /** Location of the instrumentation in the source code (file name). */
   const char *m_source_file;
-  /** (EVENT_TYPE) */
-  enum_event_type m_event_type;
-  /** NESTING_EVENT_TYPE */
-  enum_event_type m_nesting_event_type;
   /** Location of the instrumentation in the source code (line number). */
   uint m_source_line;
 };
diff --git a/storage/perfschema/pfs_events_stages.cc b/storage/perfschema/pfs_events_stages.cc
index a68b0729e96..087e5a1c55f 100644
--- a/storage/perfschema/pfs_events_stages.cc
+++ b/storage/perfschema/pfs_events_stages.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -35,22 +35,24 @@
 #include "pfs_user.h"
 #include "pfs_events_stages.h"
 #include "pfs_atomic.h"
+#include "pfs_buffer_container.h"
+#include "pfs_builtin_memory.h"
 #include "m_string.h"
 
-ulong events_stages_history_long_size= 0;
+PFS_ALIGNED ulong events_stages_history_long_size= 0;
 /** Consumer flag for table EVENTS_STAGES_CURRENT. */
-bool flag_events_stages_current= false;
+PFS_ALIGNED bool flag_events_stages_current= false;
 /** Consumer flag for table EVENTS_STAGES_HISTORY. */
-bool flag_events_stages_history= false;
+PFS_ALIGNED bool flag_events_stages_history= false;
 /** Consumer flag for table EVENTS_STAGES_HISTORY_LONG. */
-bool flag_events_stages_history_long= false;
+PFS_ALIGNED bool flag_events_stages_history_long= false;
 
 /** True if EVENTS_STAGES_HISTORY_LONG circular buffer is full. */
-bool events_stages_history_long_full= false;
+PFS_ALIGNED bool events_stages_history_long_full= false;
 /** Index in EVENTS_STAGES_HISTORY_LONG circular buffer. */
-volatile uint32 events_stages_history_long_index= 0;
+PFS_ALIGNED PFS_cacheline_uint32 events_stages_history_long_index;
 /** EVENTS_STAGES_HISTORY_LONG circular buffer. */
-PFS_events_stages *events_stages_history_long_array= NULL;
+PFS_ALIGNED PFS_events_stages *events_stages_history_long_array= NULL;
 
 /**
   Initialize table EVENTS_STAGES_HISTORY_LONG.
@@ -60,14 +62,16 @@ int init_events_stages_history_long(uint events_stages_history_long_sizing)
 {
   events_stages_history_long_size= events_stages_history_long_sizing;
   events_stages_history_long_full= false;
-  PFS_atomic::store_u32(&events_stages_history_long_index, 0);
+  PFS_atomic::store_u32(&events_stages_history_long_index.m_u32, 0);
 
   if (events_stages_history_long_size == 0)
     return 0;
 
   events_stages_history_long_array=
-    PFS_MALLOC_ARRAY(events_stages_history_long_size, sizeof(PFS_events_stages),
-                     PFS_events_stages,  MYF(MY_ZEROFILL));
+    PFS_MALLOC_ARRAY(& builtin_memory_stages_history_long,
+                     events_stages_history_long_size,
+                     sizeof(PFS_events_stages), PFS_events_stages,
+                     MYF(MY_ZEROFILL));
 
   return (events_stages_history_long_array ? 0 : 1);
 }
@@ -75,7 +79,9 @@ int init_events_stages_history_long(uint events_stages_history_long_sizing)
 /** Cleanup table EVENTS_STAGES_HISTORY_LONG. */
 void cleanup_events_stages_history_long(void)
 {
-  pfs_free(events_stages_history_long_array);
+  PFS_FREE_ARRAY(& builtin_memory_stages_history_long,
+                 events_stages_history_long_size, sizeof(PFS_events_stages),
+                 events_stages_history_long_array);
   events_stages_history_long_array= NULL;
 }
 
@@ -95,7 +101,7 @@ void insert_events_stages_history(PFS_thread *thread, PFS_events_stages *stage)
   if (unlikely(events_stages_history_per_thread == 0))
     return;
 
-  DBUG_ASSERT(thread->m_stages_history != NULL);
+  assert(thread->m_stages_history != NULL);
 
   uint index= thread->m_stages_history_index;
 
@@ -127,9 +133,9 @@ void insert_events_stages_history_long(PFS_events_stages *stage)
   if (unlikely(events_stages_history_long_size == 0))
     return;
 
-  DBUG_ASSERT(events_stages_history_long_array != NULL);
+  assert(events_stages_history_long_array != NULL);
 
-  uint index= PFS_atomic::add_u32(&events_stages_history_long_index, 1);
+  uint index= PFS_atomic::add_u32(&events_stages_history_long_index.m_u32, 1);
 
   index= index % events_stages_history_long_size;
   if (index == 0)
@@ -139,40 +145,38 @@ void insert_events_stages_history_long(PFS_events_stages *stage)
   copy_events_stages(&events_stages_history_long_array[index], stage);
 }
 
+static void fct_reset_events_stages_current(PFS_thread *pfs)
+{
+  pfs->m_stage_current.m_class= NULL;
+}
+
 /** Reset table EVENTS_STAGES_CURRENT data. */
 void reset_events_stages_current(void)
 {
-  PFS_thread *pfs_thread= thread_array;
-  PFS_thread *pfs_thread_last= thread_array + thread_max;
+  global_thread_container.apply_all(fct_reset_events_stages_current);
+}
 
-  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
-  {
-    pfs_thread->m_stage_current.m_class= NULL;
-  }
+static void fct_reset_events_stages_history(PFS_thread *pfs_thread)
+{
+  PFS_events_stages *pfs= pfs_thread->m_stages_history;
+  PFS_events_stages *pfs_last= pfs + events_stages_history_per_thread;
+
+  pfs_thread->m_stages_history_index= 0;
+  pfs_thread->m_stages_history_full= false;
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_class= NULL;
 }
 
 /** Reset table EVENTS_STAGES_HISTORY data. */
 void reset_events_stages_history(void)
 {
-  PFS_thread *pfs_thread= thread_array;
-  PFS_thread *pfs_thread_last= thread_array + thread_max;
-
-  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
-  {
-    PFS_events_stages *pfs= pfs_thread->m_stages_history;
-    PFS_events_stages *pfs_last= pfs + events_stages_history_per_thread;
-
-    pfs_thread->m_stages_history_index= 0;
-    pfs_thread->m_stages_history_full= false;
-    for ( ; pfs < pfs_last; pfs++)
-      pfs->m_class= NULL;
-  }
+  global_thread_container.apply_all(fct_reset_events_stages_history);
 }
 
 /** Reset table EVENTS_STAGES_HISTORY_LONG data. */
 void reset_events_stages_history_long(void)
 {
-  PFS_atomic::store_u32(&events_stages_history_long_index, 0);
+  PFS_atomic::store_u32(&events_stages_history_long_index.m_u32, 0);
   events_stages_history_long_full= false;
 
   PFS_events_stages *pfs= events_stages_history_long_array;
@@ -181,70 +185,53 @@ void reset_events_stages_history_long(void)
     pfs->m_class= NULL;
 }
 
+static void fct_reset_events_stages_by_thread(PFS_thread *thread)
+{
+  PFS_account *account= sanitize_account(thread->m_account);
+  PFS_user *user= sanitize_user(thread->m_user);
+  PFS_host *host= sanitize_host(thread->m_host);
+  aggregate_thread_stages(thread, account, user, host);
+}
+
 /** Reset table EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
 void reset_events_stages_by_thread()
 {
-  PFS_thread *thread= thread_array;
-  PFS_thread *thread_last= thread_array + thread_max;
-  PFS_account *account;
-  PFS_user *user;
-  PFS_host *host;
+  global_thread_container.apply(fct_reset_events_stages_by_thread);
+}
 
-  for ( ; thread < thread_last; thread++)
-  {
-    if (thread->m_lock.is_populated())
-    {
-      account= sanitize_account(thread->m_account);
-      user= sanitize_user(thread->m_user);
-      host= sanitize_host(thread->m_host);
-      aggregate_thread_stages(thread, account, user, host);
-    }
-  }
+static void fct_reset_events_stages_by_account(PFS_account *pfs)
+{
+  PFS_user *user= sanitize_user(pfs->m_user);
+  PFS_host *host= sanitize_host(pfs->m_host);
+  pfs->aggregate_stages(user, host);
 }
 
 /** Reset table EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
 void reset_events_stages_by_account()
 {
-  PFS_account *pfs= account_array;
-  PFS_account *pfs_last= account_array + account_max;
-  PFS_user *user;
-  PFS_host *host;
+  global_account_container.apply(fct_reset_events_stages_by_account);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      user= sanitize_user(pfs->m_user);
-      host= sanitize_host(pfs->m_host);
-      pfs->aggregate_stages(user, host);
-    }
-  }
+static void fct_reset_events_stages_by_user(PFS_user *pfs)
+{
+  pfs->aggregate_stages();
 }
 
 /** Reset table EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME data. */
 void reset_events_stages_by_user()
 {
-  PFS_user *pfs= user_array;
-  PFS_user *pfs_last= user_array + user_max;
+  global_user_container.apply(fct_reset_events_stages_by_user);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_stages();
-  }
+static void fct_reset_events_stages_by_host(PFS_host *pfs)
+{
+  pfs->aggregate_stages();
 }
 
 /** Reset table EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
 void reset_events_stages_by_host()
 {
-  PFS_host *pfs= host_array;
-  PFS_host *pfs_last= host_array + host_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_stages();
-  }
+  global_host_container.apply(fct_reset_events_stages_by_host);
 }
 
 /** Reset table EVENTS_STAGES_GLOBAL_BY_EVENT_NAME data. */
diff --git a/storage/perfschema/pfs_events_stages.h b/storage/perfschema/pfs_events_stages.h
index f61a7c3c077..eeae350ddf4 100644
--- a/storage/perfschema/pfs_events_stages.h
+++ b/storage/perfschema/pfs_events_stages.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -38,7 +38,7 @@ struct PFS_host;
 /** A stage record. */
 struct PFS_events_stages : public PFS_events
 {
-  /* No specific attributes */
+  PSI_stage_progress m_progress;
 };
 
 void insert_events_stages_history(PFS_thread *thread, PFS_events_stages *stage);
@@ -49,7 +49,7 @@ extern bool flag_events_stages_history;
 extern bool flag_events_stages_history_long;
 
 extern bool events_stages_history_long_full;
-extern volatile uint32 events_stages_history_long_index;
+extern PFS_ALIGNED PFS_cacheline_uint32 events_stages_history_long_index;
 extern PFS_events_stages *events_stages_history_long_array;
 extern ulong events_stages_history_long_size;
 
diff --git a/storage/perfschema/pfs_events_statements.cc b/storage/perfschema/pfs_events_statements.cc
index 1942787665a..37b179e8924 100644
--- a/storage/perfschema/pfs_events_statements.cc
+++ b/storage/perfschema/pfs_events_statements.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -35,23 +35,26 @@
 #include "pfs_user.h"
 #include "pfs_events_statements.h"
 #include "pfs_atomic.h"
+#include "pfs_buffer_container.h"
+#include "pfs_builtin_memory.h"
 #include "m_string.h"
 
-size_t events_statements_history_long_size= 0;
+PFS_ALIGNED size_t events_statements_history_long_size= 0;
 /** Consumer flag for table EVENTS_STATEMENTS_CURRENT. */
-bool flag_events_statements_current= false;
+PFS_ALIGNED bool flag_events_statements_current= false;
 /** Consumer flag for table EVENTS_STATEMENTS_HISTORY. */
-bool flag_events_statements_history= false;
+PFS_ALIGNED bool flag_events_statements_history= false;
 /** Consumer flag for table EVENTS_STATEMENTS_HISTORY_LONG. */
-bool flag_events_statements_history_long= false;
+PFS_ALIGNED bool flag_events_statements_history_long= false;
 
 /** True if EVENTS_STATEMENTS_HISTORY_LONG circular buffer is full. */
-bool events_statements_history_long_full= false;
+PFS_ALIGNED bool events_statements_history_long_full= false;
 /** Index in EVENTS_STATEMENTS_HISTORY_LONG circular buffer. */
-volatile uint32 events_statements_history_long_index= 0;
+PFS_ALIGNED PFS_cacheline_uint32 events_statements_history_long_index;
 /** EVENTS_STATEMENTS_HISTORY_LONG circular buffer. */
-PFS_events_statements *events_statements_history_long_array= NULL;
+PFS_ALIGNED PFS_events_statements *events_statements_history_long_array= NULL;
 static unsigned char *h_long_stmts_digest_token_array= NULL;
+static char *h_long_stmts_text_array= NULL;
 
 /**
   Initialize table EVENTS_STATEMENTS_HISTORY_LONG.
@@ -61,29 +64,32 @@ int init_events_statements_history_long(size_t events_statements_history_long_si
 {
   events_statements_history_long_size= events_statements_history_long_sizing;
   events_statements_history_long_full= false;
-  PFS_atomic::store_u32(&events_statements_history_long_index, 0);
+  PFS_atomic::store_u32(&events_statements_history_long_index.m_u32, 0);
 
   if (events_statements_history_long_size == 0)
     return 0;
 
   events_statements_history_long_array=
-    PFS_MALLOC_ARRAY(events_statements_history_long_size, sizeof(PFS_events_statements),
+    PFS_MALLOC_ARRAY(& builtin_memory_statements_history_long,
+                     events_statements_history_long_size, sizeof(PFS_events_statements),
                      PFS_events_statements, MYF(MY_ZEROFILL));
 
   if (events_statements_history_long_array == NULL)
-  {
-    cleanup_events_statements_history_long();
-    return 1;
-  }
+   {
+     cleanup_events_statements_history_long();
+     return 1;
+   }
 
   if (pfs_max_digest_length > 0)
   {
-    /* Size of each digest token array. */
+    /* Size of each per-statement digest token array, in bytes. */
     size_t digest_text_size= pfs_max_digest_length * sizeof(unsigned char);
 
     h_long_stmts_digest_token_array=
-      PFS_MALLOC_ARRAY(events_statements_history_long_size, digest_text_size, 
+      PFS_MALLOC_ARRAY(& builtin_memory_statements_history_long_tokens,
+                       events_statements_history_long_size, digest_text_size,
                        unsigned char, MYF(MY_ZEROFILL));
+
     if (h_long_stmts_digest_token_array == NULL)
     {
       cleanup_events_statements_history_long();
@@ -91,10 +97,28 @@ int init_events_statements_history_long(size_t events_statements_history_long_si
     }
   }
 
+  if (pfs_max_sqltext > 0)
+  {
+    /* Size of each sql text array. */
+    size_t sqltext_size= pfs_max_sqltext * sizeof(char);
+
+    h_long_stmts_text_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_statements_history_long_sqltext,
+                       events_statements_history_long_size, sqltext_size,
+                       char, MYF(MY_ZEROFILL));
+
+    if (h_long_stmts_text_array == NULL)
+    {
+      cleanup_events_statements_history_long();
+      return 1;
+    }
+  }
+
   for (size_t index= 0; index < events_statements_history_long_size; index++)
   {
     events_statements_history_long_array[index].m_digest_storage.reset(h_long_stmts_digest_token_array
                                                                        + index * pfs_max_digest_length, pfs_max_digest_length);
+    events_statements_history_long_array[index].m_sqltext= h_long_stmts_text_array + index * pfs_max_sqltext;
   }
 
   return 0;
@@ -103,20 +127,46 @@ int init_events_statements_history_long(size_t events_statements_history_long_si
 /** Cleanup table EVENTS_STATEMENTS_HISTORY_LONG. */
 void cleanup_events_statements_history_long(void)
 {
-  pfs_free(events_statements_history_long_array);
-  pfs_free(h_long_stmts_digest_token_array);
+  PFS_FREE_ARRAY(& builtin_memory_statements_history_long,  // count and element size mirror the PFS_MALLOC_ARRAY call in init
+                 events_statements_history_long_size,
+                 sizeof(PFS_events_statements),
+                 events_statements_history_long_array);  // NOTE(review): NULL when init's allocation failed; assumes PFS_FREE_ARRAY accepts NULL -- confirm
+
+  PFS_FREE_ARRAY(& builtin_memory_statements_history_long_tokens,
+                 events_statements_history_long_size,
+                 (pfs_max_digest_length * sizeof(unsigned char)),  // same value as digest_text_size in init
+                 h_long_stmts_digest_token_array);  // still NULL when init skipped it (pfs_max_digest_length == 0)
+
+  PFS_FREE_ARRAY(& builtin_memory_statements_history_long_sqltext,
+                 events_statements_history_long_size,
+                 (pfs_max_sqltext * sizeof(char)),  // same value as sqltext_size in init
+                 h_long_stmts_text_array);  // still NULL when init skipped it (pfs_max_sqltext == 0)
+
   events_statements_history_long_array= NULL;
   h_long_stmts_digest_token_array= NULL;
+  h_long_stmts_text_array= NULL;
 }
 
-static inline void copy_events_statements(PFS_events_statements *dest,
-                                      const PFS_events_statements *source)
+inline void PFS_events_statements::copy(const PFS_events_statements &source)
 {
-  /* Copy all attributes except DIGEST */
-  memcpy(dest, source, my_offsetof(PFS_events_statements, m_digest_storage));
+  /* Copy every member laid out before m_sqltext; our own SQL TEXT pointer and DIGEST storage are not overwritten */
+  memcpy((void*) this, &source, offsetof(PFS_events_statements, m_sqltext));
+
+  /* Copy the SQL TEXT bytes into the buffer this record's m_sqltext points to */
+  int sqltext_length= source.m_sqltext_length;
+
+  if (sqltext_length > 0)
+  {
+    memcpy(m_sqltext, source.m_sqltext, sqltext_length);  // NOTE(review): no bound check here; assumes the length fits our buffer -- confirm
+    m_sqltext_length= sqltext_length;
+  }
+  else
+  {
+    m_sqltext_length= 0;
+  }
 
   /* Copy DIGEST */
-  dest->m_digest_storage.copy(& source->m_digest_storage);
+  m_digest_storage.copy(&source.m_digest_storage);
 }
 
 /**
@@ -129,7 +179,7 @@ void insert_events_statements_history(PFS_thread *thread, PFS_events_statements
   if (unlikely(events_statements_history_per_thread == 0))
     return;
 
-  DBUG_ASSERT(thread->m_statements_history != NULL);
+  assert(thread->m_statements_history != NULL);
 
   uint index= thread->m_statements_history_index;
 
@@ -141,7 +191,7 @@ void insert_events_statements_history(PFS_thread *thread, PFS_events_statements
     to make this thread (the writer) faster.
     This is ok, the readers of m_statements_history will filter this out.
   */
-  copy_events_statements(&thread->m_statements_history[index], statement);
+  thread->m_statements_history[index].copy(*statement);
 
   index++;
   if (index >= events_statements_history_per_thread)
@@ -161,128 +211,109 @@ void insert_events_statements_history_long(PFS_events_statements *statement)
   if (unlikely(events_statements_history_long_size == 0))
     return ;
 
-  DBUG_ASSERT(events_statements_history_long_array != NULL);
+  assert(events_statements_history_long_array != NULL);
 
-  uint index= PFS_atomic::add_u32(&events_statements_history_long_index, 1);
+  uint index= PFS_atomic::add_u32(&events_statements_history_long_index.m_u32, 1);
 
   index= index % events_statements_history_long_size;
   if (index == 0)
     events_statements_history_long_full= true;
 
   /* See related comment in insert_events_statements_history. */
-  copy_events_statements(&events_statements_history_long_array[index], statement);
+  events_statements_history_long_array[index].copy(*statement);
+}
+
+static void fct_reset_events_statements_current(PFS_thread *pfs_thread)
+{
+  PFS_events_statements *pfs_stmt= & pfs_thread->m_statement_stack[0];
+  PFS_events_statements *pfs_stmt_last= pfs_stmt + statement_stack_max;
+
+  for ( ; pfs_stmt < pfs_stmt_last; pfs_stmt++)
+    pfs_stmt->m_event.m_class= nullptr;
 }
 
 /** Reset table EVENTS_STATEMENTS_CURRENT data. */
 void reset_events_statements_current(void)
 {
-  PFS_thread *pfs_thread= thread_array;
-  PFS_thread *pfs_thread_last= thread_array + thread_max;
+  global_thread_container.apply_all(fct_reset_events_statements_current);
+}
 
-  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
-  {
-    PFS_events_statements *pfs_stmt= & pfs_thread->m_statement_stack[0];
-    PFS_events_statements *pfs_stmt_last= pfs_stmt + statement_stack_max;
+static void fct_reset_events_statements_history(PFS_thread *pfs_thread)
+{
+  PFS_events_statements *pfs= pfs_thread->m_statements_history;
+  PFS_events_statements *pfs_last= pfs + events_statements_history_per_thread;
 
-    for ( ; pfs_stmt < pfs_stmt_last; pfs_stmt++)
-      pfs_stmt->m_class= NULL;
-  }
+  pfs_thread->m_statements_history_index= 0;
+  pfs_thread->m_statements_history_full= false;
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_event.m_class= nullptr;
 }
 
 /** Reset table EVENTS_STATEMENTS_HISTORY data. */
 void reset_events_statements_history(void)
 {
-  PFS_thread *pfs_thread= thread_array;
-  PFS_thread *pfs_thread_last= thread_array + thread_max;
-
-  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
-  {
-    PFS_events_statements *pfs= pfs_thread->m_statements_history;
-    PFS_events_statements *pfs_last= pfs + events_statements_history_per_thread;
-
-    pfs_thread->m_statements_history_index= 0;
-    pfs_thread->m_statements_history_full= false;
-    for ( ; pfs < pfs_last; pfs++)
-      pfs->m_class= NULL;
-  }
+  global_thread_container.apply_all(fct_reset_events_statements_history);
 }
 
 /** Reset table EVENTS_STATEMENTS_HISTORY_LONG data. */
 void reset_events_statements_history_long(void)
 {
-  PFS_atomic::store_u32(&events_statements_history_long_index, 0);
+  PFS_atomic::store_u32(&events_statements_history_long_index.m_u32, 0);
   events_statements_history_long_full= false;
 
   PFS_events_statements *pfs= events_statements_history_long_array;
   PFS_events_statements *pfs_last= pfs + events_statements_history_long_size;
   for ( ; pfs < pfs_last; pfs++)
-    pfs->m_class= NULL;
+    pfs->m_event.m_class= nullptr;
+}
+
+static void fct_reset_events_statements_by_thread(PFS_thread *thread)
+{
+  PFS_account *account= sanitize_account(thread->m_account);
+  PFS_user *user= sanitize_user(thread->m_user);
+  PFS_host *host= sanitize_host(thread->m_host);
+  aggregate_thread_statements(thread, account, user, host);
 }
 
 /** Reset table EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
 void reset_events_statements_by_thread()
 {
-  PFS_thread *thread= thread_array;
-  PFS_thread *thread_last= thread_array + thread_max;
-  PFS_account *account;
-  PFS_user *user;
-  PFS_host *host;
+  global_thread_container.apply(fct_reset_events_statements_by_thread);
+}
 
-  for ( ; thread < thread_last; thread++)
-  {
-    if (thread->m_lock.is_populated())
-    {
-      account= sanitize_account(thread->m_account);
-      user= sanitize_user(thread->m_user);
-      host= sanitize_host(thread->m_host);
-      aggregate_thread_statements(thread, account, user, host);
-    }
-  }
+static void fct_reset_events_statements_by_account(PFS_account *pfs)
+{
+  PFS_user *user= sanitize_user(pfs->m_user);
+  PFS_host *host= sanitize_host(pfs->m_host);
+  pfs->aggregate_statements(user, host);
 }
 
 /** Reset table EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
 void reset_events_statements_by_account()
 {
-  PFS_account *pfs= account_array;
-  PFS_account *pfs_last= account_array + account_max;
-  PFS_user *user;
-  PFS_host *host;
+  global_account_container.apply(fct_reset_events_statements_by_account);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      user= sanitize_user(pfs->m_user);
-      host= sanitize_host(pfs->m_host);
-      pfs->aggregate_statements(user, host);
-    }
-  }
+static void fct_reset_events_statements_by_user(PFS_user *pfs)
+{
+  pfs->aggregate_statements();
 }
 
 /** Reset table EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME data. */
 void reset_events_statements_by_user()
 {
-  PFS_user *pfs= user_array;
-  PFS_user *pfs_last= user_array + user_max;
+  global_user_container.apply(fct_reset_events_statements_by_user);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_statements();
-  }
+static void fct_reset_events_statements_by_host(PFS_host *pfs)
+{
+  pfs->aggregate_statements();
 }
 
 /** Reset table EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
 void reset_events_statements_by_host()
 {
-  PFS_host *pfs= host_array;
-  PFS_host *pfs_last= host_array + host_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_statements();
-  }
+  global_host_container.apply(fct_reset_events_statements_by_host);
 }
 
 /** Reset table EVENTS_STATEMENTS_GLOBAL_BY_EVENT_NAME data. */
diff --git a/storage/perfschema/pfs_events_statements.h b/storage/perfschema/pfs_events_statements.h
index e47e2e79280..2b2426ef7d1 100644
--- a/storage/perfschema/pfs_events_statements.h
+++ b/storage/perfschema/pfs_events_statements.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -38,20 +38,23 @@ struct PFS_user;
 struct PFS_host;
 
 /** A statement record. */
-struct PFS_events_statements : public PFS_events
+struct PFS_events_statements
 {
+  PFS_events m_event;
+  enum_object_type m_sp_type;
+  char m_schema_name[NAME_LEN];
+  uint m_schema_name_length;
+  char m_object_name[NAME_LEN];
+  uint m_object_name_length;
+
   /** Database name. */
   char m_current_schema_name[NAME_LEN];
   /** Length of @c m_current_schema_name. */
   uint m_current_schema_name_length;
-  /** SQL_TEXT */
-  char m_sqltext[COL_INFO_SIZE];
-  /** Length of @ m_info. */
-  uint m_sqltext_length;
 
   /** Locked time. */
   ulonglong m_lock_time;
-  
+
   /** Diagnostics area, message text. */
   char m_message_text[MYSQL_ERRMSG_SIZE+1];
   /** Diagnostics area, error number. */
@@ -102,22 +105,34 @@ struct PFS_events_statements : public PFS_events
   uint m_sqltext_cs_number;
 
   /**
+    SQL_TEXT.
+    This pointer is immutable,
+    and always point to pre allocated memory.
+  */
+  char *m_sqltext;
+  /** Length of @c m_sqltext. */
+  uint m_sqltext_length;
+  /**
     Statement digest.
     This underlying token array storage pointer is immutable,
     and always point to pre allocated memory.
   */
   sql_digest_storage m_digest_storage;
+
+  inline void copy(const PFS_events_statements &source);
 };
 
 void insert_events_statements_history(PFS_thread *thread, PFS_events_statements *statement);
 void insert_events_statements_history_long(PFS_events_statements *statement);
 
+extern ulong nested_statement_lost;
+
 extern bool flag_events_statements_current;
 extern bool flag_events_statements_history;
 extern bool flag_events_statements_history_long;
 
 extern bool events_statements_history_long_full;
-extern volatile uint32 events_statements_history_long_index;
+extern PFS_ALIGNED PFS_cacheline_uint32 events_statements_history_long_index;
 extern PFS_events_statements *events_statements_history_long_array;
 extern size_t events_statements_history_long_size;
 
diff --git a/storage/perfschema/pfs_events_transactions.cc b/storage/perfschema/pfs_events_transactions.cc
new file mode 100644
index 00000000000..8b83b71c96b
--- /dev/null
+++ b/storage/perfschema/pfs_events_transactions.cc
@@ -0,0 +1,267 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_events_transactions.cc
+  Events transactions data structures (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_events_transactions.h"
+#include "pfs_atomic.h"
+#include "pfs_buffer_container.h"
+#include "pfs_builtin_memory.h"
+#include "m_string.h"
+
+PFS_ALIGNED ulong events_transactions_history_long_size= 0;
+/** Consumer flag for table EVENTS_TRANSACTIONS_CURRENT. */
+PFS_ALIGNED bool flag_events_transactions_current= false;
+/** Consumer flag for table EVENTS_TRANSACTIONS_HISTORY. */
+PFS_ALIGNED bool flag_events_transactions_history= false;
+/** Consumer flag for table EVENTS_TRANSACTIONS_HISTORY_LONG. */
+PFS_ALIGNED bool flag_events_transactions_history_long= false;
+
+/** True if EVENTS_TRANSACTIONS_HISTORY_LONG circular buffer is full. */
+PFS_ALIGNED bool events_transactions_history_long_full= false;
+/** Index in EVENTS_TRANSACTIONS_HISTORY_LONG circular buffer. */
+PFS_ALIGNED PFS_cacheline_uint32 events_transactions_history_long_index;
+/** EVENTS_TRANSACTIONS_HISTORY_LONG circular buffer. */
+PFS_ALIGNED PFS_events_transactions *events_transactions_history_long_array= NULL;
+
+/**
+  Initialize table EVENTS_TRANSACTIONS_HISTORY_LONG.
+  @param events_transactions_history_long_sizing       table sizing
+*/
+int init_events_transactions_history_long(uint events_transactions_history_long_sizing)
+{
+  events_transactions_history_long_size= events_transactions_history_long_sizing;
+  events_transactions_history_long_full= false;
+  PFS_atomic::store_u32(&events_transactions_history_long_index.m_u32, 0);
+
+  if (events_transactions_history_long_size == 0)
+    return 0;
+
+  events_transactions_history_long_array=
+    PFS_MALLOC_ARRAY(& builtin_memory_transactions_history_long,
+                     events_transactions_history_long_size,
+                     sizeof(PFS_events_transactions), PFS_events_transactions,
+                     MYF(MY_ZEROFILL));
+
+  return (events_transactions_history_long_array ? 0 : 1);
+}
+
+/** Cleanup table EVENTS_TRANSACTIONS_HISTORY_LONG. */
+void cleanup_events_transactions_history_long(void)
+{
+  PFS_FREE_ARRAY(& builtin_memory_transactions_history_long,
+                 events_transactions_history_long_size, sizeof(PFS_events_transactions),
+                 events_transactions_history_long_array);
+  events_transactions_history_long_array= NULL;
+}
+
+static inline void copy_events_transactions(PFS_events_transactions *dest,
+                                      const PFS_events_transactions *source)
+{
+  memcpy(dest, source, sizeof(PFS_events_transactions));
+}
+
+/**
+  Insert a transaction record in table EVENTS_TRANSACTIONS_HISTORY.
+  @param thread             thread that executed the transaction
+  @param transaction        record to insert
+*/
+void insert_events_transactions_history(PFS_thread *thread, PFS_events_transactions *transaction)
+{
+  if (unlikely(events_transactions_history_per_thread == 0))
+    return;
+
+  assert(thread->m_transactions_history != NULL);
+
+  uint index= thread->m_transactions_history_index;
+
+  /*
+    A concurrent thread executing TRUNCATE TABLE EVENTS_TRANSACTIONS_CURRENT
+    could alter the data that this thread is inserting,
+    causing a potential race condition.
+    We are not testing for this and insert a possibly empty record,
+    to make this thread (the writer) faster.
+    This is ok, the readers of m_transactions_history will filter this out.
+  */
+  copy_events_transactions(&thread->m_transactions_history[index], transaction);
+
+  index++;
+  if (index >= events_transactions_history_per_thread)
+  {
+    index= 0;
+    thread->m_transactions_history_full= true;
+  }
+  thread->m_transactions_history_index= index;
+}
+
+/**
+  Insert a transaction record in table EVENTS_TRANSACTIONS_HISTORY_LONG.
+  @param transaction              record to insert
+*/
+void insert_events_transactions_history_long(PFS_events_transactions *transaction)
+{
+  if (unlikely(events_transactions_history_long_size == 0))
+    return ;
+
+  assert(events_transactions_history_long_array != NULL);
+
+  uint index= PFS_atomic::add_u32(&events_transactions_history_long_index.m_u32, 1);
+
+  index= index % events_transactions_history_long_size;
+  if (index == 0)
+    events_transactions_history_long_full= true;
+
+  /* See related comment in insert_events_transactions_history. */
+  copy_events_transactions(&events_transactions_history_long_array[index], transaction);
+}
+
+static void fct_reset_events_transactions_current(PFS_thread *pfs)
+{
+  pfs->m_transaction_current.m_class= NULL;
+}
+
+/** Reset table EVENTS_TRANSACTIONS_CURRENT data. */
+void reset_events_transactions_current(void)
+{
+  global_thread_container.apply_all(fct_reset_events_transactions_current);
+}
+
+static void fct_reset_events_transactions_history(PFS_thread *pfs_thread)
+{
+  PFS_events_transactions *pfs= pfs_thread->m_transactions_history;
+  PFS_events_transactions *pfs_last= pfs + events_transactions_history_per_thread;
+
+  pfs_thread->m_transactions_history_index= 0;
+  pfs_thread->m_transactions_history_full= false;
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_class= NULL;
+}
+
+/** Reset table EVENTS_TRANSACTIONS_HISTORY data. */
+void reset_events_transactions_history(void)
+{
+  global_thread_container.apply_all(fct_reset_events_transactions_history);
+}
+
+/** Reset table EVENTS_TRANSACTIONS_HISTORY_LONG data. */
+void reset_events_transactions_history_long(void)
+{
+  PFS_atomic::store_u32(&events_transactions_history_long_index.m_u32, 0);
+  events_transactions_history_long_full= false;
+
+  PFS_events_transactions *pfs= events_transactions_history_long_array;
+  PFS_events_transactions *pfs_last= pfs + events_transactions_history_long_size;
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_class= NULL;
+}
+
+static void fct_reset_events_transactions_by_thread(PFS_thread *thread)
+{
+  PFS_account *account= sanitize_account(thread->m_account);
+  PFS_user *user= sanitize_user(thread->m_user);
+  PFS_host *host= sanitize_host(thread->m_host);
+  aggregate_thread_transactions(thread, account, user, host);
+}
+
+/** Reset table EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
+void reset_events_transactions_by_thread()
+{
+  global_thread_container.apply(fct_reset_events_transactions_by_thread);
+}
+
+static void fct_reset_events_transactions_by_account(PFS_account *pfs)
+{
+  PFS_user *user= sanitize_user(pfs->m_user);
+  PFS_host *host= sanitize_host(pfs->m_host);
+  pfs->aggregate_transactions(user, host);
+}
+
+/** Reset table EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
+void reset_events_transactions_by_account()
+{
+  global_account_container.apply(fct_reset_events_transactions_by_account);
+}
+
+static void fct_reset_events_transactions_by_user(PFS_user *pfs)
+{
+  pfs->aggregate_transactions();
+}
+
+/** Reset table EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME data. */
+void reset_events_transactions_by_user()
+{
+  global_user_container.apply(fct_reset_events_transactions_by_user);
+}
+
+static void fct_reset_events_transactions_by_host(PFS_host *pfs)
+{
+  pfs->aggregate_transactions();
+}
+
+/** Reset table EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
+void reset_events_transactions_by_host()
+{
+  global_host_container.apply(fct_reset_events_transactions_by_host);
+}
+
+/** Reset table EVENTS_TRANSACTIONS_GLOBAL_BY_EVENT_NAME data. */
+void reset_events_transactions_global()
+{
+  global_transaction_stat.reset();
+}
+
+/**
+  Check if the XID consists of printable characters, ASCII 32 - 127.
+  @param xid     XID structure
+  @param offset  offset into XID.data[]
+  @param length  number of bytes to process
+  @return true if all bytes are in printable range
+*/
+bool xid_printable(PSI_xid *xid, size_t offset, size_t length)
+{
+  if (xid->is_null())
+    return false;
+
+  assert(offset + length <= MYSQL_XIDDATASIZE);
+
+  /* Walk data[offset .. offset + length) as unsigned byte values. */
+  const unsigned char *bytes= reinterpret_cast<const unsigned char*>(xid->data);
+  const unsigned char *end= bytes + offset + length;
+
+  for (bytes+= offset; bytes != end; ++bytes)
+    if (*bytes < 32 || *bytes > 127)
+      return false;
+
+  return true;
+}
+
diff --git a/storage/perfschema/pfs_events_transactions.h b/storage/perfschema/pfs_events_transactions.h
new file mode 100644
index 00000000000..57e57e70210
--- /dev/null
+++ b/storage/perfschema/pfs_events_transactions.h
@@ -0,0 +1,132 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_EVENTS_TRANSACTIONS_H
+#define PFS_EVENTS_TRANSACTIONS_H
+
+/**
+  @file storage/perfschema/pfs_events_transactions.h
+  Events transactions data structures (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_events.h"
+#include "rpl_gtid.h"
+#include "mysql/plugin.h" /* MYSQL_XIDDATASIZE */
+#include "my_thread.h"
+
+struct PFS_thread;
+struct PFS_account;
+struct PFS_user;
+struct PFS_host;
+
+/**
+  struct PSI_xid is binary compatible with the XID structure as
+  in the X/Open CAE Specification, Distributed Transaction Processing:
+  The XA Specification, X/Open Company Ltd., 1991.
+  http://www.opengroup.org/bookstore/catalog/c193.htm
+
+  A value of -1 in formatID means that the XID is null.
+  Max length for bqual and gtrid is 64 bytes each.
+
+  @see XID in sql/handler.h
+  @see MYSQL_XID in mysql/plugin.h
+*/
+struct PSI_xid
+{
+  /** Format identifier. */
+  long formatID;
+  /** GTRID length, value 1-64. */
+  long gtrid_length;
+  /** BQUAL length, value 1-64. */
+  long bqual_length;
+  /** XID raw data, not \0-terminated */
+  char data[MYSQL_XIDDATASIZE];
+
+  PSI_xid() {null();} /**< A new XID starts out null. */
+  bool is_null() const { return formatID == -1; } /**< True if formatID is -1. */
+  void null() { formatID= -1; gtrid_length= 0; bqual_length= 0;} /**< Mark null; data[] is not cleared. */
+};
+typedef struct PSI_xid PSI_xid;
+
+/** A transaction record. */
+struct PFS_events_transactions : public PFS_events
+{
+  /** Source identifier, mapped from internal format. */
+  //rpl_sid m_sid;
+  /** InnoDB transaction ID. */
+  ulonglong m_trxid;
+  /** Status */
+  enum_transaction_state m_state;
+  /** Global Transaction ID specifier. */
+  Gtid_specification m_gtid_spec;
+  /** True if XA transaction. */
+  my_bool m_xa;
+  /** XA transaction ID. */
+  PSI_xid m_xid;
+  /** XA status */
+  enum_xa_transaction_state m_xa_state;
+  /** Transaction isolation level. */
+  enum_isolation_level m_isolation_level;
+  /** True if read-only transaction, otherwise read-write. */
+  my_bool m_read_only;
+  /** True if autocommit transaction. */
+  my_bool m_autocommit;
+  /** Total number of savepoints. */
+  ulonglong m_savepoint_count;
+  /** Number of rollback_to_savepoint. */
+  ulonglong m_rollback_to_savepoint_count;
+  /** Number of release_savepoint. */
+  ulonglong m_release_savepoint_count;
+};
+
+bool xid_printable(PSI_xid *xid, size_t offset, size_t length);
+
+void insert_events_transactions_history(PFS_thread *thread, PFS_events_transactions *transaction);
+void insert_events_transactions_history_long(PFS_events_transactions *transaction);
+
+extern bool flag_events_transactions_current;
+extern bool flag_events_transactions_history;
+extern bool flag_events_transactions_history_long;
+
+extern bool events_transactions_history_long_full;
+extern PFS_cacheline_uint32 events_transactions_history_long_index;
+extern PFS_events_transactions *events_transactions_history_long_array;
+extern ulong events_transactions_history_long_size;
+
+int init_events_transactions_history_long(uint events_transactions_history_long_sizing);
+void cleanup_events_transactions_history_long();
+
+void reset_events_transactions_current();
+void reset_events_transactions_history();
+void reset_events_transactions_history_long();
+void reset_events_transactions_by_thread();
+void reset_events_transactions_by_account();
+void reset_events_transactions_by_user();
+void reset_events_transactions_by_host();
+void reset_events_transactions_global();
+void aggregate_account_transactions(PFS_account *account);
+void aggregate_user_transactions(PFS_user *user);
+void aggregate_host_transactions(PFS_host *host);
+
+#endif
+
diff --git a/storage/perfschema/pfs_events_waits.cc b/storage/perfschema/pfs_events_waits.cc
index c3961461f34..14f6d743764 100644
--- a/storage/perfschema/pfs_events_waits.cc
+++ b/storage/perfschema/pfs_events_waits.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -35,26 +35,28 @@
 #include "pfs_account.h"
 #include "pfs_events_waits.h"
 #include "pfs_atomic.h"
+#include "pfs_buffer_container.h"
+#include "pfs_builtin_memory.h"
 #include "m_string.h"
 
-ulong events_waits_history_long_size= 0;
+PFS_ALIGNED ulong events_waits_history_long_size= 0;
 /** Consumer flag for table EVENTS_WAITS_CURRENT. */
-bool flag_events_waits_current= false;
+PFS_ALIGNED bool flag_events_waits_current= false;
 /** Consumer flag for table EVENTS_WAITS_HISTORY. */
-bool flag_events_waits_history= false;
+PFS_ALIGNED bool flag_events_waits_history= false;
 /** Consumer flag for table EVENTS_WAITS_HISTORY_LONG. */
-bool flag_events_waits_history_long= false;
+PFS_ALIGNED bool flag_events_waits_history_long= false;
 /** Consumer flag for the global instrumentation. */
-bool flag_global_instrumentation= false;
+PFS_ALIGNED bool flag_global_instrumentation= false;
 /** Consumer flag for the per thread instrumentation. */
-bool flag_thread_instrumentation= false;
+PFS_ALIGNED bool flag_thread_instrumentation= false;
 
 /** True if EVENTS_WAITS_HISTORY_LONG circular buffer is full. */
-bool events_waits_history_long_full= false;
+PFS_ALIGNED bool events_waits_history_long_full= false;
 /** Index in EVENTS_WAITS_HISTORY_LONG circular buffer. */
-volatile uint32 events_waits_history_long_index= 0;
+PFS_ALIGNED PFS_cacheline_uint32 events_waits_history_long_index;
 /** EVENTS_WAITS_HISTORY_LONG circular buffer. */
-PFS_events_waits *events_waits_history_long_array= NULL;
+PFS_ALIGNED PFS_events_waits *events_waits_history_long_array= NULL;
 
 /**
   Initialize table EVENTS_WAITS_HISTORY_LONG.
@@ -64,14 +66,16 @@ int init_events_waits_history_long(uint events_waits_history_long_sizing)
 {
   events_waits_history_long_size= events_waits_history_long_sizing;
   events_waits_history_long_full= false;
-  PFS_atomic::store_u32(&events_waits_history_long_index, 0);
+  PFS_atomic::store_u32(&events_waits_history_long_index.m_u32, 0);
 
   if (events_waits_history_long_size == 0)
     return 0;
 
   events_waits_history_long_array=
-    PFS_MALLOC_ARRAY(events_waits_history_long_size, sizeof(PFS_events_waits),
-                     PFS_events_waits, MYF(MY_ZEROFILL));
+    PFS_MALLOC_ARRAY(& builtin_memory_waits_history_long,
+                     events_waits_history_long_size,
+                     sizeof(PFS_events_waits), PFS_events_waits,
+                     MYF(MY_ZEROFILL));
 
   return (events_waits_history_long_array ? 0 : 1);
 }
@@ -79,7 +83,9 @@ int init_events_waits_history_long(uint events_waits_history_long_sizing)
 /** Cleanup table EVENTS_WAITS_HISTORY_LONG. */
 void cleanup_events_waits_history_long(void)
 {
-  pfs_free(events_waits_history_long_array);
+  PFS_FREE_ARRAY(& builtin_memory_waits_history_long,
+                 events_waits_history_long_size, sizeof(PFS_events_waits),
+                 events_waits_history_long_array);
   events_waits_history_long_array= NULL;
 }
 
@@ -129,7 +135,7 @@ void insert_events_waits_history_long(PFS_events_waits *wait)
   if (unlikely(events_waits_history_long_size == 0))
     return;
 
-  uint index= PFS_atomic::add_u32(&events_waits_history_long_index, 1);
+  uint index= PFS_atomic::add_u32(&events_waits_history_long_index.m_u32, 1);
 
   index= index % events_waits_history_long_size;
   if (index == 0)
@@ -139,44 +145,43 @@ void insert_events_waits_history_long(PFS_events_waits *wait)
   copy_events_waits(&events_waits_history_long_array[index], wait);
 }
 
+static void fct_reset_events_waits_current(PFS_thread *pfs_thread)
+{
+  PFS_events_waits *pfs_wait= pfs_thread->m_events_waits_stack;
+  PFS_events_waits *pfs_wait_last= pfs_wait + WAIT_STACK_SIZE;
+
+  for ( ; pfs_wait < pfs_wait_last; pfs_wait++)
+    pfs_wait->m_wait_class= NO_WAIT_CLASS;
+}
+
+
 /** Reset table EVENTS_WAITS_CURRENT data. */
 void reset_events_waits_current(void)
 {
-  PFS_thread *pfs_thread= thread_array;
-  PFS_thread *pfs_thread_last= thread_array + thread_max;
+  global_thread_container.apply_all(fct_reset_events_waits_current);
+}
 
-  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
-  {
-    PFS_events_waits *pfs_wait= pfs_thread->m_events_waits_stack;
-    PFS_events_waits *pfs_wait_last= pfs_wait + WAIT_STACK_SIZE;
+static void fct_reset_events_waits_history(PFS_thread *pfs_thread)
+{
+  PFS_events_waits *wait= pfs_thread->m_waits_history;
+  PFS_events_waits *wait_last= wait + events_waits_history_per_thread;
 
-    for ( ; pfs_wait < pfs_wait_last; pfs_wait++)
-      pfs_wait->m_wait_class= NO_WAIT_CLASS;
-  }
+  pfs_thread->m_waits_history_index= 0;
+  pfs_thread->m_waits_history_full= false;
+  for ( ; wait < wait_last; wait++)
+    wait->m_wait_class= NO_WAIT_CLASS;
 }
 
 /** Reset table EVENTS_WAITS_HISTORY data. */
 void reset_events_waits_history(void)
 {
-  PFS_thread *pfs_thread= thread_array;
-  PFS_thread *pfs_thread_last= thread_array + thread_max;
-
-  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
-  {
-    PFS_events_waits *wait= pfs_thread->m_waits_history;
-    PFS_events_waits *wait_last= wait + events_waits_history_per_thread;
-
-    pfs_thread->m_waits_history_index= 0;
-    pfs_thread->m_waits_history_full= false;
-    for ( ; wait < wait_last; wait++)
-      wait->m_wait_class= NO_WAIT_CLASS;
-  }
+  global_thread_container.apply_all(fct_reset_events_waits_history);
 }
 
 /** Reset table EVENTS_WAITS_HISTORY_LONG data. */
 void reset_events_waits_history_long(void)
 {
-  PFS_atomic::store_u32(&events_waits_history_long_index, 0);
+  PFS_atomic::store_u32(&events_waits_history_long_index.m_u32, 0);
   events_waits_history_long_full= false;
 
   PFS_events_waits *wait= events_waits_history_long_array;
@@ -185,141 +190,112 @@ void reset_events_waits_history_long(void)
     wait->m_wait_class= NO_WAIT_CLASS;
 }
 
+static void fct_reset_events_waits_by_thread(PFS_thread *thread)
+{
+  PFS_account *account= sanitize_account(thread->m_account);
+  PFS_user *user= sanitize_user(thread->m_user);
+  PFS_host *host= sanitize_host(thread->m_host);
+  aggregate_thread_waits(thread, account, user, host);
+}
+
 /** Reset table EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
 void reset_events_waits_by_thread()
 {
-  PFS_thread *thread= thread_array;
-  PFS_thread *thread_last= thread_array + thread_max;
-  PFS_account *account;
-  PFS_user *user;
-  PFS_host *host;
+  global_thread_container.apply(fct_reset_events_waits_by_thread);
+}
 
-  for ( ; thread < thread_last; thread++)
-  {
-    if (thread->m_lock.is_populated())
-    {
-      account= sanitize_account(thread->m_account);
-      user= sanitize_user(thread->m_user);
-      host= sanitize_host(thread->m_host);
-      aggregate_thread_waits(thread, account, user, host);
-    }
-  }
+static void fct_reset_events_waits_by_account(PFS_account *pfs)
+{
+  PFS_user *user= sanitize_user(pfs->m_user);
+  PFS_host *host= sanitize_host(pfs->m_host);
+  pfs->aggregate_waits(user, host);
 }
 
 /** Reset table EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
 void reset_events_waits_by_account()
 {
-  PFS_account *pfs= account_array;
-  PFS_account *pfs_last= account_array + account_max;
-  PFS_user *user;
-  PFS_host *host;
+  global_account_container.apply(fct_reset_events_waits_by_account);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      user= sanitize_user(pfs->m_user);
-      host= sanitize_host(pfs->m_host);
-      pfs->aggregate_waits(user, host);
-    }
-  }
+static void fct_reset_events_waits_by_user(PFS_user *pfs)
+{
+  pfs->aggregate_waits();
 }
 
 /** Reset table EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME data. */
 void reset_events_waits_by_user()
 {
-  PFS_user *pfs= user_array;
-  PFS_user *pfs_last= user_array + user_max;
+  global_user_container.apply(fct_reset_events_waits_by_user);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_waits();
-  }
+static void fct_reset_events_waits_by_host(PFS_host *pfs)
+{
+  pfs->aggregate_waits();
 }
 
 /** Reset table EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
 void reset_events_waits_by_host()
 {
-  PFS_host *pfs= host_array;
-  PFS_host *pfs_last= host_array + host_max;
+  global_host_container.apply(fct_reset_events_waits_by_host);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_waits();
-  }
+static void fct_reset_table_waits_by_table(PFS_table_share *pfs)
+{
+  pfs->aggregate();
 }
 
 void reset_table_waits_by_table()
 {
-  PFS_table_share *pfs= table_share_array;
-  PFS_table_share *pfs_last= pfs + table_share_max;
+  global_table_share_container.apply(fct_reset_table_waits_by_table);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate();
-  }
+static void fct_reset_table_io_waits_by_table(PFS_table_share *pfs)
+{
+  pfs->aggregate_io();
 }
 
 void reset_table_io_waits_by_table()
 {
-  PFS_table_share *pfs= table_share_array;
-  PFS_table_share *pfs_last= pfs + table_share_max;
+  global_table_share_container.apply(fct_reset_table_io_waits_by_table);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_io();
-  }
+static void fct_reset_table_lock_waits_by_table(PFS_table_share *pfs)
+{
+  pfs->aggregate_lock();
 }
 
 void reset_table_lock_waits_by_table()
 {
-  PFS_table_share *pfs= table_share_array;
-  PFS_table_share *pfs_last= pfs + table_share_max;
+  global_table_share_container.apply(fct_reset_table_lock_waits_by_table);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->aggregate_lock();
-  }
+static void fct_reset_table_waits_by_table_handle(PFS_table *pfs)
+{
+  pfs->sanitized_aggregate(); /* callback for global_table_container.apply() */
 }
 
 void reset_table_waits_by_table_handle()
 {
-  PFS_table *pfs= table_array;
-  PFS_table *pfs_last= pfs + table_max;
+  global_table_container.apply(fct_reset_table_waits_by_table_handle);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->sanitized_aggregate();
-  }
+static void fct_reset_table_io_waits_by_table_handle(PFS_table *pfs)
+{
+  pfs->sanitized_aggregate_io(); /* callback for global_table_container.apply() */
 }
 
 void reset_table_io_waits_by_table_handle()
 {
-  PFS_table *pfs= table_array;
-  PFS_table *pfs_last= pfs + table_max;
+  global_table_container.apply(fct_reset_table_io_waits_by_table_handle);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->sanitized_aggregate_io();
-  }
+static void fct_reset_table_lock_waits_by_table_handle(PFS_table *pfs)
+{
+  pfs->sanitized_aggregate_lock(); /* callback for global_table_container.apply() */
 }
 
 void reset_table_lock_waits_by_table_handle()
 {
-  PFS_table *pfs= table_array;
-  PFS_table *pfs_last= pfs + table_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-      pfs->sanitized_aggregate_lock();
-  }
+  global_table_container.apply(fct_reset_table_lock_waits_by_table_handle);
 }
 
diff --git a/storage/perfschema/pfs_events_waits.h b/storage/perfschema/pfs_events_waits.h
index 702f7e3ce07..8a7355f2d32 100644
--- a/storage/perfschema/pfs_events_waits.h
+++ b/storage/perfschema/pfs_events_waits.h
@@ -1,5 +1,5 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
-   Copyright (c) 2017, MariaDB Corporation.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
+   Copyright (c) 2017, 2019, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -45,6 +45,7 @@ struct PFS_table_share;
 struct PFS_account;
 struct PFS_user;
 struct PFS_host;
+struct PFS_metadata_lock;
 
 /** Class of a wait event. */
 enum events_waits_class
@@ -56,29 +57,13 @@ enum events_waits_class
   WAIT_CLASS_TABLE,
   WAIT_CLASS_FILE,
   WAIT_CLASS_SOCKET,
-  WAIT_CLASS_IDLE
+  WAIT_CLASS_IDLE,
+  WAIT_CLASS_METADATA
 };
 
 /** A wait event record. */
 struct PFS_events_waits : public PFS_events
 {
-  /** Executing thread. */
-  PFS_thread *m_thread;
-  /** Table share, for table operations only. */
-  PFS_table_share *m_weak_table_share;
-  /** File, for file operations only. */
-  PFS_file *m_weak_file;
-  /** Address in memory of the object instance waited on. */
-  const void *m_object_instance_addr;
-  /** Socket, for socket operations only. */
-  PFS_socket *m_weak_socket;
-  /**
-    Number of bytes read/written.
-    This member is populated for file READ/WRITE operations only.
-  */
-  size_t m_number_of_bytes;
-  /** Flags */
-  ulong m_flags;
   /**
     The type of wait.
     Readers:
@@ -93,15 +78,33 @@ struct PFS_events_waits : public PFS_events
   events_waits_class m_wait_class;
   /** Object type */
   enum_object_type m_object_type;
+  /** Table share, for table operations only. */
+  PFS_table_share *m_weak_table_share;
+  /** File, for file operations only. */
+  PFS_file *m_weak_file;
+  /** Socket, for socket operations only. */
+  PFS_socket *m_weak_socket;
+  /** Metadata lock, for mdl operations only. */
+  PFS_metadata_lock *m_weak_metadata_lock;
   /** For weak pointers, target object version. */
   uint32 m_weak_version;
+  /** Address in memory of the object instance waited on. */
+  const void *m_object_instance_addr;
   /** Operation performed. */
   enum_operation_type m_operation;
   /**
+    Number of bytes/rows read/written.
+    This member is populated for FILE READ/WRITE operations, with a number of bytes.
+    This member is populated for TABLE IO operations, with a number of rows.
+  */
+  size_t m_number_of_bytes;
+  /**
     Index used.
     This member is populated for TABLE IO operations only.
   */
   uint m_index;
+  /** Flags */
+  ulong m_flags;
 };
 
 /** TIMED bit in the state flags bitfield. */
@@ -124,7 +127,7 @@ extern bool flag_global_instrumentation;
 extern bool flag_thread_instrumentation;
 
 extern bool events_waits_history_long_full;
-extern volatile uint32 events_waits_history_long_index;
+extern PFS_ALIGNED PFS_cacheline_uint32 events_waits_history_long_index;
 extern PFS_events_waits *events_waits_history_long_array;
 extern ulong events_waits_history_long_size;
 
diff --git a/storage/perfschema/pfs_global.cc b/storage/perfschema/pfs_global.cc
index e1b5e3400ca..1f9d24eea4e 100644
--- a/storage/perfschema/pfs_global.cc
+++ b/storage/perfschema/pfs_global.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates. All rights
+   reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -27,38 +28,39 @@
 
 #include <my_global.h>
 #include "pfs_global.h"
-#include <my_sys.h>
-#include <my_net.h>
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>                             /* memalign() may be here */
-#endif
+#include "pfs_builtin_memory.h"
+#include "log.h"
+
+#include <stdlib.h>
+#include <string.h>
 
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
-
-#ifdef HAVE_STRING_H
-#include <string.h>
+#ifdef _WIN32
+#include <winsock2.h>
 #endif
-
-#ifdef __WIN__
-  #include <winsock2.h>
-#else
-  #include <arpa/inet.h>
+#ifdef HAVE_ARPA_INET_H
+#include <arpa/inet.h>
+#endif
+#ifdef HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
 #endif
 
 bool pfs_initialized= false;
-size_t pfs_allocated_memory= 0;
 
 /**
   Memory allocation for the performance schema.
-  The memory used internally in the performance schema implementation
-  is allocated once during startup, and considered static thereafter.
+  The memory used internally in the performance schema implementation.
+  It is allocated at startup, or during runtime with scalable buffers.
 */
-void *pfs_malloc(size_t size, myf flags)
+void *pfs_malloc(PFS_builtin_memory_class *klass, size_t size, myf flags)
 {
-  DBUG_ASSERT(! pfs_initialized);
-  DBUG_ASSERT(size > 0);
+  assert(klass != NULL);
+  assert(size > 0);
 
   void *ptr= NULL;
 
@@ -91,13 +93,14 @@ void *pfs_malloc(size_t size, myf flags)
     return NULL;
 #endif
 
-  pfs_allocated_memory+= size;
+  klass->count_alloc(size);
+
   if (flags & MY_ZEROFILL)
     memset(ptr, 0, size);
   return ptr;
 }
 
-void pfs_free(void *ptr)
+void pfs_free(PFS_builtin_memory_class *klass, size_t size, void *ptr)
 {
   if (ptr == NULL)
     return;
@@ -119,40 +122,58 @@ void pfs_free(void *ptr)
 #endif /* HAVE_ALIGNED_MALLOC */
 #endif /* HAVE_MEMALIGN */
 #endif /* HAVE_POSIX_MEMALIGN */
-}
 
-void pfs_print_error(const char *format, ...)
-{
-  va_list args;
-  va_start(args, format);
-  /*
-    Printing to anything else, like the error log, would generate even more
-    recursive calls to the performance schema implementation
-    (file io is instrumented), so that could lead to catastrophic results.
-    Printing to something safe, and low level: stderr only.
-  */
-  vfprintf(stderr, format, args);
-  va_end(args);
-  fflush(stderr);
+  klass->count_free(size);
 }
 
 /**
   Array allocation for the performance schema.
   Checks for overflow of n * size before allocating.
-  @param n  number of array elements
+  @param klass performance schema memory class, must not be NULL
+  @param n     number of array elements
   @param size  element size
   @param flags malloc flags
   @return pointer to memory on success, else NULL
 */
-void *pfs_malloc_array(size_t n, size_t size, myf flags)
+void *pfs_malloc_array(PFS_builtin_memory_class *klass, size_t n, size_t size, myf flags)
 {
-  DBUG_ASSERT(n > 0);
-  DBUG_ASSERT(size > 0);
+  assert(klass != NULL);
+  assert(n > 0);
+  assert(size > 0);
+  void *ptr= NULL;
   size_t array_size= n * size;
   /* Check for overflow before allocating. */
   if (is_overflow(array_size, n, size))
+  {
+    sql_print_warning("Failed to allocate memory for %zu chunks each of size "
+                      "%zu for buffer '%s' due to overflow", n, size,
+                      klass->m_class.m_name);
     return NULL;
-  return pfs_malloc(array_size, flags);
+  }
+
+  if(NULL == (ptr= pfs_malloc(klass, array_size, flags)))
+  {
+    sql_print_warning("Failed to allocate %zu bytes for buffer '%s' due to "
+                      "out-of-memory", array_size, klass->m_class.m_name);
+  }
+  return ptr; /* NULL when pfs_malloc() failed, after the warning above */
+}
+
+/**
+  Free an array allocated by pfs_malloc_array(). @sa pfs_malloc_array
+  @param klass performance schema memory class
+  @param n     number of array elements (expected to match the allocation)
+  @param size  element size (expected to match the allocation)
+  @param ptr   pointer to memory, a NULL pointer is ignored
+*/
+void pfs_free_array(PFS_builtin_memory_class *klass, size_t n, size_t size, void *ptr)
+{
+  if (ptr == NULL)
+    return;
+  size_t array_size= n * size; /* total size in bytes, handed to pfs_free() */
+  /* Overflow should have been detected by pfs_malloc_array. */
+  assert(!is_overflow(array_size, n, size));
+  return pfs_free(klass, array_size, ptr);
 }
 
 /**
@@ -170,6 +191,22 @@ bool is_overflow(size_t product, size_t n1, size_t n2)
     return false;
 }
 
+void pfs_print_error(const char *format, ...)
+{
+  va_list args;
+  va_start(args, format);
+  /*
+    Printing to anything else, like the error log, would generate even more
+    recursive calls to the performance schema implementation
+    (file io is instrumented), so that could lead to catastrophic results.
+    Printing to something safe, and low level: stderr only.
+  */
+  vfprintf(stderr, format, args);
+  va_end(args);
+  fflush(stderr);
+}
+
+
 /** Convert raw ip address into readable format. Do not do a reverse DNS lookup. */
 
 uint pfs_get_socket_address(char *host,
@@ -178,9 +215,9 @@ uint pfs_get_socket_address(char *host,
                             const struct sockaddr_storage *src_addr,
                             socklen_t src_len)
 {
-  DBUG_ASSERT(host);
-  DBUG_ASSERT(src_addr);
-  DBUG_ASSERT(port);
+  assert(host);
+  assert(src_addr);
+  assert(port);
 
   memset(host, 0, host_len);
   *port= 0;
@@ -192,7 +229,7 @@ uint pfs_get_socket_address(char *host,
       if (host_len < INET_ADDRSTRLEN+1)
         return 0;
       struct sockaddr_in *sa4= (struct sockaddr_in *)(src_addr);
-    #ifdef __WIN__
+    #ifdef _WIN32
       /* Older versions of Windows do not support inet_ntop() */
       getnameinfo((struct sockaddr *)sa4, sizeof(struct sockaddr_in),
                   host, host_len, NULL, 0, NI_NUMERICHOST);
@@ -209,7 +246,7 @@ uint pfs_get_socket_address(char *host,
       if (host_len < INET6_ADDRSTRLEN+1)
         return 0;
       struct sockaddr_in6 *sa6= (struct sockaddr_in6 *)(src_addr);
-    #ifdef __WIN__
+    #ifdef _WIN32
       /* Older versions of Windows do not support inet_ntop() */
       getnameinfo((struct sockaddr *)sa6, sizeof(struct sockaddr_in6),
                   host, host_len, NULL, 0, NI_NUMERICHOST);
@@ -228,4 +265,3 @@ uint pfs_get_socket_address(char *host,
   /* Return actual IP address string length */
   return ((uint)strlen((const char*)host));
 }
-
diff --git a/storage/perfschema/pfs_global.h b/storage/perfschema/pfs_global.h
index 58db39e607d..48a5587839e 100644
--- a/storage/perfschema/pfs_global.h
+++ b/storage/perfschema/pfs_global.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -32,7 +32,6 @@
 
 /** True when the performance schema is initialized. */
 extern bool pfs_initialized;
-
 /** Total memory allocated by the performance schema, in bytes. */
 extern size_t pfs_allocated_memory;
 
@@ -48,23 +47,74 @@ extern size_t pfs_allocated_memory;
 #define PFS_ALIGNED
 #endif /* HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_MALLOC */
 
-void *pfs_malloc(size_t size, myf flags);
+#ifdef CPU_LEVEL1_DCACHE_LINESIZE
+#define PFS_CACHE_LINE_SIZE CPU_LEVEL1_DCACHE_LINESIZE
+#else
+#define PFS_CACHE_LINE_SIZE 128
+#endif
+
+/**
+  A uint32 variable, padded to be alone in a CPU cache line when aligned to it.
+  This is for performance, for variables accessed very frequently.
+*/
+struct PFS_cacheline_uint32
+{
+  uint32 m_u32; /* the value itself */
+  char m_full_cache_line[PFS_CACHE_LINE_SIZE - sizeof(uint32)]; /* padding */
+
+  PFS_cacheline_uint32()
+  : m_u32(0)
+  {}
+};
+
+/**
+  A uint64 variable, padded to be alone in a CPU cache line when aligned to it.
+  This is for performance, for variables accessed very frequently.
+*/
+struct PFS_cacheline_uint64
+{
+  uint64 m_u64; /* the value itself */
+  char m_full_cache_line[PFS_CACHE_LINE_SIZE - sizeof(uint64)]; /* padding */
+
+  PFS_cacheline_uint64()
+  : m_u64(0)
+  {}
+};
+
+struct PFS_builtin_memory_class;
+
+/** Memory allocation for the performance schema. */
+void *pfs_malloc(PFS_builtin_memory_class *klass, size_t size, myf flags);
 
 /** Allocate an array of structures with overflow check. */
-void *pfs_malloc_array(size_t n, size_t size, myf flags);
+void *pfs_malloc_array(PFS_builtin_memory_class *klass, size_t n, size_t size, myf flags);
 
 /**
   Helper, to allocate an array of structures.
+  @param k memory class
   @param n number of elements in the array
   @param s size of array element
   @param T type of an element
   @param f flags to use when allocating memory
 */
-#define PFS_MALLOC_ARRAY(n, s, T, f) \
-  reinterpret_cast<T*>(pfs_malloc_array((n), (s), (f)))
+#define PFS_MALLOC_ARRAY(k, n, s, T, f) \
+  reinterpret_cast<T*>(pfs_malloc_array((k), (n), (s), (f)))
 
 /** Free memory allocated with @sa pfs_malloc. */
-void pfs_free(void *ptr);
+void pfs_free(PFS_builtin_memory_class *klass, size_t size, void *ptr);
+
+/** Free memory allocated with @sa pfs_malloc_array. */
+void pfs_free_array(PFS_builtin_memory_class *klass, size_t n, size_t size, void *ptr);
+
+/**
+  Helper, to free an array of structures.
+  @param k memory class
+  @param n number of elements in the array
+  @param s size of array element
+  @param p the array to free
+*/
+#define PFS_FREE_ARRAY(k, n, s, p) \
+  pfs_free_array((k), (n), (s), (p))
 
 /** Detect multiplication overflow. */
 bool is_overflow(size_t product, size_t n1, size_t n2);
@@ -125,7 +175,7 @@ inline uint randomized_index(const void *ptr, uint max_size)
   seed2= seed1*seed1;
   seed1= result;
 
-  DBUG_ASSERT(result < max_size);
+  assert(result < max_size);
   return result;
 }
 
diff --git a/storage/perfschema/pfs_host.cc b/storage/perfschema/pfs_host.cc
index d6461ef3851..7d7af7700c2 100644
--- a/storage/perfschema/pfs_host.cc
+++ b/storage/perfschema/pfs_host.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -34,21 +34,13 @@
 #include "pfs_host.h"
 #include "pfs_global.h"
 #include "pfs_instr_class.h"
+#include "pfs_buffer_container.h"
 
 /**
   @addtogroup Performance_schema_buffers
   @{
 */
 
-ulong host_max;
-ulong host_lost;
-
-PFS_host *host_array= NULL;
-
-static PFS_single_stat *host_instr_class_waits_array= NULL;
-static PFS_stage_stat *host_instr_class_stages_array= NULL;
-static PFS_statement_stat *host_instr_class_statements_array= NULL;
-
 LF_HASH host_hash;
 static bool host_hash_inited= false;
 
@@ -59,59 +51,8 @@ static bool host_hash_inited= false;
 */
 int init_host(const PFS_global_param *param)
 {
-  uint index;
-
-  host_max= param->m_host_sizing;
-
-  host_array= NULL;
-  host_instr_class_waits_array= NULL;
-  host_instr_class_stages_array= NULL;
-  host_instr_class_statements_array= NULL;
-  uint waits_sizing= host_max * wait_class_max;
-  uint stages_sizing= host_max * stage_class_max;
-  uint statements_sizing= host_max * statement_class_max;
-
-  if (host_max > 0)
-  {
-    host_array= PFS_MALLOC_ARRAY(host_max, sizeof(PFS_host), PFS_host,
-                                 MYF(MY_ZEROFILL));
-    if (unlikely(host_array == NULL))
-      return 1;
-  }
-
-  if (waits_sizing > 0)
-  {
-    host_instr_class_waits_array=
-      PFS_connection_slice::alloc_waits_slice(waits_sizing);
-    if (unlikely(host_instr_class_waits_array == NULL))
-      return 1;
-  }
-
-  if (stages_sizing > 0)
-  {
-    host_instr_class_stages_array=
-      PFS_connection_slice::alloc_stages_slice(stages_sizing);
-    if (unlikely(host_instr_class_stages_array == NULL))
-      return 1;
-  }
-
-  if (statements_sizing > 0)
-  {
-    host_instr_class_statements_array=
-      PFS_connection_slice::alloc_statements_slice(statements_sizing);
-    if (unlikely(host_instr_class_statements_array == NULL))
-      return 1;
-  }
-
-  for (index= 0; index < host_max; index++)
-  {
-    host_array[index].m_instr_class_waits_stats=
-      &host_instr_class_waits_array[index * wait_class_max];
-    host_array[index].m_instr_class_stages_stats=
-      &host_instr_class_stages_array[index * stage_class_max];
-    host_array[index].m_instr_class_statements_stats=
-      &host_instr_class_statements_array[index * statement_class_max];
-  }
+  if (global_host_container.init(param->m_host_sizing))
+    return 1;
 
   return 0;
 }
@@ -119,15 +60,7 @@ int init_host(const PFS_global_param *param)
 /** Cleanup all the host buffers. */
 void cleanup_host(void)
 {
-  pfs_free(host_array);
-  host_array= NULL;
-  pfs_free(host_instr_class_waits_array);
-  host_instr_class_waits_array= NULL;
-  pfs_free(host_instr_class_stages_array);
-  host_instr_class_stages_array= NULL;
-  pfs_free(host_instr_class_statements_array);
-  host_instr_class_statements_array= NULL;
-  host_max= 0;
+  global_host_container.cleanup();
 }
 
 C_MODE_START
@@ -138,9 +71,9 @@ static uchar *host_hash_get_key(const uchar *entry, size_t *length,
   const PFS_host *host;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_host* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   host= *typed_entry;
-  DBUG_ASSERT(host != NULL);
+  assert(host != NULL);
   *length= host->m_key.m_key_length;
   result= host->m_key.m_hash_key;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -151,13 +84,12 @@ C_MODE_END
   Initialize the host hash.
   @return 0 on success
 */
-int init_host_hash(void)
+int init_host_hash(const PFS_global_param *param)
 {
-  if ((! host_hash_inited) && (host_max > 0))
+  if ((! host_hash_inited) && (param->m_host_sizing != 0))
   {
     lf_hash_init(&host_hash, sizeof(PFS_host*), LF_HASH_UNIQUE,
                  0, 0, host_hash_get_key, &my_charset_bin);
-    /* host_hash.size= host_max; */
     host_hash_inited= true;
   }
   return 0;
@@ -187,7 +119,7 @@ static LF_PINS* get_host_hash_pins(PFS_thread *thread)
 static void set_host_key(PFS_host_key *key,
                          const char *host, uint host_length)
 {
-  DBUG_ASSERT(host_length <= HOSTNAME_LENGTH);
+  assert(host_length <= HOSTNAME_LENGTH);
 
   char *ptr= &key->m_hash_key[0];
   if (host_length > 0)
@@ -203,16 +135,12 @@ static void set_host_key(PFS_host_key *key,
 PFS_host *find_or_create_host(PFS_thread *thread,
                               const char *hostname, uint hostname_length)
 {
-  if (host_max == 0)
-  {
-    host_lost++;
-    return NULL;
-  }
+  static PFS_ALIGNED PFS_cacheline_uint32 monotonic;
 
   LF_PINS *pins= get_host_hash_pins(thread);
   if (unlikely(pins == NULL))
   {
-    host_lost++;
+    global_host_container.m_lost++;
     return NULL;
   }
 
@@ -220,8 +148,10 @@ PFS_host *find_or_create_host(PFS_thread *thread,
   set_host_key(&key, hostname, hostname_length);
 
   PFS_host **entry;
+  PFS_host *pfs;
   uint retry_count= 0;
   const uint retry_max= 3;
+  pfs_dirty_state dirty_state;
 
 search:
   entry= reinterpret_cast<PFS_host**>
@@ -238,68 +168,55 @@ search:
 
   lf_hash_search_unpin(pins);
 
-  PFS_scan scan;
-  uint random= randomized_index(hostname, host_max);
-
-  for (scan.init(random, host_max);
-       scan.has_pass();
-       scan.next_pass())
+  pfs= global_host_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    PFS_host *pfs= host_array + scan.first();
-    PFS_host *pfs_last= host_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    pfs->m_key= key;
+    if (hostname_length > 0)
+      pfs->m_hostname= &pfs->m_key.m_hash_key[0];
+    else
+      pfs->m_hostname= NULL;
+    pfs->m_hostname_length= hostname_length;
+
+    pfs->init_refcount();
+    pfs->reset_stats();
+    pfs->m_disconnected_count= 0;
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&host_hash, pins, &pfs);
+    if (likely(res == 0))
     {
-      if (pfs->m_lock.is_free())
+      return pfs;
+    }
+
+    global_host_container.deallocate(pfs);
+
+    if (res > 0)
+    {
+      if (++retry_count > retry_max)
       {
-        if (pfs->m_lock.free_to_dirty())
-        {
-          pfs->m_key= key;
-          if (hostname_length > 0)
-            pfs->m_hostname= &pfs->m_key.m_hash_key[0];
-          else
-            pfs->m_hostname= NULL;
-          pfs->m_hostname_length= hostname_length;
-
-          pfs->init_refcount();
-          pfs->reset_stats();
-          pfs->m_disconnected_count= 0;
-
-          int res;
-          res= lf_hash_insert(&host_hash, pins, &pfs);
-          if (likely(res == 0))
-          {
-            pfs->m_lock.dirty_to_allocated();
-            return pfs;
-          }
-
-          pfs->m_lock.dirty_to_free();
-
-          if (res > 0)
-          {
-            if (++retry_count > retry_max)
-            {
-              host_lost++;
-              return NULL;
-            }
-            goto search;
-          }
-
-          host_lost++;
-          return NULL;
-        }
+        global_host_container.m_lost++;
+        return NULL;
       }
+      goto search;
     }
+
+    global_host_container.m_lost++;
+    return NULL;
   }
 
-  host_lost++;
   return NULL;
 }
 
-void PFS_host::aggregate()
+void PFS_host::aggregate(bool alive)
 {
   aggregate_waits();
   aggregate_stages();
   aggregate_statements();
+  aggregate_transactions();
+  aggregate_memory(alive);
+  aggregate_status();
   aggregate_stats();
 }
 
@@ -311,24 +228,63 @@ void PFS_host::aggregate_waits()
 
 void PFS_host::aggregate_stages()
 {
+  if (read_instr_class_stages_stats() == NULL)
+    return;
+
   /*
     Aggregate EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME to:
     -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
   */
-  aggregate_all_stages(m_instr_class_stages_stats,
+  aggregate_all_stages(write_instr_class_stages_stats(),
                        global_instr_class_stages_array);
 }
 
 void PFS_host::aggregate_statements()
 {
+  if (read_instr_class_statements_stats() == NULL)
+    return;
+
   /*
     Aggregate EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME to:
     -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
   */
-  aggregate_all_statements(m_instr_class_statements_stats,
+  aggregate_all_statements(write_instr_class_statements_stats(),
                            global_instr_class_statements_array);
 }
 
+void PFS_host::aggregate_transactions()
+{
+  if (read_instr_class_transactions_stats() == NULL)
+    return;
+
+  /*
+    Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME to:
+    -  EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_transactions(write_instr_class_transactions_stats(),
+                             &global_transaction_stat);
+}
+
+void PFS_host::aggregate_memory(bool alive)
+{
+  if (read_instr_class_memory_stats() == NULL)
+    return;
+
+  /*
+    Aggregate MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME to:
+    - MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_memory(alive,
+                       write_instr_class_memory_stats(),
+                       global_instr_class_memory_array);
+}
+
+void PFS_host::aggregate_status()
+{
+  /* No parent to aggregate to, clean the stats */
+  m_status_stats.reset();
+}
+
 void PFS_host::aggregate_stats()
 {
   /* No parent to aggregate to, clean the stats */
@@ -340,12 +296,24 @@ void PFS_host::release()
   dec_refcount();
 }
 
+void PFS_host::carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index)
+{
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  PFS_memory_stat_delta delta_buffer; /* storage handed to apply_delta() */
+  PFS_memory_stat_delta *remaining_delta;
+
+  event_name_array= write_instr_class_memory_stats();
+  stat= & event_name_array[index]; /* stat of this host at position 'index' */
+  remaining_delta= stat->apply_delta(delta, &delta_buffer);
+
+  if (remaining_delta != NULL) /* pass what apply_delta() returned to the global stat */
+    carry_global_memory_stat_delta(remaining_delta, index);
+}
+
 PFS_host *sanitize_host(PFS_host *unsafe)
 {
-  if ((&host_array[0] <= unsafe) &&
-      (unsafe < &host_array[host_max]))
-    return unsafe;
-  return NULL;
+  return global_host_container.sanitize(unsafe);
 }
 
 void purge_host(PFS_thread *thread, PFS_host *host)
@@ -360,18 +328,38 @@ void purge_host(PFS_thread *thread, PFS_host *host)
                     host->m_key.m_hash_key, host->m_key.m_key_length));
   if (entry && (entry != MY_ERRPTR))
   {
-    DBUG_ASSERT(*entry == host);
+    assert(*entry == host);
     if (host->get_refcount() == 0)
     {
       lf_hash_delete(&host_hash, pins,
                      host->m_key.m_hash_key, host->m_key.m_key_length);
-      host->m_lock.allocated_to_free();
+      host->aggregate(false);
+      global_host_container.deallocate(host);
     }
   }
 
   lf_hash_search_unpin(pins);
 }
 
+class Proc_purge_host /* processor given to global_host_container.apply() */
+  : public PFS_buffer_processor<PFS_host>
+{
+public:
+  Proc_purge_host(PFS_thread *thread)
+    : m_thread(thread)
+  {}
+
+  virtual void operator()(PFS_host *pfs)
+  {
+    pfs->aggregate(true);
+    if (pfs->get_refcount() == 0) /* no reference left on this host */
+      purge_host(m_thread, pfs);
+  }
+
+private:
+  PFS_thread *m_thread; /* thread forwarded to purge_host() */
+};
+
 /** Purge non connected hosts, reset stats of connected hosts. */
 void purge_all_host(void)
 {
@@ -379,18 +367,8 @@ void purge_all_host(void)
   if (unlikely(thread == NULL))
     return;
 
-  PFS_host *pfs= host_array;
-  PFS_host *pfs_last= host_array + host_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      pfs->aggregate();
-      if (pfs->get_refcount() == 0)
-        purge_host(thread, pfs);
-    }
-  }
+  Proc_purge_host proc(thread);
+  global_host_container.apply(proc);
 }
 
 /** @} */
diff --git a/storage/perfschema/pfs_host.h b/storage/perfschema/pfs_host.h
index d52207d3571..71ef0d7f5ad 100644
--- a/storage/perfschema/pfs_host.h
+++ b/storage/perfschema/pfs_host.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -40,6 +40,7 @@ struct PFS_thread;
   @{
 */
 
+/** Hash key for a host. */
 struct PFS_host_key
 {
   /**
@@ -51,6 +52,7 @@ struct PFS_host_key
   uint m_key_length;
 };
 
+/** Per host statistics. */
 struct PFS_ALIGNED PFS_host : PFS_connection_slice
 {
 public:
@@ -74,13 +76,18 @@ public:
     PFS_atomic::add_32(& m_refcount, -1);
   }
 
-  void aggregate(void);
+  void aggregate(bool alive);
   void aggregate_waits(void);
   void aggregate_stages(void);
   void aggregate_statements(void);
+  void aggregate_transactions(void);
+  void aggregate_memory(bool alive);
+  void aggregate_status(void);
   void aggregate_stats(void);
   void release(void);
 
+  void carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index);
+
   /* Internal lock. */
   pfs_lock m_lock;
   PFS_host_key m_key;
@@ -95,7 +102,7 @@ private:
 
 int init_host(const PFS_global_param *param);
 void cleanup_host(void);
-int init_host_hash(void);
+int init_host_hash(const PFS_global_param *param);
 void cleanup_host_hash(void);
 
 PFS_host *find_or_create_host(PFS_thread *thread,
@@ -104,14 +111,7 @@ PFS_host *find_or_create_host(PFS_thread *thread,
 PFS_host *sanitize_host(PFS_host *unsafe);
 void purge_all_host(void);
 
-/* For iterators and show status. */
-
-extern ulong host_max;
-extern ulong host_lost;
-
-/* Exposing the data directly, for iterators. */
-
-extern PFS_host *host_array;
+/* For show status. */
 
 extern LF_HASH host_hash;
 
diff --git a/storage/perfschema/pfs_instr.cc b/storage/perfschema/pfs_instr.cc
index fd8da77fe40..41b1b24295a 100644
--- a/storage/perfschema/pfs_instr.cc
+++ b/storage/perfschema/pfs_instr.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -37,115 +37,45 @@
 #include "pfs_account.h"
 #include "pfs_global.h"
 #include "pfs_instr_class.h"
+#include "pfs_buffer_container.h"
+#include "pfs_builtin_memory.h"
+
+ulong nested_statement_lost= 0;
 
 /**
   @addtogroup Performance_schema_buffers
   @{
 */
 
-/** Size of the mutex instances array. @sa mutex_array */
-ulong mutex_max;
-/** True when @c mutex_array is full. */
-bool mutex_full;
-/** Number of mutexes instance lost. @sa mutex_array */
-ulong mutex_lost;
-/** Size of the rwlock instances array. @sa rwlock_array */
-ulong rwlock_max;
-/** True when @c rwlock_array is full. */
-bool rwlock_full;
-/** Number or rwlock instances lost. @sa rwlock_array */
-ulong rwlock_lost;
-/** Size of the conditions instances array. @sa cond_array */
-ulong cond_max;
-/** True when @c cond_array is full. */
-bool cond_full;
-/** Number of conditions instances lost. @sa cond_array */
-ulong cond_lost;
-/** Size of the thread instances array. @sa thread_array */
-ulong thread_max;
-/** True when @c thread_array is full. */
-bool thread_full;
-/** Number or thread instances lost. @sa thread_array */
-ulong thread_lost;
-/** Size of the file instances array. @sa file_array */
-ulong file_max;
-/** True when @c file_array is full. */
-bool file_full;
-/** Number of file instances lost. @sa file_array */
-ulong file_lost;
 /**
   Size of the file handle array. @sa file_handle_array.
   Signed value, for easier comparisons with a file descriptor number.
 */
-long file_handle_max;
+long file_handle_max= 0;
 /** True when @c file_handle_array is full. */
 bool file_handle_full;
 /** Number of file handle lost. @sa file_handle_array */
-ulong file_handle_lost;
-/** Size of the table instances array. @sa table_array */
-ulong table_max;
-/** True when @c table_array is full. */
-bool table_full;
-/** Number of table instances lost. @sa table_array */
-ulong table_lost;
-/** Size of the socket instances array. @sa socket_array */
-ulong socket_max;
-/** True when @c socket_array is full. */
-bool socket_full;
-/** Number of socket instances lost. @sa socket_array */
-ulong socket_lost;
+ulong file_handle_lost= 0;
 /** Number of EVENTS_WAITS_HISTORY records per thread. */
-ulong events_waits_history_per_thread;
+ulong events_waits_history_per_thread= 0;
 /** Number of EVENTS_STAGES_HISTORY records per thread. */
-ulong events_stages_history_per_thread;
+ulong events_stages_history_per_thread= 0;
 /** Number of EVENTS_STATEMENTS_HISTORY records per thread. */
-ulong events_statements_history_per_thread;
-uint statement_stack_max;
+ulong events_statements_history_per_thread= 0;
+uint statement_stack_max= 0;
 size_t pfs_max_digest_length= 0;
+size_t pfs_max_sqltext= 0;
 /** Number of locker lost. @sa LOCKER_STACK_SIZE. */
 ulong locker_lost= 0;
-/** Number of statement lost. @sa STATEMENT_STACK_SIZE. */
+/** Number of statements lost. @sa STATEMENT_STACK_SIZE. */
 ulong statement_lost= 0;
 /** Size of connection attribute storage per thread */
 ulong session_connect_attrs_size_per_thread;
 /** Number of connection attributes lost */
 ulong session_connect_attrs_lost= 0;
 
-/**
-  Mutex instrumentation instances array.
-  @sa mutex_max
-  @sa mutex_lost
-*/
-PFS_mutex *mutex_array= NULL;
-
-/**
-  RWLock instrumentation instances array.
-  @sa rwlock_max
-  @sa rwlock_lost
-*/
-PFS_rwlock *rwlock_array= NULL;
-
-/**
-  Condition instrumentation instances array.
-  @sa cond_max
-  @sa cond_lost
-*/
-PFS_cond *cond_array= NULL;
-
-/**
-  Thread instrumentation instances array.
-  @sa thread_max
-  @sa thread_lost
-*/
-PFS_thread *thread_array= NULL;
-
-/**
-  File instrumentation instances array.
-  @sa file_max
-  @sa file_lost
-  @sa pfs_filename_hash
-*/
-PFS_file *file_array= NULL;
+/** Number of EVENTS_TRANSACTIONS_HISTORY records per thread. */
+ulong events_transactions_history_per_thread= 0;
 
 /**
   File instrumentation handle array.
@@ -154,45 +84,19 @@ PFS_file *file_array= NULL;
 */
 PFS_file **file_handle_array= NULL;
 
-/**
-  Table instrumentation instances array.
-  @sa table_max
-  @sa table_lost
-*/
-PFS_table *table_array= NULL;
-
-/**
-  Socket instrumentation instances array.
-  @sa socket_max
-  @sa socket_lost
-*/
-PFS_socket *socket_array= NULL;
-
 PFS_stage_stat *global_instr_class_stages_array= NULL;
 PFS_statement_stat *global_instr_class_statements_array= NULL;
+PFS_memory_stat *global_instr_class_memory_array= NULL;
 
-static volatile uint64 thread_internal_id_counter= 0;
-
-static uint thread_instr_class_waits_sizing;
-static uint thread_instr_class_stages_sizing;
-static uint thread_instr_class_statements_sizing;
-static PFS_single_stat *thread_instr_class_waits_array= NULL;
-static PFS_stage_stat *thread_instr_class_stages_array= NULL;
-static PFS_statement_stat *thread_instr_class_statements_array= NULL;
-
-static PFS_events_waits *thread_waits_history_array= NULL;
-static PFS_events_stages *thread_stages_history_array= NULL;
-static PFS_events_statements *thread_statements_history_array= NULL;
-static PFS_events_statements *thread_statements_stack_array= NULL;
-static unsigned char *current_stmts_digest_token_array= NULL;
-static unsigned char *history_stmts_digest_token_array= NULL;
-static char *thread_session_connect_attrs_array= NULL;
+static PFS_ALIGNED PFS_cacheline_uint64 thread_internal_id_counter;
 
 /** Hash table for instrumented files. */
 LF_HASH pfs_filename_hash;
 /** True if pfs_filename_hash is initialized. */
 static bool filename_hash_inited= false;
 
+my_bool show_compatibility_56= 0;
+
 /**
   Initialize all the instruments instance buffers.
   @param param                        sizing parameters
@@ -200,291 +104,76 @@ static bool filename_hash_inited= false;
 */
 int init_instruments(const PFS_global_param *param)
 {
-  PFS_events_statements *pfs_stmt;
-  unsigned char *pfs_tokens;
-
-  uint thread_waits_history_sizing;
-  uint thread_stages_history_sizing;
-  uint thread_statements_history_sizing;
-  uint thread_statements_stack_sizing;
-  uint thread_session_connect_attrs_sizing;
   uint index;
 
   /* Make sure init_event_name_sizing is called */
-  DBUG_ASSERT(wait_class_max != 0);
-
-  mutex_max= param->m_mutex_sizing;
-  mutex_full= false;
-  mutex_lost= 0;
-  rwlock_max= param->m_rwlock_sizing;
-  rwlock_full= false;
-  rwlock_lost= 0;
-  cond_max= param->m_cond_sizing;
-  cond_full= false;
-  cond_lost= 0;
-  file_max= param->m_file_sizing;
-  file_full= false;
-  file_lost= 0;
+  assert(wait_class_max != 0);
+
   file_handle_max= param->m_file_handle_sizing;
   file_handle_full= false;
   file_handle_lost= 0;
 
   pfs_max_digest_length= param->m_max_digest_length;
-
-  table_max= param->m_table_sizing;
-  table_full= false;
-  table_lost= 0;
-  thread_max= param->m_thread_sizing;
-  thread_full= false;
-  thread_lost= 0;
-  socket_max= param->m_socket_sizing;
-  socket_full= false;
-  socket_lost= 0;
+  pfs_max_sqltext= param->m_max_sql_text_length;
 
   events_waits_history_per_thread= param->m_events_waits_history_sizing;
-  thread_waits_history_sizing= param->m_thread_sizing
-    * events_waits_history_per_thread;
-
-  thread_instr_class_waits_sizing= param->m_thread_sizing
-    * wait_class_max;
 
   events_stages_history_per_thread= param->m_events_stages_history_sizing;
-  thread_stages_history_sizing= param->m_thread_sizing
-    * events_stages_history_per_thread;
 
   events_statements_history_per_thread= param->m_events_statements_history_sizing;
-  thread_statements_history_sizing= param->m_thread_sizing
-    * events_statements_history_per_thread;
-
-  statement_stack_max= 1;
-  thread_statements_stack_sizing= param->m_thread_sizing * statement_stack_max;
 
-  thread_instr_class_stages_sizing= param->m_thread_sizing
-    * param->m_stage_class_sizing;
+  statement_stack_max= param->m_statement_stack_sizing;
 
-  thread_instr_class_statements_sizing= param->m_thread_sizing
-    * param->m_statement_class_sizing;
+  events_transactions_history_per_thread= param->m_events_transactions_history_sizing;
 
   session_connect_attrs_size_per_thread= param->m_session_connect_attrs_sizing;
-  thread_session_connect_attrs_sizing= param->m_thread_sizing
-    * session_connect_attrs_size_per_thread;
   session_connect_attrs_lost= 0;
 
-  size_t current_digest_tokens_sizing= param->m_thread_sizing * pfs_max_digest_length * statement_stack_max;
-  size_t history_digest_tokens_sizing= param->m_thread_sizing * pfs_max_digest_length * events_statements_history_per_thread;
-
-  mutex_array= NULL;
-  rwlock_array= NULL;
-  cond_array= NULL;
-  file_array= NULL;
   file_handle_array= NULL;
-  table_array= NULL;
-  socket_array= NULL;
-  thread_array= NULL;
-  thread_waits_history_array= NULL;
-  thread_stages_history_array= NULL;
-  thread_statements_history_array= NULL;
-  thread_statements_stack_array= NULL;
-  current_stmts_digest_token_array= NULL;
-  history_stmts_digest_token_array= NULL;
-  thread_instr_class_waits_array= NULL;
-  thread_instr_class_stages_array= NULL;
-  thread_instr_class_statements_array= NULL;
-  thread_internal_id_counter= 0;
-
-  if (mutex_max > 0)
-  {
-    mutex_array= PFS_MALLOC_ARRAY(mutex_max, sizeof(PFS_mutex), PFS_mutex, MYF(MY_ZEROFILL));
-    if (unlikely(mutex_array == NULL))
-      return 1;
-  }
 
-  if (rwlock_max > 0)
-  {
-    rwlock_array= PFS_MALLOC_ARRAY(rwlock_max, sizeof(PFS_rwlock), PFS_rwlock, MYF(MY_ZEROFILL));
-    if (unlikely(rwlock_array == NULL))
-      return 1;
-  }
+  thread_internal_id_counter.m_u64= 0;
 
-  if (cond_max > 0)
-  {
-    cond_array= PFS_MALLOC_ARRAY(cond_max, sizeof(PFS_cond), PFS_cond, MYF(MY_ZEROFILL));
-    if (unlikely(cond_array == NULL))
-      return 1;
-  }
+  if (global_mutex_container.init(param->m_mutex_sizing))
+    return 1;
 
-  if (file_max > 0)
-  {
-    file_array= PFS_MALLOC_ARRAY(file_max, sizeof(PFS_file), PFS_file, MYF(MY_ZEROFILL));
-    if (unlikely(file_array == NULL))
-      return 1;
-  }
+  if (global_rwlock_container.init(param->m_rwlock_sizing))
+    return 1;
 
-  if (file_handle_max > 0)
-  {
-    file_handle_array= PFS_MALLOC_ARRAY(file_handle_max, sizeof(PFS_file*), PFS_file*, MYF(MY_ZEROFILL));
-    if (unlikely(file_handle_array == NULL))
-      return 1;
-  }
+  if (global_cond_container.init(param->m_cond_sizing))
+    return 1;
 
-  if (table_max > 0)
-  {
-    table_array= PFS_MALLOC_ARRAY(table_max, sizeof(PFS_table), PFS_table, MYF(MY_ZEROFILL));
-    if (unlikely(table_array == NULL))
-      return 1;
-  }
-
-  if (socket_max > 0)
-  {
-    socket_array= PFS_MALLOC_ARRAY(socket_max, sizeof(PFS_socket), PFS_socket, MYF(MY_ZEROFILL));
-    if (unlikely(socket_array == NULL))
-      return 1;
-  }
-
-  if (thread_max > 0)
-  {
-    thread_array= PFS_MALLOC_ARRAY(thread_max, sizeof(PFS_thread), PFS_thread, MYF(MY_ZEROFILL));
-    if (unlikely(thread_array == NULL))
-      return 1;
-  }
+  if (global_file_container.init(param->m_file_sizing))
+    return 1;
 
-  if (thread_waits_history_sizing > 0)
-  {
-    thread_waits_history_array=
-      PFS_MALLOC_ARRAY(thread_waits_history_sizing, sizeof(PFS_events_waits), PFS_events_waits,
-                       MYF(MY_ZEROFILL));
-    if (unlikely(thread_waits_history_array == NULL))
-      return 1;
-  }
-
-  if (thread_instr_class_waits_sizing > 0)
-  {
-    thread_instr_class_waits_array=
-      PFS_MALLOC_ARRAY(thread_instr_class_waits_sizing,
-                       sizeof(PFS_single_stat), PFS_single_stat, MYF(MY_ZEROFILL));
-    if (unlikely(thread_instr_class_waits_array == NULL))
-      return 1;
-
-    for (index= 0; index < thread_instr_class_waits_sizing; index++)
-      thread_instr_class_waits_array[index].reset();
-  }
-
-  if (thread_stages_history_sizing > 0)
-  {
-    thread_stages_history_array=
-      PFS_MALLOC_ARRAY(thread_stages_history_sizing, sizeof(PFS_events_stages), PFS_events_stages,
-                       MYF(MY_ZEROFILL));
-    if (unlikely(thread_stages_history_array == NULL))
-      return 1;
-  }
-
-  if (thread_instr_class_stages_sizing > 0)
-  {
-    thread_instr_class_stages_array=
-      PFS_MALLOC_ARRAY(thread_instr_class_stages_sizing,
-                       sizeof(PFS_stage_stat), PFS_stage_stat, MYF(MY_ZEROFILL));
-    if (unlikely(thread_instr_class_stages_array == NULL))
-      return 1;
-
-    for (index= 0; index < thread_instr_class_stages_sizing; index++)
-      thread_instr_class_stages_array[index].reset();
-  }
-
-  if (thread_statements_history_sizing > 0)
-  {
-    thread_statements_history_array=
-      PFS_MALLOC_ARRAY(thread_statements_history_sizing, sizeof(PFS_events_statements),
-                       PFS_events_statements, MYF(MY_ZEROFILL));
-    if (unlikely(thread_statements_history_array == NULL))
-      return 1;
-  }
-
-  if (thread_statements_stack_sizing > 0)
-  {
-    thread_statements_stack_array=
-      PFS_MALLOC_ARRAY(thread_statements_stack_sizing, sizeof(PFS_events_statements),
-                       PFS_events_statements, MYF(MY_ZEROFILL));
-    if (unlikely(thread_statements_stack_array == NULL))
-      return 1;
-  }
-
-  if (thread_instr_class_statements_sizing > 0)
-  {
-    thread_instr_class_statements_array=
-      PFS_MALLOC_ARRAY(thread_instr_class_statements_sizing,
-                       sizeof(PFS_statement_stat), PFS_statement_stat, MYF(MY_ZEROFILL));
-    if (unlikely(thread_instr_class_statements_array == NULL))
-      return 1;
-
-    for (index= 0; index < thread_instr_class_statements_sizing; index++)
-      thread_instr_class_statements_array[index].reset();
-  }
-
-  if (thread_session_connect_attrs_sizing > 0)
-  {
-    thread_session_connect_attrs_array=
-      (char *)pfs_malloc(thread_session_connect_attrs_sizing, MYF(MY_ZEROFILL));
-    if (unlikely(thread_session_connect_attrs_array == NULL))
-      return 1;
-  }
-
-  if (current_digest_tokens_sizing > 0)
+  if (file_handle_max > 0)
   {
-    current_stmts_digest_token_array=
-      (unsigned char *)pfs_malloc(current_digest_tokens_sizing, MYF(MY_ZEROFILL));
-    if (unlikely(current_stmts_digest_token_array == NULL))
+    file_handle_array= PFS_MALLOC_ARRAY(& builtin_memory_file_handle,
+                                        file_handle_max,
+                                        sizeof(PFS_file*), PFS_file*,
+                                        MYF(MY_ZEROFILL));
+    if (unlikely(file_handle_array == NULL))
       return 1;
   }
 
-  if (history_digest_tokens_sizing > 0)
-  {
-    history_stmts_digest_token_array=
-      (unsigned char *)pfs_malloc(history_digest_tokens_sizing, MYF(MY_ZEROFILL));
-    if (unlikely(history_stmts_digest_token_array == NULL))
-      return 1;
-  }
+  if (global_table_container.init(param->m_table_sizing))
+    return 1;
 
-  for (index= 0; index < thread_max; index++)
-  {
-    thread_array[index].m_waits_history=
-      &thread_waits_history_array[index * events_waits_history_per_thread];
-    thread_array[index].m_instr_class_waits_stats=
-      &thread_instr_class_waits_array[index * wait_class_max];
-    thread_array[index].m_stages_history=
-      &thread_stages_history_array[index * events_stages_history_per_thread];
-    thread_array[index].m_instr_class_stages_stats=
-      &thread_instr_class_stages_array[index * stage_class_max];
-    thread_array[index].m_statements_history=
-      &thread_statements_history_array[index * events_statements_history_per_thread];
-    thread_array[index].m_statement_stack=
-      &thread_statements_stack_array[index * statement_stack_max];
-    thread_array[index].m_instr_class_statements_stats=
-      &thread_instr_class_statements_array[index * statement_class_max];
-    thread_array[index].m_session_connect_attrs=
-      &thread_session_connect_attrs_array[index * session_connect_attrs_size_per_thread];
-  }
+  if (global_socket_container.init(param->m_socket_sizing))
+    return 1;
 
-  for (index= 0; index < thread_statements_stack_sizing; index++)
-  {
-    pfs_stmt= & thread_statements_stack_array[index];
+  if (global_mdl_container.init(param->m_metadata_lock_sizing))
+    return 1;
 
-    pfs_tokens= & current_stmts_digest_token_array[index * pfs_max_digest_length];
-    pfs_stmt->m_digest_storage.reset(pfs_tokens, pfs_max_digest_length);
-  }
-
-  for (index= 0; index < thread_statements_history_sizing; index++)
-  {
-    pfs_stmt= & thread_statements_history_array[index];
-
-    pfs_tokens= & history_stmts_digest_token_array[index * pfs_max_digest_length];
-    pfs_stmt->m_digest_storage.reset(pfs_tokens, pfs_max_digest_length);
-  }
+  if (global_thread_container.init(param->m_thread_sizing))
+    return 1;
 
   if (stage_class_max > 0)
   {
     global_instr_class_stages_array=
-      PFS_MALLOC_ARRAY(stage_class_max,
-                       sizeof(PFS_stage_stat), PFS_stage_stat, MYF(MY_ZEROFILL));
+      PFS_MALLOC_ARRAY(& builtin_memory_global_stages,
+                       stage_class_max,
+                       sizeof(PFS_stage_stat), PFS_stage_stat,
+                       MYF(MY_ZEROFILL));
     if (unlikely(global_instr_class_stages_array == NULL))
       return 1;
 
@@ -495,8 +184,10 @@ int init_instruments(const PFS_global_param *param)
   if (statement_class_max > 0)
   {
     global_instr_class_statements_array=
-      PFS_MALLOC_ARRAY(statement_class_max,
-                       sizeof(PFS_statement_stat), PFS_statement_stat, MYF(MY_ZEROFILL));
+      PFS_MALLOC_ARRAY(& builtin_memory_global_statements,
+                       statement_class_max,
+                       sizeof(PFS_statement_stat), PFS_statement_stat,
+                       MYF(MY_ZEROFILL));
     if (unlikely(global_instr_class_statements_array == NULL))
       return 1;
 
@@ -504,60 +195,59 @@ int init_instruments(const PFS_global_param *param)
       global_instr_class_statements_array[index].reset();
   }
 
+  if (memory_class_max > 0)
+  {
+    global_instr_class_memory_array=
+      PFS_MALLOC_ARRAY(& builtin_memory_global_memory,
+                       memory_class_max,
+                       sizeof(PFS_memory_stat), PFS_memory_stat,
+                       MYF(MY_ZEROFILL));
+    if (unlikely(global_instr_class_memory_array == NULL))
+      return 1;
+
+    for (index= 0; index < memory_class_max; index++)
+      global_instr_class_memory_array[index].reset();
+  }
+
   return 0;
 }
 
 /** Cleanup all the instruments buffers. */
 void cleanup_instruments(void)
 {
-  pfs_free(mutex_array);
-  mutex_array= NULL;
-  mutex_max= 0;
-  pfs_free(rwlock_array);
-  rwlock_array= NULL;
-  rwlock_max= 0;
-  pfs_free(cond_array);
-  cond_array= NULL;
-  cond_max= 0;
-  pfs_free(file_array);
-  file_array= NULL;
-  file_max= 0;
-  pfs_free(file_handle_array);
+  global_mutex_container.cleanup();
+  global_rwlock_container.cleanup();
+  global_cond_container.cleanup();
+  global_file_container.cleanup();
+
+  PFS_FREE_ARRAY(& builtin_memory_file_handle,
+                 file_handle_max, sizeof(PFS_file*),
+                 file_handle_array);
   file_handle_array= NULL;
   file_handle_max= 0;
-  pfs_free(table_array);
-  table_array= NULL;
-  table_max= 0;
-  pfs_free(socket_array);
-  socket_array= NULL;
-  socket_max= 0;
-  pfs_free(thread_array);
-  thread_array= NULL;
-  thread_max= 0;
-  pfs_free(thread_waits_history_array);
-  thread_waits_history_array= NULL;
-  pfs_free(thread_stages_history_array);
-  thread_stages_history_array= NULL;
-  pfs_free(thread_statements_history_array);
-  thread_statements_history_array= NULL;
-  pfs_free(thread_statements_stack_array);
-  thread_statements_stack_array= NULL;
-  pfs_free(thread_instr_class_waits_array);
-  thread_instr_class_waits_array= NULL;
-  pfs_free(global_instr_class_stages_array);
+
+  global_table_container.cleanup();
+  global_socket_container.cleanup();
+  global_mdl_container.cleanup();
+  global_thread_container.cleanup();
+
+  PFS_FREE_ARRAY(& builtin_memory_global_stages,
+                 stage_class_max,
+                 sizeof(PFS_stage_stat),
+                 global_instr_class_stages_array);
   global_instr_class_stages_array= NULL;
-  pfs_free(global_instr_class_statements_array);
+
+  PFS_FREE_ARRAY(& builtin_memory_global_statements,
+                 statement_class_max,
+                 sizeof(PFS_statement_stat),
+                 global_instr_class_statements_array);
   global_instr_class_statements_array= NULL;
-  pfs_free(thread_instr_class_statements_array);
-  thread_instr_class_statements_array= NULL;
-  pfs_free(thread_instr_class_stages_array);
-  thread_instr_class_stages_array= NULL;
-  pfs_free(thread_session_connect_attrs_array);
-  thread_session_connect_attrs_array=NULL;
-  pfs_free(current_stmts_digest_token_array);
-  current_stmts_digest_token_array= NULL;
-  pfs_free(history_stmts_digest_token_array);
-  history_stmts_digest_token_array= NULL;
+
+  PFS_FREE_ARRAY(& builtin_memory_global_memory,
+                 memory_class_max,
+                 sizeof(PFS_memory_stat),
+                 global_instr_class_memory_array);
+  global_instr_class_memory_array= NULL;
 }
 
 C_MODE_START
@@ -569,9 +259,9 @@ static uchar *filename_hash_get_key(const uchar *entry, size_t *length,
   const PFS_file *file;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_file* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   file= *typed_entry;
-  DBUG_ASSERT(file != NULL);
+  assert(file != NULL);
   *length= file->m_filename_length;
   result= file->m_filename;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -582,13 +272,12 @@ C_MODE_END
   Initialize the file name hash.
   @return 0 on success
 */
-int init_file_hash(void)
+int init_file_hash(const PFS_global_param *param)
 {
-  if ((! filename_hash_inited) && (file_max > 0))
+  if ((! filename_hash_inited) && (param->m_file_sizing != 0))
   {
     lf_hash_init(&pfs_filename_hash, sizeof(PFS_file*), LF_HASH_UNIQUE,
                  0, 0, filename_hash_get_key, &my_charset_bin);
-    /* filename_hash.size= file_max; */
     filename_hash_inited= true;
   }
   return 0;
@@ -604,75 +293,6 @@ void cleanup_file_hash(void)
   }
 }
 
-void PFS_scan::init(uint random, uint max_size)
-{
-  m_pass= 0;
-
-  if (max_size == 0)
-  {
-    /* Degenerated case, no buffer */
-    m_pass_max= 0;
-    return;
-  }
-
-  DBUG_ASSERT(random < max_size);
-
-  if (PFS_MAX_ALLOC_RETRY < max_size)
-  {
-    /*
-      The buffer is big compared to PFS_MAX_ALLOC_RETRY,
-      scan it only partially.
-    */
-    if (random + PFS_MAX_ALLOC_RETRY < max_size)
-    {
-      /*
-        Pass 1: [random, random + PFS_MAX_ALLOC_RETRY - 1]
-        Pass 2: not used.
-      */
-      m_pass_max= 1;
-      m_first[0]= random;
-      m_last[0]= random + PFS_MAX_ALLOC_RETRY;
-      m_first[1]= 0;
-      m_last[1]= 0;
-    }
-    else
-    {
-      /*
-        Pass 1: [random, max_size - 1]
-        Pass 2: [0, ...]
-        The combined length of pass 1 and 2 is PFS_MAX_ALLOC_RETRY.
-      */
-      m_pass_max= 2;
-      m_first[0]= random;
-      m_last[0]= max_size;
-      m_first[1]= 0;
-      m_last[1]= PFS_MAX_ALLOC_RETRY - (max_size - random);
-    }
-  }
-  else
-  {
-    /*
-      The buffer is small compared to PFS_MAX_ALLOC_RETRY,
-      scan it in full in two passes.
-      Pass 1: [random, max_size - 1]
-      Pass 2: [0, random - 1]
-    */
-    m_pass_max= 2;
-    m_first[0]= random;
-    m_last[0]= max_size;
-    m_first[1]= 0;
-    m_last[1]= random;
-  }
-
-  DBUG_ASSERT(m_first[0] < max_size);
-  DBUG_ASSERT(m_first[1] < max_size);
-  DBUG_ASSERT(m_last[1] <= max_size);
-  DBUG_ASSERT(m_last[1] <= max_size);
-  /* The combined length of all passes should not exceed PFS_MAX_ALLOC_RETRY. */
-  DBUG_ASSERT((m_last[0] - m_first[0]) +
-              (m_last[1] - m_first[1]) <= PFS_MAX_ALLOC_RETRY);
-}
-
 /**
   Create instrumentation for a mutex instance.
   @param klass                        the mutex class
@@ -681,76 +301,25 @@ void PFS_scan::init(uint random, uint max_size)
 */
 PFS_mutex* create_mutex(PFS_mutex_class *klass, const void *identity)
 {
-  static uint PFS_ALIGNED mutex_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_mutex *pfs;
+  pfs_dirty_state dirty_state;
 
-  if (mutex_full)
+  pfs= global_mutex_container.allocate(& dirty_state, klass->m_volatility);
+  if (pfs != NULL)
   {
-    /*
-      This is a safety plug.
-      When mutex_array is severely undersized,
-      do not spin to death for each call.
-    */
-    mutex_lost++;
-    return NULL;
-  }
-
-  while (++attempts <= mutex_max)
-  {
-    /*
-      Problem:
-      Multiple threads running concurrently may need to create a new
-      instrumented mutex, and find an empty slot in mutex_array[].
-      With N1 threads running on a N2 core hardware:
-      - up to N2 hardware threads can run concurrently,
-      causing contention if looking at the same array[i] slot.
-      - up to N1 threads can run almost concurrently (with thread scheduling),
-      scanning maybe overlapping regions in the [0-mutex_max] array.
-
-      Solution:
-      Instead of letting different threads compete on the same array[i] entry,
-      this code forces all threads to cooperate with the monotonic_index.
-      Only one thread will be allowed to test a given array[i] slot.
-      All threads do scan from the same region, starting at monotonic_index.
-      Serializing on monotonic_index ensures that when a slot is found occupied
-      in a given loop by a given thread, other threads will not attempt this
-      slot.
-    */
-    index= PFS_atomic::add_u32(& mutex_monotonic_index, 1) % mutex_max;
-    pfs= mutex_array + index;
-
-    if (pfs->m_lock.is_free())
-    {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        pfs->m_identity= identity;
-        pfs->m_class= klass;
-        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-        pfs->m_timed= klass->m_timed;
-        pfs->m_mutex_stat.reset();
-        pfs->m_owner= NULL;
-        pfs->m_last_locked= 0;
-        pfs->m_lock.dirty_to_allocated();
-        if (klass->is_singleton())
-          klass->m_singleton= pfs;
-        return pfs;
-      }
-    }
+    pfs->m_identity= identity;
+    pfs->m_class= klass;
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+    pfs->m_mutex_stat.reset();
+    pfs->m_owner= NULL;
+    pfs->m_last_locked= 0;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    if (klass->is_singleton())
+      klass->m_singleton= pfs;
   }
 
-  mutex_lost++;
-  /*
-    Race condition.
-    The mutex_array might not be full if a concurrent thread
-    called destroy_mutex() during the scan, leaving one
-    empty slot we did not find.
-    However, 99.999 percent full tables or 100 percent full tables
-    are treated the same here, we declare the array overloaded.
-  */
-  mutex_full= true;
-  return NULL;
+  return pfs;
 }
 
 /**
@@ -759,15 +328,15 @@ PFS_mutex* create_mutex(PFS_mutex_class *klass, const void *identity)
 */
 void destroy_mutex(PFS_mutex *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   PFS_mutex_class *klass= pfs->m_class;
   /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME */
   klass->m_mutex_stat.aggregate(& pfs->m_mutex_stat);
   pfs->m_mutex_stat.reset();
   if (klass->is_singleton())
     klass->m_singleton= NULL;
-  pfs->m_lock.allocated_to_free();
-  mutex_full= false;
+
+  global_mutex_container.deallocate(pfs);
 }
 
 /**
@@ -778,47 +347,27 @@ void destroy_mutex(PFS_mutex *pfs)
 */
 PFS_rwlock* create_rwlock(PFS_rwlock_class *klass, const void *identity)
 {
-  static uint PFS_ALIGNED rwlock_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_rwlock *pfs;
+  pfs_dirty_state dirty_state;
 
-  if (rwlock_full)
+  pfs= global_rwlock_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    rwlock_lost++;
-    return NULL;
-  }
-
-  while (++attempts <= rwlock_max)
-  {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& rwlock_monotonic_index, 1) % rwlock_max;
-    pfs= rwlock_array + index;
-
-    if (pfs->m_lock.is_free())
-    {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        pfs->m_identity= identity;
-        pfs->m_class= klass;
-        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-        pfs->m_timed= klass->m_timed;
-        pfs->m_rwlock_stat.reset();
-        pfs->m_lock.dirty_to_allocated();
-        pfs->m_writer= NULL;
-        pfs->m_readers= 0;
-        pfs->m_last_written= 0;
-        pfs->m_last_read= 0;
-        if (klass->is_singleton())
-          klass->m_singleton= pfs;
-        return pfs;
-      }
-    }
+    pfs->m_identity= identity;
+    pfs->m_class= klass;
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+    pfs->m_rwlock_stat.reset();
+    pfs->m_writer= NULL;
+    pfs->m_readers= 0;
+    pfs->m_last_written= 0;
+    pfs->m_last_read= 0;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    if (klass->is_singleton())
+      klass->m_singleton= pfs;
   }
 
-  rwlock_lost++;
-  rwlock_full= true;
-  return NULL;
+  return pfs;
 }
 
 /**
@@ -827,15 +376,15 @@ PFS_rwlock* create_rwlock(PFS_rwlock_class *klass, const void *identity)
 */
 void destroy_rwlock(PFS_rwlock *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   PFS_rwlock_class *klass= pfs->m_class;
   /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME */
   klass->m_rwlock_stat.aggregate(& pfs->m_rwlock_stat);
   pfs->m_rwlock_stat.reset();
   if (klass->is_singleton())
     klass->m_singleton= NULL;
-  pfs->m_lock.allocated_to_free();
-  rwlock_full= false;
+
+  global_rwlock_container.deallocate(pfs);
 }
 
 /**
@@ -846,45 +395,23 @@ void destroy_rwlock(PFS_rwlock *pfs)
 */
 PFS_cond* create_cond(PFS_cond_class *klass, const void *identity)
 {
-  static uint PFS_ALIGNED cond_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_cond *pfs;
+  pfs_dirty_state dirty_state;
 
-  if (cond_full)
+  pfs= global_cond_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    cond_lost++;
-    return NULL;
-  }
-
-  while (++attempts <= cond_max)
-  {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& cond_monotonic_index, 1) % cond_max;
-    pfs= cond_array + index;
-
-    if (pfs->m_lock.is_free())
-    {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        pfs->m_identity= identity;
-        pfs->m_class= klass;
-        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-        pfs->m_timed= klass->m_timed;
-        pfs->m_cond_stat.m_signal_count= 0;
-        pfs->m_cond_stat.m_broadcast_count= 0;
-        pfs->m_wait_stat.reset();
-        pfs->m_lock.dirty_to_allocated();
-        if (klass->is_singleton())
-          klass->m_singleton= pfs;
-        return pfs;
-      }
-    }
+    pfs->m_identity= identity;
+    pfs->m_class= klass;
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+    pfs->m_cond_stat.reset();
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    if (klass->is_singleton())
+      klass->m_singleton= pfs;
   }
 
-  cond_lost++;
-  cond_full= true;
-  return NULL;
+  return pfs;
 }
 
 /**
@@ -893,21 +420,20 @@ PFS_cond* create_cond(PFS_cond_class *klass, const void *identity)
 */
 void destroy_cond(PFS_cond *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   PFS_cond_class *klass= pfs->m_class;
   /* Aggregate to EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME */
   klass->m_cond_stat.aggregate(& pfs->m_cond_stat);
-  pfs->m_wait_stat.reset();
+  pfs->m_cond_stat.reset();
   if (klass->is_singleton())
     klass->m_singleton= NULL;
-  pfs->m_lock.allocated_to_free();
-  cond_full= false;
+
+  global_cond_container.deallocate(pfs);
 }
 
 PFS_thread* PFS_thread::get_current_thread()
 {
-  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-  return pfs;
+  return static_cast<PFS_thread*>(my_get_thread_local(THR_PFS));
 }
 
 void PFS_thread::reset_session_connect_attrs()
@@ -923,6 +449,64 @@ void PFS_thread::reset_session_connect_attrs()
   }
 }
 
+void PFS_thread::set_history_derived_flags()
+{
+  if (m_history)
+  {
+    m_flag_events_waits_history= flag_events_waits_history;
+    m_flag_events_waits_history_long= flag_events_waits_history_long;
+    m_flag_events_stages_history= flag_events_stages_history;
+    m_flag_events_stages_history_long= flag_events_stages_history_long;
+    m_flag_events_statements_history= flag_events_statements_history;
+    m_flag_events_statements_history_long= flag_events_statements_history_long;
+    m_flag_events_transactions_history= flag_events_transactions_history;
+    m_flag_events_transactions_history_long= flag_events_transactions_history_long;
+  }
+  else
+  {
+    m_flag_events_waits_history= false;
+    m_flag_events_waits_history_long= false;
+    m_flag_events_stages_history= false;
+    m_flag_events_stages_history_long= false;
+    m_flag_events_statements_history= false;
+    m_flag_events_statements_history_long= false;
+    m_flag_events_transactions_history= false;
+    m_flag_events_transactions_history_long= false;
+  }
+}
+
+void PFS_thread::carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index)
+{
+  if (m_account != NULL)
+  {
+    m_account->carry_memory_stat_delta(delta, index);
+    return;
+  }
+
+  if (m_user != NULL)
+  {
+    m_user->carry_memory_stat_delta(delta, index);
+    /* do not return, need to process m_host below */
+  }
+
+  if (m_host != NULL)
+  {
+    m_host->carry_memory_stat_delta(delta, index);
+    return;
+  }
+
+  carry_global_memory_stat_delta(delta, index);
+}
+
+void carry_global_memory_stat_delta(PFS_memory_stat_delta *delta, uint index)
+{
+  PFS_memory_stat *stat;
+  PFS_memory_stat_delta delta_buffer;
+
+  stat= & global_instr_class_memory_array[index];
+  (void) stat->apply_delta(delta, &delta_buffer);
+}
+
 /**
   Create instrumentation for a thread instance.
   @param klass                        the thread class
@@ -935,158 +519,101 @@ void PFS_thread::reset_session_connect_attrs()
 PFS_thread* create_thread(PFS_thread_class *klass, const void *identity,
                           ulonglong processlist_id)
 {
-  static uint PFS_ALIGNED thread_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_thread *pfs;
+  pfs_dirty_state dirty_state;
+
+  pfs= global_thread_container.allocate(& dirty_state);
+  if (pfs != NULL)
+  {
+    pfs->m_thread_internal_id=
+      PFS_atomic::add_u64(&thread_internal_id_counter.m_u64, 1);
+    pfs->m_parent_thread_internal_id= 0;
+    pfs->m_processlist_id= static_cast<ulong>(processlist_id);
+    pfs->m_thread_os_id= my_thread_os_id();
+    pfs->m_event_id= 1;
+    pfs->m_stmt_lock.set_allocated();
+    pfs->m_session_lock.set_allocated();
+    pfs->set_enabled(true);
+    pfs->set_history(true);
+    pfs->m_class= klass;
+    pfs->m_events_waits_current= & pfs->m_events_waits_stack[WAIT_STACK_BOTTOM];
+    pfs->m_waits_history_full= false;
+    pfs->m_waits_history_index= 0;
+    pfs->m_stages_history_full= false;
+    pfs->m_stages_history_index= 0;
+    pfs->m_statements_history_full= false;
+    pfs->m_statements_history_index= 0;
+    pfs->m_transactions_history_full= false;
+    pfs->m_transactions_history_index= 0;
+
+    pfs->reset_stats();
+    pfs->reset_session_connect_attrs();
 
-  if (thread_full)
-  {
-    thread_lost++;
-    return NULL;
-  }
+    pfs->m_filename_hash_pins= NULL;
+    pfs->m_table_share_hash_pins= NULL;
+    pfs->m_setup_actor_hash_pins= NULL;
+    pfs->m_setup_object_hash_pins= NULL;
+    pfs->m_user_hash_pins= NULL;
+    pfs->m_account_hash_pins= NULL;
+    pfs->m_host_hash_pins= NULL;
+    pfs->m_digest_hash_pins= NULL;
+    pfs->m_program_hash_pins= NULL;
+
+    pfs->m_username_length= 0;
+    pfs->m_hostname_length= 0;
+    pfs->m_dbname_length= 0;
+    pfs->m_command= 0;
+    pfs->m_start_time= 0;
+    pfs->m_stage= 0;
+    pfs->m_stage_progress= NULL;
+    pfs->m_processlist_info[0]= '\0';
+    pfs->m_processlist_info_length= 0;
+    pfs->m_connection_type= VIO_CLOSED;
+
+    pfs->m_thd= NULL;
+    pfs->m_host= NULL;
+    pfs->m_user= NULL;
+    pfs->m_account= NULL;
+    set_thread_account(pfs);
 
-  while (++attempts <= thread_max)
-  {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& thread_monotonic_index, 1) % thread_max;
-    pfs= thread_array + index;
+    /*
+      For child waits, by default,
+      - NESTING_EVENT_ID is NULL
+      - NESTING_EVENT_TYPE is NULL
+    */
+    PFS_events_waits *child_wait= & pfs->m_events_waits_stack[0];
+    child_wait->m_event_id= 0;
 
-    if (pfs->m_lock.is_free())
-    {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        pfs->m_thread_internal_id=
-          PFS_atomic::add_u64(&thread_internal_id_counter, 1);
-        pfs->m_parent_thread_internal_id= 0;
-        pfs->m_processlist_id= (ulong)processlist_id;
-        pfs->m_event_id= 1;
-        pfs->m_stmt_lock.set_allocated();
-        pfs->m_session_lock.set_allocated();
-        pfs->m_enabled= true;
-        pfs->m_class= klass;
-        pfs->m_events_waits_current= & pfs->m_events_waits_stack[WAIT_STACK_BOTTOM];
-        pfs->m_waits_history_full= false;
-        pfs->m_waits_history_index= 0;
-        pfs->m_stages_history_full= false;
-        pfs->m_stages_history_index= 0;
-        pfs->m_statements_history_full= false;
-        pfs->m_statements_history_index= 0;
-
-        pfs->reset_stats();
-        pfs->reset_session_connect_attrs();
-
-        pfs->m_filename_hash_pins= NULL;
-        pfs->m_table_share_hash_pins= NULL;
-        pfs->m_setup_actor_hash_pins= NULL;
-        pfs->m_setup_object_hash_pins= NULL;
-        pfs->m_user_hash_pins= NULL;
-        pfs->m_account_hash_pins= NULL;
-        pfs->m_host_hash_pins= NULL;
-        pfs->m_digest_hash_pins= NULL;
-
-        pfs->m_username_length= 0;
-        pfs->m_hostname_length= 0;
-        pfs->m_dbname_length= 0;
-        pfs->m_command= 0;
-        pfs->m_start_time= 0;
-        pfs->m_stage= 0;
-        pfs->m_processlist_info[0]= '\0';
-        pfs->m_processlist_info_length= 0;
-
-        pfs->m_host= NULL;
-        pfs->m_user= NULL;
-        pfs->m_account= NULL;
-        set_thread_account(pfs);
-
-        PFS_events_waits *child_wait;
-        for (index= 0; index < WAIT_STACK_SIZE; index++)
-        {
-          child_wait= & pfs->m_events_waits_stack[index];
-          child_wait->m_thread_internal_id= pfs->m_thread_internal_id;
-          child_wait->m_event_id= 0;
-          child_wait->m_end_event_id= 0;
-          child_wait->m_event_type= EVENT_TYPE_STATEMENT;
-          child_wait->m_wait_class= NO_WAIT_CLASS;
-        }
-
-        PFS_events_stages *child_stage= & pfs->m_stage_current;
-        child_stage->m_thread_internal_id= pfs->m_thread_internal_id;
-        child_stage->m_event_id= 0;
-        child_stage->m_end_event_id= 0;
-        child_stage->m_event_type= EVENT_TYPE_STATEMENT;
-        child_stage->m_class= NULL;
-        child_stage->m_timer_start= 0;
-        child_stage->m_timer_end= 0;
-        child_stage->m_source_file= NULL;
-        child_stage->m_source_line= 0;
-
-        PFS_events_statements *child_statement;
-        for (index= 0; index < statement_stack_max; index++)
-        {
-          child_statement= & pfs->m_statement_stack[index];
-          child_statement->m_thread_internal_id= pfs->m_thread_internal_id;
-          child_statement->m_event_id= 0;
-          child_statement->m_end_event_id= 0;
-          child_statement->m_event_type= EVENT_TYPE_STATEMENT;
-          child_statement->m_class= NULL;
-          child_statement->m_timer_start= 0;
-          child_statement->m_timer_end= 0;
-          child_statement->m_lock_time= 0;
-          child_statement->m_source_file= NULL;
-          child_statement->m_source_line= 0;
-          child_statement->m_current_schema_name_length= 0;
-          child_statement->m_sqltext_length= 0;
-
-          child_statement->m_message_text[0]= '\0';
-          child_statement->m_sql_errno= 0;
-          child_statement->m_sqlstate[0]= '\0';
-          child_statement->m_error_count= 0;
-          child_statement->m_warning_count= 0;
-          child_statement->m_rows_affected= 0;
-
-          child_statement->m_rows_sent= 0;
-          child_statement->m_rows_examined= 0;
-          child_statement->m_created_tmp_disk_tables= 0;
-          child_statement->m_created_tmp_tables= 0;
-          child_statement->m_select_full_join= 0;
-          child_statement->m_select_full_range_join= 0;
-          child_statement->m_select_range= 0;
-          child_statement->m_select_range_check= 0;
-          child_statement->m_select_scan= 0;
-          child_statement->m_sort_merge_passes= 0;
-          child_statement->m_sort_range= 0;
-          child_statement->m_sort_rows= 0;
-          child_statement->m_sort_scan= 0;
-          child_statement->m_no_index_used= 0;
-          child_statement->m_no_good_index_used= 0;
-        }
-        pfs->m_events_statements_count= 0;
-
-        pfs->m_lock.dirty_to_allocated();
-        return pfs;
-      }
-    }
+    /*
+      For child stages, by default,
+      - NESTING_EVENT_ID is NULL
+      - NESTING_EVENT_TYPE is NULL
+    */
+    PFS_events_stages *child_stage= & pfs->m_stage_current;
+    child_stage->m_nesting_event_id= 0;
+
+    pfs->m_events_statements_count= 0;
+    pfs->m_transaction_current.m_event_id= 0;
+
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
   }
 
-  thread_lost++;
-  thread_full= true;
-  return NULL;
+  return pfs;
 }
 
 PFS_mutex *sanitize_mutex(PFS_mutex *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_mutex, mutex_array, mutex_max, unsafe);
+  return global_mutex_container.sanitize(unsafe);
 }
 
 PFS_rwlock *sanitize_rwlock(PFS_rwlock *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_rwlock, rwlock_array, rwlock_max, unsafe);
+  return global_rwlock_container.sanitize(unsafe);
 }
 
 PFS_cond *sanitize_cond(PFS_cond *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_cond, cond_array, cond_max, unsafe);
+  return global_cond_container.sanitize(unsafe);
 }
 
 /**
@@ -1100,17 +627,22 @@ PFS_cond *sanitize_cond(PFS_cond *unsafe)
 */
 PFS_thread *sanitize_thread(PFS_thread *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_thread, thread_array, thread_max, unsafe);
+  return global_thread_container.sanitize(unsafe);
 }
 
 PFS_file *sanitize_file(PFS_file *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_file, file_array, file_max, unsafe);
+  return global_file_container.sanitize(unsafe);
 }
 
 PFS_socket *sanitize_socket(PFS_socket *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_socket, socket_array, socket_max, unsafe);
+  return global_socket_container.sanitize(unsafe);
+}
+
+PFS_metadata_lock *sanitize_metadata_lock(PFS_metadata_lock *unsafe)
+{
+  return global_mdl_container.sanitize(unsafe);
 }
 
 /**
@@ -1119,14 +651,14 @@ PFS_socket *sanitize_socket(PFS_socket *unsafe)
 */
 void destroy_thread(PFS_thread *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   pfs->reset_session_connect_attrs();
   if (pfs->m_account != NULL)
   {
     pfs->m_account->release();
     pfs->m_account= NULL;
-    DBUG_ASSERT(pfs->m_user == NULL);
-    DBUG_ASSERT(pfs->m_host == NULL);
+    assert(pfs->m_user == NULL);
+    assert(pfs->m_host == NULL);
   }
   else
   {
@@ -1181,8 +713,12 @@ void destroy_thread(PFS_thread *pfs)
     lf_hash_put_pins(pfs->m_digest_hash_pins);
     pfs->m_digest_hash_pins= NULL;
   }
-  pfs->m_lock.allocated_to_free();
-  thread_full= false;
+  if (pfs->m_program_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_program_hash_pins);
+    pfs->m_program_hash_pins= NULL;
+  }
+  global_thread_container.deallocate(pfs);
 }
 
 /**
@@ -1216,12 +752,12 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
 {
   PFS_file *pfs;
 
-  DBUG_ASSERT(klass != NULL || ! create);
+  assert(klass != NULL || ! create);
 
   LF_PINS *pins= get_filename_hash_pins(thread);
   if (unlikely(pins == NULL))
   {
-    file_lost++;
+    global_file_container.m_lost++;
     return NULL;
   }
 
@@ -1272,6 +808,7 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
   char dirbuffer[FN_REFLEN];
   size_t dirlen;
   const char *normalized_filename;
+  uint normalized_length;
 
   dirlen= dirname_length(safe_filename);
   if (dirlen == 0)
@@ -1288,7 +825,7 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
 
   if (my_realpath(buffer, dirbuffer, MYF(0)) != 0)
   {
-    file_lost++;
+    global_file_container.m_lost++;
     return NULL;
   }
 
@@ -1302,14 +839,12 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
   *buf_end= '\0';
 
   normalized_filename= buffer;
-  uint normalized_length= static_cast<uint>(strlen(normalized_filename));
+  normalized_length= (uint)strlen(normalized_filename);
 
   PFS_file **entry;
   uint retry_count= 0;
   const uint retry_max= 3;
-  static uint PFS_ALIGNED file_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
+  pfs_dirty_state dirty_state;
 
 search:
 
@@ -1332,76 +867,210 @@ search:
     return NULL;
   }
 
-  if (file_full)
+  pfs= global_file_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    file_lost++;
-    return NULL;
-  }
+    pfs->m_class= klass;
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+    memcpy(pfs->m_filename, normalized_filename, normalized_length);
+    pfs->m_filename[normalized_length]= '\0';
+    pfs->m_filename_length= normalized_length;
+    pfs->m_file_stat.m_open_count= 1;
+    pfs->m_file_stat.m_io_stat.reset();
+    pfs->m_identity= (const void *)pfs;
+    pfs->m_temporary= false;
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&pfs_filename_hash, pins,
+                        &pfs);
+    if (likely(res == 0))
+    {
+      if (klass->is_singleton())
+        klass->m_singleton= pfs;
+      return pfs;
+    }
 
-  while (++attempts <= file_max)
-  {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& file_monotonic_index, 1) % file_max;
-    pfs= file_array + index;
+    global_file_container.deallocate(pfs);
 
-    if (pfs->m_lock.is_free())
+    if (res > 0)
     {
-      if (pfs->m_lock.free_to_dirty())
+      /* Duplicate insert by another thread */
+      if (++retry_count > retry_max)
       {
-        pfs->m_class= klass;
-        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-        pfs->m_timed= klass->m_timed;
-        strncpy(pfs->m_filename, normalized_filename, normalized_length + 1);
-        pfs->m_filename[normalized_length]= '\0';
-        pfs->m_filename_length= normalized_length;
-        pfs->m_file_stat.m_open_count= 1;
-        pfs->m_file_stat.m_io_stat.reset();
-        pfs->m_identity= (const void *)pfs;
-
-        int res;
-        res= lf_hash_insert(&pfs_filename_hash, thread->m_filename_hash_pins,
-                            &pfs);
-        if (likely(res == 0))
-        {
-          pfs->m_lock.dirty_to_allocated();
-          if (klass->is_singleton())
-            klass->m_singleton= pfs;
-          return pfs;
-        }
-
-        pfs->m_lock.dirty_to_free();
-
-        if (res > 0)
-        {
-          /* Duplicate insert by another thread */
-          if (++retry_count > retry_max)
-          {
-            /* Avoid infinite loops */
-            file_lost++;
-            return NULL;
-          }
-          goto search;
-        }
-
-        /* OOM in lf_hash_insert */
-        file_lost++;
+        /* Avoid infinite loops */
+        global_file_container.m_lost++;
         return NULL;
       }
+      goto search;
     }
+
+    /* OOM in lf_hash_insert */
+    global_file_container.m_lost++;
+    return NULL;
   }
 
-  file_lost++;
-  file_full= true;
   return NULL;
 }
 
 /**
+  Find a file instrumentation instance by name, and rename it
+  @param thread                       the executing instrumented thread
+  @param old_filename                 the file to be renamed
+  @param old_len                      the length in bytes of the old filename
+  @param new_filename                 the new file name
+  @param new_len                      the length in bytes of the new filename
+*/
+void find_and_rename_file(PFS_thread *thread, const char *old_filename,
+                          uint old_len, const char *new_filename, uint new_len)
+{
+  PFS_file *pfs;
+
+  assert(thread != NULL);
+
+  LF_PINS *pins= get_filename_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    global_file_container.m_lost++;
+    return;
+  }
+
+  /*
+    Normalize the old file name.
+  */
+  char safe_buffer[FN_REFLEN];
+  const char *safe_filename;
+
+  if (old_len >= FN_REFLEN)
+  {
+    memcpy(safe_buffer, old_filename, FN_REFLEN - 1);
+    safe_buffer[FN_REFLEN - 1]= 0;
+    safe_filename= safe_buffer;
+  }
+  else
+    safe_filename= old_filename;
+
+  char buffer[FN_REFLEN];
+  char dirbuffer[FN_REFLEN];
+  size_t dirlen;
+  const char *normalized_filename;
+  uint normalized_length;
+
+  dirlen= dirname_length(safe_filename);
+  if (dirlen == 0)
+  {
+    dirbuffer[0]= FN_CURLIB;
+    dirbuffer[1]= FN_LIBCHAR;
+    dirbuffer[2]= '\0';
+  }
+  else
+  {
+    memcpy(dirbuffer, safe_filename, dirlen);
+    dirbuffer[dirlen]= '\0';
+  }
+
+  if (my_realpath(buffer, dirbuffer, MYF(0)) != 0)
+  {
+    global_file_container.m_lost++;
+    return;
+  }
+
+  /* Append the unresolved file name to the resolved path */
+  char *ptr= buffer + strlen(buffer);
+  char *buf_end= &buffer[sizeof(buffer)-1];
+  if ((buf_end > ptr) && (*(ptr-1) != FN_LIBCHAR))
+    *ptr++= FN_LIBCHAR;
+  if (buf_end > ptr)
+    strncpy(ptr, safe_filename + dirlen, buf_end - ptr);
+  *buf_end= '\0';
+
+  normalized_filename= buffer;
+  normalized_length= (uint)strlen(normalized_filename);
+
+  PFS_file **entry;
+  entry= reinterpret_cast<PFS_file**>
+    (lf_hash_search(&pfs_filename_hash, pins,
+                    normalized_filename, normalized_length));
+
+  if (entry && (entry != MY_ERRPTR))
+    pfs= *entry;
+  else
+  {
+    lf_hash_search_unpin(pins);
+    return;
+  }
+
+  lf_hash_delete(&pfs_filename_hash, pins,
+                 pfs->m_filename, pfs->m_filename_length);
+
+  /*
+    Normalize the new file name.
+  */
+  if (new_len >= FN_REFLEN)
+  {
+    memcpy(safe_buffer, new_filename, FN_REFLEN - 1);
+    safe_buffer[FN_REFLEN - 1]= 0;
+    safe_filename= safe_buffer;
+  }
+  else
+    safe_filename= new_filename;
+
+  dirlen= dirname_length(safe_filename);
+  if (dirlen == 0)
+  {
+    dirbuffer[0]= FN_CURLIB;
+    dirbuffer[1]= FN_LIBCHAR;
+    dirbuffer[2]= '\0';
+  }
+  else
+  {
+    memcpy(dirbuffer, safe_filename, dirlen);
+    dirbuffer[dirlen]= '\0';
+  }
+
+  if (my_realpath(buffer, dirbuffer, MYF(0)) != 0)
+  {
+    global_file_container.m_lost++;
+    return;
+  }
+
+  /* Append the unresolved file name to the resolved path */
+  ptr= buffer + strlen(buffer);
+  buf_end= &buffer[sizeof(buffer)-1];
+  if ((buf_end > ptr) && (*(ptr-1) != FN_LIBCHAR))
+    *ptr++= FN_LIBCHAR;
+  if (buf_end > ptr)
+    strncpy(ptr, safe_filename + dirlen, buf_end - ptr);
+  *buf_end= '\0';
+
+  normalized_filename= buffer;
+  normalized_length= (uint)strlen(normalized_filename);
+
+  memcpy(pfs->m_filename, normalized_filename, normalized_length);
+  pfs->m_filename[normalized_length]= '\0';
+  pfs->m_filename_length= normalized_length;
+
+  int res;
+  res= lf_hash_insert(&pfs_filename_hash, pins, &pfs);
+
+  if (likely(res == 0))
+    return;
+  else
+  {
+    global_file_container.deallocate(pfs);
+    global_file_container.m_lost++;
+    return;
+  }
+}
+
+/**
   Release instrumentation for a file instance.
   @param pfs                          the file to release
 */
 void release_file(PFS_file *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   pfs->m_file_stat.m_open_count--;
 }
 
@@ -1412,8 +1081,8 @@ void release_file(PFS_file *pfs)
 */
 void destroy_file(PFS_thread *thread, PFS_file *pfs)
 {
-  DBUG_ASSERT(thread != NULL);
-  DBUG_ASSERT(pfs != NULL);
+  assert(thread != NULL);
+  assert(pfs != NULL);
   PFS_file_class *klass= pfs->m_class;
 
   /* Aggregate to FILE_SUMMARY_BY_EVENT_NAME */
@@ -1424,14 +1093,14 @@ void destroy_file(PFS_thread *thread, PFS_file *pfs)
     klass->m_singleton= NULL;
 
   LF_PINS *pins= get_filename_hash_pins(thread);
-  DBUG_ASSERT(pins != NULL);
+  assert(pins != NULL);
 
   lf_hash_delete(&pfs_filename_hash, pins,
                  pfs->m_filename, pfs->m_filename_length);
   if (klass->is_singleton())
     klass->m_singleton= NULL;
-  pfs->m_lock.allocated_to_free();
-  file_full= false;
+
+  global_file_container.deallocate(pfs);
 }
 
 /**
@@ -1444,49 +1113,32 @@ void destroy_file(PFS_thread *thread, PFS_file *pfs)
 PFS_table* create_table(PFS_table_share *share, PFS_thread *opening_thread,
                         const void *identity)
 {
-  static uint PFS_ALIGNED table_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_table *pfs;
-
-  if (table_full)
-  {
-    table_lost++;
-    return NULL;
-  }
-
-  while (++attempts <= table_max)
-  {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& table_monotonic_index, 1) % table_max;
-    pfs= table_array + index;
-
-    if (pfs->m_lock.is_free())
-    {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        pfs->m_identity= identity;
-        pfs->m_share= share;
-        pfs->m_io_enabled= share->m_enabled &&
-          flag_global_instrumentation && global_table_io_class.m_enabled;
-        pfs->m_io_timed= share->m_timed && global_table_io_class.m_timed;
-        pfs->m_lock_enabled= share->m_enabled &&
-          flag_global_instrumentation && global_table_lock_class.m_enabled;
-        pfs->m_lock_timed= share->m_timed && global_table_lock_class.m_timed;
-        pfs->m_has_io_stats= false;
-        pfs->m_has_lock_stats= false;
-        share->inc_refcount();
-        pfs->m_table_stat.fast_reset();
-        pfs->m_thread_owner= opening_thread;
-        pfs->m_lock.dirty_to_allocated();
-        return pfs;
-      }
-    }
+  pfs_dirty_state dirty_state;
+
+  pfs= global_table_container.allocate(& dirty_state);
+  if (pfs != NULL)
+  {
+    pfs->m_identity= identity;
+    pfs->m_share= share;
+    pfs->m_io_enabled= share->m_enabled &&
+      flag_global_instrumentation && global_table_io_class.m_enabled;
+    pfs->m_io_timed= share->m_timed && global_table_io_class.m_timed;
+    pfs->m_lock_enabled= share->m_enabled &&
+      flag_global_instrumentation && global_table_lock_class.m_enabled;
+    pfs->m_lock_timed= share->m_timed && global_table_lock_class.m_timed;
+    pfs->m_has_io_stats= false;
+    pfs->m_has_lock_stats= false;
+    pfs->m_internal_lock= PFS_TL_NONE;
+    pfs->m_external_lock= PFS_TL_NONE;
+    share->inc_refcount();
+    pfs->m_table_stat.fast_reset();
+    pfs->m_thread_owner= opening_thread;
+    pfs->m_owner_event_id= opening_thread->m_event_id;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
   }
 
-  table_lost++;
-  table_full= true;
-  return NULL;
+  return pfs;
 }
 
 void PFS_table::sanitized_aggregate(void)
@@ -1498,18 +1150,12 @@ void PFS_table::sanitized_aggregate(void)
   PFS_table_share *safe_share= sanitize_table_share(m_share);
   if (safe_share != NULL)
   {
-    if (m_has_io_stats && m_has_lock_stats)
-    {
-      safe_aggregate(& m_table_stat, safe_share);
-      m_has_io_stats= false;
-      m_has_lock_stats= false;
-    }
-    else if (m_has_io_stats)
+    if (m_has_io_stats)
     {
-      safe_aggregate_io(& m_table_stat, safe_share);
+      safe_aggregate_io(NULL, & m_table_stat, safe_share);
       m_has_io_stats= false;
     }
-    else if (m_has_lock_stats)
+    if (m_has_lock_stats)
     {
       safe_aggregate_lock(& m_table_stat, safe_share);
       m_has_lock_stats= false;
@@ -1522,7 +1168,7 @@ void PFS_table::sanitized_aggregate_io(void)
   PFS_table_share *safe_share= sanitize_table_share(m_share);
   if (safe_share != NULL && m_has_io_stats)
   {
-    safe_aggregate_io(& m_table_stat, safe_share);
+    safe_aggregate_io(NULL, & m_table_stat, safe_share);
     m_has_io_stats= false;
   }
 }
@@ -1537,40 +1183,87 @@ void PFS_table::sanitized_aggregate_lock(void)
   }
 }
 
-void PFS_table::safe_aggregate(PFS_table_stat *table_stat,
-                               PFS_table_share *table_share)
+void PFS_table::safe_aggregate_io(const TABLE_SHARE *optional_server_share,
+                                  PFS_table_stat *table_stat,
+                                  PFS_table_share *table_share)
 {
-  DBUG_ASSERT(table_stat != NULL);
-  DBUG_ASSERT(table_share != NULL);
+  assert(table_stat != NULL);
+  assert(table_share != NULL);
 
   uint key_count= sanitize_index_count(table_share->m_key_count);
 
-  /* Aggregate to TABLE_IO_SUMMARY, TABLE_LOCK_SUMMARY */
-  table_share->m_table_stat.aggregate(table_stat, key_count);
-  table_stat->fast_reset();
-}
+  PFS_table_share_index *to_stat;
+  PFS_table_io_stat *from_stat;
+  uint index;
 
-void PFS_table::safe_aggregate_io(PFS_table_stat *table_stat,
-                                  PFS_table_share *table_share)
-{
-  DBUG_ASSERT(table_stat != NULL);
-  DBUG_ASSERT(table_share != NULL);
+  assert(key_count <= MAX_INDEXES);
 
-  uint key_count= sanitize_index_count(table_share->m_key_count);
+  /* Aggregate stats for each index, if any */
+  for (index= 0; index < key_count; index++)
+  {
+    from_stat= & table_stat->m_index_stat[index];
+    if (from_stat->m_has_data)
+    {
+      if (optional_server_share != NULL)
+      {
+        /*
+          An instrumented thread is closing a table,
+          and capable of providing index names when
+          creating index statistics on the fly.
+        */
+        to_stat= table_share->find_or_create_index_stat(optional_server_share, index);
+      }
+      else
+      {
+        /*
+          A monitoring thread, performing TRUNCATE TABLE,
+          is asking to flush existing stats from table handles,
+          but it does not know about index names used in handles.
+          If the index stat already exists, find it and aggregate to it.
+          If the index stat does not exist yet, drop the stat and do nothing.
+        */
+        to_stat= table_share->find_index_stat(index);
+      }
+      if (to_stat != NULL)
+      {
+        /* Aggregate to TABLE_IO_SUMMARY */
+        to_stat->m_stat.aggregate(from_stat);
+      }
+    }
+  }
+
+  /* Aggregate stats for the table */
+  from_stat= & table_stat->m_index_stat[MAX_INDEXES];
+  if (from_stat->m_has_data)
+  {
+    to_stat= table_share->find_or_create_index_stat(NULL, MAX_INDEXES);
+    if (to_stat != NULL)
+    {
+      /* Aggregate to TABLE_IO_SUMMARY */
+      to_stat->m_stat.aggregate(from_stat);
+    }
+  }
 
-  /* Aggregate to TABLE_IO_SUMMARY */
-  table_share->m_table_stat.aggregate_io(table_stat, key_count);
   table_stat->fast_reset_io();
 }
 
 void PFS_table::safe_aggregate_lock(PFS_table_stat *table_stat,
                                     PFS_table_share *table_share)
 {
-  DBUG_ASSERT(table_stat != NULL);
-  DBUG_ASSERT(table_share != NULL);
+  assert(table_stat != NULL);
+  assert(table_share != NULL);
+
+  PFS_table_lock_stat *from_stat= & table_stat->m_lock_stat;
+
+  PFS_table_share_lock *to_stat;
+
+  to_stat= table_share->find_or_create_lock_stat();
+  if (to_stat != NULL)
+  {
+    /* Aggregate to TABLE_LOCK_SUMMARY */
+    to_stat->m_stat.aggregate(from_stat);
+  }
 
-  /* Aggregate to TABLE_LOCK_SUMMARY */
-  table_share->m_table_stat.aggregate_lock(table_stat);
   table_stat->fast_reset_lock();
 }
 
@@ -1580,31 +1273,24 @@ void PFS_table::safe_aggregate_lock(PFS_table_stat *table_stat,
 */
 void destroy_table(PFS_table *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   pfs->m_share->dec_refcount();
-  pfs->m_lock.allocated_to_free();
-  table_full= false;
+  global_table_container.deallocate(pfs);
 }
 
 /**
   Create instrumentation for a socket instance.
   @param klass                        the socket class
-  @param identity                     the socket descriptor
+  @param fd                           the socket file descriptor
+  @param addr                         the socket address
+  @param addr_len                     the socket address length
   @return a socket instance, or NULL
 */
 PFS_socket* create_socket(PFS_socket_class *klass, const my_socket *fd,
                           const struct sockaddr *addr, socklen_t addr_len)
 {
-  static uint PFS_ALIGNED socket_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_socket *pfs;
-
-  if (socket_full)
-  {
-    socket_lost++;
-    return NULL;
-  }
+  pfs_dirty_state dirty_state;
 
   uint fd_used= 0;
   uint addr_len_used= addr_len;
@@ -1615,48 +1301,38 @@ PFS_socket* create_socket(PFS_socket_class *klass, const my_socket *fd,
   if (addr_len_used > sizeof(sockaddr_storage))
     addr_len_used= sizeof(sockaddr_storage);
 
-  while (++attempts <= socket_max)
+  pfs= global_socket_container.allocate(& dirty_state);
+
+  if (pfs != NULL)
   {
-    index= PFS_atomic::add_u32(& socket_monotonic_index, 1) % socket_max;
-    pfs= socket_array + index;
+    pfs->m_fd= fd_used;
+    /* There is no socket object, so we use the instrumentation. */
+    pfs->m_identity= pfs;
+    pfs->m_class= klass;
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+    pfs->m_idle= false;
+    pfs->m_socket_stat.reset();
+    pfs->m_thread_owner= NULL;
 
-    if (pfs->m_lock.is_free())
+    pfs->m_addr_len= addr_len_used;
+    if ((addr != NULL) && (addr_len_used > 0))
     {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        pfs->m_fd= fd_used;
-        /* There is no socket object, so we use the instrumentation. */
-        pfs->m_identity= pfs;
-        pfs->m_class= klass;
-        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-        pfs->m_timed= klass->m_timed;
-        pfs->m_idle= false;
-        pfs->m_socket_stat.reset();
-        pfs->m_thread_owner= NULL;
-
-        pfs->m_addr_len= addr_len_used;
-        if ((addr != NULL) && (addr_len_used > 0))
-        {
-          pfs->m_addr_len= addr_len_used;
-          memcpy(&pfs->m_sock_addr, addr, addr_len_used);
-        }
-        else
-        {
-          pfs->m_addr_len= 0;
-        }
-
-        pfs->m_lock.dirty_to_allocated();
-
-        if (klass->is_singleton())
-          klass->m_singleton= pfs;
-        return pfs;
-      }
+      pfs->m_addr_len= addr_len_used;
+      memcpy(&pfs->m_sock_addr, addr, addr_len_used);
+    }
+    else
+    {
+      pfs->m_addr_len= 0;
     }
+
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+
+    if (klass->is_singleton())
+      klass->m_singleton= pfs;
   }
 
-  socket_lost++;
-  socket_full= true;
-  return NULL;
+  return pfs;
 }
 
 /**
@@ -1665,7 +1341,7 @@ PFS_socket* create_socket(PFS_socket_class *klass, const my_socket *fd,
 */
 void destroy_socket(PFS_socket *pfs)
 {
-  DBUG_ASSERT(pfs != NULL);
+  assert(pfs != NULL);
   PFS_socket_class *klass= pfs->m_class;
 
   /* Aggregate to SOCKET_SUMMARY_BY_EVENT_NAME */
@@ -1678,67 +1354,112 @@ void destroy_socket(PFS_socket *pfs)
   PFS_thread *thread= pfs->m_thread_owner;
   if (thread != NULL)
   {
-    PFS_single_stat *event_name_array;
-    event_name_array= thread->m_instr_class_waits_stats;
-    uint index= pfs->m_class->m_event_name_index;
-
     /* Combine stats for all operations */
     PFS_single_stat stat;
     pfs->m_socket_stat.m_io_stat.sum_waits(&stat);
-    event_name_array[index].aggregate(&stat);
+    if (stat.m_count != 0)
+    {
+      PFS_single_stat *event_name_array;
+      event_name_array= thread->write_instr_class_waits_stats();
+      uint index= pfs->m_class->m_event_name_index;
+
+      event_name_array[index].aggregate(&stat);
+    }
   }
 
   pfs->m_socket_stat.reset();
   pfs->m_thread_owner= NULL;
   pfs->m_fd= 0;
   pfs->m_addr_len= 0;
-  pfs->m_lock.allocated_to_free();
-  socket_full= false;
+
+  global_socket_container.deallocate(pfs);
+}
+
+PFS_metadata_lock* create_metadata_lock(void *identity,
+                                        const MDL_key *mdl_key,
+                                        opaque_mdl_type mdl_type,
+                                        opaque_mdl_duration mdl_duration,
+                                        opaque_mdl_status mdl_status,
+                                        const char *src_file,
+                                        uint src_line)
+{
+  PFS_metadata_lock *pfs;
+  pfs_dirty_state dirty_state;
+
+  pfs= global_mdl_container.allocate(& dirty_state);
+  if (pfs != NULL)
+  {
+    pfs->m_identity= identity;
+    pfs->m_enabled= global_metadata_class.m_enabled && flag_global_instrumentation;
+    pfs->m_timed= global_metadata_class.m_timed;
+    pfs->m_mdl_key.mdl_key_init(mdl_key);
+    pfs->m_mdl_type= mdl_type;
+    pfs->m_mdl_duration= mdl_duration;
+    pfs->m_mdl_status= mdl_status;
+    pfs->m_src_file= src_file;
+    pfs->m_src_line= src_line;
+    pfs->m_owner_thread_id= 0;
+    pfs->m_owner_event_id= 0;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+  }
+
+  return pfs;
+}
+
+void destroy_metadata_lock(PFS_metadata_lock *pfs)
+{
+  assert(pfs != NULL);
+  global_mdl_container.deallocate(pfs);
+}
+
+static void fct_reset_mutex_waits(PFS_mutex *pfs)
+{
+  pfs->m_mutex_stat.reset();
 }
 
 static void reset_mutex_waits_by_instance(void)
 {
-  PFS_mutex *pfs= mutex_array;
-  PFS_mutex *pfs_last= mutex_array + mutex_max;
+  global_mutex_container.apply_all(fct_reset_mutex_waits);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_mutex_stat.reset();
+static void fct_reset_rwlock_waits(PFS_rwlock *pfs)
+{
+  pfs->m_rwlock_stat.reset();
 }
 
 static void reset_rwlock_waits_by_instance(void)
 {
-  PFS_rwlock *pfs= rwlock_array;
-  PFS_rwlock *pfs_last= rwlock_array + rwlock_max;
+  global_rwlock_container.apply_all(fct_reset_rwlock_waits);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_rwlock_stat.reset();
+static void fct_reset_cond_waits(PFS_cond *pfs)
+{
+  pfs->m_cond_stat.reset();
 }
 
 static void reset_cond_waits_by_instance(void)
 {
-  PFS_cond *pfs= cond_array;
-  PFS_cond *pfs_last= cond_array + cond_max;
+  global_cond_container.apply_all(fct_reset_cond_waits);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_cond_stat.reset();
+static void fct_reset_file_waits(PFS_file *pfs)
+{
+  pfs->m_file_stat.reset();
 }
 
 static void reset_file_waits_by_instance(void)
 {
-  PFS_file *pfs= file_array;
-  PFS_file *pfs_last= file_array + file_max;
+  global_file_container.apply_all(fct_reset_file_waits);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_file_stat.reset();
+static void fct_reset_socket_waits(PFS_socket *pfs)
+{
+  pfs->m_socket_stat.reset();
 }
 
 static void reset_socket_waits_by_instance(void)
 {
-  PFS_socket *pfs= socket_array;
-  PFS_socket *pfs_last= socket_array + socket_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_socket_stat.reset();
+  global_socket_container.apply_all(fct_reset_socket_waits);
 }
 
 /** Reset the wait statistics per object instance. */
@@ -1751,24 +1472,26 @@ void reset_events_waits_by_instance(void)
   reset_socket_waits_by_instance();
 }
 
+static void fct_reset_file_io(PFS_file *pfs)
+{ /* Per-record callback used by reset_file_instance_io(). */
+  pfs->m_file_stat.m_io_stat.reset(); /* only the m_io_stat member is reset here */
+}
+
 /** Reset the io statistics per file instance. */
 void reset_file_instance_io(void)
 {
-  PFS_file *pfs= file_array;
-  PFS_file *pfs_last= file_array + file_max;
+  global_file_container.apply_all(fct_reset_file_io);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_file_stat.m_io_stat.reset();
+static void fct_reset_socket_io(PFS_socket *pfs)
+{ /* Per-record callback used by reset_socket_instance_io(). */
+  pfs->m_socket_stat.m_io_stat.reset(); /* only the m_io_stat member is reset here */
 }
 
 /** Reset the io statistics per socket instance. */
 void reset_socket_instance_io(void)
 {
-  PFS_socket *pfs= socket_array;
-  PFS_socket *pfs_last= socket_array + socket_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-    pfs->m_socket_stat.m_io_stat.reset();
+  global_socket_container.apply_all(fct_reset_socket_io);
 }
 
 void aggregate_all_event_names(PFS_single_stat *from_array,
@@ -1909,6 +1632,124 @@ void aggregate_all_statements(PFS_statement_stat *from_array,
   }
 }
 
+void aggregate_all_transactions(PFS_transaction_stat *from_array,
+                                PFS_transaction_stat *to_array)
+{ /* Add *from_array into *to_array, then reset the source. Both must be non-NULL. */
+  assert(from_array != NULL);
+  assert(to_array != NULL);
+
+  if (from_array->count() > 0) /* nothing is touched when the source count is zero */
+  {
+    to_array->aggregate(from_array); /* a single stat object is handled: no loop, no indexing */
+    from_array->reset(); /* the source is cleared once it has been added to the target */
+  }
+}
+
+void aggregate_all_transactions(PFS_transaction_stat *from_array,
+                                PFS_transaction_stat *to_array_1,
+                                PFS_transaction_stat *to_array_2)
+{ /* Add *from_array into both targets, then reset the source. All three must be non-NULL. */
+  assert(from_array != NULL);
+  assert(to_array_1 != NULL);
+  assert(to_array_2 != NULL);
+
+  if (from_array->count() > 0) /* nothing is touched when the source count is zero */
+  {
+    to_array_1->aggregate(from_array); /* each target receives the same source values */
+    to_array_2->aggregate(from_array);
+    from_array->reset(); /* cleared only after both targets have been updated */
+  }
+}
+
+void aggregate_all_memory(bool alive,
+                          PFS_memory_stat *from_array,
+                          PFS_memory_stat *to_array)
+{ /* Aggregate memory_class_max entries of from_array into the matching entries of to_array. */
+  PFS_memory_stat *from;
+  PFS_memory_stat *from_last;
+  PFS_memory_stat *to;
+
+  from= from_array;
+  from_last= from_array + memory_class_max; /* one past the last entry visited */
+  to= to_array; /* advanced in lockstep with from */
+
+  if (alive) /* NOTE(review): meaning of alive is set by callers; aggregate_thread() passes false */
+  {
+    for ( ; from < from_last ; from++, to++)
+    {
+      from->partial_aggregate_to(to); /* source entry is not reset on this path */
+    }
+  }
+  else
+  {
+    for ( ; from < from_last ; from++, to++)
+    {
+      from->full_aggregate_to(to);
+      from->reset(); /* source entry is cleared after the full aggregation */
+    }
+  }
+}
+
+void aggregate_all_memory(bool alive,
+                          PFS_memory_stat *from_array,
+                          PFS_memory_stat *to_array_1,
+                          PFS_memory_stat *to_array_2)
+{ /* Same walk as the single-target overload, feeding two target arrays at once. */
+  PFS_memory_stat *from;
+  PFS_memory_stat *from_last;
+  PFS_memory_stat *to_1;
+  PFS_memory_stat *to_2;
+
+  from= from_array;
+  from_last= from_array + memory_class_max; /* one past the last entry visited */
+  to_1= to_array_1; /* both targets advance in lockstep with from */
+  to_2= to_array_2;
+
+  if (alive) /* NOTE(review): meaning of alive is set by callers; aggregate_thread() passes false */
+  {
+    for ( ; from < from_last ; from++, to_1++, to_2++)
+    {
+      from->partial_aggregate_to(to_1, to_2); /* source entry is not reset on this path */
+    }
+  }
+  else
+  {
+    for ( ; from < from_last ; from++, to_1++, to_2++)
+    {
+      from->full_aggregate_to(to_1, to_2);
+      from->reset(); /* source entry is cleared after the full aggregation */
+    }
+  }
+}
+
+void aggregate_thread_status(PFS_thread *thread,
+                             PFS_account *safe_account,
+                             PFS_user *safe_user,
+                             PFS_host *safe_host)
+{ /* Push thread->m_thd->status_var into the account, or else into the user and/or host. */
+  THD *thd= thread->m_thd;
+
+  if (thd == NULL) /* no THD attached to this thread record: nothing to read */
+    return;
+
+  if (likely(safe_account != NULL))
+  {
+    safe_account->aggregate_status_stats(&thd->status_var);
+    return; /* with an account, user and host are not updated here */
+  }
+
+  if (safe_user != NULL) /* user and host are independent: either, both or neither may run */
+  {
+    safe_user->aggregate_status_stats(&thd->status_var);
+  }
+
+  if (safe_host != NULL)
+  {
+    safe_host->aggregate_status_stats(&thd->status_var);
+  }
+  return;
+}
+
 void aggregate_thread_stats(PFS_thread *thread,
                             PFS_account *safe_account,
                             PFS_user *safe_user,
@@ -1917,14 +1758,17 @@ void aggregate_thread_stats(PFS_thread *thread,
   if (likely(safe_account != NULL))
   {
     safe_account->m_disconnected_count++;
-    return;
   }
 
   if (safe_user != NULL)
+  {
     safe_user->m_disconnected_count++;
+  }
 
   if (safe_host != NULL)
+  {
     safe_host->m_disconnected_count++;
+  }
 
   /* There is no global table for connections statistics. */
   return;
@@ -1935,9 +1779,28 @@ void aggregate_thread(PFS_thread *thread,
                       PFS_user *safe_user,
                       PFS_host *safe_host)
 {
+  /* No HAVE_PSI_???_INTERFACE flag, waits cover multiple instrumentations */
   aggregate_thread_waits(thread, safe_account, safe_user, safe_host);
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
   aggregate_thread_stages(thread, safe_account, safe_user, safe_host);
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
   aggregate_thread_statements(thread, safe_account, safe_user, safe_host);
+#endif
+
+#ifdef HAVE_PSI_TRANSACTION_INTERFACE
+  aggregate_thread_transactions(thread, safe_account, safe_user, safe_host);
+#endif
+
+#ifdef HAVE_PSI_MEMORY_INTERFACE
+  aggregate_thread_memory(false, thread, safe_account, safe_user, safe_host);
+#endif
+
+  if (!show_compatibility_56)
+    aggregate_thread_status(thread, safe_account, safe_user, safe_host);
+
   aggregate_thread_stats(thread, safe_account, safe_user, safe_host);
 }
 
@@ -1946,14 +1809,17 @@ void aggregate_thread_waits(PFS_thread *thread,
                             PFS_user *safe_user,
                             PFS_host *safe_host)
 {
+  if (thread->read_instr_class_waits_stats() == NULL)
+    return;
+
   if (likely(safe_account != NULL))
   {
     /*
       Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
     */
-    aggregate_all_event_names(thread->m_instr_class_waits_stats,
-                              safe_account->m_instr_class_waits_stats);
+    aggregate_all_event_names(thread->write_instr_class_waits_stats(),
+                              safe_account->write_instr_class_waits_stats());
 
     return;
   }
@@ -1966,9 +1832,9 @@ void aggregate_thread_waits(PFS_thread *thread,
       -  EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_event_names(thread->m_instr_class_waits_stats,
-                              safe_user->m_instr_class_waits_stats,
-                              safe_host->m_instr_class_waits_stats);
+    aggregate_all_event_names(thread->write_instr_class_waits_stats(),
+                              safe_user->write_instr_class_waits_stats(),
+                              safe_host->write_instr_class_waits_stats());
     return;
   }
 
@@ -1978,8 +1844,8 @@ void aggregate_thread_waits(PFS_thread *thread,
       Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME, directly.
     */
-    aggregate_all_event_names(thread->m_instr_class_waits_stats,
-                              safe_user->m_instr_class_waits_stats);
+    aggregate_all_event_names(thread->write_instr_class_waits_stats(),
+                              safe_user->write_instr_class_waits_stats());
     return;
   }
 
@@ -1989,8 +1855,8 @@ void aggregate_thread_waits(PFS_thread *thread,
       Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
     */
-    aggregate_all_event_names(thread->m_instr_class_waits_stats,
-                              safe_host->m_instr_class_waits_stats);
+    aggregate_all_event_names(thread->write_instr_class_waits_stats(),
+                              safe_host->write_instr_class_waits_stats());
     return;
   }
 
@@ -2003,14 +1869,17 @@ void aggregate_thread_stages(PFS_thread *thread,
                              PFS_user *safe_user,
                              PFS_host *safe_host)
 {
+  if (thread->read_instr_class_stages_stats() == NULL)
+    return;
+
   if (likely(safe_account != NULL))
   {
     /*
       Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
     */
-    aggregate_all_stages(thread->m_instr_class_stages_stats,
-                         safe_account->m_instr_class_stages_stats);
+    aggregate_all_stages(thread->write_instr_class_stages_stats(),
+                         safe_account->write_instr_class_stages_stats());
 
     return;
   }
@@ -2023,9 +1892,9 @@ void aggregate_thread_stages(PFS_thread *thread,
       -  EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_stages(thread->m_instr_class_stages_stats,
-                         safe_user->m_instr_class_stages_stats,
-                         safe_host->m_instr_class_stages_stats);
+    aggregate_all_stages(thread->write_instr_class_stages_stats(),
+                         safe_user->write_instr_class_stages_stats(),
+                         safe_host->write_instr_class_stages_stats());
     return;
   }
 
@@ -2037,8 +1906,8 @@ void aggregate_thread_stages(PFS_thread *thread,
       -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_stages(thread->m_instr_class_stages_stats,
-                         safe_user->m_instr_class_stages_stats,
+    aggregate_all_stages(thread->write_instr_class_stages_stats(),
+                         safe_user->write_instr_class_stages_stats(),
                          global_instr_class_stages_array);
     return;
   }
@@ -2049,8 +1918,8 @@ void aggregate_thread_stages(PFS_thread *thread,
       Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
     */
-    aggregate_all_stages(thread->m_instr_class_stages_stats,
-                         safe_host->m_instr_class_stages_stats);
+    aggregate_all_stages(thread->write_instr_class_stages_stats(),
+                         safe_host->write_instr_class_stages_stats());
     return;
   }
 
@@ -2058,7 +1927,7 @@ void aggregate_thread_stages(PFS_thread *thread,
     Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
     to EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME.
   */
-  aggregate_all_stages(thread->m_instr_class_stages_stats,
+  aggregate_all_stages(thread->write_instr_class_stages_stats(),
                        global_instr_class_stages_array);
 }
 
@@ -2067,14 +1936,17 @@ void aggregate_thread_statements(PFS_thread *thread,
                                  PFS_user *safe_user,
                                  PFS_host *safe_host)
 {
+  if (thread->read_instr_class_statements_stats() == NULL)
+    return;
+
   if (likely(safe_account != NULL))
   {
     /*
       Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
     */
-    aggregate_all_statements(thread->m_instr_class_statements_stats,
-                             safe_account->m_instr_class_statements_stats);
+    aggregate_all_statements(thread->write_instr_class_statements_stats(),
+                             safe_account->write_instr_class_statements_stats());
 
     return;
   }
@@ -2087,9 +1959,9 @@ void aggregate_thread_statements(PFS_thread *thread,
       -  EVENTS_STATEMENT_SUMMARY_BY_HOST_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_statements(thread->m_instr_class_statements_stats,
-                             safe_user->m_instr_class_statements_stats,
-                             safe_host->m_instr_class_statements_stats);
+    aggregate_all_statements(thread->write_instr_class_statements_stats(),
+                             safe_user->write_instr_class_statements_stats(),
+                             safe_host->write_instr_class_statements_stats());
     return;
   }
 
@@ -2101,8 +1973,8 @@ void aggregate_thread_statements(PFS_thread *thread,
       -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
       in parallel.
     */
-    aggregate_all_statements(thread->m_instr_class_statements_stats,
-                             safe_user->m_instr_class_statements_stats,
+    aggregate_all_statements(thread->write_instr_class_statements_stats(),
+                             safe_user->write_instr_class_statements_stats(),
                              global_instr_class_statements_array);
     return;
   }
@@ -2113,8 +1985,8 @@ void aggregate_thread_statements(PFS_thread *thread,
       Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
       to EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
     */
-    aggregate_all_statements(thread->m_instr_class_statements_stats,
-                             safe_host->m_instr_class_statements_stats);
+    aggregate_all_statements(thread->write_instr_class_statements_stats(),
+                             safe_host->write_instr_class_statements_stats());
     return;
   }
 
@@ -2122,10 +1994,149 @@ void aggregate_thread_statements(PFS_thread *thread,
     Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
     to EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME.
   */
-  aggregate_all_statements(thread->m_instr_class_statements_stats,
+  aggregate_all_statements(thread->write_instr_class_statements_stats(),
                            global_instr_class_statements_array);
 }
 
+void aggregate_thread_transactions(PFS_thread *thread,
+                                   PFS_account *safe_account,
+                                   PFS_user *safe_user,
+                                   PFS_host *safe_host)
+{ /* Move the thread's transaction statistics to the first applicable parent(s) below. */
+  if (thread->read_instr_class_transactions_stats() == NULL)
+    return; /* accessor returned NULL: nothing to aggregate */
+
+  if (likely(safe_account != NULL))
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+    */
+    aggregate_all_transactions(thread->write_instr_class_transactions_stats(),
+                               safe_account->write_instr_class_transactions_stats());
+
+    return; /* account case takes priority; user / host / global are not touched here */
+  }
+
+  if ((safe_user != NULL) && (safe_host != NULL))
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_transactions(thread->write_instr_class_transactions_stats(),
+                               safe_user->write_instr_class_transactions_stats(),
+                               safe_host->write_instr_class_transactions_stats());
+    return;
+  }
+
+  if (safe_user != NULL) /* reached only when safe_host is NULL */
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_transactions(thread->write_instr_class_transactions_stats(),
+                               safe_user->write_instr_class_transactions_stats(),
+                               &global_transaction_stat);
+    return;
+  }
+
+  if (safe_host != NULL) /* reached only when safe_user is NULL */
+  {
+    /*
+      Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
+    */
+    aggregate_all_transactions(thread->write_instr_class_transactions_stats(),
+                               safe_host->write_instr_class_transactions_stats());
+    return;
+  }
+
+  /*
+    Aggregate EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+    to EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME.
+  */
+  aggregate_all_transactions(thread->write_instr_class_transactions_stats(),
+                             &global_transaction_stat); /* no account, user or host was supplied */
+}
+
+void aggregate_thread_memory(bool alive, PFS_thread *thread,
+                             PFS_account *safe_account,
+                             PFS_user *safe_user,
+                             PFS_host *safe_host)
+{ /* Aggregate the thread's memory statistics to the first applicable parent(s) below. */
+  if (thread->read_instr_class_memory_stats() == NULL)
+    return; /* accessor returned NULL: nothing to aggregate */
+
+  if (likely(safe_account != NULL))
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+    */
+    aggregate_all_memory(alive, /* alive is forwarded unchanged on every path */
+                         thread->write_instr_class_memory_stats(),
+                         safe_account->write_instr_class_memory_stats());
+
+    return; /* account case takes priority; user / host / global are not touched here */
+  }
+
+  if ((safe_user != NULL) && (safe_host != NULL))
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_memory(alive,
+                         thread->write_instr_class_memory_stats(),
+                         safe_user->write_instr_class_memory_stats(),
+                         safe_host->write_instr_class_memory_stats());
+    return;
+  }
+
+  if (safe_user != NULL) /* reached only when safe_host is NULL */
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_memory(alive,
+                         thread->write_instr_class_memory_stats(),
+                         safe_user->write_instr_class_memory_stats(),
+                         global_instr_class_memory_array);
+    return;
+  }
+
+  if (safe_host != NULL) /* reached only when safe_user is NULL */
+  {
+    /*
+      Aggregate MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
+    */
+    aggregate_all_memory(alive,
+                         thread->write_instr_class_memory_stats(),
+                         safe_host->write_instr_class_memory_stats());
+    return;
+  }
+
+  /*
+    Aggregate MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME
+    to MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME.
+  */
+  aggregate_all_memory(alive,
+                       thread->write_instr_class_memory_stats(),
+                       global_instr_class_memory_array); /* no account, user or host was supplied */
+}
+
 void clear_thread_account(PFS_thread *thread)
 {
   if (thread->m_account != NULL)
@@ -2149,9 +2160,9 @@ void clear_thread_account(PFS_thread *thread)
 
 void set_thread_account(PFS_thread *thread)
 {
-  DBUG_ASSERT(thread->m_account == NULL);
-  DBUG_ASSERT(thread->m_user == NULL);
-  DBUG_ASSERT(thread->m_host == NULL);
+  assert(thread->m_account == NULL);
+  assert(thread->m_user == NULL);
+  assert(thread->m_host == NULL);
 
   thread->m_account= find_or_create_account(thread,
                                                 thread->m_username,
@@ -2170,142 +2181,151 @@ void set_thread_account(PFS_thread *thread)
                                         thread->m_hostname_length);
 }
 
+static void fct_update_mutex_derived_flags(PFS_mutex *pfs)
+{ /* Per-record callback: recompute m_enabled / m_timed of one mutex from its class. */
+  PFS_mutex_class *klass= sanitize_mutex_class(pfs->m_class);
+  if (likely(klass != NULL))
+  {
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation; /* class flag AND global flag */
+    pfs->m_timed= klass->m_timed; /* timed is copied from the class as-is */
+  }
+  else
+  {
+    pfs->m_enabled= false; /* sanitize_mutex_class() returned NULL: switch the instance off */
+    pfs->m_timed= false;
+  }
+}
+
 void update_mutex_derived_flags()
 {
-  PFS_mutex *pfs= mutex_array;
-  PFS_mutex *pfs_last= mutex_array + mutex_max;
-  PFS_mutex_class *klass;
+  global_mutex_container.apply_all(fct_update_mutex_derived_flags);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+static void fct_update_rwlock_derived_flags(PFS_rwlock *pfs)
+{
+  PFS_rwlock_class *klass= sanitize_rwlock_class(pfs->m_class);
+  if (likely(klass != NULL))
   {
-    klass= sanitize_mutex_class(pfs->m_class);
-    if (likely(klass != NULL))
-    {
-      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-      pfs->m_timed= klass->m_timed;
-    }
-    else
-    {
-      pfs->m_enabled= false;
-      pfs->m_timed= false;
-    }
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+  }
+  else
+  {
+    pfs->m_enabled= false;
+    pfs->m_timed= false;
   }
 }
 
 void update_rwlock_derived_flags()
 {
-  PFS_rwlock *pfs= rwlock_array;
-  PFS_rwlock *pfs_last= rwlock_array + rwlock_max;
-  PFS_rwlock_class *klass;
+  global_rwlock_container.apply_all(fct_update_rwlock_derived_flags);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+static void fct_update_cond_derived_flags(PFS_cond *pfs)
+{
+  PFS_cond_class *klass= sanitize_cond_class(pfs->m_class);
+  if (likely(klass != NULL))
   {
-    klass= sanitize_rwlock_class(pfs->m_class);
-    if (likely(klass != NULL))
-    {
-      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-      pfs->m_timed= klass->m_timed;
-    }
-    else
-    {
-      pfs->m_enabled= false;
-      pfs->m_timed= false;
-    }
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+  }
+  else
+  {
+    pfs->m_enabled= false;
+    pfs->m_timed= false;
   }
 }
 
 void update_cond_derived_flags()
 {
-  PFS_cond *pfs= cond_array;
-  PFS_cond *pfs_last= cond_array + cond_max;
-  PFS_cond_class *klass;
+  global_cond_container.apply_all(fct_update_cond_derived_flags);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+static void fct_update_file_derived_flags(PFS_file *pfs)
+{
+  PFS_file_class *klass= sanitize_file_class(pfs->m_class);
+  if (likely(klass != NULL))
   {
-    klass= sanitize_cond_class(pfs->m_class);
-    if (likely(klass != NULL))
-    {
-      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-      pfs->m_timed= klass->m_timed;
-    }
-    else
-    {
-      pfs->m_enabled= false;
-      pfs->m_timed= false;
-    }
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+  }
+  else
+  {
+    pfs->m_enabled= false;
+    pfs->m_timed= false;
   }
 }
 
 void update_file_derived_flags()
 {
-  PFS_file *pfs= file_array;
-  PFS_file *pfs_last= file_array + file_max;
-  PFS_file_class *klass;
+  global_file_container.apply_all(fct_update_file_derived_flags);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+void fct_update_table_derived_flags(PFS_table *pfs)
+{
+  PFS_table_share *share= sanitize_table_share(pfs->m_share);
+  if (likely(share != NULL))
   {
-    klass= sanitize_file_class(pfs->m_class);
-    if (likely(klass != NULL))
-    {
-      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-      pfs->m_timed= klass->m_timed;
-    }
-    else
-    {
-      pfs->m_enabled= false;
-      pfs->m_timed= false;
-    }
+    pfs->m_io_enabled= share->m_enabled &&
+      flag_global_instrumentation && global_table_io_class.m_enabled;
+    pfs->m_io_timed= share->m_timed && global_table_io_class.m_timed;
+    pfs->m_lock_enabled= share->m_enabled &&
+      flag_global_instrumentation && global_table_lock_class.m_enabled;
+    pfs->m_lock_timed= share->m_timed && global_table_lock_class.m_timed;
+  }
+  else
+  {
+    pfs->m_io_enabled= false;
+    pfs->m_io_timed= false;
+    pfs->m_lock_enabled= false;
+    pfs->m_lock_timed= false;
   }
 }
 
 void update_table_derived_flags()
 {
-  PFS_table *pfs= table_array;
-  PFS_table *pfs_last= table_array + table_max;
-  PFS_table_share *share;
+  global_table_container.apply_all(fct_update_table_derived_flags);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+static void fct_update_socket_derived_flags(PFS_socket *pfs)
+{
+  PFS_socket_class *klass= sanitize_socket_class(pfs->m_class);
+  if (likely(klass != NULL))
   {
-    share= sanitize_table_share(pfs->m_share);
-    if (likely(share != NULL))
-    {
-      pfs->m_io_enabled= share->m_enabled &&
-        flag_global_instrumentation && global_table_io_class.m_enabled;
-      pfs->m_io_timed= share->m_timed && global_table_io_class.m_timed;
-      pfs->m_lock_enabled= share->m_enabled &&
-        flag_global_instrumentation && global_table_lock_class.m_enabled;
-      pfs->m_lock_timed= share->m_timed && global_table_lock_class.m_timed;
-    }
-    else
-    {
-      pfs->m_io_enabled= false;
-      pfs->m_io_timed= false;
-      pfs->m_lock_enabled= false;
-      pfs->m_lock_timed= false;
-    }
+    pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+    pfs->m_timed= klass->m_timed;
+  }
+  else
+  {
+    pfs->m_enabled= false;
+    pfs->m_timed= false;
   }
 }
 
 void update_socket_derived_flags()
 {
-  PFS_socket *pfs= socket_array;
-  PFS_socket *pfs_last= socket_array + socket_max;
-  PFS_socket_class *klass;
+  global_socket_container.apply_all(fct_update_socket_derived_flags);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    klass= sanitize_socket_class(pfs->m_class);
-    if (likely(klass != NULL))
-    {
-      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
-      pfs->m_timed= klass->m_timed;
-    }
-    else
-    {
-      pfs->m_enabled= false;
-      pfs->m_timed= false;
-    }
-  }
+static void fct_update_metadata_derived_flags(PFS_metadata_lock *pfs)
+{ /* Per-record callback: derive the flags from the single global_metadata_class. */
+  pfs->m_enabled= global_metadata_class.m_enabled && flag_global_instrumentation;
+  pfs->m_timed= global_metadata_class.m_timed; /* timed is copied from the class as-is */
+}
+
+void update_metadata_derived_flags()
+{ /* Refresh m_enabled / m_timed on the metadata lock records of global_mdl_container. */
+  global_mdl_container.apply_all(fct_update_metadata_derived_flags);
+}
+
+static void fct_update_thread_derived_flags(PFS_thread *pfs)
+{ /* Per-record callback used by update_thread_derived_flags(). */
+  pfs->set_history_derived_flags(); /* all the work is delegated to the thread record */
+}
+
+void update_thread_derived_flags()
+{ /* Run fct_update_thread_derived_flags over global_thread_container. */
+  global_thread_container.apply(fct_update_thread_derived_flags); /* NOTE(review): apply(), not apply_all() as in the sibling updaters -- confirm intended */
 }
 
 void update_instruments_derived_flags()
@@ -2316,7 +2336,8 @@ void update_instruments_derived_flags()
   update_file_derived_flags();
   update_table_derived_flags();
   update_socket_derived_flags();
-  /* nothing for stages and statements (no instances) */
+  update_metadata_derived_flags();
+  /* nothing for stages, statements and transactions (no instances) */
 }
 
 /** @} */
diff --git a/storage/perfschema/pfs_instr.h b/storage/perfschema/pfs_instr.h
index a5ff3b4a17d..e09cc7a1cf0 100644
--- a/storage/perfschema/pfs_instr.h
+++ b/storage/perfschema/pfs_instr.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -35,11 +35,15 @@ struct PFS_file_class;
 struct PFS_table_share;
 struct PFS_thread_class;
 struct PFS_socket_class;
+class PFS_opaque_container_page;
+
+class THD;
 
 #include "my_global.h"
-#ifdef __WIN__
+#ifdef _WIN32
 #include <winsock2.h>
-#else
+#endif
+#ifdef HAVE_ARPA_INET_H
 #include <arpa/inet.h>
 #endif
 #include "my_compiler.h"
@@ -49,9 +53,17 @@ struct PFS_socket_class;
 #include "pfs_events_waits.h"
 #include "pfs_events_stages.h"
 #include "pfs_events_statements.h"
+#include "pfs_events_transactions.h"
 #include "pfs_server.h"
 #include "lf.h"
 #include "pfs_con_slice.h"
+#include "pfs_column_types.h"
+#include "mdl.h"
+#include "violite.h" /* enum_vio_type */
+
+extern PFS_single_stat *thread_instr_class_waits_array_start;
+extern PFS_single_stat *thread_instr_class_waits_array_end;
+extern my_bool show_compatibility_56;
 
 /**
   @addtogroup Performance_schema_buffers
@@ -72,6 +84,8 @@ struct PFS_instr
   bool m_enabled;
   /** Timed flag. */
   bool m_timed;
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
 };
 
 /** Instrumented mutex implementation. @see PSI_mutex. */
@@ -124,8 +138,6 @@ struct PFS_ALIGNED PFS_cond : public PFS_instr
   const void *m_identity;
   /** Condition class. */
   PFS_cond_class *m_class;
-  /** Instrument wait statistics. */
-  PFS_single_stat m_wait_stat;
   /** Condition instance usage statistics. */
   PFS_cond_stat m_cond_stat;
 };
@@ -146,6 +158,8 @@ struct PFS_ALIGNED PFS_file : public PFS_instr
   PFS_file_class *m_class;
   /** File usage statistics. */
   PFS_file_stat m_file_stat;
+  /** True if a temporary file. */
+  bool m_temporary;
 };
 
 /** Instrumented table implementation. @see PSI_table. */
@@ -184,20 +198,14 @@ public:
     Only use this method for handles owned by the calling code.
     @sa sanitized_aggregate.
   */
-  void aggregate(void)
+  void aggregate(const TABLE_SHARE *server_share)
   {
-    if (m_has_io_stats && m_has_lock_stats)
-    {
-      safe_aggregate(& m_table_stat, m_share);
-      m_has_io_stats= false;
-      m_has_lock_stats= false;
-    }
-    else if (m_has_io_stats)
+    if (m_has_io_stats)
     {
-      safe_aggregate_io(& m_table_stat, m_share);
+      safe_aggregate_io(server_share, & m_table_stat, m_share);
       m_has_io_stats= false;
     }
-    else if (m_has_lock_stats)
+    if (m_has_lock_stats)
     {
       safe_aggregate_lock(& m_table_stat, m_share);
       m_has_lock_stats= false;
@@ -227,19 +235,26 @@ public:
 
   /** Internal lock. */
   pfs_lock m_lock;
-  /** Owner. */
+  /** Thread Owner. */
   PFS_thread *m_thread_owner;
+  /** Event Owner. */
+  ulonglong m_owner_event_id;
   /** Table share. */
   PFS_table_share *m_share;
   /** Table identity, typically a handler. */
   const void *m_identity;
   /** Table statistics. */
   PFS_table_stat m_table_stat;
+  /** Current internal lock. */
+  PFS_TL_LOCK_TYPE m_internal_lock;
+  /** Current external lock. */
+  PFS_TL_LOCK_TYPE m_external_lock;
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
 
 private:
-  static void safe_aggregate(PFS_table_stat *stat,
-                             PFS_table_share *safe_share);
-  static void safe_aggregate_io(PFS_table_stat *stat,
+  static void safe_aggregate_io(const TABLE_SHARE *optional_server_share,
+                                PFS_table_stat *stat,
                                 PFS_table_share *safe_share);
   static void safe_aggregate_lock(PFS_table_stat *stat,
                                   PFS_table_share *safe_share);
@@ -269,6 +284,24 @@ struct PFS_ALIGNED PFS_socket : public PFS_instr
   PFS_socket_stat m_socket_stat;
 };
 
+/** Instrumented metadata lock implementation. @see PSI_metadata_lock. */
+struct PFS_ALIGNED PFS_metadata_lock : public PFS_instr
+{
+  uint32 get_version() /* forwards to m_lock.get_version() */
+  { return m_lock.get_version(); } /* m_lock is not declared in this struct; presumably inherited -- confirm */
+
+  /** Lock identity. */
+  const void *m_identity;
+  MDL_key m_mdl_key; /* MDL_key / opaque_mdl_* types are declared outside this view */
+  opaque_mdl_type m_mdl_type; /* presumably the lock type -- TODO confirm */
+  opaque_mdl_duration m_mdl_duration; /* presumably the lock duration -- TODO confirm */
+  opaque_mdl_status m_mdl_status; /* presumably the lock status -- TODO confirm */
+  const char *m_src_file; /* presumably source file of the instrumented call -- TODO confirm */
+  uint m_src_line; /* presumably source line of the instrumented call -- TODO confirm */
+  ulonglong m_owner_thread_id; /* presumably the id of the owning thread -- TODO confirm */
+  ulonglong m_owner_event_id; /* presumably the id of the owning event -- TODO confirm */
+};
+
 /**
   @def WAIT_STACK_LOGICAL_SIZE
   Maximum number of nested waits.
@@ -293,7 +326,7 @@ struct PFS_ALIGNED PFS_socket : public PFS_instr
 /**
   @def WAIT_STACK_BOTTOM
   Maximum number dummy waits records.
-  One dummy record is reserved for the parent stage / statement,
+  One dummy record is reserved for the parent stage / statement / transaction,
   at the bottom of the wait stack.
 */
 #define WAIT_STACK_BOTTOM 1
@@ -307,75 +340,84 @@ struct PFS_ALIGNED PFS_socket : public PFS_instr
 extern uint statement_stack_max;
 /** Max size of the digests token array. */
 extern size_t pfs_max_digest_length;
+/** Max size of SQL TEXT. */
+extern size_t pfs_max_sqltext;
 
-/**
-  @def PFS_MAX_ALLOC_RETRY
-  Maximum number of times the code attempts to allocate an item
-  from internal buffers, before giving up.
-*/
-#define PFS_MAX_ALLOC_RETRY 1000
+/** Instrumented thread implementation. @see PSI_thread. */
+struct PFS_ALIGNED PFS_thread : PFS_connection_slice
+{
+  static PFS_thread* get_current_thread(void);
 
-/** The maximun number of passes in @sa PFS_scan. */
-#define PFS_MAX_SCAN_PASS 2
+  /** Thread instrumentation flag. */
+  bool m_enabled;
+  /** Thread history instrumentation flag. */
+  bool m_history;
 
-/**
-  Helper to scan circular buffers.
-  Given a buffer of size [0, max_size - 1],
-  and a random starting point in the buffer,
-  this helper returns up to two [first, last -1] intervals that:
-  - fit into the [0, max_size - 1] range,
-  - have a maximum combined length of at most PFS_MAX_ALLOC_RETRY.
-*/
-struct PFS_scan
-{
-public:
   /**
-    Initialize a new scan.
-    @param random a random index to start from
-    @param max_size the max size of the interval to scan
+    Derived flag flag_events_waits_history, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_WAITS_HISTORY].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
   */
-  void init(uint random, uint max_size);
-
+  bool m_flag_events_waits_history;
   /**
-    Predicate, has a next pass.
-    @return true if there is a next pass to perform.
+    Derived flag flag_events_waits_history_long, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_WAITS_HISTORY_LONG].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
   */
-  bool has_pass() const
-  { return (m_pass < m_pass_max); }
-
+  bool m_flag_events_waits_history_long;
   /**
-    Iterator, proceed to the next pass.
+    Derived flag flag_events_stages_history, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_STAGES_HISTORY].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
   */
-  void next_pass()
-  { m_pass++; }
-  
-  /** First index for this pass. */
-  uint first() const
-  { return m_first[m_pass]; }
-
-  /** Last index for this pass. */
-  uint last() const
-  { return m_last[m_pass]; }
-
-private:
-  /** Current pass. */
-  uint m_pass;
-  /** Maximum number of passes. */
-  uint m_pass_max;
-  /** First element for each pass. */
-  uint m_first[PFS_MAX_SCAN_PASS];
-  /** Last element for each pass. */
-  uint m_last[PFS_MAX_SCAN_PASS];
-};
-
-
-/** Instrumented thread implementation. @see PSI_thread. */
-struct PFS_ALIGNED PFS_thread : PFS_connection_slice
-{
-  static PFS_thread* get_current_thread(void);
+  bool m_flag_events_stages_history;
+  /**
+    Derived flag flag_events_stages_history_long, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_STAGES_HISTORY_LONG].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
+  */
+  bool m_flag_events_stages_history_long;
+  /**
+    Derived flag flag_events_statements_history, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_STATEMENTS_HISTORY].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
+  */
+  bool m_flag_events_statements_history;
+  /**
+    Derived flag flag_events_statements_history_long, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_STATEMENTS_HISTORY_LONG].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
+  */
+  bool m_flag_events_statements_history_long;
+  /**
+    Derived flag flag_events_transactions_history, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_TRANSACTIONS_HISTORY].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
+  */
+  bool m_flag_events_transactions_history;
+  /**
+    Derived flag flag_events_transactions_history_long, per thread.
+    Cached computation of
+      TABLE SETUP_CONSUMERS[EVENTS_TRANSACTIONS_HISTORY_LONG].ENABLED == 'YES'
+    AND
+      TABLE THREADS[THREAD_ID].HISTORY == 'YES'
+  */
+  bool m_flag_events_transactions_history_long;
 
-  /** Thread instrumentation flag. */
-  bool m_enabled;
   /** Current wait event in the event stack. */
   PFS_events_waits *m_events_waits_current;
   /** Event ID counter */
@@ -404,18 +446,22 @@ struct PFS_ALIGNED PFS_thread : PFS_connection_slice
   LF_PINS *m_account_hash_pins;
   /** Pins for digest_hash. */
   LF_PINS *m_digest_hash_pins;
+  /** Pins for program_hash. */
+  LF_PINS *m_program_hash_pins;
   /** Internal thread identifier, unique. */
   ulonglong m_thread_internal_id;
   /** Parent internal thread identifier. */
   ulonglong m_parent_thread_internal_id;
   /** External (SHOW PROCESSLIST) thread identifier, not unique. */
   ulong m_processlist_id;
+  /** External (Operating system) thread identifier, if any. */
+  uint32 m_thread_os_id;
   /** Thread class. */
   PFS_thread_class *m_class;
   /**
     Stack of events waits.
     This member holds the data for the table PERFORMANCE_SCHEMA.EVENTS_WAITS_CURRENT.
-    Note that stack[0] is a dummy record that represents the parent stage/statement.
+    Note that stack[0] is a dummy record that represents the parent stage/statement/transaction.
     For example, assuming the following tree:
     - STAGE ID 100
       - WAIT ID 101, parent STAGE 100
@@ -475,12 +521,24 @@ struct PFS_ALIGNED PFS_thread : PFS_connection_slice
   */
   PFS_events_statements *m_statements_history;
 
+  /** True if the circular buffer @c m_transactions_history is full. */
+  bool m_transactions_history_full;
+  /** Current index in the circular buffer @c m_transactions_history. */
+  uint m_transactions_history_index;
+  /**
+    Transactions history circular buffer.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_HISTORY.
+  */
+  PFS_events_transactions *m_transactions_history;
+
   /**
     Internal lock, for session attributes.
     Statement attributes are expected to be updated in frequently,
     typically per session execution.
   */
   pfs_lock m_session_lock;
+
   /**
     User name.
     Protected by @c m_session_lock.
@@ -513,6 +571,8 @@ struct PFS_ALIGNED PFS_thread : PFS_connection_slice
   uint m_dbname_length;
   /** Current command. */
   int m_command;
+  /** Connection type. */
+  enum_vio_type m_connection_type;
   /** Start time. */
   time_t m_start_time;
   /**
@@ -523,6 +583,8 @@ struct PFS_ALIGNED PFS_thread : PFS_connection_slice
   pfs_lock m_stmt_lock;
   /** Processlist state (derived from stage). */
   PFS_stage_key m_stage;
+  /** Current stage progress. */
+  PSI_stage_progress* m_stage_progress;
   /**
     Processlist info.
     Protected by @c m_stmt_lock.
@@ -540,6 +602,9 @@ struct PFS_ALIGNED PFS_thread : PFS_connection_slice
   uint m_events_statements_count;
   PFS_events_statements *m_statement_stack;
 
+  PFS_events_transactions m_transaction_current;
+
+  THD *m_thd;
   PFS_host *m_host;
   PFS_user *m_user;
   PFS_account *m_account;
@@ -562,10 +627,28 @@ struct PFS_ALIGNED PFS_thread : PFS_connection_slice
     Protected by @c m_session_lock.
   */
   uint m_session_connect_attrs_cs_number;
+
+  void carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index);
+
+  void set_enabled(bool enabled)
+  {
+    m_enabled= enabled;
+  }
+
+  void set_history(bool history)
+  {
+    m_history= history;
+    set_history_derived_flags();
+  }
+
+  void set_history_derived_flags();
 };
 
+void carry_global_memory_stat_delta(PFS_memory_stat_delta *delta, uint index);
+
 extern PFS_stage_stat *global_instr_class_stages_array;
 extern PFS_statement_stat *global_instr_class_statements_array;
+extern PFS_memory_stat *global_instr_class_memory_array;
 
 PFS_mutex *sanitize_mutex(PFS_mutex *unsafe);
 PFS_rwlock *sanitize_rwlock(PFS_rwlock *unsafe);
@@ -573,10 +656,11 @@ PFS_cond *sanitize_cond(PFS_cond *unsafe);
 PFS_thread *sanitize_thread(PFS_thread *unsafe);
 PFS_file *sanitize_file(PFS_file *unsafe);
 PFS_socket *sanitize_socket(PFS_socket *unsafe);
+PFS_metadata_lock *sanitize_metadata_lock(PFS_metadata_lock *unsafe);
 
 int init_instruments(const PFS_global_param *param);
 void cleanup_instruments();
-int init_file_hash();
+int init_file_hash(const PFS_global_param *param);
 void cleanup_file_hash();
 PFS_mutex* create_mutex(PFS_mutex_class *mutex_class, const void *identity);
 void destroy_mutex(PFS_mutex *pfs);
@@ -592,6 +676,9 @@ void destroy_thread(PFS_thread *pfs);
 
 PFS_file* find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
                               const char *filename, uint len, bool create);
+void find_and_rename_file(PFS_thread *thread, const char *old_filename,
+                          uint old_len, const char *new_filename,
+                          uint new_len);
 
 void release_file(PFS_file *pfs);
 void destroy_file(PFS_thread *thread, PFS_file *pfs);
@@ -605,27 +692,23 @@ PFS_socket* create_socket(PFS_socket_class *socket_class,
                           socklen_t addr_len);
 void destroy_socket(PFS_socket *pfs);
 
+PFS_metadata_lock* create_metadata_lock(void *identity,
+                                        const MDL_key *mdl_key,
+                                        opaque_mdl_type mdl_type,
+                                        opaque_mdl_duration mdl_duration,
+                                        opaque_mdl_status mdl_status,
+                                        const char *src_file,
+                                        uint src_line);
+void destroy_metadata_lock(PFS_metadata_lock *pfs);
+
 /* For iterators and show status. */
 
-extern ulong mutex_max;
-extern ulong mutex_lost;
-extern ulong rwlock_max;
-extern ulong rwlock_lost;
-extern ulong cond_max;
-extern ulong cond_lost;
-extern ulong thread_max;
-extern ulong thread_lost;
-extern ulong file_max;
-extern ulong file_lost;
 extern long file_handle_max;
 extern ulong file_handle_lost;
-extern ulong table_max;
-extern ulong table_lost;
-extern ulong socket_max;
-extern ulong socket_lost;
 extern ulong events_waits_history_per_thread;
 extern ulong events_stages_history_per_thread;
 extern ulong events_statements_history_per_thread;
+extern ulong events_transactions_history_per_thread;
 extern ulong locker_lost;
 extern ulong statement_lost;
 extern ulong session_connect_attrs_lost;
@@ -633,14 +716,7 @@ extern ulong session_connect_attrs_size_per_thread;
 
 /* Exposing the data directly, for iterators. */
 
-extern PFS_mutex *mutex_array;
-extern PFS_rwlock *rwlock_array;
-extern PFS_cond *cond_array;
-extern PFS_thread *thread_array;
-extern PFS_file *file_array;
 extern PFS_file **file_handle_array;
-extern PFS_table *table_array;
-extern PFS_socket *socket_array;
 
 void reset_events_waits_by_instance();
 void reset_file_instance_io();
@@ -664,6 +740,20 @@ void aggregate_all_statements(PFS_statement_stat *from_array,
                               PFS_statement_stat *to_array_1,
                               PFS_statement_stat *to_array_2);
 
+void aggregate_all_transactions(PFS_transaction_stat *from_array,
+                                PFS_transaction_stat *to_array);
+void aggregate_all_transactions(PFS_transaction_stat *from_array,
+                                PFS_transaction_stat *to_array_1,
+                                PFS_transaction_stat *to_array_2);
+
+void aggregate_all_memory(bool alive,
+                          PFS_memory_stat *from_array,
+                          PFS_memory_stat *to_array);
+void aggregate_all_memory(bool alive,
+                          PFS_memory_stat *from_array,
+                          PFS_memory_stat *to_array_1,
+                          PFS_memory_stat *to_array_2);
+
 void aggregate_thread(PFS_thread *thread,
                       PFS_account *safe_account,
                       PFS_user *safe_user,
@@ -680,6 +770,21 @@ void aggregate_thread_statements(PFS_thread *thread,
                                  PFS_account *safe_account,
                                  PFS_user *safe_user,
                                  PFS_host *safe_host);
+void aggregate_thread_transactions(PFS_thread *thread,
+                                   PFS_account *safe_account,
+                                   PFS_user *safe_user,
+                                   PFS_host *safe_host);
+
+void aggregate_thread_memory(bool alive, PFS_thread *thread,
+                             PFS_account *safe_account,
+                             PFS_user *safe_user,
+                             PFS_host *safe_host);
+
+void aggregate_thread_status(PFS_thread *thread,
+                             PFS_account *safe_account,
+                             PFS_user *safe_user,
+                             PFS_host *safe_host);
+
 void clear_thread_account(PFS_thread *thread);
 void set_thread_account(PFS_thread *thread);
 
@@ -695,6 +800,10 @@ void update_file_derived_flags();
 void update_table_derived_flags();
 /** Update derived flags for all socket instances. */
 void update_socket_derived_flags();
+/** Update derived flags for all metadata lock instances. */
+void update_metadata_derived_flags();
+/** Update derived flags for all thread instances. */
+void update_thread_derived_flags();
 /** Update derived flags for all instruments. */
 void update_instruments_derived_flags();
 
diff --git a/storage/perfschema/pfs_instr_class.cc b/storage/perfschema/pfs_instr_class.cc
index 31f100b92f1..971ee7ca00f 100644
--- a/storage/perfschema/pfs_instr_class.cc
+++ b/storage/perfschema/pfs_instr_class.cc
@@ -1,5 +1,5 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
-  Copyright (c) 2020, MariaDB Corporation.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
+   Copyright (c) 2020, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,15 +28,17 @@
 
 #include "my_global.h"
 #include "my_sys.h"
-#include "structs.h"
 #include "table.h"
 #include "pfs_instr_class.h"
+#include "pfs_builtin_memory.h"
 #include "pfs_instr.h"
 #include "pfs_global.h"
 #include "pfs_timer.h"
 #include "pfs_events_waits.h"
 #include "pfs_setup_object.h"
 #include "pfs_atomic.h"
+#include "pfs_program.h"
+#include "pfs_buffer_container.h"
 #include "mysql/psi/mysql_thread.h"
 #include "lf.h"
 
@@ -49,11 +51,16 @@
 */
 
 /**
-  PFS_INSTRUMENT option settings array and associated state variable to
-  serialize access during shutdown.
+  Global performance schema flag.
+  Indicate if the performance schema is enabled.
+  This flag is set at startup, and never changes.
+*/
+my_bool pfs_enabled= TRUE;
+
+/**
+  PFS_INSTRUMENT option settings array
  */
-DYNAMIC_ARRAY pfs_instr_config_array;
-int pfs_instr_config_state= PFS_INSTR_CONFIG_NOT_INITIALIZED;
+Pfs_instr_config_array *pfs_instr_config_array= NULL;
 
 static void configure_instr_class(PFS_instr_class *entry);
 
@@ -69,12 +76,12 @@ static void init_instr_class(PFS_instr_class *klass,
   - the performance schema initialization
   - a plugin initialization
 */
-static volatile uint32 mutex_class_dirty_count= 0;
-static volatile uint32 mutex_class_allocated_count= 0;
-static volatile uint32 rwlock_class_dirty_count= 0;
-static volatile uint32 rwlock_class_allocated_count= 0;
-static volatile uint32 cond_class_dirty_count= 0;
-static volatile uint32 cond_class_allocated_count= 0;
+static uint32 mutex_class_dirty_count= 0;
+static uint32 mutex_class_allocated_count= 0;
+static uint32 rwlock_class_dirty_count= 0;
+static uint32 rwlock_class_allocated_count= 0;
+static uint32 cond_class_dirty_count= 0;
+static uint32 cond_class_allocated_count= 0;
 
 /** Size of the mutex class array. @sa mutex_class_array */
 ulong mutex_class_max= 0;
@@ -104,14 +111,21 @@ ulong stage_class_lost= 0;
 ulong statement_class_max= 0;
 /** Number of statement class lost. @sa statement_class_array */
 ulong statement_class_lost= 0;
-/** Size of the table share array. @sa table_share_array */
-ulong table_share_max= 0;
-/** Number of table share lost. @sa table_share_array */
-ulong table_share_lost= 0;
 /** Size of the socket class array. @sa socket_class_array */
 ulong socket_class_max= 0;
 /** Number of socket class lost. @sa socket_class_array */
 ulong socket_class_lost= 0;
+/** Size of the memory class array. @sa memory_class_array */
+ulong memory_class_max= 0;
+/** Number of memory class lost. @sa memory_class_array */
+ulong memory_class_lost= 0;
+
+/**
+  Number of transaction classes. Although there is only one transaction class,
+  this is used for sizing by other event classes.
+  @sa global_transaction_class
+*/
+ulong transaction_class_max= 0;
 
 PFS_mutex_class *mutex_class_array= NULL;
 PFS_rwlock_class *rwlock_class_array= NULL;
@@ -123,47 +137,45 @@ PFS_cond_class *cond_class_array= NULL;
   - the performance schema initialization
   - a plugin initialization
 */
-static volatile uint32 thread_class_dirty_count= 0;
-static volatile uint32 thread_class_allocated_count= 0;
+static uint32 thread_class_dirty_count= 0;
+static uint32 thread_class_allocated_count= 0;
 
 static PFS_thread_class *thread_class_array= NULL;
 
-/**
-  Table instance array.
-  @sa table_share_max
-  @sa table_share_lost
-  @sa table_share_hash
-*/
-PFS_table_share *table_share_array= NULL;
-
 PFS_ALIGNED PFS_single_stat global_idle_stat;
 PFS_ALIGNED PFS_table_io_stat global_table_io_stat;
 PFS_ALIGNED PFS_table_lock_stat global_table_lock_stat;
+PFS_ALIGNED PFS_single_stat global_metadata_stat;
+PFS_ALIGNED PFS_transaction_stat global_transaction_stat;
 PFS_ALIGNED PFS_instr_class global_table_io_class;
 PFS_ALIGNED PFS_instr_class global_table_lock_class;
 PFS_ALIGNED PFS_instr_class global_idle_class;
+PFS_ALIGNED PFS_instr_class global_metadata_class;
+PFS_ALIGNED PFS_transaction_class global_transaction_class;
 
 /** Class-timer map */
 enum_timer_name *class_timers[] =
-{&wait_timer,      /* PFS_CLASS_NONE */
- &wait_timer,      /* PFS_CLASS_MUTEX */
- &wait_timer,      /* PFS_CLASS_RWLOCK */
- &wait_timer,      /* PFS_CLASS_COND */
- &wait_timer,      /* PFS_CLASS_FILE */
- &wait_timer,      /* PFS_CLASS_TABLE */
- &stage_timer,     /* PFS_CLASS_STAGE */
- &statement_timer, /* PFS_CLASS_STATEMENT */
- &wait_timer,      /* PFS_CLASS_SOCKET */
- &wait_timer,      /* PFS_CLASS_TABLE_IO */
- &wait_timer,      /* PFS_CLASS_TABLE_LOCK */
- &idle_timer       /* PFS_CLASS_IDLE */
+{&wait_timer,        /* PFS_CLASS_NONE */
+ &wait_timer,        /* PFS_CLASS_MUTEX */
+ &wait_timer,        /* PFS_CLASS_RWLOCK */
+ &wait_timer,        /* PFS_CLASS_COND */
+ &wait_timer,        /* PFS_CLASS_FILE */
+ &wait_timer,        /* PFS_CLASS_TABLE */
+ &stage_timer,       /* PFS_CLASS_STAGE */
+ &statement_timer,   /* PFS_CLASS_STATEMENT */
+ &transaction_timer, /* PFS_CLASS_TRANSACTION */
+ &wait_timer,        /* PFS_CLASS_SOCKET */
+ &wait_timer,        /* PFS_CLASS_TABLE_IO */
+ &wait_timer,        /* PFS_CLASS_TABLE_LOCK */
+ &idle_timer,        /* PFS_CLASS_IDLE */
+ &wait_timer,        /* PFS_CLASS_METADATA */
+ &wait_timer         /* PFS_CLASS_MEMORY */
 };
 
 /**
   Hash index for instrumented table shares.
   This index is searched by table fully qualified name (@c PFS_table_share_key),
   and points to instrumented table shares (@c PFS_table_share).
-  @sa table_share_array
   @sa PFS_table_share_key
   @sa PFS_table_share
   @sa table_share_hash_get_key
@@ -173,26 +185,31 @@ LF_HASH table_share_hash;
 /** True if table_share_hash is initialized. */
 static bool table_share_hash_inited= false;
 
-static volatile uint32 file_class_dirty_count= 0;
-static volatile uint32 file_class_allocated_count= 0;
+static uint32 file_class_dirty_count= 0;
+static uint32 file_class_allocated_count= 0;
 
 PFS_file_class *file_class_array= NULL;
 
-static volatile uint32 stage_class_dirty_count= 0;
-static volatile uint32 stage_class_allocated_count= 0;
+static uint32 stage_class_dirty_count= 0;
+static uint32 stage_class_allocated_count= 0;
 
 static PFS_stage_class *stage_class_array= NULL;
 
-static volatile uint32 statement_class_dirty_count= 0;
-static volatile uint32 statement_class_allocated_count= 0;
+static uint32 statement_class_dirty_count= 0;
+static uint32 statement_class_allocated_count= 0;
 
 static PFS_statement_class *statement_class_array= NULL;
 
-static volatile uint32 socket_class_dirty_count= 0;
-static volatile uint32 socket_class_allocated_count= 0;
+static uint32 socket_class_dirty_count= 0;
+static uint32 socket_class_allocated_count= 0;
 
 static PFS_socket_class *socket_class_array= NULL;
 
+static uint32 memory_class_dirty_count= 0;
+static uint32 memory_class_allocated_count= 0;
+
+static PFS_memory_class *memory_class_array= NULL;
+
 uint mutex_class_start= 0;
 uint rwlock_class_start= 0;
 uint cond_class_start= 0;
@@ -202,7 +219,8 @@ uint socket_class_start= 0;
 
 void init_event_name_sizing(const PFS_global_param *param)
 {
-  mutex_class_start= 3; /* global table io, table lock, idle */
+  /* global table io, table lock, idle, metadata */
+  mutex_class_start= COUNT_GLOBAL_EVENT_INDEX;
   rwlock_class_start= mutex_class_start + param->m_mutex_class_sizing;
   cond_class_start= rwlock_class_start + param->m_rwlock_class_sizing;
   file_class_start= cond_class_start + param->m_cond_class_sizing;
@@ -223,12 +241,29 @@ void register_global_classes()
                    0, PFS_CLASS_TABLE_LOCK);
   global_table_lock_class.m_event_name_index= GLOBAL_TABLE_LOCK_EVENT_INDEX;
   configure_instr_class(&global_table_lock_class);
-  
+
   /* Idle class */
   init_instr_class(&global_idle_class, "idle", 4,
                    0, PFS_CLASS_IDLE);
   global_idle_class.m_event_name_index= GLOBAL_IDLE_EVENT_INDEX;
   configure_instr_class(&global_idle_class);
+
+  /* Metadata class */
+  init_instr_class(&global_metadata_class, "wait/lock/metadata/sql/mdl", 26,
+                   0, PFS_CLASS_METADATA);
+  global_metadata_class.m_event_name_index= GLOBAL_METADATA_EVENT_INDEX;
+  global_metadata_class.m_enabled= false; /* Disabled by default */
+  global_metadata_class.m_timed= false;
+  configure_instr_class(&global_metadata_class);
+
+  /* Transaction class */
+  init_instr_class(&global_transaction_class, "transaction", 11,
+                   0, PFS_CLASS_TRANSACTION);
+  global_transaction_class.m_event_name_index= GLOBAL_TRANSACTION_INDEX;
+  global_transaction_class.m_enabled= false; /* Disabled by default */
+  global_transaction_class.m_timed= false;
+  configure_instr_class(&global_transaction_class);
+  transaction_class_max= 1; /* used for sizing by other event classes */
 }
 
 /**
@@ -256,24 +291,30 @@ int init_sync_class(uint mutex_class_sizing,
 
   if (mutex_class_max > 0)
   {
-    mutex_class_array= PFS_MALLOC_ARRAY(mutex_class_max, sizeof(PFS_mutex_class),
-                                        PFS_mutex_class, MYF(MY_ZEROFILL));
+    mutex_class_array= PFS_MALLOC_ARRAY(& builtin_memory_mutex_class,
+                                        mutex_class_max,
+                                        sizeof(PFS_mutex_class), PFS_mutex_class,
+                                        MYF(MY_ZEROFILL));
     if (unlikely(mutex_class_array == NULL))
       return 1;
   }
 
   if (rwlock_class_max > 0)
   {
-    rwlock_class_array= PFS_MALLOC_ARRAY(rwlock_class_max, sizeof(PFS_rwlock_class),
-                                         PFS_rwlock_class, MYF(MY_ZEROFILL));
+    rwlock_class_array= PFS_MALLOC_ARRAY(& builtin_memory_rwlock_class,
+                                         rwlock_class_max,
+                                         sizeof(PFS_rwlock_class), PFS_rwlock_class,
+                                         MYF(MY_ZEROFILL));
     if (unlikely(rwlock_class_array == NULL))
       return 1;
   }
 
   if (cond_class_max > 0)
   {
-    cond_class_array= PFS_MALLOC_ARRAY(cond_class_max, sizeof(PFS_cond_class),
-                                       PFS_cond_class, MYF(MY_ZEROFILL));
+    cond_class_array= PFS_MALLOC_ARRAY(& builtin_memory_cond_class,
+                                       cond_class_max,
+                                       sizeof(PFS_cond_class), PFS_cond_class,
+                                       MYF(MY_ZEROFILL));
     if (unlikely(cond_class_array == NULL))
       return 1;
   }
@@ -284,13 +325,21 @@ int init_sync_class(uint mutex_class_sizing,
 /** Cleanup the instrument synch class buffers. */
 void cleanup_sync_class(void)
 {
-  pfs_free(mutex_class_array);
+  PFS_FREE_ARRAY(& builtin_memory_mutex_class,
+                 mutex_class_max, sizeof(PFS_mutex_class),
+                 mutex_class_array);
   mutex_class_array= NULL;
   mutex_class_dirty_count= mutex_class_allocated_count= mutex_class_max= 0;
-  pfs_free(rwlock_class_array);
+
+  PFS_FREE_ARRAY(& builtin_memory_rwlock_class,
+                 rwlock_class_max, sizeof(PFS_rwlock_class),
+                 rwlock_class_array);
   rwlock_class_array= NULL;
   rwlock_class_dirty_count= rwlock_class_allocated_count= rwlock_class_max= 0;
-  pfs_free(cond_class_array);
+
+  PFS_FREE_ARRAY(& builtin_memory_cond_class,
+                 cond_class_max, sizeof(PFS_cond_class),
+                 cond_class_array);
   cond_class_array= NULL;
   cond_class_dirty_count= cond_class_allocated_count= cond_class_max= 0;
 }
@@ -309,8 +358,10 @@ int init_thread_class(uint thread_class_sizing)
 
   if (thread_class_max > 0)
   {
-    thread_class_array= PFS_MALLOC_ARRAY(thread_class_max, sizeof(PFS_thread_class),
-                                         PFS_thread_class, MYF(MY_ZEROFILL));
+    thread_class_array= PFS_MALLOC_ARRAY(& builtin_memory_thread_class,
+                                         thread_class_max,
+                                         sizeof(PFS_thread_class), PFS_thread_class,
+                                         MYF(MY_ZEROFILL));
     if (unlikely(thread_class_array == NULL))
       result= 1;
   }
@@ -323,7 +374,9 @@ int init_thread_class(uint thread_class_sizing)
 /** Cleanup the thread class buffers. */
 void cleanup_thread_class(void)
 {
-  pfs_free(thread_class_array);
+  PFS_FREE_ARRAY(& builtin_memory_thread_class,
+                 thread_class_max, sizeof(PFS_thread_class),
+                 thread_class_array);
   thread_class_array= NULL;
   thread_class_dirty_count= thread_class_allocated_count= 0;
   thread_class_max= 0;
@@ -336,29 +389,16 @@ void cleanup_thread_class(void)
 */
 int init_table_share(uint table_share_sizing)
 {
-  int result= 0;
-  table_share_max= table_share_sizing;
-  table_share_lost= 0;
-
-  if (table_share_max > 0)
-  {
-    table_share_array= PFS_MALLOC_ARRAY(table_share_max, sizeof(PFS_table_share),
-                                        PFS_table_share, MYF(MY_ZEROFILL));
-    if (unlikely(table_share_array == NULL))
-      result= 1;
-  }
-  else
-    table_share_array= NULL;
+  if (global_table_share_container.init(table_share_sizing))
+    return 1;
 
-  return result;
+  return 0;
 }
 
 /** Cleanup the table share buffers. */
 void cleanup_table_share(void)
 {
-  pfs_free(table_share_array);
-  table_share_array= NULL;
-  table_share_max= 0;
+  global_table_share_container.cleanup();
 }
 
 C_MODE_START
@@ -370,9 +410,9 @@ static uchar *table_share_hash_get_key(const uchar *entry, size_t *length,
   const PFS_table_share *share;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_table_share* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   share= *typed_entry;
-  DBUG_ASSERT(share != NULL);
+  assert(share != NULL);
   *length= share->m_key.m_key_length;
   result= &share->m_key.m_hash_key[0];
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -380,13 +420,12 @@ static uchar *table_share_hash_get_key(const uchar *entry, size_t *length,
 C_MODE_END
 
 /** Initialize the table share hash table. */
-int init_table_share_hash(void)
+int init_table_share_hash(const PFS_global_param *param)
 {
-  if ((! table_share_hash_inited) && (table_share_max > 0))
+  if ((! table_share_hash_inited) && (param->m_table_share_sizing != 0))
   {
     lf_hash_init(&table_share_hash, sizeof(PFS_table_share*), LF_HASH_UNIQUE,
                  0, 0, table_share_hash_get_key, &my_charset_bin);
-    /* table_share_hash.size= table_share_max; */
     table_share_hash_inited= true;
   }
   return 0;
@@ -429,11 +468,11 @@ LF_PINS* get_table_share_hash_pins(PFS_thread *thread)
 */
 static void set_table_share_key(PFS_table_share_key *key,
                                 bool temporary,
-                                const char *schema_name, uint schema_name_length,
-                                const char *table_name, uint table_name_length)
+                                const char *schema_name, size_t schema_name_length,
+                                const char *table_name, size_t table_name_length)
 {
-  DBUG_ASSERT(schema_name_length <= NAME_LEN);
-  DBUG_ASSERT(table_name_length <= NAME_LEN);
+  assert(schema_name_length <= NAME_LEN);
+  assert(table_name_length <= NAME_LEN);
   char *saved_schema_name;
   char *saved_table_name;
 
@@ -459,13 +498,315 @@ static void set_table_share_key(PFS_table_share_key *key,
   }
 }
 
+/**
+  Find an existing table share lock instrumentation.
+  @return the lock stat currently stored in m_race_lock_stat, or NULL if the slot is empty.
+*/
+PFS_table_share_lock*
+PFS_table_share::find_lock_stat() const
+{
+  PFS_table_share *that= const_cast<PFS_table_share*>(this);
+  void *addr= & that->m_race_lock_stat;
+  void * volatile * typed_addr= static_cast<void * volatile *>(addr);
+  void *ptr;
+
+  /* Atomic load of the slot; 'that' is only used above to take the slot address. */
+  ptr= my_atomic_loadptr(typed_addr);
+
+  PFS_table_share_lock *pfs;
+  pfs= static_cast<PFS_table_share_lock *>(ptr);
+  return pfs;
+}
+
+/**
+  Find or create a table share lock instrumentation.
+  @return a table share lock, or NULL if a new lock stat could not be created.
+*/
+PFS_table_share_lock*
+PFS_table_share::find_or_create_lock_stat()
+{
+  void *addr= & this->m_race_lock_stat;
+  void * volatile * typed_addr= static_cast<void * volatile *>(addr);
+  void *ptr;
+
+  /* (1) Atomic Load: reuse the lock stat if the slot is already set. */
+  ptr= my_atomic_loadptr(typed_addr);
+
+  PFS_table_share_lock *pfs;
+  if (ptr != NULL)
+  {
+    pfs= static_cast<PFS_table_share_lock *>(ptr);
+    return pfs;
+  }
+
+  /* (2) Create a lock stat, owned by this table share. */
+  pfs= create_table_share_lock_stat();
+  if (pfs == NULL)
+    return NULL;
+  pfs->m_owner= this;
+
+  void *old_ptr= NULL;
+  ptr= pfs;
+
+  /* (3) Atomic CAS: store the new lock stat only if the slot is still NULL. */
+  if (my_atomic_casptr(typed_addr, & old_ptr, ptr))
+  {
+    /* Ok, the slot now holds the lock stat created in (2). */
+    return pfs;
+  }
+
+  /* Collision with another thread that also executed (2) and (3). */
+  release_table_share_lock_stat(pfs);
+  /* Give up our copy; old_ptr is expected to hold the slot value seen by the failed CAS. */
+  pfs= static_cast<PFS_table_share_lock *>(old_ptr);
+  return pfs;
+}
+
+/** Destroy the table share lock instrumentation of this table share, if any. */
+void PFS_table_share::destroy_lock_stat()
+{
+  void *addr= & this->m_race_lock_stat;
+  void * volatile * typed_addr= static_cast<void * volatile *>(addr);
+  void *new_ptr= NULL;
+  void *old_ptr;
+  /* Atomically store NULL in the slot and fetch the value it held before. */
+  old_ptr= my_atomic_fasptr(typed_addr, new_ptr);
+  if (old_ptr != NULL)
+  {
+    PFS_table_share_lock *pfs;
+    pfs= static_cast<PFS_table_share_lock *>(old_ptr);
+    release_table_share_lock_stat(pfs);
+  }
+}
+
+/**
+  Find an existing table share index instrumentation in slot index (at most MAX_INDEXES).
+  @return the index stat currently stored in that slot, or NULL if the slot is empty
+*/
+PFS_table_share_index*
+PFS_table_share::find_index_stat(uint index) const
+{
+  assert(index <= MAX_INDEXES);
+
+  PFS_table_share *that= const_cast<PFS_table_share*>(this);
+  void *addr= & that->m_race_index_stat[index];
+  void * volatile * typed_addr= static_cast<void * volatile *>(addr);
+  void *ptr;
+
+  /* Atomic load of the slot; 'that' is only used above to take the slot address. */
+  ptr= my_atomic_loadptr(typed_addr);
+
+  PFS_table_share_index *pfs;
+  pfs= static_cast<PFS_table_share_index *>(ptr);
+  return pfs;
+}
+
+/**
+  Find or create a table share index instrumentation.
+  @param server_share the server table share, passed on when a new index stat is created
+  @param index the index slot, at most MAX_INDEXES
+  @return a table share index, or NULL if a new index stat could not be created
+*/
+PFS_table_share_index*
+PFS_table_share::find_or_create_index_stat(const TABLE_SHARE *server_share, uint index)
+{
+  assert(index <= MAX_INDEXES);
+
+  void *addr= & this->m_race_index_stat[index];
+  void * volatile * typed_addr= static_cast<void * volatile *>(addr);
+  void *ptr;
+
+  /* (1) Atomic Load: reuse the index stat if the slot is already set. */
+  ptr= my_atomic_loadptr(typed_addr);
+
+  PFS_table_share_index *pfs;
+  if (ptr != NULL)
+  {
+    pfs= static_cast<PFS_table_share_index *>(ptr);
+    return pfs;
+  }
+
+  /* (2) Create an index stat, owned by this table share. */
+  pfs= create_table_share_index_stat(server_share, index);
+  if (pfs == NULL)
+    return NULL;
+  pfs->m_owner= this;
+
+  void *old_ptr= NULL;
+  ptr= pfs;
+
+  /* (3) Atomic CAS: store the new index stat only if the slot is still NULL. */
+  if (my_atomic_casptr(typed_addr, & old_ptr, ptr))
+  {
+    /* Ok, the slot now holds the index stat created in (2). */
+    return pfs;
+  }
+
+  /* Collision with another thread that also executed (2) and (3). */
+  release_table_share_index_stat(pfs);
+  /* Give up our copy; old_ptr is expected to hold the slot value seen by the failed CAS. */
+  pfs= static_cast<PFS_table_share_index *>(old_ptr);
+  return pfs;
+}
+
+/** Destroy every table share index instrumentation of this table share. */
+void PFS_table_share::destroy_index_stats()
+{
+  uint index;
+  /* Inclusive bound: slot MAX_INDEXES is emptied as well. */
+  for (index= 0; index <= MAX_INDEXES; index++)
+  {
+    void *addr= & this->m_race_index_stat[index];
+    void * volatile * typed_addr= static_cast<void * volatile *>(addr);
+    void *new_ptr= NULL;
+    void *old_ptr;
+    /* Atomically store NULL in the slot and fetch the value it held before. */
+    old_ptr= my_atomic_fasptr(typed_addr, new_ptr);
+    if (old_ptr != NULL)
+    {
+      PFS_table_share_index *pfs;
+      pfs= static_cast<PFS_table_share_index *>(old_ptr);
+      release_table_share_index_stat(pfs);
+    }
+  }
+}
+
 void PFS_table_share::refresh_setup_object_flags(PFS_thread *thread)
 {
+  bool old_enabled= m_enabled;
+
   lookup_setup_object(thread,
                       OBJECT_TYPE_TABLE,
                       m_schema_name, m_schema_name_length,
                       m_table_name, m_table_name_length,
                       &m_enabled, &m_timed);
+
+  /*
+    If instrumentation for this table was enabled earlier and is disabled now,
+    cleanup slots reserved for lock stats and index stats.
+  */
+  if (old_enabled && ! m_enabled)
+  {
+    destroy_lock_stat();
+    destroy_index_stats();
+  }
+}
+
+/**
+  Set up the container holding the table share lock statistics.
+  @param table_stat_sizing           max number of table lock statistics
+  @return 0 on success, 1 if the container failed to initialize
+*/
+int init_table_share_lock_stat(uint table_stat_sizing)
+{
+  const bool failed=
+    (global_table_share_lock_container.init(table_stat_sizing) != 0);
+
+  return failed ? 1 : 0;
+}
+
+/**
+  Create a table share lock instrumentation.
+  @return table share lock instrumentation, or NULL if none could be allocated
+*/
+PFS_table_share_lock*
+create_table_share_lock_stat()
+{
+  pfs_dirty_state state;
+
+  /* Reserve a record in the lock stat container. */
+  PFS_table_share_lock *stat=
+    global_table_share_lock_container.allocate(& state);
+  if (stat == NULL)
+    return NULL;
+
+  /* Start from empty statistics. */
+  stat->m_stat.reset();
+
+  /* Move the record from dirty to allocated: it is now in use. */
+  stat->m_lock.dirty_to_allocated(& state);
+
+  return stat;
+}
+
+/** Give a table share lock instrumentation back to its container. */
+void release_table_share_lock_stat(PFS_table_share_lock *pfs)
+{
+  /* Clear the owner before the record is deallocated. */
+  pfs->m_owner= NULL;
+  global_table_share_lock_container.deallocate(pfs);
+}
+
+/** Cleanup the table share lock stat buffers. */
+void cleanup_table_share_lock_stat(void)
+{
+  global_table_share_lock_container.cleanup();
+}
+
+/**
+  Set up the container holding the table share index statistics.
+  @param index_stat_sizing           max number of index statistics
+  @return 0 on success, 1 if the container failed to initialize
+*/
+int init_table_share_index_stat(uint index_stat_sizing)
+{
+  const bool failed=
+    (global_table_share_index_container.init(index_stat_sizing) != 0);
+
+  return failed ? 1 : 0;
+}
+
+/**
+  Create a table share index instrumentation.
+  @param server_share  the server table share owning the index, may be NULL
+                       only when server_index is MAX_INDEXES
+  @param server_index  the index position in server_share, or MAX_INDEXES
+                       for the entry that carries no index name
+  @return table share index instrumentation, or NULL
+*/
+PFS_table_share_index*
+create_table_share_index_stat(const TABLE_SHARE *server_share, uint server_index)
+{
+  assert((server_share != NULL) || (server_index == MAX_INDEXES));
+
+  pfs_dirty_state state;
+
+  /* Reserve a record in the index stat container. */
+  PFS_table_share_index *stat=
+    global_table_share_index_container.allocate(& state);
+  if (stat == NULL)
+    return NULL;
+
+  if (server_index != MAX_INDEXES)
+  {
+    /* A real index: copy its name from the server key definition. */
+    const KEY *key= & server_share->key_info[server_index];
+    const size_t name_length= key->name.length;
+    memcpy(stat->m_key.m_name, key->name.str, name_length);
+    stat->m_key.m_name_length= static_cast<uint>(name_length);
+  }
+  else
+    stat->m_key.m_name_length= 0;
+
+  /* Start from empty statistics, then move the record to allocated. */
+  stat->m_stat.reset();
+  stat->m_lock.dirty_to_allocated(& state);
+
+  return stat;
+}
+
+/** Give a table share index instrumentation back to its container. */
+void release_table_share_index_stat(PFS_table_share_index *pfs)
+{
+  /* Clear the owner before the record is deallocated. */
+  pfs->m_owner= NULL;
+  global_table_share_index_container.deallocate(pfs);
+}
+
+/** Cleanup the table share index stat buffers. */
+void cleanup_table_share_index_stat(void)
+{
+  global_table_share_index_container.cleanup();
 }
 
 /**
@@ -482,8 +823,10 @@ int init_file_class(uint file_class_sizing)
 
   if (file_class_max > 0)
   {
-    file_class_array= PFS_MALLOC_ARRAY(file_class_max, sizeof(PFS_file_class),
-                                       PFS_file_class, MYF(MY_ZEROFILL));
+    file_class_array= PFS_MALLOC_ARRAY(& builtin_memory_file_class,
+                                       file_class_max,
+                                       sizeof(PFS_file_class), PFS_file_class,
+                                       MYF(MY_ZEROFILL));
     if (unlikely(file_class_array == NULL))
       return 1;
   }
@@ -496,7 +839,9 @@ int init_file_class(uint file_class_sizing)
 /** Cleanup the file class buffers. */
 void cleanup_file_class(void)
 {
-  pfs_free(file_class_array);
+  PFS_FREE_ARRAY(& builtin_memory_file_class,
+                 file_class_max, sizeof(PFS_file_class),
+                 file_class_array);
   file_class_array= NULL;
   file_class_dirty_count= file_class_allocated_count= 0;
   file_class_max= 0;
@@ -516,8 +861,10 @@ int init_stage_class(uint stage_class_sizing)
 
   if (stage_class_max > 0)
   {
-    stage_class_array= PFS_MALLOC_ARRAY(stage_class_max, sizeof(PFS_stage_class),
-                                        PFS_stage_class, MYF(MY_ZEROFILL));
+    stage_class_array= PFS_MALLOC_ARRAY(& builtin_memory_stage_class,
+                                        stage_class_max,
+                                        sizeof(PFS_stage_class), PFS_stage_class,
+                                        MYF(MY_ZEROFILL));
     if (unlikely(stage_class_array == NULL))
       return 1;
   }
@@ -530,7 +877,9 @@ int init_stage_class(uint stage_class_sizing)
 /** Cleanup the stage class buffers. */
 void cleanup_stage_class(void)
 {
-  pfs_free(stage_class_array);
+  PFS_FREE_ARRAY(& builtin_memory_stage_class,
+                 stage_class_max, sizeof(PFS_stage_class),
+                 stage_class_array);
   stage_class_array= NULL;
   stage_class_dirty_count= stage_class_allocated_count= 0;
   stage_class_max= 0;
@@ -550,8 +899,10 @@ int init_statement_class(uint statement_class_sizing)
 
   if (statement_class_max > 0)
   {
-    statement_class_array= PFS_MALLOC_ARRAY(statement_class_max, sizeof(PFS_statement_class),
-                                            PFS_statement_class, MYF(MY_ZEROFILL));
+    statement_class_array= PFS_MALLOC_ARRAY(& builtin_memory_statement_class,
+                                            statement_class_max,
+                                            sizeof(PFS_statement_class), PFS_statement_class,
+                                            MYF(MY_ZEROFILL));
     if (unlikely(statement_class_array == NULL))
       return 1;
   }
@@ -564,7 +915,9 @@ int init_statement_class(uint statement_class_sizing)
 /** Cleanup the statement class buffers. */
 void cleanup_statement_class(void)
 {
-  pfs_free(statement_class_array);
+  PFS_FREE_ARRAY(& builtin_memory_statement_class,
+                 statement_class_max, sizeof(PFS_statement_class),
+                 statement_class_array);
   statement_class_array= NULL;
   statement_class_dirty_count= statement_class_allocated_count= 0;
   statement_class_max= 0;
@@ -584,8 +937,10 @@ int init_socket_class(uint socket_class_sizing)
 
   if (socket_class_max > 0)
   {
-    socket_class_array= PFS_MALLOC_ARRAY(socket_class_max, sizeof(PFS_socket_class),
-                                         PFS_socket_class, MYF(MY_ZEROFILL));
+    socket_class_array= PFS_MALLOC_ARRAY(& builtin_memory_socket_class,
+                                         socket_class_max,
+                                         sizeof(PFS_socket_class), PFS_socket_class,
+                                         MYF(MY_ZEROFILL));
     if (unlikely(socket_class_array == NULL))
       return 1;
   }
@@ -598,19 +953,59 @@ int init_socket_class(uint socket_class_sizing)
 /** Cleanup the socket class buffers. */
 void cleanup_socket_class(void)
 {
-  pfs_free(socket_class_array);
+  PFS_FREE_ARRAY(& builtin_memory_socket_class,
+                 socket_class_max, sizeof(PFS_socket_class),
+                 socket_class_array);
   socket_class_array= NULL;
   socket_class_dirty_count= socket_class_allocated_count= 0;
   socket_class_max= 0;
 }
 
+/**
+  Initialize the memory class buffer.
+  @param memory_class_sizing            max number of memory classes
+  @return 0 on success, 1 if the class array could not be allocated
+*/
+int init_memory_class(uint memory_class_sizing)
+{
+  /* Reset the bookkeeping before sizing the array. */
+  memory_class_allocated_count= 0;
+  memory_class_dirty_count= 0;
+  memory_class_lost= 0;
+  memory_class_max= memory_class_sizing;
+
+  if (memory_class_max == 0)
+  {
+    /* Nothing to allocate for a zero sized buffer. */
+    memory_class_array= NULL;
+    return 0;
+  }
+
+  memory_class_array= PFS_MALLOC_ARRAY(& builtin_memory_memory_class,
+                                       memory_class_max,
+                                       sizeof(PFS_memory_class), PFS_memory_class,
+                                       MYF(MY_ZEROFILL));
+  return (memory_class_array == NULL) ? 1 : 0;
+}
+
+/** Free the memory class array and reset its bookkeeping counters. */
+void cleanup_memory_class(void)
+{
+  PFS_FREE_ARRAY(& builtin_memory_memory_class, memory_class_max,
+                 sizeof(PFS_memory_class), memory_class_array);
+  memory_class_array= NULL;
+  memory_class_allocated_count= 0;
+  memory_class_dirty_count= 0;
+  memory_class_max= 0;
+}
+
 static void init_instr_class(PFS_instr_class *klass,
                              const char *name,
                              uint name_length,
                              int flags,
                              PFS_class_type class_type)
 {
-  DBUG_ASSERT(name_length <= PFS_MAX_INFO_NAME_LENGTH);
+  assert(name_length <= PFS_MAX_INFO_NAME_LENGTH);
   memset(klass, 0, sizeof(PFS_instr_class));
   strncpy(klass->m_name, name, name_length);
   klass->m_name[PFS_MAX_INFO_NAME_LENGTH - 1]= '\0';
@@ -629,10 +1024,13 @@ static void configure_instr_class(PFS_instr_class *entry)
 {
   uint match_length= 0; /* length of matching pattern */
 
-  for (uint i= 0; i < pfs_instr_config_array.elements; i++)
+  // May be NULL in unit tests
+  if (pfs_instr_config_array == NULL)
+    return;
+  PFS_instr_config **it= pfs_instr_config_array->front();
+  for ( ; it != pfs_instr_config_array->end(); it++)
   {
-    PFS_instr_config* e;
-    get_dynamic(&pfs_instr_config_array, (uchar*)&e, i);
+    PFS_instr_config* e= *it;
 
     /**
       Compare class name to all configuration entries. In case of multiple
@@ -642,7 +1040,7 @@ static void configure_instr_class(PFS_instr_class *entry)
 
       Consecutive wildcards affect the count.
     */
-    if (!my_wildcmp(&my_charset_latin1,
+    if (!my_charset_latin1.wildcmp(
                     entry->m_name, entry->m_name+entry->m_name_length,
                     e->m_name, e->m_name+e->m_name_length,
                     '\\', '?','%'))
@@ -664,7 +1062,7 @@ static void configure_instr_class(PFS_instr_class *entry)
     if ((entry->m_name_length == NAME_LENGTH) &&                       \
         (strncmp(entry->m_name, NAME, NAME_LENGTH) == 0))              \
     {                                                                  \
-      DBUG_ASSERT(entry->m_flags == flags);                            \
+      assert(entry->m_flags == flags);                                 \
       return (INDEX + 1);                                              \
     }                                                                  \
   }
@@ -758,7 +1156,8 @@ PFS_sync_key register_mutex_class(const char *name, uint name_length,
     Out of space, report to SHOW STATUS that
     the allocated memory was too small.
   */
-  mutex_class_lost++;
+  if (pfs_enabled)
+    mutex_class_lost++;
   return 0;
 }
 
@@ -796,7 +1195,8 @@ PFS_sync_key register_rwlock_class(const char *name, uint name_length,
     return (index + 1);
   }
 
-  rwlock_class_lost++;
+  if (pfs_enabled)
+    rwlock_class_lost++;
   return 0;
 }
 
@@ -833,7 +1233,8 @@ PFS_sync_key register_cond_class(const char *name, uint name_length,
     return (index + 1);
   }
 
-  cond_class_lost++;
+  if (pfs_enabled)
+    cond_class_lost++;
   return 0;
 }
 
@@ -915,7 +1316,7 @@ PFS_thread_key register_thread_class(const char *name, uint name_length,
   if (index < thread_class_max)
   {
     entry= &thread_class_array[index];
-    DBUG_ASSERT(name_length <= PFS_MAX_INFO_NAME_LENGTH);
+    assert(name_length <= PFS_MAX_INFO_NAME_LENGTH);
     strncpy(entry->m_name, name, name_length);
     entry->m_name_length= name_length;
     entry->m_enabled= true;
@@ -923,7 +1324,8 @@ PFS_thread_key register_thread_class(const char *name, uint name_length,
     return (index + 1);
   }
 
-  thread_class_lost++;
+  if (pfs_enabled)
+    thread_class_lost++;
   return 0;
 }
 
@@ -972,10 +1374,12 @@ PFS_file_key register_file_class(const char *name, uint name_length,
     /* Set user-defined configuration options for this instrument */
     configure_instr_class(entry);
     PFS_atomic::add_u32(&file_class_allocated_count, 1);
+
     return (index + 1);
   }
 
-  file_class_lost++;
+  if (pfs_enabled)
+    file_class_lost++;
   return 0;
 }
 
@@ -1007,8 +1411,20 @@ PFS_stage_key register_stage_class(const char *name,
     init_instr_class(entry, name, name_length, flags, PFS_CLASS_STAGE);
     entry->m_prefix_length= prefix_length;
     entry->m_event_name_index= index;
-    entry->m_enabled= false; /* disabled by default */
-    entry->m_timed= false;
+
+    if (flags & PSI_FLAG_STAGE_PROGRESS)
+    {
+      /* Stages with progress information are enabled and timed by default */
+      entry->m_enabled= true;
+      entry->m_timed= true;
+    }
+    else
+    {
+      /* Stages without progress information are disabled by default */
+      entry->m_enabled= false;
+      entry->m_timed= false;
+    }
+
     /* Set user-defined configuration options for this instrument */
     configure_instr_class(entry);
     PFS_atomic::add_u32(&stage_class_allocated_count, 1);
@@ -1016,7 +1432,8 @@ PFS_stage_key register_stage_class(const char *name,
     return (index + 1);
   }
 
-  stage_class_lost++;
+  if (pfs_enabled)
+    stage_class_lost++;
   return 0;
 }
 
@@ -1053,7 +1470,8 @@ PFS_statement_key register_statement_class(const char *name, uint name_length,
     return (index + 1);
   }
 
-  statement_class_lost++;
+  if (pfs_enabled)
+    statement_class_lost++;
   return 0;
 }
 
@@ -1135,7 +1553,8 @@ PFS_socket_key register_socket_class(const char *name, uint name_length,
     return (index + 1);
   }
 
-  socket_class_lost++;
+  if (pfs_enabled)
+    socket_class_lost++;
   return 0;
 }
 
@@ -1154,6 +1573,58 @@ PFS_socket_class *sanitize_socket_class(PFS_socket_class *unsafe)
   SANITIZE_ARRAY_BODY(PFS_socket_class, socket_class_array, socket_class_max, unsafe);
 }
 
+/**
+  Register a memory instrumentation metadata.
+  @param name                         the instrumented name
+  @param name_length                  length in bytes of name
+  @param flags                        the instrumentation flags
+  @return a memory instrumentation key (array index + 1), or 0 when no slot is left
+*/
+PFS_memory_key register_memory_class(const char *name, uint name_length,
+                                     int flags)
+{
+  /* See comments in register_mutex_class. NOTE(review): the macro below appears to use the locals entry and flags by name. */
+  uint32 index;
+  PFS_memory_class *entry;
+
+  REGISTER_CLASS_BODY_PART(index, memory_class_array, memory_class_max,
+                           name, name_length)
+
+  index= PFS_atomic::add_u32(&memory_class_dirty_count, 1);
+
+  if (index < memory_class_max)
+  {
+    entry= &memory_class_array[index];
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_MEMORY);
+    entry->m_event_name_index= index;
+    entry->m_enabled= false; /* disabled by default */
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
+    entry->m_timed= false; /* Immutable: forced back to false after configure_instr_class() */
+    PFS_atomic::add_u32(&memory_class_allocated_count, 1);
+    return (index + 1);
+  }
+  /* Out of space: the lost counter is only bumped when pfs_enabled is set. */
+  if (pfs_enabled)
+    memory_class_lost++;
+  return 0;
+}
+
+/**
+  Find a memory instrumentation class by key.
+  @param key                          the instrument key
+  @return the instrument class, or NULL
+*/
+PFS_memory_class *find_memory_class(PFS_memory_key key)
+{
+  FIND_CLASS_BODY(key, memory_class_allocated_count, memory_class_array);
+}
+
+PFS_memory_class *sanitize_memory_class(PFS_memory_class *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_memory_class, memory_class_array, memory_class_max, unsafe);
+}
+
 PFS_instr_class *find_table_class(uint index)
 {
   if (index == 1)
@@ -1185,49 +1656,64 @@ PFS_instr_class *sanitize_idle_class(PFS_instr_class *unsafe)
   return NULL;
 }
 
-static void set_keys(PFS_table_share *pfs, const TABLE_SHARE *share)
+PFS_instr_class *find_metadata_class(uint index)
 {
-  uint len;
-  KEY *key_info= share->key_info;
-  PFS_table_key *pfs_key= pfs->m_keys;
-  PFS_table_key *pfs_key_last= pfs->m_keys + share->keys;
-  pfs->m_key_count= share->keys;
+  if (index == 1)
+    return & global_metadata_class;
+  return NULL;
+}
 
-  for ( ; pfs_key < pfs_key_last; pfs_key++, key_info++)
-  {
-    len= (uint)key_info->name.length;
-    memcpy(pfs_key->m_name, key_info->name.str, len);
-    pfs_key->m_name_length= len;
-  }
+PFS_instr_class *sanitize_metadata_class(PFS_instr_class *unsafe)
+{
+  if (likely(& global_metadata_class == unsafe))
+    return unsafe;
+  return NULL;
+}
 
-  pfs_key_last= pfs->m_keys + MAX_INDEXES;
-  for ( ; pfs_key < pfs_key_last; pfs_key++)
-    pfs_key->m_name_length= 0;
+PFS_transaction_class *find_transaction_class(uint index)
+{
+  if (index == 1)
+    return &global_transaction_class;
+  return NULL;
 }
 
-static int compare_keys(PFS_table_share *pfs, const TABLE_SHARE *share)
+PFS_transaction_class *sanitize_transaction_class(PFS_transaction_class *unsafe)
 {
-  uint len;
-  KEY *key_info= share->key_info;
-  PFS_table_key *pfs_key= pfs->m_keys;
-  PFS_table_key *pfs_key_last= pfs->m_keys + share->keys;
+  if (likely(&global_transaction_class == unsafe))
+    return unsafe;
+  return NULL;
+}
 
+static int compare_keys(PFS_table_share *pfs, const TABLE_SHARE *share)
+{
   if (pfs->m_key_count != share->keys)
     return 1;
 
-  for ( ; pfs_key < pfs_key_last; pfs_key++, key_info++)
+  size_t len;
+  uint index= 0;
+  uint key_count= share->keys;
+  KEY *key_info= share->key_info;
+  PFS_table_share_index *index_stat;
+
+  for ( ; index < key_count; key_info++, index++)
   {
-    len= (uint)key_info->name.length;
-    if (len != pfs_key->m_name_length)
-      return 1;
+    index_stat= pfs->find_index_stat(index);
+    if (index_stat != NULL)
+    {
+      len= key_info->name.length;
 
-    if (memcmp(pfs_key->m_name, key_info->name.str, len) != 0)
-      return 1;
+      if (len != index_stat->m_key.m_name_length)
+        return 1;
+
+      if (memcmp(index_stat->m_key.m_name, key_info->name.str, len) != 0)
+        return 1;
+    }
   }
 
   return 0;
 }
 
+
 /**
   Find or create a table share instrumentation.
   @param thread                       the executing instrumented thread
@@ -1245,14 +1731,14 @@ PFS_table_share* find_or_create_table_share(PFS_thread *thread,
   LF_PINS *pins= get_table_share_hash_pins(thread);
   if (unlikely(pins == NULL))
   {
-    table_share_lost++;
+    global_table_share_container.m_lost++;
     return NULL;
   }
 
   const char *schema_name= share->db.str;
-  uint schema_name_length= (uint)share->db.length;
+  size_t schema_name_length= share->db.length;
   const char *table_name= share->table_name.str;
-  uint table_name_length= (uint)share->table_name.length;
+  size_t table_name_length= share->table_name.length;
 
   set_table_share_key(&key, temporary,
                       schema_name, schema_name_length,
@@ -1263,10 +1749,8 @@ PFS_table_share* find_or_create_table_share(PFS_thread *thread,
   const uint retry_max= 3;
   bool enabled= true;
   bool timed= true;
-  static uint PFS_ALIGNED table_share_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_table_share *pfs;
+  pfs_dirty_state dirty_state;
 
 search:
   entry= reinterpret_cast<PFS_table_share**>
@@ -1278,9 +1762,19 @@ search:
     pfs->inc_refcount() ;
     if (compare_keys(pfs, share) != 0)
     {
-      set_keys(pfs, share);
-      /* FIXME: aggregate to table_share sink ? */
-      pfs->m_table_stat.fast_reset();
+      /*
+        Some DDL was detected.
+        - keep the lock stats, they are unaffected
+        - destroy the index stats, indexes changed.
+        - adjust the expected key count
+        - recreate index stats
+      */
+      pfs->destroy_index_stats();
+      pfs->m_key_count= share->keys;
+      for (uint index= 0; index < pfs->m_key_count; index++)
+      {
+        (void)pfs->find_or_create_index_stat(share, index);
+      }
     }
     lf_hash_search_unpin(pins);
     return pfs;
@@ -1292,8 +1786,8 @@ search:
   {
     lookup_setup_object(thread,
                         OBJECT_TYPE_TABLE,
-                        schema_name, schema_name_length,
-                        table_name, table_name_length,
+                        schema_name, static_cast<uint>(schema_name_length),
+                        table_name, static_cast<uint>(table_name_length),
                         &enabled, &timed);
     /*
       Even when enabled is false, a record is added in the dictionary:
@@ -1303,90 +1797,141 @@ search:
     */
   }
 
-  while (++attempts <= table_share_max)
+  pfs= global_table_share_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& table_share_monotonic_index, 1) % table_share_max;
-    pfs= table_share_array + index;
-
-    if (pfs->m_lock.is_free())
+    pfs->m_key= key;
+    pfs->m_schema_name= &pfs->m_key.m_hash_key[1];
+    pfs->m_schema_name_length= static_cast<uint>(schema_name_length);
+    pfs->m_table_name= &pfs->m_key.m_hash_key[schema_name_length + 2];
+    pfs->m_table_name_length= static_cast<uint>(table_name_length);
+    pfs->m_enabled= enabled;
+    pfs->m_timed= timed;
+    pfs->init_refcount();
+    pfs->destroy_lock_stat();
+    pfs->destroy_index_stats();
+    pfs->m_key_count= share->keys;
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&table_share_hash, pins, &pfs);
+
+    if (likely(res == 0))
     {
-      if (pfs->m_lock.free_to_dirty())
+      /* Create table share index stats. */
+      for (uint index= 0; index < pfs->m_key_count; index++)
       {
-        pfs->m_key= key;
-        pfs->m_schema_name= &pfs->m_key.m_hash_key[1];
-        pfs->m_schema_name_length= schema_name_length;
-        pfs->m_table_name= &pfs->m_key.m_hash_key[schema_name_length + 2];
-        pfs->m_table_name_length= table_name_length;
-        pfs->m_enabled= enabled;
-        pfs->m_timed= timed;
-        pfs->init_refcount();
-        pfs->m_table_stat.fast_reset();
-        set_keys(pfs, share);
-
-        int res;
-        res= lf_hash_insert(&table_share_hash, pins, &pfs);
-        if (likely(res == 0))
-        {
-          pfs->m_lock.dirty_to_allocated();
-          return pfs;
-        }
-
-        pfs->m_lock.dirty_to_free();
+        (void)pfs->find_or_create_index_stat(share, index);
+      }
+      return pfs;
+    }
 
-        if (res > 0)
-        {
-          /* Duplicate insert by another thread */
-          if (++retry_count > retry_max)
-          {
-            /* Avoid infinite loops */
-            table_share_lost++;
-            return NULL;
-          }
-          goto search;
-        }
+    global_table_share_container.deallocate(pfs);
 
-        /* OOM in lf_hash_insert */
-        table_share_lost++;
+    if (res > 0)
+    {
+      /* Duplicate insert by another thread */
+      if (++retry_count > retry_max)
+      {
+        /* Avoid infinite loops */
+        global_table_share_container.m_lost++;
         return NULL;
       }
+      goto search;
     }
+
+    /* OOM in lf_hash_insert */
+    global_table_share_container.m_lost++;
+    return NULL;
   }
 
-  table_share_lost++;
   return NULL;
 }
 
 void PFS_table_share::aggregate_io(void)
 {
+  uint index;
   uint safe_key_count= sanitize_index_count(m_key_count);
-  PFS_table_io_stat *from_stat;
-  PFS_table_io_stat *from_stat_last;
+  PFS_table_share_index *from_stat;
   PFS_table_io_stat sum_io;
 
   /* Aggregate stats for each index, if any */
-  from_stat= & m_table_stat.m_index_stat[0];
-  from_stat_last= from_stat + safe_key_count;
-  for ( ; from_stat < from_stat_last ; from_stat++)
-    sum_io.aggregate(from_stat);
+  for (index= 0; index < safe_key_count; index++)
+  {
+    from_stat= find_index_stat(index);
+    if (from_stat != NULL)
+    {
+      sum_io.aggregate(& from_stat->m_stat);
+      from_stat->m_stat.reset();
+    }
+  }
 
   /* Aggregate stats for the table */
-  sum_io.aggregate(& m_table_stat.m_index_stat[MAX_INDEXES]);
+  from_stat= find_index_stat(MAX_INDEXES);
+  if (from_stat != NULL)
+  {
+    sum_io.aggregate(& from_stat->m_stat);
+    from_stat->m_stat.reset();
+  }
 
   /* Add this table stats to the global sink. */
   global_table_io_stat.aggregate(& sum_io);
-  m_table_stat.fast_reset_io();
+}
+
+/**
+  Add the io statistics of this table share to a result.
+  Slots without an index instrumentation are skipped.
+  @param result     where each found index stat is summed into
+  @param key_count  number of indexes to visit, at most MAX_INDEXES
+*/
+void PFS_table_share::sum_io(PFS_single_stat *result, uint key_count)
+{
+  assert(key_count <= MAX_INDEXES);
+
+  /* Sum stats for each index that has an instrumentation. */
+  for (uint pos= 0; pos < key_count; pos++)
+  {
+    PFS_table_share_index *index_stat= find_index_stat(pos);
+    if (index_stat == NULL)
+      continue;
+    index_stat->m_stat.sum(result);
+  }
+
+  /* Sum stats for the table, kept in the MAX_INDEXES slot. */
+  PFS_table_share_index *table_stat= find_index_stat(MAX_INDEXES);
+  if (table_stat != NULL)
+    table_stat->m_stat.sum(result);
+}
+
+/** Add the lock statistics of this table share, if any, to a result. */
+void PFS_table_share::sum_lock(PFS_single_stat *result)
+{
+  PFS_table_share_lock *found= find_lock_stat();
+  if (found == NULL)
+    return;
+  found->m_stat.sum(result);
+}
+
+void PFS_table_share::sum(PFS_single_stat *result, uint key_count)
+{
+  sum_io(result, key_count);
+  sum_lock(result);
 }
 
 void PFS_table_share::aggregate_lock(void)
 {
-  global_table_lock_stat.aggregate(& m_table_stat.m_lock_stat);
-  m_table_stat.fast_reset_lock();
+  PFS_table_share_lock *lock_stat;
+  lock_stat= find_lock_stat();
+  if (lock_stat != NULL)
+  {
+    global_table_lock_stat.aggregate(& lock_stat->m_stat);
+    /* Reset lock stat. */
+    lock_stat->m_stat.reset();
+  }
 }
 
 void release_table_share(PFS_table_share *pfs)
 {
-  DBUG_ASSERT(pfs->get_refcount() > 0);
+  assert(pfs->get_refcount() > 0);
   pfs->dec_refcount();
 }
 
@@ -1419,6 +1964,9 @@ void drop_table_share(PFS_thread *thread,
     PFS_table_share *pfs= *entry;
     lf_hash_delete(&table_share_hash, pins,
                    pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
+    pfs->destroy_lock_stat();
+    pfs->destroy_index_stats();
+
     pfs->m_lock.allocated_to_free();
   }
 
@@ -1432,7 +1980,7 @@ void drop_table_share(PFS_thread *thread,
 */
 PFS_table_share *sanitize_table_share(PFS_table_share *unsafe)
 {
-  SANITIZE_ARRAY_BODY(PFS_table_share, table_share_array, table_share_max, unsafe);
+  return global_table_share_container.sanitize(unsafe);
 }
 
 /** Reset the wait statistics per instrument class. */
@@ -1443,6 +1991,7 @@ void reset_events_waits_by_class()
   global_idle_stat.reset();
   global_table_io_stat.reset();
   global_table_lock_stat.reset();
+  global_metadata_stat.reset();
 }
 
 /** Reset the io statistics per file class. */
@@ -1465,16 +2014,50 @@ void reset_socket_class_io(void)
     pfs->m_socket_stat.m_io_stat.reset();
 }
 
+/** Buffer processor refreshing the derived flags of a table share. */
+class Proc_table_share_derived_flags
+  : public PFS_buffer_processor<PFS_table_share>
+{
+  /** Thread handed to refresh_setup_object_flags(). */
+  PFS_thread* m_pfs_thread;
+public:
+  Proc_table_share_derived_flags(PFS_thread *thread)
+    : m_pfs_thread(thread)
+  {}
+
+  virtual void operator()(PFS_table_share *share)
+  {
+    share->refresh_setup_object_flags(m_pfs_thread);
+  }
+};
+
 void update_table_share_derived_flags(PFS_thread *thread)
 {
-  PFS_table_share *pfs= table_share_array;
-  PFS_table_share *pfs_last= table_share_array + table_share_max;
+  Proc_table_share_derived_flags proc(thread);
+  global_table_share_container.apply(proc);
+}
 
-  for ( ; pfs < pfs_last; pfs++)
+class Proc_program_share_derived_flags
+  : public PFS_buffer_processor<PFS_program>
+{
+public:
+  Proc_program_share_derived_flags(PFS_thread *thread)
+    : m_thread(thread)
+  {}
+
+  virtual void operator()(PFS_program *pfs)
   {
-    if (pfs->m_lock.is_populated())
-      pfs->refresh_setup_object_flags(thread);
+    pfs->refresh_setup_object_flags(m_thread);
   }
+
+private:
+  PFS_thread* m_thread;
+};
+
+void update_program_share_derived_flags(PFS_thread *thread)
+{
+  Proc_program_share_derived_flags proc(thread);
+  global_program_container.apply(proc);
 }
 
 /** @} */
diff --git a/storage/perfschema/pfs_instr_class.h b/storage/perfschema/pfs_instr_class.h
index 9d256fac78a..b8ff8497241 100644
--- a/storage/perfschema/pfs_instr_class.h
+++ b/storage/perfschema/pfs_instr_class.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -27,6 +27,8 @@
 #include "mysql_com.h"                          /* NAME_LEN */
 #include "lf.h"
 #include "pfs_global.h"
+#include "pfs_atomic.h"
+#include "sql_array.h"
 
 /**
   @file storage/perfschema/pfs_instr_class.h
@@ -48,7 +50,6 @@
 */
 #define PFS_MAX_FULL_PREFIX_NAME_LENGTH 32
 
-#include <my_global.h>
 #include <my_sys.h>
 #include <mysql/psi/psi.h>
 #include "pfs_lock.h"
@@ -56,6 +57,8 @@
 #include "pfs_column_types.h"
 
 struct PFS_global_param;
+struct PFS_table_share;
+class PFS_opaque_container_page;
 
 /**
   @addtogroup Performance_schema_buffers
@@ -75,8 +78,12 @@ typedef unsigned int PFS_file_key;
 typedef unsigned int PFS_stage_key;
 /** Key, naming a statement instrument. */
 typedef unsigned int PFS_statement_key;
+/** Key, naming a transaction instrument. */
+typedef unsigned int PFS_transaction_key;
 /** Key, naming a socket instrument. */
 typedef unsigned int PFS_socket_key;
+/** Key, naming a memory instrument. */
+typedef unsigned int PFS_memory_key;
 
 enum PFS_class_type
 {
@@ -88,11 +95,14 @@ enum PFS_class_type
   PFS_CLASS_TABLE=       5,
   PFS_CLASS_STAGE=       6,
   PFS_CLASS_STATEMENT=   7,
-  PFS_CLASS_SOCKET=      8,
-  PFS_CLASS_TABLE_IO=    9,
-  PFS_CLASS_TABLE_LOCK= 10,
-  PFS_CLASS_IDLE=       11,
-  PFS_CLASS_LAST=       PFS_CLASS_IDLE,
+  PFS_CLASS_TRANSACTION= 8,
+  PFS_CLASS_SOCKET=      9,
+  PFS_CLASS_TABLE_IO=   10,
+  PFS_CLASS_TABLE_LOCK= 11,
+  PFS_CLASS_IDLE=       12,
+  PFS_CLASS_MEMORY=     13,
+  PFS_CLASS_METADATA=   14,
+  PFS_CLASS_LAST=       PFS_CLASS_METADATA,
   PFS_CLASS_MAX=        PFS_CLASS_LAST + 1
 };
 
@@ -109,12 +119,8 @@ struct PFS_instr_config
   bool m_timed;
 };
 
-extern DYNAMIC_ARRAY pfs_instr_config_array;
-extern int pfs_instr_config_state;
-
-static const int PFS_INSTR_CONFIG_NOT_INITIALIZED= 0;
-static const int PFS_INSTR_CONFIG_ALLOCATED= 1;
-static const int PFS_INSTR_CONFIG_DEALLOCATED= 2;
+typedef Dynamic_array<PFS_instr_config*> Pfs_instr_config_array;
+extern Pfs_instr_config_array *pfs_instr_config_array;
 
 struct PFS_thread;
 
@@ -136,12 +142,15 @@ struct PFS_instr_class
   bool m_timed;
   /** Instrument flags. */
   int m_flags;
+  /** Volatility index. */
+  int m_volatility;
   /**
     Instrument name index.
     Self index in:
     - EVENTS_WAITS_SUMMARY_*_BY_EVENT_NAME for waits
     - EVENTS_STAGES_SUMMARY_*_BY_EVENT_NAME for stages
     - EVENTS_STATEMENTS_SUMMARY_*_BY_EVENT_NAME for statements
+    - EVENTS_TRANSACTIONS_SUMMARY_*_BY_EVENT_NAME for transactions
   */
   uint m_event_name_index;
   /** Instrument name. */
@@ -161,6 +170,18 @@ struct PFS_instr_class
     return m_flags & PSI_FLAG_MUTABLE;
   }
 
+  bool is_progress() const
+  {
+    assert(m_type == PFS_CLASS_STAGE); /* only asked of stage instruments */
+    return (m_flags & PSI_FLAG_STAGE_PROGRESS) != 0;
+  }
+
+  bool is_shared_exclusive() const
+  {
+    assert(m_type == PFS_CLASS_RWLOCK); /* only asked of rwlock instruments */
+    return (m_flags & PSI_RWLOCK_FLAG_SX) != 0;
+  }
+
   static void set_enabled(PFS_instr_class *pfs, bool enabled);
   static void set_timed(PFS_instr_class *pfs, bool timed);
 
@@ -252,6 +273,32 @@ struct PFS_table_key
   uint m_name_length;
 };
 
+/** Index statistics of a table. */
+struct PFS_table_share_index
+{
+  pfs_lock m_lock; /**< Internal lock marking the state of this record. */
+  /** The index name */
+  PFS_table_key m_key;
+  /** The index stat */
+  PFS_table_io_stat m_stat;
+  /** Owner table share. To be used later. */
+  PFS_table_share* m_owner;
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
+};
+
+/** Lock statistics of a table. */
+struct PFS_table_share_lock
+{
+  pfs_lock m_lock; /**< Internal lock marking the state of this record. */
+  /** Lock stats. */
+  PFS_table_lock_stat m_stat;
+  /** Owner table share. To be used later. */
+  PFS_table_share* m_owner;
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
+};
+
 /** Instrumentation metadata for a table share. */
 struct PFS_ALIGNED PFS_table_share
 {
@@ -267,6 +314,10 @@ public:
   void aggregate_io(void);
   void aggregate_lock(void);
 
+  void sum_io(PFS_single_stat *result, uint key_count);
+  void sum_lock(PFS_single_stat *result);
+  void sum(PFS_single_stat *result, uint key_count);
+
   inline void aggregate(void)
   {
     aggregate_io();
@@ -307,6 +358,7 @@ public:
     This flag is computed from the content of table setup_objects.
   */
   bool m_timed;
+
   /** Search key. */
   PFS_table_share_key m_key;
   /** Schema name. */
@@ -319,14 +371,24 @@ public:
   uint m_table_name_length;
   /** Number of indexes. */
   uint m_key_count;
-  /** Table statistics. */
-  PFS_table_stat m_table_stat;
-  /** Index names. */
-  PFS_table_key m_keys[MAX_INDEXES];
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
+
+  PFS_table_share_lock *find_lock_stat() const;
+  PFS_table_share_lock *find_or_create_lock_stat();
+  void destroy_lock_stat();
+
+  PFS_table_share_index *find_index_stat(uint index) const;
+  PFS_table_share_index *find_or_create_index_stat(const TABLE_SHARE *server_share, uint index);
+  void destroy_index_stats();
 
 private:
   /** Number of opened table handles. */
   int m_refcount;
+  /** Table locks statistics. */
+  PFS_table_share_lock *m_race_lock_stat;
+  /** Table indexes' stats. */
+  PFS_table_share_index *m_race_index_stat[MAX_INDEXES + 1];
 };
 
 /** Statistics for the IDLE instrument. */
@@ -335,6 +397,10 @@ extern PFS_single_stat global_idle_stat;
 extern PFS_table_io_stat global_table_io_stat;
 /** Statistics for dropped table lock. */
 extern PFS_table_lock_stat global_table_lock_stat;
+/** Statistics for the METADATA instrument. */
+extern PFS_single_stat global_metadata_stat;
+/** Statistics for the transaction instrument. */
+extern PFS_transaction_stat global_transaction_stat;
 
 inline uint sanitize_index_count(uint count)
 {
@@ -346,6 +412,12 @@ inline uint sanitize_index_count(uint count)
 #define GLOBAL_TABLE_IO_EVENT_INDEX 0
 #define GLOBAL_TABLE_LOCK_EVENT_INDEX 1
 #define GLOBAL_IDLE_EVENT_INDEX 2
+#define GLOBAL_METADATA_EVENT_INDEX 3
+/** Number of global wait events. */
+#define COUNT_GLOBAL_EVENT_INDEX 4
+
+/** Transaction events are not wait events. */
+#define GLOBAL_TRANSACTION_INDEX 0
 
 /**
   Instrument controlling all table io.
@@ -364,6 +436,8 @@ extern PFS_instr_class global_table_lock_class;
 */
 extern PFS_instr_class global_idle_class;
 
+extern PFS_instr_class global_metadata_class;
+
 struct PFS_file;
 
 /** Instrumentation metadata for a file. */
@@ -392,7 +466,14 @@ struct PFS_ALIGNED PFS_statement_class : public PFS_instr_class
 {
 };
 
-struct  PFS_socket;
+/** Instrumentation metadata for a transaction. */
+struct PFS_ALIGNED PFS_transaction_class : public PFS_instr_class
+{
+};
+
+extern PFS_transaction_class global_transaction_class;
+
+struct PFS_socket;
 
 /** Instrumentation metadata for a socket. */
 struct PFS_ALIGNED PFS_socket_class : public PFS_instr_class
@@ -403,6 +484,20 @@ struct PFS_ALIGNED PFS_socket_class : public PFS_instr_class
   PFS_socket *m_singleton;
 };
 
+/** Instrumentation metadata for a memory instrument. */
+struct PFS_ALIGNED PFS_memory_class : public PFS_instr_class
+{
+  /* Both predicates test a PSI_FLAG_* mask against the inherited m_flags. */
+  bool is_global() const
+  {
+    return (m_flags & PSI_FLAG_GLOBAL) != 0;
+  }
+  bool is_transferable() const
+  {
+    return (m_flags & PSI_FLAG_TRANSFER) != 0;
+  }
+};
+
 void init_event_name_sizing(const PFS_global_param *param);
 
 void register_global_classes();
@@ -416,7 +511,18 @@ int init_thread_class(uint thread_class_sizing);
 void cleanup_thread_class();
 int init_table_share(uint table_share_sizing);
 void cleanup_table_share();
-int init_table_share_hash();
+
+int init_table_share_lock_stat(uint table_stat_sizing);
+void cleanup_table_share_lock_stat();
+PFS_table_share_lock* create_table_share_lock_stat();
+void release_table_share_lock_stat(PFS_table_share_lock *pfs);
+
+int init_table_share_index_stat(uint index_stat_sizing);
+void cleanup_table_share_index_stat();
+PFS_table_share_index* create_table_share_index_stat(const TABLE_SHARE *share, uint index);
+void release_table_share_index_stat(PFS_table_share_index *pfs);
+
+int init_table_share_hash(const PFS_global_param *param);
 void cleanup_table_share_hash();
 int init_file_class(uint file_class_sizing);
 void cleanup_file_class();
@@ -426,6 +532,8 @@ int init_statement_class(uint statement_class_sizing);
 void cleanup_statement_class();
 int init_socket_class(uint socket_class_sizing);
 void cleanup_socket_class();
+int init_memory_class(uint memory_class_sizing);
+void cleanup_memory_class();
 
 PFS_sync_key register_mutex_class(const char *name, uint name_length,
                                   int flags);
@@ -453,6 +561,9 @@ PFS_statement_key register_statement_class(const char *name, uint name_length,
 PFS_socket_key register_socket_class(const char *name, uint name_length,
                                      int flags);
 
+PFS_memory_key register_memory_class(const char *name, uint name_length,
+                                     int flags);
+
 PFS_mutex_class *find_mutex_class(PSI_mutex_key key);
 PFS_mutex_class *sanitize_mutex_class(PFS_mutex_class *unsafe);
 PFS_rwlock_class *find_rwlock_class(PSI_rwlock_key key);
@@ -471,8 +582,14 @@ PFS_instr_class *find_table_class(uint index);
 PFS_instr_class *sanitize_table_class(PFS_instr_class *unsafe);
 PFS_socket_class *find_socket_class(PSI_socket_key key);
 PFS_socket_class *sanitize_socket_class(PFS_socket_class *unsafe);
+PFS_memory_class *find_memory_class(PSI_memory_key key);
+PFS_memory_class *sanitize_memory_class(PFS_memory_class *unsafe);
 PFS_instr_class *find_idle_class(uint index);
 PFS_instr_class *sanitize_idle_class(PFS_instr_class *unsafe);
+PFS_instr_class *find_metadata_class(uint index);
+PFS_instr_class *sanitize_metadata_class(PFS_instr_class *unsafe);
+PFS_transaction_class *find_transaction_class(uint index);
+PFS_transaction_class *sanitize_transaction_class(PFS_transaction_class *unsafe);
 
 PFS_table_share *find_or_create_table_share(PFS_thread *thread,
                                             bool temporary,
@@ -499,10 +616,11 @@ extern ulong stage_class_max;
 extern ulong stage_class_lost;
 extern ulong statement_class_max;
 extern ulong statement_class_lost;
+extern ulong transaction_class_max;
 extern ulong socket_class_max;
 extern ulong socket_class_lost;
-extern ulong table_share_max;
-extern ulong table_share_lost;
+extern ulong memory_class_max;
+extern ulong memory_class_lost;
 
 /* Exposing the data directly, for iterators. */
 
@@ -510,7 +628,6 @@ extern PFS_mutex_class *mutex_class_array;
 extern PFS_rwlock_class *rwlock_class_array;
 extern PFS_cond_class *cond_class_array;
 extern PFS_file_class *file_class_array;
-extern PFS_table_share *table_share_array;
 
 void reset_events_waits_by_class();
 void reset_file_class_io();
@@ -519,6 +636,9 @@ void reset_socket_class_io();
 /** Update derived flags for all table shares. */
 void update_table_share_derived_flags(PFS_thread *thread);
 
+/** Update derived flags for all stored procedure shares. */
+void update_program_share_derived_flags(PFS_thread *thread);
+
 extern LF_HASH table_share_hash;
 
 /** @} */
diff --git a/storage/perfschema/pfs_lock.h b/storage/perfschema/pfs_lock.h
index b74131c79e1..4c1674c5e67 100644
--- a/storage/perfschema/pfs_lock.h
+++ b/storage/perfschema/pfs_lock.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2009, 2016, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2009, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,8 +28,15 @@
   Performance schema internal locks (declarations).
 */
 
+#include "my_global.h"
+
 #include "pfs_atomic.h"
 
+/* For testing only: replace every memory order with relaxed, to cause bugs. */
+// #define MEM(X) std::memory_order_relaxed
+/* Correct code: MEM(X) keeps the memory order X as written. */
+#define MEM(X) X
+
 /**
   @addtogroup Performance_schema_buffers
   @{
@@ -61,6 +68,16 @@
 #define STATE_MASK   0x00000003
 #define VERSION_INC  4
 
+struct pfs_optimistic_state
+{
+  uint32 m_version_state; /**< Copy of pfs_lock::m_version_state saved by begin_optimistic_lock(). */
+};
+
+struct pfs_dirty_state
+{
+  uint32 m_version_state; /**< Value written to pfs_lock::m_version_state by the transition that set DIRTY. */
+};
+
 /**
   A 'lock' protecting performance schema internal buffers.
   This lock is used to mark the state of a record.
@@ -86,19 +103,34 @@ struct pfs_lock
     The version number is stored in the high 30 bits.
     The state is stored in the low 2 bits.
   */
-  volatile uint32 m_version_state;
+  uint32 m_version_state;
+
+  uint32 copy_version_state()
+  {
+    /*
+      Dirty read: a plain load, not PFS_atomic::load_u32().
+    */
+    const uint32 snapshot= m_version_state;
+    return snapshot;
+  }
 
   /** Returns true if the record is free. */
   bool is_free(void)
   {
-    uint32 copy= m_version_state; /* non volatile copy, and dirty read */
+    uint32 copy;
+
+    copy= PFS_atomic::load_u32(&m_version_state);
+
     return ((copy & STATE_MASK) == PFS_LOCK_FREE);
   }
 
   /** Returns true if the record contains values that can be read. */
   bool is_populated(void)
   {
-    uint32 copy= m_version_state; /* non volatile copy, and dirty read */
+    uint32 copy;
+
+    copy= PFS_atomic::load_u32(&m_version_state);
+
     return ((copy & STATE_MASK) == PFS_LOCK_ALLOCATED);
   }
 
@@ -108,13 +140,28 @@ struct pfs_lock
     Only one writer will succeed to acquire the record.
     @return true if the operation succeed
   */
-  bool free_to_dirty(void)
+  bool free_to_dirty(pfs_dirty_state *copy_ptr)
   {
-    uint32 copy= m_version_state; /* non volatile copy, and dirty read */
-    uint32 old_val= (copy & VERSION_MASK) + PFS_LOCK_FREE;
-    uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_DIRTY;
+    uint32 old_val;
+
+    old_val= PFS_atomic::load_u32(&m_version_state);
+
+    if ((old_val & STATE_MASK) != PFS_LOCK_FREE)
+    {
+      return false;
+    }
 
-    return (PFS_atomic::cas_u32(&m_version_state, &old_val, new_val));
+    uint32 new_val= (old_val & VERSION_MASK) + PFS_LOCK_DIRTY;
+    bool pass;
+
+    pass= PFS_atomic::cas_u32(&m_version_state, &old_val, new_val);
+
+    if (pass)
+    {
+      copy_ptr->m_version_state= new_val;
+    }
+
+    return pass;
   }
 
   /**
@@ -122,15 +169,18 @@ struct pfs_lock
     This transition should be executed by the writer that owns the record,
     before the record is modified.
   */
-  void allocated_to_dirty(void)
+  void allocated_to_dirty(pfs_dirty_state *copy_ptr)
   {
-    uint32 copy= PFS_atomic::load_u32(&m_version_state);
+    uint32 copy= copy_version_state();
     /* Make sure the record was ALLOCATED. */
-    DBUG_ASSERT((copy & STATE_MASK) == PFS_LOCK_ALLOCATED);
+    assert((copy & STATE_MASK) == PFS_LOCK_ALLOCATED);
     /* Keep the same version, set the DIRTY state */
     uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_DIRTY;
     /* We own the record, no need to use compare and swap. */
+
     PFS_atomic::store_u32(&m_version_state, new_val);
+
+    copy_ptr->m_version_state= new_val;
   }
 
   /**
@@ -138,13 +188,13 @@ struct pfs_lock
     This transition should be executed by the writer that owns the record,
     after the record is in a state ready to be read.
   */
-  void dirty_to_allocated(void)
+  void dirty_to_allocated(const pfs_dirty_state *copy)
   {
-    uint32 copy= PFS_atomic::load_u32(&m_version_state);
     /* Make sure the record was DIRTY. */
-    DBUG_ASSERT((copy & STATE_MASK) == PFS_LOCK_DIRTY);
+    assert((copy->m_version_state & STATE_MASK) == PFS_LOCK_DIRTY);
     /* Increment the version, set the ALLOCATED state */
-    uint32 new_val= (copy & VERSION_MASK) + VERSION_INC + PFS_LOCK_ALLOCATED;
+    uint32 new_val= (copy->m_version_state & VERSION_MASK) + VERSION_INC + PFS_LOCK_ALLOCATED;
+
     PFS_atomic::store_u32(&m_version_state, new_val);
   }
 
@@ -156,35 +206,38 @@ struct pfs_lock
   void set_allocated(void)
   {
     /* Do not set the version to 0, read the previous value. */
-    uint32 copy= PFS_atomic::load_u32(&m_version_state);
+    uint32 copy= copy_version_state();
     /* Increment the version, set the ALLOCATED state */
     uint32 new_val= (copy & VERSION_MASK) + VERSION_INC + PFS_LOCK_ALLOCATED;
+
     PFS_atomic::store_u32(&m_version_state, new_val);
   }
 
   /**
     Initialize a lock to dirty.
   */
-  void set_dirty(void)
+  void set_dirty(pfs_dirty_state *copy_ptr)
   {
     /* Do not set the version to 0, read the previous value. */
     uint32 copy= PFS_atomic::load_u32(&m_version_state);
     /* Increment the version, set the DIRTY state */
     uint32 new_val= (copy & VERSION_MASK) + VERSION_INC + PFS_LOCK_DIRTY;
     PFS_atomic::store_u32(&m_version_state, new_val);
+
+    copy_ptr->m_version_state= new_val;
   }
 
   /**
     Execute a dirty to free transition.
     This transition should be executed by the writer that owns the record.
   */
-  void dirty_to_free(void)
+  void dirty_to_free(const pfs_dirty_state *copy)
   {
-    uint32 copy= PFS_atomic::load_u32(&m_version_state);
     /* Make sure the record was DIRTY. */
-    DBUG_ASSERT((copy & STATE_MASK) == PFS_LOCK_DIRTY);
+    assert((copy->m_version_state & STATE_MASK) == PFS_LOCK_DIRTY);
     /* Keep the same version, set the FREE state */
-    uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_FREE;
+    uint32 new_val= (copy->m_version_state & VERSION_MASK) + PFS_LOCK_FREE;
+
     PFS_atomic::store_u32(&m_version_state, new_val);
   }
 
@@ -198,22 +251,22 @@ struct pfs_lock
       If this record is not in the ALLOCATED state and the caller is trying
       to free it, this is a bug: the caller is confused,
       and potentially damaging data owned by another thread or object.
-      The correct assert to use here to guarantee data integrity is simply:
-        DBUG_ASSERT(m_state == PFS_LOCK_ALLOCATED);
     */
-    uint32 copy= PFS_atomic::load_u32(&m_version_state);
+    uint32 copy= copy_version_state();
     /* Make sure the record was ALLOCATED. */
-    DBUG_ASSERT(((copy & STATE_MASK) == PFS_LOCK_ALLOCATED));
+    assert(((copy & STATE_MASK) == PFS_LOCK_ALLOCATED));
     /* Keep the same version, set the FREE state */
     uint32 new_val= (copy & VERSION_MASK) + PFS_LOCK_FREE;
+
     PFS_atomic::store_u32(&m_version_state, new_val);
   }
 
   /**
     Start an optimistic read operation.
+    @param [out] copy Saved lock state
     @sa end_optimist_lock.
   */
-  void begin_optimistic_lock(struct pfs_lock *copy)
+  void begin_optimistic_lock(struct pfs_optimistic_state *copy)
   {
     copy->m_version_state= PFS_atomic::load_u32(&m_version_state);
   }
@@ -221,16 +274,21 @@ struct pfs_lock
   /**
     End an optimistic read operation.
     @sa begin_optimist_lock.
+    @param copy Saved lock state
     @return true if the data read is safe to use.
   */
-  bool end_optimistic_lock(struct pfs_lock *copy)
+  bool end_optimistic_lock(const struct pfs_optimistic_state *copy)
   {
+    uint32 version_state;
+
     /* Check there was valid data to look at. */
     if ((copy->m_version_state & STATE_MASK) != PFS_LOCK_ALLOCATED)
       return false;
 
+    version_state= PFS_atomic::load_u32(&m_version_state);
+
     /* Check the version + state has not changed. */
-    if (copy->m_version_state != PFS_atomic::load_u32(&m_version_state))
+    if (copy->m_version_state != version_state)
       return false;
 
     return true;
@@ -238,7 +296,11 @@ struct pfs_lock
 
   uint32 get_version()
   {
-    return (PFS_atomic::load_u32(&m_version_state) & VERSION_MASK);
+    uint32 version_state;
+
+    version_state= PFS_atomic::load_u32(&m_version_state);
+
+    return (version_state & VERSION_MASK);
   }
 };
 
diff --git a/storage/perfschema/pfs_memory.cc b/storage/perfschema/pfs_memory.cc
new file mode 100644
index 00000000000..d2185ecc648
--- /dev/null
+++ b/storage/perfschema/pfs_memory.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_memory.cc
+  Memory statistics aggregation (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_atomic.h"
+#include "pfs_buffer_container.h"
+#include "m_string.h"
+
+static void fct_reset_memory_by_thread(PFS_thread *pfs)
+{
+  PFS_account *safe_account= sanitize_account(pfs->m_account);
+  PFS_user *safe_user= sanitize_user(pfs->m_user);
+  PFS_host *safe_host= sanitize_host(pfs->m_host);
+  aggregate_thread_memory(true, pfs, safe_account, safe_user, safe_host);
+}
+
+/** Reset table MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME data via global_thread_container. */
+void reset_memory_by_thread()
+{
+  global_thread_container.apply(fct_reset_memory_by_thread);
+}
+
+static void fct_reset_memory_by_account(PFS_account *pfs)
+{
+  PFS_user *safe_user= sanitize_user(pfs->m_user);
+  PFS_host *safe_host= sanitize_host(pfs->m_host);
+  pfs->aggregate_memory(true, safe_user, safe_host);
+}
+
+/** Reset table MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data via global_account_container. */
+void reset_memory_by_account()
+{
+  global_account_container.apply(fct_reset_memory_by_account);
+}
+
+static void fct_reset_memory_by_user(PFS_user *pfs)
+{
+  pfs->aggregate_memory(true); /* callback body for reset_memory_by_user() */
+}
+
+/** Reset table MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME data via global_user_container. */
+void reset_memory_by_user()
+{
+  global_user_container.apply(fct_reset_memory_by_user);
+}
+
+static void fct_reset_memory_by_host(PFS_host *pfs)
+{
+  pfs->aggregate_memory(true); /* callback body for reset_memory_by_host() */
+}
+
+/** Reset table MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME data via global_host_container. */
+void reset_memory_by_host()
+{
+  global_host_container.apply(fct_reset_memory_by_host);
+}
+
+/** Reset table MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME data. */
+void reset_memory_global()
+{
+  /* Rebase each of the memory_class_max global memory statistics. */
+  PFS_memory_stat *const stats= global_instr_class_memory_array;
+
+  for (ulong index= 0; index < memory_class_max; index++)
+    stats[index].rebase();
+}
+
diff --git a/storage/perfschema/pfs_memory.h b/storage/perfschema/pfs_memory.h
new file mode 100644
index 00000000000..a3e753656ce
--- /dev/null
+++ b/storage/perfschema/pfs_memory.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_MEMORY_H
+#define PFS_MEMORY_H
+
+/**
+  @file storage/perfschema/pfs_memory.h
+  Memory statistics aggregation (declarations).
+*/
+
+void reset_memory_by_thread();
+void reset_memory_by_account();
+void reset_memory_by_user();
+void reset_memory_by_host();
+void reset_memory_global();
+
+#endif
+
diff --git a/storage/perfschema/pfs_prepared_stmt.cc b/storage/perfschema/pfs_prepared_stmt.cc
new file mode 100644
index 00000000000..ccb49f3290d
--- /dev/null
+++ b/storage/perfschema/pfs_prepared_stmt.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_prepared_stmt.cc
+  Prepared Statement data structures (implementation).
+*/
+
+/*
+  This code needs extra visibility in the lexer structures
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_instr.h"
+#include "pfs_prepared_stmt.h"
+#include "pfs_global.h"
+#include "sql_string.h"
+#include "pfs_buffer_container.h"
+#include <string.h>
+
+/**
+  Initialize the buffer of table PREPARED_STATEMENTS_INSTANCES.
+  @param param performance schema sizing
+  @return 0 on success, 1 if the container failed to initialize
+*/
+int init_prepared_stmt(const PFS_global_param *param)
+{
+  if (global_prepared_stmt_container.init(param->m_prepared_stmt_sizing) != 0)
+    return 1;
+  reset_prepared_stmt_instances();
+  return 0;
+}
+
+/** Cleanup table PREPARED_STATEMENTS_INSTANCES. */
+void cleanup_prepared_stmt(void)
+{
+  global_prepared_stmt_container.cleanup();
+}
+
+void PFS_prepared_stmt::reset_data()
+{
+  m_prepare_stat.reset();   /* prepare statistics */
+  m_reprepare_stat.reset(); /* re-prepare statistics */
+  m_execute_stat.reset();   /* execute statistics */
+}
+
+static void fct_reset_prepared_stmt_instances(PFS_prepared_stmt *pfs)
+{
+  pfs->reset_data(); /* callback body for reset_prepared_stmt_instances() */
+}
+
+void reset_prepared_stmt_instances()
+{
+  global_prepared_stmt_container.apply_all(fct_reset_prepared_stmt_instances);
+}
+
+/** Create a prepared statement record; returns NULL when allocation fails. */
+PFS_prepared_stmt*
+create_prepared_stmt(void *identity,
+                     PFS_thread *thread, PFS_program *pfs_program,
+                     PFS_events_statements *pfs_stmt, uint stmt_id,
+                     const char* stmt_name, uint stmt_name_length)
+{
+  PFS_prepared_stmt *pfs= NULL;
+  pfs_dirty_state dirty_state;
+
+  /* Create a new record in prepared stmt stat array. */
+  pfs= global_prepared_stmt_container.allocate(& dirty_state);
+  if (pfs != NULL)
+  {
+    /* Reset the stats. */
+    pfs->reset_data();
+    /* Do the assignments. */
+    pfs->m_identity= identity;
+    pfs->m_sqltext_length= 0;
+
+    if (stmt_name != NULL)
+    {
+      pfs->m_stmt_name_length= stmt_name_length;
+      if (pfs->m_stmt_name_length > PS_NAME_LENGTH)
+        pfs->m_stmt_name_length= PS_NAME_LENGTH;
+      strncpy(pfs->m_stmt_name, stmt_name, pfs->m_stmt_name_length);
+    }
+    else
+      pfs->m_stmt_name_length= 0;
+
+    pfs->m_stmt_id= stmt_id;
+    pfs->m_owner_thread_id= thread->m_thread_internal_id;
+
+    /* If this statement prepare is called from a SP. */
+    if (pfs_program)
+    {
+      pfs->m_owner_object_type= pfs_program->m_type;
+      strncpy(pfs->m_owner_object_schema, pfs_program->m_schema_name, pfs_program->m_schema_name_length);
+      pfs->m_owner_object_schema_length= pfs_program->m_schema_name_length;
+      strncpy(pfs->m_owner_object_name, pfs_program->m_object_name, pfs_program->m_object_name_length);
+      pfs->m_owner_object_name_length= pfs_program->m_object_name_length;
+    }
+    else
+    {
+      pfs->m_owner_object_type= NO_OBJECT_TYPE;
+      pfs->m_owner_object_schema_length= 0;
+      pfs->m_owner_object_name_length= 0;
+    }
+
+    /* Always assign the owner event id, so it is never left unassigned. */
+    if (pfs_stmt == NULL)
+      pfs->m_owner_event_id= 0;
+    else if (pfs_program)
+      pfs->m_owner_event_id= pfs_stmt->m_event.m_nesting_event_id;
+    else
+      pfs->m_owner_event_id= pfs_stmt->m_event.m_event_id;
+
+    /* Insert this record. */
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+  }
+
+  return pfs;
+}
+
+void delete_prepared_stmt(PFS_prepared_stmt *pfs)
+{
+  /* Hand the record back to the prepared statement container. */
+  global_prepared_stmt_container.deallocate(pfs);
+}
diff --git a/storage/perfschema/pfs_prepared_stmt.h b/storage/perfschema/pfs_prepared_stmt.h
new file mode 100644
index 00000000000..1014d9c6af6
--- /dev/null
+++ b/storage/perfschema/pfs_prepared_stmt.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_PS_H
+#define PFS_PS_H
+
+/**
+  @file storage/perfschema/pfs_prepared_stmt.h
+  Prepared Statement data structures (declarations).
+*/
+
+#include "pfs_stat.h"
+#include "mysql/psi/psi.h"
+#include "mysql/psi/mysql_ps.h"
+#include "pfs_program.h"
+
+#define PS_NAME_LENGTH NAME_LEN
+
+struct PFS_ALIGNED PFS_prepared_stmt : public PFS_instr
+{
+  /** Column OBJECT_INSTANCE_BEGIN. */
+  const void *m_identity;
+
+  /** Column STATEMENT_ID. */
+  ulonglong m_stmt_id;
+
+  /** Column STATEMENT_NAME. */
+  char m_stmt_name[PS_NAME_LENGTH];
+  uint m_stmt_name_length;
+
+  /** Column SQL_TEXT. */
+  char m_sqltext[COL_INFO_SIZE];
+  uint m_sqltext_length;
+
+  /** Column OWNER_THREAD_ID. */
+  ulonglong m_owner_thread_id;
+
+  /** Column OWNER_EVENT_ID. */
+  ulonglong m_owner_event_id;
+
+  /** Column OWNER_OBJECT_TYPE. */
+  enum_object_type m_owner_object_type;
+
+  /** Column OWNER_OBJECT_SCHEMA. */
+  char m_owner_object_schema[COL_OBJECT_SCHEMA_SIZE];
+  uint m_owner_object_schema_length;
+
+  /** Column OWNER_OBJECT_NAME. */
+  char m_owner_object_name[COL_OBJECT_NAME_SIZE];
+  uint m_owner_object_name_length;
+
+  /** Column TIMER_PREPARE. Prepared stmt prepare stat. */
+  PFS_single_stat m_prepare_stat;
+
+  /** Column COUNT_REPREPARE. Prepared stmt reprepare stat. */
+  PFS_single_stat m_reprepare_stat;
+
+  /** Prepared stmt execution stat. */
+  PFS_statement_stat m_execute_stat;
+
+  /** Reset data for this record. */
+  void reset_data();
+};
+
+int init_prepared_stmt(const PFS_global_param *param);
+void cleanup_prepared_stmt(void);
+
+void reset_prepared_stmt_instances();
+
+PFS_prepared_stmt*
+create_prepared_stmt(void *identity,
+                     PFS_thread *thread, PFS_program *pfs_program,
+                     PFS_events_statements *pfs_stmt, uint stmt_id,
+                     const char* stmt_name, uint stmt_name_length);
+void delete_prepared_stmt(PFS_prepared_stmt *pfs_ps);
+#endif
diff --git a/storage/perfschema/pfs_program.cc b/storage/perfschema/pfs_program.cc
new file mode 100644
index 00000000000..00b8082ec53
--- /dev/null
+++ b/storage/perfschema/pfs_program.cc
@@ -0,0 +1,322 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_program.cc
+  Stored Program data structures (implementation).
+*/
+
+/*
+  This code needs extra visibility in the lexer structures
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_instr.h"
+#include "pfs_program.h"
+#include "pfs_global.h"
+#include "sql_string.h"
+#include "pfs_setup_object.h"
+#include "pfs_buffer_container.h"
+#include "mysqld.h"                //system_charset_info
+#include <string.h>
+
+LF_HASH program_hash;
+static bool program_hash_inited= false;
+
+/**
+  Initialize table EVENTS_STATEMENTS_SUMMARY_BY_PROGRAM.
+  @param param performance schema sizing
+  @return 0 on success, 1 otherwise
+*/
+int init_program(const PFS_global_param *param)
+{
+  const int rc= global_program_container.init(param->m_program_sizing) ? 1 : 0;
+  if (rc == 0)
+    reset_esms_by_program();
+  return rc;
+}
+
+/** Cleanup table EVENTS_STATEMENTS_SUMMARY_BY_PROGRAM, by cleaning up global_program_container. */
+void cleanup_program(void)
+{
+  global_program_container.cleanup();
+}
+
+C_MODE_START
+/**
+  LF_HASH key callback: entry points at a PFS_program pointer, the key is its m_key.
+*/
+static uchar *program_hash_get_key(const uchar *entry, size_t *length,
+                                   my_bool)
+{
+  const PFS_program * const *slot=
+    reinterpret_cast<const PFS_program* const *> (entry);
+  assert(slot != NULL);
+  const PFS_program *pfs= *slot;
+  assert(pfs != NULL);
+  *length= pfs->m_key.m_key_length;
+  return reinterpret_cast<uchar*> (const_cast<char*> (pfs->m_key.m_hash_key));
+}
+C_MODE_END
+
+/**
+  Set up the program hash, once, and only when programs are sized.
+  @param param performance schema sizing
+  @return always 0
+*/
+int init_program_hash(const PFS_global_param *param)
+{
+  if (program_hash_inited || (param->m_program_sizing == 0))
+    return 0;
+  lf_hash_init(&program_hash, sizeof(PFS_program*), LF_HASH_UNIQUE,
+               0, 0, program_hash_get_key, &my_charset_bin);
+  program_hash_inited= true;
+  return 0;
+}
+
+/** Destroy the program hash, if it was initialized. */
+void cleanup_program_hash(void)
+{
+  if (! program_hash_inited)
+    return;
+
+  lf_hash_destroy(&program_hash);
+  program_hash_inited= false;
+}
+
+/**
+  Build the hash key identifying a program of a given type, name and schema.
+*/
+static void set_program_key(PFS_program_key *key,
+                            enum_object_type object_type,
+                            const char *object_name, uint object_name_length,
+                            const char *schema_name, uint schema_name_length)
+{
+  assert(object_name_length <= COL_OBJECT_NAME_SIZE);
+  assert(schema_name_length <= COL_OBJECT_SCHEMA_SIZE);
+
+  /*
+    Key layout: <object_type><object_name><0x00><schema_name><0x00>.
+    Both names are lowercased so that the generated key is case insensitive.
+  */
+  char *const key_start= key->m_hash_key;
+  char *cursor= key_start;
+
+  *cursor++= object_type;
+
+  if (object_name_length > 0)
+  {
+    char lowered_name[COL_OBJECT_NAME_SIZE + 1];
+    memcpy(lowered_name, object_name, object_name_length);
+    lowered_name[object_name_length]= '\0';
+    my_casedn_str(system_charset_info, lowered_name);
+    memcpy(cursor, lowered_name, object_name_length);
+    cursor+= object_name_length;
+  }
+  *cursor++= 0;
+
+  if (schema_name_length > 0)
+  {
+    char lowered_schema[COL_OBJECT_SCHEMA_SIZE + 1];
+    memcpy(lowered_schema, schema_name, schema_name_length);
+    lowered_schema[schema_name_length]='\0';
+    my_casedn_str(system_charset_info, lowered_schema);
+    memcpy(cursor, lowered_schema, schema_name_length);
+    cursor+= schema_name_length;
+  }
+  *cursor++= 0;
+
+  key->m_key_length= static_cast<uint>(cursor - key_start);
+}
+
+/** Reset the statistics of this record: the stored program stat
+    and the sub statement stat. */
+void PFS_program::reset_data()
+{
+  m_sp_stat.reset();
+  m_stmt_stat.reset();
+}
+
+static void fct_reset_esms_by_program(PFS_program *pfs)
+{
+  pfs->reset_data();
+}
+
+void reset_esms_by_program()
+{
+  global_program_container.apply_all(fct_reset_esms_by_program);
+}
+
+/** Get the program hash pins of a thread, acquiring them on first use. */
+static LF_PINS* get_program_hash_pins(PFS_thread *thread)
+{
+  if (likely(thread->m_program_hash_pins != NULL))
+    return thread->m_program_hash_pins;
+  if (! program_hash_inited)
+    return NULL;
+  thread->m_program_hash_pins= lf_hash_get_pins(&program_hash);
+  return thread->m_program_hash_pins;
+}
+
+/**
+  Find the record of a stored program, creating it on first use.
+  @param thread the calling thread, owner of the hash pins used
+  @param object_type the program type
+  @param object_name the program name
+  @param object_name_length the program name length
+  @param schema_name the program schema name
+  @param schema_name_length the program schema name length
+  @return the program record, or NULL when none could be found or created
+*/
+PFS_program*
+find_or_create_program(PFS_thread *thread,
+                      enum_object_type object_type,
+                      const char *object_name,
+                      uint object_name_length,
+                      const char *schema_name,
+                      uint schema_name_length)
+{
+  LF_PINS *pins= get_program_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    global_program_container.m_lost++;
+    return NULL;
+  }
+
+  /* Prepare program key */
+  PFS_program_key key;
+  set_program_key(&key, object_type,
+                  object_name, object_name_length,
+                  schema_name, schema_name_length);
+
+  const uint retry_max= 3;
+  uint retry_count= 0;
+  bool is_enabled, is_timed;
+  pfs_dirty_state dirty_state;
+
+  for (;;)
+  {
+    PFS_program **entry= reinterpret_cast<PFS_program**>
+      (lf_hash_search(&program_hash, pins,
+                      key.m_hash_key, key.m_key_length));
+
+    if (entry && (entry != MY_ERRPTR))
+    {
+      /* The record already exists, hand it out. */
+      PFS_program *found= *entry;
+      lf_hash_search_unpin(pins);
+      return found;
+    }
+
+    lf_hash_search_unpin(pins);
+
+    /*
+      The record is about to be inserted in the program array:
+      find out if it is enabled and timed.
+    */
+    lookup_setup_object(thread, object_type,
+                        schema_name, schema_name_length,
+                        object_name, object_name_length,
+                        &is_enabled, &is_timed);
+
+    /* Create a new record in program stat array. */
+    PFS_program *pfs= global_program_container.allocate(& dirty_state);
+    if (pfs == NULL)
+      return NULL;
+
+    /* Do the assignments. */
+    memcpy(pfs->m_key.m_hash_key, key.m_hash_key, key.m_key_length);
+    pfs->m_key.m_key_length= key.m_key_length;
+    pfs->m_type= object_type;
+
+    /* The names are not copied, they point inside the record's own key. */
+    pfs->m_object_name= pfs->m_key.m_hash_key + 1;
+    pfs->m_object_name_length= object_name_length;
+    pfs->m_schema_name= pfs->m_object_name + object_name_length + 1;
+    pfs->m_schema_name_length= schema_name_length;
+    pfs->m_enabled= is_enabled;
+    pfs->m_timed= is_timed;
+
+    /* Insert this record. */
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    const int res= lf_hash_insert(&program_hash, pins, &pfs);
+    if (likely(res == 0))
+      return pfs;
+
+    global_program_container.deallocate(pfs);
+
+    /*
+      res < 0 is an OOM in lf_hash_insert, res > 0 a duplicate insert by
+      another thread: search again, but not forever.
+    */
+    if ((res < 0) || (++retry_count > retry_max))
+    {
+      global_program_container.m_lost++;
+      return NULL;
+    }
+  }
+}
+
+/**
+  Remove the record of a stored program, if any, from the program hash
+  and give it back to the global program container.
+*/
+void drop_program(PFS_thread *thread,
+                 enum_object_type object_type,
+                 const char *object_name,
+                 uint object_name_length,
+                 const char *schema_name,
+                 uint schema_name_length)
+{
+  LF_PINS *pins= get_program_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return;
+
+  /* Prepare program key */
+  PFS_program_key key;
+  set_program_key(&key, object_type,
+                  object_name, object_name_length,
+                  schema_name, schema_name_length);
+
+  PFS_program **entry= reinterpret_cast<PFS_program**>
+    (lf_hash_search(&program_hash, pins,
+                    key.m_hash_key, key.m_key_length));
+
+  if ((entry != NULL) && (entry != MY_ERRPTR))
+  {
+    PFS_program *pfs= *entry;
+    lf_hash_delete(&program_hash, pins,
+                   key.m_hash_key, key.m_key_length);
+    global_program_container.deallocate(pfs);
+  }
+
+  lf_hash_search_unpin(pins);
+}
+
+void PFS_program::refresh_setup_object_flags(PFS_thread *thread)
+{
+  lookup_setup_object(thread, m_type,
+                      m_schema_name, m_schema_name_length,
+                      m_object_name, m_object_name_length,
+                      &m_enabled, &m_timed);
+}
diff --git a/storage/perfschema/pfs_program.h b/storage/perfschema/pfs_program.h
new file mode 100644
index 00000000000..a34f533d444
--- /dev/null
+++ b/storage/perfschema/pfs_program.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_PROGRAM_H
+#define PFS_PROGRAM_H
+
+/**
+  @file storage/perfschema/pfs_program.h
+  Stored Program data structures (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_stat.h"
+/** Size of the program hash key buffer; parenthesized so that it expands safely in any expression. */
+#define PROGRAM_HASH_KEY_LENGTH (sizeof(enum_object_type) + COL_OBJECT_NAME_SIZE + 1 + COL_OBJECT_SCHEMA_SIZE + 1)
+
+extern LF_HASH program_hash;
+
+/** Hash key for a program. */
+struct PFS_program_key
+{
+  /**
+    Hash search key.
+    This has to be a string for LF_HASH,
+    the format is "<object_type><object_name><0x00><schema_name><0x00>",
+    where <object_type> is stored in a single byte.
+  */
+  char m_hash_key[PROGRAM_HASH_KEY_LENGTH];
+  /** Number of bytes of m_hash_key that make up the key. */
+  uint m_key_length;
+};
+
+struct PFS_ALIGNED PFS_program : public PFS_instr
+{
+  /** Object type. */
+  enum_object_type m_type;
+
+  /** Object name, points inside the hash key m_key. */
+  const char *m_object_name;
+  int m_object_name_length;
+
+  /** Object Schema name, points inside the hash key m_key. */
+  const char *m_schema_name;
+  int m_schema_name_length;
+
+  /** Hash key */
+  PFS_program_key m_key;
+
+  /** Sub statement stat. */
+  PFS_statement_stat m_stmt_stat;
+
+  /** Stored program stat. */
+  PFS_sp_stat m_sp_stat;
+
+  /** Refresh the m_enabled and m_timed flags, using lookup_setup_object(). */
+  void refresh_setup_object_flags(PFS_thread* thread);
+
+  /** Reset the statistics of this record. */
+  void reset_data();
+};
+
+int init_program(const PFS_global_param *param);
+void cleanup_program(void);
+int init_program_hash(const PFS_global_param *param);
+void cleanup_program_hash(void);
+
+void reset_esms_by_program();
+
+PFS_program*
+find_or_create_program(PFS_thread *thread,
+                      enum_object_type object_type,
+                      const char *object_name,
+                      uint object_name_length,
+                      const char *schema,
+                      uint schema_length);
+
+void
+drop_program(PFS_thread *thread,
+             enum_object_type object_type,
+             const char *object_name,
+             uint object_name_length,
+             const char *schema_name,
+             uint schema_name_length);
+#endif
diff --git a/storage/perfschema/pfs_server.cc b/storage/perfschema/pfs_server.cc
index f3f22bbcf4f..0a97f05c54a 100644
--- a/storage/perfschema/pfs_server.cc
+++ b/storage/perfschema/pfs_server.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -32,10 +32,12 @@
 #include "pfs.h"
 #include "pfs_global.h"
 #include "pfs_instr_class.h"
+#include "pfs_builtin_memory.h"
 #include "pfs_instr.h"
 #include "pfs_events_waits.h"
 #include "pfs_events_stages.h"
 #include "pfs_events_statements.h"
+#include "pfs_events_transactions.h"
 #include "pfs_timer.h"
 #include "pfs_setup_actor.h"
 #include "pfs_setup_object.h"
@@ -44,6 +46,9 @@
 #include "pfs_account.h"
 #include "pfs_defaults.h"
 #include "pfs_digest.h"
+#include "pfs_program.h"
+//#include "template_utils.h"
+#include "pfs_prepared_stmt.h"
 
 PFS_global_param pfs_param;
 
@@ -56,48 +61,69 @@ C_MODE_END
 static void cleanup_performance_schema(void);
 void cleanup_instrument_config(void);
 
-struct PSI_bootstrap*
-initialize_performance_schema(PFS_global_param *param)
+void pre_initialize_performance_schema()
 {
   pfs_initialized= false;
 
+  init_all_builtin_memory_class();
+
   PFS_table_stat::g_reset_template.reset();
   global_idle_stat.reset();
   global_table_io_stat.reset();
   global_table_lock_stat.reset();
 
-  pfs_automated_sizing(param);
+  if (my_create_thread_local_key(&THR_PFS, destroy_pfs_thread))
+    return;
+  if (my_create_thread_local_key(&THR_PFS_VG, NULL))  // global_variables
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SV, NULL))  // session_variables
+    return;
+  if (my_create_thread_local_key(&THR_PFS_VBT, NULL)) // variables_by_thread
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SG, NULL))  // global_status
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SS, NULL))  // session_status
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SBT, NULL)) // status_by_thread
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SBU, NULL)) // status_by_user
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SBH, NULL)) // status_by_host
+    return;
+  if (my_create_thread_local_key(&THR_PFS_SBA, NULL)) // status_by_account
+    return;
+
+  THR_PFS_initialized= true;
+}
 
-  if (! param->m_enabled)
+struct PSI_bootstrap*
+initialize_performance_schema(PFS_global_param *param)
+{
+  if (!THR_PFS_initialized)
   {
-    /*
-      The performance schema is disabled in the startup command line.
-      All the instrumentation is turned off.
-    */
-    pfs_enabled= 0;
+    /* Pre-initialization failed. */
     return NULL;
   }
-  pfs_enabled= TRUE;
 
-  init_timers();
+  pfs_enabled= param->m_enabled;
 
+  pfs_automated_sizing(param);
+  init_timers();
   init_event_name_sizing(param);
   register_global_classes();
 
-  if (pthread_key_create(&THR_PFS, destroy_pfs_thread))
-    return NULL;
-
-  THR_PFS_initialized= true;
-
   if (init_sync_class(param->m_mutex_class_sizing,
                       param->m_rwlock_class_sizing,
                       param->m_cond_class_sizing) ||
       init_thread_class(param->m_thread_class_sizing) ||
       init_table_share(param->m_table_share_sizing) ||
+      init_table_share_lock_stat(param->m_table_lock_stat_sizing) ||
+      init_table_share_index_stat(param->m_index_stat_sizing) ||
       init_file_class(param->m_file_class_sizing) ||
       init_stage_class(param->m_stage_class_sizing) ||
       init_statement_class(param->m_statement_class_sizing) ||
       init_socket_class(param->m_socket_class_sizing) ||
+      init_memory_class(param->m_memory_class_sizing) ||
       init_instruments(param) ||
       init_events_waits_history_long(
         param->m_events_waits_history_long_sizing) ||
@@ -105,20 +131,25 @@ initialize_performance_schema(PFS_global_param *param)
         param->m_events_stages_history_long_sizing) ||
       init_events_statements_history_long(
         param->m_events_statements_history_long_sizing) ||
-      init_file_hash() ||
-      init_table_share_hash() ||
+      init_events_transactions_history_long(
+        param->m_events_transactions_history_long_sizing) ||
+      init_file_hash(param) ||
+      init_table_share_hash(param) ||
       init_setup_actor(param) ||
-      init_setup_actor_hash() ||
+      init_setup_actor_hash(param) ||
       init_setup_object(param) ||
-      init_setup_object_hash() ||
+      init_setup_object_hash(param) ||
       init_host(param) ||
-      init_host_hash() ||
+      init_host_hash(param) ||
       init_user(param) ||
-      init_user_hash() ||
+      init_user_hash(param) ||
       init_account(param) ||
-      init_account_hash() ||
+      init_account_hash(param) ||
       init_digest(param) ||
-      init_digest_hash())
+      init_digest_hash(param) ||
+      init_program(param) ||
+      init_program_hash(param) ||
+      init_prepared_stmt(param))
   {
     /*
       The performance schema initialization failed.
@@ -128,30 +159,59 @@ initialize_performance_schema(PFS_global_param *param)
     return NULL;
   }
 
+  if (param->m_enabled)
+  {
+    /** Default values for SETUP_CONSUMERS */
+    flag_events_stages_current=            param->m_consumer_events_stages_current_enabled;
+    flag_events_stages_history=            param->m_consumer_events_stages_history_enabled;
+    flag_events_stages_history_long=       param->m_consumer_events_stages_history_long_enabled;
+    flag_events_statements_current=        param->m_consumer_events_statements_current_enabled;
+    flag_events_statements_history=        param->m_consumer_events_statements_history_enabled;
+    flag_events_statements_history_long=   param->m_consumer_events_statements_history_long_enabled;
+    flag_events_transactions_current=      param->m_consumer_events_transactions_current_enabled;
+    flag_events_transactions_history=      param->m_consumer_events_transactions_history_enabled;
+    flag_events_transactions_history_long= param->m_consumer_events_transactions_history_long_enabled;
+    flag_events_waits_current=             param->m_consumer_events_waits_current_enabled;
+    flag_events_waits_history=             param->m_consumer_events_waits_history_enabled;
+    flag_events_waits_history_long=        param->m_consumer_events_waits_history_long_enabled;
+    flag_global_instrumentation=           param->m_consumer_global_instrumentation_enabled;
+    flag_thread_instrumentation=           param->m_consumer_thread_instrumentation_enabled;
+    flag_statements_digest=                param->m_consumer_statement_digest_enabled;
+  }
+  else
+  {
+    flag_events_stages_current= false;
+    flag_events_stages_history= false;
+    flag_events_stages_history_long= false;
+    flag_events_statements_current= false;
+    flag_events_statements_history= false;
+    flag_events_statements_history_long= false;
+    flag_events_transactions_current= false;
+    flag_events_transactions_history= false;
+    flag_events_transactions_history_long= false;
+    flag_events_waits_current= false;
+    flag_events_waits_history= false;
+    flag_events_waits_history_long= false;
+    flag_global_instrumentation= false;
+    flag_thread_instrumentation= false;
+    flag_statements_digest= false;
+  }
+
   pfs_initialized= true;
 
-  /** Default values for SETUP_CONSUMERS */
-  flag_events_stages_current=          param->m_consumer_events_stages_current_enabled;
-  flag_events_stages_history=          param->m_consumer_events_stages_history_enabled;
-  flag_events_stages_history_long=     param->m_consumer_events_stages_history_long_enabled;
-  flag_events_statements_current=      param->m_consumer_events_statements_current_enabled;
-  flag_events_statements_history=      param->m_consumer_events_statements_history_enabled;
-  flag_events_statements_history_long= param->m_consumer_events_statements_history_long_enabled;
-  flag_events_waits_current=           param->m_consumer_events_waits_current_enabled;
-  flag_events_waits_history=           param->m_consumer_events_waits_history_enabled;
-  flag_events_waits_history_long=      param->m_consumer_events_waits_history_long_enabled;
-  flag_global_instrumentation=         param->m_consumer_global_instrumentation_enabled;
-  flag_thread_instrumentation=         param->m_consumer_thread_instrumentation_enabled;
-  flag_statements_digest=              param->m_consumer_statement_digest_enabled;
-
-  install_default_setup(&PFS_bootstrap);
-  return &PFS_bootstrap;
+  if (param->m_enabled)
+  {
+    install_default_setup(&PFS_bootstrap);
+    return &PFS_bootstrap;
+  }
+
+  return NULL;
 }
 
 static void destroy_pfs_thread(void *key)
 {
   PFS_thread* pfs= reinterpret_cast<PFS_thread*> (key);
-  DBUG_ASSERT(pfs);
+  assert(pfs);
   /*
     This automatic cleanup is a last resort and best effort to avoid leaks,
     and may not work on windows due to the implementation of pthread_key_create().
@@ -169,50 +229,131 @@ static void destroy_pfs_thread(void *key)
 
 static void cleanup_performance_schema(void)
 {
+  /*
+    my.cnf options
+  */
+
   cleanup_instrument_config();
-  cleanup_instruments();
+
+  /*
+    All the LF_HASH
+  */
+
+  cleanup_setup_actor_hash();
+  cleanup_setup_object_hash();
+  cleanup_account_hash();
+  cleanup_host_hash();
+  cleanup_user_hash();
+  cleanup_program_hash();
+  cleanup_table_share_hash();
+  cleanup_file_hash();
+  cleanup_digest_hash();
+
+  /*
+    Then the lookup tables
+  */
+
+  cleanup_setup_actor();
+  cleanup_setup_object();
+
+  /*
+    Then the history tables
+  */
+
+  cleanup_events_waits_history_long();
+  cleanup_events_stages_history_long();
+  cleanup_events_statements_history_long();
+  cleanup_events_transactions_history_long();
+
+  /*
+    Then the various aggregations
+  */
+
+  cleanup_digest();
+  cleanup_account();
+  cleanup_host();
+  cleanup_user();
+
+  /*
+    Then the instrument classes.
+    Once a class is cleaned up,
+    find_XXX_class(key)
+    will return PSI_NOT_INSTRUMENTED
+  */
+  cleanup_program();
+  cleanup_prepared_stmt();
   cleanup_sync_class();
   cleanup_thread_class();
   cleanup_table_share();
+  cleanup_table_share_lock_stat();
+  cleanup_table_share_index_stat();
   cleanup_file_class();
   cleanup_stage_class();
   cleanup_statement_class();
   cleanup_socket_class();
-  cleanup_events_waits_history_long();
-  cleanup_events_stages_history_long();
-  cleanup_events_statements_history_long();
-  cleanup_table_share_hash();
-  cleanup_file_hash();
-  cleanup_setup_actor();
-  cleanup_setup_actor_hash();
-  cleanup_setup_object();
-  cleanup_setup_object_hash();
-  cleanup_host();
-  cleanup_host_hash();
-  cleanup_user();
-  cleanup_user_hash();
-  cleanup_account();
-  cleanup_account_hash();
-  cleanup_digest();
-  cleanup_digest_hash();
+  cleanup_memory_class();
+
+  cleanup_instruments();
 }
 
 void shutdown_performance_schema(void)
 {
   pfs_initialized= false;
+
+  /* disable everything, especially for this thread. */
+  flag_events_stages_current= false;
+  flag_events_stages_history= false;
+  flag_events_stages_history_long= false;
+  flag_events_statements_current= false;
+  flag_events_statements_history= false;
+  flag_events_statements_history_long= false;
+  flag_events_transactions_current= false;
+  flag_events_transactions_history= false;
+  flag_events_transactions_history_long= false;
+  flag_events_waits_current= false;
+  flag_events_waits_history= false;
+  flag_events_waits_history_long= false;
+  flag_global_instrumentation= false;
+  flag_thread_instrumentation= false;
+  flag_statements_digest= false;
+
+  global_table_io_class.m_enabled= false;
+  global_table_lock_class.m_enabled= false;
+  global_idle_class.m_enabled= false;
+  global_metadata_class.m_enabled= false;
+  global_transaction_class.m_enabled= false;
+
   cleanup_performance_schema();
-#if 0
   /*
     Be careful to not delete un-initialized keys,
     this would affect key 0, which is THR_KEY_mysys,
   */
   if (THR_PFS_initialized)
   {
-    my_pthread_setspecific_ptr(THR_PFS, NULL);
-    pthread_key_delete(THR_PFS);
+    my_set_thread_local(THR_PFS, NULL);
+    my_set_thread_local(THR_PFS_VG, NULL);  // global_variables
+    my_set_thread_local(THR_PFS_SV, NULL);  // session_variables
+    my_set_thread_local(THR_PFS_VBT, NULL); // variables_by_thread
+    my_set_thread_local(THR_PFS_SG, NULL);  // global_status
+    my_set_thread_local(THR_PFS_SS, NULL);  // session_status
+    my_set_thread_local(THR_PFS_SBT, NULL); // status_by_thread
+    my_set_thread_local(THR_PFS_SBU, NULL); // status_by_user
+    my_set_thread_local(THR_PFS_SBH, NULL); // status_by_host
+    my_set_thread_local(THR_PFS_SBA, NULL); // status_by_account
+
+    my_delete_thread_local_key(THR_PFS);
+    my_delete_thread_local_key(THR_PFS_VG);
+    my_delete_thread_local_key(THR_PFS_SV);
+    my_delete_thread_local_key(THR_PFS_VBT);
+    my_delete_thread_local_key(THR_PFS_SG);
+    my_delete_thread_local_key(THR_PFS_SS);
+    my_delete_thread_local_key(THR_PFS_SBT);
+    my_delete_thread_local_key(THR_PFS_SBU);
+    my_delete_thread_local_key(THR_PFS_SBH);
+    my_delete_thread_local_key(THR_PFS_SBA);
+
     THR_PFS_initialized= false;
   }
-#endif
 }
 
 /**
@@ -221,27 +362,22 @@ void shutdown_performance_schema(void)
 */
 void init_pfs_instrument_array()
 {
-  my_init_dynamic_array(&pfs_instr_config_array, sizeof(PFS_instr_config*),
-                        10, 10, MYF(0));
-  pfs_instr_config_state=  PFS_INSTR_CONFIG_ALLOCATED;
+  pfs_instr_config_array= new Pfs_instr_config_array((PSI_memory_key)PSI_NOT_INSTRUMENTED);
 }
 
 /**
-  Deallocate the PFS_INSTRUMENT array. Use an atomic compare-and-swap to ensure
-  that it is deallocated only once in the chaotic environment of server shutdown.
+  Deallocate the PFS_INSTRUMENT array.
 */
 void cleanup_instrument_config()
 {
-  int desired_state= PFS_INSTR_CONFIG_ALLOCATED;
-  
-  /* Ignore if another thread has already deallocated the array */
-  if (my_atomic_cas32(&pfs_instr_config_state, &desired_state, PFS_INSTR_CONFIG_DEALLOCATED))
+  if (pfs_instr_config_array != NULL)
   {
-    PFS_instr_config **array=dynamic_element(&pfs_instr_config_array, 0, PFS_instr_config**);
-    for (uint i=0; i < pfs_instr_config_array.elements; i++)
-      my_free(array[i]);
-    delete_dynamic(&pfs_instr_config_array);
+    PFS_instr_config **it= pfs_instr_config_array->front();
+    for ( ; it != pfs_instr_config_array->end(); it++)
+      my_free(*it);
   }
+  delete pfs_instr_config_array;
+  pfs_instr_config_array= NULL;
 }
 
 /**
@@ -260,16 +396,17 @@ int add_pfs_instr_to_array(const char* name, const char* value)
   size_t value_length= strlen(value);
 
   /* Allocate structure plus string buffers plus null terminators */
-  PFS_instr_config* e = (PFS_instr_config*)my_malloc(sizeof(PFS_instr_config)
+  PFS_instr_config* e = (PFS_instr_config*)my_malloc(PSI_NOT_INSTRUMENTED,
+                                                     sizeof(PFS_instr_config)
                        + name_length + 1 + value_length + 1, MYF(MY_WME));
   if (!e) return 1;
-  
+
   /* Copy the instrument name */
   e->m_name= (char*)e + sizeof(PFS_instr_config);
   memcpy(e->m_name, name, name_length);
   e->m_name_length= (uint)name_length;
   e->m_name[name_length]= '\0';
-  
+
   /* Set flags accordingly */
   if (!my_strcasecmp(&my_charset_latin1, value, "counted"))
   {
@@ -301,7 +438,7 @@ int add_pfs_instr_to_array(const char* name, const char* value)
   }
 
   /* Add to the array of default startup options */
-  if (insert_dynamic(&pfs_instr_config_array, &e))
+  if (pfs_instr_config_array->push(e))
   {
     my_free(e);
     return 1;
diff --git a/storage/perfschema/pfs_server.h b/storage/perfschema/pfs_server.h
index 9f904e6545b..c1ebb531a85 100644
--- a/storage/perfschema/pfs_server.h
+++ b/storage/perfschema/pfs_server.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,11 +28,14 @@
   Private interface for the server (declarations).
 */
 
+#define PFS_AUTOSCALE_VALUE (-1)
+#define PFS_AUTOSIZE_VALUE (-1)
+
 #ifndef PFS_MAX_MUTEX_CLASS
-  #define PFS_MAX_MUTEX_CLASS 200
+  #define PFS_MAX_MUTEX_CLASS 210
 #endif
 #ifndef PFS_MAX_RWLOCK_CLASS
-  #define PFS_MAX_RWLOCK_CLASS 40
+  #define PFS_MAX_RWLOCK_CLASS 50
 #endif
 #ifndef PFS_MAX_COND_CLASS
   #define PFS_MAX_COND_CLASS 90
@@ -41,7 +44,7 @@
   #define PFS_MAX_THREAD_CLASS 50
 #endif
 #ifndef PFS_MAX_FILE_CLASS
-  #define PFS_MAX_FILE_CLASS 50
+  #define PFS_MAX_FILE_CLASS 80
 #endif
 #ifndef PFS_MAX_FILE_HANDLE
   #define PFS_MAX_FILE_HANDLE 32768
@@ -49,28 +52,29 @@
 #ifndef PFS_MAX_SOCKET_CLASS
   #define PFS_MAX_SOCKET_CLASS 10
 #endif
-#ifndef PFS_MAX_SETUP_ACTOR
-  #define PFS_MAX_SETUP_ACTOR 100
-#endif
-#ifndef PFS_MAX_SETUP_OBJECT
-  #define PFS_MAX_SETUP_OBJECT 100
-#endif
 #ifndef PFS_MAX_STAGE_CLASS
   #define PFS_MAX_STAGE_CLASS 160
 #endif
 #ifndef PFS_STATEMENTS_STACK_SIZE
   #define PFS_STATEMENTS_STACK_SIZE 10
 #endif
-#ifndef PFS_CONNECT_ATTRS_SIZE
-  #define PFS_SESSION_CONNECT_ATTRS_SIZE 2048
+#ifndef PFS_MAX_MEMORY_CLASS
+  #define PFS_MAX_MEMORY_CLASS 320
 #endif
 
+/** Sizing hints, from the server configuration. */
 struct PFS_sizing_hints
 {
+  /** Value of @c Sys_table_def_size */
   long m_table_definition_cache;
+  /** Value of @c Sys_table_cache_size */
   long m_table_open_cache;
+  /** Value of @c Sys_max_connections */
   long m_max_connections;
+  /** Value of @c Sys_open_files_limit */
   long m_open_files_limit;
+  /** Value of @c Sys_max_prepared_stmt_count */
+  long m_max_prepared_stmt_count;
 };
 
 /** Performance schema global sizing parameters. */
@@ -85,6 +89,9 @@ struct PFS_global_param
   bool m_consumer_events_statements_current_enabled;
   bool m_consumer_events_statements_history_enabled;
   bool m_consumer_events_statements_history_long_enabled;
+  bool m_consumer_events_transactions_current_enabled;
+  bool m_consumer_events_transactions_history_enabled;
+  bool m_consumer_events_transactions_history_long_enabled;
   bool m_consumer_events_waits_current_enabled;
   bool m_consumer_events_waits_history_enabled;
   bool m_consumer_events_waits_history_long_enabled;
@@ -121,6 +128,16 @@ struct PFS_global_param
   */
   long m_table_share_sizing;
   /**
+    Maximum number of lock statistics collected for tables.
+    @sa table_lock_stat_lost.
+  */
+  long m_table_lock_stat_sizing;
+  /**
+    Maximum number of index statistics collected for tables.
+    @sa table_index_lost.
+  */
+  long m_index_stat_sizing;
+  /**
     Maximum number of instrumented file classes.
     @sa file_class_lost.
   */
@@ -162,7 +179,7 @@ struct PFS_global_param
   long m_file_handle_sizing;
   /**
     Maxium number of instrumented socket instances
-    @sa socket_lost  
+    @sa socket_lost
   */
   long m_socket_sizing;
   /**
@@ -175,9 +192,9 @@ struct PFS_global_param
   /** Maximum number of rows in table EVENTS_WAITS_HISTORY_LONG. */
   long m_events_waits_history_long_sizing;
   /** Maximum number of rows in table SETUP_ACTORS. */
-  ulong m_setup_actor_sizing;
+  long m_setup_actor_sizing;
   /** Maximum number of rows in table SETUP_OBJECTS. */
-  ulong m_setup_object_sizing;
+  long m_setup_object_sizing;
   /** Maximum number of rows in table HOSTS. */
   long m_host_sizing;
   /** Maximum number of rows in table USERS. */
@@ -198,16 +215,36 @@ struct PFS_global_param
     @sa statement_class_lost.
   */
   ulong m_statement_class_sizing;
-  /** Maximum number of rows per thread in table EVENTS_STATEMENT_HISTORY. */
+  /** Maximum number of rows per thread in table EVENTS_STATEMENTS_HISTORY. */
   long m_events_statements_history_sizing;
   /** Maximum number of rows in table EVENTS_STATEMENTS_HISTORY_LONG. */
   long m_events_statements_history_long_sizing;
   /** Maximum number of digests to be captured */
   long m_digest_sizing;
+  /** Maximum number of programs to be captured */
+  long m_program_sizing;
+  /** Maximum number of prepared statements to be captured */
+  long m_prepared_stmt_sizing;
+  /** Maximum number of rows per thread in table EVENTS_TRANSACTIONS_HISTORY. */
+  long m_events_transactions_history_sizing;
+  /** Maximum number of rows in table EVENTS_TRANSACTIONS_HISTORY_LONG. */
+  long m_events_transactions_history_long_sizing;
+
   /** Maximum number of session attribute strings per thread */
   long m_session_connect_attrs_sizing;
+  /** Maximum size of statement stack */
+  ulong m_statement_stack_sizing;
+
+  /**
+    Maximum number of instrumented memory classes.
+    @sa memory_class_lost.
+  */
+  ulong m_memory_class_sizing;
+
+  long m_metadata_lock_sizing;
 
   long m_max_digest_length;
+  ulong m_max_sql_text_length;
 
   /** Sizing hints, for auto tuning. */
   PFS_sizing_hints m_hints;
@@ -220,9 +257,17 @@ struct PFS_global_param
 extern PFS_global_param pfs_param;
 
 /**
+  Null initialization.
+  Disable all instrumentation, size all internal buffers to 0.
+  This pre-initialization step is needed to ensure that events can be collected
+  and discarded, until such time as @c initialize_performance_schema() is called.
+*/
+void pre_initialize_performance_schema();
+
+/**
   Initialize the performance schema.
   @param param Size parameters to use.
-  @return A boostrap handle, or NULL.
+  @return A bootstrap handle, or NULL.
 */
 struct PSI_bootstrap*
 initialize_performance_schema(PFS_global_param *param);
@@ -233,14 +278,19 @@ void pfs_automated_sizing(PFS_global_param *param);
   Initialize the performance schema ACL.
   ACL is strictly enforced when the server is running in normal mode,
   to enforce that only legal operations are allowed.
-  When running in boostrap mode, ACL restrictions are relaxed,
-  to allow the boostrap scripts to DROP / CREATE performance schema tables.
+  When running in bootstrap mode, ACL restrictions are relaxed,
+  to allow the bootstrap scripts to DROP / CREATE performance schema tables.
   @sa ACL_internal_schema_registry
   @param bootstrap True if the server is starting in bootstrap mode.
 */
 void initialize_performance_schema_acl(bool bootstrap);
 
 /**
+  Reset the aggregated status counter stats.
+*/
+void reset_pfs_status_stats();
+
+/**
   Initialize the dynamic array holding individual instrument settings collected
   from the server configuration options.
 */
diff --git a/storage/perfschema/pfs_setup_actor.cc b/storage/perfschema/pfs_setup_actor.cc
index c4cec6c9ff8..6e066e34231 100644
--- a/storage/perfschema/pfs_setup_actor.cc
+++ b/storage/perfschema/pfs_setup_actor.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -32,23 +32,15 @@
 #include "pfs_stat.h"
 #include "pfs_instr.h"
 #include "pfs_setup_actor.h"
+#include "pfs_account.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
 
 /**
   @addtogroup Performance_schema_buffers
   @{
 */
 
-/** Size of the setup_actor instances array. @sa setup_actor_array */
-ulong setup_actor_max;
-
-/**
-  Setup_actor instances array.
-  @sa setup_actor_max
-*/
-
-PFS_setup_actor *setup_actor_array= NULL;
-
 /** Hash table for setup_actor records. */
 LF_HASH setup_actor_hash;
 /** True if @c setup_actor_hash is initialized. */
@@ -61,27 +53,13 @@ static bool setup_actor_hash_inited= false;
 */
 int init_setup_actor(const PFS_global_param *param)
 {
-  setup_actor_max= param->m_setup_actor_sizing;
-
-  setup_actor_array= NULL;
-
-  if (setup_actor_max > 0)
-  {
-    setup_actor_array= PFS_MALLOC_ARRAY(setup_actor_max, sizeof(PFS_setup_actor),
-                                        PFS_setup_actor, MYF(MY_ZEROFILL));
-    if (unlikely(setup_actor_array == NULL))
-      return 1;
-  }
-
-  return 0;
+  return global_setup_actor_container.init(param->m_setup_actor_sizing);
 }
 
 /** Cleanup all the setup actor buffers. */
 void cleanup_setup_actor(void)
 {
-  pfs_free(setup_actor_array);
-  setup_actor_array= NULL;
-  setup_actor_max= 0;
+  global_setup_actor_container.cleanup();
 }
 
 C_MODE_START
@@ -92,9 +70,9 @@ static uchar *setup_actor_hash_get_key(const uchar *entry, size_t *length,
   const PFS_setup_actor *setup_actor;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_setup_actor* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   setup_actor= *typed_entry;
-  DBUG_ASSERT(setup_actor != NULL);
+  assert(setup_actor != NULL);
   *length= setup_actor->m_key.m_key_length;
   result= setup_actor->m_key.m_hash_key;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -105,13 +83,13 @@ C_MODE_END
   Initialize the setup actor hash.
   @return 0 on success
 */
-int init_setup_actor_hash(void)
+int init_setup_actor_hash(const PFS_global_param *param)
 {
-  if ((! setup_actor_hash_inited) && (setup_actor_max > 0))
+  if ((! setup_actor_hash_inited) && (param->m_setup_actor_sizing != 0))
   {
     lf_hash_init(&setup_actor_hash, sizeof(PFS_setup_actor*), LF_HASH_UNIQUE,
                  0, 0, setup_actor_hash_get_key, &my_charset_bin);
-    /* setup_actor_hash.size= setup_actor_max; */
+    /* setup_actor_hash.size= param->m_setup_actor_sizing; */
     setup_actor_hash_inited= true;
   }
   return 0;
@@ -143,8 +121,8 @@ static void set_setup_actor_key(PFS_setup_actor_key *key,
                                 const char *host, uint host_length,
                                 const char *role, uint role_length)
 {
-  DBUG_ASSERT(user_length <= USERNAME_LENGTH);
-  DBUG_ASSERT(host_length <= HOSTNAME_LENGTH);
+  assert(user_length <= USERNAME_LENGTH);
+  assert(host_length <= HOSTNAME_LENGTH);
 
   char *ptr= &key->m_hash_key[0];
   memcpy(ptr, user, user_length);
@@ -162,11 +140,9 @@ static void set_setup_actor_key(PFS_setup_actor_key *key,
   key->m_key_length= (uint)(ptr - &key->m_hash_key[0]);
 }
 
-int insert_setup_actor(const String *user, const String *host, const String *role)
+int insert_setup_actor(const String *user, const String *host, const String *role,
+                       bool enabled, bool history)
 {
-  if (setup_actor_max == 0)
-    return HA_ERR_RECORD_FILE_FULL;
-
   PFS_thread *thread= PFS_thread::get_current_thread();
   if (unlikely(thread == NULL))
     return HA_ERR_OUT_OF_MEM;
@@ -175,46 +151,39 @@ int insert_setup_actor(const String *user, const String *host, const String *rol
   if (unlikely(pins == NULL))
     return HA_ERR_OUT_OF_MEM;
 
-  static uint PFS_ALIGNED setup_actor_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_setup_actor *pfs;
+  pfs_dirty_state dirty_state;
 
-  while (++attempts <= setup_actor_max)
+  pfs= global_setup_actor_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& setup_actor_monotonic_index, 1) % setup_actor_max;
-    pfs= setup_actor_array + index;
-
-    if (pfs->m_lock.is_free())
+    set_setup_actor_key(&pfs->m_key,
+                        user->ptr(), user->length(),
+                        host->ptr(), host->length(),
+                        role->ptr(), role->length());
+    pfs->m_username= &pfs->m_key.m_hash_key[0];
+    pfs->m_username_length= user->length();
+    pfs->m_hostname= pfs->m_username + pfs->m_username_length + 1;
+    pfs->m_hostname_length= host->length();
+    pfs->m_rolename= pfs->m_hostname + pfs->m_hostname_length + 1;
+    pfs->m_rolename_length= role->length();
+    pfs->m_enabled= enabled;
+    pfs->m_history= history;
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&setup_actor_hash, pins, &pfs);
+    if (likely(res == 0))
     {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        set_setup_actor_key(&pfs->m_key,
-                            user->ptr(), user->length(),
-                            host->ptr(), host->length(),
-                            role->ptr(), role->length());
-        pfs->m_username= &pfs->m_key.m_hash_key[0];
-        pfs->m_username_length= user->length();
-        pfs->m_hostname= pfs->m_username + pfs->m_username_length + 1;
-        pfs->m_hostname_length= host->length();
-        pfs->m_rolename= pfs->m_hostname + pfs->m_hostname_length + 1;
-        pfs->m_rolename_length= role->length();
-
-        int res;
-        res= lf_hash_insert(&setup_actor_hash, pins, &pfs);
-        if (likely(res == 0))
-        {
-          pfs->m_lock.dirty_to_allocated();
-          return 0;
-        }
-
-        pfs->m_lock.dirty_to_free();
-        if (res > 0)
-          return HA_ERR_FOUND_DUPP_KEY;
-        return HA_ERR_OUT_OF_MEM;
-      }
+      update_setup_actors_derived_flags();
+      return 0;
     }
+
+    global_setup_actor_container.deallocate(pfs);
+
+    if (res > 0)
+      return HA_ERR_FOUND_DUPP_KEY;
+    return HA_ERR_OUT_OF_MEM;
   }
 
   return HA_ERR_RECORD_FILE_FULL;
@@ -244,14 +213,36 @@ int delete_setup_actor(const String *user, const String *host, const String *rol
   {
     PFS_setup_actor *pfs= *entry;
     lf_hash_delete(&setup_actor_hash, pins, key.m_hash_key, key.m_key_length);
-    pfs->m_lock.allocated_to_free();
+    global_setup_actor_container.deallocate(pfs);
   }
 
   lf_hash_search_unpin(pins);
 
+  update_setup_actors_derived_flags();
+
   return 0;
 }
 
+class Proc_reset_setup_actor
+  : public PFS_buffer_processor<PFS_setup_actor>
+{
+public:
+  Proc_reset_setup_actor(LF_PINS* pins)
+    : m_pins(pins)
+  {}
+
+  virtual void operator()(PFS_setup_actor *pfs)
+  {
+    lf_hash_delete(&setup_actor_hash, m_pins,
+                   pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
+
+    global_setup_actor_container.deallocate(pfs);
+  }
+
+private:
+  LF_PINS* m_pins;
+};
+
 int reset_setup_actor()
 {
   PFS_thread *thread= PFS_thread::get_current_thread();
@@ -262,18 +253,11 @@ int reset_setup_actor()
   if (unlikely(pins == NULL))
     return HA_ERR_OUT_OF_MEM;
 
-  PFS_setup_actor *pfs= setup_actor_array;
-  PFS_setup_actor *pfs_last= setup_actor_array + setup_actor_max;
+  Proc_reset_setup_actor proc(pins);
+  // FIXME: delete helper instead
+  global_setup_actor_container.apply(proc);
 
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      lf_hash_delete(&setup_actor_hash, pins,
-                     pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
-      pfs->m_lock.allocated_to_free();
-    }
-  }
+  update_setup_actors_derived_flags();
 
   return 0;
 }
@@ -291,7 +275,7 @@ long setup_actor_count()
 void lookup_setup_actor(PFS_thread *thread,
                         const char *user, uint user_length,
                         const char *host, uint host_length,
-                        bool *enabled)
+                        bool *enabled, bool *history)
 {
   PFS_setup_actor_key key;
   PFS_setup_actor **entry;
@@ -301,6 +285,7 @@ void lookup_setup_actor(PFS_thread *thread,
   if (unlikely(pins == NULL))
   {
     *enabled= false;
+    *history= false;
     return;
   }
 
@@ -330,15 +315,28 @@ void lookup_setup_actor(PFS_thread *thread,
 
     if (entry && (entry != MY_ERRPTR))
     {
+      PFS_setup_actor *pfs= *entry;
       lf_hash_search_unpin(pins);
-      *enabled= true;
+      *enabled= pfs->m_enabled;
+      *history= pfs->m_history;
       return;
     }
 
     lf_hash_search_unpin(pins);
   }
   *enabled= false;
+  *history= false;
   return;
 }
 
+int update_setup_actors_derived_flags()
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  update_accounts_derived_flags(thread);
+  return 0;
+}
+
 /** @} */
diff --git a/storage/perfschema/pfs_setup_actor.h b/storage/perfschema/pfs_setup_actor.h
index 613d5454a9c..3570748085a 100644
--- a/storage/perfschema/pfs_setup_actor.h
+++ b/storage/perfschema/pfs_setup_actor.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -33,6 +33,7 @@
 #include "lf.h"
 
 struct PFS_global_param;
+class PFS_opaque_container_page;
 
 /* WL#988 Roles Not implemented yet */
 #define ROLENAME_LENGTH 64
@@ -74,30 +75,35 @@ struct PFS_ALIGNED PFS_setup_actor
   const char *m_rolename;
   /** Length of @c m_rolename. */
   uint m_rolename_length;
+  /** ENABLED flag. */
+  bool m_enabled;
+  /** HISTORY flag. */
+  bool m_history;
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
 };
 
 int init_setup_actor(const PFS_global_param *param);
 void cleanup_setup_actor(void);
-int init_setup_actor_hash(void);
+int init_setup_actor_hash(const PFS_global_param *param);
 void cleanup_setup_actor_hash(void);
 
-int insert_setup_actor(const String *user, const String *host, const String *role);
-int delete_setup_actor(const String *user, const String *host, const String *role);
+int insert_setup_actor(const String *user, const String *host,
+                       const String *role, bool enabled, bool history);
+int delete_setup_actor(const String *user, const String *host,
+                       const String *role);
 int reset_setup_actor(void);
 long setup_actor_count(void);
 
 void lookup_setup_actor(PFS_thread *thread,
                         const char *user, uint user_length,
                         const char *host, uint host_length,
-                        bool *enabled);
+                        bool *enabled, bool *history);
 
-/* For iterators and show status. */
+/** Update the account flags derived from setup_actors. */
+int update_setup_actors_derived_flags();
 
-extern ulong setup_actor_max;
-
-/* Exposing the data directly, for iterators. */
-
-extern PFS_setup_actor *setup_actor_array;
+/* For show status. */
 
 extern LF_HASH setup_actor_hash;
 
diff --git a/storage/perfschema/pfs_setup_object.cc b/storage/perfschema/pfs_setup_object.cc
index 8da9fccc65f..540f2c74936 100644
--- a/storage/perfschema/pfs_setup_object.cc
+++ b/storage/perfschema/pfs_setup_object.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -34,6 +34,7 @@
 #include "pfs_instr.h"
 #include "pfs_setup_object.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
 
 /**
   @addtogroup Performance_schema_buffers
@@ -42,10 +43,6 @@
 
 uint setup_objects_version= 0;
 
-ulong setup_object_max;
-
-PFS_setup_object *setup_object_array= NULL;
-
 LF_HASH setup_object_hash;
 static bool setup_object_hash_inited= false;
 
@@ -56,27 +53,13 @@ static bool setup_object_hash_inited= false;
 */
 int init_setup_object(const PFS_global_param *param)
 {
-  setup_object_max= param->m_setup_object_sizing;
-
-  setup_object_array= NULL;
-
-  if (setup_object_max > 0)
-  {
-    setup_object_array= PFS_MALLOC_ARRAY(setup_object_max, sizeof(PFS_setup_object),
-                                         PFS_setup_object, MYF(MY_ZEROFILL));
-    if (unlikely(setup_object_array == NULL))
-      return 1;
-  }
-
-  return 0;
+  return global_setup_object_container.init(param->m_setup_object_sizing);
 }
 
 /** Cleanup all the setup object buffers. */
 void cleanup_setup_object(void)
 {
-  pfs_free(setup_object_array);
-  setup_object_array= NULL;
-  setup_object_max= 0;
+  global_setup_object_container.cleanup();
 }
 
 C_MODE_START
@@ -87,9 +70,9 @@ static uchar *setup_object_hash_get_key(const uchar *entry, size_t *length,
   const PFS_setup_object *setup_object;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_setup_object* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   setup_object= *typed_entry;
-  DBUG_ASSERT(setup_object != NULL);
+  assert(setup_object != NULL);
   *length= setup_object->m_key.m_key_length;
   result= setup_object->m_key.m_hash_key;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -100,13 +83,12 @@ C_MODE_END
   Initialize the setup objects hash.
   @return 0 on success
 */
-int init_setup_object_hash(void)
+int init_setup_object_hash(const PFS_global_param *param)
 {
-  if ((! setup_object_hash_inited) && (setup_object_max > 0))
+  if ((! setup_object_hash_inited) && (param->m_setup_object_sizing != 0))
   {
     lf_hash_init(&setup_object_hash, sizeof(PFS_setup_object*), LF_HASH_UNIQUE,
                  0, 0, setup_object_hash_get_key, &my_charset_bin);
-    /* setup_object_hash.size= setup_object_max; */
     setup_object_hash_inited= true;
   }
   return 0;
@@ -138,8 +120,8 @@ static void set_setup_object_key(PFS_setup_object_key *key,
                                  const char *schema, uint schema_length,
                                  const char *object, uint object_length)
 {
-  DBUG_ASSERT(schema_length <= NAME_LEN);
-  DBUG_ASSERT(object_length <= NAME_LEN);
+  assert(schema_length <= NAME_LEN);
+  assert(object_length <= NAME_LEN);
 
   char *ptr= &key->m_hash_key[0];
   ptr[0]= (char) object_type;
@@ -164,9 +146,6 @@ static void set_setup_object_key(PFS_setup_object_key *key,
 int insert_setup_object(enum_object_type object_type, const String *schema,
                         const String *object, bool enabled, bool timed)
 {
-  if (setup_object_max == 0)
-    return HA_ERR_RECORD_FILE_FULL;
-
   PFS_thread *thread= PFS_thread::get_current_thread();
   if (unlikely(thread == NULL))
     return HA_ERR_OUT_OF_MEM;
@@ -175,47 +154,37 @@ int insert_setup_object(enum_object_type object_type, const String *schema,
   if (unlikely(pins == NULL))
     return HA_ERR_OUT_OF_MEM;
 
-  static uint PFS_ALIGNED setup_object_monotonic_index= 0;
-  uint index;
-  uint attempts= 0;
   PFS_setup_object *pfs;
+  pfs_dirty_state dirty_state;
 
-  while (++attempts <= setup_object_max)
+  pfs= global_setup_object_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    /* See create_mutex() */
-    index= PFS_atomic::add_u32(& setup_object_monotonic_index, 1) % setup_object_max;
-    pfs= setup_object_array + index;
-
-    if (pfs->m_lock.is_free())
+    set_setup_object_key(&pfs->m_key, object_type,
+                         schema->ptr(), schema->length(),
+                         object->ptr(), object->length());
+    pfs->m_schema_name= &pfs->m_key.m_hash_key[1];
+    pfs->m_schema_name_length= schema->length();
+    pfs->m_object_name= pfs->m_schema_name + pfs->m_schema_name_length + 1;
+    pfs->m_object_name_length= object->length();
+    pfs->m_enabled= enabled;
+    pfs->m_timed= timed;
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&setup_object_hash, pins, &pfs);
+    if (likely(res == 0))
     {
-      if (pfs->m_lock.free_to_dirty())
-      {
-        set_setup_object_key(&pfs->m_key, object_type,
-                             schema->ptr(), schema->length(),
-                             object->ptr(), object->length());
-        pfs->m_schema_name= &pfs->m_key.m_hash_key[1];
-        pfs->m_schema_name_length= schema->length();
-        pfs->m_object_name= pfs->m_schema_name + pfs->m_schema_name_length + 1;
-        pfs->m_object_name_length= object->length();
-        pfs->m_enabled= enabled;
-        pfs->m_timed= timed;
-
-        int res;
-        res= lf_hash_insert(&setup_object_hash, pins, &pfs);
-        if (likely(res == 0))
-        {
-          pfs->m_lock.dirty_to_allocated();
-          setup_objects_version++;
-          return 0;
-        }
-
-        pfs->m_lock.dirty_to_free();
-        if (res > 0)
-          return HA_ERR_FOUND_DUPP_KEY;
-        /* OOM in lf_hash_insert */
-        return HA_ERR_OUT_OF_MEM;
-      }
+      setup_objects_version++;
+      return 0;
     }
+
+    global_setup_object_container.deallocate(pfs);
+
+    if (res > 0)
+      return HA_ERR_FOUND_DUPP_KEY;
+    /* OOM in lf_hash_insert */
+    return HA_ERR_OUT_OF_MEM;
   }
 
   return HA_ERR_RECORD_FILE_FULL;
@@ -245,7 +214,7 @@ int delete_setup_object(enum_object_type object_type, const String *schema,
   {
     PFS_setup_object *pfs= *entry;
     lf_hash_delete(&setup_object_hash, pins, key.m_hash_key, key.m_key_length);
-    pfs->m_lock.allocated_to_free();
+    global_setup_object_container.deallocate(pfs);
   }
 
   lf_hash_search_unpin(pins);
@@ -254,6 +223,26 @@ int delete_setup_object(enum_object_type object_type, const String *schema,
   return 0;
 }
 
+class Proc_reset_setup_object
+  : public PFS_buffer_processor<PFS_setup_object>
+{
+public:
+  Proc_reset_setup_object(LF_PINS* pins)
+    : m_pins(pins)
+  {}
+
+  virtual void operator()(PFS_setup_object *pfs)
+  {
+    lf_hash_delete(&setup_object_hash, m_pins,
+                   pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
+
+    global_setup_object_container.deallocate(pfs);
+  }
+
+private:
+  LF_PINS* m_pins;
+};
+
 int reset_setup_object()
 {
   PFS_thread *thread= PFS_thread::get_current_thread();
@@ -264,18 +253,9 @@ int reset_setup_object()
   if (unlikely(pins == NULL))
     return HA_ERR_OUT_OF_MEM;
 
-  PFS_setup_object *pfs= setup_object_array;
-  PFS_setup_object *pfs_last= setup_object_array + setup_object_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      lf_hash_delete(&setup_object_hash, pins,
-                     pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
-      pfs->m_lock.allocated_to_free();
-    }
-  }
+  Proc_reset_setup_object proc(pins);
+  // FIXME: delete helper instead
+  global_setup_object_container.apply(proc);
 
   setup_objects_version++;
   return 0;
@@ -304,7 +284,7 @@ void lookup_setup_object(PFS_thread *thread,
     - TABLE foo.bar
     - TEMPORARY TABLE foo.bar
   */
-  DBUG_ASSERT(object_type != OBJECT_TYPE_TEMPORARY_TABLE);
+  assert(object_type != OBJECT_TYPE_TEMPORARY_TABLE);
 
   LF_PINS* pins= get_setup_object_hash_pins(thread);
   if (unlikely(pins == NULL))
diff --git a/storage/perfschema/pfs_setup_object.h b/storage/perfschema/pfs_setup_object.h
index ee40742941c..63e275385b1 100644
--- a/storage/perfschema/pfs_setup_object.h
+++ b/storage/perfschema/pfs_setup_object.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -33,6 +33,7 @@
 
 class String;
 struct PFS_global_param;
+class PFS_opaque_container_page;
 
 /**
   @addtogroup Performance_schema_buffers
@@ -75,11 +76,13 @@ struct PFS_ALIGNED PFS_setup_object
   bool m_enabled;
   /** TIMED flag. */
   bool m_timed;
+  /** Container page. */
+  PFS_opaque_container_page *m_page;
 };
 
 int init_setup_object(const PFS_global_param *param);
 void cleanup_setup_object(void);
-int init_setup_object_hash(void);
+int init_setup_object_hash(const PFS_global_param *param);
 void cleanup_setup_object_hash(void);
 
 int insert_setup_object(enum_object_type object_type, const String *schema,
@@ -95,13 +98,7 @@ void lookup_setup_object(PFS_thread *thread,
                          const char *object_name, int object_name_length,
                          bool *enabled, bool *timed);
 
-/* For iterators and show status. */
-
-extern ulong setup_object_max;
-
-/* Exposing the data directly, for iterators. */
-
-extern PFS_setup_object *setup_object_array;
+/* For show status. */
 
 extern LF_HASH setup_object_hash;
 
diff --git a/storage/perfschema/pfs_stat.h b/storage/perfschema/pfs_stat.h
index 8a049e3013b..f266267f4cd 100644
--- a/storage/perfschema/pfs_stat.h
+++ b/storage/perfschema/pfs_stat.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -23,6 +23,7 @@
 #ifndef PFS_STAT_H
 #define PFS_STAT_H
 
+#include <algorithm>
 #include "sql_const.h"
 /* memcpy */
 #include "string.h"
@@ -53,7 +54,7 @@ struct PFS_single_stat
   {
     m_count= 0;
     m_sum= 0;
-    m_min= ULONGLONG_MAX;
+    m_min= ULLONG_MAX;
     m_max= 0;
   }
 
@@ -61,7 +62,7 @@ struct PFS_single_stat
   {
     m_count= 0;
     m_sum= 0;
-    m_min= ULONGLONG_MAX;
+    m_min= ULLONG_MAX;
     m_max= 0;
   }
 
@@ -72,6 +73,19 @@ struct PFS_single_stat
 
   inline void aggregate(const PFS_single_stat *stat)
   {
+    if (stat->m_count != 0)
+    {
+      m_count+= stat->m_count;
+      m_sum+= stat->m_sum;
+      if (unlikely(m_min > stat->m_min))
+        m_min= stat->m_min;
+      if (unlikely(m_max < stat->m_max))
+        m_max= stat->m_max;
+    }
+  }
+
+  inline void aggregate_no_check(const PFS_single_stat *stat)
+  {
     m_count+= stat->m_count;
     m_sum+= stat->m_sum;
     if (unlikely(m_min > stat->m_min))
@@ -99,6 +113,16 @@ struct PFS_single_stat
     if (unlikely(m_max < value))
       m_max= value;
   }
+
+  inline void aggregate_many_value(ulonglong value, ulonglong count)
+  {
+    m_count+= count;
+    m_sum+= value;
+    if (unlikely(m_min > value))
+      m_min= value;
+    if (unlikely(m_max < value))
+      m_max= value;
+  }
 };
 
 /** Combined statistic. */
@@ -107,39 +131,49 @@ struct PFS_byte_stat : public PFS_single_stat
   /** Byte count statistics */
   ulonglong m_bytes;
 
-  /* Aggregate wait stats, event count and byte count */
+  /** Aggregate wait stats, event count and byte count */
   inline void aggregate(const PFS_byte_stat *stat)
   {
-    PFS_single_stat::aggregate(stat);
+    if (stat->m_count != 0)
+    {
+      PFS_single_stat::aggregate_no_check(stat);
+      m_bytes+= stat->m_bytes;
+    }
+  }
+
+  /** Aggregate wait stats, event count and byte count */
+  inline void aggregate_no_check(const PFS_byte_stat *stat)
+  {
+    PFS_single_stat::aggregate_no_check(stat);
     m_bytes+= stat->m_bytes;
   }
 
-  /* Aggregate individual wait time, event count and byte count */
+  /** Aggregate individual wait time, event count and byte count */
   inline void aggregate(ulonglong wait, ulonglong bytes)
   {
     aggregate_value(wait);
     m_bytes+= bytes;
   }
 
-  /* Aggregate wait stats and event count */
+  /** Aggregate wait stats and event count */
   inline void aggregate_waits(const PFS_byte_stat *stat)
   {
     PFS_single_stat::aggregate(stat);
   }
 
-  /* Aggregate event count and byte count */
+  /** Aggregate event count and byte count */
   inline void aggregate_counted()
   {
     PFS_single_stat::aggregate_counted();
   }
 
-  /* Aggregate event count and byte count */
+  /** Aggregate event count and byte count */
   inline void aggregate_counted(ulonglong bytes)
   {
     PFS_single_stat::aggregate_counted();
     m_bytes+= bytes;
   }
-    
+
   PFS_byte_stat()
   {
     reset();
@@ -157,22 +191,28 @@ struct PFS_mutex_stat
 {
   /** Wait statistics. */
   PFS_single_stat m_wait_stat;
+#ifdef PFS_LATER
   /**
     Lock statistics.
     This statistic is not exposed in user visible tables yet.
   */
   PFS_single_stat m_lock_stat;
+#endif
 
   inline void aggregate(const PFS_mutex_stat *stat)
   {
     m_wait_stat.aggregate(&stat->m_wait_stat);
+#ifdef PFS_LATER
     m_lock_stat.aggregate(&stat->m_lock_stat);
+#endif
   }
 
   inline void reset(void)
   {
     m_wait_stat.reset();
+#ifdef PFS_LATER
     m_lock_stat.reset();
+#endif
   }
 };
 
@@ -181,6 +221,7 @@ struct PFS_rwlock_stat
 {
   /** Wait statistics. */
   PFS_single_stat m_wait_stat;
+#ifdef PFS_LATER
   /**
     RWLock read lock usage statistics.
     This statistic is not exposed in user visible tables yet.
@@ -191,19 +232,24 @@ struct PFS_rwlock_stat
     This statistic is not exposed in user visible tables yet.
   */
   PFS_single_stat m_write_lock_stat;
+#endif
 
   inline void aggregate(const PFS_rwlock_stat *stat)
   {
     m_wait_stat.aggregate(&stat->m_wait_stat);
+#ifdef PFS_LATER
     m_read_lock_stat.aggregate(&stat->m_read_lock_stat);
     m_write_lock_stat.aggregate(&stat->m_write_lock_stat);
+#endif
   }
 
   inline void reset(void)
   {
     m_wait_stat.reset();
+#ifdef PFS_LATER
     m_read_lock_stat.reset();
     m_write_lock_stat.reset();
+#endif
   }
 };
 
@@ -212,6 +258,7 @@ struct PFS_cond_stat
 {
   /** Wait statistics. */
   PFS_single_stat m_wait_stat;
+#ifdef PFS_LATER
   /**
     Number of times a condition was signalled.
     This statistic is not exposed in user visible tables yet.
@@ -222,19 +269,24 @@ struct PFS_cond_stat
     This statistic is not exposed in user visible tables yet.
   */
   ulonglong m_broadcast_count;
+#endif
 
   inline void aggregate(const PFS_cond_stat *stat)
   {
     m_wait_stat.aggregate(&stat->m_wait_stat);
+#ifdef PFS_LATER
     m_signal_count+= stat->m_signal_count;
     m_broadcast_count+= stat->m_broadcast_count;
+#endif
   }
 
   inline void reset(void)
   {
     m_wait_stat.reset();
+#ifdef PFS_LATER
     m_signal_count= 0;
     m_broadcast_count= 0;
+#endif
   }
 };
 
@@ -245,7 +297,7 @@ struct PFS_file_io_stat
   PFS_byte_stat m_read;
   /** WRITE statistics */
   PFS_byte_stat m_write;
-  /** Miscelleanous statistics */
+  /** Miscellaneous statistics */
   PFS_byte_stat m_misc;
 
   inline void reset(void)
@@ -313,11 +365,51 @@ struct PFS_stage_stat
   inline void aggregate_value(ulonglong value)
   { m_timer1_stat.aggregate_value(value); }
 
+  inline void aggregate(const PFS_stage_stat *stat)
+  { m_timer1_stat.aggregate(& stat->m_timer1_stat); }
+};
+
+/** Statistics for stored program usage. */
+struct PFS_sp_stat
+{
+  PFS_single_stat m_timer1_stat;
+
+  inline void reset(void)
+  { m_timer1_stat.reset(); }
+
+  inline void aggregate_counted()
+  { m_timer1_stat.aggregate_counted(); }
+
+  inline void aggregate_value(ulonglong value)
+  { m_timer1_stat.aggregate_value(value); }
+
+  inline void aggregate(const PFS_stage_stat *stat)
+  { m_timer1_stat.aggregate(& stat->m_timer1_stat); }
+};
+
+/** Statistics for prepared statement usage. */
+struct PFS_prepared_stmt_stat
+{
+  PFS_single_stat m_timer1_stat;
+
+  inline void reset(void)
+  { m_timer1_stat.reset(); }
+
+  inline void aggregate_counted()
+  { m_timer1_stat.aggregate_counted(); }
+
+  inline void aggregate_value(ulonglong value)
+  { m_timer1_stat.aggregate_value(value); }
+
   inline void aggregate(PFS_stage_stat *stat)
   { m_timer1_stat.aggregate(& stat->m_timer1_stat); }
 };
 
-/** Statistics for statement usage. */
+/**
+  Statistics for statement usage.
+  This structure uses lazy initialization,
+  controlled by member @c m_timer1_stat.m_count.
+*/
 struct PFS_statement_stat
 {
   PFS_single_stat m_timer1_stat;
@@ -343,80 +435,128 @@ struct PFS_statement_stat
 
   PFS_statement_stat()
   {
-    m_error_count= 0;
-    m_warning_count= 0;
-    m_rows_affected= 0;
-    m_lock_time= 0;
-    m_rows_sent= 0;
-    m_rows_examined= 0;
-    m_created_tmp_disk_tables= 0;
-    m_created_tmp_tables= 0;
-    m_select_full_join= 0;
-    m_select_full_range_join= 0;
-    m_select_range= 0;
-    m_select_range_check= 0;
-    m_select_scan= 0;
-    m_sort_merge_passes= 0;
-    m_sort_range= 0;
-    m_sort_rows= 0;
-    m_sort_scan= 0;
-    m_no_index_used= 0;
-    m_no_good_index_used= 0;
+    reset();
   }
 
-  inline void reset(void)
+  inline void reset()
   {
-    m_timer1_stat.reset();
-    m_error_count= 0;
-    m_warning_count= 0;
-    m_rows_affected= 0;
-    m_lock_time= 0;
-    m_rows_sent= 0;
-    m_rows_examined= 0;
-    m_created_tmp_disk_tables= 0;
-    m_created_tmp_tables= 0;
-    m_select_full_join= 0;
-    m_select_full_range_join= 0;
-    m_select_range= 0;
-    m_select_range_check= 0;
-    m_select_scan= 0;
-    m_sort_merge_passes= 0;
-    m_sort_range= 0;
-    m_sort_rows= 0;
-    m_sort_scan= 0;
-    m_no_index_used= 0;
-    m_no_good_index_used= 0;
+    m_timer1_stat.m_count= 0;
   }
 
+  inline void mark_used()
+  {
+    delayed_reset();
+  }
+
+private:
+  inline void delayed_reset(void)
+  {
+    if (m_timer1_stat.m_count == 0)
+    {
+      m_timer1_stat.reset();
+      m_error_count= 0;
+      m_warning_count= 0;
+      m_rows_affected= 0;
+      m_lock_time= 0;
+      m_rows_sent= 0;
+      m_rows_examined= 0;
+      m_created_tmp_disk_tables= 0;
+      m_created_tmp_tables= 0;
+      m_select_full_join= 0;
+      m_select_full_range_join= 0;
+      m_select_range= 0;
+      m_select_range_check= 0;
+      m_select_scan= 0;
+      m_sort_merge_passes= 0;
+      m_sort_range= 0;
+      m_sort_rows= 0;
+      m_sort_scan= 0;
+      m_no_index_used= 0;
+      m_no_good_index_used= 0;
+    }
+  }
+
+public:
   inline void aggregate_counted()
-  { m_timer1_stat.aggregate_counted(); }
+  {
+    delayed_reset();
+    m_timer1_stat.aggregate_counted();
+  }
 
   inline void aggregate_value(ulonglong value)
-  { m_timer1_stat.aggregate_value(value); }
+  {
+    delayed_reset();
+    m_timer1_stat.aggregate_value(value);
+  }
+
+  inline void aggregate(const PFS_statement_stat *stat)
+  {
+    if (stat->m_timer1_stat.m_count != 0)
+    {
+      delayed_reset();
+      m_timer1_stat.aggregate_no_check(& stat->m_timer1_stat);
+
+      m_error_count+= stat->m_error_count;
+      m_warning_count+= stat->m_warning_count;
+      m_rows_affected+= stat->m_rows_affected;
+      m_lock_time+= stat->m_lock_time;
+      m_rows_sent+= stat->m_rows_sent;
+      m_rows_examined+= stat->m_rows_examined;
+      m_created_tmp_disk_tables+= stat->m_created_tmp_disk_tables;
+      m_created_tmp_tables+= stat->m_created_tmp_tables;
+      m_select_full_join+= stat->m_select_full_join;
+      m_select_full_range_join+= stat->m_select_full_range_join;
+      m_select_range+= stat->m_select_range;
+      m_select_range_check+= stat->m_select_range_check;
+      m_select_scan+= stat->m_select_scan;
+      m_sort_merge_passes+= stat->m_sort_merge_passes;
+      m_sort_range+= stat->m_sort_range;
+      m_sort_rows+= stat->m_sort_rows;
+      m_sort_scan+= stat->m_sort_scan;
+      m_no_index_used+= stat->m_no_index_used;
+      m_no_good_index_used+= stat->m_no_good_index_used;
+    }
+  }
+};
+
+/** Statistics for transaction usage. */
+struct PFS_transaction_stat
+{
+  PFS_single_stat m_read_write_stat;
+  PFS_single_stat m_read_only_stat;
+
+  ulonglong m_savepoint_count;
+  ulonglong m_rollback_to_savepoint_count;
+  ulonglong m_release_savepoint_count;
+
+  PFS_transaction_stat()
+  {
+    m_savepoint_count= 0;
+    m_rollback_to_savepoint_count= 0;
+    m_release_savepoint_count= 0;
+  }
+
+  ulonglong count(void)
+  {
+    return (m_read_write_stat.m_count + m_read_only_stat.m_count);
+  }
+
+  inline void reset(void)
+  {
+    m_read_write_stat.reset();
+    m_read_only_stat.reset();
+    m_savepoint_count= 0;
+    m_rollback_to_savepoint_count= 0;
+    m_release_savepoint_count= 0;
+  }
 
-  inline void aggregate(PFS_statement_stat *stat)
-  {
-    m_timer1_stat.aggregate(& stat->m_timer1_stat);
-
-    m_error_count+= stat->m_error_count;
-    m_warning_count+= stat->m_warning_count;
-    m_rows_affected+= stat->m_rows_affected;
-    m_lock_time+= stat->m_lock_time;
-    m_rows_sent+= stat->m_rows_sent;
-    m_rows_examined+= stat->m_rows_examined;
-    m_created_tmp_disk_tables+= stat->m_created_tmp_disk_tables;
-    m_created_tmp_tables+= stat->m_created_tmp_tables;
-    m_select_full_join+= stat->m_select_full_join;
-    m_select_full_range_join+= stat->m_select_full_range_join;
-    m_select_range+= stat->m_select_range;
-    m_select_range_check+= stat->m_select_range_check;
-    m_select_scan+= stat->m_select_scan;
-    m_sort_merge_passes+= stat->m_sort_merge_passes;
-    m_sort_range+= stat->m_sort_range;
-    m_sort_rows+= stat->m_sort_rows;
-    m_sort_scan+= stat->m_sort_scan;
-    m_no_index_used+= stat->m_no_index_used;
-    m_no_good_index_used+= stat->m_no_good_index_used;
+  inline void aggregate(const PFS_transaction_stat *stat)
+  {
+    m_read_write_stat.aggregate(&stat->m_read_write_stat);
+    m_read_only_stat.aggregate(&stat->m_read_only_stat);
+    m_savepoint_count+= stat->m_savepoint_count;
+    m_rollback_to_savepoint_count+= stat->m_rollback_to_savepoint_count;
+    m_release_savepoint_count+= stat->m_release_savepoint_count;
   }
 };
 
@@ -486,7 +626,9 @@ enum PFS_TL_LOCK_TYPE
 
   /* Locks for handler::ha_external_lock() */
   PFS_TL_READ_EXTERNAL= 9,
-  PFS_TL_WRITE_EXTERNAL= 10
+  PFS_TL_WRITE_EXTERNAL= 10,
+
+  PFS_TL_NONE= 99
 };
 
 #define COUNT_PFS_TL_LOCK_TYPE 11
@@ -580,7 +722,7 @@ struct PFS_table_stat
     PFS_table_io_stat *to_stat_last;
     const PFS_table_io_stat *from_stat;
 
-    DBUG_ASSERT(key_count <= MAX_INDEXES);
+    assert(key_count <= MAX_INDEXES);
 
     /* Aggregate stats for each index, if any */
     to_stat= & m_index_stat[0];
@@ -611,7 +753,7 @@ struct PFS_table_stat
     PFS_table_io_stat *stat;
     PFS_table_io_stat *stat_last;
 
-    DBUG_ASSERT(key_count <= MAX_INDEXES);
+    assert(key_count <= MAX_INDEXES);
 
     /* Sum stats for each index, if any */
     stat= & m_index_stat[0];
@@ -644,7 +786,7 @@ struct PFS_socket_io_stat
   PFS_byte_stat m_read;
   /** WRITE statistics */
   PFS_byte_stat m_write;
-  /** Miscelleanous statistics */
+  /** Miscellaneous statistics */
   PFS_byte_stat m_misc;
 
   inline void reset(void)
@@ -691,6 +833,478 @@ struct PFS_socket_stat
   }
 };
 
+struct PFS_memory_stat_delta
+{
+  size_t m_alloc_count_delta;
+  size_t m_free_count_delta;
+  size_t m_alloc_size_delta;
+  size_t m_free_size_delta;
+
+  void reset()
+  {
+    m_alloc_count_delta= 0;
+    m_free_count_delta= 0;
+    m_alloc_size_delta= 0;
+    m_free_size_delta= 0;
+  }
+};
+
+/**
+  Memory statistics.
+  Conceptually, the following statistics are maintained:
+  - CURRENT_COUNT_USED,
+  - LOW_COUNT_USED,
+  - HIGH_COUNT_USED
+  - CURRENT_SIZE_USED,
+  - LOW_SIZE_USED,
+  - HIGH_SIZE_USED
+  Now, the implementation keeps different counters,
+  which are easier (less overhead) to maintain while
+  collecting statistics.
+  Invariants are as follows:
+  CURRENT_COUNT_USED = @c m_alloc_count - @c m_free_count
+  LOW_COUNT_USED + @c m_free_count_capacity = CURRENT_COUNT_USED
+  CURRENT_COUNT_USED + @c m_alloc_count_capacity = HIGH_COUNT_USED
+  CURRENT_SIZE_USED = @c m_alloc_size - @c m_free_size
+  LOW_SIZE_USED + @c m_free_size_capacity = CURRENT_SIZE_USED
+  CURRENT_SIZE_USED + @c m_alloc_size_capacity = HIGH_SIZE_USED
+
+*/
+struct PFS_memory_stat
+{
+  bool m_used;
+  size_t m_alloc_count;
+  size_t m_free_count;
+  size_t m_alloc_size;
+  size_t m_free_size;
+
+  size_t m_alloc_count_capacity;
+  size_t m_free_count_capacity;
+  size_t m_alloc_size_capacity;
+  size_t m_free_size_capacity;
+
+  inline void reset(void)
+  {
+    m_used= false;
+    m_alloc_count= 0;
+    m_free_count= 0;
+    m_alloc_size= 0;
+    m_free_size= 0;
+
+    m_alloc_count_capacity= 0;
+    m_free_count_capacity= 0;
+    m_alloc_size_capacity= 0;
+    m_free_size_capacity= 0;
+  }
+
+  inline void rebase(void)
+  {
+    if (! m_used)
+      return;
+
+    size_t base;
+
+    base= std::min<size_t>(m_alloc_count, m_free_count);
+    m_alloc_count-= base;
+    m_free_count-= base;
+
+    base= std::min<size_t>(m_alloc_size, m_free_size);
+    m_alloc_size-= base;
+    m_free_size-= base;
+
+    m_alloc_count_capacity= 0;
+    m_free_count_capacity= 0;
+    m_alloc_size_capacity= 0;
+    m_free_size_capacity= 0;
+  }
+
+  inline void partial_aggregate_to(PFS_memory_stat *stat)
+  {
+    if (! m_used)
+      return;
+
+    size_t base;
+
+    stat->m_used= true;
+
+    base= std::min<size_t>(m_alloc_count, m_free_count);
+    if (base != 0)
+    {
+      stat->m_alloc_count+= base;
+      stat->m_free_count+= base;
+      m_alloc_count-= base;
+      m_free_count-= base;
+    }
+
+    base= std::min<size_t>(m_alloc_size, m_free_size);
+    if (base != 0)
+    {
+      stat->m_alloc_size+= base;
+      stat->m_free_size+= base;
+      m_alloc_size-= base;
+      m_free_size-= base;
+    }
+
+    stat->m_alloc_count_capacity+= m_alloc_count_capacity;
+    stat->m_free_count_capacity+= m_free_count_capacity;
+    stat->m_alloc_size_capacity+= m_alloc_size_capacity;
+    stat->m_free_size_capacity+= m_free_size_capacity;
+
+    m_alloc_count_capacity= 0;
+    m_free_count_capacity= 0;
+    m_alloc_size_capacity= 0;
+    m_free_size_capacity= 0;
+  }
+
+  inline void full_aggregate_to(PFS_memory_stat *stat) const
+  {
+    if (! m_used)
+      return;
+
+    stat->m_used= true;
+
+    stat->m_alloc_count+= m_alloc_count;
+    stat->m_free_count+= m_free_count;
+    stat->m_alloc_size+= m_alloc_size;
+    stat->m_free_size+= m_free_size;
+
+    stat->m_alloc_count_capacity+= m_alloc_count_capacity;
+    stat->m_free_count_capacity+= m_free_count_capacity;
+    stat->m_alloc_size_capacity+= m_alloc_size_capacity;
+    stat->m_free_size_capacity+= m_free_size_capacity;
+  }
+
+  inline void partial_aggregate_to(PFS_memory_stat *stat1, PFS_memory_stat *stat2)
+  {
+    if (! m_used)
+      return;
+
+    size_t base;
+
+    stat1->m_used= true;
+    stat2->m_used= true;
+
+    base= std::min<size_t>(m_alloc_count, m_free_count);
+    if (base != 0)
+    {
+      stat1->m_alloc_count+= base;
+      stat2->m_alloc_count+= base;
+      stat1->m_free_count+= base;
+      stat2->m_free_count+= base;
+      m_alloc_count-= base;
+      m_free_count-= base;
+    }
+
+    base= std::min<size_t>(m_alloc_size, m_free_size);
+    if (base != 0)
+    {
+      stat1->m_alloc_size+= base;
+      stat2->m_alloc_size+= base;
+      stat1->m_free_size+= base;
+      stat2->m_free_size+= base;
+      m_alloc_size-= base;
+      m_free_size-= base;
+    }
+
+    stat1->m_alloc_count_capacity+= m_alloc_count_capacity;
+    stat2->m_alloc_count_capacity+= m_alloc_count_capacity;
+    stat1->m_free_count_capacity+= m_free_count_capacity;
+    stat2->m_free_count_capacity+= m_free_count_capacity;
+    stat1->m_alloc_size_capacity+= m_alloc_size_capacity;
+    stat2->m_alloc_size_capacity+= m_alloc_size_capacity;
+    stat1->m_free_size_capacity+= m_free_size_capacity;
+    stat2->m_free_size_capacity+= m_free_size_capacity;
+
+    m_alloc_count_capacity= 0;
+    m_free_count_capacity= 0;
+    m_alloc_size_capacity= 0;
+    m_free_size_capacity= 0;
+  }
+
+  inline void full_aggregate_to(PFS_memory_stat *stat1, PFS_memory_stat *stat2) const
+  {
+    if (! m_used)
+      return;
+
+    stat1->m_used= true;
+    stat2->m_used= true;
+
+    stat1->m_alloc_count+= m_alloc_count;
+    stat2->m_alloc_count+= m_alloc_count;
+    stat1->m_free_count+= m_free_count;
+    stat2->m_free_count+= m_free_count;
+    stat1->m_alloc_size+= m_alloc_size;
+    stat2->m_alloc_size+= m_alloc_size;
+    stat1->m_free_size+= m_free_size;
+    stat2->m_free_size+= m_free_size;
+
+    stat1->m_alloc_count_capacity+= m_alloc_count_capacity;
+    stat2->m_alloc_count_capacity+= m_alloc_count_capacity;
+    stat1->m_free_count_capacity+= m_free_count_capacity;
+    stat2->m_free_count_capacity+= m_free_count_capacity;
+    stat1->m_alloc_size_capacity+= m_alloc_size_capacity;
+    stat2->m_alloc_size_capacity+= m_alloc_size_capacity;
+    stat1->m_free_size_capacity+= m_free_size_capacity;
+    stat2->m_free_size_capacity+= m_free_size_capacity;
+  }
+
+  void count_builtin_alloc(size_t size)
+  {
+    m_used= true;
+
+    m_alloc_count++;
+    m_free_count_capacity++;
+    m_alloc_size+= size;
+    m_free_size_capacity+= size;
+
+    if (m_alloc_count_capacity >= 1)
+    {
+      m_alloc_count_capacity--;
+    }
+
+    if (m_alloc_size_capacity >= size)
+    {
+      m_alloc_size_capacity-= size;
+    }
+
+    return;
+  }
+
+  void count_builtin_free(size_t size)
+  {
+    m_used= true;
+
+    m_free_count++;
+    m_alloc_count_capacity++;
+    m_free_size+= size;
+    m_alloc_size_capacity+= size;
+
+    if (m_free_count_capacity >= 1)
+    {
+      m_free_count_capacity--;
+    }
+
+    if (m_free_size_capacity >= size)
+    {
+      m_free_size_capacity-= size;
+    }
+
+    return;
+  }
+
+  inline PFS_memory_stat_delta *count_alloc(size_t size,
+                                            PFS_memory_stat_delta *delta)
+  {
+    m_used= true;
+
+    m_alloc_count++;
+    m_free_count_capacity++;
+    m_alloc_size+= size;
+    m_free_size_capacity+= size;
+
+    if ((m_alloc_count_capacity >= 1) &&
+        (m_alloc_size_capacity >= size))
+    {
+      m_alloc_count_capacity--;
+      m_alloc_size_capacity-= size;
+      return NULL;
+    }
+
+    delta->reset();
+
+    if (m_alloc_count_capacity >= 1)
+    {
+      m_alloc_count_capacity--;
+    }
+    else
+    {
+      delta->m_alloc_count_delta= 1;
+    }
+
+    if (m_alloc_size_capacity >= size)
+    {
+      m_alloc_size_capacity-= size;
+    }
+    else
+    {
+      delta->m_alloc_size_delta= size - m_alloc_size_capacity;
+      m_alloc_size_capacity= 0;
+    }
+
+    return delta;
+  }
+
+  inline PFS_memory_stat_delta *count_realloc(size_t old_size, size_t new_size,
+                                              PFS_memory_stat_delta *delta)
+  {
+    m_used= true;
+
+    size_t size_delta= new_size - old_size;
+    m_alloc_count++;
+    m_alloc_size+= new_size;
+    m_free_count++;
+    m_free_size+= old_size;
+
+    if (new_size == old_size)
+    {
+      return NULL;
+    }
+
+    if (new_size > old_size)
+    {
+      /* Growing */
+      size_delta= new_size - old_size;
+      m_free_size_capacity+= size_delta;
+
+      if (m_alloc_size_capacity >= size_delta)
+      {
+        m_alloc_size_capacity-= size_delta;
+        return NULL;
+      }
+
+      delta->reset();
+      delta->m_alloc_size_delta= size_delta - m_alloc_size_capacity;
+      m_alloc_size_capacity= 0;
+    }
+    else
+    {
+      /* Shrinking */
+      size_delta= old_size - new_size;
+      m_alloc_size_capacity+= size_delta;
+
+      if (m_free_size_capacity >= size_delta)
+      {
+        m_free_size_capacity-= size_delta;
+        return NULL;
+      }
+
+      delta->reset();
+      delta->m_free_size_delta= size_delta - m_free_size_capacity;
+      m_free_size_capacity= 0;
+    }
+
+    return delta;
+  }
+
+  inline PFS_memory_stat_delta *count_free(size_t size, PFS_memory_stat_delta *delta)
+  {
+    m_used= true;
+
+    m_free_count++;
+    m_alloc_count_capacity++;
+    m_free_size+= size;
+    m_alloc_size_capacity+= size;
+
+    if ((m_free_count_capacity >= 1) &&
+        (m_free_size_capacity >= size))
+    {
+      m_free_count_capacity--;
+      m_free_size_capacity-= size;
+      return NULL;
+    }
+
+    delta->reset();
+
+    if (m_free_count_capacity >= 1)
+    {
+      m_free_count_capacity--;
+    }
+    else
+    {
+      delta->m_free_count_delta= 1;
+    }
+
+    if (m_free_size_capacity >= size)
+    {
+      m_free_size_capacity-= size;
+    }
+    else
+    {
+      delta->m_free_size_delta= size - m_free_size_capacity;
+      m_free_size_capacity= 0;
+    }
+
+    return delta;
+  }
+
+  inline PFS_memory_stat_delta *apply_delta(const PFS_memory_stat_delta *delta,
+                                            PFS_memory_stat_delta *delta_buffer)
+  {
+    size_t val;
+    size_t remaining_alloc_count;
+    size_t remaining_alloc_size;
+    size_t remaining_free_count;
+    size_t remaining_free_size;
+    bool has_remaining= false;
+
+    m_used= true;
+
+    val= delta->m_alloc_count_delta;
+    if (val <= m_alloc_count_capacity)
+    {
+      m_alloc_count_capacity-= val;
+      remaining_alloc_count= 0;
+    }
+    else
+    {
+      remaining_alloc_count= val - m_alloc_count_capacity;
+      m_alloc_count_capacity= 0;
+      has_remaining= true;
+    }
+
+    val= delta->m_alloc_size_delta;
+    if (val <= m_alloc_size_capacity)
+    {
+      m_alloc_size_capacity-= val;
+      remaining_alloc_size= 0;
+    }
+    else
+    {
+      remaining_alloc_size= val - m_alloc_size_capacity;
+      m_alloc_size_capacity= 0;
+      has_remaining= true;
+    }
+
+    val= delta->m_free_count_delta;
+    if (val <= m_free_count_capacity)
+    {
+      m_free_count_capacity-= val;
+      remaining_free_count= 0;
+    }
+    else
+    {
+      remaining_free_count= val - m_free_count_capacity;
+      m_free_count_capacity= 0;
+      has_remaining= true;
+    }
+
+    val= delta->m_free_size_delta;
+    if (val <= m_free_size_capacity)
+    {
+      m_free_size_capacity-= val;
+      remaining_free_size= 0;
+    }
+    else
+    {
+      remaining_free_size= val - m_free_size_capacity;
+      m_free_size_capacity= 0;
+      has_remaining= true;
+    }
+
+    if (! has_remaining)
+      return NULL;
+
+    delta_buffer->m_alloc_count_delta= remaining_alloc_count;
+    delta_buffer->m_alloc_size_delta= remaining_alloc_size;
+    delta_buffer->m_free_count_delta= remaining_free_count;
+    delta_buffer->m_free_size_delta= remaining_free_size;
+    return delta_buffer;
+  }
+};
+
+#define PFS_MEMORY_STAT_INITIALIZER { false, 0, 0, 0, 0, 0, 0, 0, 0}
+
+/** Connection statistics. */
 struct PFS_connection_stat
 {
   PFS_connection_stat()
diff --git a/storage/perfschema/pfs_status.cc b/storage/perfschema/pfs_status.cc
new file mode 100644
index 00000000000..50f361db083
--- /dev/null
+++ b/storage/perfschema/pfs_status.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_status.cc
+  Status variables statistics (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_status.h"
+#include "pfs_atomic.h"
+#include "pfs_buffer_container.h"
+
+#include "sql_show.h" /* reset_status_vars */
+
+PFS_status_stats::PFS_status_stats()
+{
+  reset();
+}
+
+void PFS_status_stats::reset()
+{
+  m_has_stats= false;
+  memset(&m_stats, 0, sizeof(m_stats));
+}
+
+void PFS_status_stats::aggregate(const PFS_status_stats *from)
+{
+  if (from->m_has_stats)
+  {
+    m_has_stats= true;
+    for (int i= 0; i < COUNT_GLOBAL_STATUS_VARS; i++)
+    {
+      m_stats[i] += from->m_stats[i];
+    }
+  }
+}
+
+void PFS_status_stats::aggregate_from(const STATUS_VAR *from)
+{
+  ulong *from_var= (ulong*) from;
+
+  m_has_stats= true;
+  for (int i= 0;
+       i < COUNT_GLOBAL_STATUS_VARS;
+       i++, from_var++)
+  {
+    m_stats[i] += *from_var;
+  }
+}
+
+void PFS_status_stats::aggregate_to(STATUS_VAR *to)
+{
+  if (m_has_stats)
+  {
+    ulong *to_var= (ulong*) to;
+
+    for (int i= 0;
+         i < COUNT_GLOBAL_STATUS_VARS;
+         i++, to_var++)
+    {
+      *to_var += m_stats[i];
+    }
+  }
+}
+
+static void fct_reset_status_by_thread(PFS_thread *thread)
+{
+  PFS_account *account;
+  PFS_user *user;
+  PFS_host *host;
+
+  if (thread->m_lock.is_populated())
+  {
+    account= sanitize_account(thread->m_account);
+    user= sanitize_user(thread->m_user);
+    host= sanitize_host(thread->m_host);
+    aggregate_thread_status(thread, account, user, host);
+  }
+}
+
+/** Reset table STATUS_BY_THREAD data. */
+void reset_status_by_thread()
+{
+  global_thread_container.apply_all(fct_reset_status_by_thread);
+}
+
+static void fct_reset_status_by_account(PFS_account *account)
+{
+  PFS_user *user;
+  PFS_host *host;
+
+  if (account->m_lock.is_populated())
+  {
+    user= sanitize_user(account->m_user);
+    host= sanitize_host(account->m_host);
+    account->aggregate_status(user, host);
+  }
+}
+
+/** Reset table STATUS_BY_ACCOUNT data. */
+void reset_status_by_account()
+{
+  global_account_container.apply_all(fct_reset_status_by_account);
+}
+
+static void fct_reset_status_by_user(PFS_user *user)
+{
+  if (user->m_lock.is_populated())
+    user->aggregate_status();
+}
+
+/** Reset table STATUS_BY_USER data. */
+void reset_status_by_user()
+{
+  global_user_container.apply_all(fct_reset_status_by_user);
+}
+
+static void fct_reset_status_by_host(PFS_host *host)
+{
+  if (host->m_lock.is_populated())
+    host->aggregate_status();
+}
+
+/** Reset table STATUS_BY_HOST data. */
+void reset_status_by_host()
+{
+  global_host_container.apply_all(fct_reset_status_by_host);
+}
+
+/** Reset table GLOBAL_STATUS data. */
+void reset_global_status()
+{
+  /*
+    Do not memset global_status_var,
+    NO_FLUSH counters need to be preserved
+  */
+  reset_status_vars();
+}
+
diff --git a/storage/perfschema/pfs_status.h b/storage/perfschema/pfs_status.h
new file mode 100644
index 00000000000..b29791420ad
--- /dev/null
+++ b/storage/perfschema/pfs_status.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_STATUS_H
+#define PFS_STATUS_H
+
+/**
+  @file storage/perfschema/pfs_status.h
+  Status variables statistics (declarations).
+*/
+
+struct PFS_status_stats
+{
+  PFS_status_stats();
+
+  void reset();
+  void aggregate(const PFS_status_stats *from);
+  void aggregate_from(const STATUS_VAR *from);
+  void aggregate_to(STATUS_VAR *to);
+
+  bool m_has_stats;
+  ulong m_stats[COUNT_GLOBAL_STATUS_VARS];
+};
+
+void reset_status_by_thread();
+void reset_status_by_account();
+void reset_status_by_user();
+void reset_status_by_host();
+void reset_global_status();
+
+#endif
+
diff --git a/storage/perfschema/pfs_timer.cc b/storage/perfschema/pfs_timer.cc
index 8533dffcb27..c5a0bdf2262 100644
--- a/storage/perfschema/pfs_timer.cc
+++ b/storage/perfschema/pfs_timer.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -33,6 +33,7 @@ enum_timer_name idle_timer= TIMER_NAME_MICROSEC;
 enum_timer_name wait_timer= TIMER_NAME_CYCLE;
 enum_timer_name stage_timer= TIMER_NAME_NANOSEC;
 enum_timer_name statement_timer= TIMER_NAME_NANOSEC;
+enum_timer_name transaction_timer= TIMER_NAME_NANOSEC;
 
 static ulonglong cycle_v0;
 static ulonglong nanosec_v0;
@@ -174,30 +175,35 @@ void init_timers(void)
     /* Normal case. */
     stage_timer= TIMER_NAME_NANOSEC;
     statement_timer= TIMER_NAME_NANOSEC;
+    transaction_timer= TIMER_NAME_NANOSEC;
   }
   else if (microsec_to_pico != 0)
   {
     /* Windows. */
     stage_timer= TIMER_NAME_MICROSEC;
     statement_timer= TIMER_NAME_MICROSEC;
+    transaction_timer= TIMER_NAME_MICROSEC;
   }
   else if (millisec_to_pico != 0)
   {
     /* Robustness, no known cases. */
     stage_timer= TIMER_NAME_MILLISEC;
     statement_timer= TIMER_NAME_MILLISEC;
+    transaction_timer= TIMER_NAME_MILLISEC;
   }
   else if (tick_to_pico != 0)
   {
     /* Robustness, no known cases. */
     stage_timer= TIMER_NAME_TICK;
     statement_timer= TIMER_NAME_TICK;
+    transaction_timer= TIMER_NAME_TICK;
   }
   else
   {
     /* Robustness, no known cases. */
     stage_timer= TIMER_NAME_CYCLE;
     statement_timer= TIMER_NAME_CYCLE;
+    transaction_timer= TIMER_NAME_CYCLE;
   }
 
   /*
@@ -243,7 +249,7 @@ ulonglong get_timer_raw_value(enum_timer_name timer_name)
   case TIMER_NAME_TICK:
     return my_timer_ticks();
   default:
-    DBUG_ASSERT(false);
+    assert(false);
   }
   return 0;
 }
@@ -269,7 +275,7 @@ ulonglong get_timer_raw_value_and_function(enum_timer_name timer_name, timer_fct
     return my_timer_ticks();
   default:
     *fct= NULL;
-    DBUG_ASSERT(false);
+    assert(false);
   }
   return 0;
 }
@@ -297,7 +303,7 @@ ulonglong get_timer_pico_value(enum_timer_name timer_name)
     break;
   default:
     result= 0;
-    DBUG_ASSERT(false);
+    assert(false);
   }
   return result;
 }
@@ -306,8 +312,8 @@ time_normalizer* time_normalizer::get(enum_timer_name timer_name)
 {
   uint index= static_cast<uint> (timer_name);
 
-  DBUG_ASSERT(index >= FIRST_TIMER_NAME);
-  DBUG_ASSERT(index <= LAST_TIMER_NAME);
+  assert(index >= FIRST_TIMER_NAME);
+  assert(index <= LAST_TIMER_NAME);
 
   return & to_pico_data[index];
 }
diff --git a/storage/perfschema/pfs_timer.h b/storage/perfschema/pfs_timer.h
index 222a7f00fae..e9387c110ee 100644
--- a/storage/perfschema/pfs_timer.h
+++ b/storage/perfschema/pfs_timer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -106,6 +106,11 @@ extern enum_timer_name stage_timer;
 */
 extern enum_timer_name statement_timer;
 /**
+  Transaction timer.
+  The timer used to measure all transaction events.
+*/
+extern enum_timer_name transaction_timer;
+/**
   Timer information data.
   Characteristics about each supported timer.
 */
diff --git a/storage/perfschema/pfs_user.cc b/storage/perfschema/pfs_user.cc
index 14b86e1478e..d7dfda4efc2 100644
--- a/storage/perfschema/pfs_user.cc
+++ b/storage/perfschema/pfs_user.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -34,21 +34,13 @@
 #include "pfs_user.h"
 #include "pfs_global.h"
 #include "pfs_instr_class.h"
+#include "pfs_buffer_container.h"
 
 /**
   @addtogroup Performance_schema_buffers
   @{
 */
 
-ulong user_max;
-ulong user_lost;
-
-PFS_user *user_array= NULL;
-
-static PFS_single_stat *user_instr_class_waits_array= NULL;
-static PFS_stage_stat *user_instr_class_stages_array= NULL;
-static PFS_statement_stat *user_instr_class_statements_array= NULL;
-
 LF_HASH user_hash;
 static bool user_hash_inited= false;
 
@@ -59,59 +51,8 @@ static bool user_hash_inited= false;
 */
 int init_user(const PFS_global_param *param)
 {
-  uint index;
-
-  user_max= param->m_user_sizing;
-
-  user_array= NULL;
-  user_instr_class_waits_array= NULL;
-  user_instr_class_stages_array= NULL;
-  user_instr_class_statements_array= NULL;
-  uint waits_sizing= user_max * wait_class_max;
-  uint stages_sizing= user_max * stage_class_max;
-  uint statements_sizing= user_max * statement_class_max;
-
-  if (user_max > 0)
-  {
-    user_array= PFS_MALLOC_ARRAY(user_max, sizeof(PFS_user), PFS_user,
-                                 MYF(MY_ZEROFILL));
-    if (unlikely(user_array == NULL))
-      return 1;
-  }
-
-  if (waits_sizing > 0)
-  {
-    user_instr_class_waits_array=
-      PFS_connection_slice::alloc_waits_slice(waits_sizing);
-    if (unlikely(user_instr_class_waits_array == NULL))
-      return 1;
-  }
-
-  if (stages_sizing > 0)
-  {
-    user_instr_class_stages_array=
-      PFS_connection_slice::alloc_stages_slice(stages_sizing);
-    if (unlikely(user_instr_class_stages_array == NULL))
-      return 1;
-  }
-
-  if (statements_sizing > 0)
-  {
-    user_instr_class_statements_array=
-      PFS_connection_slice::alloc_statements_slice(statements_sizing);
-    if (unlikely(user_instr_class_statements_array == NULL))
-      return 1;
-  }
-
-  for (index= 0; index < user_max; index++)
-  {
-    user_array[index].m_instr_class_waits_stats=
-      &user_instr_class_waits_array[index * wait_class_max];
-    user_array[index].m_instr_class_stages_stats=
-      &user_instr_class_stages_array[index * stage_class_max];
-    user_array[index].m_instr_class_statements_stats=
-      &user_instr_class_statements_array[index * statement_class_max];
-  }
+  if (global_user_container.init(param->m_user_sizing))
+    return 1;
 
   return 0;
 }
@@ -119,15 +60,7 @@ int init_user(const PFS_global_param *param)
 /** Cleanup all the user buffers. */
 void cleanup_user(void)
 {
-  pfs_free(user_array);
-  user_array= NULL;
-  pfs_free(user_instr_class_waits_array);
-  user_instr_class_waits_array= NULL;
-  pfs_free(user_instr_class_stages_array);
-  user_instr_class_stages_array= NULL;
-  pfs_free(user_instr_class_statements_array);
-  user_instr_class_statements_array= NULL;
-  user_max= 0;
+  global_user_container.cleanup();
 }
 
 C_MODE_START
@@ -138,9 +71,9 @@ static uchar *user_hash_get_key(const uchar *entry, size_t *length,
   const PFS_user *user;
   const void *result;
   typed_entry= reinterpret_cast<const PFS_user* const *> (entry);
-  DBUG_ASSERT(typed_entry != NULL);
+  assert(typed_entry != NULL);
   user= *typed_entry;
-  DBUG_ASSERT(user != NULL);
+  assert(user != NULL);
   *length= user->m_key.m_key_length;
   result= user->m_key.m_hash_key;
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
@@ -151,13 +84,12 @@ C_MODE_END
   Initialize the user hash.
   @return 0 on success
 */
-int init_user_hash(void)
+int init_user_hash(const PFS_global_param *param)
 {
-  if ((! user_hash_inited) && (user_max > 0))
+  if ((! user_hash_inited) && (param->m_user_sizing != 0))
   {
     lf_hash_init(&user_hash, sizeof(PFS_user*), LF_HASH_UNIQUE,
                  0, 0, user_hash_get_key, &my_charset_bin);
-    /* user_hash.size= user_max; */
     user_hash_inited= true;
   }
   return 0;
@@ -187,7 +119,7 @@ static LF_PINS* get_user_hash_pins(PFS_thread *thread)
 static void set_user_key(PFS_user_key *key,
                          const char *user, uint user_length)
 {
-  DBUG_ASSERT(user_length <= USERNAME_LENGTH);
+  assert(user_length <= USERNAME_LENGTH);
 
   char *ptr= &key->m_hash_key[0];
   if (user_length > 0)
@@ -204,16 +136,10 @@ PFS_user *
 find_or_create_user(PFS_thread *thread,
                     const char *username, uint username_length)
 {
-  if (user_max == 0)
-  {
-    user_lost++;
-    return NULL;
-  }
-
   LF_PINS *pins= get_user_hash_pins(thread);
   if (unlikely(pins == NULL))
   {
-    user_lost++;
+    global_user_container.m_lost++;
     return NULL;
   }
 
@@ -221,8 +147,10 @@ find_or_create_user(PFS_thread *thread,
   set_user_key(&key, username, username_length);
 
   PFS_user **entry;
+  PFS_user *pfs;
   uint retry_count= 0;
   const uint retry_max= 3;
+  pfs_dirty_state dirty_state;
 
 search:
   entry= reinterpret_cast<PFS_user**>
@@ -230,7 +158,6 @@ search:
                     key.m_hash_key, key.m_key_length));
   if (entry && (entry != MY_ERRPTR))
   {
-    PFS_user *pfs;
     pfs= *entry;
     pfs->inc_refcount();
     lf_hash_search_unpin(pins);
@@ -239,68 +166,55 @@ search:
 
   lf_hash_search_unpin(pins);
 
-  PFS_scan scan;
-  uint random= randomized_index(username, user_max);
-
-  for (scan.init(random, user_max);
-       scan.has_pass();
-       scan.next_pass())
+  pfs= global_user_container.allocate(& dirty_state);
+  if (pfs != NULL)
   {
-    PFS_user *pfs= user_array + scan.first();
-    PFS_user *pfs_last= user_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    pfs->m_key= key;
+    if (username_length > 0)
+      pfs->m_username= &pfs->m_key.m_hash_key[0];
+    else
+      pfs->m_username= NULL;
+    pfs->m_username_length= username_length;
+
+    pfs->init_refcount();
+    pfs->reset_stats();
+    pfs->m_disconnected_count= 0;
+
+    int res;
+    pfs->m_lock.dirty_to_allocated(& dirty_state);
+    res= lf_hash_insert(&user_hash, pins, &pfs);
+    if (likely(res == 0))
     {
-      if (pfs->m_lock.is_free())
+      return pfs;
+    }
+
+    global_user_container.deallocate(pfs);
+
+    if (res > 0)
+    {
+      if (++retry_count > retry_max)
       {
-        if (pfs->m_lock.free_to_dirty())
-        {
-          pfs->m_key= key;
-          if (username_length > 0)
-            pfs->m_username= &pfs->m_key.m_hash_key[0];
-          else
-            pfs->m_username= NULL;
-          pfs->m_username_length= username_length;
-
-          pfs->init_refcount();
-          pfs->reset_stats();
-          pfs->m_disconnected_count= 0;
-
-          int res;
-          res= lf_hash_insert(&user_hash, pins, &pfs);
-          if (likely(res == 0))
-          {
-            pfs->m_lock.dirty_to_allocated();
-            return pfs;
-          }
-
-          pfs->m_lock.dirty_to_free();
-
-          if (res > 0)
-          {
-            if (++retry_count > retry_max)
-            {
-              user_lost++;
-              return NULL;
-            }
-            goto search;
-          }
-
-          user_lost++;
-          return NULL;
-        }
+        global_user_container.m_lost++;
+        return NULL;
       }
+      goto search;
     }
+
+    global_user_container.m_lost++;
+    return NULL;
   }
 
-  user_lost++;
   return NULL;
 }
 
-void PFS_user::aggregate()
+void PFS_user::aggregate(bool alive)
 {
   aggregate_waits();
   aggregate_stages();
   aggregate_statements();
+  aggregate_transactions();
+  aggregate_memory(alive);
+  aggregate_status();
   aggregate_stats();
 }
 
@@ -322,6 +236,24 @@ void PFS_user::aggregate_statements()
   reset_statements_stats();
 }
 
+void PFS_user::aggregate_transactions()
+{
+  /* No parent to aggregate to, clean the stats */
+  reset_transactions_stats();
+}
+
+void PFS_user::aggregate_memory(bool alive)
+{
+  /* No parent to aggregate to, rebase the memory stats ('alive' is unused here) */
+  rebase_memory_stats();
+}
+
+void PFS_user::aggregate_status()
+{
+  /* No parent to aggregate to, clean the stats */
+  reset_status_stats();
+}
+
 void PFS_user::aggregate_stats()
 {
   /* No parent to aggregate to, clean the stats */
@@ -333,12 +265,20 @@ void PFS_user::release()
   dec_refcount();
 }
 
+void PFS_user::carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index)
+{                                           /* Apply 'delta' to this user's memory stat at 'index'. */
+  PFS_memory_stat *event_name_array;
+  PFS_memory_stat *stat;
+  PFS_memory_stat_delta delta_buffer;       /* scratch output for apply_delta() */
+
+  event_name_array= write_instr_class_memory_stats();
+  stat= & event_name_array[index];          /* 'index' is not range-checked here */
+  (void) stat->apply_delta(delta, &delta_buffer);  /* returned value is discarded */
+}
+
 PFS_user *sanitize_user(PFS_user *unsafe)
 {
-  if ((&user_array[0] <= unsafe) &&
-      (unsafe < &user_array[user_max]))
-    return unsafe;
-  return NULL;
+  return global_user_container.sanitize(unsafe);
 }
 
 void purge_user(PFS_thread *thread, PFS_user *user)
@@ -353,18 +293,38 @@ void purge_user(PFS_thread *thread, PFS_user *user)
                     user->m_key.m_hash_key, user->m_key.m_key_length));
   if (entry && (entry != MY_ERRPTR))
   {
-    DBUG_ASSERT(*entry == user);
+    assert(*entry == user);
     if (user->get_refcount() == 0)
     {
       lf_hash_delete(&user_hash, pins,
                      user->m_key.m_hash_key, user->m_key.m_key_length);
-      user->m_lock.allocated_to_free();
+      user->aggregate(false);
+      global_user_container.deallocate(user);
     }
   }
 
   lf_hash_search_unpin(pins);
 }
 
+class Proc_purge_user                       /* Functor given to global_user_container.apply() by purge_all_user() */
+  : public PFS_buffer_processor<PFS_user>
+{
+public:
+  Proc_purge_user(PFS_thread *thread)
+    : m_thread(thread)
+  {}
+
+  virtual void operator()(PFS_user *pfs)
+  {
+    pfs->aggregate(true);                   /* always aggregate, passing alive= true */
+    if (pfs->get_refcount() == 0)           /* purge only users with a zero refcount */
+      purge_user(m_thread, pfs);
+  }
+
+private:
+  PFS_thread *m_thread;                     /* thread handed through to purge_user() */
+};
+
 /** Purge non connected users, reset stats of connected users. */
 void purge_all_user(void)
 {
@@ -372,18 +332,8 @@ void purge_all_user(void)
   if (unlikely(thread == NULL))
     return;
 
-  PFS_user *pfs= user_array;
-  PFS_user *pfs_last= user_array + user_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-  {
-    if (pfs->m_lock.is_populated())
-    {
-      pfs->aggregate();
-      if (pfs->get_refcount() == 0)
-        purge_user(thread, pfs);
-    }
-  }
+  Proc_purge_user proc(thread);
+  global_user_container.apply(proc);
 }
 
 /** @} */
diff --git a/storage/perfschema/pfs_user.h b/storage/perfschema/pfs_user.h
index 3d0457eae59..58c95ad720e 100644
--- a/storage/perfschema/pfs_user.h
+++ b/storage/perfschema/pfs_user.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -40,6 +40,7 @@ struct PFS_thread;
   @{
 */
 
+/** Hash key for a user. */
 struct PFS_user_key
 {
   /**
@@ -51,6 +52,7 @@ struct PFS_user_key
   uint m_key_length;
 };
 
+/** Per user statistics. */
 struct PFS_ALIGNED PFS_user : public PFS_connection_slice
 {
 public:
@@ -74,13 +76,18 @@ public:
     PFS_atomic::add_32(& m_refcount, -1);
   }
 
-  void aggregate(void);
+  void aggregate(bool alive);
   void aggregate_waits(void);
   void aggregate_stages(void);
   void aggregate_statements(void);
+  void aggregate_transactions(void);
+  void aggregate_memory(bool alive);
+  void aggregate_status(void);
   void aggregate_stats(void);
   void release(void);
 
+  void carry_memory_stat_delta(PFS_memory_stat_delta *delta, uint index);
+
   /** Internal lock. */
   pfs_lock m_lock;
   PFS_user_key m_key;
@@ -95,7 +102,7 @@ private:
 
 int init_user(const PFS_global_param *param);
 void cleanup_user(void);
-int init_user_hash(void);
+int init_user_hash(const PFS_global_param *param);
 void cleanup_user_hash(void);
 
 PFS_user *
@@ -106,14 +113,7 @@ PFS_user *sanitize_user(PFS_user *unsafe);
 void purge_all_user(void);
 
 
-/* For iterators and show status. */
-
-extern ulong user_max;
-extern ulong user_lost;
-
-/* Exposing the data directly, for iterators. */
-
-extern PFS_user *user_array;
+/* For show status. */
 
 extern LF_HASH user_hash;
 
diff --git a/storage/perfschema/pfs_variable.cc b/storage/perfschema/pfs_variable.cc
new file mode 100644
index 00000000000..17b7dfc200c
--- /dev/null
+++ b/storage/perfschema/pfs_variable.cc
@@ -0,0 +1,1291 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+   Copyright (c) 2020, MariaDB Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
+
+/**
+  @file storage/perfschema/pfs_variable.cc
+  Performance schema system variable and status variable (implementation).
+*/
+#include "sql_plugin.h"
+#include "pfs_variable.h"
+#include "my_sys.h"
+#include "debug_sync.h"
+#include "pfs.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "sql_audit.h"                      // audit_global_variable_get
+
+static inline SHOW_SCOPE show_scope_from_type(enum enum_mysql_show_type type)
+{
+  switch(type) {                            /* map a SHOW_VAR type to a SHOW_SCOPE */
+    case SHOW_BOOL:
+    case SHOW_CHAR:
+    case SHOW_CHAR_PTR:
+    case SHOW_DOUBLE:
+    case SHOW_HAVE:
+    case SHOW_HA_ROWS:
+    case SHOW_KEY_CACHE_LONG:
+    case SHOW_LEX_STRING:
+    case SHOW_LONG_NOFLUSH:
+    case SHOW_MY_BOOL:
+    case SHOW_SINT:
+    case SHOW_SLONG:
+    case SHOW_SLONGLONG:
+    case SHOW_SYS:
+    case SHOW_UINT:
+    case SHOW_ULONG:
+    case SHOW_ULONGLONG:
+      return SHOW_SCOPE_GLOBAL;
+
+    case SHOW_DOUBLE_STATUS:
+    case SHOW_LONGLONG_STATUS:
+    case SHOW_LONG_STATUS:
+      return SHOW_SCOPE_ALL;
+
+    case SHOW_ARRAY:
+    case SHOW_FUNC:
+    case SHOW_SIMPLE_FUNC:
+    case SHOW_UNDEF:
+    default:                                /* any type not listed above has no scope */
+      return SHOW_SCOPE_UNDEF;
+  }
+  return SHOW_SCOPE_UNDEF;                  /* not reached: every switch path returns */
+}
+
+
+/**
+  CLASS PFS_system_variable_cache
+*/
+
+/**
+  Build a sorted list of all system variables from the system variable hash,
+  filtered by scope ('strict' is unused). Call inside LOCK_plugin_delete. Returns true.
+*/
+bool PFS_system_variable_cache::init_show_var_array(enum_var_type scope, bool strict)
+{
+  assert(!m_initialized);
+  m_query_scope= scope;
+
+  mysql_prlock_rdlock(&LOCK_system_variables_hash);
+  DEBUG_SYNC(m_current_thd, "acquired_LOCK_system_variables_hash");
+
+  /* Record the system variable hash version to detect subsequent changes. */
+  m_version= get_system_variable_hash_version();
+
+  /* Build the SHOW_VAR array; a NULL list (e.g. failed allocation) leaves it empty. */
+  SHOW_VAR *vars= enumerate_sys_vars(m_current_thd, true, m_query_scope/*, strict */);
+  m_show_var_array.reserve(get_system_variable_hash_records());
+  for (int i=0; vars && vars[i].name; i++)
+    m_show_var_array.set(i, vars[i]);
+
+  mysql_prlock_unlock(&LOCK_system_variables_hash);
+
+  /* Increase cache size if necessary. */
+  m_cache.reserve(m_show_var_array.elements());
+
+  m_initialized= true;
+  return true;
+}
+
+/**
+  Build the SHOW_VAR array for SESSION scope while holding LOCK_plugin_delete.
+  Returns the result of init_show_var_array().
+*/
+bool PFS_system_variable_cache::do_initialize_session(void)
+{
+  /* Block plugins from unloading. */
+  mysql_mutex_lock(&LOCK_plugin_delete);
+
+  /* Build the array. */
+  bool ret= init_show_var_array(OPT_SESSION, true);
+
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return ret;
+}
+
+/**
+  Return true if a variable with the given sys_var scope is visible under m_query_scope.
+*/
+bool PFS_system_variable_cache::match_scope(int scope)
+{
+  switch (scope)
+  {
+    case sys_var::GLOBAL:                   /* only matches a GLOBAL query */
+      return m_query_scope == OPT_GLOBAL;
+      break;
+
+    case sys_var::SESSION:                  /* matches both GLOBAL and SESSION queries */
+      return (m_query_scope == OPT_GLOBAL || m_query_scope == OPT_SESSION);
+      break;
+
+    case sys_var::ONLY_SESSION:             /* only matches a SESSION query */
+      return m_query_scope == OPT_SESSION;
+      break;
+
+    default:
+      return false;
+      break;
+  }
+  return false;                             /* not reached: every switch path returns */
+}
+
+/**
+  Build a GLOBAL system variable cache. Always returns 0.
+*/
+int PFS_system_variable_cache::do_materialize_global(void)
+{
+  /* Block system variable additions or deletions. */
+  mysql_mutex_lock(&LOCK_global_system_variables);
+
+  m_materialized= false;
+
+  /*
+     Build array of SHOW_VARs from system variable hash, unless the caller
+     initialized it externally. NOTE(review): only LOCK_global_system_variables
+     is held here, not LOCK_plugin_delete -- confirm this is sufficient.
+   */
+  if (!m_external_init)
+    init_show_var_array(OPT_GLOBAL, true);
+
+  /* Resolve each SHOW_VAR and add it to the cache; test the end bound before dereferencing. */
+  for (SHOW_VAR *show_var= m_show_var_array.front();
+       (show_var != m_show_var_array.end()) && show_var->value; show_var++)
+  {
+    const char* name= show_var->name;
+    sys_var *value= (sys_var *)show_var->value;
+    assert(value);
+
+    if ((m_query_scope == OPT_GLOBAL) &&
+        (!my_strcasecmp(system_charset_info, name, "sql_log_bin")))
+    {
+      /*
+        PLEASE READ:
+        http://dev.mysql.com/doc/relnotes/mysql/5.7/en/news-5-7-6.html
+
+        SQL_LOG_BIN is:
+        - declared in sys_vars.cc as both GLOBAL and SESSION in 5.7
+        - impossible to SET with SET GLOBAL (raises an error)
+        - and yet can be read with @@global.sql_log_bin
+
+        When show_compatibility_56 = ON,
+        - SHOW GLOBAL VARIABLES does expose a row for SQL_LOG_BIN
+        - INFORMATION_SCHEMA.GLOBAL_VARIABLES also does expose a row,
+        both are for backward compatibility of existing applications,
+        so that no application logic change is required.
+
+        Now, with show_compatibility_56 = OFF (aka, in this code)
+        - SHOW GLOBAL VARIABLES does -- not -- expose a row for SQL_LOG_BIN
+        - PERFORMANCE_SCHEMA.GLOBAL_VARIABLES also does -- not -- expose a row
+        so that a clean interface is exposed to (upgraded and modified) applications.
+
+        The assert below will fail once SQL_LOG_BIN really is defined
+        as SESSION_ONLY (in 5.8), so that this special case can be removed.
+      */
+      assert(value->scope() == sys_var::SESSION);
+      continue;
+    }
+
+    /* Match the system variable scope to the target scope. */
+    if (match_scope(value->scope()))
+    {
+      /* Resolve value, convert to text, add to cache. */
+      System_variable system_var(m_current_thd, show_var, m_query_scope, false);
+      m_cache.push(system_var);
+    }
+  }
+
+  m_materialized= true;
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  return 0;
+}
+
+/**
+  Build a GLOBAL and SESSION system variable cache. Returns 0 on success, 1 if get_THD() returns NULL.
+*/
+int PFS_system_variable_cache::do_materialize_all(THD *unsafe_thd)
+{
+  int ret= 1;
+
+  m_unsafe_thd= unsafe_thd;
+  m_safe_thd= NULL;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Block plugins from unloading. */
+  mysql_mutex_lock(&LOCK_plugin_delete);
+
+  /*
+     Build array of SHOW_VARs from system variable hash. Do this within
+     LOCK_plugin_delete to ensure that the hash table remains unchanged
+     while this thread is materialized.
+   */
+  if (!m_external_init)
+    init_show_var_array(OPT_SESSION, false);
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(unsafe_thd)) != NULL)
+  {
+    DEBUG_SYNC(m_current_thd, "materialize_session_variable_array_THD_locked");
+    for (SHOW_VAR *show_var= m_show_var_array.front();
+         (show_var != m_show_var_array.end()) && show_var->value; show_var++)
+    {
+      /* End bound is tested first above. Resolve value, convert to text, add to cache. */
+      System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
+      m_cache.push(system_var);
+    }
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return ret;
+}
+
+/**
+  Initialize the temporary mem_root on first use, then make current_thd->mem_root point to it.
+*/
+void PFS_system_variable_cache::set_mem_root(void)
+{
+  if (m_mem_sysvar_ptr == NULL)
+  {
+    init_sql_alloc(PSI_INSTRUMENT_ME, &m_mem_sysvar, SYSVAR_MEMROOT_BLOCK_SIZE, 0, 0);
+    m_mem_sysvar_ptr= &m_mem_sysvar;
+  }
+  m_mem_thd= &current_thd->mem_root;      /* pointer to current THD mem_root */
+  m_mem_thd_save= *m_mem_thd;             /* saved so it can be restored later */
+  *m_mem_thd= &m_mem_sysvar;              /* use temporary mem_root */
+}
+
+/**
+  Mark memory blocks in the temporary mem_root as free (MY_MARK_BLOCKS_FREE).
+  Restore THD::mem_root. Does nothing if m_mem_sysvar_ptr is NULL.
+*/
+void PFS_system_variable_cache::clear_mem_root(void)
+{
+  if (m_mem_sysvar_ptr)
+  {
+    free_root(&m_mem_sysvar, MYF(MY_MARK_BLOCKS_FREE));
+    *m_mem_thd= m_mem_thd_save;          /* restore original mem_root; m_mem_thd is not NULL-checked here */
+    m_mem_thd= NULL;
+    m_mem_thd_save= NULL;
+  }
+}
+
+/**
+  Free the temporary mem_root and reset m_mem_sysvar_ptr.
+  Restore THD::mem_root if both the THD pointer and the saved value are still set.
+*/
+void PFS_system_variable_cache::free_mem_root(void)
+{
+  if (m_mem_sysvar_ptr)
+  {
+    free_root(&m_mem_sysvar, MYF(0));
+    m_mem_sysvar_ptr= NULL;
+    if (m_mem_thd && m_mem_thd_save)
+    {
+      *m_mem_thd= m_mem_thd_save;       /* restore original mem_root */
+      m_mem_thd= NULL;
+      m_mem_thd_save= NULL;
+    }
+  }
+}
+
+/**
+  Build a SESSION system variable cache for a pfs_thread.
+  Requires that init_show_var_array() has already been called.
+  Return 0 for success, 1 if get_THD() returns NULL.
+*/
+int PFS_system_variable_cache::do_materialize_session(PFS_thread *pfs_thread)
+{
+  int ret= 1;
+
+  m_pfs_thread= pfs_thread;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Block plugins from unloading. */
+  mysql_mutex_lock(&LOCK_plugin_delete);
+
+  /* The SHOW_VAR array must be initialized externally. */
+  assert(m_initialized);
+
+  /* Use a temporary mem_root to avoid depleting THD mem_root. */
+  if (m_use_mem_root)
+    set_mem_root();
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(pfs_thread)) != NULL)
+  {
+    for (SHOW_VAR *show_var= m_show_var_array.front();
+         (show_var != m_show_var_array.end()) && show_var->value; show_var++)
+    {
+      sys_var *value= (sys_var *)show_var->value;
+
+      /* Match the system variable scope to the target scope. */
+      if (match_scope(value->scope()))
+      {
+        /* Resolve value, convert to text, add to cache. */
+        System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
+        m_cache.push(system_var);
+      }
+    }
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  /* Mark mem_root blocks as free. */
+  if (m_use_mem_root)
+    clear_mem_root();
+
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return ret;
+}
+
+/**
+  Materialize a single system variable for a pfs_thread.
+  Requires that init_show_var_array() has already been called.
+  Return 0 for success, 1 if get_THD() returns NULL.
+*/
+int PFS_system_variable_cache::do_materialize_session(PFS_thread *pfs_thread, uint index)
+{
+  int ret= 1;
+
+  m_pfs_thread= pfs_thread;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Block plugins from unloading. */
+  mysql_mutex_lock(&LOCK_plugin_delete);
+
+  /* The SHOW_VAR array must be initialized externally. */
+  assert(m_initialized);
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(pfs_thread)) != NULL)
+  {
+    SHOW_VAR *show_var= &m_show_var_array.at(index);
+
+    if (show_var && (show_var != m_show_var_array.end()) &&
+        show_var->value)                  /* end bound is tested before dereferencing */
+    {
+      sys_var *value= (sys_var *)show_var->value;
+
+      /* Match the system variable scope to the target scope. */
+      if (match_scope(value->scope()))
+      {
+        /* Resolve value, convert to text, add to cache. */
+        System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
+        m_cache.push(system_var);
+      }
+    }
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return ret;
+}
+
+/**
+  Build a SESSION system variable cache for a THD. Returns 0 on success, 1 if get_THD() returns NULL.
+*/
+int PFS_system_variable_cache::do_materialize_session(THD *unsafe_thd)
+{
+  int ret= 1;
+
+  m_unsafe_thd= unsafe_thd;
+  m_safe_thd= NULL;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Block plugins from unloading. */
+  mysql_mutex_lock(&LOCK_plugin_delete);
+
+  /*
+     Build array of SHOW_VARs from system variable hash. Do this within
+     LOCK_plugin_delete to ensure that the hash table remains unchanged
+     while this thread is materialized.
+   */
+  if (!m_external_init)
+    init_show_var_array(OPT_SESSION, true);
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(unsafe_thd)) != NULL)
+  {
+    for (SHOW_VAR *show_var= m_show_var_array.front();
+         (show_var != m_show_var_array.end()) && show_var->value; show_var++)
+    {
+      sys_var *value = (sys_var *)show_var->value;
+
+      /* Match the system variable scope to the target scope. */
+      if (match_scope(value->scope()))
+      {
+        /* Resolve value, convert to text, add to cache. */
+        System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
+        m_cache.push(system_var);
+      }
+    }
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return ret;
+}
+
+
+/**
+  CLASS System_variable
+*/
+
+/**
+  Empty placeholder: SHOW_UNDEF type, empty value string, m_initialized= false.
+*/
+System_variable::System_variable()
+  : m_name(NULL), m_name_length(0), m_value_length(0), m_type(SHOW_UNDEF), m_scope(0),
+    m_ignore(false), m_charset(NULL), m_initialized(false)
+{
+  m_value_str[0]= '\0';
+}
+
+/**
+  GLOBAL or SESSION system variable; members are defaulted, then init() is called to fill them in.
+*/
+System_variable::System_variable(THD *target_thd, const SHOW_VAR *show_var,
+                                 enum_var_type query_scope, bool ignore)
+  : m_name(NULL), m_name_length(0), m_value_length(0), m_type(SHOW_UNDEF), m_scope(0),
+    m_ignore(ignore), m_charset(NULL), m_initialized(false)
+{
+  init(target_thd, show_var, query_scope);
+}
+
+/**
+  Get sys_var value from global or local source then convert to string; no-op if show_var or its name is NULL.
+*/
+void System_variable::init(THD *target_thd, const SHOW_VAR *show_var,
+                           enum_var_type query_scope)
+{
+  if (show_var == NULL || show_var->name == NULL)
+    return;
+
+  DBUG_ASSERT(show_var->type == SHOW_SYS);
+
+  m_name= show_var->name;
+  m_name_length= strlen(m_name);
+
+  /* Deprecated variables are ignored but must still be accounted for. */
+  if (m_ignore)
+  {
+    m_value_str[0]= '\0';
+    m_value_length= 0;
+    m_initialized= true;
+    return;
+  }
+
+  /* NOTE: locking against the target thread is disabled; the code below is commented out. */
+  /*XXX
+  THD *current_thread= current_thd;
+  if (target_thd != current_thread)
+    mysql_mutex_lock(&target_thd->LOCK_thd_sysvar);*/
+
+  sys_var *system_var= (sys_var *)show_var->value;
+  assert(system_var != NULL);
+  m_charset= system_var->charset(target_thd);
+  m_type= system_var->show_type();
+  m_scope= system_var->scope();
+
+  /* Get the value of the system variable; a NULL result is stored as an empty string. */
+  String buf(m_value_str, sizeof(m_value_str) - 1, system_charset_info);
+  if (!system_var->val_str_nolock(&buf, target_thd,
+               system_var->value_ptr(target_thd, query_scope, &null_clex_str)))
+    buf.length(0);
+
+  m_value_length= MY_MIN(buf.length(), SHOW_VAR_FUNC_BUFF_SIZE);
+
+  /* Returned value may reference a string other than m_value_str; copy the (truncated) bytes. */
+  if (buf.ptr() != m_value_str)
+    memcpy(m_value_str, buf.ptr(), m_value_length);
+  m_value_str[m_value_length]= 0;
+
+  /*XXX
+  if (target_thd != current_thread)
+    mysql_mutex_unlock(&target_thd->LOCK_thd_sysvar);*/
+
+  m_initialized= true;
+}
+
+
+/**
+  CLASS PFS_status_variable_cache
+*/
+
+PFS_status_variable_cache::
+PFS_status_variable_cache(bool external_init) :
+                          PFS_variable_cache<Status_variable>(external_init),
+                          m_show_command(false), m_sum_client_status(NULL)
+{
+  /* Determine if the originating query is a SHOW STATUS command (SQLCOM_SHOW_STATUS). */
+  m_show_command= (m_current_thd->lex->sql_command == SQLCOM_SHOW_STATUS);
+}
+
+/**
+  Build cache of SESSION status variables for a user. Returns 1 if pfs_user is NULL or not populated.
+*/
+int PFS_status_variable_cache::materialize_user(PFS_user *pfs_user)
+{
+  if (!pfs_user)
+    return 1;
+
+  if (is_materialized(pfs_user))            /* already cached: nothing to do */
+    return 0;
+
+  if (!pfs_user->m_lock.is_populated())
+    return 1;
+
+  /* Set callback function, then delegate to do_materialize_client(). */
+  m_sum_client_status= sum_user_status;
+  return do_materialize_client((PFS_client *)pfs_user);
+}
+
+/**
+  Build cache of SESSION status variables for a host. Returns 1 if pfs_host is NULL or not populated.
+*/
+int PFS_status_variable_cache::materialize_host(PFS_host *pfs_host)
+{
+  if (!pfs_host)
+    return 1;
+
+  if (is_materialized(pfs_host))            /* already cached: nothing to do */
+    return 0;
+
+  if (!pfs_host->m_lock.is_populated())
+    return 1;
+
+  /* Set callback function, then delegate to do_materialize_client(). */
+  m_sum_client_status= sum_host_status;
+  return do_materialize_client((PFS_client *)pfs_host);
+}
+
+/**
+  Build cache of SESSION status variables for an account. Returns 1 if pfs_account is NULL or not populated.
+*/
+int PFS_status_variable_cache::materialize_account(PFS_account *pfs_account)
+{
+  if (!pfs_account)
+    return 1;
+
+  if (is_materialized(pfs_account))         /* already cached: nothing to do */
+    return 0;
+
+  if (!pfs_account->m_lock.is_populated())
+    return 1;
+
+  /* Set callback function, then delegate to do_materialize_client(). */
+  m_sum_client_status= sum_account_status;
+  return do_materialize_client((PFS_client *)pfs_account);
+}
+/**
+  Compare status variable scope to desired scope; if strict is false, GLOBAL variables also match OPT_SESSION.
+  @param variable_scope         Scope of current status variable
+  @return TRUE if variable matches the query scope
+*/
+bool PFS_status_variable_cache::match_scope(SHOW_SCOPE variable_scope, bool strict)
+{
+  switch (variable_scope)
+  {
+    case SHOW_SCOPE_GLOBAL:
+      return (m_query_scope == OPT_GLOBAL) || (! strict && (m_query_scope == OPT_SESSION));
+      break;
+    case SHOW_SCOPE_SESSION:
+      /* Ignore session-only vars if aggregating by user, host or account. */
+      if (m_aggregate)
+        return false;
+      else
+        return (m_query_scope == OPT_SESSION);
+      break;
+    case SHOW_SCOPE_ALL:
+      return (m_query_scope == OPT_GLOBAL || m_query_scope == OPT_SESSION);
+      break;
+    case SHOW_SCOPE_UNDEF:
+    default:
+      return false;
+      break;
+  }
+  return false;                             /* not reached: every switch path returns */
+}
+
+/**
+  Exclude specific status variables from the query by name or prefix.
+  Return TRUE if variable should be filtered. show_var and its name must not be NULL.
+*/
+bool PFS_status_variable_cache::filter_by_name(const SHOW_VAR *show_var)
+{
+  assert(show_var);
+  assert(show_var->name);
+
+  if (show_var->type == SHOW_ARRAY)
+  {
+    /* The SHOW_ARRAY name is the prefix for the variables in the subarray. */
+    const char *prefix= show_var->name;
+    /* Exclude COM counters (array named "Com", case-insensitive) if not a SHOW STATUS command. */
+    if (!my_strcasecmp(system_charset_info, prefix, "Com") && !m_show_command)
+      return true;
+  }
+  else
+  {
+    /*
+      Slave status resides in Performance Schema replication tables. Exclude
+      these slave status variables from the SHOW STATUS command and from the
+      status tables.
+      Assume null prefix to ensure that only server-defined slave status
+      variables are filtered.
+    */
+    const char *name= show_var->name;
+    if (!my_strcasecmp(system_charset_info, name, "Slave_running") ||
+        !my_strcasecmp(system_charset_info, name, "Slave_retried_transactions") ||
+        !my_strcasecmp(system_charset_info, name, "Slave_last_heartbeat") ||
+        !my_strcasecmp(system_charset_info, name, "Slave_received_heartbeats") ||
+        !my_strcasecmp(system_charset_info, name, "Slave_heartbeat_period"))
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/**
+  Tell whether values of a status variable type can be totaled.
+
+  @param variable_type         Status variable type
+  @return TRUE if variable type can be aggregated
+*/
+bool PFS_status_variable_cache::can_aggregate(enum_mysql_show_type variable_type)
+{
+  /*
+    All server status counters that are totaled across threads are defined in
+    system_status_var as either SHOW_LONGLONG_STATUS or SHOW_LONG_STATUS.
+    These data types are not available to plugins.
+  */
+  if (variable_type == SHOW_LONGLONG_STATUS)
+    return true;
+  if (variable_type == SHOW_LONG_STATUS)
+    return true;
+
+  /*
+    Every other type is rejected, in particular:
+
+    Server and plugin
+      SHOW_UNDEF
+      SHOW_BOOL
+      SHOW_CHAR
+      SHOW_CHAR_PTR
+      SHOW_ARRAY
+      SHOW_FUNC
+      SHOW_INT
+      SHOW_LONG
+      SHOW_LONGLONG
+      SHOW_DOUBLE
+
+    Server only
+      SHOW_HAVE
+      SHOW_MY_BOOL
+      SHOW_SYS
+      SHOW_LEX_STRING
+      SHOW_KEY_CACHE_LONG
+      SHOW_DOUBLE_STATUS
+      SHOW_HA_ROWS
+      SHOW_LONG_NOFLUSH
+      SHOW_SLONG
+  */
+  return false;
+}
+
+/**
+  Apply all exclusion rules to one status variable.
+  Return TRUE if the variable should be excluded.
+*/
+bool PFS_status_variable_cache::filter_show_var(const SHOW_VAR *show_var, bool strict)
+{
+  /* Rule 1: the variable scope must be compatible with the query scope. */
+  const bool scope_matches= match_scope(show_scope_from_type(show_var->type), strict);
+  if (!scope_matches)
+    return true;
+
+  /*
+    Rule 2: variables excluded by name or prefix.
+    Rule 3: when aggregating for a user, host or account, only variables
+    having an aggregatable type are kept.
+  */
+  const bool hidden= filter_by_name(show_var) ||
+                     (m_aggregate && !can_aggregate(show_var->type));
+  return hidden;
+}
+
+
+/**
+  Build m_show_var_array from the global status array for the given query
+  scope. Expand nested subarrays, filter unwanted variables. Returns true.
+  NOTE: Must be done inside of LOCK_status to guard against plugin load/unload.
+*/
+bool PFS_status_variable_cache::init_show_var_array(enum_var_type scope, bool strict)
+{
+  assert(!m_initialized); /* not meant to run on an already initialized cache */
+
+  /* Make room for every top-level entry plus the terminating element. */
+  m_show_var_array.reserve(all_status_vars.elements + 1);
+
+  m_query_scope= scope; /* consulted by match_scope() while filtering below */
+
+  for (SHOW_VAR *show_var_iter= dynamic_element(&all_status_vars, 0, SHOW_VAR *);
+       show_var_iter != dynamic_element(&all_status_vars, all_status_vars.elements, SHOW_VAR *);
+       show_var_iter++)
+  {
+    SHOW_VAR show_var= *show_var_iter; /* work on a copy; the name is replaced below */
+
+    /* Check if this status var should be excluded from the query. */
+    if (filter_show_var(&show_var, strict))
+      continue;
+
+    if (show_var.type == SHOW_ARRAY)
+    {
+      /* Expand nested subarray. The name is used as a prefix. */
+      expand_show_var_array((SHOW_VAR *)show_var.value, show_var.name, strict);
+    }
+    else
+    {
+      show_var.name= make_show_var_name(NULL, show_var.name); /* top level: no prefix */
+      m_show_var_array.push(show_var);
+    }
+  }
+
+  /* Terminate the array with an element that has no name. */
+  st_mysql_show_var empty= {0,0,SHOW_UNDEF};
+  m_show_var_array.push(empty);
+
+  /* Get the latest version of all_status_vars. */
+  m_version= get_status_vars_version();
+
+  /* Reserve one cache slot per SHOW_VAR element collected above. */
+  m_cache.reserve(m_show_var_array.elements());
+
+  m_initialized= true;
+  return true; /* no failure path in this function */
+}
+
+/**
+  Expand a nested subarray of status variables, indicated by a type of
+  SHOW_ARRAY. Elements that pass filter_show_var() are appended to
+  m_show_var_array; subarrays found inside are expanded recursively.
+*/
+void PFS_status_variable_cache::expand_show_var_array(const SHOW_VAR *show_var_array, const char *prefix, bool strict)
+{
+  /* The subarray ends at the first element without a name. */
+  for (const SHOW_VAR *elem= show_var_array; elem && elem->name; elem++)
+  {
+    SHOW_VAR entry= *elem;
+
+    if (filter_show_var(&entry, strict))
+      continue;
+
+    if (entry.type != SHOW_ARRAY)
+    {
+      /* Plain variable: store it under a private copy of its full name. */
+      entry.name= make_show_var_name(prefix, entry.name);
+      m_show_var_array.push(entry);
+      continue;
+    }
+
+    /* Nested subarray: its prefixed name becomes the prefix one level down. */
+    char nested_prefix[SHOW_VAR_MAX_NAME_LEN];
+    make_show_var_name(prefix, entry.name, nested_prefix, sizeof(nested_prefix));
+    expand_show_var_array((SHOW_VAR *)entry.value, nested_prefix, strict);
+  }
+}
+
+/**
+  Compose "<prefix>_<name>" (or just "<name>" when there is no prefix) in the
+  caller's buffer. The last byte of the buffer is always set to null.
+*/
+char * PFS_status_variable_cache::make_show_var_name(const char* prefix, const char* name,
+                                                     char *name_buf, size_t buf_len)
+{
+  assert(name_buf != NULL);
+  char *const buf_end= name_buf + buf_len;
+  char *write_pos= name_buf;
+
+  if (prefix && *prefix)
+  {
+    /* Prefix first, capped so that the separator still fits behind it. */
+    write_pos= my_stpnmov(write_pos, prefix, buf_len-1);
+    *write_pos++= '_';
+  }
+
+  /* The name gets whatever space is left after the prefix. */
+  my_stpnmov(write_pos, name, (size_t)(buf_end - write_pos));
+
+  /* Guarantee termination even when the copy was cut short. */
+  buf_end[-1]= 0;
+  return name_buf;
+}
+
+/**
+  Build the (optionally prefixed) variable name and return a copy of it.
+*/
+char * PFS_status_variable_cache::make_show_var_name(const char* prefix, const char* name)
+{
+  /* Assemble the name on the stack, then duplicate it through the THD. */
+  char full_name[SHOW_VAR_MAX_NAME_LEN];
+  make_show_var_name(prefix, name, full_name, sizeof(full_name));
+  return m_current_thd->strdup(full_name); /* freed at statement end */
+}
+
+/**
+  Build the internal SHOW_VAR array (strict SESSION scope) under LOCK_status.
+*/
+bool PFS_status_variable_cache::do_initialize_session(void)
+{
+  /* Acquire LOCK_status to guard against plugin load/unload. */
+  //if (m_current_thd->fill_status_recursion_level++ == 0)  -- guard disabled: lock is taken unconditionally
+    mysql_mutex_lock(&LOCK_status);
+
+  bool ret= init_show_var_array(OPT_SESSION, true);
+
+  //if (m_current_thd->fill_status_recursion_level-- == 1)  -- guard disabled: unlock is unconditional
+    mysql_mutex_unlock(&LOCK_status);
+
+  return ret; /* result of init_show_var_array() */
+}
+
+/**
+  Pick the STATUS_VAR source for the validated THD. When that THD is the
+  current one and has initial_status_var set, the values taken from before
+  the query start are used instead of its status_var.
+*/
+STATUS_VAR *PFS_status_variable_cache::set_status_vars(void)
+{
+  const bool use_initial= (m_safe_thd == m_current_thd &&
+                           m_current_thd->initial_status_var != NULL);
+
+  return use_initial ? m_current_thd->initial_status_var
+                     : &m_safe_thd->status_var;
+}
+
+/**
+  Build cache for GLOBAL status variables using values totaled from all threads.
+*/
+int PFS_status_variable_cache::do_materialize_global(void)
+{
+  STATUS_VAR status_totals; /* filled through the visitor below */
+
+  m_materialized= false;
+  DEBUG_SYNC(m_current_thd, "before_materialize_global_status_array");
+
+  /* Acquire LOCK_status to guard against plugin load/unload. */
+  //if (m_current_thd->fill_status_recursion_level++ == 0)  -- guard disabled: lock is taken unconditionally
+    mysql_mutex_lock(&LOCK_status);
+
+  /*
+     Build array of SHOW_VARs from global status array. Do this within
+     LOCK_status to ensure that the array remains unchanged during
+     materialization. Skipped when the array is initialized externally.
+   */
+  if (!m_external_init)
+    init_show_var_array(OPT_GLOBAL, true);
+
+  /*
+    Collect totals for all active threads. Start with global status vars as a
+    baseline.
+  */
+  PFS_connection_status_visitor visitor(&status_totals);
+  PFS_connection_iterator::visit_global(false, /* hosts */
+                                        false, /* users */
+                                        false, /* accounts */
+                                        false, /* threads */
+                                        true,  /* THDs */
+                                        &visitor);
+  /*
+    Build the status variable cache using the SHOW_VAR array as a reference.
+    Use the status totals collected from all threads.
+  */
+  manifest(m_current_thd, m_show_var_array.front(), &status_totals, "", false, true);
+
+  //if (m_current_thd->fill_status_recursion_level-- == 1)  -- guard disabled: unlock is unconditional
+    mysql_mutex_unlock(&LOCK_status);
+
+  m_materialized= true;
+  DEBUG_SYNC(m_current_thd, "after_materialize_global_status_array");
+
+  return 0; /* this function has no failure path */
+}
+
+/**
+  Build GLOBAL and SESSION status variable cache using values for a non-instrumented thread.
+*/
+int PFS_status_variable_cache::do_materialize_all(THD* unsafe_thd)
+{
+  int ret= 1; /* stays 1 unless the THD is found and materialized */
+  assert(unsafe_thd != NULL);
+
+  m_unsafe_thd= unsafe_thd;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Acquire LOCK_status; held until the end of this function. */
+  //if (m_current_thd->fill_status_recursion_level++ == 0)  -- guard disabled: lock is taken unconditionally
+    mysql_mutex_lock(&LOCK_status);
+
+  /*
+     Build array of SHOW_VARs from global status array. Do this within
+     LOCK_status to ensure that the array remains unchanged while this
+     thread is materialized. Non-strict scope: see match_scope().
+   */
+  if (!m_external_init)
+    init_show_var_array(OPT_SESSION, false);
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(unsafe_thd)) != NULL)
+  {
+    /*
+      Build the status variable cache using the SHOW_VAR array as a reference.
+      Use the status values from the THD protected by the thread manager lock.
+    */
+    STATUS_VAR *status_vars= set_status_vars();
+    manifest(m_safe_thd, m_show_var_array.front(), status_vars, "", false, false);
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  //if (m_current_thd->fill_status_recursion_level-- == 1)  -- guard disabled: unlock is unconditional
+    mysql_mutex_unlock(&LOCK_status);
+  return ret; /* 0 on success, 1 if get_THD() returned NULL */
+}
+
+/**
+  Build SESSION status variable cache using values for a non-instrumented thread.
+*/
+int PFS_status_variable_cache::do_materialize_session(THD* unsafe_thd)
+{
+  int ret= 1; /* stays 1 unless the THD is found and materialized */
+  assert(unsafe_thd != NULL);
+
+  m_unsafe_thd= unsafe_thd;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Acquire LOCK_status; held until the end of this function. */
+  //if (m_current_thd->fill_status_recursion_level++ == 0)  -- guard disabled: lock is taken unconditionally
+    mysql_mutex_lock(&LOCK_status);
+
+  /*
+     Build array of SHOW_VARs from global status array. Do this within
+     LOCK_status to ensure that the array remains unchanged while this
+     thread is materialized. Strict SESSION scope is used here.
+   */
+  if (!m_external_init)
+    init_show_var_array(OPT_SESSION, true);
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(unsafe_thd)) != NULL)
+  {
+    /*
+      Build the status variable cache using the SHOW_VAR array as a reference.
+      Use the status values from the THD protected by the thread manager lock.
+    */
+    STATUS_VAR *status_vars= set_status_vars();
+    manifest(m_safe_thd, m_show_var_array.front(), status_vars, "", false, true);
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  //if (m_current_thd->fill_status_recursion_level-- == 1)  -- guard disabled: unlock is unconditional
+    mysql_mutex_unlock(&LOCK_status);
+  return ret; /* 0 on success, 1 if get_THD() returned NULL */
+}
+
+/**
+  Build SESSION status variable cache using values for a PFS_thread.
+  NOTE: Requires that init_show_var_array() has already been called.
+*/
+int PFS_status_variable_cache::do_materialize_session(PFS_thread *pfs_thread)
+{
+  int ret= 1; /* stays 1 unless the THD is found and materialized */
+  assert(pfs_thread != NULL);
+
+  m_pfs_thread= pfs_thread;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Acquire LOCK_status to guard against plugin load/unload. */
+  //if (m_current_thd->fill_status_recursion_level++ == 0)  -- guard disabled: lock is taken unconditionally
+    mysql_mutex_lock(&LOCK_status);
+
+  /* The SHOW_VAR array must be initialized externally. */
+  assert(m_initialized);
+
+  /* Get and lock a validated THD from the thread manager. */
+  if ((m_safe_thd= get_THD(pfs_thread)) != NULL)
+  {
+    /*
+      Build the status variable cache using the SHOW_VAR array as a reference.
+      Use the status values from the THD protected by the thread manager lock.
+    */
+    STATUS_VAR *status_vars= set_status_vars();
+    manifest(m_safe_thd, m_show_var_array.front(), status_vars, "", false, true);
+
+    /* Release lock taken in get_THD(). */
+    mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
+
+    m_materialized= true;
+    ret= 0;
+  }
+
+  //if (m_current_thd->fill_status_recursion_level-- == 1)  -- guard disabled: unlock is unconditional
+    mysql_mutex_unlock(&LOCK_status);
+  return ret; /* 0 on success, 1 if get_THD() returned NULL */
+}
+
+/**
+  Build cache of SESSION status variables using the status values provided.
+  The cache is associated with a user, host or account, but not with any
+  particular thread. The totals come from the m_sum_client_status callback.
+  NOTE: Requires that init_show_var_array() has already been called.
+*/
+int PFS_status_variable_cache::do_materialize_client(PFS_client *pfs_client)
+{
+  assert(pfs_client != NULL);
+  STATUS_VAR status_totals; /* filled by m_sum_client_status() below */
+
+  m_pfs_client= pfs_client;
+  m_materialized= false;
+  m_cache.clear();
+
+  /* Acquire LOCK_status to guard against plugin load/unload. */
+  //if (m_current_thd->fill_status_recursion_level++ == 0)  -- guard disabled: lock is taken unconditionally
+    mysql_mutex_lock(&LOCK_status);
+
+  /* The SHOW_VAR array must be initialized externally. */
+  assert(m_initialized);
+
+  /*
+    Generate status totals from active threads and from totals aggregated
+    from disconnected threads.
+  */
+  m_sum_client_status(pfs_client, &status_totals);
+
+  /*
+    Build the status variable cache using the SHOW_VAR array as a reference and
+    the status totals collected from threads associated with this client.
+  */
+  manifest(m_current_thd, m_show_var_array.front(), &status_totals, "", false, true);
+
+  //if (m_current_thd->fill_status_recursion_level-- == 1)  -- guard disabled: unlock is unconditional
+    mysql_mutex_unlock(&LOCK_status);
+
+  m_materialized= true;
+  return 0; /* this function has no failure path */
+}
+
+/*
+  Build the status variable cache (m_cache) from a SHOW_VAR array that ends at the first element without a name.
+  Resolve status values using the STATUS_VAR struct provided. nested_array is true only on the recursive call for a SHOW_ARRAY.
+*/
+void PFS_status_variable_cache::manifest(THD *thd, const SHOW_VAR *show_var_array,
+                                    STATUS_VAR *status_vars, const char *prefix,
+                                    bool nested_array, bool strict)
+{
+  for (const SHOW_VAR *show_var_iter= show_var_array;
+       show_var_iter && show_var_iter->name;
+       show_var_iter++)
+  {
+    // work buffer, must be aligned to handle long/longlong values
+    my_aligned_storage<SHOW_VAR_FUNC_BUFF_SIZE+1, MY_ALIGNOF(longlong)>
+      value_buf;
+    SHOW_VAR show_var_tmp; /* receives the result of a SHOW_FUNC callback */
+    const SHOW_VAR *show_var_ptr= show_var_iter;  /* preserve array pointer */
+
+    /*
+      If the value is a function reference, then execute the function and
+      reevaluate the new SHOW_TYPE and value. Handle nested case where
+      SHOW_FUNC resolves to another SHOW_FUNC.
+    */
+    if (show_var_ptr->type == SHOW_FUNC)
+    {
+      show_var_tmp= *show_var_ptr;
+      /*
+        Execute the function reference in show_var_tmp->value, which returns
+        show_var_tmp with a new type and new value.
+      */
+      for (const SHOW_VAR *var= show_var_ptr; var->type == SHOW_FUNC; var= &show_var_tmp)
+      {
+        ((mysql_show_var_func)(var->value))(thd, &show_var_tmp, value_buf.data, NULL, m_query_scope);
+      }
+      show_var_ptr= &show_var_tmp; /* continue with the resolved variable */
+    }
+
+    /*
+      If we are expanding a SHOW_ARRAY, filter variables that were not prefiltered by
+      init_show_var_array().
+    */
+    if (nested_array && filter_show_var(show_var_ptr, strict))
+      continue;
+
+    if (show_var_ptr->type == SHOW_ARRAY)
+    {
+      /*
+        Status variables of type SHOW_ARRAY were expanded and filtered by
+        init_show_var_array(), except where a SHOW_FUNC resolves into a
+        SHOW_ARRAY, such as with InnoDB. Recurse to expand the subarray.
+      */
+      manifest(thd, (SHOW_VAR *)show_var_ptr->value, status_vars, show_var_ptr->name, true, strict);
+    }
+    else
+    {
+      /* Add the materialized status variable to the cache. */
+      SHOW_VAR show_var= *show_var_ptr;
+      /*
+        For nested array expansions, make a copy of the variable name, just as
+        done in init_show_var_array().
+      */
+      if (nested_array)
+        show_var.name= make_show_var_name(prefix, show_var_ptr->name);
+
+      /* Convert status value to string format. Add to the cache. */
+      Status_variable status_var(&show_var, status_vars, m_query_scope);
+      m_cache.push(status_var);
+    }
+  }
+}
+
+/**
+  CLASS Status_variable - construct from a SHOW_VAR; init() resolves the value.
+*/
+Status_variable::Status_variable(const SHOW_VAR *show_var, STATUS_VAR *status_vars, enum_var_type query_scope)
+  : m_name(NULL), m_name_length(0), m_value_length(0), m_type(SHOW_UNDEF),
+    m_charset(NULL), m_initialized(false) /* m_name(NULL): init() may return before assigning it */
+{
+  init(show_var, status_vars, query_scope); /* leaves m_initialized false if show_var or its name is NULL */
+}
+
+/**
+  Fill in name, type and string value from a SHOW_VAR.
+  show_var->value is an offset into status_vars.
+  NOTE: Assumes LOCK_status is held.
+*/
+void Status_variable::init(const SHOW_VAR *show_var, STATUS_VAR *status_vars, enum_var_type query_scope)
+{
+  /* Nothing to materialize without a named variable; object stays null. */
+  if (!show_var || !show_var->name)
+    return;
+  m_type= show_var->type;
+  m_name= show_var->name;
+  m_name_length= strlen(m_name);
+
+  /* Resolve the value as text; the reported length is capped to our buffer. */
+  const char *text= get_one_variable(current_thd, show_var, query_scope, m_type,
+                                     status_vars, &m_charset, m_value_str, &m_value_length);
+  m_charset= system_charset_info;
+  if (m_value_length > SHOW_VAR_FUNC_BUFF_SIZE)
+    m_value_length= SHOW_VAR_FUNC_BUFF_SIZE;
+
+  /* The text may live outside m_value_str; bring it in and terminate it. */
+  if (text != m_value_str)
+    memcpy(m_value_str, text, m_value_length);
+  m_value_str[m_value_length]= 0;
+  m_initialized= true;
+}
+
+/*
+  Sum the status counters of one user: walk its accounts and its active THDs.
+*/
+void sum_user_status(PFS_client *pfs_user, STATUS_VAR *status_totals)
+{
+  const bool with_accounts= true;
+  const bool with_threads= false;
+  const bool with_THDs= true;
+  PFS_user *user= (PFS_user *)pfs_user;
+  PFS_connection_status_visitor totals_visitor(status_totals);
+  PFS_connection_iterator::visit_user(user, with_accounts, with_threads, with_THDs, &totals_visitor);
+}
+
+/*
+  Sum the status counters of one host: walk its accounts and its active THDs.
+*/
+void sum_host_status(PFS_client *pfs_host, STATUS_VAR *status_totals)
+{
+  const bool with_accounts= true;
+  const bool with_threads= false;
+  const bool with_THDs= true;
+  PFS_host *host= (PFS_host *)pfs_host;
+  PFS_connection_status_visitor totals_visitor(status_totals);
+  PFS_connection_iterator::visit_host(host, with_accounts, with_threads, with_THDs, &totals_visitor);
+}
+
+/*
+  Get status totals for this account from active THDs and from totals aggregated
+  from disconnected threads.
+*/
+void sum_account_status(PFS_client *pfs_account, STATUS_VAR *status_totals)
+{
+  const bool with_threads= false;
+  const bool with_THDs= true;
+  PFS_account *account= (PFS_account *)pfs_account;
+  PFS_connection_status_visitor totals_visitor(status_totals);
+  PFS_connection_iterator::visit_account(account, with_threads, with_THDs, &totals_visitor);
+}
+
+/**
+  Reset aggregated status counter stats for account, user and host, in that order.
+  NOTE: Assumes LOCK_status is held.
+*/
+void reset_pfs_status_stats()
+{
+  reset_status_by_account(); /* per-account totals */
+  reset_status_by_user();    /* per-user totals */
+  reset_status_by_host();    /* per-host totals */
+}
+
+/** @} */
diff --git a/storage/perfschema/pfs_variable.h b/storage/perfschema/pfs_variable.h
new file mode 100644
index 00000000000..e59b02f2af8
--- /dev/null
+++ b/storage/perfschema/pfs_variable.h
@@ -0,0 +1,716 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
+
+#ifndef PFS_VARIABLE_H
+#define PFS_VARIABLE_H
+
+/**
+  @file storage/perfschema/pfs_variable.h
+  Performance schema system and status variables (declarations).
+*/
+
+/**
+  OVERVIEW
+  --------
+  Status and system variables are implemented differently in the server, but the
+  steps to process them in the Performance Schema are essentially the same:
+
+  1. INITIALIZE - Build or acquire a sorted list of variables to use for input.
+     Use the SHOW_VAR struct as an intermediate format common to system, status
+     and user vars:
+
+     SHOW_VAR
+       Name  - Text string
+       Value - Pointer to memory location, function, subarray structure
+       Type  - Scalar, function, or subarray
+       Scope - SESSION, GLOBAL, BOTH
+
+     Steps:
+     - Register the server's internal buffer with the class. Acquire locks
+       if necessary, then scan the contents of the input buffer.
+     - For system variables, convert each element to SHOW_VAR format, store in
+       a temporary array.
+     - For status variables, copy existing global status array into a local
+       array that can be used without locks. Expand nested subarrays, indicated
+       by a type of SHOW_ARRAY.
+
+  2. MATERIALIZE - Convert the list of SHOW_VAR variables to string format,
+     store in a local cache:
+     - Resolve each variable according to the type.
+     - Recursively process unexpanded nested arrays and callback functions.
+     - Aggregate values across threads for global status.
+     - Convert numeric values to a string.
+     - Prefix variable name with the plugin name.
+
+  3. OUTPUT - Iterate the cache for the SHOW command or table query.
+
+  CLASS OVERVIEW
+  --------------
+  1. System_variable - A materialized system variable
+  2. Status_variable - A materialized status variable
+  3. PFS_variable_cache - Base class that defines the interface for the operations above.
+     public
+       init_show_var_array() - Build SHOW_VAR list of variables for processing
+       materialize_global()  - Materialize global variables, aggregate across sessions
+       materialize_session() - Materialize variables for a given PFS_thread or THD
+       materialize_user()    - Materialize variables for a user, aggregate across related threads.
+       materialize_host()    - Materialize variables for a host, aggregate across related threads.
+       materialize_account() - Materialize variables for an account, aggregate across related threads.
+     private
+       m_show_var_array         - Prealloc_array of SHOW_VARs for input to Materialize
+       m_cache                  - Prealloc_array of materialized variables for output
+       do_materialize_global()  - Implementation of materialize_global()
+       do_materialize_session() - Implementation of materialize_session()
+       do_materialize_client()  - Implementation of materialize_user/host/account()
+
+  4. PFS_system_variable_cache - System variable implementation of PFS_variable_cache
+  5. PFS_status_variable_cache - Status variable implementation of PFS_variable_cache
+  6. Find_THD_variable         - Used by the thread manager to find and lock a THD.
+
+  GLOSSARY
+  --------
+  Status variable - Server or plugin status counter. Not dynamic.
+  System variable - Server or plugin configuration variable. Usually dynamic.
+  GLOBAL scope    - Associated with the server, no context at thread level.
+  SESSION scope   - Associated with a connection or thread, but no global context.
+  BOTH scope      - Globally defined but applies at the session level.
+  Initialize      - Build list of variables in SHOW_VAR format.
+  Materialize     - Convert variables in SHOW_VAR list to string, cache for output.
+  Manifest        - Substep of Materialize. Resolve variable values according to
+                    type. This includes SHOW_FUNC types which are resolved by
+                    executing a callback function (possibly recursively), and
+                    SHOW_ARRAY types that expand into nested subarrays.
+  LOCK PRIORITIES
+  ---------------
+  System Variables
+    LOCK_plugin_delete              (block plugin delete)
+     LOCK_system_variables_hash
+     LOCK_thd_data                  (block THD delete)
+     LOCK_thd_sysvar                (block system variable updates, alloc_and_copy_thd_dynamic_variables)
+       LOCK_global_system_variables (very briefly held)
+
+  Status Variables
+    LOCK_status
+      LOCK_thd_data                 (block THD delete)
+*/
+
+/* Iteration on THD from the sql layer. */
+#include "mysqld_thd_manager.h"
+#define PFS_VAR
+/* Class sys_var */
+#include "set_var.h"
+/* convert_value_to_string */
+#include "sql_show.h"
+/* PFS_thread */
+#include "pfs_instr.h"
+#include "pfs_user.h"
+#include "pfs_host.h"
+#include "pfs_account.h"
+
+/* Global array of all server and plugin-defined status variables. */
+extern DYNAMIC_ARRAY all_status_vars;
+extern bool status_vars_inited;
+static const uint SYSVAR_MEMROOT_BLOCK_SIZE = 4096;
+
+extern mysql_mutex_t LOCK_plugin_delete;
+
+class Find_THD_Impl;
+class Find_THD_variable;
+typedef PFS_connection_slice PFS_client;
+
+/**
+  CLASS System_variable - System variable derived from sys_var object.
+*/
+class System_variable
+{
+public:
+  System_variable();
+  System_variable(THD *target_thd, const SHOW_VAR *show_var,
+                  enum_var_type query_scope, bool ignore);
+  ~System_variable() {}
+
+  bool is_null() const { return !m_initialized; } /* true while m_initialized is false */
+  bool is_ignored() const { return m_ignore; }    /* exposes the m_ignore flag */
+
+public:
+  const char *m_name;                             /* variable name; not copied into this object */
+  size_t m_name_length;                           /* presumably strlen(m_name) - defined in the .cc file */
+  char m_value_str[SHOW_VAR_FUNC_BUFF_SIZE+1];    /* value as text; +1 presumably for the terminating null */
+  size_t m_value_length;                          /* presumably the length of m_value_str - TODO confirm */
+  enum_mysql_show_type m_type;
+  int m_scope;
+  bool m_ignore;                                  /* returned by is_ignored() */
+  const CHARSET_INFO *m_charset;
+
+private:
+  bool m_initialized;                             /* drives is_null() */
+  void init(THD *thd, const SHOW_VAR *show_var, enum_var_type query_scope);
+};
+
+
+/**
+  CLASS Status_variable - Status variable derived from SHOW_VAR.
+*/
+class Status_variable
+{
+public:
+  /* Default: a null variable (is_null() is true), no name, no value. */
+  Status_variable() : m_name(NULL), m_name_length(0), m_value_length(0),
+                      m_type(SHOW_UNDEF),
+                      m_charset(NULL), m_initialized(false) {}
+  /* Defined in the .cc file; calls init() with the same arguments. */
+  Status_variable(const SHOW_VAR *show_var, STATUS_VAR *status_array, enum_var_type query_scope);
+
+  ~Status_variable() {}
+
+  bool is_null() const {return !m_initialized;}; /* true while m_initialized is false */
+
+public:
+  const char *m_name;                            /* variable name; not copied into this object */
+  size_t m_name_length;
+  char m_value_str[SHOW_VAR_FUNC_BUFF_SIZE+1];   /* value as text */
+  size_t m_value_length;
+  SHOW_TYPE  m_type;
+  const CHARSET_INFO *m_charset;
+private:
+  bool m_initialized;                            /* drives is_null() */
+  void init(const SHOW_VAR *show_var, STATUS_VAR *status_array, enum_var_type query_scope);
+};
+
+
+/**
+  CLASS Find_THD_variable - Get and lock a validated THD from the thread manager.
+*/
+class Find_THD_variable : public Find_THD_Impl
+{
+public:
+  Find_THD_variable() : m_unsafe_thd(NULL) {}
+  Find_THD_variable(THD *unsafe_thd) : m_unsafe_thd(unsafe_thd) {}
+
+  /* Match only the THD being looked for; on a match its lock is taken. */
+  virtual bool operator()(THD *thd)
+  {
+    //TODO: filter bg threads?
+    const bool found= (thd == m_unsafe_thd);
+    /* Hold this lock to keep THD during materialization. */
+    if (found)
+      mysql_mutex_lock(&thd->LOCK_thd_data);
+    return found;
+  }
+  void set_unsafe_thd(THD *unsafe_thd) { m_unsafe_thd= unsafe_thd; }
+private:
+  THD *m_unsafe_thd;
+};
+
+/**
+  CLASS PFS_variable_cache - Base class for a system or status variable cache.
+*/
+template <class Var_type>
+class PFS_variable_cache
+{
+public:
+  typedef Dynamic_array<Var_type> Variable_array;
+
+  PFS_variable_cache(bool external_init)
+    : m_safe_thd(NULL),
+      m_unsafe_thd(NULL),
+      m_current_thd(current_thd),
+      m_pfs_thread(NULL),
+      m_pfs_client(NULL),
+      m_thd_finder(),
+      m_cache(PSI_INSTRUMENT_MEM),
+      m_initialized(false),
+      m_external_init(external_init),
+      m_materialized(false),
+      m_show_var_array(PSI_INSTRUMENT_MEM),
+      m_version(0),
+      m_query_scope(OPT_DEFAULT),
+      m_use_mem_root(false),
+      m_aggregate(false)
+  { }
+
+  virtual ~PFS_variable_cache()= 0;
+
+  /**
+    Build array of SHOW_VARs from the external variable source.
+    Filter using session scope.
+  */
+  bool initialize_session(void);
+
+  /**
+    Build array of SHOW_VARs suitable for aggregation by user, host or account.
+    Filter using session scope.
+  */
+  bool initialize_client_session(void);
+
+  /**
+    Build cache of GLOBAL system or status variables.
+    Aggregate across threads if applicable.
+  */
+  int materialize_global();
+
+  /**
+    Build cache of GLOBAL and SESSION variables for a non-instrumented thread.
+  */
+  int materialize_all(THD *thd);
+
+  /**
+    Build cache of SESSION variables for a non-instrumented thread.
+  */
+  int materialize_session(THD *thd);
+
+  /**
+    Build cache of SESSION variables for an instrumented thread.
+  */
+  int materialize_session(PFS_thread *pfs_thread, bool use_mem_root= false);
+
+  /**
+    Cache a single SESSION variable for an instrumented thread.
+  */
+  int materialize_session(PFS_thread *pfs_thread, uint index);
+
+  /**
+    Build cache of SESSION status variables for a user.
+  */
+  int materialize_user(PFS_user *pfs_user);
+
+  /**
+    Build cache of SESSION status variables for a host.
+  */
+  int materialize_host(PFS_host *pfs_host);
+
+  /**
+    Build cache of SESSION status variables for an account.
+  */
+  int materialize_account(PFS_account *pfs_account);
+
+  /**
+    True if variables have been materialized.
+  */
+  bool is_materialized(void)
+  {
+    return m_materialized;
+  }
+
+  /**
+    True if variables have been materialized for given THD.
+  */
+  bool is_materialized(THD *unsafe_thd)
+  {
+    return (unsafe_thd == m_unsafe_thd && m_materialized);
+  }
+
+  /**
+    True if variables have been materialized for given PFS_thread.
+  */
+  bool is_materialized(PFS_thread *pfs_thread)
+  {
+    return (pfs_thread == m_pfs_thread && m_materialized);
+  }
+
+  /**
+    True if variables have been materialized for given PFS_user.
+  */
+  bool is_materialized(PFS_user *pfs_user)
+  {
+    return (static_cast<PFS_client *>(pfs_user) == m_pfs_client && m_materialized);
+  }
+
+  /**
+    True if variables have been materialized for given PFS_host.
+  */
+  bool is_materialized(PFS_host *pfs_host)
+  {
+    return (static_cast<PFS_client *>(pfs_host) == m_pfs_client && m_materialized);
+  }
+
+  /**
+    True if variables have been materialized for given PFS_account.
+  */
+  bool is_materialized(PFS_account *pfs_account)
+  {
+    return (static_cast<PFS_client *>(pfs_account) == m_pfs_client && m_materialized);
+  }
+
+  /**
+    True if variables are materialized and m_pfs_client is the given client.
+  */
+  bool is_materialized(PFS_client *pfs_client)
+  {
+    return (pfs_client == m_pfs_client) && m_materialized;
+  }
+
+  /**
+    Get a validated THD from the thread manager. Execute callback function while
+    inside of the thread manager locks.
+  */
+  THD *get_THD(THD *thd);
+  THD *get_THD(PFS_thread *pfs_thread);
+
+  /**
+    Get a single variable from the cache.
+    Get the first element in the cache by default.
+  */
+  const Var_type *get(uint index= 0) const
+  {
+    if (index >= m_cache.elements())
+      return NULL;
+
+    const Var_type *p= &m_cache.at(index);
+    return p;
+  }
+
+  /**
+    Number of elements in the cache.
+  */
+  uint size()
+  {
+    return (uint)m_cache.elements();
+  }
+
+private:
+  virtual bool do_initialize_global(void) { return true; }
+  virtual bool do_initialize_session(void) { return true; }
+  virtual int do_materialize_global(void) { return 1; }
+  virtual int do_materialize_all(THD *thd) { return 1; }
+  virtual int do_materialize_session(THD *thd) { return 1; }
+  virtual int do_materialize_session(PFS_thread *) { return 1; }
+  virtual int do_materialize_session(PFS_thread *, uint index) { return 1; }
+
+protected:
+  /* Validated THD */
+  THD *m_safe_thd;
+
+  /* Unvalidated THD */
+  THD *m_unsafe_thd;
+
+  /* Current THD */
+  THD *m_current_thd;
+
+  /* Current PFS_thread. */
+  PFS_thread *m_pfs_thread;
+
+  /* Current PFS_user, host or account. */
+  PFS_client *m_pfs_client;
+
+  /* Callback for thread iterator. */
+  Find_THD_variable m_thd_finder;
+
+  /* Cache of materialized variables. */
+  Variable_array m_cache;
+
+  /* True when list of SHOW_VAR is complete. */
+  bool m_initialized;
+
+  /*
+    True if the SHOW_VAR array must be initialized externally from the
+    materialization step, such as with aggregations and queries by thread.
+  */
+  bool m_external_init;
+
+  /* True when cache is complete. */
+  bool m_materialized;
+
+  /* Array of variables to be materialized. Last element must be null. */
+  Dynamic_array<SHOW_VAR> m_show_var_array;
+
+  /* Version of global hash/array. Changes when vars added/removed. */
+  ulonglong m_version;
+
+  /* Query scope: GLOBAL or SESSION. */
+  enum_var_type m_query_scope;
+
+  /* True if temporary mem_root should be used for materialization. */
+  bool m_use_mem_root;
+
+  /* True if summarizing across users, hosts or accounts. */
+  bool m_aggregate;
+
+};
+
+/**
+  Required implementation for pure virtual destructor of a template class.
+*/
+template <class Var_type>
+PFS_variable_cache<Var_type>::~PFS_variable_cache()
+{
+}
+
+/**
+  Get a validated THD from the thread manager. Execute callback function
+  while inside the thread manager lock.
+*/
+template <class Var_type>
+THD *PFS_variable_cache<Var_type>::get_THD(THD *unsafe_thd)
+{
+  if (unsafe_thd == NULL)
+  {
+    /*
+      May happen, precisely because the pointer is unsafe
+      (THD just disconnected for example).
+      No need to walk Global_THD_manager for that.
+    */
+    return NULL;
+  }
+
+  m_thd_finder.set_unsafe_thd(unsafe_thd);
+  THD* safe_thd= Global_THD_manager::get_instance()->find_thd(&m_thd_finder);
+  return safe_thd;
+}
+
+template <class Var_type>
+THD *PFS_variable_cache<Var_type>::get_THD(PFS_thread *pfs_thread)
+{
+  assert(pfs_thread != NULL);
+  return get_THD(pfs_thread->m_thd);
+}
+
+/**
+  Build array of SHOW_VARs from external source of system or status variables.
+  Filter using session scope.
+*/
+template <class Var_type>
+bool PFS_variable_cache<Var_type>::initialize_session(void)
+{
+  if (m_initialized)
+    return 0;
+
+  return do_initialize_session();
+}
+
+/**
+  Build array of SHOW_VARs suitable for aggregation by user, host or account.
+  Filter using session scope.
+*/
+template <class Var_type>
+bool PFS_variable_cache<Var_type>::initialize_client_session(void)
+{
+  if (m_initialized)
+    return 0;
+
+  /* Requires aggregation by user, host or account. */
+  m_aggregate= true;
+
+  return do_initialize_session();
+}
+
+/**
+  Build cache of all GLOBAL variables.
+*/
+template <class Var_type>
+int PFS_variable_cache<Var_type>::materialize_global()
+{
+  if (is_materialized())
+    return 0;
+
+  return do_materialize_global();
+}
+
+/**
+  Build cache of GLOBAL and SESSION variables for a non-instrumented thread.
+*/
+template <class Var_type>
+int PFS_variable_cache<Var_type>::materialize_all(THD *unsafe_thd)
+{
+  if (!unsafe_thd)
+    return 1;
+
+  if (is_materialized(unsafe_thd))
+    return 0;
+
+  return do_materialize_all(unsafe_thd);
+}
+
+/**
+  Build cache of SESSION variables for a non-instrumented thread.
+*/
+template <class Var_type>
+int PFS_variable_cache<Var_type>::materialize_session(THD *unsafe_thd)
+{
+  if (!unsafe_thd)
+    return 1;
+
+  if (is_materialized(unsafe_thd))
+    return 0;
+
+  return do_materialize_session(unsafe_thd);
+}
+
+/**
+  Build cache of SESSION variables for an instrumented thread.
+*/
+template <class Var_type>
+int PFS_variable_cache<Var_type>::materialize_session(PFS_thread *pfs_thread, bool use_mem_root)
+{
+  if (!pfs_thread)
+    return 1;
+
+  if (is_materialized(pfs_thread))
+    return 0;
+
+  if (!pfs_thread->m_lock.is_populated() || pfs_thread->m_thd == NULL)
+    return 1;
+
+  m_use_mem_root= use_mem_root;
+
+  return do_materialize_session(pfs_thread);
+}
+
+/**
+  Materialize a single SESSION variable, selected by index, for an instrumented thread.
+*/
+template <class Var_type>
+int PFS_variable_cache<Var_type>::materialize_session(PFS_thread *pfs_thread, uint index)
+{
+  /* No check for is_materialized(); only the thread itself is validated below. */
+
+  if (!pfs_thread)
+    return 1;
+
+  if (!pfs_thread->m_lock.is_populated() || pfs_thread->m_thd == NULL)
+    return 1;
+
+  return do_materialize_session(pfs_thread, index);
+}
+
+/**
+  CLASS PFS_system_variable_cache - System variable cache.
+*/
+class PFS_system_variable_cache : public PFS_variable_cache<System_variable>
+{
+public:
+  PFS_system_variable_cache(bool external_init) :
+                            PFS_variable_cache<System_variable>(external_init),
+                            m_mem_thd(NULL), m_mem_thd_save(NULL),
+                            m_mem_sysvar_ptr(NULL) { }
+  bool match_scope(int scope);
+  ulonglong get_sysvar_hash_version(void) { return m_version; }
+  ~PFS_system_variable_cache() { free_mem_root(); }
+
+private:
+  /* Build SHOW_VAR array. */
+  bool init_show_var_array(enum_var_type scope, bool strict);
+  bool do_initialize_session(void);
+
+  /* Global */
+  int do_materialize_global(void);
+  /* Global and Session - THD */
+  int do_materialize_all(THD* thd);
+  /* Session - THD */
+  int do_materialize_session(THD* thd);
+  /* Session -  PFS_thread */
+  int do_materialize_session(PFS_thread *thread);
+  /* Single variable -  PFS_thread */
+  int do_materialize_session(PFS_thread *pfs_thread, uint index);
+
+  /* Temporary mem_root to use for materialization. */
+  MEM_ROOT m_mem_sysvar;
+  /* Pointer to THD::mem_root. */
+  MEM_ROOT **m_mem_thd;
+  /* Save THD::mem_root. */
+  MEM_ROOT *m_mem_thd_save;
+  /* Pointer to temporary mem_root. */
+  MEM_ROOT *m_mem_sysvar_ptr;
+  /* Allocate and/or assign temporary mem_root. */
+  void set_mem_root(void);
+  /* Mark all memory blocks as free in temporary mem_root. */
+  void clear_mem_root(void);
+  /* Free mem_root memory. */
+  void free_mem_root(void);
+};
+
+
+/**
+  CLASS PFS_status_variable_cache - Status variable cache
+*/
+class PFS_status_variable_cache : public PFS_variable_cache<Status_variable>
+{
+public:
+  PFS_status_variable_cache(bool external_init);
+
+  int materialize_user(PFS_user *pfs_user);
+  int materialize_host(PFS_host *pfs_host);
+  int materialize_account(PFS_account *pfs_account);
+
+  ulonglong get_status_array_version(void) { return m_version; }
+
+protected:
+  /* Get PFS_user, account or host associated with a PFS_thread. Implemented by table class. */
+  virtual PFS_client *get_pfs(PFS_thread *pfs_thread) { return NULL; }
+
+  /* True if query is a SHOW command. */
+  bool m_show_command;
+
+private:
+  bool do_initialize_session(void);
+
+  int do_materialize_global(void);
+  /* Global and Session - THD */
+  int do_materialize_all(THD* thd);
+  int do_materialize_session(THD *thd);
+  int do_materialize_session(PFS_thread *thread);
+  int do_materialize_session(PFS_thread *thread, uint index) { return 0; }
+  int do_materialize_client(PFS_client *pfs_client);
+
+  /* Callback to sum user, host or account status variables. */
+  void (*m_sum_client_status)(PFS_client *pfs_client, STATUS_VAR *status_totals);
+
+  /* Build SHOW_VAR array from external source. */
+  bool init_show_var_array(enum_var_type scope, bool strict);
+
+  /* Recursively expand nested SHOW_VAR arrays. */
+  void expand_show_var_array(const SHOW_VAR *show_var_array, const char *prefix, bool strict);
+
+  /* Exclude unwanted variables from the query. */
+  bool filter_show_var(const SHOW_VAR *show_var, bool strict);
+
+  /* Check the variable scope against the query scope. */
+  bool match_scope(SHOW_SCOPE variable_scope, bool strict);
+
+  /* Exclude specific status variables by name or prefix. */
+  bool filter_by_name(const SHOW_VAR *show_var);
+
+  /* Check if a variable has an aggregatable type. */
+  bool can_aggregate(enum_mysql_show_type variable_type);
+
+  /* Build status variable name with prefix. Return in the buffer provided. */
+  char *make_show_var_name(const char* prefix, const char* name, char *name_buf, size_t buf_len);
+
+  /* Build status variable name with prefix. Return copy of the string. */
+  char *make_show_var_name(const char* prefix, const char* name);
+
+  /* For the current THD, use initial_status_vars taken from before the query start. */
+  STATUS_VAR *set_status_vars(void);
+
+  /* Build the list of status variables from SHOW_VAR array. */
+  void manifest(THD *thd, const SHOW_VAR *show_var_array,
+                STATUS_VAR *status_var_array, const char *prefix, bool nested_array, bool strict);
+};
+
+/* Callback functions to sum status variables for a given user, host or account. */
+void sum_user_status(PFS_client *pfs_user, STATUS_VAR *status_totals);
+void sum_host_status(PFS_client *pfs_host, STATUS_VAR *status_totals);
+void sum_account_status(PFS_client *pfs_account, STATUS_VAR *status_totals);
+
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_visitor.cc b/storage/perfschema/pfs_visitor.cc
index 097965fde17..52e0d6871f3 100644
--- a/storage/perfschema/pfs_visitor.cc
+++ b/storage/perfschema/pfs_visitor.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -28,6 +28,9 @@
 #include "pfs_user.h"
 #include "pfs_host.h"
 #include "pfs_account.h"
+#include "pfs_buffer_container.h"
+
+#include "mysqld_thd_manager.h"
 
 /**
   @file storage/perfschema/pfs_visitor.cc
@@ -39,182 +42,330 @@
   @{
 */
 
+class All_THD_visitor_adapter : public Do_THD_Impl
+{
+public:
+  All_THD_visitor_adapter(PFS_connection_visitor *visitor)
+    : m_visitor(visitor)
+  {}
+
+  virtual void operator()(THD *thd)
+  {
+    m_visitor->visit_THD(thd);
+  }
+
+private:
+  PFS_connection_visitor *m_visitor;
+};
+
 /** Connection iterator */
 void PFS_connection_iterator::visit_global(bool with_hosts, bool with_users,
                                            bool with_accounts, bool with_threads,
+                                           bool with_THDs,
                                            PFS_connection_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
+  assert(! with_threads || ! with_THDs);
 
   visitor->visit_global();
 
   if (with_hosts)
   {
-    PFS_host *pfs= host_array;
-    PFS_host *pfs_last= pfs + host_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_host_iterator it= global_host_container.iterate();
+    PFS_host *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if (pfs->m_lock.is_populated())
-        visitor->visit_host(pfs);
+      visitor->visit_host(pfs);
+      pfs= it.scan_next();
     }
   }
 
   if (with_users)
   {
-    PFS_user *pfs= user_array;
-    PFS_user *pfs_last= pfs + user_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_user_iterator it= global_user_container.iterate();
+    PFS_user *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if (pfs->m_lock.is_populated())
-        visitor->visit_user(pfs);
+      visitor->visit_user(pfs);
+      pfs= it.scan_next();
     }
   }
 
   if (with_accounts)
   {
-    PFS_account *pfs= account_array;
-    PFS_account *pfs_last= pfs + account_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_account_iterator it= global_account_container.iterate();
+    PFS_account *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if (pfs->m_lock.is_populated())
-        visitor->visit_account(pfs);
+      visitor->visit_account(pfs);
+      pfs= it.scan_next();
     }
   }
 
+
   if (with_threads)
   {
-    PFS_thread *pfs= thread_array;
-    PFS_thread *pfs_last= pfs + thread_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_thread_iterator it= global_thread_container.iterate();
+    PFS_thread *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if (pfs->m_lock.is_populated())
-        visitor->visit_thread(pfs);
+      visitor->visit_thread(pfs);
+      pfs= it.scan_next();
     }
   }
+
+  if (with_THDs)
+  {
+    All_THD_visitor_adapter adapter(visitor);
+    Global_THD_manager::get_instance()->do_for_all_thd(& adapter);
+  }
 }
 
+class All_host_THD_visitor_adapter : public Do_THD_Impl
+{
+public:
+  All_host_THD_visitor_adapter(PFS_connection_visitor *visitor, PFS_host *host)
+    : m_visitor(visitor), m_host(host)
+  {}
+
+  virtual void operator()(THD *thd)
+  {
+    PSI_thread *psi= thd->get_psi();
+    PFS_thread *pfs= reinterpret_cast<PFS_thread*>(psi);
+    pfs= sanitize_thread(pfs);
+    if (pfs != NULL)
+    {
+      PFS_account *account= sanitize_account(pfs->m_account);
+      if (account != NULL)
+      {
+        if (account->m_host == m_host)
+        {
+          m_visitor->visit_THD(thd);
+        }
+      }
+      else if (pfs->m_host == m_host)
+      {
+        m_visitor->visit_THD(thd);
+      }
+    }
+  }
+
+private:
+  PFS_connection_visitor *m_visitor;
+  PFS_host *m_host;
+};
+
 void PFS_connection_iterator::visit_host(PFS_host *host,
                                          bool with_accounts, bool with_threads,
+                                         bool with_THDs,
                                          PFS_connection_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
+  assert(! with_threads || ! with_THDs);
 
   visitor->visit_host(host);
 
   if (with_accounts)
   {
-    PFS_account *pfs= account_array;
-    PFS_account *pfs_last= pfs + account_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_account_iterator it= global_account_container.iterate();
+    PFS_account *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_host == host) && pfs->m_lock.is_populated())
+      if (pfs->m_host == host)
       {
         visitor->visit_account(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 
   if (with_threads)
   {
-    PFS_thread *pfs= thread_array;
-    PFS_thread *pfs_last= pfs + thread_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_thread_iterator it= global_thread_container.iterate();
+    PFS_thread *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if (pfs->m_lock.is_populated())
+      PFS_account *safe_account= sanitize_account(pfs->m_account);
+      if (((safe_account != NULL) && (safe_account->m_host == host)) /* 1 */
+          || (pfs->m_host == host))                                  /* 2 */
       {
-        PFS_account *safe_account= sanitize_account(pfs->m_account);
-        if ((safe_account != NULL) && (safe_account->m_host == host))
-        {
-          /*
-            If the thread belongs to a known user@host that belongs to this host,
-            process it.
-          */
-          visitor->visit_thread(pfs);
-        }
-        else if (pfs->m_host == host)
+        /*
+          If the thread belongs to:
+          - (1) a known user@host that belongs to this host,
+          - (2) a 'lost' user@host that belongs to this host
+          process it.
+        */
+        visitor->visit_thread(pfs);
+      }
+      pfs= it.scan_next();
+    }
+  }
+
+  if (with_THDs)
+  {
+    All_host_THD_visitor_adapter adapter(visitor, host);
+    Global_THD_manager::get_instance()->do_for_all_thd(& adapter);
+  }
+}
+
+class All_user_THD_visitor_adapter : public Do_THD_Impl
+{
+public:
+  All_user_THD_visitor_adapter(PFS_connection_visitor *visitor, PFS_user *user)
+    : m_visitor(visitor), m_user(user)
+  {}
+
+  virtual void operator()(THD *thd)
+  {
+    PSI_thread *psi= thd->get_psi();
+    PFS_thread *pfs= reinterpret_cast<PFS_thread*>(psi);
+    pfs= sanitize_thread(pfs);
+    if (pfs != NULL)
+    {
+      PFS_account *account= sanitize_account(pfs->m_account);
+      if (account != NULL)
+      {
+        if (account->m_user == m_user)
         {
-          /*
-            If the thread belongs to a 'lost' user@host that belong to this host,
-            process it.
-          */
-          visitor->visit_thread(pfs);
+          m_visitor->visit_THD(thd);
         }
       }
+      else if (pfs->m_user == m_user)
+      {
+        m_visitor->visit_THD(thd);
+      }
     }
   }
-}
+
+private:
+  PFS_connection_visitor *m_visitor;
+  PFS_user *m_user;
+};
 
 void PFS_connection_iterator::visit_user(PFS_user *user,
                                          bool with_accounts, bool with_threads,
+                                         bool with_THDs,
                                          PFS_connection_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
+  assert(! with_threads || ! with_THDs);
 
   visitor->visit_user(user);
 
   if (with_accounts)
   {
-    PFS_account *pfs= account_array;
-    PFS_account *pfs_last= pfs + account_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_account_iterator it= global_account_container.iterate();
+    PFS_account *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_user == user) && pfs->m_lock.is_populated())
+      if (pfs->m_user == user)
       {
         visitor->visit_account(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 
   if (with_threads)
   {
-    PFS_thread *pfs= thread_array;
-    PFS_thread *pfs_last= pfs + thread_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_thread_iterator it= global_thread_container.iterate();
+    PFS_thread *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if (pfs->m_lock.is_populated())
+      PFS_account *safe_account= sanitize_account(pfs->m_account);
+      if (((safe_account != NULL) && (safe_account->m_user == user)) /* 1 */
+          || (pfs->m_user == user))                                  /* 2 */
       {
-        PFS_account *safe_account= sanitize_account(pfs->m_account);
-        if ((safe_account != NULL) && (safe_account->m_user == user))
-        {
-          /*
-            If the thread belongs to a known user@host that belongs to this user,
-            process it.
-          */
-          visitor->visit_thread(pfs);
-        }
-        else if (pfs->m_user == user)
-        {
-          /*
-            If the thread belongs to a 'lost' user@host that belong to this user,
-            process it.
-          */
-          visitor->visit_thread(pfs);
-        }
+        /*
+          If the thread belongs to:
+          - (1) a known user@host that belongs to this user,
+          - (2) a 'lost' user@host that belongs to this user
+          process it.
+        */
+        visitor->visit_thread(pfs);
       }
+      pfs= it.scan_next();
     }
   }
+
+  if (with_THDs)
+  {
+    All_user_THD_visitor_adapter adapter(visitor, user);
+    Global_THD_manager::get_instance()->do_for_all_thd(& adapter);
+  }
 }
 
+class All_account_THD_visitor_adapter : public Do_THD_Impl
+{
+public:
+  All_account_THD_visitor_adapter(PFS_connection_visitor *visitor, PFS_account *account)
+    : m_visitor(visitor), m_account(account)
+  {}
+
+  virtual void operator()(THD *thd)
+  {
+    PSI_thread *psi= thd->get_psi();
+    PFS_thread *pfs= reinterpret_cast<PFS_thread*>(psi);
+    pfs= sanitize_thread(pfs);
+    if (pfs != NULL)
+    {
+      if (pfs->m_account == m_account)
+      {
+        m_visitor->visit_THD(thd);
+      }
+    }
+  }
+
+private:
+  PFS_connection_visitor *m_visitor;
+  PFS_account *m_account;
+};
+
 void PFS_connection_iterator::visit_account(PFS_account *account,
-                                              bool with_threads,
-                                              PFS_connection_visitor *visitor)
+                                            bool with_threads,
+                                            bool with_THDs,
+                                            PFS_connection_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
+  assert(! with_threads || ! with_THDs);
 
   visitor->visit_account(account);
 
   if (with_threads)
   {
-    PFS_thread *pfs= thread_array;
-    PFS_thread *pfs_last= pfs + thread_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_thread_iterator it= global_thread_container.iterate();
+    PFS_thread *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_account == account) && pfs->m_lock.is_populated())
+      if (pfs->m_account == account)
       {
         visitor->visit_thread(pfs);
       }
+      pfs= it.scan_next();
     }
   }
+
+  if (with_THDs)
+  {
+    All_account_THD_visitor_adapter adapter(visitor, account);
+    Global_THD_manager::get_instance()->do_for_all_thd(& adapter);
+  }
+}
+
+void PFS_connection_iterator::visit_THD(THD *thd,
+                                        PFS_connection_visitor *visitor)
+{
+  assert(visitor != NULL);
+  visitor->visit_THD(thd);
 }
 
 void PFS_instance_iterator::visit_all(PFS_instance_visitor *visitor)
@@ -246,14 +397,13 @@ void PFS_instance_iterator::visit_all_mutex_classes(PFS_instance_visitor *visito
 
 void PFS_instance_iterator::visit_all_mutex_instances(PFS_instance_visitor *visitor)
 {
-  PFS_mutex *pfs= mutex_array;
-  PFS_mutex *pfs_last= pfs + mutex_max;
-  for ( ; pfs < pfs_last; pfs++)
+  PFS_mutex_iterator it= global_mutex_container.iterate();
+  PFS_mutex *pfs= it.scan_next();
+
+  while (pfs != NULL)
   {
-    if (pfs->m_lock.is_populated())
-    {
-      visitor->visit_mutex(pfs);
-    }
+    visitor->visit_mutex(pfs);
+    pfs= it.scan_next();
   }
 }
 
@@ -278,14 +428,13 @@ void PFS_instance_iterator::visit_all_rwlock_classes(PFS_instance_visitor *visit
 
 void PFS_instance_iterator::visit_all_rwlock_instances(PFS_instance_visitor *visitor)
 {
-  PFS_rwlock *pfs= rwlock_array;
-  PFS_rwlock *pfs_last= pfs + rwlock_max;
-  for ( ; pfs < pfs_last; pfs++)
+  PFS_rwlock_iterator it= global_rwlock_container.iterate();
+  PFS_rwlock *pfs= it.scan_next();
+
+  while (pfs != NULL)
   {
-    if (pfs->m_lock.is_populated())
-    {
-      visitor->visit_rwlock(pfs);
-    }
+    visitor->visit_rwlock(pfs);
+    pfs= it.scan_next();
   }
 }
 
@@ -310,14 +459,13 @@ void PFS_instance_iterator::visit_all_cond_classes(PFS_instance_visitor *visitor
 
 void PFS_instance_iterator::visit_all_cond_instances(PFS_instance_visitor *visitor)
 {
-  PFS_cond *pfs= cond_array;
-  PFS_cond *pfs_last= pfs + cond_max;
-  for ( ; pfs < pfs_last; pfs++)
+  PFS_cond_iterator it= global_cond_container.iterate();
+  PFS_cond *pfs= it.scan_next();
+
+  while (pfs != NULL)
   {
-    if (pfs->m_lock.is_populated())
-    {
-      visitor->visit_cond(pfs);
-    }
+    visitor->visit_cond(pfs);
+    pfs= it.scan_next();
   }
 }
 
@@ -342,14 +490,13 @@ void PFS_instance_iterator::visit_all_file_classes(PFS_instance_visitor *visitor
 
 void PFS_instance_iterator::visit_all_file_instances(PFS_instance_visitor *visitor)
 {
-  PFS_file *pfs= file_array;
-  PFS_file *pfs_last= pfs + file_max;
-  for ( ; pfs < pfs_last; pfs++)
+  PFS_file_iterator it= global_file_container.iterate();
+  PFS_file *pfs= it.scan_next();
+
+  while (pfs != NULL)
   {
-    if (pfs->m_lock.is_populated())
-    {
-      visitor->visit_file(pfs);
-    }
+    visitor->visit_file(pfs);
+    pfs= it.scan_next();
   }
 }
 
@@ -358,7 +505,7 @@ void PFS_instance_iterator::visit_all_file_instances(PFS_instance_visitor *visit
 void PFS_instance_iterator::visit_mutex_instances(PFS_mutex_class *klass,
                                                   PFS_instance_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
 
   visitor->visit_mutex_class(klass);
 
@@ -375,14 +522,16 @@ void PFS_instance_iterator::visit_mutex_instances(PFS_mutex_class *klass,
   }
   else
   {
-    PFS_mutex *pfs= mutex_array;
-    PFS_mutex *pfs_last= pfs + mutex_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_mutex_iterator it= global_mutex_container.iterate();
+    PFS_mutex *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      if (pfs->m_class == klass)
       {
         visitor->visit_mutex(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 }
@@ -390,7 +539,7 @@ void PFS_instance_iterator::visit_mutex_instances(PFS_mutex_class *klass,
 void PFS_instance_iterator::visit_rwlock_instances(PFS_rwlock_class *klass,
                                                    PFS_instance_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
 
   visitor->visit_rwlock_class(klass);
 
@@ -407,14 +556,16 @@ void PFS_instance_iterator::visit_rwlock_instances(PFS_rwlock_class *klass,
   }
   else
   {
-    PFS_rwlock *pfs= rwlock_array;
-    PFS_rwlock *pfs_last= pfs + rwlock_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_rwlock_iterator it= global_rwlock_container.iterate();
+    PFS_rwlock *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      if (pfs->m_class == klass)
       {
         visitor->visit_rwlock(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 }
@@ -422,7 +573,7 @@ void PFS_instance_iterator::visit_rwlock_instances(PFS_rwlock_class *klass,
 void PFS_instance_iterator::visit_cond_instances(PFS_cond_class *klass,
                                                  PFS_instance_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
 
   visitor->visit_cond_class(klass);
 
@@ -439,14 +590,16 @@ void PFS_instance_iterator::visit_cond_instances(PFS_cond_class *klass,
   }
   else
   {
-    PFS_cond *pfs= cond_array;
-    PFS_cond *pfs_last= pfs + cond_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_cond_iterator it= global_cond_container.iterate();
+    PFS_cond *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      if (pfs->m_class == klass)
       {
         visitor->visit_cond(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 }
@@ -454,7 +607,7 @@ void PFS_instance_iterator::visit_cond_instances(PFS_cond_class *klass,
 void PFS_instance_iterator::visit_file_instances(PFS_file_class *klass,
                                                  PFS_instance_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
 
   visitor->visit_file_class(klass);
 
@@ -471,14 +624,16 @@ void PFS_instance_iterator::visit_file_instances(PFS_file_class *klass,
   }
   else
   {
-    PFS_file *pfs= file_array;
-    PFS_file *pfs_last= pfs + file_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_file_iterator it= global_file_container.iterate();
+    PFS_file *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      if (pfs->m_class == klass)
       {
         visitor->visit_file(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 }
@@ -488,7 +643,7 @@ void PFS_instance_iterator::visit_file_instances(PFS_file_class *klass,
 void PFS_instance_iterator::visit_socket_instances(PFS_socket_class *klass,
                                                    PFS_instance_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
 
   visitor->visit_socket_class(klass);
 
@@ -505,14 +660,16 @@ void PFS_instance_iterator::visit_socket_instances(PFS_socket_class *klass,
   }
   else
   {
-    PFS_socket *pfs= socket_array;
-    PFS_socket *pfs_last= pfs + socket_max;
-    for ( ; pfs < pfs_last; pfs++)
+    PFS_socket_iterator it= global_socket_container.iterate();
+    PFS_socket *pfs= it.scan_next();
+
+    while (pfs != NULL)
     {
-      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      if (pfs->m_class == klass)
       {
         visitor->visit_socket(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 }
@@ -524,8 +681,8 @@ void PFS_instance_iterator::visit_socket_instances(PFS_socket_class *klass,
                                                    PFS_thread *thread,
                                                    bool visit_class)
 {
-  DBUG_ASSERT(visitor != NULL);
-  DBUG_ASSERT(thread != NULL);
+  assert(visitor != NULL);
+  assert(thread != NULL);
 
   if (visit_class)
     visitor->visit_socket_class(klass);
@@ -542,16 +699,17 @@ void PFS_instance_iterator::visit_socket_instances(PFS_socket_class *klass,
   else
   {
     /* Get current socket stats from each socket instance owned by this thread */
-    PFS_socket *pfs= socket_array;
-    PFS_socket *pfs_last= pfs + socket_max;
+    PFS_socket_iterator it= global_socket_container.iterate();
+    PFS_socket *pfs= it.scan_next();
 
-    for ( ; pfs < pfs_last; pfs++)
+    while (pfs != NULL)
     {
       if (unlikely((pfs->m_class == klass) &&
                    (pfs->m_thread_owner == thread)))
       {
         visitor->visit_socket(pfs);
       }
+      pfs= it.scan_next();
     }
   }
 }
@@ -563,8 +721,8 @@ void PFS_instance_iterator::visit_instances(PFS_instr_class *klass,
                                             PFS_thread *thread,
                                             bool visit_class)
 {
-  DBUG_ASSERT(visitor != NULL);
-  DBUG_ASSERT(klass != NULL);
+  assert(visitor != NULL);
+  assert(klass != NULL);
 
   switch (klass->m_type)
   {
@@ -586,72 +744,147 @@ void PFS_object_iterator::visit_all(PFS_object_visitor *visitor)
   visit_all_tables(visitor);
 }
 
-void PFS_object_iterator::visit_all_tables(PFS_object_visitor *visitor)
+class Proc_all_table_shares
+  : public PFS_buffer_processor<PFS_table_share>
 {
-  DBUG_ASSERT(visitor != NULL);
+public:
+  Proc_all_table_shares(PFS_object_visitor *visitor)
+    : m_visitor(visitor)
+  {}
 
-  visitor->visit_global();
+  virtual void operator()(PFS_table_share *pfs)
+  {
+    if (pfs->m_enabled)
+    {
+      m_visitor->visit_table_share(pfs);
+    }
+  }
 
-  /* For all the table shares ... */
-  PFS_table_share *share= table_share_array;
-  PFS_table_share *share_last= table_share_array + table_share_max;
-  for ( ; share < share_last; share++)
+private:
+  PFS_object_visitor* m_visitor;
+};
+
+class Proc_all_table_handles
+  : public PFS_buffer_processor<PFS_table>
+{
+public:
+  Proc_all_table_handles(PFS_object_visitor *visitor)
+    : m_visitor(visitor)
+  {}
+
+  virtual void operator()(PFS_table *pfs)
   {
-    if (share->m_lock.is_populated())
+    PFS_table_share *safe_share= sanitize_table_share(pfs->m_share);
+    if (safe_share != NULL)
     {
-      visitor->visit_table_share(share);
+      if (safe_share->m_enabled)
+      {
+        m_visitor->visit_table(pfs);
+      }
     }
   }
 
+private:
+  PFS_object_visitor* m_visitor;
+};
+
+void PFS_object_iterator::visit_all_tables(PFS_object_visitor *visitor)
+{
+  assert(visitor != NULL);
+
+  visitor->visit_global();
+
+  /* For all the table shares ... */
+  Proc_all_table_shares proc_shares(visitor);
+  global_table_share_container.apply(proc_shares);
+
   /* For all the table handles ... */
-  PFS_table *table= table_array;
-  PFS_table *table_last= table_array + table_max;
-  for ( ; table < table_last; table++)
+  Proc_all_table_handles proc_handles(visitor);
+  global_table_container.apply(proc_handles);
+}
+
+class Proc_one_table_share_handles
+  : public PFS_buffer_processor<PFS_table>
+{
+public:
+  Proc_one_table_share_handles(PFS_object_visitor *visitor, PFS_table_share *share)
+    : m_visitor(visitor), m_share(share)
+  {}
+
+  virtual void operator()(PFS_table *pfs)
   {
-    if (table->m_lock.is_populated())
+    if (pfs->m_share == m_share)
     {
-      visitor->visit_table(table);
+      m_visitor->visit_table(pfs);
     }
   }
-}
+
+private:
+  PFS_object_visitor* m_visitor;
+  PFS_table_share* m_share;
+};
 
 void PFS_object_iterator::visit_tables(PFS_table_share *share,
                                        PFS_object_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
+
+  if (!share->m_enabled)
+    return;
 
   visitor->visit_table_share(share);
 
+#ifdef LATER
+  if (share->get_refcount() == 0)
+    return;
+#endif
+
   /* For all the table handles ... */
-  PFS_table *table= table_array;
-  PFS_table *table_last= table_array + table_max;
-  for ( ; table < table_last; table++)
+  Proc_one_table_share_handles proc(visitor, share);
+  global_table_container.apply(proc);
+}
+
+class Proc_one_table_share_indexes
+  : public PFS_buffer_processor<PFS_table>
+{
+public:
+  Proc_one_table_share_indexes(PFS_object_visitor *visitor, PFS_table_share *share, uint index)
+    : m_visitor(visitor), m_share(share), m_index(index)
+  {}
+
+  virtual void operator()(PFS_table *pfs)
   {
-    if ((table->m_share == share) && table->m_lock.is_populated())
+    if (pfs->m_share == m_share)
     {
-      visitor->visit_table(table);
+      m_visitor->visit_table_index(pfs, m_index);
     }
   }
-}
+
+private:
+  PFS_object_visitor* m_visitor;
+  PFS_table_share* m_share;
+  uint m_index;
+};
 
 void PFS_object_iterator::visit_table_indexes(PFS_table_share *share,
                                               uint index,
                                               PFS_object_visitor *visitor)
 {
-  DBUG_ASSERT(visitor != NULL);
+  assert(visitor != NULL);
+
+  if (!share->m_enabled)
+    return;
 
   visitor->visit_table_share_index(share, index);
 
+#ifdef LATER
+  if (share->get_refcount() == 0)
+    return;
+#endif
+
   /* For all the table handles ... */
-  PFS_table *table= table_array;
-  PFS_table *table_last= table_array + table_max;
-  for ( ; table < table_last; table++)
-  {
-    if ((table->m_share == share) && table->m_lock.is_populated())
-    {
-      visitor->visit_table_index(table, index);
-    }
-  }
+  Proc_one_table_share_indexes proc(visitor, share, index);
+  global_table_container.apply(proc);
 }
 
 /** Connection wait visitor */
@@ -668,32 +901,62 @@ PFS_connection_wait_visitor::~PFS_connection_wait_visitor()
 void PFS_connection_wait_visitor::visit_global()
 {
   /*
-    This visitor is used only for idle instruments.
+    This visitor is used only for global instruments
+    that do not have instances.
     For waits, do not sum by connection but by instances,
     it is more efficient.
   */
-  DBUG_ASSERT(m_index == global_idle_class.m_event_name_index);
-  m_stat.aggregate(& global_idle_stat);
+  assert(   (m_index == global_idle_class.m_event_name_index)
+            || (m_index == global_metadata_class.m_event_name_index));
+
+  if (m_index == global_idle_class.m_event_name_index)
+  {
+    m_stat.aggregate(& global_idle_stat);
+  }
+  else
+  {
+    m_stat.aggregate(& global_metadata_stat);
+  }
 }
 
 void PFS_connection_wait_visitor::visit_host(PFS_host *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+  const PFS_single_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_waits_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_wait_visitor::visit_user(PFS_user *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+  const PFS_single_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_waits_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_wait_visitor::visit_account(PFS_account *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+  const PFS_single_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_waits_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_wait_visitor::visit_thread(PFS_thread *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+  const PFS_single_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_waits_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 PFS_connection_all_wait_visitor
@@ -706,16 +969,19 @@ PFS_connection_all_wait_visitor::~PFS_connection_all_wait_visitor()
 void PFS_connection_all_wait_visitor::visit_global()
 {
   /* Sum by instances, not by connection */
-  DBUG_ASSERT(false);
+  assert(false);
 }
 
 void PFS_connection_all_wait_visitor::visit_connection_slice(PFS_connection_slice *pfs)
 {
-  PFS_single_stat *stat= pfs->m_instr_class_waits_stats;
-  PFS_single_stat *stat_last= stat + wait_class_max;
-  for ( ; stat < stat_last; stat++)
+  const PFS_single_stat *stat= pfs->read_instr_class_waits_stats();
+  if (stat != NULL)
   {
-    m_stat.aggregate(stat);
+    const PFS_single_stat *stat_last= stat + wait_class_max;
+    for ( ; stat < stat_last; stat++)
+    {
+      m_stat.aggregate(stat);
+    }
   }
 }
 
@@ -754,22 +1020,42 @@ void PFS_connection_stage_visitor::visit_global()
 
 void PFS_connection_stage_visitor::visit_host(PFS_host *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+  const PFS_stage_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_stages_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_stage_visitor::visit_user(PFS_user *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+  const PFS_stage_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_stages_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_stage_visitor::visit_account(PFS_account *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+  const PFS_stage_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_stages_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_stage_visitor::visit_thread(PFS_thread *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+  const PFS_stage_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_stages_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 PFS_connection_statement_visitor
@@ -788,22 +1074,42 @@ void PFS_connection_statement_visitor::visit_global()
 
 void PFS_connection_statement_visitor::visit_host(PFS_host *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+  const PFS_statement_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_statements_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_statement_visitor::visit_user(PFS_user *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+  const PFS_statement_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_statements_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_statement_visitor::visit_account(PFS_account *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+  const PFS_statement_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_statements_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 void PFS_connection_statement_visitor::visit_thread(PFS_thread *pfs)
 {
-  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+  const PFS_statement_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_statements_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
 }
 
 /** Instance wait visitor */
@@ -826,11 +1132,14 @@ void PFS_connection_all_statement_visitor::visit_global()
 
 void PFS_connection_all_statement_visitor::visit_connection_slice(PFS_connection_slice *pfs)
 {
-  PFS_statement_stat *stat= pfs->m_instr_class_statements_stats;
-  PFS_statement_stat *stat_last= stat + statement_class_max;
-  for ( ; stat < stat_last; stat++)
+  const PFS_statement_stat *stat= pfs->read_instr_class_statements_stats();
+  if (stat != NULL)
   {
-    m_stat.aggregate(stat);
+    const PFS_statement_stat *stat_last= stat + statement_class_max;
+    for ( ; stat < stat_last; stat++)
+    {
+      m_stat.aggregate(stat);
+    }
   }
 }
 
@@ -854,6 +1163,102 @@ void PFS_connection_all_statement_visitor::visit_thread(PFS_thread *pfs)
   visit_connection_slice(pfs);
 }
 
+PFS_connection_transaction_visitor
+::PFS_connection_transaction_visitor(PFS_transaction_class *klass)
+{
+  m_index= klass->m_event_name_index;
+}
+
+PFS_connection_transaction_visitor::~PFS_connection_transaction_visitor()
+{}
+
+void PFS_connection_transaction_visitor::visit_global()
+{
+  m_stat.aggregate(&global_transaction_stat);
+}
+
+void PFS_connection_transaction_visitor::visit_host(PFS_host *pfs)
+{
+  const PFS_transaction_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_transactions_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
+}
+
+void PFS_connection_transaction_visitor::visit_user(PFS_user *pfs)
+{
+  const PFS_transaction_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_transactions_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
+}
+
+void PFS_connection_transaction_visitor::visit_account(PFS_account *pfs)
+{
+  const PFS_transaction_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_transactions_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
+}
+
+void PFS_connection_transaction_visitor::visit_thread(PFS_thread *pfs)
+{
+  const PFS_transaction_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_transactions_stats();
+  if (event_name_array != NULL)
+  {
+    m_stat.aggregate(& event_name_array[m_index]);
+  }
+}
+
+/** Disabled pending code review */
+#if 0
+/** Connection all transaction visitor */
+PFS_connection_all_transaction_visitor
+::PFS_connection_all_transaction_visitor()
+{}
+
+PFS_connection_all_transaction_visitor::~PFS_connection_all_transaction_visitor()
+{}
+
+void PFS_connection_all_transaction_visitor::visit_global()
+{
+  m_stat.aggregate(&global_transaction_stat);
+}
+
+void PFS_connection_all_transaction_visitor::visit_connection_slice(PFS_connection_slice *pfs)
+{
+  PFS_transaction_stat *stat= pfs->m_instr_class_transactions_stats;
+  m_stat.aggregate(stat);
+}
+
+void PFS_connection_all_transaction_visitor::visit_host(PFS_host *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_transaction_visitor::visit_user(PFS_user *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_transaction_visitor::visit_account(PFS_account *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_transaction_visitor::visit_thread(PFS_thread *pfs)
+{
+  visit_connection_slice(pfs);
+}
+#endif
+
 PFS_connection_stat_visitor::PFS_connection_stat_visitor()
 {}
 
@@ -883,10 +1288,117 @@ void PFS_connection_stat_visitor::visit_thread(PFS_thread *)
   m_stat.aggregate_active(1);
 }
 
-PFS_instance_wait_visitor::PFS_instance_wait_visitor()
+PFS_connection_memory_visitor
+::PFS_connection_memory_visitor(PFS_memory_class *klass)
+{
+  m_index= klass->m_event_name_index;
+  m_stat.reset();
+}
+
+PFS_connection_memory_visitor::~PFS_connection_memory_visitor()
+{}
+
+void PFS_connection_memory_visitor::visit_global()
+{
+  PFS_memory_stat *stat;
+  stat= & global_instr_class_memory_array[m_index];
+  stat->full_aggregate_to(& m_stat);
+}
+
+void PFS_connection_memory_visitor::visit_host(PFS_host *pfs)
+{
+  const PFS_memory_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_memory_stats();
+  if (event_name_array != NULL)
+  {
+    const PFS_memory_stat *stat;
+    stat= & event_name_array[m_index];
+    stat->full_aggregate_to(& m_stat);
+  }
+}
+
+void PFS_connection_memory_visitor::visit_user(PFS_user *pfs)
 {
+  const PFS_memory_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_memory_stats();
+  if (event_name_array != NULL)
+  {
+    const PFS_memory_stat *stat;
+    stat= & event_name_array[m_index];
+    stat->full_aggregate_to(& m_stat);
+  }
 }
 
+void PFS_connection_memory_visitor::visit_account(PFS_account *pfs)
+{
+  const PFS_memory_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_memory_stats();
+  if (event_name_array != NULL)
+  {
+    const PFS_memory_stat *stat;
+    stat= & event_name_array[m_index];
+    stat->full_aggregate_to(& m_stat);
+  }
+}
+
+void PFS_connection_memory_visitor::visit_thread(PFS_thread *pfs)
+{
+  const PFS_memory_stat *event_name_array;
+  event_name_array= pfs->read_instr_class_memory_stats();
+  if (event_name_array != NULL)
+  {
+    const PFS_memory_stat *stat;
+    stat= & event_name_array[m_index];
+    stat->full_aggregate_to(& m_stat);
+  }
+}
+
+
+PFS_connection_status_visitor::
+PFS_connection_status_visitor(STATUS_VAR *status_vars) : m_status_vars(status_vars)
+{
+  memset(m_status_vars, 0, sizeof(STATUS_VAR));
+}
+
+PFS_connection_status_visitor::~PFS_connection_status_visitor()
+{}
+
+/** Aggregate from global status. */
+void PFS_connection_status_visitor::visit_global()
+{
+   /* NOTE: Requires lock on LOCK_status. */
+   mysql_mutex_assert_owner(&LOCK_status);
+   add_to_status(m_status_vars, &global_status_var);
+}
+
+void PFS_connection_status_visitor::visit_host(PFS_host *pfs)
+{
+  pfs->m_status_stats.aggregate_to(m_status_vars);
+}
+
+void PFS_connection_status_visitor::visit_user(PFS_user *pfs)
+{
+  pfs->m_status_stats.aggregate_to(m_status_vars);
+}
+
+void PFS_connection_status_visitor::visit_account(PFS_account *pfs)
+{
+  pfs->m_status_stats.aggregate_to(m_status_vars);
+}
+
+void PFS_connection_status_visitor::visit_thread(PFS_thread *pfs)
+{
+}
+
+void PFS_connection_status_visitor::visit_THD(THD *thd)
+{
+  add_to_status(m_status_vars, &thd->status_var);
+}
+
+
+PFS_instance_wait_visitor::PFS_instance_wait_visitor()
+{}
+
 PFS_instance_wait_visitor::~PFS_instance_wait_visitor()
 {}
 
@@ -930,7 +1442,7 @@ void PFS_instance_wait_visitor::visit_cond(PFS_cond *pfs)
   m_stat.aggregate(& pfs->m_cond_stat.m_wait_stat);
 }
 
-void PFS_instance_wait_visitor::visit_file(PFS_file *pfs) 
+void PFS_instance_wait_visitor::visit_file(PFS_file *pfs)
 {
   /* Combine per-operation file wait stats before aggregating */
   PFS_single_stat stat;
@@ -938,7 +1450,7 @@ void PFS_instance_wait_visitor::visit_file(PFS_file *pfs)
   m_stat.aggregate(&stat);
 }
 
-void PFS_instance_wait_visitor::visit_socket(PFS_socket *pfs) 
+void PFS_instance_wait_visitor::visit_socket(PFS_socket *pfs)
 {
   /* Combine per-operation socket wait stats before aggregating */
   PFS_single_stat stat;
@@ -963,7 +1475,7 @@ void PFS_object_wait_visitor::visit_global()
 void PFS_object_wait_visitor::visit_table_share(PFS_table_share *pfs)
 {
   uint safe_key_count= sanitize_index_count(pfs->m_key_count);
-  pfs->m_table_stat.sum(& m_stat, safe_key_count);
+  pfs->sum(& m_stat, safe_key_count);
 }
 
 void PFS_object_wait_visitor::visit_table(PFS_table *pfs)
@@ -992,13 +1504,20 @@ void PFS_table_io_wait_visitor::visit_table_share(PFS_table_share *pfs)
   PFS_table_io_stat io_stat;
   uint safe_key_count= sanitize_index_count(pfs->m_key_count);
   uint index;
+  PFS_table_share_index *index_stat;
 
   /* Aggregate index stats */
   for (index= 0; index < safe_key_count; index++)
-    io_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+  {
+    index_stat= pfs->find_index_stat(index);
+    if (index_stat != NULL)
+      io_stat.aggregate(& index_stat->m_stat);
+  }
 
   /* Aggregate global stats */
-  io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_INDEXES]);
+  index_stat= pfs->find_index_stat(MAX_INDEXES);
+  if (index_stat != NULL)
+    io_stat.aggregate(& index_stat->m_stat);
 
   io_stat.sum(& m_stat);
 }
@@ -1036,13 +1555,20 @@ void PFS_table_io_stat_visitor::visit_table_share(PFS_table_share *pfs)
 {
   uint safe_key_count= sanitize_index_count(pfs->m_key_count);
   uint index;
+  PFS_table_share_index *index_stat;
 
   /* Aggregate index stats */
   for (index= 0; index < safe_key_count; index++)
-    m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+  {
+    index_stat= pfs->find_index_stat(index);
+    if (index_stat != NULL)
+      m_stat.aggregate(& index_stat->m_stat);
+  }
 
   /* Aggregate global stats */
-  m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_INDEXES]);
+  index_stat= pfs->find_index_stat(MAX_INDEXES);
+  if (index_stat != NULL)
+    m_stat.aggregate(& index_stat->m_stat);
 }
 
 void PFS_table_io_stat_visitor::visit_table(PFS_table *pfs)
@@ -1073,7 +1599,11 @@ PFS_index_io_stat_visitor::~PFS_index_io_stat_visitor()
 
 void PFS_index_io_stat_visitor::visit_table_share_index(PFS_table_share *pfs, uint index)
 {
-  m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+  PFS_table_share_index *index_stat;
+
+  index_stat= pfs->find_index_stat(index);
+  if (index_stat != NULL)
+    m_stat.aggregate(& index_stat->m_stat);
 }
 
 void PFS_index_io_stat_visitor::visit_table_index(PFS_table *pfs, uint index)
@@ -1096,7 +1626,7 @@ void PFS_table_lock_wait_visitor::visit_global()
 
 void PFS_table_lock_wait_visitor::visit_table_share(PFS_table_share *pfs)
 {
-  pfs->m_table_stat.sum_lock(& m_stat);
+  pfs->sum_lock(& m_stat);
 }
 
 void PFS_table_lock_wait_visitor::visit_table(PFS_table *pfs)
@@ -1114,7 +1644,11 @@ PFS_table_lock_stat_visitor::~PFS_table_lock_stat_visitor()
 
 void PFS_table_lock_stat_visitor::visit_table_share(PFS_table_share *pfs)
 {
-  m_stat.aggregate(& pfs->m_table_stat.m_lock_stat);
+  PFS_table_share_lock *lock_stat;
+
+  lock_stat= pfs->find_lock_stat();
+  if (lock_stat != NULL)
+    m_stat.aggregate(& lock_stat->m_stat);
 }
 
 void PFS_table_lock_stat_visitor::visit_table(PFS_table *pfs)
@@ -1128,32 +1662,31 @@ PFS_instance_socket_io_stat_visitor::PFS_instance_socket_io_stat_visitor()
 PFS_instance_socket_io_stat_visitor::~PFS_instance_socket_io_stat_visitor()
 {}
 
-void PFS_instance_socket_io_stat_visitor::visit_socket_class(PFS_socket_class *pfs) 
+void PFS_instance_socket_io_stat_visitor::visit_socket_class(PFS_socket_class *pfs)
 {
   /* Aggregate wait times, event counts and byte counts */
   m_socket_io_stat.aggregate(&pfs->m_socket_stat.m_io_stat);
 }
 
-void PFS_instance_socket_io_stat_visitor::visit_socket(PFS_socket *pfs) 
+void PFS_instance_socket_io_stat_visitor::visit_socket(PFS_socket *pfs)
 {
   /* Aggregate wait times, event counts and byte counts */
   m_socket_io_stat.aggregate(&pfs->m_socket_stat.m_io_stat);
 }
 
-
 PFS_instance_file_io_stat_visitor::PFS_instance_file_io_stat_visitor()
 {}
 
 PFS_instance_file_io_stat_visitor::~PFS_instance_file_io_stat_visitor()
 {}
 
-void PFS_instance_file_io_stat_visitor::visit_file_class(PFS_file_class *pfs) 
+void PFS_instance_file_io_stat_visitor::visit_file_class(PFS_file_class *pfs)
 {
   /* Aggregate wait times, event counts and byte counts */
   m_file_io_stat.aggregate(&pfs->m_file_stat.m_io_stat);
 }
 
-void PFS_instance_file_io_stat_visitor::visit_file(PFS_file *pfs) 
+void PFS_instance_file_io_stat_visitor::visit_file(PFS_file *pfs)
 {
   /* Aggregate wait times, event counts and byte counts */
   m_file_io_stat.aggregate(&pfs->m_file_stat.m_io_stat);
diff --git a/storage/perfschema/pfs_visitor.h b/storage/perfschema/pfs_visitor.h
index 120b5928045..ebedae6e7f0 100644
--- a/storage/perfschema/pfs_visitor.h
+++ b/storage/perfschema/pfs_visitor.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -25,6 +25,8 @@
 
 #include "pfs_stat.h"
 
+typedef struct system_status_var STATUS_VAR;
+
 /**
   @file storage/perfschema/pfs_visitor.h
   Visitors (declarations).
@@ -45,6 +47,7 @@ struct PFS_rwlock_class;
 struct PFS_cond_class;
 struct PFS_file_class;
 struct PFS_socket_class;
+struct PFS_memory_class;
 struct PFS_table_share;
 struct PFS_mutex;
 struct PFS_rwlock;
@@ -53,6 +56,7 @@ struct PFS_file;
 struct PFS_table;
 struct PFS_stage_class;
 struct PFS_statement_class;
+struct PFS_transaction_class;
 struct PFS_socket;
 struct PFS_connection_slice;
 
@@ -75,6 +79,8 @@ public:
   virtual void visit_user(PFS_user *pfs) {}
   /** Visit a thread. */
   virtual void visit_thread(PFS_thread *pfs) {}
+  /** Visit a THD associated with a thread. */
+  virtual void visit_THD(THD *thd) {}
 };
 
 /**
@@ -90,37 +96,45 @@ public:
     @param with_users when true, visit also all users.
     @param with_accounts when true, visit also all user+host.
     @param with_threads when true, visit also all threads.
+    @param with_THDs when true, visit also the THD of each thread.
     @param visitor the visitor to call
   */
   static void visit_global(bool with_hosts, bool with_users,
                            bool with_accounts, bool with_threads,
+                           bool with_THDs,
                            PFS_connection_visitor *visitor);
   /**
     Visit all connections of a host.
     @param host the host to visit.
     @param with_accounts when true, visit also all related user+host.
     @param with_threads when true, visit also all related threads.
+    @param with_THDs when true, visit also the THD of each related thread.
     @param visitor the visitor to call
   */
   static void visit_host(PFS_host *host, bool with_accounts, bool with_threads,
+                         bool with_THDs,
                          PFS_connection_visitor *visitor);
   /**
     Visit all connections of a user.
     @param user the user to visit.
     @param with_accounts when true, visit also all related user+host.
     @param with_threads when true, visit also all related threads.
+    @param with_THDs when true, visit also the THD of each related thread.
     @param visitor the visitor to call
   */
   static void visit_user(PFS_user *user, bool with_accounts, bool with_threads,
+                         bool with_THDs,
                          PFS_connection_visitor *visitor);
   /**
     Visit all connections of a user+host.
     @param account the user+host to visit.
     @param with_threads when true, visit also all related threads.
+    @param with_THDs when true, visit also the THD of each related thread.
     @param visitor the visitor to call
   */
   static void visit_account(PFS_account *account, bool with_threads,
-                              PFS_connection_visitor *visitor);
+                            bool with_THDs,
+                            PFS_connection_visitor *visitor);
   /**
     Visit a thread or connection.
     @param thread the thread to visit.
@@ -129,6 +143,13 @@ public:
   static inline void visit_thread(PFS_thread *thread,
                                   PFS_connection_visitor *visitor)
   { visitor->visit_thread(thread); }
+
+  /**
+    Visit THD.
+    @param thd the THD to visit.
+    @param visitor the visitor to call.
+  */
+  static void visit_THD(THD *thd, PFS_connection_visitor *visitor);
 };
 
 /**
@@ -397,6 +418,54 @@ private:
 
 /**
   A concrete connection visitor that aggregates
+  transaction statistics for a given event_name.
+*/
+class PFS_connection_transaction_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_transaction_visitor(PFS_transaction_class *klass);
+  virtual ~PFS_connection_transaction_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** EVENT_NAME instrument index. */
+  uint m_index;
+  /** Transaction statistic collected. */
+  PFS_transaction_stat m_stat;
+};
+
+/** Disabled pending code review */
+#if 0
+/**
+  A concrete connection visitor that aggregates
+  transaction statistics for all events.
+*/
+class PFS_connection_all_transaction_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_all_transaction_visitor();
+  virtual ~PFS_connection_all_transaction_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** Transaction statistic collected. */
+  PFS_transaction_stat m_stat;
+
+private:
+  void visit_connection_slice(PFS_connection_slice *pfs);
+};
+#endif
+
+/**
+  A concrete connection visitor that aggregates
   connection statistics.
 */
 class PFS_connection_stat_visitor : public PFS_connection_visitor
@@ -416,6 +485,49 @@ public:
 };
 
 /**
+  A concrete connection visitor that aggregates
+  memory statistics for a given event_name.
+*/
+class PFS_connection_memory_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_memory_visitor(PFS_memory_class *klass);
+  virtual ~PFS_connection_memory_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** EVENT_NAME instrument index. */
+  uint m_index;
+  /** Memory statistic collected. */
+  PFS_memory_stat m_stat;
+};
+
+/**
+  A concrete connection visitor that aggregates
+  status variables.
+*/
+class PFS_connection_status_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_status_visitor(STATUS_VAR *status_vars);
+  virtual ~PFS_connection_status_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+  virtual void visit_THD(THD *thd);
+
+private:
+  STATUS_VAR *m_status_vars;
+};
+
+/**
   A concrete instance visitor that aggregates
   wait statistics.
 */
diff --git a/storage/perfschema/rpl_gtid.h b/storage/perfschema/rpl_gtid.h
new file mode 100644
index 00000000000..6d9ecb0ea5f
--- /dev/null
+++ b/storage/perfschema/rpl_gtid.h
@@ -0,0 +1,17 @@
+#ifndef STORAGE_PERFSCHEMA_RPL_GTID_INCLUDED
+#define STORAGE_PERFSCHEMA_RPL_GTID_INCLUDED
+
+struct TABLE;
+
+#include "../../sql/rpl_gtid.h"
+
+class Gtid_specification: public rpl_gtid
+{
+public:
+  size_t to_string(char *buf)
+  {
+    return my_snprintf(buf, GTID_MAX_STR_LENGTH, "%u-%u-%llu",
+                       domain_id, server_id, seq_no);
+  }
+};
+#endif
diff --git a/storage/perfschema/table_accounts.cc b/storage/perfschema/table_accounts.cc
index d6c3ceb0c63..34dd49017a9 100644
--- a/storage/perfschema/table_accounts.cc
+++ b/storage/perfschema/table_accounts.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,12 +21,15 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_accounts.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_status.h"
+#include "field.h"
 
 THR_LOCK table_accounts::m_table_lock;
 
@@ -35,18 +38,18 @@ table_accounts::m_share=
 {
   { C_STRING_WITH_LEN("accounts") },
   &pfs_truncatable_acl,
-  &table_accounts::create,
+  table_accounts::create,
   NULL, /* write_row */
   table_accounts::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  cursor_by_account::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE accounts("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'The connection''s client user name for the connection, or NULL if an internal thread.',"
-                      "HOST CHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") collate utf8_bin default null comment 'The connection client''s host name, or NULL if an internal thread.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'The connection''s client user name for the connection, or NULL if an internal thread.',"
+                      "HOST CHAR(" HOSTNAME_LENGTH_STR ") collate utf8_bin default null comment 'The connection client''s host name, or NULL if an internal thread.',"
                       "CURRENT_CONNECTIONS bigint not null comment 'Current connections for the account.',"
-                      "TOTAL_CONNECTIONS bigint not null comment 'Total connections for the account.')") }
+                      "TOTAL_CONNECTIONS bigint not null comment 'Total connections for the account.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_accounts::create()
@@ -63,6 +66,12 @@ table_accounts::delete_all_rows(void)
   reset_events_stages_by_account();
   reset_events_statements_by_thread();
   reset_events_statements_by_account();
+  reset_events_transactions_by_thread();
+  reset_events_transactions_by_account();
+  reset_memory_by_thread();
+  reset_memory_by_account();
+  reset_status_by_thread();
+  reset_status_by_account();
   purge_all_account();
   return 0;
 }
@@ -74,7 +83,7 @@ table_accounts::table_accounts()
 
 void table_accounts::make_row(PFS_account *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
   pfs->m_lock.begin_optimistic_lock(&lock);
@@ -83,7 +92,10 @@ void table_accounts::make_row(PFS_account *pfs)
     return;
 
   PFS_connection_stat_visitor visitor;
-  PFS_connection_iterator::visit_account(pfs, true, & visitor);
+  PFS_connection_iterator::visit_account(pfs,
+                                         true,  /* threads */
+                                         false, /* THDs */
+                                         & visitor);
 
   if (! pfs->m_lock.end_optimistic_lock(& lock))
     return;
@@ -103,7 +115,7 @@ int table_accounts::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -121,7 +133,7 @@ int table_accounts::read_row_values(TABLE *table,
         m_row.m_connection_stat.set_field(f->field_index - 2, f);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_accounts.h b/storage/perfschema/table_accounts.h
index dfc2cc322e0..c3c62f77924 100644
--- a/storage/perfschema/table_accounts.h
+++ b/storage/perfschema/table_accounts.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_all_instr.cc b/storage/perfschema/table_all_instr.cc
index d48028b1539..6a01912a3a0 100644
--- a/storage/perfschema/table_all_instr.cc
+++ b/storage/perfschema/table_all_instr.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,9 +26,20 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_all_instr.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+
+ha_rows
+table_all_instr::get_row_count(void)
+{
+  return global_mutex_container.get_row_count()
+    + global_rwlock_container.get_row_count()
+    + global_cond_container.get_row_count()
+    + global_file_container.get_row_count()
+    + global_socket_container.get_row_count() ;
+}
 
 table_all_instr::table_all_instr(const PFS_engine_table_share *share)
   : PFS_engine_table(share, &m_pos),
@@ -55,10 +66,10 @@ int table_all_instr::rnd_next(void)
   {
     switch (m_pos.m_index_1) {
     case pos_all_instr::VIEW_MUTEX:
-      for ( ; m_pos.m_index_2 < mutex_max; m_pos.m_index_2++)
       {
-        mutex= &mutex_array[m_pos.m_index_2];
-        if (mutex->m_lock.is_populated())
+        PFS_mutex_iterator it= global_mutex_container.iterate(m_pos.m_index_2);
+        mutex= it.scan_next(& m_pos.m_index_2);
+        if (mutex != NULL)
         {
           make_mutex_row(mutex);
           m_next_pos.set_after(&m_pos);
@@ -67,10 +78,10 @@ int table_all_instr::rnd_next(void)
       }
       break;
     case pos_all_instr::VIEW_RWLOCK:
-      for ( ; m_pos.m_index_2 < rwlock_max; m_pos.m_index_2++)
       {
-        rwlock= &rwlock_array[m_pos.m_index_2];
-        if (rwlock->m_lock.is_populated())
+        PFS_rwlock_iterator it= global_rwlock_container.iterate(m_pos.m_index_2);
+        rwlock= it.scan_next(& m_pos.m_index_2);
+        if (rwlock != NULL)
         {
           make_rwlock_row(rwlock);
           m_next_pos.set_after(&m_pos);
@@ -79,10 +90,10 @@ int table_all_instr::rnd_next(void)
       }
       break;
     case pos_all_instr::VIEW_COND:
-      for ( ; m_pos.m_index_2 < cond_max; m_pos.m_index_2++)
       {
-        cond= &cond_array[m_pos.m_index_2];
-        if (cond->m_lock.is_populated())
+        PFS_cond_iterator it= global_cond_container.iterate(m_pos.m_index_2);
+        cond= it.scan_next(& m_pos.m_index_2);
+        if (cond != NULL)
         {
           make_cond_row(cond);
           m_next_pos.set_after(&m_pos);
@@ -91,10 +102,10 @@ int table_all_instr::rnd_next(void)
       }
       break;
     case pos_all_instr::VIEW_FILE:
-      for ( ; m_pos.m_index_2 < file_max; m_pos.m_index_2++)
       {
-        file= &file_array[m_pos.m_index_2];
-        if (file->m_lock.is_populated())
+        PFS_file_iterator it= global_file_container.iterate(m_pos.m_index_2);
+        file= it.scan_next(& m_pos.m_index_2);
+        if (file != NULL)
         {
           make_file_row(file);
           m_next_pos.set_after(&m_pos);
@@ -103,10 +114,10 @@ int table_all_instr::rnd_next(void)
       }
       break;
     case pos_all_instr::VIEW_SOCKET:
-      for ( ; m_pos.m_index_2 < socket_max; m_pos.m_index_2++)
       {
-        socket= &socket_array[m_pos.m_index_2];
-        if (socket->m_lock.is_populated())
+        PFS_socket_iterator it= global_socket_container.iterate(m_pos.m_index_2);
+        socket= it.scan_next(& m_pos.m_index_2);
+        if (socket != NULL)
         {
           make_socket_row(socket);
           m_next_pos.set_after(&m_pos);
@@ -132,45 +143,40 @@ int table_all_instr::rnd_pos(const void *pos)
 
   switch (m_pos.m_index_1) {
   case pos_all_instr::VIEW_MUTEX:
-    DBUG_ASSERT(m_pos.m_index_2 < mutex_max);
-    mutex= &mutex_array[m_pos.m_index_2];
-    if (mutex->m_lock.is_populated())
+    mutex= global_mutex_container.get(m_pos.m_index_2);
+    if (mutex != NULL)
     {
       make_mutex_row(mutex);
       return 0;
     }
     break;
   case pos_all_instr::VIEW_RWLOCK:
-    DBUG_ASSERT(m_pos.m_index_2 < rwlock_max);
-    rwlock= &rwlock_array[m_pos.m_index_2];
-    if (rwlock->m_lock.is_populated())
+    rwlock= global_rwlock_container.get(m_pos.m_index_2);
+    if (rwlock != NULL)
     {
       make_rwlock_row(rwlock);
       return 0;
     }
     break;
   case pos_all_instr::VIEW_COND:
-    DBUG_ASSERT(m_pos.m_index_2 < cond_max);
-    cond= &cond_array[m_pos.m_index_2];
-    if (cond->m_lock.is_populated())
+    cond= global_cond_container.get(m_pos.m_index_2);
+    if (cond != NULL)
     {
       make_cond_row(cond);
       return 0;
     }
     break;
   case pos_all_instr::VIEW_FILE:
-    DBUG_ASSERT(m_pos.m_index_2 < file_max);
-    file= &file_array[m_pos.m_index_2];
-    if (file->m_lock.is_populated())
+    file= global_file_container.get(m_pos.m_index_2);
+    if (file != NULL)
     {
       make_file_row(file);
       return 0;
     }
     break;
   case pos_all_instr::VIEW_SOCKET:
-    DBUG_ASSERT(m_pos.m_index_2 < socket_max);
-    socket= &socket_array[m_pos.m_index_2];
-    if (socket->m_lock.is_populated())
+    socket= global_socket_container.get(m_pos.m_index_2);
+    if (socket != NULL)
     {
       make_socket_row(socket);
       return 0;
diff --git a/storage/perfschema/table_all_instr.h b/storage/perfschema/table_all_instr.h
index 072221ba86e..ea52f5d2de2 100644
--- a/storage/perfschema/table_all_instr.h
+++ b/storage/perfschema/table_all_instr.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -74,6 +74,8 @@ struct pos_all_instr : public PFS_double_index,
 class table_all_instr : public PFS_engine_table
 {
 public:
+  static ha_rows get_row_count();
+
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
   virtual void reset_position(void);
diff --git a/storage/perfschema/table_esgs_by_account_by_event_name.cc b/storage/perfschema/table_esgs_by_account_by_event_name.cc
index 6ef00026370..d6c97ac3edf 100644
--- a/storage/perfschema/table_esgs_by_account_by_event_name.cc
+++ b/storage/perfschema/table_esgs_by_account_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_esgs_by_account_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esgs_by_account_by_event_name::m_table_lock;
 
@@ -44,19 +46,19 @@ table_esgs_by_account_by_event_name::m_share=
   table_esgs_by_account_by_event_name::create,
   NULL, /* write_row */
   table_esgs_by_account_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esgs_by_account_by_event_name::get_row_count,
   sizeof(pos_esgs_by_account_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_summary_by_account_by_event_name("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'User. Used together with HOST and EVENT_NAME for grouping events.',"
-                      "HOST CHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") collate utf8_bin default null comment 'Host. Used together with USER and EVENT_NAME for grouping events.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'User. Used together with HOST and EVENT_NAME for grouping events.',"
+                      "HOST CHAR(" HOSTNAME_LENGTH_STR ") collate utf8_bin default null comment 'Host. Used together with USER and EVENT_NAME for grouping events.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Event name. Used together with USER and HOST for grouping events.',"
                       "COUNT_STAR BIGINT unsigned not null comment 'Number of summarized events, which includes all timed and untimed events.',"
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the timed summarized events.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the timed summarized events.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the timed summarized events.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -73,6 +75,12 @@ table_esgs_by_account_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esgs_by_account_by_event_name::get_row_count(void)
+{
+  return global_account_container.get_row_count() * stage_class_max;
+}
+
 table_esgs_by_account_by_event_name::table_esgs_by_account_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -94,13 +102,14 @@ int table_esgs_by_account_by_event_name::rnd_next(void)
 {
   PFS_account *account;
   PFS_stage_class *stage_class;
+  bool has_more_account= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_account();
+       has_more_account;
        m_pos.next_account())
   {
-    account= &account_array[m_pos.m_index_1];
-    if (account->m_lock.is_populated())
+    account= global_account_container.get(m_pos.m_index_1, & has_more_account);
+    if (account != NULL)
     {
       stage_class= find_stage_class(m_pos.m_index_2);
       if (stage_class)
@@ -122,17 +131,16 @@ table_esgs_by_account_by_event_name::rnd_pos(const void *pos)
   PFS_stage_class *stage_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < account_max);
 
-  account= &account_array[m_pos.m_index_1];
-  if (! account->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  stage_class= find_stage_class(m_pos.m_index_2);
-  if (stage_class)
+  account= global_account_container.get(m_pos.m_index_1);
+  if (account != NULL)
   {
-    make_row(account, stage_class);
-    return 0;
+    stage_class= find_stage_class(m_pos.m_index_2);
+    if (stage_class)
+    {
+      make_row(account, stage_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -141,7 +149,7 @@ table_esgs_by_account_by_event_name::rnd_pos(const void *pos)
 void table_esgs_by_account_by_event_name
 ::make_row(PFS_account *account, PFS_stage_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   account->m_lock.begin_optimistic_lock(&lock);
@@ -152,7 +160,10 @@ void table_esgs_by_account_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_stage_visitor visitor(klass);
-  PFS_connection_iterator::visit_account(account, true, & visitor);
+  PFS_connection_iterator::visit_account(account,
+                                         true,  /* threads */
+                                         false, /* THDs */
+                                         & visitor);
 
   if (! account->m_lock.end_optimistic_lock(&lock))
     return;
@@ -171,7 +182,7 @@ int table_esgs_by_account_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esgs_by_account_by_event_name.h b/storage/perfschema/table_esgs_by_account_by_event_name.h
index ee855d42818..f19241a4d7b 100644
--- a/storage/perfschema/table_esgs_by_account_by_event_name.h
+++ b/storage/perfschema/table_esgs_by_account_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -57,7 +57,7 @@ struct row_esgs_by_account_by_event_name
 /**
   Position of a cursor on
   PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
-  Index 1 on user@host (0 based)
+  Index 1 on account (0 based)
   Index 2 on stage class (1 based)
 */
 struct pos_esgs_by_account_by_event_name
@@ -73,9 +73,6 @@ struct pos_esgs_by_account_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_account(void)
-  { return (m_index_1 < account_max); }
-
   inline void next_account(void)
   {
     m_index_1++;
@@ -91,6 +88,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esgs_by_host_by_event_name.cc b/storage/perfschema/table_esgs_by_host_by_event_name.cc
index 880ed279ccf..bf219705513 100644
--- a/storage/perfschema/table_esgs_by_host_by_event_name.cc
+++ b/storage/perfschema/table_esgs_by_host_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -34,6 +34,8 @@
 #include "pfs_global.h"
 #include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esgs_by_host_by_event_name::m_table_lock;
 
@@ -45,8 +47,7 @@ table_esgs_by_host_by_event_name::m_share=
   table_esgs_by_host_by_event_name::create,
   NULL, /* write_row */
   table_esgs_by_host_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esgs_by_host_by_event_name::get_row_count,
   sizeof(pos_esgs_by_host_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_summary_by_host_by_event_name("
@@ -56,7 +57,8 @@ table_esgs_by_host_by_event_name::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the timed summarized events.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the timed summarized events.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the timed summarized events.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -74,6 +76,12 @@ table_esgs_by_host_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esgs_by_host_by_event_name::get_row_count(void)
+{
+  return global_host_container.get_row_count() * stage_class_max;
+}
+
 table_esgs_by_host_by_event_name::table_esgs_by_host_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -95,13 +103,14 @@ int table_esgs_by_host_by_event_name::rnd_next(void)
 {
   PFS_host *host;
   PFS_stage_class *stage_class;
+  bool has_more_host= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_host();
+       has_more_host;
        m_pos.next_host())
   {
-    host= &host_array[m_pos.m_index_1];
-    if (host->m_lock.is_populated())
+    host= global_host_container.get(m_pos.m_index_1, & has_more_host);
+    if (host != NULL)
     {
       stage_class= find_stage_class(m_pos.m_index_2);
       if (stage_class)
@@ -123,17 +132,16 @@ table_esgs_by_host_by_event_name::rnd_pos(const void *pos)
   PFS_stage_class *stage_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < host_max);
 
-  host= &host_array[m_pos.m_index_1];
-  if (! host->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  stage_class= find_stage_class(m_pos.m_index_2);
-  if (stage_class)
+  host= global_host_container.get(m_pos.m_index_1);
+  if (host != NULL)
   {
-    make_row(host, stage_class);
-    return 0;
+    stage_class= find_stage_class(m_pos.m_index_2);
+    if (stage_class)
+    {
+      make_row(host, stage_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -142,7 +150,7 @@ table_esgs_by_host_by_event_name::rnd_pos(const void *pos)
 void table_esgs_by_host_by_event_name
 ::make_row(PFS_host *host, PFS_stage_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   host->m_lock.begin_optimistic_lock(&lock);
@@ -153,7 +161,11 @@ void table_esgs_by_host_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_stage_visitor visitor(klass);
-  PFS_connection_iterator::visit_host(host, true, true, & visitor);
+  PFS_connection_iterator::visit_host(host,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! host->m_lock.end_optimistic_lock(&lock))
     return;
@@ -172,7 +184,7 @@ int table_esgs_by_host_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esgs_by_host_by_event_name.h b/storage/perfschema/table_esgs_by_host_by_event_name.h
index 6042e6396af..71592834d9a 100644
--- a/storage/perfschema/table_esgs_by_host_by_event_name.h
+++ b/storage/perfschema/table_esgs_by_host_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -73,9 +73,6 @@ struct pos_esgs_by_host_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_host(void)
-  { return (m_index_1 < host_max); }
-
   inline void next_host(void)
   {
     m_index_1++;
@@ -91,6 +88,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esgs_by_thread_by_event_name.cc b/storage/perfschema/table_esgs_by_thread_by_event_name.cc
index 3f09cef4dba..f6568655e51 100644
--- a/storage/perfschema/table_esgs_by_thread_by_event_name.cc
+++ b/storage/perfschema/table_esgs_by_thread_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_esgs_by_thread_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esgs_by_thread_by_event_name::m_table_lock;
 
@@ -44,8 +46,7 @@ table_esgs_by_thread_by_event_name::m_share=
   table_esgs_by_thread_by_event_name::create,
   NULL, /* write_row */
   table_esgs_by_thread_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esgs_by_thread_by_event_name::get_row_count,
   sizeof(pos_esgs_by_thread_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_summary_by_thread_by_event_name("
@@ -55,7 +56,8 @@ table_esgs_by_thread_by_event_name::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the timed summarized events.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the timed summarized events.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the timed summarized events.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -71,6 +73,12 @@ table_esgs_by_thread_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esgs_by_thread_by_event_name::get_row_count(void)
+{
+  return global_thread_container.get_row_count() * stage_class_max;
+}
+
 table_esgs_by_thread_by_event_name::table_esgs_by_thread_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -92,18 +100,14 @@ int table_esgs_by_thread_by_event_name::rnd_next(void)
 {
   PFS_thread *thread;
   PFS_stage_class *stage_class;
+  bool has_more_thread= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_thread();
+       has_more_thread;
        m_pos.next_thread())
   {
-    thread= &thread_array[m_pos.m_index_1];
-
-    /*
-      Important note: the thread scan is the outer loop (index 1),
-      to minimize the number of calls to atomic operations.
-    */
-    if (thread->m_lock.is_populated())
+    thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (thread != NULL)
     {
       stage_class= find_stage_class(m_pos.m_index_2);
       if (stage_class)
@@ -125,17 +129,16 @@ table_esgs_by_thread_by_event_name::rnd_pos(const void *pos)
   PFS_stage_class *stage_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
 
-  thread= &thread_array[m_pos.m_index_1];
-  if (! thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  stage_class= find_stage_class(m_pos.m_index_2);
-  if (stage_class)
+  thread= global_thread_container.get(m_pos.m_index_1);
+  if (thread != NULL)
   {
-    make_row(thread, stage_class);
-    return 0;
+    stage_class= find_stage_class(m_pos.m_index_2);
+    if (stage_class)
+    {
+      make_row(thread, stage_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -144,7 +147,7 @@ table_esgs_by_thread_by_event_name::rnd_pos(const void *pos)
 void table_esgs_by_thread_by_event_name
 ::make_row(PFS_thread *thread, PFS_stage_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   /* Protect this reader against a thread termination */
@@ -173,7 +176,7 @@ int table_esgs_by_thread_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
diff --git a/storage/perfschema/table_esgs_by_thread_by_event_name.h b/storage/perfschema/table_esgs_by_thread_by_event_name.h
index 6ff677a95e2..e5c30363704 100644
--- a/storage/perfschema/table_esgs_by_thread_by_event_name.h
+++ b/storage/perfschema/table_esgs_by_thread_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -72,9 +72,6 @@ struct pos_esgs_by_thread_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_thread(void)
-  { return (m_index_1 < thread_max); }
-
   inline void next_thread(void)
   {
     m_index_1++;
@@ -95,6 +92,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esgs_by_user_by_event_name.cc b/storage/perfschema/table_esgs_by_user_by_event_name.cc
index 1e33b475e4b..ecec4e08425 100644
--- a/storage/perfschema/table_esgs_by_user_by_event_name.cc
+++ b/storage/perfschema/table_esgs_by_user_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,14 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_esgs_by_user_by_event_name.h"
 #include "pfs_global.h"
-#include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esgs_by_user_by_event_name::m_table_lock;
 
@@ -45,18 +46,18 @@ table_esgs_by_user_by_event_name::m_share=
   table_esgs_by_user_by_event_name::create,
   NULL, /* write_row */
   table_esgs_by_user_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esgs_by_user_by_event_name::get_row_count,
   sizeof(pos_esgs_by_user_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_summary_by_user_by_event_name("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'User. Used together with EVENT_NAME for grouping events.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'User. Used together with EVENT_NAME for grouping events.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Event name. Used together with USER for grouping events.',"
                       "COUNT_STAR BIGINT unsigned not null comment 'Number of summarized events, which includes all timed and untimed events.',"
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the timed summarized events.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the timed summarized events.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the timed summarized events.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -74,6 +75,12 @@ table_esgs_by_user_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esgs_by_user_by_event_name::get_row_count(void)
+{
+  return global_user_container.get_row_count() * stage_class_max;
+}
+
 table_esgs_by_user_by_event_name::table_esgs_by_user_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -95,13 +102,14 @@ int table_esgs_by_user_by_event_name::rnd_next(void)
 {
   PFS_user *user;
   PFS_stage_class *stage_class;
+  bool has_more_user= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_user();
+       has_more_user;
        m_pos.next_user())
   {
-    user= &user_array[m_pos.m_index_1];
-    if (user->m_lock.is_populated())
+    user= global_user_container.get(m_pos.m_index_1, & has_more_user);
+    if (user != NULL)
     {
       stage_class= find_stage_class(m_pos.m_index_2);
       if (stage_class)
@@ -123,17 +131,16 @@ table_esgs_by_user_by_event_name::rnd_pos(const void *pos)
   PFS_stage_class *stage_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < user_max);
 
-  user= &user_array[m_pos.m_index_1];
-  if (! user->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  stage_class= find_stage_class(m_pos.m_index_2);
-  if (stage_class)
+  user= global_user_container.get(m_pos.m_index_1);
+  if (user != NULL)
   {
-    make_row(user, stage_class);
-    return 0;
+    stage_class= find_stage_class(m_pos.m_index_2);
+    if (stage_class)
+    {
+      make_row(user, stage_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -142,7 +149,7 @@ table_esgs_by_user_by_event_name::rnd_pos(const void *pos)
 void table_esgs_by_user_by_event_name
 ::make_row(PFS_user *user, PFS_stage_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   user->m_lock.begin_optimistic_lock(&lock);
@@ -153,7 +160,11 @@ void table_esgs_by_user_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_stage_visitor visitor(klass);
-  PFS_connection_iterator::visit_user(user, true, true, & visitor);
+  PFS_connection_iterator::visit_user(user,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! user->m_lock.end_optimistic_lock(&lock))
     return;
@@ -172,7 +183,7 @@ int table_esgs_by_user_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esgs_by_user_by_event_name.h b/storage/perfschema/table_esgs_by_user_by_event_name.h
index bc545c2438a..895840fc476 100644
--- a/storage/perfschema/table_esgs_by_user_by_event_name.h
+++ b/storage/perfschema/table_esgs_by_user_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -73,9 +73,6 @@ struct pos_esgs_by_user_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_user(void)
-  { return (m_index_1 < user_max); }
-
   inline void next_user(void)
   {
     m_index_1++;
@@ -96,6 +93,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esgs_global_by_event_name.cc b/storage/perfschema/table_esgs_global_by_event_name.cc
index d01b19ed539..fe24d6794c1 100644
--- a/storage/perfschema/table_esgs_global_by_event_name.cc
+++ b/storage/perfschema/table_esgs_global_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -35,6 +35,7 @@
 #include "pfs_instr.h"
 #include "pfs_timer.h"
 #include "pfs_visitor.h"
+#include "field.h"
 
 THR_LOCK table_esgs_global_by_event_name::m_table_lock;
 
@@ -46,8 +47,7 @@ table_esgs_global_by_event_name::m_share=
   table_esgs_global_by_event_name::create,
   NULL, /* write_row */
   table_esgs_global_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esgs_global_by_event_name::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_summary_global_by_event_name("
@@ -56,7 +56,8 @@ table_esgs_global_by_event_name::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the timed summarized events.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the timed summarized events.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the timed summarized events.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the timed summarized events.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -76,6 +77,12 @@ table_esgs_global_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esgs_global_by_event_name::get_row_count(void)
+{
+  return stage_class_max;
+}
+
 table_esgs_global_by_event_name::table_esgs_global_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(1), m_next_pos(1)
@@ -140,9 +147,12 @@ void table_esgs_global_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_stage_visitor visitor(klass);
-  PFS_connection_iterator::visit_global(true, /* hosts */
+  PFS_connection_iterator::visit_global(true,  /* hosts */
                                         false, /* users */
-                                        true, true, & visitor);
+                                        true,  /* accounts */
+                                        true,  /* threads */
+                                        false, /* THDs */
+                                        & visitor);
 
   m_row.m_stat.set(m_normalizer, & visitor.m_stat);
   m_row_exists= true;
@@ -158,7 +168,7 @@ int table_esgs_global_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
diff --git a/storage/perfschema/table_esgs_global_by_event_name.h b/storage/perfschema/table_esgs_global_by_event_name.h
index b8884355676..f2e51f625c7 100644
--- a/storage/perfschema/table_esgs_global_by_event_name.h
+++ b/storage/perfschema/table_esgs_global_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -59,6 +59,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esms_by_account_by_event_name.cc b/storage/perfschema/table_esms_by_account_by_event_name.cc
index fd255633f9c..9756bc3c70b 100644
--- a/storage/perfschema/table_esms_by_account_by_event_name.cc
+++ b/storage/perfschema/table_esms_by_account_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_esms_by_account_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esms_by_account_by_event_name::m_table_lock;
 
@@ -44,13 +46,12 @@ table_esms_by_account_by_event_name::m_share=
   table_esms_by_account_by_event_name::create,
   NULL, /* write_row */
   table_esms_by_account_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esms_by_account_by_event_name::get_row_count,
   sizeof(pos_esms_by_account_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_by_account_by_event_name("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'User. Used together with HOST and EVENT_NAME for grouping events.',"
-                      "HOST CHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") collate utf8_bin default null comment 'Host. Used together with USER and EVENT_NAME for grouping events.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'User. Used together with HOST and EVENT_NAME for grouping events.',"
+                      "HOST CHAR(" HOSTNAME_LENGTH_STR ") collate utf8_bin default null comment 'Host. Used together with USER and EVENT_NAME for grouping events.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Event name. Used together with USER and HOST for grouping events.',"
                       "COUNT_STAR BIGINT unsigned not null comment 'Number of summarized events',"
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
@@ -75,7 +76,8 @@ table_esms_by_account_by_event_name::m_share=
                       "SUM_SORT_ROWS BIGINT unsigned not null comment 'Sum of the SORT_ROWS column in the events_statements_current table.',"
                       "SUM_SORT_SCAN BIGINT unsigned not null comment 'Sum of the SORT_SCAN column in the events_statements_current table.',"
                       "SUM_NO_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_INDEX_USED column in the events_statements_current table.',"
-                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") }
+                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -92,6 +94,12 @@ table_esms_by_account_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esms_by_account_by_event_name::get_row_count(void)
+{
+  return global_account_container.get_row_count() * statement_class_max;
+}
+
 table_esms_by_account_by_event_name::table_esms_by_account_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -113,13 +121,14 @@ int table_esms_by_account_by_event_name::rnd_next(void)
 {
   PFS_account *account;
   PFS_statement_class *statement_class;
+  bool has_more_account= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_account();
+       has_more_account;
        m_pos.next_account())
   {
-    account= &account_array[m_pos.m_index_1];
-    if (account->m_lock.is_populated())
+    account= global_account_container.get(m_pos.m_index_1, & has_more_account);
+    if (account != NULL)
     {
       statement_class= find_statement_class(m_pos.m_index_2);
       if (statement_class)
@@ -141,17 +150,16 @@ table_esms_by_account_by_event_name::rnd_pos(const void *pos)
   PFS_statement_class *statement_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < account_max);
 
-  account= &account_array[m_pos.m_index_1];
-  if (! account->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  statement_class= find_statement_class(m_pos.m_index_2);
-  if (statement_class)
+  account= global_account_container.get(m_pos.m_index_1);
+  if (account != NULL)
   {
-    make_row(account, statement_class);
-    return 0;
+    statement_class= find_statement_class(m_pos.m_index_2);
+    if (statement_class)
+    {
+      make_row(account, statement_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -160,7 +168,7 @@ table_esms_by_account_by_event_name::rnd_pos(const void *pos)
 void table_esms_by_account_by_event_name
 ::make_row(PFS_account *account, PFS_statement_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   if (klass->is_mutable())
@@ -174,7 +182,10 @@ void table_esms_by_account_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_statement_visitor visitor(klass);
-  PFS_connection_iterator::visit_account(account, true, & visitor);
+  PFS_connection_iterator::visit_account(account,
+                                         true,  /* threads */
+                                         false, /* THDs */
+                                         & visitor);
 
   if (! account->m_lock.end_optimistic_lock(&lock))
     return;
@@ -193,7 +204,7 @@ int table_esms_by_account_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esms_by_account_by_event_name.h b/storage/perfschema/table_esms_by_account_by_event_name.h
index 64f2053cff6..e5f17ee1113 100644
--- a/storage/perfschema/table_esms_by_account_by_event_name.h
+++ b/storage/perfschema/table_esms_by_account_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -57,7 +57,7 @@ struct row_esms_by_account_by_event_name
 /**
   Position of a cursor on
   PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
-  Index 1 on user@host (0 based)
+  Index 1 on account (0 based)
   Index 2 on statement class (1 based)
 */
 struct pos_esms_by_account_by_event_name
@@ -73,9 +73,6 @@ struct pos_esms_by_account_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_account(void)
-  { return (m_index_1 < account_max); }
-
   inline void next_account(void)
   {
     m_index_1++;
@@ -91,6 +88,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esms_by_digest.cc b/storage/perfschema/table_esms_by_digest.cc
index 5f225ff5856..c27e3372562 100644
--- a/storage/perfschema/table_esms_by_digest.cc
+++ b/storage/perfschema/table_esms_by_digest.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -37,6 +37,7 @@
 #include "pfs_visitor.h"
 #include "table_esms_by_digest.h"
 #include "pfs_digest.h"
+#include "field.h"
 
 THR_LOCK table_esms_by_digest::m_table_lock;
 
@@ -48,8 +49,7 @@ table_esms_by_digest::m_share=
   table_esms_by_digest::create,
   NULL, /* write_row */
   table_esms_by_digest::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esms_by_digest::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_by_digest("
@@ -81,7 +81,8 @@ table_esms_by_digest::m_share=
                       "SUM_NO_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_INDEX_USED column in the events_statements_current table.',"
                       "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.',"
                       "FIRST_SEEN TIMESTAMP(0) NOT NULL default 0 comment 'Time at which the digest was first seen.',"
-                      "LAST_SEEN TIMESTAMP(0) NOT NULL default 0 comment 'Time at which the digest was most recently seen.')") }
+                      "LAST_SEEN TIMESTAMP(0) NOT NULL default 0 comment 'Time at which the digest was most recently seen.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -97,6 +98,12 @@ table_esms_by_digest::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esms_by_digest::get_row_count(void)
+{
+  return digest_max;
+}
+
 table_esms_by_digest::table_esms_by_digest()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -183,11 +190,11 @@ int table_esms_by_digest
   if (unlikely(! m_row_exists))
     return HA_ERR_RECORD_DELETED;
 
-  /* 
+  /*
     Set the null bits. It indicates how many fields could be null
     in the table.
   */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esms_by_digest.h b/storage/perfschema/table_esms_by_digest.h
index 903b86110f6..7b84cdd6429 100644
--- a/storage/perfschema/table_esms_by_digest.h
+++ b/storage/perfschema/table_esms_by_digest.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -62,6 +62,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_esms_by_host_by_event_name.cc b/storage/perfschema/table_esms_by_host_by_event_name.cc
index 92359186e8a..ac6a584965f 100644
--- a/storage/perfschema/table_esms_by_host_by_event_name.cc
+++ b/storage/perfschema/table_esms_by_host_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -34,6 +34,8 @@
 #include "pfs_global.h"
 #include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esms_by_host_by_event_name::m_table_lock;
 
@@ -45,8 +47,7 @@ table_esms_by_host_by_event_name::m_share=
   table_esms_by_host_by_event_name::create,
   NULL, /* write_row */
   table_esms_by_host_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esms_by_host_by_event_name::get_row_count,
   sizeof(pos_esms_by_host_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_by_host_by_event_name("
@@ -75,7 +76,8 @@ table_esms_by_host_by_event_name::m_share=
                       "SUM_SORT_ROWS BIGINT unsigned not null comment 'Sum of the SORT_ROWS column in the events_statements_current table.',"
                       "SUM_SORT_SCAN BIGINT unsigned not null comment 'Sum of the SORT_SCAN column in the events_statements_current table.',"
                       "SUM_NO_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_INDEX_USED column in the events_statements_current table.',"
-                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") }
+                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -93,6 +95,12 @@ table_esms_by_host_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esms_by_host_by_event_name::get_row_count(void)
+{
+  return global_host_container.get_row_count() * statement_class_max;
+}
+
 table_esms_by_host_by_event_name::table_esms_by_host_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -114,13 +122,14 @@ int table_esms_by_host_by_event_name::rnd_next(void)
 {
   PFS_host *host;
   PFS_statement_class *statement_class;
+  bool has_more_host= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_host();
+       has_more_host;
        m_pos.next_host())
   {
-    host= &host_array[m_pos.m_index_1];
-    if (host->m_lock.is_populated())
+    host= global_host_container.get(m_pos.m_index_1, & has_more_host);
+    if (host != NULL)
     {
       statement_class= find_statement_class(m_pos.m_index_2);
       if (statement_class)
@@ -142,17 +151,16 @@ table_esms_by_host_by_event_name::rnd_pos(const void *pos)
   PFS_statement_class *statement_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < host_max);
 
-  host= &host_array[m_pos.m_index_1];
-  if (! host->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  statement_class= find_statement_class(m_pos.m_index_2);
-  if (statement_class)
+  host= global_host_container.get(m_pos.m_index_1);
+  if (host != NULL)
   {
-    make_row(host, statement_class);
-    return 0;
+    statement_class= find_statement_class(m_pos.m_index_2);
+    if (statement_class)
+    {
+      make_row(host, statement_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -161,7 +169,7 @@ table_esms_by_host_by_event_name::rnd_pos(const void *pos)
 void table_esms_by_host_by_event_name
 ::make_row(PFS_host *host, PFS_statement_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   if (klass->is_mutable())
@@ -175,7 +183,11 @@ void table_esms_by_host_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_statement_visitor visitor(klass);
-  PFS_connection_iterator::visit_host(host, true, true, & visitor);
+  PFS_connection_iterator::visit_host(host,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! host->m_lock.end_optimistic_lock(&lock))
     return;
@@ -194,7 +206,7 @@ int table_esms_by_host_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esms_by_host_by_event_name.h b/storage/perfschema/table_esms_by_host_by_event_name.h
index a6985b48149..54237b1deb5 100644
--- a/storage/perfschema/table_esms_by_host_by_event_name.h
+++ b/storage/perfschema/table_esms_by_host_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -73,9 +73,6 @@ struct pos_esms_by_host_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_host(void)
-  { return (m_index_1 < host_max); }
-
   inline void next_host(void)
   {
     m_index_1++;
@@ -91,6 +88,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esms_by_program.cc b/storage/perfschema/table_esms_by_program.cc
new file mode 100644
index 00000000000..5a4dab759cd
--- /dev/null
+++ b/storage/perfschema/table_esms_by_program.cc
@@ -0,0 +1,245 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_esms_by_program.cc
+  Table EVENTS_STATEMENTS_SUMMARY_BY_PROGRAM (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
+#include "pfs_program.h"
+#include "table_esms_by_program.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_esms_by_program::m_table_lock;
+
+PFS_engine_table_share
+table_esms_by_program::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_by_program") },
+  &pfs_truncatable_acl,
+  table_esms_by_program::create,
+  NULL, /* write_row */
+  table_esms_by_program::delete_all_rows,
+  table_esms_by_program::get_row_count,
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_by_program ("
+                      "OBJECT_TYPE enum('EVENT', 'FUNCTION', 'PROCEDURE', 'TABLE', 'TRIGGER') comment 'Object type for which the summary is generated.',"
+                      "OBJECT_SCHEMA varchar(64) NOT NULL comment 'The schema of the object for which the summary is generated.',"
+                      "OBJECT_NAME varchar(64) NOT NULL comment 'The name of the object for which the summary is generated.',"
+                      "COUNT_STAR bigint(20) unsigned NOT NULL comment 'The number of summarized events (from events_statements_current). This value includes all events, whether timed or nontimed.',"
+                      "SUM_TIMER_WAIT bigint(20) unsigned NOT NULL comment 'The total wait time of the summarized timed events. This value is calculated only for timed events because nontimed events have a wait time of NULL. The same is true for the other xxx_TIMER_WAIT values.',"
+                      "MIN_TIMER_WAIT bigint(20) unsigned NOT NULL comment 'The minimum wait time of the summarized timed events.',"
+                      "AVG_TIMER_WAIT bigint(20) unsigned NOT NULL comment 'The average wait time of the summarized timed events.',"
+                      "MAX_TIMER_WAIT bigint(20) unsigned NOT NULL comment 'The maximum wait time of the summarized timed events.',"
+                      "COUNT_STATEMENTS bigint(20) unsigned NOT NULL comment 'Total number of nested statements invoked during stored program execution.',"
+                      "SUM_STATEMENTS_WAIT bigint(20) unsigned NOT NULL comment 'The total wait time of the summarized timed statements. This value is calculated only for timed statements because nontimed statements have a wait time of NULL. The same is true for the other xxx_STATEMENT_WAIT values.',"
+                      "MIN_STATEMENTS_WAIT bigint(20) unsigned NOT NULL comment 'The minimum wait time of the summarized timed statements.',"
+                      "AVG_STATEMENTS_WAIT bigint(20) unsigned NOT NULL comment 'The average wait time of the summarized timed statements.',"
+                      "MAX_STATEMENTS_WAIT bigint(20) unsigned NOT NULL comment 'The maximum wait time of the summarized timed statements.',"
+                      "SUM_LOCK_TIME bigint(20) unsigned NOT NULL comment 'The total time spent (in picoseconds) waiting for table locks for the summarized statements.',"
+                      "SUM_ERRORS bigint(20) unsigned NOT NULL comment 'The total number of errors that occurend for the summarized statements.',"
+                      "SUM_WARNINGS bigint(20) unsigned NOT NULL comment 'The total number of warnings that occurend for the summarized statements.',"
+                      "SUM_ROWS_AFFECTED bigint(20) unsigned NOT NULL comment 'The total number of affected rows by the summarized statements.',"
+                      "SUM_ROWS_SENT bigint(20) unsigned NOT NULL comment 'The total number of rows returned by the summarized statements.',"
+                      "SUM_ROWS_EXAMINED bigint(20) unsigned NOT NULL comment 'The total number of rows examined by the summarized statements.',"
+                      "SUM_CREATED_TMP_DISK_TABLES bigint(20) unsigned NOT NULL comment 'The total number of on-disk temporary tables created by the summarized statements.',"
+                      "SUM_CREATED_TMP_TABLES bigint(20) unsigned NOT NULL comment 'The total number of in-memory temporary tables created by the summarized statements.',"
+                      "SUM_SELECT_FULL_JOIN bigint(20) unsigned NOT NULL comment 'The total number of full joins executed by the summarized statements.',"
+                      "SUM_SELECT_FULL_RANGE_JOIN bigint(20) unsigned NOT NULL comment 'The total number of range search joins executed by the summarized statements.',"
+                      "SUM_SELECT_RANGE bigint(20) unsigned NOT NULL comment 'The total number of joins that used ranges on the first table executed by the summarized statements.',"
+                      "SUM_SELECT_RANGE_CHECK bigint(20) unsigned NOT NULL comment 'The total number of joins that check for key usage after each row executed by the summarized statements.',"
+                      "SUM_SELECT_SCAN bigint(20) unsigned NOT NULL comment 'The total number of joins that did a full scan of the first table executed by the summarized statements.',"
+                      "SUM_SORT_MERGE_PASSES bigint(20) unsigned NOT NULL comment 'The total number of merge passes that the sort algorithm has had to do for the summarized statements.',"
+                      "SUM_SORT_RANGE bigint(20) unsigned NOT NULL comment 'The total number of sorts that were done using ranges for the summarized statements.',"
+                      "SUM_SORT_ROWS bigint(20) unsigned NOT NULL comment 'The total number of sorted rows that were sorted by the summarized statements.',"
+                      "SUM_SORT_SCAN bigint(20) unsigned NOT NULL comment 'The total number of sorts that were done by scanning the table by the summarized statements.',"
+                      "SUM_NO_INDEX_USED bigint(20) unsigned NOT NULL comment 'The total number of statements that performed a table scan without using an index.',"
+                      "SUM_NO_GOOD_INDEX_USED bigint(20) unsigned NOT NULL comment 'The total number of statements where no good index was found.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_esms_by_program::create(void)
+{
+  return new table_esms_by_program();
+}
+
+int
+table_esms_by_program::delete_all_rows(void)
+{
+  reset_esms_by_program();
+  return 0;
+}
+
+ha_rows
+table_esms_by_program::get_row_count(void)
+{
+  return global_program_container.get_row_count();
+}
+
+table_esms_by_program::table_esms_by_program()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_esms_by_program::reset_position(void)
+{
+  m_pos= 0;
+  m_next_pos= 0;
+}
+
+int table_esms_by_program::rnd_next(void)
+{
+  PFS_program* pfs;
+
+  m_pos.set_at(&m_next_pos);
+  PFS_program_iterator it= global_program_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_by_program::rnd_pos(const void *pos)
+{
+  PFS_program* pfs;
+
+  set_position(pos);
+
+  pfs= global_program_container.get(m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+
+void table_esms_by_program::make_row(PFS_program* program)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+
+  program->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_object_type= program->m_type;
+
+  m_row.m_object_name_length= program->m_object_name_length;
+  if(m_row.m_object_name_length > 0)
+    memcpy(m_row.m_object_name, program->m_object_name,
+           m_row.m_object_name_length);
+
+  m_row.m_schema_name_length= program->m_schema_name_length;
+  if(m_row.m_schema_name_length > 0)
+    memcpy(m_row.m_schema_name, program->m_schema_name,
+           m_row.m_schema_name_length);
+
+  time_normalizer *normalizer= time_normalizer::get(statement_timer);
+  /* Get the stored program's overall stats. */
+  m_row.m_sp_stat.set(normalizer, &program->m_sp_stat);
+  /* Get the sub-statements' stats. */
+  m_row.m_stmt_stat.set(normalizer, & program->m_stmt_stat);
+
+  if (! program->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_esms_by_program
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /*
+    Set the null bits. It indicates how many fields could be null
+    in the table.
+  */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+        if(m_row.m_object_type != 0)
+          set_field_enum(f, m_row.m_object_type);
+        else
+          f->set_null();
+        break;
+      case 1: /* OBJECT_SCHEMA */
+        if(m_row.m_schema_name_length > 0)
+          set_field_varchar_utf8(f, m_row.m_schema_name,
+                                 m_row.m_schema_name_length);
+        else
+          f->set_null();
+        break;
+      case 2: /* OBJECT_NAME */
+        if(m_row.m_object_name_length > 0)
+          set_field_varchar_utf8(f, m_row.m_object_name,
+                                 m_row.m_object_name_length);
+        else
+          f->set_null();
+        break;
+      case 3: /* COUNT_STAR */
+      case 4: /* SUM_TIMER_WAIT */
+      case 5: /* MIN_TIMER_WAIT */
+      case 6: /* AVG_TIMER_WAIT */
+      case 7: /* MAX_TIMER_WAIT */
+        m_row.m_sp_stat.set_field(f->field_index - 3, f);
+        break;
+      default: /* 8, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stmt_stat.set_field(f->field_index - 8, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_by_program.h b/storage/perfschema/table_esms_by_program.h
new file mode 100644
index 00000000000..fdafb655918
--- /dev/null
+++ b/storage/perfschema/table_esms_by_program.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_ESMS_BY_PROGRAM_H
+#define TABLE_ESMS_BY_PROGRAM_H
+
+/**
+  @file storage/perfschema/table_esms_by_program.h
+  Table EVENTS_STATEMENTS_SUMMARY_BY_PROGRAM (declarations).
+*/
+
+#include "table_helper.h"
+#include "pfs_program.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_PROGRAM.
+*/
+struct row_esms_by_program
+{
+  /** Column OBJECT_TYPE. A value of 0 is reported as NULL. */
+  enum_object_type m_object_type;
+  /** Column OBJECT_SCHEMA. */
+  char m_schema_name[COL_OBJECT_SCHEMA_SIZE];
+  int m_schema_name_length; /**< Length of m_schema_name; 0 makes the column NULL. */
+  /** Column OBJECT_NAME. */
+  char m_object_name[COL_OBJECT_NAME_SIZE];
+  int m_object_name_length; /**< Length of m_object_name; 0 makes the column NULL. */
+
+  /**
+    Columns COUNT_STAR        (field index 3)
+            SUM_TIMER_WAIT    (field index 4)
+            MIN_TIMER_WAIT    (field index 5)
+            AVG_TIMER_WAIT    (field index 6)
+            MAX_TIMER_WAIT    (field index 7)
+  */
+  PFS_sp_stat_row m_sp_stat;
+
+  /** Columns COUNT_STATEMENTS,SUM_STATEMENTS_WAIT...SUM_NO_GOOD_INDEX_USED (index 8 and up). */
+  PFS_statement_stat_row m_stmt_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_PROGRAM. */
+class table_esms_by_program : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table, /* stores m_row into the requested fields; */
+                              unsigned char *buf, /* returns HA_ERR_RECORD_DELETED when */
+                              Field **fields, /* m_row_exists is false, 0 otherwise */
+                              bool read_all);
+
+  table_esms_by_program();
+
+public:
+  ~table_esms_by_program()
+  {}
+
+protected:
+  void make_row(PFS_program*); /* fills m_row; sets m_row_exists only on success */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_by_program m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_by_thread_by_event_name.cc b/storage/perfschema/table_esms_by_thread_by_event_name.cc
index 5a06f602e5c..eb38c3e5687 100644
--- a/storage/perfschema/table_esms_by_thread_by_event_name.cc
+++ b/storage/perfschema/table_esms_by_thread_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_esms_by_thread_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esms_by_thread_by_event_name::m_table_lock;
 
@@ -44,8 +46,7 @@ table_esms_by_thread_by_event_name::m_share=
   table_esms_by_thread_by_event_name::create,
   NULL, /* write_row */
   table_esms_by_thread_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esms_by_thread_by_event_name::get_row_count,
   sizeof(pos_esms_by_thread_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_by_thread_by_event_name("
@@ -74,7 +75,8 @@ table_esms_by_thread_by_event_name::m_share=
                       "SUM_SORT_ROWS BIGINT unsigned not null comment 'Sum of the SORT_ROWS column in the events_statements_current table.',"
                       "SUM_SORT_SCAN BIGINT unsigned not null comment 'Sum of the SORT_SCAN column in the events_statements_current table.',"
                       "SUM_NO_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_INDEX_USED column in the events_statements_current table.',"
-                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") }
+                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -90,6 +92,12 @@ table_esms_by_thread_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esms_by_thread_by_event_name::get_row_count(void)
+{ /* Row count exposed through m_share: thread container rows times statement classes. */
+  return global_thread_container.get_row_count() * statement_class_max;
+}
+
 table_esms_by_thread_by_event_name::table_esms_by_thread_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -111,18 +119,14 @@ int table_esms_by_thread_by_event_name::rnd_next(void)
 {
   PFS_thread *thread;
   PFS_statement_class *statement_class;
+  bool has_more_thread= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_thread();
+       has_more_thread;
        m_pos.next_thread())
   {
-    thread= &thread_array[m_pos.m_index_1];
-
-    /*
-      Important note: the thread scan is the outer loop (index 1),
-      to minimize the number of calls to atomic operations.
-    */
-    if (thread->m_lock.is_populated())
+    thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (thread != NULL)
     {
       statement_class= find_statement_class(m_pos.m_index_2);
       if (statement_class)
@@ -144,17 +148,16 @@ table_esms_by_thread_by_event_name::rnd_pos(const void *pos)
   PFS_statement_class *statement_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
 
-  thread= &thread_array[m_pos.m_index_1];
-  if (! thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  statement_class= find_statement_class(m_pos.m_index_2);
-  if (statement_class)
+  thread= global_thread_container.get(m_pos.m_index_1);
+  if (thread != NULL)
   {
-    make_row(thread, statement_class);
-    return 0;
+    statement_class= find_statement_class(m_pos.m_index_2);
+    if (statement_class)
+    {
+      make_row(thread, statement_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -163,7 +166,7 @@ table_esms_by_thread_by_event_name::rnd_pos(const void *pos)
 void table_esms_by_thread_by_event_name
 ::make_row(PFS_thread *thread, PFS_statement_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   if (klass->is_mutable())
@@ -196,7 +199,7 @@ int table_esms_by_thread_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
diff --git a/storage/perfschema/table_esms_by_thread_by_event_name.h b/storage/perfschema/table_esms_by_thread_by_event_name.h
index 72645d03389..9a0818291b9 100644
--- a/storage/perfschema/table_esms_by_thread_by_event_name.h
+++ b/storage/perfschema/table_esms_by_thread_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -72,9 +72,6 @@ struct pos_esms_by_thread_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_thread(void)
-  { return (m_index_1 < thread_max); }
-
   inline void next_thread(void)
   {
     m_index_1++;
@@ -95,6 +92,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esms_by_user_by_event_name.cc b/storage/perfschema/table_esms_by_user_by_event_name.cc
index 53bce1ac202..a0efa66c177 100644
--- a/storage/perfschema/table_esms_by_user_by_event_name.cc
+++ b/storage/perfschema/table_esms_by_user_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,14 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_esms_by_user_by_event_name.h"
 #include "pfs_global.h"
-#include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_esms_by_user_by_event_name::m_table_lock;
 
@@ -45,12 +46,11 @@ table_esms_by_user_by_event_name::m_share=
   table_esms_by_user_by_event_name::create,
   NULL, /* write_row */
   table_esms_by_user_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esms_by_user_by_event_name::get_row_count,
   sizeof(pos_esms_by_user_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_by_user_by_event_name("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'User. Used together with EVENT_NAME for grouping events.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'User. Used together with EVENT_NAME for grouping events.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Event name. Used together with USER for grouping events.',"
                       "COUNT_STAR BIGINT unsigned not null comment 'Number of summarized events',"
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
@@ -75,7 +75,8 @@ table_esms_by_user_by_event_name::m_share=
                       "SUM_SORT_ROWS BIGINT unsigned not null comment 'Sum of the SORT_ROWS column in the events_statements_current table.',"
                       "SUM_SORT_SCAN BIGINT unsigned not null comment 'Sum of the SORT_SCAN column in the events_statements_current table.',"
                       "SUM_NO_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_INDEX_USED column in the events_statements_current table.',"
-                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") }
+                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -93,6 +94,12 @@ table_esms_by_user_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esms_by_user_by_event_name::get_row_count(void)
+{ /* Row count exposed through m_share: user container rows times statement classes. */
+  return global_user_container.get_row_count() * statement_class_max;
+}
+
 table_esms_by_user_by_event_name::table_esms_by_user_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -114,13 +121,14 @@ int table_esms_by_user_by_event_name::rnd_next(void)
 {
   PFS_user *user;
   PFS_statement_class *statement_class;
+  bool has_more_user= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_user();
+       has_more_user;
        m_pos.next_user())
   {
-    user= &user_array[m_pos.m_index_1];
-    if (user->m_lock.is_populated())
+    user= global_user_container.get(m_pos.m_index_1, & has_more_user);
+    if (user != NULL)
     {
       statement_class= find_statement_class(m_pos.m_index_2);
       if (statement_class)
@@ -142,17 +150,16 @@ table_esms_by_user_by_event_name::rnd_pos(const void *pos)
   PFS_statement_class *statement_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < user_max);
 
-  user= &user_array[m_pos.m_index_1];
-  if (! user->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  statement_class= find_statement_class(m_pos.m_index_2);
-  if (statement_class)
+  user= global_user_container.get(m_pos.m_index_1);
+  if (user != NULL)
   {
-    make_row(user, statement_class);
-    return 0;
+    statement_class= find_statement_class(m_pos.m_index_2);
+    if (statement_class)
+    {
+      make_row(user, statement_class);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -161,7 +168,7 @@ table_esms_by_user_by_event_name::rnd_pos(const void *pos)
 void table_esms_by_user_by_event_name
 ::make_row(PFS_user *user, PFS_statement_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   if (klass->is_mutable())
@@ -175,7 +182,11 @@ void table_esms_by_user_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_statement_visitor visitor(klass);
-  PFS_connection_iterator::visit_user(user, true, true, & visitor);
+  PFS_connection_iterator::visit_user(user,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! user->m_lock.end_optimistic_lock(&lock))
     return;
@@ -194,7 +205,7 @@ int table_esms_by_user_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_esms_by_user_by_event_name.h b/storage/perfschema/table_esms_by_user_by_event_name.h
index d1d1e5df85d..51762be4143 100644
--- a/storage/perfschema/table_esms_by_user_by_event_name.h
+++ b/storage/perfschema/table_esms_by_user_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -73,9 +73,6 @@ struct pos_esms_by_user_by_event_name
     m_index_2= 1;
   }
 
-  inline bool has_more_user(void)
-  { return (m_index_1 < user_max); }
-
   inline void next_user(void)
   {
     m_index_1++;
@@ -91,6 +88,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_esms_global_by_event_name.cc b/storage/perfschema/table_esms_global_by_event_name.cc
index c33f05bbecb..f889df11b5a 100644
--- a/storage/perfschema/table_esms_global_by_event_name.cc
+++ b/storage/perfschema/table_esms_global_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -35,6 +35,7 @@
 #include "pfs_instr.h"
 #include "pfs_timer.h"
 #include "pfs_visitor.h"
+#include "field.h"
 
 THR_LOCK table_esms_global_by_event_name::m_table_lock;
 
@@ -46,8 +47,7 @@ table_esms_global_by_event_name::m_share=
   table_esms_global_by_event_name::create,
   NULL, /* write_row */
   table_esms_global_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_esms_global_by_event_name::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_summary_global_by_event_name("
@@ -75,7 +75,8 @@ table_esms_global_by_event_name::m_share=
                       "SUM_SORT_ROWS BIGINT unsigned not null comment 'Sum of the SORT_ROWS column in the events_statements_current table.',"
                       "SUM_SORT_SCAN BIGINT unsigned not null comment 'Sum of the SORT_SCAN column in the events_statements_current table.',"
                       "SUM_NO_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_INDEX_USED column in the events_statements_current table.',"
-                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") }
+                      "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null comment 'Sum of the NO_GOOD_INDEX_USED column in the events_statements_current table.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -95,6 +96,12 @@ table_esms_global_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_esms_global_by_event_name::get_row_count(void)
+{ /* Row count exposed through m_share: the value of statement_class_max. */
+  return statement_class_max;
+}
+
 table_esms_global_by_event_name::table_esms_global_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(1), m_next_pos(1)
@@ -164,9 +171,12 @@ void table_esms_global_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_statement_visitor visitor(klass);
-  PFS_connection_iterator::visit_global(true, /* hosts */
+  PFS_connection_iterator::visit_global(true,  /* hosts */
                                         false, /* users */
-                                        true, true, & visitor);
+                                        true,  /* accounts */
+                                        true,  /* threads */
+                                        false, /* THDs */
+                                        & visitor);
 
   m_row.m_stat.set(m_normalizer, & visitor.m_stat);
   m_row_exists= true;
@@ -182,7 +192,7 @@ int table_esms_global_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
diff --git a/storage/perfschema/table_esms_global_by_event_name.h b/storage/perfschema/table_esms_global_by_event_name.h
index b90c14c0c0f..c4a32da11bf 100644
--- a/storage/perfschema/table_esms_global_by_event_name.h
+++ b/storage/perfschema/table_esms_global_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -59,6 +59,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_ets_by_account_by_event_name.cc b/storage/perfschema/table_ets_by_account_by_event_name.cc
new file mode 100644
index 00000000000..9d323eb78d9
--- /dev/null
+++ b/storage/perfschema/table_ets_by_account_by_event_name.cc
@@ -0,0 +1,225 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_ets_by_account_by_event_name.cc
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ets_by_account_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_ets_by_account_by_event_name::m_table_lock;
+
+PFS_engine_table_share
+table_ets_by_account_by_event_name::m_share=
+{ /* Positional initializer: entry order must follow PFS_engine_table_share's members. */
+  { C_STRING_WITH_LEN("events_transactions_summary_by_account_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ets_by_account_by_event_name::create,
+  NULL, /* write_row */
+  table_ets_by_account_by_event_name::delete_all_rows,
+  table_ets_by_account_by_event_name::get_row_count, /* get_row_count */
+  sizeof(pos_ets_by_account_by_event_name),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_summary_by_account_by_event_name("
+  "USER CHAR(32) collate utf8_bin default null comment 'User for which summary is generated.'," /* NOTE(review): fixed 32, while events_statements_summary_by_user_by_event_name uses USERNAME_CHAR_LENGTH_STR - confirm */
+  "HOST CHAR(60) collate utf8_bin default null comment 'Host for which summary is generated.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name for which summary is generated.',"
+  "COUNT_STAR BIGINT unsigned not null comment 'The number of summarized events. This value includes all events, whether timed or nontimed.',"
+  "SUM_TIMER_WAIT BIGINT unsigned not null comment 'The total wait time of the summarized timed events. This value is calculated only for timed events because nontimed events have a wait time of NULL. The same is true for the other xxx_TIMER_WAIT values.',"
+  "MIN_TIMER_WAIT BIGINT unsigned not null comment 'The minimum wait time of the summarized timed events.',"
+  "AVG_TIMER_WAIT BIGINT unsigned not null comment 'The average wait time of the summarized timed events.',"
+  "MAX_TIMER_WAIT BIGINT unsigned not null comment 'The maximum wait time of the summarized timed events.',"
+  "COUNT_READ_WRITE BIGINT unsigned not null comment 'The total number of only READ/WRITE transaction events.',"
+  "SUM_TIMER_READ_WRITE BIGINT unsigned not null comment 'The total wait time of only READ/WRITE transaction events.',"
+  "MIN_TIMER_READ_WRITE BIGINT unsigned not null comment 'The minimum wait time of only READ/WRITE transaction events.',"
+  "AVG_TIMER_READ_WRITE BIGINT unsigned not null comment 'The average wait time of only READ/WRITE transaction events.',"
+  "MAX_TIMER_READ_WRITE BIGINT unsigned not null comment 'The maximum wait time of only READ/WRITE transaction events.',"
+  "COUNT_READ_ONLY BIGINT unsigned not null comment 'The total number of only READ ONLY transaction events.',"
+  "SUM_TIMER_READ_ONLY BIGINT unsigned not null comment 'The total wait time of only READ ONLY transaction events.',"
+  "MIN_TIMER_READ_ONLY BIGINT unsigned not null comment 'The minimum wait time of only READ ONLY transaction events.',"
+  "AVG_TIMER_READ_ONLY BIGINT unsigned not null comment 'The average wait time of only READ ONLY transaction events.',"
+  "MAX_TIMER_READ_ONLY BIGINT unsigned not null comment 'The maximum wait time of only READ ONLY transaction events.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_ets_by_account_by_event_name::create(void)
+{ /* Factory listed in m_share: returns a table object allocated with new. */
+  return new table_ets_by_account_by_event_name();
+}
+
+int
+table_ets_by_account_by_event_name::delete_all_rows(void)
+{ /* delete_all_rows entry of m_share: runs both reset helpers below. */
+  reset_events_transactions_by_thread();
+  reset_events_transactions_by_account();
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_ets_by_account_by_event_name::get_row_count(void)
+{ /* Row count exposed through m_share: account container rows times transaction classes. */
+  return global_account_container.get_row_count() * transaction_class_max;
+}
+
+table_ets_by_account_by_event_name::table_ets_by_account_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos), /* base class is handed the share and m_pos */
+    m_row_exists(false), m_pos(), m_next_pos() /* no row yet; positions default to (0, 1) */
+{}
+
+void table_ets_by_account_by_event_name::reset_position(void)
+{ /* Rewind both the current and the next position to their initial (0, 1) value. */
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_ets_by_account_by_event_name::rnd_init(bool scan)
+{ /* Selects the normalizer later used by make_row(); the scan argument is not used. */
+  m_normalizer= time_normalizer::get(transaction_timer);
+  return 0; /* always succeeds */
+}
+
+int table_ets_by_account_by_event_name::rnd_next(void)
+{ /* Move to the next (account, transaction class) pair; 0 or HA_ERR_END_OF_FILE. */
+  PFS_account *account;
+  PFS_transaction_class *transaction_class;
+  bool has_more_account= true; /* expected to be cleared by get() past the last account */
+
+  for (m_pos.set_at(&m_next_pos); /* resume from the position saved by the previous call */
+       has_more_account;
+       m_pos.next_account()) /* next account index, class index back to 1 */
+  {
+    account= global_account_container.get(m_pos.m_index_1, & has_more_account);
+    if (account != NULL) /* NULL: nothing usable at this index, try the next one */
+    {
+      transaction_class= find_transaction_class(m_pos.m_index_2);
+      if (transaction_class) /* no class at this index: go on to the next account */
+      {
+        make_row(account, transaction_class); /* may leave m_row_exists false */
+        m_next_pos.set_after(&m_pos); /* remember where the next call starts */
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_ets_by_account_by_event_name::rnd_pos(const void *pos)
+{ /* Re-read the row at a saved position; 0 or HA_ERR_RECORD_DELETED. */
+  PFS_account *account;
+  PFS_transaction_class *transaction_class;
+
+  set_position(pos); /* m_pos is used for both lookups below */
+
+  account= global_account_container.get(m_pos.m_index_1);
+  if (account != NULL)
+  {
+    transaction_class= find_transaction_class(m_pos.m_index_2);
+    if (transaction_class)
+    {
+      make_row(account, transaction_class); /* may leave m_row_exists false */
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED; /* account or transaction class not found */
+}
+
+void table_ets_by_account_by_event_name
+::make_row(PFS_account *account, PFS_transaction_class *klass)
+{ /* Build m_row for (account, klass); m_row_exists stays false on every early return. */
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+
+  account->m_lock.begin_optimistic_lock(&lock); /* paired with end_optimistic_lock below */
+
+  if (m_row.m_account.make_row(account)) /* non-zero result: give up on this row */
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_transaction_visitor visitor(klass);
+  PFS_connection_iterator::visit_account(account,
+                                         true,  /* threads */
+                                         false, /* THDs */
+                                         &visitor);
+
+  if (! account->m_lock.end_optimistic_lock(&lock)) /* check failed: discard the row */
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat); /* visitor's statistics, via m_normalizer */
+}
+
+int table_ets_by_account_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{ /* Store the current row (m_row) into the requested fields; returns 0 on success. */
+  Field *f;
+
+  if (unlikely(! m_row_exists)) /* no row yet, or make_row() returned early */
+    return HA_ERR_RECORD_DELETED;
+
+  /* Reset the null bits: exactly one null byte is expected (asserted), kept in buf[0] */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++) /* fields is a NULL-terminated array */
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index)) /* requested columns only */
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default:
+        /**
+          COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+          COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+          COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY
+        */
+        m_row.m_stat.set_field(f->field_index-3, f); /* index relative to COUNT_STAR */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ets_by_account_by_event_name.h b/storage/perfschema/table_ets_by_account_by_event_name.h
new file mode 100644
index 00000000000..3399c27d6e4
--- /dev/null
+++ b/storage/perfschema/table_ets_by_account_by_event_name.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_ETS_BY_ACCOUNT_BY_EVENT_NAME_H
+#define TABLE_ETS_BY_ACCOUNT_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ets_by_account_by_event_name.h
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+*/
+struct row_ets_by_account_by_event_name
+{
+  /** Columns USER, HOST (field indexes 0 and 1). */
+  PFS_account_row m_account;
+  /** Column EVENT_NAME (field index 2). */
+  PFS_event_name_row m_event_name;
+  /**
+    Columns COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+    COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+    COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY (field index 3 and up)
+  */
+  PFS_transaction_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+  Index 1 on account (0 based)
+  Index 2 on transaction class (1 based)
+*/
+struct pos_ets_by_account_by_event_name
+: public PFS_double_index
+{
+  pos_ets_by_account_by_event_name()
+    : PFS_double_index(0, 1) /* account index 0, transaction class index 1 */
+  {}
+
+  inline void reset(void)
+  { /* Back to the initial position, same values as the constructor. */
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline void next_account(void)
+  { /* Step to the next account and restart at transaction class index 1. */
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME. */
+class table_ets_by_account_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create(); /* returns a new table object */
+  static int delete_all_rows(); /* calls the by-thread and by-account reset helpers */
+  static ha_rows get_row_count(); /* account container rows times transaction classes */
+
+  virtual int rnd_init(bool scan); /* selects the time normalizer; always returns 0 */
+  virtual int rnd_next(); /* 0 when positioned on a pair, else HA_ERR_END_OF_FILE */
+  virtual int rnd_pos(const void *pos); /* 0, or HA_ERR_RECORD_DELETED if not found */
+  virtual void reset_position(void); /* rewinds m_pos and m_next_pos */
+
+protected:
+  virtual int read_row_values(TABLE *table, /* stores m_row into the requested fields; */
+                              unsigned char *buf, /* returns HA_ERR_RECORD_DELETED when */
+                              Field **fields, /* m_row_exists is false, 0 otherwise */
+                              bool read_all);
+
+  table_ets_by_account_by_event_name(); /* protected; create() is the caller in the .cc */
+
+public:
+  ~table_ets_by_account_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_account *account, PFS_transaction_class *klass); /* fills m_row */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): no definition in the matching .cc - looks unused. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ets_by_account_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ets_by_account_by_event_name m_pos;
+  /** Next position. */
+  pos_ets_by_account_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ets_by_host_by_event_name.cc b/storage/perfschema/table_ets_by_host_by_event_name.cc
new file mode 100644
index 00000000000..6ca2cfad81a
--- /dev/null
+++ b/storage/perfschema/table_ets_by_host_by_event_name.cc
@@ -0,0 +1,226 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_ets_by_host_by_event_name.cc
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ets_by_host_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+/** Definition of the static table lock; m_share stores its address. */
+THR_LOCK table_ets_by_host_by_event_name::m_table_lock;
+/** Share for this table: name, ACL, callbacks, position size, lock and CREATE TABLE text. */
+PFS_engine_table_share
+table_ets_by_host_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_summary_by_host_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ets_by_host_by_event_name::create,
+  NULL, /* write_row */
+  table_ets_by_host_by_event_name::delete_all_rows,
+  table_ets_by_host_by_event_name::get_row_count,
+  sizeof(pos_ets_by_host_by_event_name), /* size of the cursor position structure */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_summary_by_host_by_event_name("
+  "HOST CHAR(60) collate utf8_bin default null comment 'Host for which summary is generated.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name for which summary is generated.',"
+  "COUNT_STAR BIGINT unsigned not null comment 'The number of summarized events. This value includes all events, whether timed or nontimed.',"
+  "SUM_TIMER_WAIT BIGINT unsigned not null comment 'The total wait time of the summarized timed events. This value is calculated only for timed events because nontimed events have a wait time of NULL. The same is true for the other xxx_TIMER_WAIT values.',"
+  "MIN_TIMER_WAIT BIGINT unsigned not null comment 'The minimum wait time of the summarized timed events.',"
+  "AVG_TIMER_WAIT BIGINT unsigned not null comment 'The average wait time of the summarized timed events.',"
+  "MAX_TIMER_WAIT BIGINT unsigned not null comment 'The maximum wait time of the summarized timed events.',"
+  "COUNT_READ_WRITE BIGINT unsigned not null comment 'The total number of only READ/WRITE transaction events.',"
+  "SUM_TIMER_READ_WRITE BIGINT unsigned not null comment 'The total wait time of only READ/WRITE transaction events.',"
+  "MIN_TIMER_READ_WRITE BIGINT unsigned not null comment 'The minimum wait time of only READ/WRITE transaction events.',"
+  "AVG_TIMER_READ_WRITE BIGINT unsigned not null comment 'The average wait time of only READ/WRITE transaction events.',"
+  "MAX_TIMER_READ_WRITE BIGINT unsigned not null comment 'The maximum wait time of only READ/WRITE transaction events.',"
+  "COUNT_READ_ONLY BIGINT unsigned not null comment 'The total number of only READ ONLY transaction events.',"
+  "SUM_TIMER_READ_ONLY BIGINT unsigned not null comment 'The total wait time of only READ ONLY transaction events.',"
+  "MIN_TIMER_READ_ONLY BIGINT unsigned not null comment 'The minimum wait time of only READ ONLY transaction events.',"
+  "AVG_TIMER_READ_ONLY BIGINT unsigned not null comment 'The average wait time of only READ ONLY transaction events.',"
+  "MAX_TIMER_READ_ONLY BIGINT unsigned not null comment 'The maximum wait time of only READ ONLY transaction events.')")},
+  false  /* perpetual */
+};
+/** Create callback listed in m_share: returns a newly allocated cursor object. */
+PFS_engine_table*
+table_ets_by_host_by_event_name::create(void)
+{
+  return new table_ets_by_host_by_event_name();
+}
+/** delete_all_rows callback: runs the by-thread, by-account and by-host resets. */
+int
+table_ets_by_host_by_event_name::delete_all_rows(void)
+{
+  reset_events_transactions_by_thread();
+  reset_events_transactions_by_account();
+  reset_events_transactions_by_host();
+  return 0; /* always reports success */
+}
+/** get_row_count callback: host container row count times transaction_class_max. */
+ha_rows
+table_ets_by_host_by_event_name::get_row_count(void)
+{
+  return global_host_container.get_row_count() * transaction_class_max;
+}
+/** Constructor: hands m_share and m_pos to the base class; no row exists yet. */
+table_ets_by_host_by_event_name::table_ets_by_host_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+/** Rewind both the current and the next position to their initial values. */
+void table_ets_by_host_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+/** Store the time normalizer for transaction_timer in m_normalizer; 'scan' is unused. */
+int table_ets_by_host_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(transaction_timer);
+  return 0;
+}
+/** Advance to the next (host, transaction class) pair; HA_ERR_END_OF_FILE when none is left. */
+int table_ets_by_host_by_event_name::rnd_next(void)
+{
+  PFS_host *host;
+  PFS_transaction_class *transaction_class;
+  bool has_more_host= true;
+  /* Resume at m_next_pos; next_host() steps to the following host, class index back to 1. */
+  for (m_pos.set_at(&m_next_pos);
+       has_more_host;
+       m_pos.next_host())
+  {
+    host= global_host_container.get(m_pos.m_index_1, & has_more_host);
+    if (host != NULL)
+    {
+      transaction_class= find_transaction_class(m_pos.m_index_2);
+      if (transaction_class)
+      {
+        make_row(host, transaction_class); /* may leave m_row_exists false; 0 is still returned */
+        m_next_pos.set_after(&m_pos); /* the next call resumes from here */
+        return 0;
+      }
+    }
+  }
+  /* The loop ended because has_more_host became false. */
+  return HA_ERR_END_OF_FILE;
+}
+/** Read the row at saved position 'pos'; HA_ERR_RECORD_DELETED if host or class is not found. */
+int
+table_ets_by_host_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_host *host;
+  PFS_transaction_class *transaction_class;
+
+  set_position(pos);
+
+  host= global_host_container.get(m_pos.m_index_1);
+  if (host != NULL)
+  {
+    transaction_class= find_transaction_class(m_pos.m_index_2);
+    if (transaction_class)
+    {
+      make_row(host, transaction_class);
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+/** Build m_row for (host, klass) under an optimistic lock; m_row_exists is set only on success. */
+void table_ets_by_host_by_event_name
+::make_row(PFS_host *host, PFS_transaction_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+
+  host->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_host.make_row(host))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+  /* Collect the statistics for klass through the visitor (accounts and threads, not THDs). */
+  PFS_connection_transaction_visitor visitor(klass);
+  PFS_connection_iterator::visit_host(host,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
+  /* A failed lock check leaves m_row_exists false (presumably the host changed meanwhile). */
+  if (! host->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+/** Copy the current row into the requested fields; HA_ERR_RECORD_DELETED if no row was built. */
+int table_ets_by_host_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits: one null byte is expected (HOST is the only 'default null' column). */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* fields is walked until its terminating NULL entry. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default:
+        /**
+          COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+          COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+          COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY
+        */
+        m_row.m_stat.set_field(f->field_index-2, f); /* statistics start at field index 2 */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ets_by_host_by_event_name.h b/storage/perfschema/table_ets_by_host_by_event_name.h
new file mode 100644
index 00000000000..2188f385701
--- /dev/null
+++ b/storage/perfschema/table_ets_by_host_by_event_name.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_ETS_BY_HOST_BY_EVENT_NAME_H
+#define TABLE_ETS_BY_HOST_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ets_by_host_by_event_name.h
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME.
+*/
+struct row_ets_by_host_by_event_name
+{
+  /** Column HOST (declared 'default null' in the table DDL, i.e. nullable). */
+  PFS_host_row m_host;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /**
+    Columns COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+    COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+    COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY (15 columns in total).
+  */
+  PFS_transaction_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME.
+  Index 1 on host (0 based)
+  Index 2 on transaction class (1 based)
+*/
+struct pos_ets_by_host_by_event_name
+: public PFS_double_index
+{
+  pos_ets_by_host_by_event_name()
+    : PFS_double_index(0, 1) /* first host, first transaction class */
+  {}
+  /** Rewind to the first host (index 0) and the first transaction class (index 1). */
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+  /** Step to the next host and restart at the first transaction class. */
+  inline void next_host(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_HOST_BY_EVENT_NAME. */
+class table_ets_by_host_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  /* Protected constructor; create() is the caller in the matching .cc file. */
+  table_ets_by_host_by_event_name();
+
+public:
+  ~table_ets_by_host_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_host *host, PFS_transaction_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): not defined in the matching .cc; looks unused - confirm. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ets_by_host_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ets_by_host_by_event_name m_pos;
+  /** Next position. */
+  pos_ets_by_host_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ets_by_thread_by_event_name.cc b/storage/perfschema/table_ets_by_thread_by_event_name.cc
new file mode 100644
index 00000000000..257f37d56ed
--- /dev/null
+++ b/storage/perfschema/table_ets_by_thread_by_event_name.cc
@@ -0,0 +1,218 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_ets_by_thread_by_event_name.cc
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ets_by_thread_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+/** Definition of the static table lock; m_share stores its address. */
+THR_LOCK table_ets_by_thread_by_event_name::m_table_lock;
+/** Share for this table: name, ACL, callbacks, position size, lock and CREATE TABLE text. */
+PFS_engine_table_share
+table_ets_by_thread_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_summary_by_thread_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ets_by_thread_by_event_name::create,
+  NULL, /* write_row */
+  table_ets_by_thread_by_event_name::delete_all_rows,
+  table_ets_by_thread_by_event_name::get_row_count,
+  sizeof(pos_ets_by_thread_by_event_name), /* size of the cursor position structure */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_summary_by_thread_by_event_name("
+  "THREAD_ID BIGINT unsigned not null comment 'Thread for which summary is generated.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name for which summary is generated.',"
+  "COUNT_STAR BIGINT unsigned not null comment 'The number of summarized events. This value includes all events, whether timed or nontimed.',"
+  "SUM_TIMER_WAIT BIGINT unsigned not null comment 'The total wait time of the summarized timed events. This value is calculated only for timed events because nontimed events have a wait time of NULL. The same is true for the other xxx_TIMER_WAIT values.',"
+  "MIN_TIMER_WAIT BIGINT unsigned not null comment 'The minimum wait time of the summarized timed events.',"
+  "AVG_TIMER_WAIT BIGINT unsigned not null comment 'The average wait time of the summarized timed events.',"
+  "MAX_TIMER_WAIT BIGINT unsigned not null comment 'The maximum wait time of the summarized timed events.',"
+  "COUNT_READ_WRITE BIGINT unsigned not null comment 'The total number of only READ/WRITE transaction events.',"
+  "SUM_TIMER_READ_WRITE BIGINT unsigned not null comment 'The total wait time of only READ/WRITE transaction events.',"
+  "MIN_TIMER_READ_WRITE BIGINT unsigned not null comment 'The minimum wait time of only READ/WRITE transaction events.',"
+  "AVG_TIMER_READ_WRITE BIGINT unsigned not null comment 'The average wait time of only READ/WRITE transaction events.',"
+  "MAX_TIMER_READ_WRITE BIGINT unsigned not null comment 'The maximum wait time of only READ/WRITE transaction events.',"
+  "COUNT_READ_ONLY BIGINT unsigned not null comment 'The total number of only READ ONLY transaction events.',"
+  "SUM_TIMER_READ_ONLY BIGINT unsigned not null comment 'The total wait time of only READ ONLY transaction events.',"
+  "MIN_TIMER_READ_ONLY BIGINT unsigned not null comment 'The minimum wait time of only READ ONLY transaction events.',"
+  "AVG_TIMER_READ_ONLY BIGINT unsigned not null comment 'The average wait time of only READ ONLY transaction events.',"
+  "MAX_TIMER_READ_ONLY BIGINT unsigned not null comment 'The maximum wait time of only READ ONLY transaction events.')")},
+  false  /* perpetual */
+};
+/** Create callback listed in m_share: returns a newly allocated cursor object. */
+PFS_engine_table*
+table_ets_by_thread_by_event_name::create(void)
+{
+  return new table_ets_by_thread_by_event_name();
+}
+/** delete_all_rows callback: runs the by-thread reset only. */
+int
+table_ets_by_thread_by_event_name::delete_all_rows(void)
+{
+  reset_events_transactions_by_thread();
+  return 0; /* always reports success */
+}
+/** get_row_count callback: thread container row count times transaction_class_max. */
+ha_rows
+table_ets_by_thread_by_event_name::get_row_count(void)
+{
+  return global_thread_container.get_row_count() * transaction_class_max;
+}
+/** Constructor: hands m_share and m_pos to the base class; no row exists yet. */
+table_ets_by_thread_by_event_name::table_ets_by_thread_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+/** Rewind both the current and the next position to their initial values. */
+void table_ets_by_thread_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+/** Store the time normalizer for transaction_timer in m_normalizer; 'scan' is unused. */
+int table_ets_by_thread_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(transaction_timer);
+  return 0;
+}
+/** Advance to the next (thread, transaction class) pair; HA_ERR_END_OF_FILE when none is left. */
+int table_ets_by_thread_by_event_name::rnd_next(void)
+{
+  PFS_thread *thread;
+  PFS_transaction_class *transaction_class;
+  bool has_more_thread= true;
+  /* Resume at m_next_pos; next_thread() steps to the following thread, class index back to 1. */
+  for (m_pos.set_at(&m_next_pos);
+       has_more_thread;
+       m_pos.next_thread())
+  {
+    thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (thread != NULL)
+    {
+      transaction_class= find_transaction_class(m_pos.m_index_2);
+      if (transaction_class)
+      {
+        make_row(thread, transaction_class); /* may leave m_row_exists false; 0 is still returned */
+        m_next_pos.set_after(&m_pos); /* the next call resumes from here */
+        return 0;
+      }
+    }
+  }
+  /* The loop ended because has_more_thread became false. */
+  return HA_ERR_END_OF_FILE;
+}
+/** Read the row at saved position 'pos'; HA_ERR_RECORD_DELETED if thread or class is not found. */
+int
+table_ets_by_thread_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_thread *thread;
+  PFS_transaction_class *transaction_class;
+
+  set_position(pos);
+
+  thread= global_thread_container.get(m_pos.m_index_1);
+  if (thread != NULL)
+  {
+    transaction_class= find_transaction_class(m_pos.m_index_2);
+    if (transaction_class)
+    {
+      make_row(thread, transaction_class);
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+/** Build m_row for (thread, klass) under an optimistic lock; m_row_exists is set only on success. */
+void table_ets_by_thread_by_event_name
+::make_row(PFS_thread *thread, PFS_transaction_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+
+  m_row.m_event_name.make_row(klass);
+  /* Collect the statistics of this thread for klass through the visitor. */
+  PFS_connection_transaction_visitor visitor(klass);
+  PFS_connection_iterator::visit_thread(thread, &visitor);
+  /* A failed lock check leaves m_row_exists false, so the row is reported as deleted. */
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+}
+/** Copy the current row into the requested fields; HA_ERR_RECORD_DELETED if no row was built. */
+int table_ets_by_thread_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* No null bits to set: every column in the DDL is 'not null', so no null byte is expected. */
+  assert(table->s->null_bytes == 0);
+  /* fields is walked until its terminating NULL entry. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulonglong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default:
+        /**
+          COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+          COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+          COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY
+        */
+        m_row.m_stat.set_field(f->field_index-2, f); /* statistics start at field index 2 */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ets_by_thread_by_event_name.h b/storage/perfschema/table_ets_by_thread_by_event_name.h
new file mode 100644
index 00000000000..dbfe1fae016
--- /dev/null
+++ b/storage/perfschema/table_ets_by_thread_by_event_name.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_ETS_BY_THREAD_BY_EVENT_NAME_H
+#define TABLE_ETS_BY_THREAD_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ets_by_thread_by_event_name.h
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+*/
+struct row_ets_by_thread_by_event_name
+{
+  /** Column THREAD_ID (copied from PFS_thread::m_thread_internal_id by make_row()). */
+  ulonglong m_thread_internal_id;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /**
+    Columns COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+    COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+    COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY (15 columns in total).
+  */
+  PFS_transaction_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+  Index 1 on thread (0 based).
+  Index 2 on transaction class (1 based).
+*/
+struct pos_ets_by_thread_by_event_name
+: public PFS_double_index, public PFS_instrument_view_constants
+{
+  pos_ets_by_thread_by_event_name()
+    : PFS_double_index(0, 1) /* first thread, first transaction class */
+  {}
+  /** Rewind to the first thread (index 0) and the first transaction class (index 1). */
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+  /** Step to the next thread and restart at the first transaction class. */
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+  /** Step to the next transaction class. NOTE(review): not called by the matching .cc - confirm. */
+  inline void next_transaction(void)
+  {
+    m_index_2++;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
+class table_ets_by_thread_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  /* Protected constructor; create() is the caller in the matching .cc file. */
+  table_ets_by_thread_by_event_name();
+
+public:
+  ~table_ets_by_thread_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_thread *thread, PFS_transaction_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): not defined in the matching .cc; looks unused - confirm. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ets_by_thread_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ets_by_thread_by_event_name m_pos;
+  /** Next position. */
+  pos_ets_by_thread_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ets_by_user_by_event_name.cc b/storage/perfschema/table_ets_by_user_by_event_name.cc
new file mode 100644
index 00000000000..6b54175b6cc
--- /dev/null
+++ b/storage/perfschema/table_ets_by_user_by_event_name.cc
@@ -0,0 +1,225 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_ets_by_user_by_event_name.cc
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ets_by_user_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+/** Definition of the static table lock; m_share stores its address. */
+THR_LOCK table_ets_by_user_by_event_name::m_table_lock;
+/** Share for this table: name, ACL, callbacks, position size, lock and CREATE TABLE text. */
+PFS_engine_table_share
+table_ets_by_user_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_summary_by_user_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ets_by_user_by_event_name::create,
+  NULL, /* write_row */
+  table_ets_by_user_by_event_name::delete_all_rows,
+  table_ets_by_user_by_event_name::get_row_count,
+  sizeof(pos_ets_by_user_by_event_name), /* size of the cursor position structure */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_summary_by_user_by_event_name("
+  "USER CHAR(32) collate utf8_bin default null comment 'User for which summary is generated.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name for which summary is generated.',"
+  "COUNT_STAR BIGINT unsigned not null comment 'The number of summarized events. This value includes all events, whether timed or nontimed.',"
+  "SUM_TIMER_WAIT BIGINT unsigned not null comment 'The total wait time of the summarized timed events. This value is calculated only for timed events because nontimed events have a wait time of NULL. The same is true for the other xxx_TIMER_WAIT values.',"
+  "MIN_TIMER_WAIT BIGINT unsigned not null comment 'The minimum wait time of the summarized timed events.',"
+  "AVG_TIMER_WAIT BIGINT unsigned not null comment 'The average wait time of the summarized timed events.',"
+  "MAX_TIMER_WAIT BIGINT unsigned not null comment 'The maximum wait time of the summarized timed events.',"
+  "COUNT_READ_WRITE BIGINT unsigned not null comment 'The total number of only READ/WRITE transaction events.',"
+  "SUM_TIMER_READ_WRITE BIGINT unsigned not null comment 'The total wait time of only READ/WRITE transaction events.',"
+  "MIN_TIMER_READ_WRITE BIGINT unsigned not null comment 'The minimum wait time of only READ/WRITE transaction events.',"
+  "AVG_TIMER_READ_WRITE BIGINT unsigned not null comment 'The average wait time of only READ/WRITE transaction events.',"
+  "MAX_TIMER_READ_WRITE BIGINT unsigned not null comment 'The maximum wait time of only READ/WRITE transaction events.',"
+  "COUNT_READ_ONLY BIGINT unsigned not null comment 'The total number of only READ ONLY transaction events.',"
+  "SUM_TIMER_READ_ONLY BIGINT unsigned not null comment 'The total wait time of only READ ONLY transaction events.',"
+  "MIN_TIMER_READ_ONLY BIGINT unsigned not null comment 'The minimum wait time of only READ ONLY transaction events.',"
+  "AVG_TIMER_READ_ONLY BIGINT unsigned not null comment 'The average wait time of only READ ONLY transaction events.',"
+  "MAX_TIMER_READ_ONLY BIGINT unsigned not null comment 'The maximum wait time of only READ ONLY transaction events.')")},
+  false  /* perpetual */
+};
+/** Create callback listed in m_share: returns a newly allocated cursor object. */
+PFS_engine_table*
+table_ets_by_user_by_event_name::create(void)
+{
+  return new table_ets_by_user_by_event_name();
+}
+/** delete_all_rows callback: runs the by-thread, by-account and by-user resets. */
+int
+table_ets_by_user_by_event_name::delete_all_rows(void)
+{
+  reset_events_transactions_by_thread();
+  reset_events_transactions_by_account();
+  reset_events_transactions_by_user();
+  return 0; /* always reports success */
+}
+/** get_row_count callback: user container row count times transaction_class_max. */
+ha_rows
+table_ets_by_user_by_event_name::get_row_count(void)
+{
+  return global_user_container.get_row_count() * transaction_class_max;
+}
+/** Constructor: hands m_share and m_pos to the base class; no row exists yet. */
+table_ets_by_user_by_event_name::table_ets_by_user_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+/** Rewind both the current and the next position by calling reset() on each. */
+void table_ets_by_user_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+/** Store the time normalizer for transaction_timer in m_normalizer; 'scan' is unused. */
+int table_ets_by_user_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(transaction_timer);
+  return 0;
+}
+/** Advance to the next (user, transaction class) pair; HA_ERR_END_OF_FILE when none is left. */
+int table_ets_by_user_by_event_name::rnd_next(void)
+{
+  PFS_user *user;
+  PFS_transaction_class *transaction_class;
+  bool has_more_user= true;
+  /* Resume at m_next_pos; each iteration ends with next_user() on m_pos. */
+  for (m_pos.set_at(&m_next_pos);
+       has_more_user;
+       m_pos.next_user())
+  {
+    user= global_user_container.get(m_pos.m_index_1, & has_more_user);
+    if (user != NULL)
+    {
+      transaction_class= find_transaction_class(m_pos.m_index_2);
+      if (transaction_class)
+      {
+        make_row(user, transaction_class); /* may leave m_row_exists false; 0 is still returned */
+        m_next_pos.set_after(&m_pos); /* the next call resumes from here */
+        return 0;
+      }
+    }
+  }
+  /* The loop ended because has_more_user became false. */
+  return HA_ERR_END_OF_FILE;
+}
+/** Read the row at saved position 'pos'; HA_ERR_RECORD_DELETED if user or class is not found. */
+int
+table_ets_by_user_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_user *user;
+  PFS_transaction_class *transaction_class;
+
+  set_position(pos);
+
+  user= global_user_container.get(m_pos.m_index_1);
+  if (user != NULL)
+  {
+    transaction_class= find_transaction_class(m_pos.m_index_2);
+    if (transaction_class)
+    {
+      make_row(user, transaction_class);
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+/** Build m_row for (user, klass) under an optimistic lock; m_row_exists is set only on success. */
+void table_ets_by_user_by_event_name
+::make_row(PFS_user *user, PFS_transaction_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+
+  user->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_user.make_row(user))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+  /* Collect the statistics for klass through the visitor (accounts and threads, not THDs). */
+  PFS_connection_transaction_visitor visitor(klass);
+  PFS_connection_iterator::visit_user(user,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
+  /* A failed lock check leaves m_row_exists false (presumably the user changed meanwhile). */
+  if (! user->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+/** Copy the current row into the requested fields; HA_ERR_RECORD_DELETED if no row was built. */
+int table_ets_by_user_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits: one null byte is expected (USER is the only 'default null' column). */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* fields is walked until its terminating NULL entry. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+        m_row.m_user.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default:
+        /**
+          COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+          COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+          COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY
+        */
+        m_row.m_stat.set_field(f->field_index-2, f); /* statistics start at field index 2 */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ets_by_user_by_event_name.h b/storage/perfschema/table_ets_by_user_by_event_name.h
new file mode 100644
index 00000000000..62721ce9adf
--- /dev/null
+++ b/storage/perfschema/table_ets_by_user_by_event_name.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_ETS_BY_USER_BY_EVENT_NAME_H
+#define TABLE_ETS_BY_USER_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ets_by_user_by_event_name.h
+  Table EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME.
+*/
+struct row_ets_by_user_by_event_name
+{
+  /** Column USER. */
+  PFS_user_row m_user;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /**
+    Columns COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+    COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+    COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY.
+  */
+  PFS_transaction_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME.
+  Index 1 selects the user (0 based).
+  Index 2 selects the transaction class (1 based).
+*/
+struct pos_ets_by_user_by_event_name
+: public PFS_double_index
+{
+  pos_ets_by_user_by_event_name()
+    : PFS_double_index(0, 1) /* Start at user 0, transaction class 1. */
+  {}
+
+  inline void reset(void) /* Return to the initial position (user 0, transaction class 1). */
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline void next_user(void) /* Move to the next user and restart at transaction class 1. */
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_BY_USER_BY_EVENT_NAME. */
+class table_ets_by_user_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table, /* Copies m_row into the requested fields; HA_ERR_RECORD_DELETED when no row exists. */
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_ets_by_user_by_event_name(); /* Not public; presumably instances are obtained through create() (confirm). */
+
+public:
+  ~table_ets_by_user_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_user *user, PFS_transaction_class *klass); /* Fills m_row and sets m_row_exists. */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ets_by_user_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ets_by_user_by_event_name m_pos;
+  /** Next position. */
+  pos_ets_by_user_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ets_global_by_event_name.cc b/storage/perfschema/table_ets_global_by_event_name.cc
new file mode 100644
index 00000000000..e08ff15075d
--- /dev/null
+++ b/storage/perfschema/table_ets_global_by_event_name.cc
@@ -0,0 +1,200 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_ets_global_by_event_name.cc
+  Table EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ets_global_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
+#include "field.h"
+
+THR_LOCK table_ets_global_by_event_name::m_table_lock;
+
+PFS_engine_table_share
+table_ets_global_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_summary_global_by_event_name") }, /* table name */
+  &pfs_truncatable_acl,
+  table_ets_global_by_event_name::create,
+  NULL, /* write_row */
+  table_ets_global_by_event_name::delete_all_rows, /* delete_all_rows */
+  table_ets_global_by_event_name::get_row_count, /* get_row_count */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_summary_global_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name for which summary is generated.',"
+  "COUNT_STAR BIGINT unsigned not null comment 'The number of summarized events. This value includes all events, whether timed or nontimed.',"
+  "SUM_TIMER_WAIT BIGINT unsigned not null comment 'The total wait time of the summarized timed events. This value is calculated only for timed events because nontimed events have a wait time of NULL. The same is true for the other xxx_TIMER_WAIT values.',"
+  "MIN_TIMER_WAIT BIGINT unsigned not null comment 'The minimum wait time of the summarized timed events.',"
+  "AVG_TIMER_WAIT BIGINT unsigned not null comment 'The average wait time of the summarized timed events.',"
+  "MAX_TIMER_WAIT BIGINT unsigned not null comment 'The maximum wait time of the summarized timed events.',"
+  "COUNT_READ_WRITE BIGINT unsigned not null comment 'The total number of only READ/WRITE transaction events.',"
+  "SUM_TIMER_READ_WRITE BIGINT unsigned not null comment 'The total wait time of only READ/WRITE transaction events.',"
+  "MIN_TIMER_READ_WRITE BIGINT unsigned not null comment 'The minimum wait time of only READ/WRITE transaction events.',"
+  "AVG_TIMER_READ_WRITE BIGINT unsigned not null comment 'The average wait time of only READ/WRITE transaction events.',"
+  "MAX_TIMER_READ_WRITE BIGINT unsigned not null comment 'The maximum wait time of only READ/WRITE transaction events.',"
+  "COUNT_READ_ONLY BIGINT unsigned not null comment 'The total number of only READ ONLY transaction events.',"
+  "SUM_TIMER_READ_ONLY BIGINT unsigned not null comment 'The total wait time of only READ ONLY transaction events.',"
+  "MIN_TIMER_READ_ONLY BIGINT unsigned not null comment 'The minimum wait time of only READ ONLY transaction events.',"
+  "AVG_TIMER_READ_ONLY BIGINT unsigned not null comment 'The average wait time of only READ ONLY transaction events.',"
+  "MAX_TIMER_READ_ONLY BIGINT unsigned not null comment 'The maximum wait time of only READ ONLY transaction events.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_ets_global_by_event_name::create(void)
+{ /* Factory registered in m_share. */
+  return new table_ets_global_by_event_name(); /* Allocated with new; presumably released by the caller (confirm). */
+}
+
+int
+table_ets_global_by_event_name::delete_all_rows(void)
+{ /* Registered in m_share as the delete_all_rows handler; always returns 0. */
+  reset_events_transactions_by_thread(); /* The five calls cover thread, account, user, host and global statistics. */
+  reset_events_transactions_by_account();
+  reset_events_transactions_by_user();
+  reset_events_transactions_by_host();
+  reset_events_transactions_global();
+  return 0;
+}
+
+ha_rows
+table_ets_global_by_event_name::get_row_count(void)
+{ /* Row-count callback registered in m_share. */
+  return transaction_class_max; /* Presumably the configured maximum number of transaction classes, i.e. an upper bound (confirm). */
+}
+
+table_ets_global_by_event_name::table_ets_global_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos), /* The base class is handed the share and the address of the current position. */
+    m_row_exists(false), m_pos(1), m_next_pos(1) /* Both positions start at 1, the same value reset_position() restores. */
+{}
+
+void table_ets_global_by_event_name::reset_position(void)
+{ /* Rewind the cursor to its initial state. */
+  m_pos= 1; /* 1 is also the constructor's starting index. */
+  m_next_pos= 1;
+}
+
+int table_ets_global_by_event_name::rnd_init(bool scan)
+{ /* Prepare a scan; 'scan' itself is not used. Always returns 0. */
+  m_normalizer= time_normalizer::get(transaction_timer); /* Later passed to m_row.m_stat.set() in make_row(). */
+  return 0;
+}
+
+int table_ets_global_by_event_name::rnd_next(void)
+{ /* Produce the row at m_next_pos; returns 0 on success or HA_ERR_END_OF_FILE. */
+  PFS_transaction_class *transaction_class;
+
+  m_pos.set_at(&m_next_pos); /* Continue from the position recorded by the previous successful call. */
+
+  transaction_class= find_transaction_class(m_pos.m_index);
+  if (transaction_class)
+  {
+    make_row(transaction_class);
+    m_next_pos.set_after(&m_pos); /* The following call starts just after this row. */
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE; /* No transaction class at this index: the scan ends here. */
+}
+
+int
+table_ets_global_by_event_name::rnd_pos(const void *pos)
+{ /* Produce the row at a saved position; returns 0 on success or HA_ERR_RECORD_DELETED. */
+  PFS_transaction_class *transaction_class;
+
+  set_position(pos); /* m_pos is read right after this call; presumably it is loaded from 'pos' (confirm). */
+
+  transaction_class=find_transaction_class(m_pos.m_index);
+  if (transaction_class)
+  {
+    make_row(transaction_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED; /* No transaction class was found for the saved index. */
+}
+
+
+void table_ets_global_by_event_name
+::make_row(PFS_transaction_class *klass)
+{ /* Build m_row for one transaction class; always ends with m_row_exists set to true. */
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_transaction_visitor visitor(klass);
+  PFS_connection_iterator::visit_global(true,  /* hosts */
+                                        false, /* users */
+                                        true,  /* accounts */
+                                        true,  /* threads */
+                                        false, /* THDs */
+                                        & visitor);
+
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat); /* Fill the statistics columns from visitor.m_stat, passing m_normalizer. */
+  m_row_exists= true;
+}
+
+int table_ets_global_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{ /* Copy the row built by make_row() into 'fields'; the row buffer argument is unnamed because it is not used. */
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* No null bits to set: the table definition declares every column NOT NULL */
+  assert(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++) /* The field array is walked until its NULL terminator. */
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index)) /* Skip columns that are not in the read set unless read_all is set. */
+    {
+      switch(f->field_index)
+      {
+      case 0: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default:
+        /**
+          Columns COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+          COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+          COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY
+        */
+        m_row.m_stat.set_field(f->field_index - 1, f); /* -1 skips the EVENT_NAME column. */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ets_global_by_event_name.h b/storage/perfschema/table_ets_global_by_event_name.h
new file mode 100644
index 00000000000..717a737f93c
--- /dev/null
+++ b/storage/perfschema/table_ets_global_by_event_name.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_ETS_GLOBAL_BY_EVENT_NAME_H
+#define TABLE_ETS_GLOBAL_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ets_global_by_event_name.h
+  Table EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME.
+*/
+struct row_ets_global_by_event_name
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /**
+    Columns COUNT_STAR, SUM/MIN/AVG/MAX_TIMER_WAIT,
+    COUNT_READ_WRITE, SUM/MIN/AVG/MAX_TIMER_READ_WRITE,
+    COUNT_READ_ONLY, SUM/MIN/AVG/MAX_TIMER_READ_ONLY.
+  */
+  PFS_transaction_stat_row m_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_SUMMARY_GLOBAL_BY_EVENT_NAME. */
+class table_ets_global_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create(); /* Returns a newly allocated table_ets_global_by_event_name. */
+  static int delete_all_rows(); /* Calls the transaction statistics reset helpers; always returns 0. */
+  static ha_rows get_row_count(); /* Returns transaction_class_max. */
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table, /* Copies m_row into the requested fields; HA_ERR_RECORD_DELETED when no row exists. */
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_ets_global_by_event_name(); /* Not public; create() is the factory that invokes it. */
+
+public:
+  ~table_ets_global_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_transaction_class *klass); /* Fills m_row and sets m_row_exists to true. */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): no definition appears in table_ets_global_by_event_name.cc; confirm it is unused. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ets_global_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_events_stages.cc b/storage/perfschema/table_events_stages.cc
index 12b8ed31273..66887d39e5a 100644
--- a/storage/perfschema/table_events_stages.cc
+++ b/storage/perfschema/table_events_stages.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_events_stages.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_events_stages.h"
 #include "pfs_timer.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_events_stages_current::m_table_lock;
 
@@ -40,11 +42,10 @@ table_events_stages_current::m_share=
 {
   { C_STRING_WITH_LEN("events_stages_current") },
   &pfs_truncatable_acl,
-  &table_events_stages_current::create,
+  table_events_stages_current::create,
   NULL, /* write_row */
-  &table_events_stages_current::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_stages_current::delete_all_rows,
+  table_events_stages_current::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_current("
@@ -56,8 +57,11 @@ table_events_stages_current::m_share=
                       "TIMER_START BIGINT unsigned comment 'Value in picoseconds when the event timing started or NULL if timing is not collected.',"
                       "TIMER_END BIGINT unsigned comment 'Value in picoseconds when the event timing ended, or NULL if the event has not ended or timing is not collected.',"
                       "TIMER_WAIT BIGINT unsigned comment 'Value in picoseconds of the event''s duration or NULL if the event has not ended or timing is not collected.',"
+                      "WORK_COMPLETED BIGINT unsigned comment 'The number of work units completed for the stage. NULL if the stage event progress is not instrumented.',"
+                      "WORK_ESTIMATED BIGINT unsigned comment 'The number of work units expected for the stage. NULL if the stage event progress is not instrumented.',"
                       "NESTING_EVENT_ID BIGINT unsigned comment 'EVENT_ID of event within which this event nests.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.')") }
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either transaction, statement, stage or wait.')") },
+  false  /* perpetual */
 };
 
 THR_LOCK table_events_stages_history::m_table_lock;
@@ -67,11 +71,10 @@ table_events_stages_history::m_share=
 {
   { C_STRING_WITH_LEN("events_stages_history") },
   &pfs_truncatable_acl,
-  &table_events_stages_history::create,
+  table_events_stages_history::create,
   NULL, /* write_row */
-  &table_events_stages_history::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_stages_history::delete_all_rows,
+  table_events_stages_history::get_row_count,
   sizeof(pos_events_stages_history), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_history("
@@ -83,8 +86,11 @@ table_events_stages_history::m_share=
                       "TIMER_START BIGINT unsigned comment 'Value in picoseconds when the event timing started or NULL if timing is not collected.',"
                       "TIMER_END BIGINT unsigned comment 'Value in picoseconds when the event timing ended, or NULL if the event has not ended or timing is not collected.',"
                       "TIMER_WAIT BIGINT unsigned comment 'Value in picoseconds of the event''s duration or NULL if the event has not ended or timing is not collected.',"
+                      "WORK_COMPLETED BIGINT unsigned comment 'The number of work units completed for the stage. NULL if the stage event progress is not instrumented.',"
+                      "WORK_ESTIMATED BIGINT unsigned comment 'The number of work units expected for the stage. NULL if the stage event progress is not instrumented.',"
                       "NESTING_EVENT_ID BIGINT unsigned comment 'EVENT_ID of event within which this event nests.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.')") }
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either transaction, statement, stage or wait.')") },
+  false  /* perpetual */
 };
 
 THR_LOCK table_events_stages_history_long::m_table_lock;
@@ -94,11 +100,10 @@ table_events_stages_history_long::m_share=
 {
   { C_STRING_WITH_LEN("events_stages_history_long") },
   &pfs_truncatable_acl,
-  &table_events_stages_history_long::create,
+  table_events_stages_history_long::create,
   NULL, /* write_row */
-  &table_events_stages_history_long::delete_all_rows,
-  NULL, /* get_row_count */
-  10000, /* records */
+  table_events_stages_history_long::delete_all_rows,
+  table_events_stages_history_long::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_stages_history_long("
@@ -110,8 +115,11 @@ table_events_stages_history_long::m_share=
                       "TIMER_START BIGINT unsigned comment 'Value in picoseconds when the event timing started or NULL if timing is not collected.',"
                       "TIMER_END BIGINT unsigned comment 'Value in picoseconds when the event timing ended, or NULL if the event has not ended or timing is not collected.',"
                       "TIMER_WAIT BIGINT unsigned comment 'Value in picoseconds of the event''s duration or NULL if the event has not ended or timing is not collected.',"
+                      "WORK_COMPLETED BIGINT unsigned comment 'The number of work units completed for the stage. NULL if the stage event progress is not instrumented.',"
+                      "WORK_ESTIMATED BIGINT unsigned comment 'The number of work units expected for the stage. NULL if the stage event progress is not instrumented.',"
                       "NESTING_EVENT_ID BIGINT unsigned comment 'EVENT_ID of event within which this event nests.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.')") }
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either transaction, statement, stage or wait.')") },
+  false  /* perpetual */
 };
 
 table_events_stages_common::table_events_stages_common
@@ -126,8 +134,6 @@ table_events_stages_common::table_events_stages_common
 */
 void table_events_stages_common::make_row(PFS_events_stages *stage)
 {
-  const char *base;
-  const char *safe_source_file;
   ulonglong timer_end;
 
   m_row_exists= false;
@@ -158,15 +164,19 @@ void table_events_stages_common::make_row(PFS_events_stages *stage)
   m_row.m_name= klass->m_name;
   m_row.m_name_length= klass->m_name_length;
 
-  safe_source_file= stage->m_source_file;
-  if (unlikely(safe_source_file == NULL))
-    return;
+  /* Disable source file and line to avoid stale __FILE__ pointers. */
+  m_row.m_source_length= 0;
 
-  base= base_name(safe_source_file);
-  m_row.m_source_length= (uint)my_snprintf(m_row.m_source, sizeof(m_row.m_source),
-                                     "%s:%d", base, stage->m_source_line);
-  if (m_row.m_source_length > sizeof(m_row.m_source))
-    m_row.m_source_length= sizeof(m_row.m_source);
+  if (klass->is_progress())
+  {
+    m_row.m_progress= true;
+    m_row.m_work_completed= stage->m_progress.m_work_completed;
+    m_row.m_work_estimated= stage->m_progress.m_work_estimated;
+  }
+  else
+  {
+    m_row.m_progress= false;
+  }
 
   m_row_exists= true;
   return;
@@ -183,8 +193,9 @@ int table_events_stages_common::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 2);
   buf[0]= 0;
+  buf[1]= 0;
 
   for (; (f= *fields) ; fields++)
   {
@@ -228,20 +239,32 @@ int table_events_stages_common::read_row_values(TABLE *table,
         else
           f->set_null();
         break;
-      case 8: /* NESTING_EVENT_ID */
+      case 8: /* WORK_COMPLETED */
+        if (m_row.m_progress)
+          set_field_ulonglong(f, m_row.m_work_completed);
+        else
+          f->set_null();
+        break;
+      case 9: /* WORK_ESTIMATED */
+        if (m_row.m_progress)
+          set_field_ulonglong(f, m_row.m_work_estimated);
+        else
+          f->set_null();
+        break;
+      case 10: /* NESTING_EVENT_ID */
         if (m_row.m_nesting_event_id != 0)
           set_field_ulonglong(f, m_row.m_nesting_event_id);
         else
           f->set_null();
         break;
-      case 9: /* NESTING_EVENT_TYPE */
+      case 11: /* NESTING_EVENT_TYPE */
         if (m_row.m_nesting_event_id != 0)
           set_field_enum(f, m_row.m_nesting_event_type);
         else
           f->set_null();
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -275,20 +298,12 @@ int table_events_stages_current::rnd_next(void)
   PFS_thread *pfs_thread;
   PFS_events_stages *stage;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < thread_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_thread_iterator it= global_thread_container.iterate(m_pos.m_index);
+  pfs_thread= it.scan_next(& m_pos.m_index);
+  if (pfs_thread != NULL)
   {
-    pfs_thread= &thread_array[m_pos.m_index];
-
-    if (! pfs_thread->m_lock.is_populated())
-    {
-      /* This thread does not exist */
-      continue;
-    }
-
     stage= &pfs_thread->m_stage_current;
-
     make_row(stage);
     m_next_pos.set_after(&m_pos);
     return 0;
@@ -303,15 +318,16 @@ int table_events_stages_current::rnd_pos(const void *pos)
   PFS_events_stages *stage;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < thread_max);
-  pfs_thread= &thread_array[m_pos.m_index];
 
-  if (! pfs_thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  pfs_thread= global_thread_container.get(m_pos.m_index);
+  if (pfs_thread != NULL)
+  {
+    stage= &pfs_thread->m_stage_current;
+    make_row(stage);
+    return 0;
+  }
 
-  stage= &pfs_thread->m_stage_current;
-  make_row(stage);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 int table_events_stages_current::delete_all_rows(void)
@@ -320,6 +336,12 @@ int table_events_stages_current::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_events_stages_current::get_row_count(void)
+{ /* Row-count callback referenced from m_share. */
+  return global_thread_container.get_row_count(); /* The thread container's row count is returned unchanged. */
+}
+
 PFS_engine_table* table_events_stages_history::create(void)
 {
   return new table_events_stages_history();
@@ -346,43 +368,40 @@ int table_events_stages_history::rnd_next(void)
 {
   PFS_thread *pfs_thread;
   PFS_events_stages *stage;
+  bool has_more_thread= true;
 
   if (events_stages_history_per_thread == 0)
     return HA_ERR_END_OF_FILE;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index_1 < thread_max;
+       has_more_thread;
        m_pos.next_thread())
   {
-    pfs_thread= &thread_array[m_pos.m_index_1];
-
-    if (! pfs_thread->m_lock.is_populated())
-    {
-      /* This thread does not exist */
-      continue;
-    }
-
-    if (m_pos.m_index_2 >= events_stages_history_per_thread)
+    pfs_thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (pfs_thread != NULL)
     {
-      /* This thread does not have more (full) history */
-      continue;
-    }
+      if (m_pos.m_index_2 >= events_stages_history_per_thread)
+      {
+        /* This thread does not have more (full) history */
+        continue;
+      }
 
-    if ( ! pfs_thread->m_stages_history_full &&
-        (m_pos.m_index_2 >= pfs_thread->m_stages_history_index))
-    {
-      /* This thread does not have more (not full) history */
-      continue;
-    }
+      if ( ! pfs_thread->m_stages_history_full &&
+          (m_pos.m_index_2 >= pfs_thread->m_stages_history_index))
+      {
+        /* This thread does not have more (not full) history */
+        continue;
+      }
 
-    stage= &pfs_thread->m_stages_history[m_pos.m_index_2];
+      stage= &pfs_thread->m_stages_history[m_pos.m_index_2];
 
-    if (stage->m_class != NULL)
-    {
-      make_row(stage);
-      /* Next iteration, look for the next history in this thread */
-      m_next_pos.set_after(&m_pos);
-      return 0;
+      if (stage->m_class != NULL)
+      {
+        make_row(stage);
+        /* Next iteration, look for the next history in this thread */
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
     }
   }
 
@@ -394,27 +413,28 @@ int table_events_stages_history::rnd_pos(const void *pos)
   PFS_thread *pfs_thread;
   PFS_events_stages *stage;
 
-  DBUG_ASSERT(events_stages_history_per_thread != 0);
+  assert(events_stages_history_per_thread != 0);
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-  pfs_thread= &thread_array[m_pos.m_index_1];
 
-  if (! pfs_thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  assert(m_pos.m_index_2 < events_stages_history_per_thread);
 
-  DBUG_ASSERT(m_pos.m_index_2 < events_stages_history_per_thread);
-
-  if ( ! pfs_thread->m_stages_history_full &&
-      (m_pos.m_index_2 >= pfs_thread->m_stages_history_index))
-    return HA_ERR_RECORD_DELETED;
+  pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread != NULL)
+  {
+    if ( ! pfs_thread->m_stages_history_full &&
+        (m_pos.m_index_2 >= pfs_thread->m_stages_history_index))
+      return HA_ERR_RECORD_DELETED;
 
-  stage= &pfs_thread->m_stages_history[m_pos.m_index_2];
+    stage= &pfs_thread->m_stages_history[m_pos.m_index_2];
 
-  if (stage->m_class == NULL)
-    return HA_ERR_RECORD_DELETED;
+    if (stage->m_class != NULL)
+    {
+      make_row(stage);
+      return 0;
+    }
+  }
 
-  make_row(stage);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 int table_events_stages_history::delete_all_rows(void)
@@ -423,6 +443,12 @@ int table_events_stages_history::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_events_stages_history::get_row_count(void)
+{ /* Row-count callback referenced from m_share. */
+  return events_stages_history_per_thread * global_thread_container.get_row_count(); /* History entries per thread times the thread container's row count. */
+}
+
 PFS_engine_table* table_events_stages_history_long::create(void)
 {
   return new table_events_stages_history_long();
@@ -456,7 +482,7 @@ int table_events_stages_history_long::rnd_next(void)
   if (events_stages_history_long_full)
     limit= events_stages_history_long_size;
   else
-    limit= events_stages_history_long_index % events_stages_history_long_size;
+    limit= events_stages_history_long_index.m_u32 % events_stages_history_long_size;
 
   for (m_pos.set_at(&m_next_pos); m_pos.m_index < limit; m_pos.next())
   {
@@ -487,7 +513,7 @@ int table_events_stages_history_long::rnd_pos(const void *pos)
   if (events_stages_history_long_full)
     limit= events_stages_history_long_size;
   else
-    limit= events_stages_history_long_index % events_stages_history_long_size;
+    limit= events_stages_history_long_index.m_u32 % events_stages_history_long_size;
 
   if (m_pos.m_index > limit)
     return HA_ERR_RECORD_DELETED;
@@ -507,3 +533,9 @@ int table_events_stages_history_long::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_events_stages_history_long::get_row_count(void)
+{ /* Row-count callback referenced from m_share. */
+  return events_stages_history_long_size; /* The same size rnd_next()/rnd_pos() use as the limit when the long history is full. */
+}
+
diff --git a/storage/perfschema/table_events_stages.h b/storage/perfschema/table_events_stages.h
index ae8760cd953..2fd760cfae3 100644
--- a/storage/perfschema/table_events_stages.h
+++ b/storage/perfschema/table_events_stages.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -66,6 +66,11 @@ struct row_events_stages
   char m_source[COL_SOURCE_SIZE];
   /** Length in bytes of @c m_source. */
   uint m_source_length;
+  bool m_progress;
+  /** Column WORK_COMPLETED. */
+  ulonglong m_work_completed;
+  /** Column WORK_ESTIMATED. */
+  ulonglong m_work_estimated;
 };
 
 /** Position of a cursor on PERFORMANCE_SCHEMA.EVENTS_STAGES_HISTORY. */
@@ -121,6 +126,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
@@ -155,6 +161,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
@@ -186,6 +193,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_events_statements.cc b/storage/perfschema/table_events_statements.cc
index 86403c9f3e9..5e30f39c785 100644
--- a/storage/perfschema/table_events_statements.cc
+++ b/storage/perfschema/table_events_statements.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_events_statements.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
@@ -35,6 +35,7 @@
 #include "sp_head.h" /* TYPE_ENUM_FUNCTION, ... */
 #include "table_helper.h"
 #include "my_md5.h"
+#include "pfs_buffer_container.h"
 
 THR_LOCK table_events_statements_current::m_table_lock;
 
@@ -43,11 +44,10 @@ table_events_statements_current::m_share=
 {
   { C_STRING_WITH_LEN("events_statements_current") },
   &pfs_truncatable_acl,
-  &table_events_statements_current::create,
+  table_events_statements_current::create,
   NULL, /* write_row */
-  &table_events_statements_current::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_statements_current::delete_all_rows,
+  table_events_statements_current::get_row_count,
   sizeof(pos_events_statements_current), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_current("
@@ -64,9 +64,9 @@ table_events_statements_current::m_share=
                       "DIGEST VARCHAR(32) comment 'Statement digest.',"
                       "DIGEST_TEXT LONGTEXT comment 'Statement digest text.',"
                       "CURRENT_SCHEMA VARCHAR(64) comment 'Statement''s default database for the statement, or NULL if there was none.',"
-                      "OBJECT_TYPE VARCHAR(64) comment 'Reserved, currently NULL',"
-                      "OBJECT_SCHEMA VARCHAR(64) comment 'Reserved, currently NULL',"
-                      "OBJECT_NAME VARCHAR(64) comment 'Reserved, currently NULL',"
+                      "OBJECT_TYPE VARCHAR(64) comment 'NULL for top level statements. The parent statement object type for nested statements (stored programs).',"
+                      "OBJECT_SCHEMA VARCHAR(64) comment 'NULL for top level statements. The parent statement object schema for nested statements (stored programs).',"
+                      "OBJECT_NAME VARCHAR(64) comment 'NULL for top level statements. The parent statement object name for nested statements (stored programs).',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned comment 'Address in memory of the statement object.',"
                       "MYSQL_ERRNO INTEGER comment 'Error code. See MariaDB Error Codes for a full list.',"
                       "RETURNED_SQLSTATE VARCHAR(5) comment 'The SQLSTATE value.',"
@@ -89,8 +89,10 @@ table_events_statements_current::m_share=
                       "SORT_SCAN BIGINT unsigned not null comment 'Number of sorts performed by the statement which used a full table scan.',"
                       "NO_INDEX_USED BIGINT unsigned not null comment '0 if the statement performed a table scan with an index, 1 if without an index.',"
                       "NO_GOOD_INDEX_USED BIGINT unsigned not null comment '0 if a good index was found for the statement, 1 if no good index was found. See the Range checked for each record description in the EXPLAIN article.',"
-                      "NESTING_EVENT_ID BIGINT unsigned comment 'Reserved, currently NULL.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Reserved, currently NULL.')") }
+                      "NESTING_EVENT_ID BIGINT unsigned comment 'NULL for top level statements. The parent statement event id for nested statements (stored programs).',"
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'NULL for top level statements. The parent statement event type for nested statements (stored programs).',"
+                      "NESTING_EVENT_LEVEL INT comment '0 for top level statements. The parent statement level plus 1 for nested statements (stored programs).')") },
+  false  /* perpetual */
 };
 
 THR_LOCK table_events_statements_history::m_table_lock;
@@ -100,11 +102,10 @@ table_events_statements_history::m_share=
 {
   { C_STRING_WITH_LEN("events_statements_history") },
   &pfs_truncatable_acl,
-  &table_events_statements_history::create,
+  table_events_statements_history::create,
   NULL, /* write_row */
-  &table_events_statements_history::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_statements_history::delete_all_rows,
+  table_events_statements_history::get_row_count,
   sizeof(pos_events_statements_history), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_history("
@@ -121,9 +122,9 @@ table_events_statements_history::m_share=
                       "DIGEST VARCHAR(32) comment 'Statement digest.',"
                       "DIGEST_TEXT LONGTEXT comment 'Statement digest text.',"
                       "CURRENT_SCHEMA VARCHAR(64) comment 'Statement''s default database for the statement, or NULL if there was none.',"
-                      "OBJECT_TYPE VARCHAR(64) comment 'Reserved, currently NULL',"
-                      "OBJECT_SCHEMA VARCHAR(64) comment 'Reserved, currently NULL',"
-                      "OBJECT_NAME VARCHAR(64) comment 'Reserved, currently NULL',"
+                      "OBJECT_TYPE VARCHAR(64) comment 'NULL for top level statements. The parent statement object type for nested statements (stored programs).',"
+                      "OBJECT_SCHEMA VARCHAR(64) comment 'NULL for top level statements. The parent statement object schema for nested statements (stored programs).',"
+                      "OBJECT_NAME VARCHAR(64) comment 'NULL for top level statements. The parent statement object name for nested statements (stored programs).',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned comment 'Address in memory of the statement object.',"
                       "MYSQL_ERRNO INTEGER comment 'Error code. See MariaDB Error Codes for a full list.',"
                       "RETURNED_SQLSTATE VARCHAR(5) comment 'The SQLSTATE value.',"
@@ -146,8 +147,10 @@ table_events_statements_history::m_share=
                       "SORT_SCAN BIGINT unsigned not null comment 'Number of sorts performed by the statement which used a full table scan.',"
                       "NO_INDEX_USED BIGINT unsigned not null comment '0 if the statement performed a table scan with an index, 1 if without an index.',"
                       "NO_GOOD_INDEX_USED BIGINT unsigned not null comment '0 if a good index was found for the statement, 1 if no good index was found. See the Range checked for each record description in the EXPLAIN article.',"
-                      "NESTING_EVENT_ID BIGINT unsigned comment 'Reserved, currently NULL.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Reserved, currently NULL.')") }
+                      "NESTING_EVENT_ID BIGINT unsigned comment 'NULL for top level statements. The parent statement event id for nested statements (stored programs).',"
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'NULL for top level statements. The parent statement event type for nested statements (stored programs).',"
+                      "NESTING_EVENT_LEVEL INT comment '0 for top level statements. The parent statement level plus 1 for nested statements (stored programs).')") },
+  false  /* perpetual */
 };
 
 THR_LOCK table_events_statements_history_long::m_table_lock;
@@ -157,11 +160,10 @@ table_events_statements_history_long::m_share=
 {
   { C_STRING_WITH_LEN("events_statements_history_long") },
   &pfs_truncatable_acl,
-  &table_events_statements_history_long::create,
+  table_events_statements_history_long::create,
   NULL, /* write_row */
-  &table_events_statements_history_long::delete_all_rows,
-  NULL, /* get_row_count */
-  10000, /* records */
+  table_events_statements_history_long::delete_all_rows,
+  table_events_statements_history_long::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_statements_history_long("
@@ -178,9 +180,9 @@ table_events_statements_history_long::m_share=
                       "DIGEST VARCHAR(32) comment 'Statement digest.',"
                       "DIGEST_TEXT LONGTEXT comment 'Statement digest text.',"
                       "CURRENT_SCHEMA VARCHAR(64) comment 'Statement''s default database for the statement, or NULL if there was none.',"
-                      "OBJECT_TYPE VARCHAR(64) comment 'Reserved, currently NULL',"
-                      "OBJECT_SCHEMA VARCHAR(64) comment 'Reserved, currently NULL',"
-                      "OBJECT_NAME VARCHAR(64) comment 'Reserved, currently NULL',"
+                      "OBJECT_TYPE VARCHAR(64) comment 'NULL for top level statements. The parent statement object type for nested statements (stored programs).',"
+                      "OBJECT_SCHEMA VARCHAR(64) comment 'NULL for top level statements. The parent statement object schema for nested statements (stored programs).',"
+                      "OBJECT_NAME VARCHAR(64) comment 'NULL for top level statements. The parent statement object name for nested statements (stored programs).',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned comment 'Address in memory of the statement object.',"
                       "MYSQL_ERRNO INTEGER comment 'Error code. See MariaDB Error Codes for a full list.',"
                       "RETURNED_SQLSTATE VARCHAR(5) comment 'The SQLSTATE value.',"
@@ -203,8 +205,10 @@ table_events_statements_history_long::m_share=
                       "SORT_SCAN BIGINT unsigned not null comment 'Number of sorts performed by the statement which used a full table scan.',"
                       "NO_INDEX_USED BIGINT unsigned not null comment '0 if the statement performed a table scan with an index, 1 if without an index.',"
                       "NO_GOOD_INDEX_USED BIGINT unsigned not null comment '0 if a good index was found for the statement, 1 if no good index was found. See the Range checked for each record description in the EXPLAIN article.',"
-                      "NESTING_EVENT_ID BIGINT unsigned comment 'Reserved, currently NULL.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Reserved, currently NULL.')") }
+                      "NESTING_EVENT_ID BIGINT unsigned comment 'NULL for top level statements. The parent statement event id for nested statements (stored programs).',"
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'NULL for top level statements. The parent statement event type for nested statements (stored programs).',"
+                      "NESTING_EVENT_LEVEL INT comment '0 for top level statements. The parent statement level plus 1 for nested statements (stored programs).')") },
+  false  /* perpetual */
 };
 
 table_events_statements_common::table_events_statements_common
@@ -220,22 +224,22 @@ table_events_statements_common::table_events_statements_common
 void table_events_statements_common::make_row_part_1(PFS_events_statements *statement,
                                                      sql_digest_storage *digest)
 {
-  const char *base;
-  const char *safe_source_file;
   ulonglong timer_end;
 
   m_row_exists= false;
 
-  PFS_statement_class *unsafe= (PFS_statement_class*) statement->m_class;
+  PFS_statement_class *unsafe= (PFS_statement_class*)
+    statement->m_event.m_class;
   PFS_statement_class *klass= sanitize_statement_class(unsafe);
   if (unlikely(klass == NULL))
     return;
 
-  m_row.m_thread_internal_id= statement->m_thread_internal_id;
-  m_row.m_event_id= statement->m_event_id;
-  m_row.m_end_event_id= statement->m_end_event_id;
-  m_row.m_nesting_event_id= statement->m_nesting_event_id;
-  m_row.m_nesting_event_type= statement->m_nesting_event_type;
+  m_row.m_thread_internal_id= statement->m_event.m_thread_internal_id;
+  m_row.m_event_id= statement->m_event.m_event_id;
+  m_row.m_end_event_id= statement->m_event.m_end_event_id;
+  m_row.m_nesting_event_id= statement->m_event.m_nesting_event_id;
+  m_row.m_nesting_event_type= statement->m_event.m_nesting_event_type;
+  m_row.m_nesting_event_level= statement->m_event.m_nesting_event_level;
 
   if (m_row.m_end_event_id == 0)
   {
@@ -243,10 +247,10 @@ void table_events_statements_common::make_row_part_1(PFS_events_statements *stat
   }
   else
   {
-    timer_end= statement->m_timer_end;
+    timer_end= statement->m_event.m_timer_end;
   }
 
-  m_normalizer->to_pico(statement->m_timer_start, timer_end,
+  m_normalizer->to_pico(statement->m_event.m_timer_start, timer_end,
                       & m_row.m_timer_start, & m_row.m_timer_end, & m_row.m_timer_wait);
   m_row.m_lock_time= statement->m_lock_time * MICROSEC_TO_PICOSEC;
 
@@ -287,15 +291,18 @@ void table_events_statements_common::make_row_part_1(PFS_events_statements *stat
   if (m_row.m_current_schema_name_length > 0)
     memcpy(m_row.m_current_schema_name, statement->m_current_schema_name, m_row.m_current_schema_name_length);
 
-  safe_source_file= statement->m_source_file;
-  if (unlikely(safe_source_file == NULL))
-    return;
+  m_row.m_object_type= statement->m_sp_type;
+
+  m_row.m_schema_name_length= statement->m_schema_name_length;
+  if (m_row.m_schema_name_length > 0)
+    memcpy(m_row.m_schema_name, statement->m_schema_name, m_row.m_schema_name_length);
 
-  base= base_name(safe_source_file);
-  m_row.m_source_length= (uint)my_snprintf(m_row.m_source, sizeof(m_row.m_source),
-                                           "%s:%d", base, statement->m_source_line);
-  if (m_row.m_source_length > sizeof(m_row.m_source))
-    m_row.m_source_length= sizeof(m_row.m_source);
+  m_row.m_object_name_length= statement->m_object_name_length;
+  if (m_row.m_object_name_length > 0)
+    memcpy(m_row.m_object_name, statement->m_object_name, m_row.m_object_name_length);
+
+  /* Disable source file and line to avoid stale __FILE__ pointers. */
+  m_row.m_source_length= 0;
 
   memcpy(m_row.m_message_text, statement->m_message_text, sizeof(m_row.m_message_text));
   m_row.m_sql_errno= statement->m_sql_errno;
@@ -319,6 +326,7 @@ void table_events_statements_common::make_row_part_1(PFS_events_statements *stat
   m_row.m_sort_scan= statement->m_sort_scan;
   m_row.m_no_index_used= statement->m_no_index_used;
   m_row.m_no_good_index_used= statement->m_no_good_index_used;
+
   /*
     Making a copy of digest storage.
   */
@@ -328,7 +336,6 @@ void table_events_statements_common::make_row_part_1(PFS_events_statements *stat
   return;
 }
 
-
 void table_events_statements_common::make_row_part_2(const sql_digest_storage *digest)
 {
   /*
@@ -370,7 +377,7 @@ int table_events_statements_common::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 3);
+  assert(table->s->null_bytes == 3);
   buf[0]= 0;
   buf[1]= 0;
   buf[2]= 0;
@@ -445,18 +452,30 @@ int table_events_statements_common::read_row_values(TABLE *table,
         break;
       case 12: /* CURRENT_SCHEMA */
         if (m_row.m_current_schema_name_length)
-          set_field_varchar_utf8(f, m_row.m_current_schema_name, m_row.m_current_schema_name_length);
+          set_field_varchar_utf8(f, m_row.m_current_schema_name,
+                                 m_row.m_current_schema_name_length);
         else
           f->set_null();
         break;
-      case 13: /* OBJECT_TYPE */
-        f->set_null();
+     case 13: /* OBJECT_TYPE */
+        if (m_row.m_object_name_length > 0)
+          set_field_object_type(f, m_row.m_object_type);
+        else
+          f->set_null();
         break;
       case 14: /* OBJECT_SCHEMA */
-        f->set_null();
+        if (m_row.m_schema_name_length)
+          set_field_varchar_utf8(f, m_row.m_schema_name,
+                                 m_row.m_schema_name_length);
+        else
+          f->set_null();
         break;
       case 15: /* OBJECT_NAME */
-        f->set_null();
+        if (m_row.m_object_name_length)
+          set_field_varchar_utf8(f, m_row.m_object_name,
+                                 m_row.m_object_name_length);
+        else
+          f->set_null();
         break;
       case 16: /* OBJECT_INSTANCE_BEGIN */
         f->set_null();
@@ -471,7 +490,7 @@ int table_events_statements_common::read_row_values(TABLE *table,
           f->set_null();
         break;
       case 19: /* MESSAGE_TEXT */
-        len= (uint)strlen(m_row.m_message_text);
+        len= static_cast<uint>(strlen(m_row.m_message_text));
         if (len)
           set_field_varchar_utf8(f, m_row.m_message_text, len);
         else
@@ -543,8 +562,11 @@ int table_events_statements_common::read_row_values(TABLE *table,
         else
           f->set_null();
         break;
+      case 40: /* NESTING_EVENT_LEVEL */
+        set_field_ulong(f, m_row.m_nesting_event_level);
+        break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -577,39 +599,36 @@ int table_events_statements_current::rnd_next(void)
 {
   PFS_thread *pfs_thread;
   PFS_events_statements *statement;
+  bool has_more_thread= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index_1 < thread_max;
+       has_more_thread;
        m_pos.next_thread())
   {
-    pfs_thread= &thread_array[m_pos.m_index_1];
-
-    if (! pfs_thread->m_lock.is_populated())
+    pfs_thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (pfs_thread != NULL)
     {
-      /* This thread does not exist */
-      continue;
-    }
-
-    uint safe_events_statements_count= pfs_thread->m_events_statements_count;
+      uint safe_events_statements_count= pfs_thread->m_events_statements_count;
 
-    if (safe_events_statements_count == 0)
-    {
-      /* Display the last top level statement, when completed */
-      if (m_pos.m_index_2 >= 1)
-        continue;
-    }
-    else
-    {
-      /* Display all pending statements, when in progress */
-      if (m_pos.m_index_2 >= safe_events_statements_count)
-        continue;
-    }
+      if (safe_events_statements_count == 0)
+      {
+        /* Display the last top level statement, when completed */
+        if (m_pos.m_index_2 >= 1)
+          continue;
+      }
+      else
+      {
+        /* Display all pending statements, when in progress */
+        if (m_pos.m_index_2 >= safe_events_statements_count)
+          continue;
+      }
 
-    statement= &pfs_thread->m_statement_stack[m_pos.m_index_2];
+      statement= &pfs_thread->m_statement_stack[m_pos.m_index_2];
 
-    make_row(pfs_thread, statement);
-    m_next_pos.set_after(&m_pos);
-    return 0;
+      make_row(pfs_thread, statement);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
   }
 
   return HA_ERR_END_OF_FILE;
@@ -621,44 +640,45 @@ int table_events_statements_current::rnd_pos(const void *pos)
   PFS_events_statements *statement;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-  pfs_thread= &thread_array[m_pos.m_index_1];
-
-  if (! pfs_thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
 
-  uint safe_events_statements_count= pfs_thread->m_events_statements_count;
-
-  if (safe_events_statements_count == 0)
+  pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread != NULL)
   {
-    /* Display the last top level statement, when completed */
-    if (m_pos.m_index_2 >= 1)
-      return HA_ERR_RECORD_DELETED;
-  }
-  else
-  {
-    /* Display all pending statements, when in progress */
-    if (m_pos.m_index_2 >= safe_events_statements_count)
-      return HA_ERR_RECORD_DELETED;
-  }
+    uint safe_events_statements_count= pfs_thread->m_events_statements_count;
+
+    if (safe_events_statements_count == 0)
+    {
+      /* Display the last top level statement, when completed */
+      if (m_pos.m_index_2 >= 1)
+        return HA_ERR_RECORD_DELETED;
+    }
+    else
+    {
+      /* Display all pending statements, when in progress */
+      if (m_pos.m_index_2 >= safe_events_statements_count)
+        return HA_ERR_RECORD_DELETED;
+    }
 
-  DBUG_ASSERT(m_pos.m_index_2 < statement_stack_max);
+    assert(m_pos.m_index_2 < statement_stack_max);
 
-  statement= &pfs_thread->m_statement_stack[m_pos.m_index_2];
+    statement= &pfs_thread->m_statement_stack[m_pos.m_index_2];
 
-  if (statement->m_class == NULL)
-    return HA_ERR_RECORD_DELETED;
+    if (statement->m_event.m_class)
+    {
+      make_row(pfs_thread, statement);
+      return 0;
+    }
+  }
 
-  make_row(pfs_thread, statement);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 void table_events_statements_current::make_row(PFS_thread *pfs_thread,
                                                PFS_events_statements *statement)
 {
   sql_digest_storage digest;
-  pfs_lock lock;
-  pfs_lock stmt_lock;
+  pfs_optimistic_state lock;
+  pfs_optimistic_state stmt_lock;
 
   digest.reset(m_token_array, MAX_DIGEST_STORAGE_SIZE);
   /* Protect this reader against thread termination. */
@@ -684,6 +704,12 @@ int table_events_statements_current::delete_all_rows(void)
   return 0;
 }
 
+/** Row count reported for this table: thread container rows times statement_stack_max. */
+ha_rows table_events_statements_current::get_row_count(void)
+{
+  return global_thread_container.get_row_count() * statement_stack_max;
+}
+
 PFS_engine_table* table_events_statements_history::create(void)
 {
   return new table_events_statements_history();
@@ -710,43 +736,40 @@ int table_events_statements_history::rnd_next(void)
 {
   PFS_thread *pfs_thread;
   PFS_events_statements *statement;
+  bool has_more_thread= true;
 
   if (events_statements_history_per_thread == 0)
     return HA_ERR_END_OF_FILE;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index_1 < thread_max;
+       has_more_thread;
        m_pos.next_thread())
   {
-    pfs_thread= &thread_array[m_pos.m_index_1];
-
-    if (! pfs_thread->m_lock.is_populated())
-    {
-      /* This thread does not exist */
-      continue;
-    }
-
-    if (m_pos.m_index_2 >= events_statements_history_per_thread)
+    pfs_thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (pfs_thread != NULL)
     {
-      /* This thread does not have more (full) history */
-      continue;
-    }
+      if (m_pos.m_index_2 >= events_statements_history_per_thread)
+      {
+        /* This thread does not have more (full) history */
+        continue;
+      }
 
-    if ( ! pfs_thread->m_statements_history_full &&
-        (m_pos.m_index_2 >= pfs_thread->m_statements_history_index))
-    {
-      /* This thread does not have more (not full) history */
-      continue;
-    }
+      if ( ! pfs_thread->m_statements_history_full &&
+          (m_pos.m_index_2 >= pfs_thread->m_statements_history_index))
+      {
+        /* This thread does not have more (not full) history */
+        continue;
+      }
 
-    statement= &pfs_thread->m_statements_history[m_pos.m_index_2];
+      statement= &pfs_thread->m_statements_history[m_pos.m_index_2];
 
-    if (statement->m_class != NULL)
-    {
-      make_row(pfs_thread, statement);
-      /* Next iteration, look for the next history in this thread */
-      m_next_pos.set_after(&m_pos);
-      return 0;
+      if (statement->m_event.m_class)
+      {
+        make_row(pfs_thread, statement);
+        /* Next iteration, look for the next history in this thread */
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
     }
   }
 
@@ -758,34 +781,34 @@ int table_events_statements_history::rnd_pos(const void *pos)
   PFS_thread *pfs_thread;
   PFS_events_statements *statement;
 
-  DBUG_ASSERT(events_statements_history_per_thread != 0);
+  assert(events_statements_history_per_thread != 0);
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-  pfs_thread= &thread_array[m_pos.m_index_1];
-
-  if (! pfs_thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  DBUG_ASSERT(m_pos.m_index_2 < events_statements_history_per_thread);
 
-  if ( ! pfs_thread->m_statements_history_full &&
-      (m_pos.m_index_2 >= pfs_thread->m_statements_history_index))
-    return HA_ERR_RECORD_DELETED;
+  pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread != NULL)
+  {
+    assert(m_pos.m_index_2 < events_statements_history_per_thread);
 
-  statement= &pfs_thread->m_statements_history[m_pos.m_index_2];
+    if ( ! pfs_thread->m_statements_history_full &&
+        (m_pos.m_index_2 >= pfs_thread->m_statements_history_index))
+      return HA_ERR_RECORD_DELETED;
 
-  if (statement->m_class == NULL)
-    return HA_ERR_RECORD_DELETED;
+    statement= &pfs_thread->m_statements_history[m_pos.m_index_2];
+    if (statement->m_event.m_class)
+    {
+      make_row(pfs_thread, statement);
+      return 0;
+    }
+  }
 
-  make_row(pfs_thread, statement);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 void table_events_statements_history::make_row(PFS_thread *pfs_thread,
                                                PFS_events_statements *statement)
 {
   sql_digest_storage digest;
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   digest.reset(m_token_array, MAX_DIGEST_STORAGE_SIZE);
   /* Protect this reader against thread termination. */
@@ -799,15 +822,22 @@ void table_events_statements_history::make_row(PFS_thread *pfs_thread,
     return;
   }
   table_events_statements_common::make_row_part_2(&digest);
-  return; 
+  return;
 }
 
+
 int table_events_statements_history::delete_all_rows(void)
 {
   reset_events_statements_history();
   return 0;
 }
 
+/** Row count reported for this table: per-thread history size times thread container rows. */
+ha_rows table_events_statements_history::get_row_count(void)
+{
+  return events_statements_history_per_thread * global_thread_container.get_row_count();
+}
+
 PFS_engine_table* table_events_statements_history_long::create(void)
 {
   return new table_events_statements_history_long();
@@ -833,21 +863,21 @@ int table_events_statements_history_long::rnd_init(bool scan)
 int table_events_statements_history_long::rnd_next(void)
 {
   PFS_events_statements *statement;
-  size_t limit;
+  uint limit;
 
   if (events_statements_history_long_size == 0)
     return HA_ERR_END_OF_FILE;
 
   if (events_statements_history_long_full)
-    limit= events_statements_history_long_size;
+    limit= static_cast<uint>(events_statements_history_long_size);
   else
-    limit= events_statements_history_long_index % events_statements_history_long_size;
+    limit= events_statements_history_long_index.m_u32 % events_statements_history_long_size;
 
   for (m_pos.set_at(&m_next_pos); m_pos.m_index < limit; m_pos.next())
   {
     statement= &events_statements_history_long_array[m_pos.m_index];
 
-    if (statement->m_class != NULL)
+    if (statement->m_event.m_class)
     {
       make_row(statement);
       /* Next iteration, look for the next entry */
@@ -862,7 +892,7 @@ int table_events_statements_history_long::rnd_next(void)
 int table_events_statements_history_long::rnd_pos(const void *pos)
 {
   PFS_events_statements *statement;
-  size_t limit;
+  uint limit;
 
   if (events_statements_history_long_size == 0)
     return HA_ERR_RECORD_DELETED;
@@ -870,16 +900,16 @@ int table_events_statements_history_long::rnd_pos(const void *pos)
   set_position(pos);
 
   if (events_statements_history_long_full)
-    limit= events_statements_history_long_size;
+    limit= static_cast<uint>(events_statements_history_long_size);
   else
-    limit= events_statements_history_long_index % events_statements_history_long_size;
+    limit= events_statements_history_long_index.m_u32 % events_statements_history_long_size;
 
   if (m_pos.m_index >= limit)
     return HA_ERR_RECORD_DELETED;
 
   statement= &events_statements_history_long_array[m_pos.m_index];
 
-  if (statement->m_class == NULL)
+  if (!statement->m_event.m_class)
     return HA_ERR_RECORD_DELETED;
 
   make_row(statement);
@@ -903,3 +933,9 @@ int table_events_statements_history_long::delete_all_rows(void)
   return 0;
 }
 
+/** Row count reported for this table: events_statements_history_long_size. */
+ha_rows table_events_statements_history_long::get_row_count(void)
+{
+  return events_statements_history_long_size;
+}
+
diff --git a/storage/perfschema/table_events_statements.h b/storage/perfschema/table_events_statements.h
index cec28628f3e..5499c8d62ec 100644
--- a/storage/perfschema/table_events_statements.h
+++ b/storage/perfschema/table_events_statements.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -53,6 +53,8 @@ struct row_events_statements
   ulonglong m_nesting_event_id;
   /** Column NESTING_EVENT_TYPE. */
   enum_event_type m_nesting_event_type;
+  /** Column NESTING_EVENT_LEVEL. */
+  uint m_nesting_event_level;
   /** Column EVENT_NAME. */
   const char *m_name;
   /** Length in bytes of @c m_name. */
@@ -73,11 +75,23 @@ struct row_events_statements
   String m_sqltext;
   /** Column DIGEST and DIGEST_TEXT. */
   PFS_digest_row m_digest;
   /** Column CURRENT_SCHEMA. */
   char m_current_schema_name[NAME_LEN];
   /** Length in bytes of @c m_current_schema_name. */
   uint m_current_schema_name_length;
 
+  /** Column OBJECT_TYPE. */
+  enum_object_type m_object_type;
+  /** Column OBJECT_SCHEMA. */
+  char m_schema_name[NAME_LEN];
+  /** Length in bytes of @c m_schema_name. */
+  uint m_schema_name_length;
+  /** Column OBJECT_NAME. */
+  char m_object_name[NAME_LEN];
+  /** Length in bytes of @c m_object_name. */
+  uint m_object_name_length;
+
+
   /** Column MESSAGE_TEXT. */
   char m_message_text[MYSQL_ERRMSG_SIZE+1];
   /** Column MYSQL_ERRNO. */
@@ -199,6 +213,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
@@ -235,6 +250,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
@@ -268,6 +284,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_events_transactions.cc b/storage/perfschema/table_events_transactions.cc
new file mode 100644
index 00000000000..543d106ff26
--- /dev/null
+++ b/storage/perfschema/table_events_transactions.cc
@@ -0,0 +1,718 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_events_transactions.cc
+  Table EVENTS_TRANSACTIONS_xxx (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "table_events_transactions.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_events_transactions.h"
+#include "pfs_timer.h"
+#include "table_helper.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+//#include "xa.h"
+
+THR_LOCK table_events_transactions_current::m_table_lock;
+
+PFS_engine_table_share
+table_events_transactions_current::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_current") }, /* table name */
+  &pfs_truncatable_acl, /* ACL */
+  table_events_transactions_current::create, /* create */
+  NULL, /* write_row: no callback is provided */
+  table_events_transactions_current::delete_all_rows, /* delete_all_rows */
+  table_events_transactions_current::get_row_count, /* get_row_count */
+  sizeof(PFS_simple_index), /* ref length: size of a PFS_simple_index */
+  &m_table_lock, /* table lock defined just above */
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_current(" /* table definition; column order matches the case labels in read_row_values() */
+  "THREAD_ID BIGINT unsigned not null comment 'The thread associated with the event.',"
+  "EVENT_ID BIGINT unsigned not null comment 'The event id associated with the event.',"
+  "END_EVENT_ID BIGINT unsigned comment 'This column is set to NULL when the event starts and updated to the thread current event number when the event ends.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'The name of the instrument from which the event was collected. This is a NAME value from the setup_instruments table.',"
+  "STATE ENUM('ACTIVE', 'COMMITTED', 'ROLLED BACK') comment 'The current transaction state. The value is ACTIVE (after START TRANSACTION or BEGIN), COMMITTED (after COMMIT), or ROLLED BACK (after ROLLBACK).',"
+  "TRX_ID BIGINT unsigned comment 'Unused.',"
+  "GTID VARCHAR(64) comment 'Transaction GTID, using the format DOMAIN-SERVER_ID-SEQUENCE_NO.',"
+  "XID_FORMAT_ID INTEGER comment 'XA transaction format ID for GTRID and BQUAL values.',"
+  "XID_GTRID VARCHAR(130) comment 'XA global transaction ID.',"
+  "XID_BQUAL VARCHAR(130) comment 'XA transaction branch qualifier.',"
+  "XA_STATE VARCHAR(64) comment 'The state of the XA transaction. The value is ACTIVE (after XA START), IDLE (after XA END), PREPARED (after XA PREPARE), ROLLED BACK (after XA ROLLBACK), or COMMITTED (after XA COMMIT).',"
+  "SOURCE VARCHAR(64) comment 'The name of the source file containing the instrumented code that produced the event and the line number in the file at which the instrumentation occurs.',"
+  "TIMER_START BIGINT unsigned comment 'The unit is picoseconds. When event timing started. NULL if event has no timing information.',"
+  "TIMER_END BIGINT unsigned comment 'The unit is picoseconds. When event timing ended. NULL if event has no timing information.',"
+  "TIMER_WAIT BIGINT unsigned comment 'The unit is picoseconds. Event duration. NULL if event has not timing information.',"
+  "ACCESS_MODE ENUM('READ ONLY', 'READ WRITE') comment 'Transaction access mode.',"
+  "ISOLATION_LEVEL VARCHAR(64) comment 'Transaction isolation level. One of: REPEATABLE READ, READ COMMITTED, READ UNCOMMITTED, or SERIALIZABLE.',"
+  "AUTOCOMMIT ENUM('YES','NO') not null comment 'Whether autcommit mode was enabled when the transaction started.',"
+  "NUMBER_OF_SAVEPOINTS BIGINT unsigned comment 'The number of SAVEPOINT statements issued during the transaction.',"
+  "NUMBER_OF_ROLLBACK_TO_SAVEPOINT BIGINT unsigned comment 'The number of ROLLBACK_TO_SAVEPOINT statements issued during the transaction.',"
+  "NUMBER_OF_RELEASE_SAVEPOINT BIGINT unsigned comment 'The number of RELEASE_SAVEPOINT statements issued during the transaction.',"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned comment 'Unused.',"
+  "NESTING_EVENT_ID BIGINT unsigned comment 'The EVENT_ID value of the event within which this event is nested.',"
+  "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'The nesting event type.')")},
+  false  /* perpetual */
+};
+
+THR_LOCK table_events_transactions_history::m_table_lock;
+
+PFS_engine_table_share
+table_events_transactions_history::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_history") }, /* table name */
+  &pfs_truncatable_acl, /* ACL */
+  table_events_transactions_history::create, /* create */
+  NULL, /* write_row: no callback is provided */
+  table_events_transactions_history::delete_all_rows, /* delete_all_rows */
+  table_events_transactions_history::get_row_count, /* get_row_count */
+  sizeof(pos_events_transactions_history), /* ref length: size of a pos_events_transactions_history */
+  &m_table_lock, /* table lock defined just above */
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_history(" /* table definition; column order matches the case labels in read_row_values() */
+  "THREAD_ID BIGINT unsigned not null comment 'The thread associated with the event.',"
+  "EVENT_ID BIGINT unsigned not null comment 'The event id associated with the event.',"
+  "END_EVENT_ID BIGINT unsigned comment 'This column is set to NULL when the event starts and updated to the thread current event number when the event ends.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'The name of the instrument from which the event was collected. This is a NAME value from the setup_instruments table.',"
+  "STATE ENUM('ACTIVE', 'COMMITTED', 'ROLLED BACK') comment 'The current transaction state. The value is ACTIVE (after START TRANSACTION or BEGIN), COMMITTED (after COMMIT), or ROLLED BACK (after ROLLBACK).',"
+  "TRX_ID BIGINT unsigned comment 'Unused.',"
+  "GTID VARCHAR(64) comment 'Transaction GTID, using the format DOMAIN-SERVER_ID-SEQUENCE_NO.',"
+  "XID_FORMAT_ID INTEGER comment 'XA transaction format ID for GTRID and BQUAL values.',"
+  "XID_GTRID VARCHAR(130) comment 'XA global transaction ID.',"
+  "XID_BQUAL VARCHAR(130) comment 'XA transaction branch qualifier.',"
+  "XA_STATE VARCHAR(64) comment 'The state of the XA transaction. The value is ACTIVE (after XA START), IDLE (after XA END), PREPARED (after XA PREPARE), ROLLED BACK (after XA ROLLBACK), or COMMITTED (after XA COMMIT).',"
+  "SOURCE VARCHAR(64) comment 'The name of the source file containing the instrumented code that produced the event and the line number in the file at which the instrumentation occurs.',"
+  "TIMER_START BIGINT unsigned comment 'The unit is picoseconds. When event timing started. NULL if event has no timing information.',"
+  "TIMER_END BIGINT unsigned comment 'The unit is picoseconds. When event timing ended. NULL if event has no timing information.',"
+  "TIMER_WAIT BIGINT unsigned comment 'The unit is picoseconds. Event duration. NULL if event has not timing information.',"
+  "ACCESS_MODE ENUM('READ ONLY', 'READ WRITE') comment 'Transaction access mode.',"
+  "ISOLATION_LEVEL VARCHAR(64) comment 'Transaction isolation level. One of: REPEATABLE READ, READ COMMITTED, READ UNCOMMITTED, or SERIALIZABLE.',"
+  "AUTOCOMMIT ENUM('YES','NO') not null comment 'Whether autcommit mode was enabled when the transaction started.',"
+  "NUMBER_OF_SAVEPOINTS BIGINT unsigned comment 'The number of SAVEPOINT statements issued during the transaction.',"
+  "NUMBER_OF_ROLLBACK_TO_SAVEPOINT BIGINT unsigned comment 'The number of ROLLBACK_TO_SAVEPOINT statements issued during the transaction.',"
+  "NUMBER_OF_RELEASE_SAVEPOINT BIGINT unsigned comment 'The number of RELEASE_SAVEPOINT statements issued during the transaction.',"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned comment 'Unused.',"
+  "NESTING_EVENT_ID BIGINT unsigned comment 'The EVENT_ID value of the event within which this event is nested.',"
+  "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'The nesting event type.')")},
+  false  /* perpetual */
+};
+
+THR_LOCK table_events_transactions_history_long::m_table_lock;
+
+PFS_engine_table_share
+table_events_transactions_history_long::m_share=
+{
+  { C_STRING_WITH_LEN("events_transactions_history_long") }, /* table name */
+  &pfs_truncatable_acl, /* ACL */
+  table_events_transactions_history_long::create, /* create */
+  NULL, /* write_row: no callback is provided */
+  table_events_transactions_history_long::delete_all_rows, /* delete_all_rows */
+  table_events_transactions_history_long::get_row_count, /* get_row_count */
+  sizeof(PFS_simple_index), /* ref length: size of a PFS_simple_index */
+  &m_table_lock, /* table lock defined just above */
+  { C_STRING_WITH_LEN("CREATE TABLE events_transactions_history_long(" /* table definition; column order matches the case labels in read_row_values() */
+  "THREAD_ID BIGINT unsigned not null comment 'The thread associated with the event.',"
+  "EVENT_ID BIGINT unsigned not null comment 'The event id associated with the event.',"
+  "END_EVENT_ID BIGINT unsigned comment 'This column is set to NULL when the event starts and updated to the thread current event number when the event ends.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'The name of the instrument from which the event was collected. This is a NAME value from the setup_instruments table.',"
+  "STATE ENUM('ACTIVE', 'COMMITTED', 'ROLLED BACK') comment 'The current transaction state. The value is ACTIVE (after START TRANSACTION or BEGIN), COMMITTED (after COMMIT), or ROLLED BACK (after ROLLBACK).',"
+  "TRX_ID BIGINT unsigned comment 'Unused.',"
+  "GTID VARCHAR(64) comment 'Transaction GTID, using the format DOMAIN-SERVER_ID-SEQUENCE_NO.',"
+  "XID_FORMAT_ID INTEGER comment 'XA transaction format ID for GTRID and BQUAL values.',"
+  "XID_GTRID VARCHAR(130) comment 'XA global transaction ID.',"
+  "XID_BQUAL VARCHAR(130) comment 'XA transaction branch qualifier.',"
+  "XA_STATE VARCHAR(64) comment 'The state of the XA transaction. The value is ACTIVE (after XA START), IDLE (after XA END), PREPARED (after XA PREPARE), ROLLED BACK (after XA ROLLBACK), or COMMITTED (after XA COMMIT).',"
+  "SOURCE VARCHAR(64) comment 'The name of the source file containing the instrumented code that produced the event and the line number in the file at which the instrumentation occurs.',"
+  "TIMER_START BIGINT unsigned comment 'The unit is picoseconds. When event timing started. NULL if event has no timing information.',"
+  "TIMER_END BIGINT unsigned comment 'The unit is picoseconds. When event timing ended. NULL if event has no timing information.',"
+  "TIMER_WAIT BIGINT unsigned comment 'The unit is picoseconds. Event duration. NULL if event has not timing information.',"
+  "ACCESS_MODE ENUM('READ ONLY', 'READ WRITE') comment 'Transaction access mode.',"
+  "ISOLATION_LEVEL VARCHAR(64) comment 'Transaction isolation level. One of: REPEATABLE READ, READ COMMITTED, READ UNCOMMITTED, or SERIALIZABLE.',"
+  "AUTOCOMMIT ENUM('YES','NO') not null comment 'Whether autcommit mode was enabled when the transaction started.',"
+  "NUMBER_OF_SAVEPOINTS BIGINT unsigned comment 'The number of SAVEPOINT statements issued during the transaction.',"
+  "NUMBER_OF_ROLLBACK_TO_SAVEPOINT BIGINT unsigned comment 'The number of ROLLBACK_TO_SAVEPOINT statements issued during the transaction.',"
+  "NUMBER_OF_RELEASE_SAVEPOINT BIGINT unsigned comment 'The number of RELEASE_SAVEPOINT statements issued during the transaction.',"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned comment 'Unused.',"
+  "NESTING_EVENT_ID BIGINT unsigned comment 'The EVENT_ID value of the event within which this event is nested.',"
+  "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'The nesting event type.')")},
+  false  /* perpetual */
+};
+
+table_events_transactions_common::table_events_transactions_common
+(const PFS_engine_table_share *share, void *pos) /* both arguments are forwarded to the base class */
+  : PFS_engine_table(share, pos),
+  m_row_exists(false) /* only make_row() sets this to true */
+{}
+
+/**
+  Build a row from a transaction event; m_row_exists is set to true only on success.
+  @param transaction                      the transaction the cursor is reading
+*/
+void table_events_transactions_common::make_row(PFS_events_transactions *transaction)
+{
+  ulonglong timer_end;
+
+  m_row_exists= false; /* stays false if we return early below */
+
+  PFS_transaction_class *unsafe= (PFS_transaction_class*) transaction->m_class;
+  PFS_transaction_class *klass= sanitize_transaction_class(unsafe);
+  if (unlikely(klass == NULL))
+    return; /* no usable class: the row is left marked as not existing */
+
+  m_row.m_thread_internal_id= transaction->m_thread_internal_id;
+  m_row.m_event_id= transaction->m_event_id;
+  m_row.m_end_event_id= transaction->m_end_event_id;
+  m_row.m_nesting_event_id= transaction->m_nesting_event_id;
+  m_row.m_nesting_event_type= transaction->m_nesting_event_type;
+
+  if (m_row.m_end_event_id == 0)
+  {
+    timer_end= get_timer_raw_value(transaction_timer); /* end event id is 0: take the timer's raw value instead */
+  }
+  else
+  {
+    timer_end= transaction->m_timer_end; /* otherwise use the end value stored in the event */
+  }
+
+  m_normalizer->to_pico(transaction->m_timer_start, timer_end,
+                        &m_row.m_timer_start, &m_row.m_timer_end, &m_row.m_timer_wait);
+  m_row.m_name= klass->m_name; /* name comes from the sanitized class, not from the event */
+  m_row.m_name_length= klass->m_name_length;
+
+  /* Disable source file and line to avoid stale __FILE__ pointers. */
+  m_row.m_source_length= 0;
+
+  /* A GTID consists of the SID (source id) and GNO (transaction number).
+     The SID is stored in transaction->m_sid and the GNO is stored in
+     transaction->m_gtid_spec.gno.
+
+     On a master, the GTID is assigned when the transaction commits.
+     On a slave, the GTID is assigned before the transaction starts.
+     If GTID_MODE = OFF, all transactions have the special GTID 'ANONYMOUS'.
+
+     Therefore, a transaction can be in three different states wrt GTIDs:
+     - Before the GTID has been assigned, the state is 'AUTOMATIC'.
+       On a master, this is the state until the transaction commits.
+       On a slave, this state does not appear.
+     - If GTID_MODE = ON, and a GTID is assigned, the GTID is a string
+       of the form 'UUID:NUMBER'.
+     - If GTID_MODE = OFF, and a GTID is assigned, the GTID is a string
+       of the form 'ANONYMOUS'.
+
+     The Gtid_specification contains the GNO, as well as a type code
+     that specifies which of the three modes is currently in effect.
+     Given a SID, it can generate the textual representation of the GTID.
+     NOTE(review): only m_gtid_spec is used below (the m_sid line is commented
+     out); confirm how much of the SID / GTID_MODE description still applies.
+  */
+  //rpl_sid *sid= &transaction->m_sid;
+  Gtid_specification *gtid_spec= &transaction->m_gtid_spec;
+  m_row.m_gtid_length= static_cast<uint>(gtid_spec->to_string(m_row.m_gtid)); /* text goes into m_row.m_gtid, the return value is kept as its length */
+
+  m_row.m_xid= transaction->m_xid;
+  m_row.m_isolation_level= transaction->m_isolation_level;
+  m_row.m_read_only= transaction->m_read_only;
+  m_row.m_trxid= transaction->m_trxid;
+  m_row.m_state= transaction->m_state;
+  m_row.m_xa_state= transaction->m_xa_state;
+  m_row.m_xa= transaction->m_xa;
+  m_row.m_autocommit= transaction->m_autocommit;
+  m_row.m_savepoint_count= transaction->m_savepoint_count;
+  m_row.m_rollback_to_savepoint_count= transaction->m_rollback_to_savepoint_count;
+  m_row.m_release_savepoint_count= transaction->m_release_savepoint_count;
+  m_row_exists= true; /* every field has been copied; the row can now be read */
+  return;
+}
+
+/** Size of XID converted to null-terminated hex string prefixed with 0x. */
+static const ulong XID_BUFFER_SIZE= XIDDATASIZE*2 + 2 + 1; /* two hex digits per data byte, plus "0x", plus the terminator */
+
+/**
+  Convert a slice of the XID data to a HEX string prefixed by '0x'
+
+  @param[out] buf     output hex string buffer, null-terminated
+  @param buf_len size of buffer, asserted to be at least @c XID_BUFFER_SIZE
+  @param xid     XID structure
+  @param offset  offset into XID.data[]
+  @param length  number of bytes to process (offset + length is asserted to fit in XIDDATASIZE)
+  @return the value returned by bin_to_hex_str() plus 2 for the '0x' prefix
+*/
+static size_t xid_to_hex(char *buf, size_t buf_len, PSI_xid *xid, size_t offset, size_t length)
+{
+  assert(buf_len >= XID_BUFFER_SIZE);
+  assert(offset + length <= XIDDATASIZE);
+  *buf++= '0'; /* write the two prefix characters first */
+  *buf++= 'x';
+  return bin_to_hex_str(buf, buf_len-2, (char*)(xid->data + offset), length) + 2; /* buf_len-2: the prefix already used two bytes */
+}
+
+/**
+  Store a slice of the XID in printable format if xid_printable() accepts it,
+  otherwise convert it to a '0x'-prefixed string of hex digits.
+
+  @param  field   Record field
+  @param  xid     XID structure, asserted to be non-null (is_null() is false)
+  @param  offset  offset into XID.data[]
+  @param  length  number of bytes to process
+*/
+static void xid_store(Field *field, PSI_xid *xid, size_t offset, size_t length)
+{
+  assert(!xid->is_null());
+  if (xid_printable(xid, offset, length))
+  {
+    field->store(xid->data + offset, length, &my_charset_bin); /* store the raw bytes as they are */
+  }
+  else
+  {
+    /*
+      xid_buf contains enough space for 0x followed by hex representation of
+      the binary XID data and one null termination character.
+    */
+    char xid_buf[XID_BUFFER_SIZE];
+
+    size_t xid_str_len= xid_to_hex(xid_buf, sizeof(xid_buf), xid, offset, length);
+    field->store(xid_buf, xid_str_len, &my_charset_bin); /* store the hex text with the length xid_to_hex() returned */
+  }
+}
+
+static void xid_store_bqual(Field *field, PSI_xid *xid)
+{
+  xid_store(field, xid, xid->gtrid_length, xid->bqual_length); /* slice of bqual_length bytes starting at offset gtrid_length */
+}
+
+static void xid_store_gtrid(Field *field, PSI_xid *xid)
+{
+  xid_store(field, xid, 0, xid->gtrid_length); /* slice of gtrid_length bytes starting at offset 0 */
+}
+
+int table_events_transactions_common::read_row_values(TABLE *table,
+                                                      unsigned char *buf,
+                                                      Field **fields,
+                                                      bool read_all)
+{ /* Copies the values held in m_row into the requested fields. */
+  Field *f; /* field handled by the current loop iteration */
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED; /* make_row() did not produce a row */
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 3); /* exactly the three bytes cleared below */
+  buf[0]= 0;
+  buf[1]= 0;
+  buf[2]= 0;
+
+  for (; (f= *fields) ; fields++) /* the field array ends with a NULL entry */
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index)) /* skip fields outside the read set unless read_all */
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulonglong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* EVENT_ID */
+        set_field_ulonglong(f, m_row.m_event_id);
+        break;
+      case 2: /* END_EVENT_ID */
+        if (m_row.m_end_event_id > 0)
+          set_field_ulonglong(f, m_row.m_end_event_id - 1); /* the reported value is one less than the stored one */
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 3: /* EVENT_NAME */
+        set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
+        break;
+      case 4: /* STATE */
+        set_field_enum(f, m_row.m_state);
+        break;
+      case 5: /* TRX_ID */
+        if (m_row.m_trxid != 0)
+          set_field_ulonglong(f, m_row.m_trxid);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 6: /* GTID */
+        set_field_varchar_utf8(f, m_row.m_gtid, m_row.m_gtid_length);
+        break;
+      case 7: /* XID_FORMAT_ID */
+        if (!m_row.m_xa || m_row.m_xid.is_null()) /* cases 7-10: NULL unless m_xa is set and the XID is not null */
+          f->set_null();
+        else
+          set_field_long(f, m_row.m_xid.formatID);
+        break;
+      case 8: /* XID_GTRID */
+        if (!m_row.m_xa || m_row.m_xid.is_null() || m_row.m_xid.gtrid_length <= 0) /* also NULL when the gtrid part is empty */
+          f->set_null();
+        else
+          xid_store_gtrid(f, &m_row.m_xid);
+        break;
+      case 9: /* XID_BQUAL */
+        if (!m_row.m_xa || m_row.m_xid.is_null() || m_row.m_xid.bqual_length <= 0) /* also NULL when the bqual part is empty */
+          f->set_null();
+        else
+          xid_store_bqual(f, &m_row.m_xid);
+        break;
+      case 10: /* XA STATE */
+        if (!m_row.m_xa || m_row.m_xid.is_null())
+          f->set_null();
+        else
+          set_field_xa_state(f, m_row.m_xa_state);
+        break;
+      case 11: /* SOURCE */
+        set_field_varchar_utf8(f, m_row.m_source, m_row.m_source_length); /* make_row() sets the length to 0 */
+        break;
+      case 12: /* TIMER_START */
+        if (m_row.m_timer_start != 0)
+          set_field_ulonglong(f, m_row.m_timer_start);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 13: /* TIMER_END */
+        if (m_row.m_timer_end != 0)
+          set_field_ulonglong(f, m_row.m_timer_end);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 14: /* TIMER_WAIT */
+        if (m_row.m_timer_wait != 0)
+          set_field_ulonglong(f, m_row.m_timer_wait);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 15: /* ACCESS_MODE */
+        set_field_enum(f, m_row.m_read_only ? TRANS_MODE_READ_ONLY
+                                            : TRANS_MODE_READ_WRITE);
+        break;
+      case 16: /* ISOLATION_LEVEL */
+        set_field_isolation_level(f, m_row.m_isolation_level);
+        break;
+      case 17: /* AUTOCOMMIT */
+        set_field_enum(f, m_row.m_autocommit ? ENUM_YES : ENUM_NO);
+        break;
+      case 18: /* NUMBER_OF_SAVEPOINTS */
+        set_field_ulonglong(f, m_row.m_savepoint_count);
+        break;
+      case 19: /* NUMBER_OF_ROLLBACK_TO_SAVEPOINT */
+        set_field_ulonglong(f, m_row.m_rollback_to_savepoint_count);
+        break;
+      case 20: /* NUMBER_OF_RELEASE_SAVEPOINT */
+        set_field_ulonglong(f, m_row.m_release_savepoint_count);
+        break;
+      case 21: /* OBJECT_INSTANCE_BEGIN */
+        f->set_null(); /* always NULL; the column comment says 'Unused.' */
+        break;
+      case 22: /* NESTING_EVENT_ID */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_ulonglong(f, m_row.m_nesting_event_id);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 23: /* NESTING_EVENT_TYPE */
+        if (m_row.m_nesting_event_id != 0) /* the test is on the nesting event ID, not on the type */
+          set_field_enum(f, m_row.m_nesting_event_type);
+        else
+          f->set_null();
+        break;
+      default:
+        assert(false); /* the table definitions declare columns 0 to 23 only */
+      }
+    }
+  }
+  return 0;
+}
+
+PFS_engine_table* table_events_transactions_current::create(void)
+{
+  return new table_events_transactions_current(); /* allocated with operator new; registered as the create entry of m_share */
+}
+
+table_events_transactions_current::table_events_transactions_current()
+  : table_events_transactions_common(&m_share, &m_pos), /* the base receives the share and the address of m_pos */
+  m_pos(0), m_next_pos(0) /* both positions start at index 0 */
+{}
+
+void table_events_transactions_current::reset_position(void)
+{
+  m_pos.m_index= 0; /* same starting values as in the constructor */
+  m_next_pos.m_index= 0;
+}
+
+int table_events_transactions_current::rnd_init(bool scan) /* 'scan' is not used in the body */
+{
+  m_normalizer= time_normalizer::get(transaction_timer); /* make_row() converts timer values through m_normalizer */
+  return 0;
+}
+
+int table_events_transactions_current::rnd_next(void)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_transactions *transaction;
+  bool has_more_thread= true; /* passed by address to get() below and tested by the loop */
+
+  for (m_pos.set_at(&m_next_pos); /* start from the saved next position */
+       has_more_thread;
+       m_pos.next())
+  {
+    pfs_thread= global_thread_container.get(m_pos.m_index, & has_more_thread);
+    if (pfs_thread != NULL)
+    {
+      transaction= &pfs_thread->m_transaction_current;
+      make_row(transaction); /* m_class is not tested here; make_row() leaves m_row_exists false if the class is unusable */
+      m_next_pos.set_after(&m_pos); /* the next call starts after this position */
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE; /* loop ended: has_more_thread became false */
+}
+
+int table_events_transactions_current::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_transactions *transaction;
+
+  set_position(pos); /* presumably restores m_pos from 'pos' (m_pos is read right after) - confirm in the base class */
+
+  pfs_thread= global_thread_container.get(m_pos.m_index);
+  if (pfs_thread != NULL)
+  {
+    transaction= &pfs_thread->m_transaction_current;
+    if (transaction->m_class != NULL) /* a row is built only when the event has a class */
+    {
+      make_row(transaction);
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED; /* no thread at that index, or its current event has no class */
+}
+
+int table_events_transactions_current::delete_all_rows(void)
+{
+  reset_events_transactions_current(); /* the reset itself is implemented outside this file */
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_events_transactions_current::get_row_count(void)
+{
+  return global_thread_container.get_row_count(); /* the count is whatever the thread container reports */
+}
+
+PFS_engine_table* table_events_transactions_history::create(void)
+{
+  return new table_events_transactions_history(); /* allocated with operator new; registered as the create entry of m_share */
+}
+
+table_events_transactions_history::table_events_transactions_history()
+  : table_events_transactions_common(&m_share, &m_pos), /* the base receives the share and the address of m_pos */
+  m_pos(), m_next_pos() /* both positions are default-constructed */
+{}
+
+void table_events_transactions_history::reset_position(void)
+{
+  m_pos.reset(); /* both the current and the next position are reset */
+  m_next_pos.reset();
+}
+
+int table_events_transactions_history::rnd_init(bool scan) /* 'scan' is not used in the body */
+{
+  m_normalizer= time_normalizer::get(transaction_timer); /* make_row() converts timer values through m_normalizer */
+  return 0;
+}
+
+int table_events_transactions_history::rnd_next(void)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_transactions *transaction;
+  bool has_more_thread= true; /* passed by address to get() below and tested by the loop */
+
+  if (events_transactions_history_per_thread == 0)
+    return HA_ERR_END_OF_FILE; /* no history slots per thread: nothing to scan */
+
+  for (m_pos.set_at(&m_next_pos); /* start from the saved next position */
+       has_more_thread;
+       m_pos.next_thread())
+  {
+    pfs_thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread); /* m_index_1 selects the thread */
+    if (pfs_thread != NULL)
+    {
+      if (m_pos.m_index_2 >= events_transactions_history_per_thread) /* m_index_2 selects the history entry */
+      {
+        /* This thread does not have more (full) history */
+        continue;
+      }
+
+      if ( ! pfs_thread->m_transactions_history_full &&
+          (m_pos.m_index_2 >= pfs_thread->m_transactions_history_index))
+      {
+        /* This thread does not have more (not full) history */
+        continue;
+      }
+
+      transaction= &pfs_thread->m_transactions_history[m_pos.m_index_2];
+      if (transaction->m_class != NULL) /* entries without a class are passed over */
+      {
+        make_row(transaction);
+        /* Next iteration, look for the next history in this thread */
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE; /* loop ended: has_more_thread became false */
+}
+
+int table_events_transactions_history::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_transactions *transaction;
+
+  assert(events_transactions_history_per_thread != 0);
+  set_position(pos); /* presumably restores m_pos from 'pos' (m_pos is read right after) - confirm in the base class */
+
+  assert(m_pos.m_index_2 < events_transactions_history_per_thread); /* checked by assert only, not by a runtime test */
+
+  pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread != NULL)
+  {
+    if ( ! pfs_thread->m_transactions_history_full &&
+        (m_pos.m_index_2 >= pfs_thread->m_transactions_history_index))
+      return HA_ERR_RECORD_DELETED; /* history not full and the entry index is not below the thread's history index */
+
+    transaction= &pfs_thread->m_transactions_history[m_pos.m_index_2];
+    if (transaction->m_class != NULL) /* a row is built only when the event has a class */
+    {
+      make_row(transaction);
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED; /* no thread at that index, or the entry has no class */
+}
+
+int table_events_transactions_history::delete_all_rows(void)
+{
+  reset_events_transactions_history(); /* the reset itself is implemented outside this file */
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_events_transactions_history::get_row_count(void)
+{
+  return events_transactions_history_per_thread * global_thread_container.get_row_count(); /* history slots per thread times the thread container's row count */
+}
+
+PFS_engine_table* table_events_transactions_history_long::create(void)
+{
+  return new table_events_transactions_history_long(); /* allocated with operator new; registered as the create entry of m_share */
+}
+
+table_events_transactions_history_long::table_events_transactions_history_long()
+  : table_events_transactions_common(&m_share, &m_pos), /* the base receives the share and the address of m_pos */
+  m_pos(0), m_next_pos(0) /* both positions start at index 0 */
+{}
+
+void table_events_transactions_history_long::reset_position(void)
+{
+  m_pos.m_index= 0; /* same starting values as in the constructor */
+  m_next_pos.m_index= 0;
+}
+
+int table_events_transactions_history_long::rnd_init(bool scan) /* 'scan' is not used in the body */
+{
+  m_normalizer= time_normalizer::get(transaction_timer); /* make_row() converts timer values through m_normalizer */
+  return 0;
+}
+
+int table_events_transactions_history_long::rnd_next(void)
+{
+  PFS_events_transactions *transaction;
+  uint limit; /* exclusive upper bound of the indexes scanned below */
+
+  if (events_transactions_history_long_size == 0)
+    return HA_ERR_END_OF_FILE; /* also keeps the modulo below from dividing by zero */
+
+  if (events_transactions_history_long_full)
+    limit= events_transactions_history_long_size; /* full: every slot of the array is scanned */
+  else
+    limit= events_transactions_history_long_index.m_u32 % events_transactions_history_long_size; /* not full: stop at the index modulo the size */
+
+  for (m_pos.set_at(&m_next_pos); m_pos.m_index < limit; m_pos.next())
+  {
+    transaction= &events_transactions_history_long_array[m_pos.m_index];
+
+    if (transaction->m_class != NULL) /* entries without a class are passed over */
+    {
+      make_row(transaction);
+      /* Next iteration, look for the next entry */
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE; /* reached the limit without finding an entry that has a class */
+}
+
+int table_events_transactions_history_long::rnd_pos(const void *pos)
+{
+  PFS_events_transactions *transaction;
+  uint limit; /* exclusive upper bound of the valid indexes, computed as in rnd_next() */
+
+  if (events_transactions_history_long_size == 0)
+    return HA_ERR_RECORD_DELETED; /* also keeps the modulo below from dividing by zero */
+
+  set_position(pos); /* presumably restores m_pos from 'pos' (m_pos is read right after) - confirm in the base class */
+
+  if (events_transactions_history_long_full)
+    limit= events_transactions_history_long_size;
+  else
+    limit= events_transactions_history_long_index.m_u32 % events_transactions_history_long_size;
+
+  if (m_pos.m_index >= limit)
+    return HA_ERR_RECORD_DELETED; /* the saved index is no longer below the limit */
+
+  transaction= &events_transactions_history_long_array[m_pos.m_index];
+
+  if (transaction->m_class == NULL)
+    return HA_ERR_RECORD_DELETED; /* the entry has no class */
+
+  make_row(transaction);
+  return 0;
+}
+
+int table_events_transactions_history_long::delete_all_rows(void)
+{
+  reset_events_transactions_history_long(); /* the reset itself is implemented outside this file */
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_events_transactions_history_long::get_row_count(void)
+{
+  return events_transactions_history_long_size; /* the size variable itself, whatever the state of the 'full' flag */
+}
+
diff --git a/storage/perfschema/table_events_transactions.h b/storage/perfschema/table_events_transactions.h
new file mode 100644
index 00000000000..b9f8f750d85
--- /dev/null
+++ b/storage/perfschema/table_events_transactions.h
@@ -0,0 +1,254 @@
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_EVENTS_TRANSACTIONS_H
+#define TABLE_EVENTS_TRANSACTIONS_H
+
+/**
+  @file storage/perfschema/table_events_transactions.h
+  Table EVENTS_TRANSACTIONS_xxx (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_events_transactions.h"
+#include "table_helper.h"
+#include "rpl_gtid.h"
+
+struct PFS_thread;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of table_events_transactions_common (held in its @c m_row member). */
+struct row_events_transactions
+{
+  /** Column THREAD_ID. */
+  ulonglong m_thread_internal_id;
+  /** Column EVENT_ID. */
+  ulonglong m_event_id;
+  /** Column END_EVENT_ID. */
+  ulonglong m_end_event_id;
+  /** Column NESTING_EVENT_ID. */
+  ulonglong m_nesting_event_id;
+  /** Column NESTING_EVENT_TYPE. */
+  enum_event_type m_nesting_event_type;
+  /** Column EVENT_NAME. */
+  const char *m_name;
+  /** Length in bytes of @c m_name. */
+  uint m_name_length;
+  /** Column TIMER_START. */
+  ulonglong m_timer_start;
+  /** Column TIMER_END. */
+  ulonglong m_timer_end;
+  /** Column TIMER_WAIT. */
+  ulonglong m_timer_wait;
+  /** Column SOURCE (buffer of COL_SOURCE_SIZE bytes). */
+  char m_source[COL_SOURCE_SIZE];
+  /** Length in bytes of @c m_source. */
+  uint m_source_length;
+  /** InnoDB transaction id. */
+  ulonglong m_trxid;
+  /** Transaction state. */
+  enum_transaction_state m_state;
+  /** Global Transaction ID, as text (buffer of GTID_MAX_STR_LENGTH + 1 bytes). */
+  char m_gtid[GTID_MAX_STR_LENGTH + 1];
+  /** Length in bytes of @c m_gtid. */
+  int m_gtid_length;
+  /** XA transaction ID. */
+  PSI_xid m_xid;
+  /** XA transaction state. */
+  enum_xa_transaction_state m_xa_state;
+  /** True if XA transaction. */
+  bool m_xa;
+  /** True if autocommit transaction. */
+  bool m_autocommit;
+  /** Isolation level. */
+  enum_isolation_level m_isolation_level;
+  /** True if read-only, read-write otherwise. */
+  bool m_read_only;
+  /** Column NUMBER_OF_SAVEPOINTS. */
+  ulonglong m_savepoint_count;
+  /** Column NUMBER_OF_ROLLBACK_TO_SAVEPOINT. */
+  ulonglong m_rollback_to_savepoint_count;
+  /** Column NUMBER_OF_RELEASE_SAVEPOINT. */
+  ulonglong m_release_savepoint_count;
+};
+
+/**
+  Position of a cursor on PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_HISTORY.
+  Index 1 on thread (0 based)
+  Index 2 on transaction event record in thread history (0 based)
+*/
+struct pos_events_transactions_history : public PFS_double_index
+{
+  pos_events_transactions_history()
+    : PFS_double_index(0, 0) /* Starts at thread 0, record 0. */
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0; /* Back to the first thread. */
+    m_index_2= 0; /* Back to the first record of that thread's history. */
+  }
+
+  inline void next_thread(void)
+  {
+    m_index_1++; /* Move on to the following thread... */
+    m_index_2= 0; /* ...and restart at its first history record. */
+  }
+};
+
+/**
+  Adapter, for tables sharing the structure of
+  PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_CURRENT.
+*/
+class table_events_transactions_common : public PFS_engine_table
+{
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all); /* NOTE(review): presumably copies m_row into the record buffer; defined outside this header. */
+
+  table_events_transactions_common(const PFS_engine_table_share *share, void *pos);
+
+  ~table_events_transactions_common()
+  {}
+
+  void make_row(PFS_events_transactions *statement); /* Despite its name, the parameter is a transaction event record. */
+
+  /** Current row. */
+  row_events_transactions m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_CURRENT. */
+class table_events_transactions_current : public table_events_transactions_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create(); /* NOTE(review): presumably the factory for cursors on this table; defined outside this header. */
+  static int delete_all_rows(); /* NOTE(review): presumably the TRUNCATE hook; confirm against the definition. */
+  static ha_rows get_row_count(); /* NOTE(review): presumably a row count estimate; confirm against the definition. */
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_transactions_current(); /* Protected: not constructible from unrelated code. */
+
+public:
+  ~table_events_transactions_current()
+  {}
+
+private:
+  friend class table_events_transactions_history; /* Friend: uses the private m_field_def declared below. */
+  friend class table_events_transactions_history_long; /* Friend: uses the private m_field_def declared below. */
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /**
+    Fields definition.
+    Also used by table_events_transactions_history
+    and table_events_transactions_history_long.
+  */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_HISTORY. */
+class table_events_transactions_history : public table_events_transactions_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create(); /* NOTE(review): presumably the factory for cursors on this table; defined outside this header. */
+  static int delete_all_rows(); /* NOTE(review): presumably the TRUNCATE hook; confirm against the definition. */
+  static ha_rows get_row_count(); /* NOTE(review): presumably a row count estimate; confirm against the definition. */
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_transactions_history(); /* Protected: not constructible from unrelated code. */
+
+public:
+  ~table_events_transactions_history()
+  {}
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+
+  /** Current position (two indexes: thread, then record in that thread's history). */
+  pos_events_transactions_history m_pos;
+  /** Next position (same two-index layout as @c m_pos). */
+  pos_events_transactions_history m_next_pos;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_TRANSACTIONS_HISTORY_LONG. */
+class table_events_transactions_history_long : public table_events_transactions_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create(); /* NOTE(review): presumably the factory for cursors on this table; defined outside this header. */
+  static int delete_all_rows(); /* Calls reset_events_transactions_history_long() and returns 0. */
+  static ha_rows get_row_count(); /* Returns events_transactions_history_long_size. */
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next(); /* Returns 0 with a row built, or HA_ERR_END_OF_FILE. */
+  virtual int rnd_pos(const void *pos); /* Returns 0 with a row built, or HA_ERR_RECORD_DELETED. */
+  virtual void reset_position(void);
+
+protected:
+  table_events_transactions_history_long(); /* Protected: not constructible from unrelated code. */
+
+public:
+  ~table_events_transactions_history_long()
+  {}
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+
+  /** Current position (single index into the global history array). */
+  PFS_simple_index m_pos;
+  /** Next position (where the next rnd_next() call starts scanning). */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_events_waits.cc b/storage/perfschema/table_events_waits.cc
index ce632cde0c1..a345c6fdf3a 100644
--- a/storage/perfschema/table_events_waits.cc
+++ b/storage/perfschema/table_events_waits.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_events_waits.h"
 #include "pfs_global.h"
 #include "pfs_instr_class.h"
@@ -34,6 +34,8 @@
 #include "pfs_events_waits.h"
 #include "pfs_timer.h"
 #include "m_string.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_events_waits_current::m_table_lock;
 
@@ -42,11 +44,10 @@ table_events_waits_current::m_share=
 {
   { C_STRING_WITH_LEN("events_waits_current") },
   &pfs_truncatable_acl,
-  &table_events_waits_current::create,
+  table_events_waits_current::create,
   NULL, /* write_row */
-  &table_events_waits_current::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_waits_current::delete_all_rows,
+  table_events_waits_current::get_row_count,
   sizeof(pos_events_waits_current), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_current("
@@ -65,10 +66,11 @@ table_events_waits_current::m_share=
                       "OBJECT_TYPE VARCHAR(64) comment 'FILE for a file object, TABLE or TEMPORARY TABLE for a table object, or NULL for a synchronization object.',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the object.',"
                       "NESTING_EVENT_ID BIGINT unsigned comment 'EVENT_ID of event within which this event nests.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.',"
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.',"
                       "OPERATION VARCHAR(32) not null comment 'Operation type, for example read, write or lock',"
                       "NUMBER_OF_BYTES BIGINT comment 'Number of bytes that the operation read or wrote, or NULL for table I/O waits.',"
-                      "FLAGS INTEGER unsigned comment 'Reserved for use in the future.')") }
+                      "FLAGS INTEGER unsigned comment 'Reserved for use in the future.')") },
+  false  /* perpetual */
 };
 
 THR_LOCK table_events_waits_history::m_table_lock;
@@ -78,11 +80,10 @@ table_events_waits_history::m_share=
 {
   { C_STRING_WITH_LEN("events_waits_history") },
   &pfs_truncatable_acl,
-  &table_events_waits_history::create,
+  table_events_waits_history::create,
   NULL, /* write_row */
-  &table_events_waits_history::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_waits_history::delete_all_rows,
+  table_events_waits_history::get_row_count,
   sizeof(pos_events_waits_history), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_history("
@@ -101,10 +102,11 @@ table_events_waits_history::m_share=
                       "OBJECT_TYPE VARCHAR(64) comment 'FILE for a file object, TABLE or TEMPORARY TABLE for a table object, or NULL for a synchronization object.',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the object.',"
                       "NESTING_EVENT_ID BIGINT unsigned comment 'EVENT_ID of event within which this event nests.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.',"
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.',"
                       "OPERATION VARCHAR(32) not null comment 'Operation type, for example read, write or lock',"
                       "NUMBER_OF_BYTES BIGINT comment 'Number of bytes that the operation read or wrote, or NULL for table I/O waits.',"
-                      "FLAGS INTEGER unsigned comment 'Reserved for use in the future.')") }
+                      "FLAGS INTEGER unsigned comment 'Reserved for use in the future.')") },
+  false  /* perpetual */
 };
 
 THR_LOCK table_events_waits_history_long::m_table_lock;
@@ -114,11 +116,10 @@ table_events_waits_history_long::m_share=
 {
   { C_STRING_WITH_LEN("events_waits_history_long") },
   &pfs_truncatable_acl,
-  &table_events_waits_history_long::create,
+  table_events_waits_history_long::create,
   NULL, /* write_row */
-  &table_events_waits_history_long::delete_all_rows,
-  NULL, /* get_row_count */
-  10000, /* records */
+  table_events_waits_history_long::delete_all_rows,
+  table_events_waits_history_long::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_history_long("
@@ -137,10 +138,11 @@ table_events_waits_history_long::m_share=
                       "OBJECT_TYPE VARCHAR(64) comment 'FILE for a file object, TABLE or TEMPORARY TABLE for a table object, or NULL for a synchronization object.',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the object.',"
                       "NESTING_EVENT_ID BIGINT unsigned comment 'EVENT_ID of event within which this event nests.',"
-                      "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.',"
+                      "NESTING_EVENT_TYPE ENUM('TRANSACTION', 'STATEMENT', 'STAGE', 'WAIT') comment 'Nesting event type. Either statement, stage or wait.',"
                       "OPERATION VARCHAR(32) not null comment 'Operation type, for example read, write or lock',"
                       "NUMBER_OF_BYTES BIGINT comment 'Number of bytes that the operation read or wrote, or NULL for table I/O waits.',"
-                      "FLAGS INTEGER unsigned comment 'Reserved for use in the future.')") }
+                      "FLAGS INTEGER unsigned comment 'Reserved for use in the future.')") },
+  false  /* perpetual */
 };
 
 table_events_waits_common::table_events_waits_common
@@ -151,15 +153,13 @@ table_events_waits_common::table_events_waits_common
 
 void table_events_waits_common::clear_object_columns()
 {
-  m_row.m_object_type= NULL;
   m_row.m_object_type_length= 0;
   m_row.m_object_schema_length= 0;
   m_row.m_object_name_length= 0;
   m_row.m_index_name_length= 0;
-  m_row.m_object_instance_addr= 0;
 }
 
-int table_events_waits_common::make_table_object_columns(volatile PFS_events_waits *wait)
+int table_events_waits_common::make_table_object_columns(PFS_events_waits *wait)
 {
   uint safe_index;
   PFS_table_share *safe_table_share;
@@ -200,15 +200,28 @@ int table_events_waits_common::make_table_object_columns(volatile PFS_events_wai
     uint safe_key_count= sanitize_index_count(safe_table_share->m_key_count);
     if (safe_index < safe_key_count)
     {
-      PFS_table_key *key= & safe_table_share->m_keys[safe_index];
-      m_row.m_index_name_length= key->m_name_length;
-      if (unlikely((m_row.m_index_name_length == 0) ||
-                   (m_row.m_index_name_length > sizeof(m_row.m_index_name))))
-        return 1;
-      memcpy(m_row.m_index_name, key->m_name, m_row.m_index_name_length);
+      PFS_table_share_index *index_stat;
+      index_stat= safe_table_share->find_index_stat(safe_index);
+
+      if (index_stat != NULL)
+      {
+        m_row.m_index_name_length= index_stat->m_key.m_name_length;
+
+        if (unlikely((m_row.m_index_name_length == 0) ||
+                     (m_row.m_index_name_length > sizeof(m_row.m_index_name))))
+          return 1;
+
+        memcpy(m_row.m_index_name, index_stat->m_key.m_name, m_row.m_index_name_length);
+      }
+      else
+      {
+        m_row.m_index_name_length= 0;
+      }
     }
     else
+    {
       m_row.m_index_name_length= 0;
+    }
   }
   else
   {
@@ -221,7 +234,7 @@ int table_events_waits_common::make_table_object_columns(volatile PFS_events_wai
   return 0;
 }
 
-int table_events_waits_common::make_file_object_columns(volatile PFS_events_waits *wait)
+int table_events_waits_common::make_file_object_columns(PFS_events_waits *wait)
 {
   PFS_file *safe_file;
 
@@ -253,7 +266,7 @@ int table_events_waits_common::make_file_object_columns(volatile PFS_events_wait
   return 0;
 }
 
-int table_events_waits_common::make_socket_object_columns(volatile PFS_events_waits *wait)
+int table_events_waits_common::make_socket_object_columns(PFS_events_waits *wait)
 {
   PFS_socket *safe_socket;
 
@@ -286,10 +299,10 @@ int table_events_waits_common::make_socket_object_columns(volatile PFS_events_wa
                                    safe_socket->m_addr_len);
 
     /* Convert port number to a string (length includes ':') */
-    int port_len= (int)(int10_to_str(port, (port_str+1), 10) - port_str + 1);
+    size_t port_len= int10_to_str(port, (port_str+1), 10) - port_str + 1;
 
     /* OBJECT NAME */
-    m_row.m_object_name_length= ip_length + port_len;
+    m_row.m_object_name_length= ip_length + static_cast<uint>(port_len);
 
     if (unlikely((m_row.m_object_name_length == 0) ||
                  (m_row.m_object_name_length > sizeof(m_row.m_object_name))))
@@ -309,33 +322,122 @@ int table_events_waits_common::make_socket_object_columns(volatile PFS_events_wa
   return 0;
 }
 
+int table_events_waits_common::make_metadata_lock_object_columns(PFS_events_waits *wait)
+{
+  PFS_metadata_lock *safe_metadata_lock;
+  /* Fill the object type, schema, name, address and index name fields of m_row for a metadata lock wait; returns 1 on failure, 0 otherwise. */
+  safe_metadata_lock= sanitize_metadata_lock(wait->m_weak_metadata_lock);
+  if (unlikely(safe_metadata_lock == NULL)) /* The weak pointer did not pass sanitization: no row can be built. */
+    return 1;
+
+  if (safe_metadata_lock->get_version() == wait->m_weak_version) /* Use the lock's key only when its version matches the one saved in the wait. */
+  {
+    MDL_key *mdl= & safe_metadata_lock->m_mdl_key; /* Key the schema and object names are copied from below. */
+    MDL_key user_lock_workaround; /* Scratch key, only initialized in the USER_LOCK case. */
+
+    switch(mdl->mdl_namespace())
+    {
+    case MDL_key::BACKUP: /* Object type only: neither schema nor object name. */
+      m_row.m_object_type= "BACKUP";
+      m_row.m_object_type_length= 6;
+      m_row.m_object_schema_length= 0;
+      m_row.m_object_name_length= 0;
+      break;
+    case MDL_key::SCHEMA: /* Schema name only. */
+      m_row.m_object_type= "SCHEMA";
+      m_row.m_object_type_length= 6;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= 0;
+      break;
+    case MDL_key::TABLE: /* This case and the five that follow expose both schema and object name. */
+      m_row.m_object_type= "TABLE";
+      m_row.m_object_type_length= 5;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= mdl->name_length();
+      break;
+    case MDL_key::FUNCTION:
+      m_row.m_object_type= "FUNCTION";
+      m_row.m_object_type_length= 8;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= mdl->name_length();
+      break;
+    case MDL_key::PROCEDURE:
+      m_row.m_object_type= "PROCEDURE";
+      m_row.m_object_type_length= 9;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= mdl->name_length();
+      break;
+    case MDL_key::PACKAGE_BODY:
+      m_row.m_object_type= "PACKAGE_BODY";
+      m_row.m_object_type_length= 12;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= mdl->name_length();
+      break;
+    case MDL_key::TRIGGER:
+      m_row.m_object_type= "TRIGGER";
+      m_row.m_object_type_length= 7;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= mdl->name_length();
+      break;
+    case MDL_key::EVENT:
+      m_row.m_object_type= "EVENT";
+      m_row.m_object_type_length= 5;
+      m_row.m_object_schema_length= mdl->db_name_length();
+      m_row.m_object_name_length= mdl->name_length();
+      break;
+    case MDL_key::USER_LOCK: /* Reported with an object name but no schema. */
+      m_row.m_object_type= "USER LEVEL LOCK";
+      m_row.m_object_type_length= 15;
+      user_lock_workaround.mdl_key_init(MDL_key::USER_LOCK, "", mdl->db_name()); /* NOTE(review): arguments presumably (namespace, db, name), moving the original db_name() into name(). */
+      mdl=& user_lock_workaround; /* The name copy after the switch now reads from the rebuilt key. */
+      m_row.m_object_schema_length= 0;
+      m_row.m_object_name_length= mdl->name_length(); /* Length taken from the rebuilt key. */
+      break;
+    case MDL_key::NAMESPACE_END:
+    default: /* Any other namespace: all three lengths are zeroed. */
+      m_row.m_object_type_length= 0;
+      m_row.m_object_schema_length= 0;
+      m_row.m_object_name_length= 0;
+      break;
+    }
+
+    if (m_row.m_object_schema_length > sizeof(m_row.m_object_schema)) /* Refuse a length that would overflow the row buffer. */
+      return 1;
+    if (m_row.m_object_schema_length > 0)
+      memcpy(m_row.m_object_schema, mdl->db_name(), m_row.m_object_schema_length);
+
+    if (m_row.m_object_name_length > sizeof(m_row.m_object_name)) /* Refuse a length that would overflow the row buffer. */
+      return 1;
+    if (m_row.m_object_name_length > 0)
+      memcpy(m_row.m_object_name, mdl->name(), m_row.m_object_name_length);
+
+    m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
+  }
+  else /* Versions differ: the object fields are zeroed, and the function still returns 0. */
+  {
+    m_row.m_object_type_length= 0;
+    m_row.m_object_schema_length= 0;
+    m_row.m_object_name_length= 0;
+    m_row.m_object_instance_addr= 0;
+  }
+
+  /* INDEX NAME */
+  m_row.m_index_name_length= 0; /* Always empty on this code path. */
+
+  return 0;
+}
+
 /**
   Build a row.
-  @param thread_own_wait            True if the memory for the wait
-    is owned by pfs_thread
-  @param pfs_thread                 the thread the cursor is reading
   @param wait                       the wait the cursor is reading
 */
-void table_events_waits_common::make_row(bool thread_own_wait,
-                                         PFS_thread *pfs_thread,
-                                         volatile PFS_events_waits *wait)
+void table_events_waits_common::make_row(PFS_events_waits *wait)
 {
-  pfs_lock lock;
-  PFS_thread *safe_thread;
   PFS_instr_class *safe_class;
-  const char *base;
-  const char *safe_source_file;
   enum_timer_name timer_name= wait_timer;
   ulonglong timer_end;
 
   m_row_exists= false;
-  safe_thread= sanitize_thread(pfs_thread);
-  if (unlikely(safe_thread == NULL))
-    return;
-
-  /* Protect this reader against a thread termination */
-  if (thread_own_wait)
-    safe_thread->m_lock.begin_optimistic_lock(&lock);
 
   /*
     Design choice:
@@ -362,21 +464,30 @@ void table_events_waits_common::make_row(bool thread_own_wait,
   */
   switch (wait->m_wait_class)
   {
+  case WAIT_CLASS_METADATA:
+    if (make_metadata_lock_object_columns(wait))
+      return;
+    safe_class= sanitize_metadata_class(wait->m_class);
+    break;
   case WAIT_CLASS_IDLE:
     clear_object_columns();
+    m_row.m_object_instance_addr= 0;
     safe_class= sanitize_idle_class(wait->m_class);
     timer_name= idle_timer;
     break;
   case WAIT_CLASS_MUTEX:
     clear_object_columns();
+    m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
     safe_class= sanitize_mutex_class((PFS_mutex_class*) wait->m_class);
     break;
   case WAIT_CLASS_RWLOCK:
     clear_object_columns();
+    m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
     safe_class= sanitize_rwlock_class((PFS_rwlock_class*) wait->m_class);
     break;
   case WAIT_CLASS_COND:
     clear_object_columns();
+    m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
     safe_class= sanitize_cond_class((PFS_cond_class*) wait->m_class);
     break;
   case WAIT_CLASS_TABLE:
@@ -402,7 +513,7 @@ void table_events_waits_common::make_row(bool thread_own_wait,
   if (unlikely(safe_class == NULL))
     return;
 
-  m_row.m_thread_internal_id= safe_thread->m_thread_internal_id;
+  m_row.m_thread_internal_id= wait->m_thread_internal_id;
   m_row.m_event_id= wait->m_event_id;
   m_row.m_end_event_id= wait->m_end_event_id;
   m_row.m_nesting_event_id= wait->m_nesting_event_id;
@@ -425,39 +536,14 @@ void table_events_waits_common::make_row(bool thread_own_wait,
   m_row.m_name= safe_class->m_name;
   m_row.m_name_length= safe_class->m_name_length;
 
-  /*
-    We are assuming this pointer is sane,
-    since it comes from __FILE__.
-  */
-  safe_source_file= wait->m_source_file;
-  if (unlikely(safe_source_file == NULL))
-    return;
+  /* Disable source file and line to avoid stale __FILE__ pointers. */
+  m_row.m_source_length= 0;
 
-  base= base_name(wait->m_source_file);
-  m_row.m_source_length= (uint)my_snprintf(m_row.m_source, sizeof(m_row.m_source),
-                                     "%s:%d", base, wait->m_source_line);
-  if (m_row.m_source_length > sizeof(m_row.m_source))
-    m_row.m_source_length= sizeof(m_row.m_source);
   m_row.m_operation= wait->m_operation;
   m_row.m_number_of_bytes= wait->m_number_of_bytes;
   m_row.m_flags= wait->m_flags;
 
-  if (thread_own_wait)
-  {
-    if (safe_thread->m_lock.end_optimistic_lock(&lock))
-      m_row_exists= true;
-  }
-  else
-  {
-    /*
-      For EVENTS_WAITS_HISTORY_LONG (thread_own_wait is false),
-      the wait record is always valid, because it is not stored
-      in memory owned by pfs_thread.
-      Even when the thread terminated, the record is mostly readable,
-      so this record is displayed.
-    */
-    m_row_exists= true;
-  }
+  m_row_exists= true;
 }
 
 /**
@@ -474,12 +560,20 @@ static const LEX_STRING operation_names_map[]=
   { C_STRING_WITH_LEN("lock") },
   { C_STRING_WITH_LEN("try_lock") },
 
-  /* RWLock operations */
+  /* RWLock operations (RW-lock) */
   { C_STRING_WITH_LEN("read_lock") },
   { C_STRING_WITH_LEN("write_lock") },
   { C_STRING_WITH_LEN("try_read_lock") },
   { C_STRING_WITH_LEN("try_write_lock") },
 
+  /* RWLock operations (SX-lock) */
+  { C_STRING_WITH_LEN("shared_lock") },
+  { C_STRING_WITH_LEN("shared_exclusive_lock") },
+  { C_STRING_WITH_LEN("exclusive_lock") },
+  { C_STRING_WITH_LEN("try_shared_lock") },
+  { C_STRING_WITH_LEN("try_shared_exclusive_lock") },
+  { C_STRING_WITH_LEN("try_exclusive_lock") },
+
   /* Condition operations */
   { C_STRING_WITH_LEN("wait") },
   { C_STRING_WITH_LEN("timed_wait") },
@@ -540,7 +634,10 @@ static const LEX_STRING operation_names_map[]=
   { C_STRING_WITH_LEN("select") },
 
   /* Idle operations */
-  { C_STRING_WITH_LEN("idle") }
+  { C_STRING_WITH_LEN("idle") },
+
+  /* Metadata lock operations */
+  { C_STRING_WITH_LEN("metadata lock") }
 };
 
 
@@ -559,7 +656,7 @@ int table_events_waits_common::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 2);
+  assert(table->s->null_bytes == 2);
   buf[0]= 0;
   buf[1]= 0;
 
@@ -644,7 +741,7 @@ int table_events_waits_common::read_row_values(TABLE *table,
           f->set_null();
         break;
       case 12: /* OBJECT_TYPE */
-        if (m_row.m_object_type)
+        if (m_row.m_object_type_length > 0)
         {
           set_field_varchar_utf8(f, m_row.m_object_type,
                                  m_row.m_object_type_length);
@@ -671,14 +768,18 @@ int table_events_waits_common::read_row_values(TABLE *table,
         operation= &operation_names_map[(int) m_row.m_operation - 1];
         set_field_varchar_utf8(f, operation->str, (uint)operation->length);
         break;
-      case 17: /* NUMBER_OF_BYTES */
+      case 17: /* NUMBER_OF_BYTES (also used for ROWS) */
         if ((m_row.m_operation == OPERATION_TYPE_FILEREAD) ||
             (m_row.m_operation == OPERATION_TYPE_FILEWRITE) ||
             (m_row.m_operation == OPERATION_TYPE_FILECHSIZE) ||
             (m_row.m_operation == OPERATION_TYPE_SOCKETSEND) ||
             (m_row.m_operation == OPERATION_TYPE_SOCKETRECV) ||
             (m_row.m_operation == OPERATION_TYPE_SOCKETSENDTO) ||
-            (m_row.m_operation == OPERATION_TYPE_SOCKETRECVFROM))
+            (m_row.m_operation == OPERATION_TYPE_SOCKETRECVFROM) ||
+            (m_row.m_operation == OPERATION_TYPE_TABLE_FETCH) ||
+            (m_row.m_operation == OPERATION_TYPE_TABLE_WRITE_ROW) ||
+            (m_row.m_operation == OPERATION_TYPE_TABLE_UPDATE_ROW) ||
+            (m_row.m_operation == OPERATION_TYPE_TABLE_DELETE_ROW))
           set_field_ulonglong(f, m_row.m_number_of_bytes);
         else
           f->set_null();
@@ -687,7 +788,7 @@ int table_events_waits_common::read_row_values(TABLE *table,
         f->set_null();
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -714,28 +815,77 @@ int table_events_waits_current::rnd_next(void)
 {
   PFS_thread *pfs_thread;
   PFS_events_waits *wait;
+  bool has_more_thread= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index_1 < thread_max;
+       has_more_thread;
        m_pos.next_thread())
   {
-    pfs_thread= &thread_array[m_pos.m_index_1];
-
-    if (! pfs_thread->m_lock.is_populated())
+    pfs_thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (pfs_thread != NULL)
     {
-      /* This thread does not exist */
-      continue;
+      /*
+        We do not show nested events for now,
+        this will be revised with TABLE io
+      */
+// #define ONLY_SHOW_ONE_WAIT
+
+#ifdef ONLY_SHOW_ONE_WAIT
+      if (m_pos.m_index_2 >= 1)
+        continue;
+#else
+      /* m_events_waits_stack[0] is a dummy record */
+      PFS_events_waits *top_wait = &pfs_thread->m_events_waits_stack[WAIT_STACK_BOTTOM];
+      wait= &pfs_thread->m_events_waits_stack[m_pos.m_index_2 + WAIT_STACK_BOTTOM];
+
+      PFS_events_waits *safe_current = pfs_thread->m_events_waits_current;
+
+      if (safe_current == top_wait)
+      {
+        /* Display the last top level wait, when completed */
+        if (m_pos.m_index_2 >= 1)
+          continue;
+      }
+      else
+      {
+        /* Display all pending waits, when in progress */
+        if (wait >= safe_current)
+          continue;
+      }
+#endif
+
+      if (wait->m_wait_class == NO_WAIT_CLASS)
+      {
+        /*
+          This locker does not exist.
+          There can not be more lockers in the stack, skip to the next thread
+        */
+        continue;
+      }
+
+      make_row(pfs_thread, wait);
+      /* Next iteration, look for the next locker in this thread */
+      m_next_pos.set_after(&m_pos);
+      return 0;
     }
+  }
 
-    /*
-      We do not show nested events for now,
-      this will be revised with TABLE io
-    */
-// #define ONLY_SHOW_ONE_WAIT
+  return HA_ERR_END_OF_FILE;
+}
 
+int table_events_waits_current::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_waits *wait;
+
+  set_position(pos);
+
+  pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread != NULL)
+  {
 #ifdef ONLY_SHOW_ONE_WAIT
     if (m_pos.m_index_2 >= 1)
-      continue;
+      return HA_ERR_RECORD_DELETED;
 #else
     /* m_events_waits_stack[0] is a dummy record */
     PFS_events_waits *top_wait = &pfs_thread->m_events_waits_stack[WAIT_STACK_BOTTOM];
@@ -747,77 +897,39 @@ int table_events_waits_current::rnd_next(void)
     {
       /* Display the last top level wait, when completed */
       if (m_pos.m_index_2 >= 1)
-        continue;
+        return HA_ERR_RECORD_DELETED;
     }
     else
     {
       /* Display all pending waits, when in progress */
       if (wait >= safe_current)
-        continue;
+        return HA_ERR_RECORD_DELETED;
     }
 #endif
 
-    if (wait->m_wait_class == NO_WAIT_CLASS)
+    assert(m_pos.m_index_2 < WAIT_STACK_LOGICAL_SIZE);
+
+    if (wait->m_wait_class != NO_WAIT_CLASS)
     {
-      /*
-        This locker does not exist.
-        There can not be more lockers in the stack, skip to the next thread
-      */
-      continue;
+      make_row(pfs_thread, wait);
+      return 0;
     }
-
-    make_row(true, pfs_thread, wait);
-    /* Next iteration, look for the next locker in this thread */
-    m_next_pos.set_after(&m_pos);
-    return 0;
   }
 
-  return HA_ERR_END_OF_FILE;
+  return HA_ERR_RECORD_DELETED;
 }
 
-int table_events_waits_current::rnd_pos(const void *pos)
+void table_events_waits_current::make_row(PFS_thread *thread, PFS_events_waits *wait)
 {
-  PFS_thread *pfs_thread;
-  PFS_events_waits *wait;
-
-  set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-  pfs_thread= &thread_array[m_pos.m_index_1];
-
-  if (! pfs_thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-#ifdef ONLY_SHOW_ONE_WAIT
-  if (m_pos.m_index_2 >= 1)
-    return HA_ERR_RECORD_DELETED;
-#else
-  /* m_events_waits_stack[0] is a dummy record */
-  PFS_events_waits *top_wait = &pfs_thread->m_events_waits_stack[WAIT_STACK_BOTTOM];
-  wait= &pfs_thread->m_events_waits_stack[m_pos.m_index_2 + WAIT_STACK_BOTTOM];
-
-  PFS_events_waits *safe_current = pfs_thread->m_events_waits_current;
-
-  if (safe_current == top_wait)
-  {
-    /* Display the last top level wait, when completed */
-    if (m_pos.m_index_2 >= 1)
-      return HA_ERR_RECORD_DELETED;
-  }
-  else
-  {
-    /* Display all pending waits, when in progress */
-    if (wait >= safe_current)
-      return HA_ERR_RECORD_DELETED;
-  }
-#endif
+  pfs_optimistic_state lock;
 
-  DBUG_ASSERT(m_pos.m_index_2 < WAIT_STACK_LOGICAL_SIZE);
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
 
-  if (wait->m_wait_class == NO_WAIT_CLASS)
-    return HA_ERR_RECORD_DELETED;
+  table_events_waits_common::make_row(wait);
 
-  make_row(true, pfs_thread, wait);
-  return 0;
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= false;
 }
 
 int table_events_waits_current::delete_all_rows(void)
@@ -826,6 +938,12 @@ int table_events_waits_current::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_events_waits_current::get_row_count(void)
+{
+  return WAIT_STACK_SIZE * global_thread_container.get_row_count();
+}
+
 PFS_engine_table* table_events_waits_history::create(void)
 {
   return new table_events_waits_history();
@@ -846,51 +964,40 @@ int table_events_waits_history::rnd_next(void)
 {
   PFS_thread *pfs_thread;
   PFS_events_waits *wait;
+  bool has_more_thread= true;
 
   if (events_waits_history_per_thread == 0)
     return HA_ERR_END_OF_FILE;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index_1 < thread_max;
+       has_more_thread;
        m_pos.next_thread())
   {
-    pfs_thread= &thread_array[m_pos.m_index_1];
-
-    if (! pfs_thread->m_lock.is_populated())
+    pfs_thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (pfs_thread != NULL)
     {
-      /* This thread does not exist */
-      continue;
-    }
-
-    if (m_pos.m_index_2 >= events_waits_history_per_thread)
-    {
-      /* This thread does not have more (full) history */
-      continue;
-    }
+      if (m_pos.m_index_2 >= events_waits_history_per_thread)
+      {
+        /* This thread does not have more (full) history */
+        continue;
+      }
 
-    if ( ! pfs_thread->m_waits_history_full &&
-        (m_pos.m_index_2 >= pfs_thread->m_waits_history_index))
-    {
-      /* This thread does not have more (not full) history */
-      continue;
-    }
+      if ( ! pfs_thread->m_waits_history_full &&
+          (m_pos.m_index_2 >= pfs_thread->m_waits_history_index))
+      {
+        /* This thread does not have more (not full) history */
+        continue;
+      }
 
-    if (pfs_thread->m_waits_history[m_pos.m_index_2].m_wait_class
-        == NO_WAIT_CLASS)
-    {
-      /*
-        This locker does not exist.
-        There can not be more lockers in the stack, skip to the next thread
-      */
-      continue;
+      wait= &pfs_thread->m_waits_history[m_pos.m_index_2];
+      if (wait->m_wait_class != NO_WAIT_CLASS)
+      {
+        make_row(pfs_thread, wait);
+        /* Next iteration, look for the next history in this thread */
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
     }
-
-    wait= &pfs_thread->m_waits_history[m_pos.m_index_2];
-
-    make_row(true, pfs_thread, wait);
-    /* Next iteration, look for the next history in this thread */
-    m_next_pos.set_after(&m_pos);
-    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -901,27 +1008,41 @@ int table_events_waits_history::rnd_pos(const void *pos)
   PFS_thread *pfs_thread;
   PFS_events_waits *wait;
 
-  DBUG_ASSERT(events_waits_history_per_thread != 0);
+  assert(events_waits_history_per_thread != 0);
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-  pfs_thread= &thread_array[m_pos.m_index_1];
 
-  if (! pfs_thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread != NULL)
+  {
+    assert(m_pos.m_index_2 < events_waits_history_per_thread);
+
+    if ( ! pfs_thread->m_waits_history_full &&
+        (m_pos.m_index_2 >= pfs_thread->m_waits_history_index))
+      return HA_ERR_RECORD_DELETED;
 
-  DBUG_ASSERT(m_pos.m_index_2 < events_waits_history_per_thread);
+    wait= &pfs_thread->m_waits_history[m_pos.m_index_2];
 
-  if ( ! pfs_thread->m_waits_history_full &&
-      (m_pos.m_index_2 >= pfs_thread->m_waits_history_index))
-    return HA_ERR_RECORD_DELETED;
+    if (wait->m_wait_class != NO_WAIT_CLASS)
+    {
+      make_row(pfs_thread, wait);
+      return 0;
+    }
+  }
 
-  wait= &pfs_thread->m_waits_history[m_pos.m_index_2];
+  return HA_ERR_RECORD_DELETED;
+}
 
-  if (wait->m_wait_class == NO_WAIT_CLASS)
-    return HA_ERR_RECORD_DELETED;
+void table_events_waits_history::make_row(PFS_thread *thread, PFS_events_waits *wait)
+{
+  pfs_optimistic_state lock;
 
-  make_row(true, pfs_thread, wait);
-  return 0;
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  table_events_waits_common::make_row(wait);
+
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= false;
 }
 
 int table_events_waits_history::delete_all_rows(void)
@@ -930,6 +1051,12 @@ int table_events_waits_history::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_events_waits_history::get_row_count(void)
+{
+  return events_waits_history_per_thread * global_thread_container.get_row_count();
+}
+
 PFS_engine_table* table_events_waits_history_long::create(void)
 {
   return new table_events_waits_history_long();
@@ -957,7 +1084,7 @@ int table_events_waits_history_long::rnd_next(void)
   if (events_waits_history_long_full)
     limit= events_waits_history_long_size;
   else
-    limit= events_waits_history_long_index % events_waits_history_long_size;
+    limit= events_waits_history_long_index.m_u32 % events_waits_history_long_size;
 
   for (m_pos.set_at(&m_next_pos); m_pos.m_index < limit; m_pos.next())
   {
@@ -965,7 +1092,7 @@ int table_events_waits_history_long::rnd_next(void)
 
     if (wait->m_wait_class != NO_WAIT_CLASS)
     {
-      make_row(false, wait->m_thread, wait);
+      make_row(wait);
       /* Next iteration, look for the next entry */
       m_next_pos.set_after(&m_pos);
       return 0;
@@ -988,7 +1115,7 @@ int table_events_waits_history_long::rnd_pos(const void *pos)
   if (events_waits_history_long_full)
     limit= events_waits_history_long_size;
   else
-    limit= events_waits_history_long_index % events_waits_history_long_size;
+    limit= events_waits_history_long_index.m_u32 % events_waits_history_long_size;
 
   if (m_pos.m_index >= limit)
     return HA_ERR_RECORD_DELETED;
@@ -998,7 +1125,7 @@ int table_events_waits_history_long::rnd_pos(const void *pos)
   if (wait->m_wait_class == NO_WAIT_CLASS)
     return HA_ERR_RECORD_DELETED;
 
-  make_row(false, wait->m_thread, wait);
+  make_row(wait);
   return 0;
 }
 
@@ -1008,3 +1135,9 @@ int table_events_waits_history_long::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_events_waits_history_long::get_row_count(void)
+{
+  return events_waits_history_long_size;
+}
+
diff --git a/storage/perfschema/table_events_waits.h b/storage/perfschema/table_events_waits.h
index 90c1d341e5d..82d6f56bbb7 100644
--- a/storage/perfschema/table_events_waits.h
+++ b/storage/perfschema/table_events_waits.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -150,12 +150,12 @@ protected:
   {}
 
   void clear_object_columns();
-  int make_table_object_columns(volatile PFS_events_waits *wait);
-  int make_file_object_columns(volatile PFS_events_waits *wait);
-  int make_socket_object_columns(volatile PFS_events_waits *wait);
+  int make_table_object_columns(PFS_events_waits *wait);
+  int make_file_object_columns(PFS_events_waits *wait);
+  int make_socket_object_columns(PFS_events_waits *wait);
+  int make_metadata_lock_object_columns(PFS_events_waits *wait);
 
-  void make_row(bool thread_own_wait, PFS_thread *pfs_thread,
-                volatile PFS_events_waits *wait);
+  void make_row(PFS_events_waits *wait);
 
   /** Current row. */
   row_events_waits m_row;
@@ -171,6 +171,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -190,6 +191,8 @@ private:
   /** Table share lock. */
   static THR_LOCK m_table_lock;
 
+  void make_row(PFS_thread *thread, PFS_events_waits *wait);
+
   /** Current position. */
   pos_events_waits_current m_pos;
   /** Next position. */
@@ -204,6 +207,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -220,6 +224,8 @@ private:
   /** Table share lock. */
   static THR_LOCK m_table_lock;
 
+  void make_row(PFS_thread *thread, PFS_events_waits *wait);
+
   /** Current position. */
   pos_events_waits_history m_pos;
   /** Next position. */
@@ -234,6 +240,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_events_waits_summary.cc b/storage/perfschema/table_events_waits_summary.cc
index 01de5b25355..b9565d54460 100644
--- a/storage/perfschema/table_events_waits_summary.cc
+++ b/storage/perfschema/table_events_waits_summary.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,13 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_events_waits_summary.h"
 #include "pfs_global.h"
+#include "field.h"
 
 THR_LOCK table_events_waits_summary_by_instance::m_table_lock;
 
@@ -40,11 +41,10 @@ table_events_waits_summary_by_instance::m_share=
 {
   { C_STRING_WITH_LEN("events_waits_summary_by_instance") },
   &pfs_truncatable_acl,
-  &table_events_waits_summary_by_instance::create,
+  table_events_waits_summary_by_instance::create,
   NULL, /* write_row */
-  &table_events_waits_summary_by_instance::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_events_waits_summary_by_instance::delete_all_rows,
+  table_all_instr::get_row_count,
   sizeof(pos_all_instr),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_summary_by_instance("
@@ -54,7 +54,8 @@ table_events_waits_summary_by_instance::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_events_waits_summary_by_instance::create(void)
@@ -78,7 +79,7 @@ void table_events_waits_summary_by_instance
                  const void *object_instance_begin,
                  PFS_single_stat *pfs_stat)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   /*
@@ -195,7 +196,7 @@ int table_events_waits_summary_by_instance
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -225,7 +226,7 @@ int table_events_waits_summary_by_instance
         set_field_ulonglong(f, m_row.m_stat.m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_events_waits_summary.h b/storage/perfschema/table_events_waits_summary.h
index 53f1bed7987..13d50a23760 100644
--- a/storage/perfschema/table_events_waits_summary.h
+++ b/storage/perfschema/table_events_waits_summary.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_ews_by_account_by_event_name.cc b/storage/perfschema/table_ews_by_account_by_event_name.cc
index e859d0aa261..007f75e52eb 100644
--- a/storage/perfschema/table_ews_by_account_by_event_name.cc
+++ b/storage/perfschema/table_ews_by_account_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_ews_by_account_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_ews_by_account_by_event_name::m_table_lock;
 
@@ -44,19 +46,19 @@ table_ews_by_account_by_event_name::m_share=
   table_ews_by_account_by_event_name::create,
   NULL, /* write_row */
   table_ews_by_account_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_ews_by_account_by_event_name::get_row_count,
   sizeof(pos_ews_by_account_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_summary_by_account_by_event_name("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'User. Used together with HOST and EVENT_NAME for grouping events.',"
-                      "HOST CHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") collate utf8_bin default null comment 'Host. Used together with USER and EVENT_NAME for grouping events.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'User. Used together with HOST and EVENT_NAME for grouping events.',"
+                      "HOST CHAR(" HOSTNAME_LENGTH_STR ") collate utf8_bin default null comment 'Host. Used together with USER and EVENT_NAME for grouping events.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Event name. Used together with USER and HOST for grouping events.',"
                       "COUNT_STAR BIGINT unsigned not null comment 'Number of summarized events',"
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -73,6 +75,12 @@ table_ews_by_account_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_ews_by_account_by_event_name::get_row_count(void)
+{
+  return global_account_container.get_row_count() * wait_class_max;
+}
+
 table_ews_by_account_by_event_name::table_ews_by_account_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -88,13 +96,14 @@ int table_ews_by_account_by_event_name::rnd_next(void)
 {
   PFS_account *account;
   PFS_instr_class *instr_class;
+  bool has_more_account= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_account();
+       has_more_account;
        m_pos.next_account())
   {
-    account= &account_array[m_pos.m_index_1];
-    if (account->m_lock.is_populated())
+    account= global_account_container.get(m_pos.m_index_1, & has_more_account);
+    if (account != NULL)
     {
       for ( ;
            m_pos.has_more_view();
@@ -123,9 +132,12 @@ int table_ews_by_account_by_event_name::rnd_next(void)
         case pos_ews_by_account_by_event_name::VIEW_IDLE:
           instr_class= find_idle_class(m_pos.m_index_3);
           break;
+        case pos_ews_by_account_by_event_name::VIEW_METADATA:
+          instr_class= find_metadata_class(m_pos.m_index_3);
+          break;
         default:
           instr_class= NULL;
-          DBUG_ASSERT(false);
+          assert(false);
           break;
         }
 
@@ -149,10 +161,9 @@ table_ews_by_account_by_event_name::rnd_pos(const void *pos)
   PFS_instr_class *instr_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < account_max);
 
-  account= &account_array[m_pos.m_index_1];
-  if (! account->m_lock.is_populated())
+  account= global_account_container.get(m_pos.m_index_1);
+  if (account == NULL)
     return HA_ERR_RECORD_DELETED;
 
   switch (m_pos.m_index_2)
@@ -178,9 +189,12 @@ table_ews_by_account_by_event_name::rnd_pos(const void *pos)
   case pos_ews_by_account_by_event_name::VIEW_IDLE:
     instr_class= find_idle_class(m_pos.m_index_3);
     break;
+  case pos_ews_by_account_by_event_name::VIEW_METADATA:
+    instr_class= find_metadata_class(m_pos.m_index_3);
+    break;
   default:
     instr_class= NULL;
-    DBUG_ASSERT(false);
+    assert(false);
   }
   if (instr_class)
   {
@@ -194,7 +208,7 @@ table_ews_by_account_by_event_name::rnd_pos(const void *pos)
 void table_ews_by_account_by_event_name
 ::make_row(PFS_account *account, PFS_instr_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   account->m_lock.begin_optimistic_lock(&lock);
@@ -205,7 +219,10 @@ void table_ews_by_account_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_wait_visitor visitor(klass);
-  PFS_connection_iterator::visit_account(account, true, & visitor);
+  PFS_connection_iterator::visit_account(account,
+                                         true,  /* threads */
+                                         false, /* THDs */
+                                         & visitor);
 
   if (! account->m_lock.end_optimistic_lock(&lock))
     return;
@@ -226,7 +243,7 @@ int table_ews_by_account_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_ews_by_account_by_event_name.h b/storage/perfschema/table_ews_by_account_by_event_name.h
index 7cde09183e3..4d626cf44cf 100644
--- a/storage/perfschema/table_ews_by_account_by_event_name.h
+++ b/storage/perfschema/table_ews_by_account_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -57,7 +57,7 @@ struct row_ews_by_account_by_event_name
 /**
   Position of a cursor on
   PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
-  Index 1 on user@host (0 based)
+  Index 1 on account (0 based)
   Index 2 on instrument view
   Index 3 on instrument class (1 based)
 */
@@ -75,9 +75,6 @@ struct pos_ews_by_account_by_event_name
     m_index_3= 1;
   }
 
-  inline bool has_more_account(void)
-  { return (m_index_1 < account_max); }
-
   inline void next_account(void)
   {
     m_index_1++;
@@ -103,6 +100,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_ews_by_host_by_event_name.cc b/storage/perfschema/table_ews_by_host_by_event_name.cc
index 2fcbbc55033..3a1ee52b6e4 100644
--- a/storage/perfschema/table_ews_by_host_by_event_name.cc
+++ b/storage/perfschema/table_ews_by_host_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -34,6 +34,8 @@
 #include "pfs_global.h"
 #include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_ews_by_host_by_event_name::m_table_lock;
 
@@ -45,8 +47,7 @@ table_ews_by_host_by_event_name::m_share=
   table_ews_by_host_by_event_name::create,
   NULL, /* write_row */
   table_ews_by_host_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_ews_by_host_by_event_name::get_row_count,
   sizeof(pos_ews_by_host_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_summary_by_host_by_event_name("
@@ -56,7 +57,8 @@ table_ews_by_host_by_event_name::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.' )") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.' )") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -74,6 +76,12 @@ table_ews_by_host_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_ews_by_host_by_event_name::get_row_count(void)
+{
+  return global_host_container.get_row_count() * wait_class_max;
+}
+
 table_ews_by_host_by_event_name::table_ews_by_host_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -89,13 +97,14 @@ int table_ews_by_host_by_event_name::rnd_next(void)
 {
   PFS_host *host;
   PFS_instr_class *instr_class;
+  bool has_more_host= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_host();
+       has_more_host;
        m_pos.next_host())
   {
-    host= &host_array[m_pos.m_index_1];
-    if (host->m_lock.is_populated())
+    host= global_host_container.get(m_pos.m_index_1, & has_more_host);
+    if (host != NULL)
     {
       for ( ;
            m_pos.has_more_view();
@@ -124,9 +133,12 @@ int table_ews_by_host_by_event_name::rnd_next(void)
         case pos_ews_by_host_by_event_name::VIEW_IDLE:
           instr_class= find_idle_class(m_pos.m_index_3);
           break;
+        case pos_ews_by_host_by_event_name::VIEW_METADATA:
+          instr_class= find_metadata_class(m_pos.m_index_3);
+          break;
         default:
           instr_class= NULL;
-          DBUG_ASSERT(false);
+          assert(false);
           break;
         }
 
@@ -150,10 +162,9 @@ table_ews_by_host_by_event_name::rnd_pos(const void *pos)
   PFS_instr_class *instr_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < host_max);
 
-  host= &host_array[m_pos.m_index_1];
-  if (! host->m_lock.is_populated())
+  host= global_host_container.get(m_pos.m_index_1);
+  if (host == NULL)
     return HA_ERR_RECORD_DELETED;
 
   switch (m_pos.m_index_2)
@@ -179,9 +190,12 @@ table_ews_by_host_by_event_name::rnd_pos(const void *pos)
   case pos_ews_by_host_by_event_name::VIEW_IDLE:
     instr_class= find_idle_class(m_pos.m_index_3);
     break;
+  case pos_ews_by_host_by_event_name::VIEW_METADATA:
+    instr_class= find_metadata_class(m_pos.m_index_3);
+    break;
   default:
     instr_class= NULL;
-    DBUG_ASSERT(false);
+    assert(false);
     break;
   }
   if (instr_class)
@@ -196,7 +210,7 @@ table_ews_by_host_by_event_name::rnd_pos(const void *pos)
 void table_ews_by_host_by_event_name
 ::make_row(PFS_host *host, PFS_instr_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   host->m_lock.begin_optimistic_lock(&lock);
@@ -207,7 +221,11 @@ void table_ews_by_host_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_wait_visitor visitor(klass);
-  PFS_connection_iterator::visit_host(host, true, true, & visitor);
+  PFS_connection_iterator::visit_host(host,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! host->m_lock.end_optimistic_lock(&lock))
     return;
@@ -228,7 +246,7 @@ int table_ews_by_host_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_ews_by_host_by_event_name.h b/storage/perfschema/table_ews_by_host_by_event_name.h
index 8ce44a96617..fb1060cb45e 100644
--- a/storage/perfschema/table_ews_by_host_by_event_name.h
+++ b/storage/perfschema/table_ews_by_host_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -75,9 +75,6 @@ struct pos_ews_by_host_by_event_name
     m_index_3= 1;
   }
 
-  inline bool has_more_host(void)
-  { return (m_index_1 < host_max); }
-
   inline void next_host(void)
   {
     m_index_1++;
@@ -103,6 +100,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_ews_by_thread_by_event_name.cc b/storage/perfschema/table_ews_by_thread_by_event_name.cc
index 58fdbbc2d10..803e81b4a7b 100644
--- a/storage/perfschema/table_ews_by_thread_by_event_name.cc
+++ b/storage/perfschema/table_ews_by_thread_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_ews_by_thread_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_ews_by_thread_by_event_name::m_table_lock;
 
@@ -44,8 +46,7 @@ table_ews_by_thread_by_event_name::m_share=
   table_ews_by_thread_by_event_name::create,
   NULL, /* write_row */
   table_ews_by_thread_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_ews_by_thread_by_event_name::get_row_count,
   sizeof(pos_ews_by_thread_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_summary_by_thread_by_event_name("
@@ -55,7 +56,8 @@ table_ews_by_thread_by_event_name::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -71,6 +73,12 @@ table_ews_by_thread_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_ews_by_thread_by_event_name::get_row_count(void)
+{
+  return global_thread_container.get_row_count() * wait_class_max;
+}
+
 table_ews_by_thread_by_event_name::table_ews_by_thread_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -86,18 +94,14 @@ int table_ews_by_thread_by_event_name::rnd_next(void)
 {
   PFS_thread *thread;
   PFS_instr_class *instr_class;
+  bool has_more_thread= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_thread();
+       has_more_thread;
        m_pos.next_thread())
   {
-    thread= &thread_array[m_pos.m_index_1];
-
-    /*
-      Important note: the thread scan is the outer loop (index 1),
-      to minimize the number of calls to atomic operations.
-    */
-    if (thread->m_lock.is_populated())
+    thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);
+    if (thread != NULL)
     {
       for ( ;
            m_pos.has_more_view();
@@ -126,8 +130,11 @@ int table_ews_by_thread_by_event_name::rnd_next(void)
         case pos_ews_by_thread_by_event_name::VIEW_IDLE:
           instr_class= find_idle_class(m_pos.m_index_3);
           break;
+        case pos_ews_by_thread_by_event_name::VIEW_METADATA:
+          instr_class= find_metadata_class(m_pos.m_index_3);
+          break;
         default:
-          DBUG_ASSERT(false);
+          assert(false);
           instr_class= NULL;
           break;
         }
@@ -152,52 +159,55 @@ table_ews_by_thread_by_event_name::rnd_pos(const void *pos)
   PFS_instr_class *instr_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-
-  thread= &thread_array[m_pos.m_index_1];
-  if (! thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
 
-  switch (m_pos.m_index_2)
+  thread= global_thread_container.get(m_pos.m_index_1);
+  if (thread != NULL)
   {
-  case pos_ews_by_thread_by_event_name::VIEW_MUTEX:
-    instr_class= find_mutex_class(m_pos.m_index_3);
-    break;
-  case pos_ews_by_thread_by_event_name::VIEW_RWLOCK:
-    instr_class= find_rwlock_class(m_pos.m_index_3);
-    break;
-  case pos_ews_by_thread_by_event_name::VIEW_COND:
-    instr_class= find_cond_class(m_pos.m_index_3);
-    break;
-  case pos_ews_by_thread_by_event_name::VIEW_FILE:
-    instr_class= find_file_class(m_pos.m_index_3);
-    break;
-  case pos_ews_by_thread_by_event_name::VIEW_TABLE:
-    instr_class= find_table_class(m_pos.m_index_3);
-    break;
-  case pos_ews_by_thread_by_event_name::VIEW_SOCKET:
-    instr_class= find_socket_class(m_pos.m_index_3);
-    break;
-  case pos_ews_by_thread_by_event_name::VIEW_IDLE:
-    instr_class= find_idle_class(m_pos.m_index_3);
-    break;
-  default:
-    DBUG_ASSERT(false);
-    instr_class= NULL;
-  }
+    switch (m_pos.m_index_2)
+    {
+    case pos_ews_by_thread_by_event_name::VIEW_MUTEX:
+      instr_class= find_mutex_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_RWLOCK:
+      instr_class= find_rwlock_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_COND:
+      instr_class= find_cond_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_FILE:
+      instr_class= find_file_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_TABLE:
+      instr_class= find_table_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_SOCKET:
+      instr_class= find_socket_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_IDLE:
+      instr_class= find_idle_class(m_pos.m_index_3);
+      break;
+    case pos_ews_by_thread_by_event_name::VIEW_METADATA:
+      instr_class= find_metadata_class(m_pos.m_index_3);
+      break;
+    default:
+      assert(false);
+      instr_class= NULL;
+    }
 
-  if (instr_class)
-  {
-    make_row(thread, instr_class);
-    return 0;
+    if (instr_class)
+    {
+      make_row(thread, instr_class);
+      return 0;
+    }
   }
+
   return HA_ERR_RECORD_DELETED;
 }
 
 void table_ews_by_thread_by_event_name
 ::make_row(PFS_thread *thread, PFS_instr_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   /* Protect this reader against a thread termination */
@@ -211,9 +221,9 @@ void table_ews_by_thread_by_event_name
   PFS_connection_iterator::visit_thread(thread, &visitor);
 
   /*
-     If the aggregation for this class is deferred, then we must pull the
-     current wait stats from the instances associated with this thread.
-  */  
+    If the aggregation for this class is deferred, then we must pull the
+    current wait stats from the instances associated with this thread.
+  */
   if (klass->is_deferred())
   {
     /* Visit instances owned by this thread. Do not visit the class. */
@@ -243,7 +253,7 @@ int table_ews_by_thread_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
diff --git a/storage/perfschema/table_ews_by_thread_by_event_name.h b/storage/perfschema/table_ews_by_thread_by_event_name.h
index b67664bfced..85209f6fad4 100644
--- a/storage/perfschema/table_ews_by_thread_by_event_name.h
+++ b/storage/perfschema/table_ews_by_thread_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -74,9 +74,6 @@ struct pos_ews_by_thread_by_event_name
     m_index_3= 1;
   }
 
-  inline bool has_more_thread(void)
-  { return (m_index_1 < thread_max); }
-
   inline void next_thread(void)
   {
     m_index_1++;
@@ -102,6 +99,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_ews_by_user_by_event_name.cc b/storage/perfschema/table_ews_by_user_by_event_name.cc
index df8bb41010a..d1e88da59aa 100644
--- a/storage/perfschema/table_ews_by_user_by_event_name.cc
+++ b/storage/perfschema/table_ews_by_user_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,14 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_ews_by_user_by_event_name.h"
 #include "pfs_global.h"
-#include "pfs_account.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_ews_by_user_by_event_name::m_table_lock;
 
@@ -45,18 +46,18 @@ table_ews_by_user_by_event_name::m_share=
   table_ews_by_user_by_event_name::create,
   NULL, /* write_row */
   table_ews_by_user_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_ews_by_user_by_event_name::get_row_count,
   sizeof(pos_ews_by_user_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_summary_by_user_by_event_name("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'User. Used together with EVENT_NAME for grouping events.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'User. Used together with EVENT_NAME for grouping events.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Event name. Used together with USER for grouping events.',"
                       "COUNT_STAR BIGINT unsigned not null comment 'Number of summarized events',"
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -74,6 +75,12 @@ table_ews_by_user_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_ews_by_user_by_event_name::get_row_count(void)
+{
+  return global_user_container.get_row_count() * wait_class_max;
+}
+
 table_ews_by_user_by_event_name::table_ews_by_user_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -89,13 +96,14 @@ int table_ews_by_user_by_event_name::rnd_next(void)
 {
   PFS_user *user;
   PFS_instr_class *instr_class;
+  bool has_more_user= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_user();
+       has_more_user;
        m_pos.next_user())
   {
-    user= &user_array[m_pos.m_index_1];
-    if (user->m_lock.is_populated())
+    user= global_user_container.get(m_pos.m_index_1, & has_more_user);
+    if (user != NULL)
     {
       for ( ;
            m_pos.has_more_view();
@@ -124,9 +132,12 @@ int table_ews_by_user_by_event_name::rnd_next(void)
         case pos_ews_by_user_by_event_name::VIEW_IDLE:
           instr_class= find_idle_class(m_pos.m_index_3);
           break;
+        case pos_ews_by_user_by_event_name::VIEW_METADATA:
+          instr_class= find_metadata_class(m_pos.m_index_3);
+          break;
         default:
           instr_class= NULL;
-          DBUG_ASSERT(false);
+          assert(false);
           break;
         }
 
@@ -150,10 +161,9 @@ table_ews_by_user_by_event_name::rnd_pos(const void *pos)
   PFS_instr_class *instr_class;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < user_max);
 
-  user= &user_array[m_pos.m_index_1];
-  if (! user->m_lock.is_populated())
+  user= global_user_container.get(m_pos.m_index_1);
+  if (user == NULL)
     return HA_ERR_RECORD_DELETED;
 
   switch (m_pos.m_index_2)
@@ -179,9 +189,12 @@ table_ews_by_user_by_event_name::rnd_pos(const void *pos)
   case pos_ews_by_user_by_event_name::VIEW_IDLE:
     instr_class= find_idle_class(m_pos.m_index_3);
     break;
+  case pos_ews_by_user_by_event_name::VIEW_METADATA:
+    instr_class= find_metadata_class(m_pos.m_index_3);
+    break;
   default:
     instr_class= NULL;
-    DBUG_ASSERT(false);
+    assert(false);
     break;
   }
   if (instr_class)
@@ -196,7 +209,7 @@ table_ews_by_user_by_event_name::rnd_pos(const void *pos)
 void table_ews_by_user_by_event_name
 ::make_row(PFS_user *user, PFS_instr_class *klass)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   m_row_exists= false;
 
   user->m_lock.begin_optimistic_lock(&lock);
@@ -207,7 +220,11 @@ void table_ews_by_user_by_event_name
   m_row.m_event_name.make_row(klass);
 
   PFS_connection_wait_visitor visitor(klass);
-  PFS_connection_iterator::visit_user(user, true, true, & visitor);
+  PFS_connection_iterator::visit_user(user,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! user->m_lock.end_optimistic_lock(&lock))
     return;
@@ -228,7 +245,7 @@ int table_ews_by_user_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
diff --git a/storage/perfschema/table_ews_by_user_by_event_name.h b/storage/perfschema/table_ews_by_user_by_event_name.h
index f4f29534be4..ffb2e9074b2 100644
--- a/storage/perfschema/table_ews_by_user_by_event_name.h
+++ b/storage/perfschema/table_ews_by_user_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -75,9 +75,6 @@ struct pos_ews_by_user_by_event_name
     m_index_3= 1;
   }
 
-  inline bool has_more_user(void)
-  { return (m_index_1 < user_max); }
-
   inline void next_user(void)
   {
     m_index_1++;
@@ -103,6 +100,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_ews_global_by_event_name.cc b/storage/perfschema/table_ews_global_by_event_name.cc
index eb44e30c3d4..d4853685a17 100644
--- a/storage/perfschema/table_ews_global_by_event_name.cc
+++ b/storage/perfschema/table_ews_global_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -35,6 +35,7 @@
 #include "pfs_instr.h"
 #include "pfs_timer.h"
 #include "pfs_visitor.h"
+#include "field.h"
 
 THR_LOCK table_ews_global_by_event_name::m_table_lock;
 
@@ -46,8 +47,7 @@ table_ews_global_by_event_name::m_share=
   table_ews_global_by_event_name::create,
   NULL, /* write_row */
   table_ews_global_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_ews_global_by_event_name::get_row_count,
   sizeof(pos_ews_global_by_event_name),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE events_waits_summary_global_by_event_name("
@@ -56,7 +56,8 @@ table_ews_global_by_event_name::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -75,6 +76,12 @@ table_ews_global_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_ews_global_by_event_name::get_row_count(void)
+{
+  return wait_class_max;
+}
+
 table_ews_global_by_event_name::table_ews_global_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -169,6 +176,15 @@ int table_ews_global_by_event_name::rnd_next(void)
         return 0;
       }
       break;
+    case pos_ews_global_by_event_name::VIEW_METADATA:
+      instr_class= find_metadata_class(m_pos.m_index_2);
+      if (instr_class)
+      {
+        make_metadata_row(instr_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
     default:
       break;
     }
@@ -224,8 +240,8 @@ table_ews_global_by_event_name::rnd_pos(const void *pos)
     }
     break;
   case pos_ews_global_by_event_name::VIEW_TABLE:
-    DBUG_ASSERT(m_pos.m_index_2 >= 1);
-    DBUG_ASSERT(m_pos.m_index_2 <= 2);
+    assert(m_pos.m_index_2 >= 1);
+    assert(m_pos.m_index_2 <= 2);
     if (m_pos.m_index_2 == 1)
       make_table_io_row(&global_table_io_class);
     else
@@ -247,6 +263,17 @@ table_ews_global_by_event_name::rnd_pos(const void *pos)
       return 0;
     }
     break;
+  case pos_ews_global_by_event_name::VIEW_METADATA:
+    instr_class= find_metadata_class(m_pos.m_index_2);
+    if (instr_class)
+    {
+      make_metadata_row(instr_class);
+      return 0;
+    }
+    break;
+  default:
+    assert(false);
+    break;
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -324,7 +351,7 @@ void table_ews_global_by_event_name
 
   PFS_table_lock_wait_visitor visitor;
   PFS_object_iterator::visit_all_tables(& visitor);
-  
+
   get_normalizer(klass);
   m_row.m_stat.set(m_normalizer, & visitor.m_stat);
   m_row_exists= true;
@@ -351,8 +378,27 @@ void table_ews_global_by_event_name
   PFS_connection_wait_visitor visitor(klass);
   PFS_connection_iterator::visit_global(false, /* hosts */
                                         false, /* users */
-                                        false, /* accts */
-                                        true,  /* threads */ &visitor);
+                                        false, /* accounts */
+                                        true,  /* threads */
+                                        false, /* THDs */
+                                        &visitor);
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_metadata_row(PFS_instr_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_wait_visitor visitor(klass);
+  PFS_connection_iterator::visit_global(false, /* hosts */
+                                        true,  /* users */
+                                        true,  /* accounts */
+                                        true,  /* threads */
+                                        false, /* THDs */
+                                        &visitor);
   get_normalizer(klass);
   m_row.m_stat.set(m_normalizer, &visitor.m_stat);
   m_row_exists= true;
@@ -368,7 +414,7 @@ int table_ews_global_by_event_name
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
diff --git a/storage/perfschema/table_ews_global_by_event_name.h b/storage/perfschema/table_ews_global_by_event_name.h
index 8157d274112..548f0af4666 100644
--- a/storage/perfschema/table_ews_global_by_event_name.h
+++ b/storage/perfschema/table_ews_global_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -88,6 +88,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -114,6 +115,7 @@ protected:
   void make_table_lock_row(PFS_instr_class *klass);
   void make_socket_row(PFS_socket_class *klass);
   void make_idle_row(PFS_instr_class *klass);
+  void make_metadata_row(PFS_instr_class *klass);
 
 private:
   /** Table share lock. */
diff --git a/storage/perfschema/table_file_instances.cc b/storage/perfschema/table_file_instances.cc
index 1f64ddb54c5..abbaf0745b3 100644
--- a/storage/perfschema/table_file_instances.cc
+++ b/storage/perfschema/table_file_instances.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_file_instances.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_file_instances::m_table_lock;
 
@@ -40,17 +42,17 @@ table_file_instances::m_share=
 {
   { C_STRING_WITH_LEN("file_instances") },
   &pfs_readonly_acl,
-  &table_file_instances::create,
+  table_file_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_file_instances::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE file_instances("
                       "FILE_NAME VARCHAR(512) not null comment 'File name.',"
                       "EVENT_NAME VARCHAR(128) not null comment 'Instrument name associated with the file.',"
-                      "OPEN_COUNT INTEGER unsigned not null comment 'Open handles on the file. A value of greater than zero means that the file is currently open.')") }
+                      "OPEN_COUNT INTEGER unsigned not null comment 'Open handles on the file. A value of greater than zero means that the file is currently open.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_file_instances::create(void)
@@ -58,6 +60,12 @@ PFS_engine_table* table_file_instances::create(void)
   return new table_file_instances();
 }
 
+ha_rows
+table_file_instances::get_row_count(void)
+{
+  return global_file_container.get_row_count();
+}
+
 table_file_instances::table_file_instances()
   : PFS_engine_table(&m_share, &m_pos),
   m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -73,17 +81,14 @@ int table_file_instances::rnd_next(void)
 {
   PFS_file *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < file_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_file_iterator it= global_file_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &file_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -94,19 +99,20 @@ int table_file_instances::rnd_pos(const void *pos)
   PFS_file *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < file_max);
-  pfs= &file_array[m_pos.m_index];
 
-  if (! pfs->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  pfs= global_file_container.get(m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs);
+    return 0;
+  }
 
-  make_row(pfs);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 void table_file_instances::make_row(PFS_file *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_file_class *safe_class;
 
   m_row_exists= false;
@@ -139,7 +145,7 @@ int table_file_instances::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -158,7 +164,7 @@ int table_file_instances::read_row_values(TABLE *table,
         set_field_ulong(f, m_row.m_open_count);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_file_instances.h b/storage/perfschema/table_file_instances.h
index 5b44e63028e..1517e83035b 100644
--- a/storage/perfschema/table_file_instances.h
+++ b/storage/perfschema/table_file_instances.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -58,6 +58,7 @@ public:
   /** Table share */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_file_summary_by_event_name.cc b/storage/perfschema/table_file_summary_by_event_name.cc
index ce83453875f..e7c2d5559e6 100644
--- a/storage/perfschema/table_file_summary_by_event_name.cc
+++ b/storage/perfschema/table_file_summary_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,18 +21,19 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 /**
-  @file storage/perfschema/table_file_summary.cc
+  @file storage/perfschema/table_file_summary_by_event_name.cc
   Table FILE_SUMMARY_BY_EVENT_NAME(implementation).
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_file_summary_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "field.h"
 
 THR_LOCK table_file_summary_by_event_name::m_table_lock;
 
@@ -41,11 +42,10 @@ table_file_summary_by_event_name::m_share=
 {
   { C_STRING_WITH_LEN("file_summary_by_event_name") },
   &pfs_truncatable_acl,
-  &table_file_summary_by_event_name::create,
+  table_file_summary_by_event_name::create,
   NULL, /* write_row */
   table_file_summary_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_file_summary_by_event_name::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE file_summary_by_event_name("
@@ -71,7 +71,8 @@ table_file_summary_by_event_name::m_share=
                       "SUM_TIMER_MISC BIGINT unsigned not null comment 'Total wait time of all miscellaneous operations that are timed.',"
                       "MIN_TIMER_MISC BIGINT unsigned not null comment 'Minimum wait time of all miscellaneous operations that are timed.',"
                       "AVG_TIMER_MISC BIGINT unsigned not null comment 'Average wait time of all miscellaneous operations that are timed.',"
-                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") }
+                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_file_summary_by_event_name::create(void)
@@ -86,6 +87,12 @@ int table_file_summary_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_file_summary_by_event_name::get_row_count(void)
+{
+  return file_class_max;
+}
+
 table_file_summary_by_event_name::table_file_summary_by_event_name()
   : PFS_engine_table(&m_share, &m_pos),
   m_pos(1), m_next_pos(1)
@@ -132,7 +139,7 @@ int table_file_summary_by_event_name::rnd_pos(const void *pos)
 
 /**
   Build a row.
-  @param klass            the file class the cursor is reading
+  @param file_class            the file class the cursor is reading
 */
 void table_file_summary_by_event_name::make_row(PFS_file_class *file_class)
 {
@@ -142,7 +149,7 @@ void table_file_summary_by_event_name::make_row(PFS_file_class *file_class)
   PFS_instance_iterator::visit_file_instances(file_class, &visitor);
 
   time_normalizer *normalizer= time_normalizer::get(wait_timer);
-  
+
   /* Collect timer and byte count stats */
   m_row.m_io_stat.set(normalizer, &visitor.m_file_io_stat);
   m_row_exists= true;
@@ -160,7 +167,7 @@ int table_file_summary_by_event_name::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -242,7 +249,7 @@ int table_file_summary_by_event_name::read_row_values(TABLE *table,
         break;
 
       default:
-        DBUG_ASSERT(false);
+        assert(false);
         break;
       }
     } // if
diff --git a/storage/perfschema/table_file_summary_by_event_name.h b/storage/perfschema/table_file_summary_by_event_name.h
index b8cb293cb07..852b680262e 100644
--- a/storage/perfschema/table_file_summary_by_event_name.h
+++ b/storage/perfschema/table_file_summary_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -59,6 +59,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_file_summary_by_instance.cc b/storage/perfschema/table_file_summary_by_instance.cc
index 23172055bd1..2a41f9b7bff 100644
--- a/storage/perfschema/table_file_summary_by_instance.cc
+++ b/storage/perfschema/table_file_summary_by_instance.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,17 +21,19 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 /**
-  @file storage/perfschema/table_file_summary.cc
+  @file storage/perfschema/table_file_summary_by_instance.cc
   Table FILE_SUMMARY_BY_INSTANCE (implementation).
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_file_summary_by_instance.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_file_summary_by_instance::m_table_lock;
 
@@ -40,11 +42,10 @@ table_file_summary_by_instance::m_share=
 {
   { C_STRING_WITH_LEN("file_summary_by_instance") },
   &pfs_truncatable_acl,
-  &table_file_summary_by_instance::create,
+  table_file_summary_by_instance::create,
   NULL, /* write_row */
   table_file_summary_by_instance::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_file_summary_by_instance::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE file_summary_by_instance("
@@ -72,7 +73,8 @@ table_file_summary_by_instance::m_share=
                       "SUM_TIMER_MISC BIGINT unsigned not null comment 'Total wait time of all miscellaneous operations that are timed.',"
                       "MIN_TIMER_MISC BIGINT unsigned not null comment 'Minimum wait time of all miscellaneous operations that are timed.',"
                       "AVG_TIMER_MISC BIGINT unsigned not null comment 'Average wait time of all miscellaneous operations that are timed.',"
-                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") }
+                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_file_summary_by_instance::create(void)
@@ -86,6 +88,12 @@ int table_file_summary_by_instance::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_file_summary_by_instance::get_row_count(void)
+{
+  return global_file_container.get_row_count();
+}
+
 table_file_summary_by_instance::table_file_summary_by_instance()
   : PFS_engine_table(&m_share, &m_pos),
   m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -101,17 +109,14 @@ int table_file_summary_by_instance::rnd_next(void)
 {
   PFS_file *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < file_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_file_iterator it= global_file_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &file_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -122,14 +127,15 @@ int table_file_summary_by_instance::rnd_pos(const void *pos)
   PFS_file *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < file_max);
-  pfs= &file_array[m_pos.m_index];
 
-  if (! pfs->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  pfs= global_file_container.get(m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs);
+    return 0;
+  }
 
-  make_row(pfs);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 /**
@@ -138,7 +144,7 @@ int table_file_summary_by_instance::rnd_pos(const void *pos)
 */
 void table_file_summary_by_instance::make_row(PFS_file *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_file_class *safe_class;
 
   m_row_exists= false;
@@ -175,7 +181,7 @@ int table_file_summary_by_instance::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -263,7 +269,7 @@ int table_file_summary_by_instance::read_row_values(TABLE *table,
         set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_file_summary_by_instance.h b/storage/perfschema/table_file_summary_by_instance.h
index 0e7ce6958b2..f0c3c1303fb 100644
--- a/storage/perfschema/table_file_summary_by_instance.h
+++ b/storage/perfschema/table_file_summary_by_instance.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011 Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -67,6 +67,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_global_status.cc b/storage/perfschema/table_global_status.cc
new file mode 100644
index 00000000000..c68e501d819
--- /dev/null
+++ b/storage/perfschema/table_global_status.cc
@@ -0,0 +1,190 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_global_status.cc
+  Table global_status (implementation).
+*/
+
+#include "my_global.h"
+#include "table_global_status.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+
+THR_LOCK table_global_status::m_table_lock;
+
+PFS_engine_table_share
+table_global_status::m_share= /* Share descriptor for table global_status. */
+{
+  { C_STRING_WITH_LEN("global_status") }, /* same name as in the CREATE TABLE text below */
+  &pfs_truncatable_world_acl,
+  table_global_status::create, /* returns a new table_global_status */
+  NULL, /* write_row */
+  table_global_status::delete_all_rows, /* runs the reset_status_*() helpers */
+  table_global_status::get_row_count, /* reads all_status_vars.elements */
+  sizeof(pos_t), /* pos_t is PFS_simple_index for this table */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE global_status("
+  "VARIABLE_NAME VARCHAR(64) not null comment 'The global status variable name.',"
+  "VARIABLE_VALUE VARCHAR(1024) comment 'The global status variable value.')") },
+  true   /* perpetual */
+};
+
+/** Factory listed in m_share: hands back a newly allocated table object. */
+PFS_engine_table* table_global_status::create(void)
+{
+  return new table_global_status();
+}
+
+int table_global_status::delete_all_rows(void) /* delete_all_rows entry of m_share */
+{
+  mysql_mutex_lock(&LOCK_status); /* every reset below runs while LOCK_status is held */
+  reset_status_by_thread();
+  reset_status_by_account();
+  reset_status_by_user();
+  reset_status_by_host();
+  reset_global_status();
+  mysql_mutex_unlock(&LOCK_status);
+  return 0; /* no failure path: always reports success */
+}
+
+ha_rows table_global_status::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_status);      /* guard the read of all_status_vars */
+  const ha_rows n_status_vars= all_status_vars.elements;
+  mysql_mutex_unlock(&LOCK_status);
+  return n_status_vars;
+}
+
+table_global_status::table_global_status()
+  : PFS_engine_table(&m_share, &m_pos), m_status_cache(false),
+    m_row_exists(false), m_pos(0), m_next_pos(0), m_context(NULL)
+{} /* m_context stays NULL until rnd_init() allocates the table context */
+
+/** Rewind both the current and the next scan position to index 0. */
+void table_global_status::reset_position(void)
+{
+  m_next_pos.m_index= m_pos.m_index= 0;
+}
+
+int table_global_status::rnd_init(bool scan)
+{ /* Returns 0, or HA_ERR_OUT_OF_MEM when the table context cannot be allocated. */
+  /* Build a cache of all global status variables. Sum across threads. */
+  m_status_cache.materialize_global();
+
+  /* Record the version of the status array to detect subsequent changes. */
+  ulonglong status_version= m_status_cache.get_status_array_version();
+
+  /* The table context holds the current version of the global status array.
+     If scan == true, then allocate a new context from mem_root and store in TLS.
+     If scan == false, then restore from TLS. */
+  void *ctx_mem= current_thd->alloc(sizeof(table_global_status_context));
+  if (ctx_mem == NULL)
+    return HA_ERR_OUT_OF_MEM; /* never placement-new into a NULL buffer */
+  m_context= new(ctx_mem) table_global_status_context(status_version, !scan);
+  return 0;
+}
+
+int table_global_status::rnd_next(void)
+{
+  /* Resume at the saved position and stop at the first non-NULL cache entry. */
+  m_pos.set_at(&m_next_pos);
+  for (; m_pos.m_index < m_status_cache.size(); m_pos.next())
+  {
+    const Status_variable *candidate= m_status_cache.get(m_pos.m_index);
+    if (candidate == NULL)
+      continue;
+
+    make_row(candidate);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_global_status::rnd_pos(const void *pos)
+{
+  /* A changed global status array makes the saved position unusable. */
+  // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+
+  const Status_variable *entry= m_status_cache.get(m_pos.m_index);
+  if (entry == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(entry);
+  return 0;
+}
+
+/** Fill m_row from one cached status variable; a null variable yields no row. */
+void table_global_status::make_row(const Status_variable *status_var)
+{
+  m_row_exists= !status_var->is_null();
+  if (!m_row_exists)
+    return;
+
+  m_row.m_variable_name.make_row(status_var->m_name, status_var->m_name_length);
+  m_row.m_variable_value.make_row(status_var);
+}
+
+int table_global_status
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{ /* Copies the current row (m_row) into the requested fields; returns 0 on success. */
+  Field *f;
+
+  if (unlikely(! m_row_exists)) /* make_row() did not produce a row */
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++) /* stops at the first NULL entry of fields */
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index)) /* only fields in the read set, unless read_all */
+    {
+      switch(f->field_index)
+      {
+      case 0: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 1: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:
+        assert(false); /* only field indexes 0 and 1 are handled */
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_global_status.h b/storage/perfschema/table_global_status.h
new file mode 100644
index 00000000000..53d7ca459c9
--- /dev/null
+++ b/storage/perfschema/table_global_status.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_GLOBAL_STATUS_H
+#define TABLE_GLOBAL_STATUS_H
+
+/**
+  @file storage/perfschema/table_global_status.h
+  Table global_status (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_variable.h"
+#include "table_helper.h"
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table PERFORMANCE_SCHEMA.GLOBAL_STATUS:
+  one status variable, held as a name and a value.
+*/
+struct row_global_status
+{
+  /** Column VARIABLE_NAME. */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE. */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Store and retrieve table state information for queries that reinstantiate
+  the table object. Adds nothing to PFS_table_context except the THR_PFS_SG argument.
+*/
+class table_global_status_context : public PFS_table_context
+{
+public:
+  table_global_status_context(ulonglong current_version, bool restore) : /* both forwarded unchanged */
+    PFS_table_context(current_version, restore, THR_PFS_SG) { } /* NOTE(review): THR_PFS_SG presumably selects this table's TLS slot - confirm */
+};
+
+/** Table PERFORMANCE_SCHEMA.GLOBAL_STATUS. */
+class table_global_status : public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();      /* factory referenced by m_share */
+  static int delete_all_rows();           /* resets the status counters */
+  static ha_rows get_row_count();         /* number of status variables */
+
+  virtual int rnd_init(bool scan);        /* materializes the status cache */
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_global_status();                  /* instances come from create() */
+
+public:
+  ~table_global_status()
+  {}
+
+protected:
+  void make_row(const Status_variable *status_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Cache of the global status variables, materialized by rnd_init(). */
+  PFS_status_variable_cache m_status_cache;
+  /** Current row. */
+  row_global_status m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with global status array version. */
+  table_global_status_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_global_variables.cc b/storage/perfschema/table_global_variables.cc
new file mode 100644
index 00000000000..67be7e4e99a
--- /dev/null
+++ b/storage/perfschema/table_global_variables.cc
@@ -0,0 +1,184 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_global_variables.cc
+  Table GLOBAL_VARIABLES (implementation).
+*/
+
+#include "my_global.h"
+#include "table_global_variables.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+
+THR_LOCK table_global_variables::m_table_lock;
+
+PFS_engine_table_share
+table_global_variables::m_share= /* Share descriptor for table global_variables. */
+{
+  { C_STRING_WITH_LEN("global_variables") }, /* same name as in the CREATE TABLE text below */
+  &pfs_readonly_world_acl,
+  table_global_variables::create, /* returns a new table_global_variables */
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_global_variables::get_row_count, /* reads get_system_variable_hash_records() */
+  sizeof(pos_t), /* pos_t is PFS_simple_index for this table */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE global_variables("
+  "VARIABLE_NAME VARCHAR(64) not null,"
+  "VARIABLE_VALUE VARCHAR(1024))") },
+  true   /* perpetual */
+};
+
+/** Factory listed in m_share: hands back a newly allocated table object. */
+PFS_engine_table* table_global_variables::create(void)
+{
+  return new table_global_variables();
+}
+
+ha_rows table_global_variables::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_plugin_delete);             /* outer lock, released last */
+  mysql_prlock_rdlock(&LOCK_system_variables_hash);  /* read lock around the count */
+  const ha_rows n_system_vars= get_system_variable_hash_records();
+  mysql_prlock_unlock(&LOCK_system_variables_hash);
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return n_system_vars;
+}
+
+table_global_variables::table_global_variables()
+  : PFS_engine_table(&m_share, &m_pos), m_sysvar_cache(false),
+    m_row_exists(false), m_pos(0), m_next_pos(0), m_context(NULL)
+{} /* m_context stays NULL until rnd_init() allocates the table context */
+
+/** Rewind both the current and the next scan position to index 0. */
+void table_global_variables::reset_position(void)
+{
+  m_next_pos.m_index= m_pos.m_index= 0;
+}
+
+int table_global_variables::rnd_init(bool scan)
+{ /* Returns 0, or HA_ERR_OUT_OF_MEM when the table context cannot be allocated. */
+  /*
+    Build a list of system variables from the global system variable hash.
+    Filter by scope.
+  */
+  m_sysvar_cache.materialize_global();
+
+  /* Record the version of the system variable hash. */
+  ulonglong hash_version= m_sysvar_cache.get_sysvar_hash_version();
+
+  /* The table context holds the current version of the system variable hash.
+     If scan == true, then allocate a new context from mem_root and store in TLS.
+     If scan == false, then restore from TLS. */
+  void *ctx_mem= current_thd->alloc(sizeof(table_global_variables_context));
+  if (ctx_mem == NULL)
+    return HA_ERR_OUT_OF_MEM; /* never placement-new into a NULL buffer */
+  m_context= new(ctx_mem) table_global_variables_context(hash_version, !scan);
+  return 0;
+}
+
+int table_global_variables::rnd_next(void)
+{
+  /* Resume at the saved position and stop at the first non-NULL cache entry. */
+  m_pos.set_at(&m_next_pos);
+  for (; m_pos.m_index < m_sysvar_cache.size(); m_pos.next())
+  {
+    const System_variable *candidate= m_sysvar_cache.get(m_pos.m_index);
+    if (candidate == NULL)
+      continue;
+
+    make_row(candidate);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_global_variables::rnd_pos(const void *pos)
+{
+  /* A changed system variable hash makes the saved position unusable. */
+  // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+  assert(m_pos.m_index < m_sysvar_cache.size());
+
+  const System_variable *entry= m_sysvar_cache.get(m_pos.m_index);
+  if (entry == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(entry);
+  return 0;
+}
+
+/** Fill m_row from one cached system variable; null or ignored ones yield no row. */
+void table_global_variables::make_row(const System_variable *system_var)
+{
+  m_row_exists= !(system_var->is_null() || system_var->is_ignored());
+  if (!m_row_exists)
+    return;
+
+  m_row.m_variable_name.make_row(system_var->m_name, system_var->m_name_length);
+  m_row.m_variable_value.make_row(system_var);
+}
+
+int table_global_variables
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{ /* Copies the current row (m_row) into the requested fields; returns 0 on success. */
+  Field *f;
+
+  if (unlikely(! m_row_exists)) /* make_row() did not produce a row */
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++) /* stops at the first NULL entry of fields */
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index)) /* only fields in the read set, unless read_all */
+    {
+      switch(f->field_index)
+      {
+      case 0: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 1: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:
+        assert(false); /* only field indexes 0 and 1 are handled */
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_global_variables.h b/storage/perfschema/table_global_variables.h
new file mode 100644
index 00000000000..8a9bbd63574
--- /dev/null
+++ b/storage/perfschema/table_global_variables.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_GLOBAL_VARIABLES_H
+#define TABLE_GLOBAL_VARIABLES_H
+
+/**
+  @file storage/perfschema/table_global_variables.h
+  Table GLOBAL_VARIABLES (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_variable.h"
+#include "table_helper.h"
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  Store and retrieve table state information during queries that reinstantiate
+  the table object. Adds nothing to PFS_table_context except the THR_PFS_VG argument.
+*/
+class table_global_variables_context : public PFS_table_context
+{
+public:
+  table_global_variables_context(ulonglong hash_version, bool restore) : /* both forwarded unchanged */
+    PFS_table_context(hash_version, restore, THR_PFS_VG)  {} /* NOTE(review): THR_PFS_VG presumably selects this table's TLS slot - confirm */
+};
+
+/**
+  A row of table PERFORMANCE_SCHEMA.GLOBAL_VARIABLES:
+  one system variable, held as a name and a value.
+*/
+struct row_global_variables
+{
+  /** Column VARIABLE_NAME. */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE. */
+  PFS_variable_value_row m_variable_value;
+};
+
+/** Table PERFORMANCE_SCHEMA.GLOBAL_VARIABLES. */
+class table_global_variables : public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();      /* factory referenced by m_share */
+  static ha_rows get_row_count();         /* number of system variable hash records */
+
+  virtual int rnd_init(bool scan);        /* materializes the system variable cache */
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_global_variables();               /* instances come from create() */
+
+public:
+  ~table_global_variables()
+  {}
+
+protected:
+  void make_row(const System_variable *system_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Cache of the global system variables, materialized by rnd_init(). */
+  PFS_system_variable_cache m_sysvar_cache;
+  /** Current row. */
+  row_global_variables m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with system variable hash version. */
+  table_global_variables_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_helper.cc b/storage/perfschema/table_helper.cc
index d6bdd894483..0ae6f6c0786 100644
--- a/storage/perfschema/table_helper.cc
+++ b/storage/perfschema/table_helper.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,16 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_engine_table.h"
 #include "table_helper.h"
 #include "pfs_host.h"
 #include "pfs_user.h"
 #include "pfs_account.h"
 #include "pfs_instr.h"
+#include "pfs_program.h"
+#include "field.h"
+#include "pfs_variable.h"
 
 int PFS_host_row::make_row(PFS_host *pfs)
 {
@@ -104,7 +107,7 @@ void PFS_account_row::set_field(uint index, Field *f)
         f->set_null();
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       break;
   }
 }
@@ -147,7 +150,6 @@ int PFS_digest_row::make_row(PFS_statements_digest_stat* pfs)
   else
   {
     m_digest_length= 0;
-    m_digest_text.length(0);
   }
 
   return 0;
@@ -179,7 +181,7 @@ void PFS_digest_row::set_field(uint index, Field *f)
         f->set_null();
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       break;
   }
 }
@@ -203,6 +205,98 @@ int PFS_object_row::make_row(PFS_table_share *pfs)
   return 0;
 }
 
+int PFS_object_row::make_row(PFS_program *pfs) /* returns 0 on success, 1 if a name does not fit */
+{
+  m_object_type= pfs->m_type;
+
+  m_schema_name_length= pfs->m_schema_name_length;
+  if (m_schema_name_length > sizeof(m_schema_name))
+    return 1; /* schema name longer than the row buffer */
+  if (m_schema_name_length > 0) /* copy only the validated length, as the MDL_key overload does */
+    memcpy(m_schema_name, pfs->m_schema_name, m_schema_name_length);
+
+  m_object_name_length= pfs->m_object_name_length;
+  if (m_object_name_length > sizeof(m_object_name))
+    return 1; /* object name longer than the row buffer */
+  if (m_object_name_length > 0) /* never read past the source name */
+    memcpy(m_object_name, pfs->m_object_name, m_object_name_length);
+
+  return 0;
+}
+
+int PFS_object_row::make_row(const MDL_key *mdl) /* returns 0 on success, 1 if a name does not fit */
+{
+  MDL_key user_lock_workaround; /* only used by the USER_LOCK case below */
+  switch(mdl->mdl_namespace()) /* pick the object type and which name parts apply */
+  {
+  case MDL_key::BACKUP:
+    m_object_type= OBJECT_TYPE_BACKUP;
+    m_schema_name_length= 0;
+    m_object_name_length= 0;
+    break;
+  case MDL_key::SCHEMA:
+    m_object_type= OBJECT_TYPE_SCHEMA;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= 0;
+    break;
+  case MDL_key::TABLE:
+    m_object_type= OBJECT_TYPE_TABLE;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::FUNCTION:
+    m_object_type= OBJECT_TYPE_FUNCTION;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::PROCEDURE:
+    m_object_type= OBJECT_TYPE_PROCEDURE;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::PACKAGE_BODY:
+    m_object_type= OBJECT_TYPE_PACKAGE_BODY;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::TRIGGER:
+    m_object_type= OBJECT_TYPE_TRIGGER;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::EVENT:
+    m_object_type= OBJECT_TYPE_EVENT;
+    m_schema_name_length= mdl->db_name_length();
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::USER_LOCK:
+    m_object_type= OBJECT_TYPE_USER_LEVEL_LOCK;
+    user_lock_workaround.mdl_key_init(MDL_key::USER_LOCK, "", mdl->db_name()); /* empty db, original db_name() as name */
+    mdl=& user_lock_workaround; /* name()/name_length() below now refer to the rebuilt key */
+    m_schema_name_length= 0;
+    m_object_name_length= mdl->name_length();
+    break;
+  case MDL_key::NAMESPACE_END:
+  default:
+    m_object_type= NO_OBJECT_TYPE;
+    m_schema_name_length= 0;
+    m_object_name_length= 0;
+    break;
+  }
+
+  if (m_schema_name_length > sizeof(m_schema_name))
+    return 1; /* schema name longer than the row buffer */
+  if (m_schema_name_length > 0)
+    memcpy(m_schema_name, mdl->db_name(), m_schema_name_length);
+
+  if (m_object_name_length > sizeof(m_object_name))
+    return 1; /* object name longer than the row buffer */
+  if (m_object_name_length > 0)
+    memcpy(m_object_name, mdl->name(), m_object_name_length);
+
+  return 0;
+}
+
 void PFS_object_row::set_field(uint index, Field *f)
 {
   switch(index)
@@ -217,25 +311,69 @@ void PFS_object_row::set_field(uint index, Field *f)
       PFS_engine_table::set_field_varchar_utf8(f, m_object_name, m_object_name_length);
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
   }
 }
 
-int PFS_index_row::make_row(PFS_table_share *pfs, uint table_index)
+void PFS_object_row::set_nullable_field(uint index, Field *f)
+{
+  switch(index)
+  {
+    case 0: /* OBJECT_TYPE, NULL when there is no object type */
+      if (m_object_type == NO_OBJECT_TYPE)
+        f->set_null();
+      else
+        set_field_object_type(f, m_object_type);
+      break;
+    case 1: /* SCHEMA_NAME, NULL when the name is empty */
+      if (!(m_schema_name_length > 0))
+        f->set_null();
+      else
+        PFS_engine_table::set_field_varchar_utf8(f, m_schema_name, m_schema_name_length);
+      break;
+    case 2: /* OBJECT_NAME, NULL when the name is empty */
+      if (!(m_object_name_length > 0))
+        f->set_null();
+      else
+        PFS_engine_table::set_field_varchar_utf8(f, m_object_name, m_object_name_length);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+int PFS_index_row::make_row(PFS_table_share *pfs,
+                            PFS_table_share_index *pfs_index,
+                            uint table_index)
 {
   if (m_object_row.make_row(pfs))
     return 1;
 
+  if (pfs_index == NULL)
+  {
+    if (table_index < MAX_INDEXES)
+    {
+      m_index_name_length= sprintf(m_index_name, "(index %d)", table_index);
+    }
+    else
+    {
+      m_index_name_length= 0;
+    }
+    return 0;
+  }
+
   if (table_index < MAX_INDEXES)
   {
-    PFS_table_key *key= &pfs->m_keys[table_index];
-    m_index_name_length= key->m_name_length;
+    m_index_name_length= pfs_index->m_key.m_name_length;
     if (m_index_name_length > sizeof(m_index_name))
       return 1;
-    memcpy(m_index_name, key->m_name, sizeof(m_index_name));
+
+    memcpy(m_index_name, pfs_index->m_key.m_name, sizeof(m_index_name));
   }
   else
+  {
     m_index_name_length= 0;
+  }
 
   return 0;
 }
@@ -256,7 +394,7 @@ void PFS_index_row::set_field(uint index, Field *f)
         f->set_null();
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
   }
 }
 
@@ -329,7 +467,38 @@ void PFS_statement_stat_row::set_field(uint index, Field *f)
       PFS_engine_table::set_field_ulonglong(f, m_no_good_index_used);
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
+      break;
+  }
+}
+
+void PFS_transaction_stat_row::set_field(uint index, Field *f) /* index is relative to this row's 15 columns */
+{
+  switch (index)
+  {
+    case 0: /* COUNT_STAR */
+    case 1: /* SUM_TIMER_WAIT */
+    case 2: /* MIN_TIMER_WAIT */
+    case 3: /* AVG_TIMER_WAIT */
+    case 4: /* MAX_TIMER_WAIT */
+      m_timer1_row.set_field(index, f); /* indexes 0..4 passed through unchanged */
+      break;
+    case 5: /* COUNT_READ_WRITE */
+    case 6: /* SUM_TIMER_READ_WRITE */
+    case 7: /* MIN_TIMER_READ_WRITE */
+    case 8: /* AVG_TIMER_READ_WRITE */
+    case 9: /* MAX_TIMER_READ_WRITE */
+      m_read_write_row.set_field(index-5, f); /* rebased to 0..4 for the sub-row */
+      break;
+    case 10: /* COUNT_READ_ONLY */
+    case 11: /* SUM_TIMER_READ_ONLY */
+    case 12: /* MIN_TIMER_READ_ONLY */
+    case 13: /* AVG_TIMER_READ_ONLY */
+    case 14: /* MAX_TIMER_READ_ONLY */
+      m_read_only_row.set_field(index-10, f); /* rebased to 0..4 for the sub-row */
+      break;
+    default:
+      assert(false); /* any other index is a caller error */
       break;
   }
 }
@@ -345,7 +514,7 @@ void PFS_connection_stat_row::set_field(uint index, Field *f)
       PFS_engine_table::set_field_ulonglong(f, m_total_connections);
       break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       break;
   }
 }
@@ -354,14 +523,374 @@ void set_field_object_type(Field *f, enum_object_type object_type)
 {
   switch (object_type)
   {
+  case OBJECT_TYPE_EVENT:
+    PFS_engine_table::set_field_varchar_utf8(f, "EVENT", 5);
+    break;
+  case OBJECT_TYPE_FUNCTION:
+    PFS_engine_table::set_field_varchar_utf8(f, "FUNCTION", 8);
+    break;
+  case OBJECT_TYPE_PROCEDURE:
+    PFS_engine_table::set_field_varchar_utf8(f, "PROCEDURE", 9);
+    break;
   case OBJECT_TYPE_TABLE:
     PFS_engine_table::set_field_varchar_utf8(f, "TABLE", 5);
     break;
   case OBJECT_TYPE_TEMPORARY_TABLE:
     PFS_engine_table::set_field_varchar_utf8(f, "TEMPORARY TABLE", 15);
     break;
+  case OBJECT_TYPE_TRIGGER:
+    PFS_engine_table::set_field_varchar_utf8(f, "TRIGGER", 7);
+    break;
+  case OBJECT_TYPE_BACKUP:
+    PFS_engine_table::set_field_varchar_utf8(f, "BACKUP", 6);
+    break;
+  case OBJECT_TYPE_SCHEMA:
+    PFS_engine_table::set_field_varchar_utf8(f, "SCHEMA", 6);
+    break;
+  case OBJECT_TYPE_PACKAGE_BODY:
+    PFS_engine_table::set_field_varchar_utf8(f, "PACKAGE BODY", 12);
+    break;
+  case OBJECT_TYPE_USER_LEVEL_LOCK:
+    PFS_engine_table::set_field_varchar_utf8(f, "USER LEVEL LOCK", 15);
+    break;
+  case NO_OBJECT_TYPE:
+  default:
+    assert(false);
+    PFS_engine_table::set_field_varchar_utf8(f, "", 0);
+    break;
+  }
+}
+
+void set_field_lock_type(Field *f, PFS_TL_LOCK_TYPE lock_type)
+{
+  switch (lock_type)
+  {
+  case PFS_TL_READ:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ", 4);
+    break;
+  case PFS_TL_READ_WITH_SHARED_LOCKS:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ WITH SHARED LOCKS", 22);
+    break;
+  case PFS_TL_READ_HIGH_PRIORITY:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ HIGH PRIORITY", 18);
+    break;
+  case PFS_TL_READ_NO_INSERT:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ NO INSERT", 14);
+    break;
+  case PFS_TL_WRITE_ALLOW_WRITE:
+    PFS_engine_table::set_field_varchar_utf8(f, "WRITE ALLOW WRITE", 17);
+    break;
+  case PFS_TL_WRITE_CONCURRENT_INSERT:
+    PFS_engine_table::set_field_varchar_utf8(f, "WRITE CONCURRENT INSERT", 23);
+    break;
+  case PFS_TL_WRITE_LOW_PRIORITY:
+    PFS_engine_table::set_field_varchar_utf8(f, "WRITE LOW PRIORITY", 18);
+    break;
+  case PFS_TL_WRITE:
+    PFS_engine_table::set_field_varchar_utf8(f, "WRITE", 5);
+    break;
+  case PFS_TL_READ_EXTERNAL:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ EXTERNAL", 13);
+    break;
+  case PFS_TL_WRITE_EXTERNAL:
+    PFS_engine_table::set_field_varchar_utf8(f, "WRITE EXTERNAL", 14);
+    break;
+  case PFS_TL_NONE:
+    f->set_null();
+    break;
   default:
-    DBUG_ASSERT(false);
+    assert(false);
+  }
+}
+
+void set_field_mdl_type(Field *f, opaque_mdl_type mdl_type, bool backup)
+{
+  if (backup) /* mdl_type is matched against the MDL_BACKUP_* values */
+  {
+    switch (mdl_type)
+    {
+    case MDL_BACKUP_START:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_START"));
+      break;
+    case MDL_BACKUP_FLUSH:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_FLUSH"));
+      break;
+    case MDL_BACKUP_WAIT_FLUSH:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_WAIT_FLUSH"));
+      break;
+    case MDL_BACKUP_WAIT_DDL:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_WAIT_DDL"));
+      break;
+    case MDL_BACKUP_WAIT_COMMIT:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_WAIT_COMMIT"));
+      break;
+    case MDL_BACKUP_FTWRL1:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_FTWRL1"));
+      break;
+    case MDL_BACKUP_FTWRL2:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_FTWRL2"));
+      break;
+    case MDL_BACKUP_DML:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_DML"));
+      break;
+    case MDL_BACKUP_TRANS_DML:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_TRANS_DML"));
+      break;
+    case MDL_BACKUP_SYS_DML:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_SYS_DML"));
+      break;
+    case MDL_BACKUP_DDL:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_DDL"));
+      break;
+    case MDL_BACKUP_BLOCK_DDL:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_BLOCK_DDL"));
+      break;
+    case MDL_BACKUP_ALTER_COPY:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_ALTER_COPY"));
+      break;
+    case MDL_BACKUP_COMMIT:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_COMMIT"));
+      break;
+    case MDL_BACKUP_END:
+      PFS_engine_table::set_field_varchar_utf8(f, STRING_WITH_LEN("BACKUP_END"));
+      break;
+    default:
+      assert(false); /* plain assert, like every other set_field_* helper in this file */
+    }
+  }
+  else
+  {
+    enum_mdl_type e= (enum_mdl_type) mdl_type;
+    switch (e)
+    {
+    case MDL_INTENTION_EXCLUSIVE:
+      PFS_engine_table::set_field_varchar_utf8(f, "INTENTION_EXCLUSIVE", 19);
+      break;
+    case MDL_SHARED:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED", 6);
+      break;
+    case MDL_SHARED_HIGH_PRIO:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED_HIGH_PRIO", 16);
+      break;
+    case MDL_SHARED_READ:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED_READ", 11);
+      break;
+    case MDL_SHARED_WRITE:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED_WRITE", 12);
+      break;
+    case MDL_SHARED_UPGRADABLE:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED_UPGRADABLE", 17);
+      break;
+    case MDL_SHARED_NO_WRITE:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED_NO_WRITE", 15);
+      break;
+    case MDL_SHARED_NO_READ_WRITE:
+      PFS_engine_table::set_field_varchar_utf8(f, "SHARED_NO_READ_WRITE", 20);
+      break;
+    case MDL_EXCLUSIVE:
+      PFS_engine_table::set_field_varchar_utf8(f, "EXCLUSIVE", 9);
+      break;
+    default:
+      assert(false); /* unknown lock type: f is left unwritten */
+    }
+  }
+}
+
+void set_field_mdl_duration(Field *f, opaque_mdl_duration mdl_duration)
+{
+  enum_mdl_duration e= (enum_mdl_duration) mdl_duration;
+  switch (e)
+  {
+  case MDL_STATEMENT:
+    PFS_engine_table::set_field_varchar_utf8(f, "STATEMENT", 9);
+    break;
+  case MDL_TRANSACTION:
+    PFS_engine_table::set_field_varchar_utf8(f, "TRANSACTION", 11);
+    break;
+  case MDL_EXPLICIT:
+    PFS_engine_table::set_field_varchar_utf8(f, "EXPLICIT", 8);
+    break;
+  case MDL_DURATION_END:
+  default:
+    assert(false);
+  }
+}
+
+void set_field_mdl_status(Field *f, opaque_mdl_status mdl_status)
+{
+  MDL_ticket::enum_psi_status e= static_cast<MDL_ticket::enum_psi_status>(mdl_status);
+  switch (e)
+  {
+  case MDL_ticket::PENDING:
+    PFS_engine_table::set_field_varchar_utf8(f, "PENDING", 7);
+    break;
+  case MDL_ticket::GRANTED:
+    PFS_engine_table::set_field_varchar_utf8(f, "GRANTED", 7);
+    break;
+  case MDL_ticket::PRE_ACQUIRE_NOTIFY:
+    PFS_engine_table::set_field_varchar_utf8(f, "PRE_ACQUIRE_NOTIFY", 18);
+    break;
+  case MDL_ticket::POST_RELEASE_NOTIFY:
+    PFS_engine_table::set_field_varchar_utf8(f, "POST_RELEASE_NOTIFY", 19);
+    break;
+  default:
+    assert(false);
+  }
+}
+
+void PFS_memory_stat_row::set_field(uint index, Field *f)
+{
+  ssize_t val;
+
+  switch (index)
+  {
+    case 0: /* COUNT_ALLOC */
+      PFS_engine_table::set_field_ulonglong(f, m_stat.m_alloc_count);
+      break;
+    case 1: /* COUNT_FREE */
+      PFS_engine_table::set_field_ulonglong(f, m_stat.m_free_count);
+      break;
+    case 2: /* SUM_NUMBER_OF_BYTES_ALLOC */
+      PFS_engine_table::set_field_ulonglong(f, m_stat.m_alloc_size);
+      break;
+    case 3: /* SUM_NUMBER_OF_BYTES_FREE */
+      PFS_engine_table::set_field_ulonglong(f, m_stat.m_free_size);
+      break;
+    case 4: /* LOW_COUNT_USED */
+      val= m_stat.m_alloc_count - m_stat.m_free_count - m_stat.m_free_count_capacity;
+      PFS_engine_table::set_field_longlong(f, val);
+      break;
+    case 5: /* CURRENT_COUNT_USED */
+      val= m_stat.m_alloc_count - m_stat.m_free_count;
+      PFS_engine_table::set_field_longlong(f, val);
+      break;
+    case 6: /* HIGH_COUNT_USED */
+      val= m_stat.m_alloc_count - m_stat.m_free_count + m_stat.m_alloc_count_capacity;
+      PFS_engine_table::set_field_longlong(f, val);
+      break;
+    case 7: /* LOW_NUMBER_OF_BYTES_USED */
+      val= m_stat.m_alloc_size - m_stat.m_free_size - m_stat.m_free_size_capacity;
+      PFS_engine_table::set_field_longlong(f, val);
+      break;
+    case 8: /* CURRENT_NUMBER_OF_BYTES_USED */
+      val= m_stat.m_alloc_size - m_stat.m_free_size;
+      PFS_engine_table::set_field_longlong(f, val);
+      break;
+    case 9: /* HIGH_NUMBER_OF_BYTES_USED */
+      val= m_stat.m_alloc_size - m_stat.m_free_size + m_stat.m_alloc_size_capacity;
+      PFS_engine_table::set_field_longlong(f, val);
+      break;
+    default:
+      assert(false);
+      break;
+  }
+}
+
+void set_field_isolation_level(Field *f, enum_isolation_level iso_level)
+{
+  switch (iso_level)
+  {
+  case TRANS_LEVEL_READ_UNCOMMITTED:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ UNCOMMITTED", 16);
+    break;
+  case TRANS_LEVEL_READ_COMMITTED:
+    PFS_engine_table::set_field_varchar_utf8(f, "READ COMMITTED", 14);
+    break;
+  case TRANS_LEVEL_REPEATABLE_READ:
+    PFS_engine_table::set_field_varchar_utf8(f, "REPEATABLE READ", 15);
+    break;
+  case TRANS_LEVEL_SERIALIZABLE:
+    PFS_engine_table::set_field_varchar_utf8(f, "SERIALIZABLE", 12);
+    break;
+  default:
+    assert(false);
+  }
+}
+
+void set_field_xa_state(Field *f, enum_xa_transaction_state xa_state)
+{
+  switch (xa_state)
+  {
+  case TRANS_STATE_XA_NOTR:
+    PFS_engine_table::set_field_varchar_utf8(f, "NOTR", 4);
+    break;
+  case TRANS_STATE_XA_ACTIVE:
+    PFS_engine_table::set_field_varchar_utf8(f, "ACTIVE", 6);
+    break;
+  case TRANS_STATE_XA_IDLE:
+    PFS_engine_table::set_field_varchar_utf8(f, "IDLE", 4);
+    break;
+  case TRANS_STATE_XA_PREPARED:
+    PFS_engine_table::set_field_varchar_utf8(f, "PREPARED", 8);
+    break;
+  case TRANS_STATE_XA_ROLLBACK_ONLY:
+    PFS_engine_table::set_field_varchar_utf8(f, "ROLLBACK ONLY", 13);
+    break;
+  case TRANS_STATE_XA_COMMITTED:
+    PFS_engine_table::set_field_varchar_utf8(f, "COMMITTED", 9);
+    break;
+  default:
+    assert(false);
+  }
+}
+
+void PFS_variable_name_row::make_row(const char* str, size_t length)
+{
+  assert(length <= sizeof(m_str));
+  assert(length <= NAME_CHAR_LEN);
+  /* Copy at most NAME_CHAR_LEN bytes of str into m_str and NUL-terminate it. */
+  m_length= MY_MIN(static_cast<uint>(length), NAME_CHAR_LEN); /* enforce max name length */
+  if (m_length > 0)
+    memcpy(m_str, str, m_length); /* the clamped length, so m_str cannot be overrun */
+  m_str[m_length]= '\0';
+}
+
+void PFS_variable_value_row::make_row(const Status_variable *var)
+{
+  make_row(var->m_charset, var->m_value_str, var->m_value_length);
+}
+
+void PFS_variable_value_row::make_row(const System_variable *var)
+{
+  make_row(var->m_charset, var->m_value_str, var->m_value_length);
+}
+
+void PFS_variable_value_row::make_row(const CHARSET_INFO *cs, const char* str, size_t length)
+{
+  assert(cs != NULL);
+  assert(length <= sizeof(m_str));
+
+  /* Clamp as well: with asserts compiled out an oversized value must not overrun m_str. */
+  m_length= static_cast<uint>(MY_MIN(length, sizeof(m_str)));
+  if (m_length > 0)
+    memcpy(m_str, str, m_length);
+  m_charset= cs;
+}
+
+void PFS_variable_value_row::set_field(Field *f)
+{
+  PFS_engine_table::set_field_varchar(f, m_charset, m_str, m_length);
+}
+
+void PFS_user_variable_value_row::clear()
+{
+  my_free(m_value);
+  m_value= NULL;
+  m_value_length= 0;
+}
+
+void PFS_user_variable_value_row::make_row(const char* val, size_t length)
+{
+  /* Start empty; the row stays empty for length 0 or a failed allocation. */
+  m_value= NULL;
+  m_value_length= 0;
+  if (length == 0)
+    return;
+  m_value= (char*) my_malloc(PSI_NOT_INSTRUMENTED, length, MYF(0));
+  if (m_value != NULL) /* MYF(0): my_malloc() can return NULL, do not memcpy into it */
+  {
+    m_value_length= length;
+    memcpy(m_value, val, length);
   }
 }
 
diff --git a/storage/perfschema/table_helper.h b/storage/perfschema/table_helper.h
index 62f94826754..c7e4bf4dfb5 100644
--- a/storage/perfschema/table_helper.h
+++ b/storage/perfschema/table_helper.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -31,7 +31,7 @@
 #include "pfs_digest.h"
 
 /*
-  Write MD5 hash value in a string to be used 
+  Write MD5 hash value in a string to be used
   as DIGEST for the statement.
 */
 #define MD5_HASH_TO_STRING(_hash, _str)                    \
@@ -47,6 +47,10 @@
 struct PFS_host;
 struct PFS_user;
 struct PFS_account;
+struct PFS_object_name;
+struct PFS_program;
+class System_variable;
+class Status_variable;
 
 /**
   @file storage/perfschema/table_helper.h
@@ -69,7 +73,8 @@ struct PFS_instrument_view_constants
   static const uint VIEW_TABLE= 5;
   static const uint VIEW_SOCKET= 6;
   static const uint VIEW_IDLE= 7;
-  static const uint LAST_VIEW= 7;
+  static const uint VIEW_METADATA= 8;
+  static const uint LAST_VIEW= 8;
 };
 
 /** Namespace, internal views used within object summaries. */
@@ -77,12 +82,8 @@ struct PFS_object_view_constants
 {
   static const uint FIRST_VIEW= 1;
   static const uint VIEW_TABLE= 1;
-  static const uint LAST_VIEW= 1;
-
-  /* Future use */
-  static const uint VIEW_EVENT= 2;
-  static const uint VIEW_PROCEDURE= 3;
-  static const uint VIEW_FUNCTION= 4;
+  static const uint VIEW_PROGRAM= 2;
+  static const uint LAST_VIEW= 2;
 };
 
 /** Row fragment for column HOST. */
@@ -189,8 +190,11 @@ struct PFS_object_row
 
   /** Build a row from a memory buffer. */
   int make_row(PFS_table_share *pfs);
+  int make_row(PFS_program *pfs);
+  int make_row(const MDL_key *pfs);
   /** Set a table field from the row. */
   void set_field(uint index, Field *f);
+  void set_nullable_field(uint index, Field *f);
 };
 
 /** Row fragment for columns OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME, INDEX_NAME. */
@@ -203,7 +207,8 @@ struct PFS_index_row
   uint m_index_name_length;
 
   /** Build a row from a memory buffer. */
-  int make_row(PFS_table_share *pfs, uint table_index);
+  int make_row(PFS_table_share *pfs, PFS_table_share_index *pfs_index,
+               uint table_index);
   /** Set a table field from the row. */
   void set_field(uint index, Field *f);
 };
@@ -222,6 +227,15 @@ struct PFS_stat_row
   /** Column MAX_TIMER_WAIT. */
   ulonglong m_max;
 
+  inline void reset()
+  {
+    m_count= 0;
+    m_sum= 0;
+    m_min= 0;
+    m_avg= 0;
+    m_max= 0;
+  }
+
   /** Build a row with timer fields from a memory buffer. */
   inline void set(time_normalizer *normalizer, const PFS_single_stat *stat)
   {
@@ -264,7 +278,7 @@ struct PFS_stat_row
         PFS_engine_table::set_field_ulonglong(f, m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
     }
   }
 };
@@ -427,33 +441,106 @@ struct PFS_statement_stat_row
   /** Build a row from a memory buffer. */
   inline void set(time_normalizer *normalizer, const PFS_statement_stat *stat)
   {
+    if (stat->m_timer1_stat.m_count != 0)
+    {
+      m_timer1_row.set(normalizer, & stat->m_timer1_stat);
+
+      m_error_count= stat->m_error_count;
+      m_warning_count= stat->m_warning_count;
+      m_lock_time= stat->m_lock_time * MICROSEC_TO_PICOSEC;
+      m_rows_affected= stat->m_rows_affected;
+      m_rows_sent= stat->m_rows_sent;
+      m_rows_examined= stat->m_rows_examined;
+      m_created_tmp_disk_tables= stat->m_created_tmp_disk_tables;
+      m_created_tmp_tables= stat->m_created_tmp_tables;
+      m_select_full_join= stat->m_select_full_join;
+      m_select_full_range_join= stat->m_select_full_range_join;
+      m_select_range= stat->m_select_range;
+      m_select_range_check= stat->m_select_range_check;
+      m_select_scan= stat->m_select_scan;
+      m_sort_merge_passes= stat->m_sort_merge_passes;
+      m_sort_range= stat->m_sort_range;
+      m_sort_rows= stat->m_sort_rows;
+      m_sort_scan= stat->m_sort_scan;
+      m_no_index_used= stat->m_no_index_used;
+      m_no_good_index_used= stat->m_no_good_index_used;
+    }
+    else
+    {
+      m_timer1_row.reset();
+
+      m_error_count= 0;
+      m_warning_count= 0;
+      m_lock_time= 0;
+      m_rows_affected= 0;
+      m_rows_sent= 0;
+      m_rows_examined= 0;
+      m_created_tmp_disk_tables= 0;
+      m_created_tmp_tables= 0;
+      m_select_full_join= 0;
+      m_select_full_range_join= 0;
+      m_select_range= 0;
+      m_select_range_check= 0;
+      m_select_scan= 0;
+      m_sort_merge_passes= 0;
+      m_sort_range= 0;
+      m_sort_rows= 0;
+      m_sort_scan= 0;
+      m_no_index_used= 0;
+      m_no_good_index_used= 0;
+    }
+  }
+
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+/** Row fragment for stored program statistics. */
+struct PFS_sp_stat_row
+{
+  PFS_stat_row m_timer1_row;
+
+  /** Build a row from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_sp_stat *stat)
+  {
     m_timer1_row.set(normalizer, & stat->m_timer1_stat);
+  }
 
-    m_error_count= stat->m_error_count;
-    m_warning_count= stat->m_warning_count;
-    m_lock_time= stat->m_lock_time * MICROSEC_TO_PICOSEC;
-    m_rows_affected= stat->m_rows_affected;
-    m_rows_sent= stat->m_rows_sent;
-    m_rows_examined= stat->m_rows_examined;
-    m_created_tmp_disk_tables= stat->m_created_tmp_disk_tables;
-    m_created_tmp_tables= stat->m_created_tmp_tables;
-    m_select_full_join= stat->m_select_full_join;
-    m_select_full_range_join= stat->m_select_full_range_join;
-    m_select_range= stat->m_select_range;
-    m_select_range_check= stat->m_select_range_check;
-    m_select_scan= stat->m_select_scan;
-    m_sort_merge_passes= stat->m_sort_merge_passes;
-    m_sort_range= stat->m_sort_range;
-    m_sort_rows= stat->m_sort_rows;
-    m_sort_scan= stat->m_sort_scan;
-    m_no_index_used= stat->m_no_index_used;
-    m_no_good_index_used= stat->m_no_good_index_used;
+  /** Set a table field from the row. */
+  inline void set_field(uint index, Field *f)
+  {
+    m_timer1_row.set_field(index, f);
+  }
+};
+
+/** Row fragment for transaction statistics columns. */
+struct PFS_transaction_stat_row
+{
+  PFS_stat_row m_timer1_row;
+  PFS_stat_row m_read_write_row;
+  PFS_stat_row m_read_only_row;
+  ulonglong m_savepoint_count;
+  ulonglong m_rollback_to_savepoint_count;
+  ulonglong m_release_savepoint_count;
+
+  /** Build a row from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_transaction_stat *stat)
+  {
+    /* Combine read write/read only stats */
+    PFS_single_stat all;
+    all.aggregate(&stat->m_read_only_stat);
+    all.aggregate(&stat->m_read_write_stat);
+
+    m_timer1_row.set(normalizer, &all);
+    m_read_write_row.set(normalizer, &stat->m_read_write_stat);
+    m_read_only_row.set(normalizer, &stat->m_read_only_stat);
   }
 
   /** Set a table field from the row. */
   void set_field(uint index, Field *f);
 };
 
+/** Row fragment for connection statistics. */
 struct PFS_connection_stat_row
 {
   ulonglong m_current_connections;
@@ -470,6 +557,12 @@ struct PFS_connection_stat_row
 };
 
 void set_field_object_type(Field *f, enum_object_type object_type);
+void set_field_lock_type(Field *f, PFS_TL_LOCK_TYPE lock_type);
+void set_field_mdl_type(Field *f, opaque_mdl_type mdl_type, bool backup);
+void set_field_mdl_duration(Field *f, opaque_mdl_duration mdl_duration);
+void set_field_mdl_status(Field *f, opaque_mdl_status mdl_status);
+void set_field_isolation_level(Field *f, enum_isolation_level iso_level);
+void set_field_xa_state(Field *f, enum_xa_transaction_state xa_state);
 
 /** Row fragment for socket io statistics columns. */
 struct PFS_socket_io_stat_row
@@ -478,7 +571,7 @@ struct PFS_socket_io_stat_row
   PFS_byte_stat_row m_write;
   PFS_byte_stat_row m_misc;
   PFS_byte_stat_row m_all;
-  
+
   inline void set(time_normalizer *normalizer, const PFS_socket_io_stat *stat)
   {
     PFS_byte_stat all;
@@ -486,7 +579,7 @@ struct PFS_socket_io_stat_row
     m_read.set(normalizer, &stat->m_read);
     m_write.set(normalizer, &stat->m_write);
     m_misc.set(normalizer, &stat->m_misc);
-    
+
     /* Combine stats for all operations */
     all.aggregate(&stat->m_read);
     all.aggregate(&stat->m_write);
@@ -503,7 +596,7 @@ struct PFS_file_io_stat_row
   PFS_byte_stat_row m_write;
   PFS_byte_stat_row m_misc;
   PFS_byte_stat_row m_all;
-  
+
   inline void set(time_normalizer *normalizer, const PFS_file_io_stat *stat)
   {
     PFS_byte_stat all;
@@ -511,7 +604,7 @@ struct PFS_file_io_stat_row
     m_read.set(normalizer, &stat->m_read);
     m_write.set(normalizer, &stat->m_write);
     m_misc.set(normalizer, &stat->m_misc);
-    
+
     /* Combine stats for all operations */
     all.aggregate(&stat->m_read);
     all.aggregate(&stat->m_write);
@@ -521,6 +614,88 @@ struct PFS_file_io_stat_row
   }
 };
 
+/** Row fragment for memory statistics columns. */
+struct PFS_memory_stat_row
+{
+  PFS_memory_stat m_stat;
+
+  /** Build a row from a memory buffer. */
+  inline void set(const PFS_memory_stat *stat)
+  {
+    m_stat= *stat;
+  }
+
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+struct PFS_variable_name_row
+{
+public:
+  PFS_variable_name_row()
+  {
+    m_str[0]= '\0';
+    m_length= 0;
+  }
+
+  void make_row(const char* str, size_t length);
+
+  char m_str[NAME_CHAR_LEN+1];
+  uint m_length;
+};
+
+struct PFS_variable_value_row
+{
+public:
+  /** Set the row from a status variable. */
+  void make_row(const Status_variable *var);
+
+  /** Set the row from a system variable. */
+  void make_row(const System_variable *var);
+
+  /** Set a table field from the row. */
+  void set_field(Field *f);
+
+private:
+  void make_row(const CHARSET_INFO *cs, const char* str, size_t length);
+
+  char m_str[1024];
+  uint m_length;
+  const CHARSET_INFO *m_charset;
+};
+
+/** Row fragment that owns a private copy (see make_row/clear) of a user variable value. */
+struct PFS_user_variable_value_row
+{
+public:
+  PFS_user_variable_value_row() : m_value(NULL), m_value_length(0) {}
+
+  PFS_user_variable_value_row(const PFS_user_variable_value_row& rhs)
+  {
+    make_row(rhs.m_value, rhs.m_value_length);
+  }
+
+  /** Deep-copy assignment; the implicit one would share m_value and free it twice. */
+  PFS_user_variable_value_row& operator=(const PFS_user_variable_value_row& rhs)
+  {
+    if (this != &rhs)
+    { clear(); make_row(rhs.m_value, rhs.m_value_length); }
+    return *this;
+  }
+
+  ~PFS_user_variable_value_row() { clear(); }
+
+  /** Replace the members with a copy of val; does not release a previously held value. */
+  void make_row(const char* val, size_t length);
+  const char *get_value() const { return m_value; }
+  size_t get_value_length() const { return m_value_length; }
+  void clear();
+
+private:
+  char *m_value;
+  size_t m_value_length;
+};
+
 /** @} */
 
 #endif
diff --git a/storage/perfschema/table_host_cache.cc b/storage/perfschema/table_host_cache.cc
index 59ad3dd18a1..1c1cba60b0d 100644
--- a/storage/perfschema/table_host_cache.cc
+++ b/storage/perfschema/table_host_cache.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,9 +26,11 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_host_cache.h"
 #include "hostname.h"
+#include "field.h"
+#include "sql_class.h"
 
 THR_LOCK table_host_cache::m_table_lock;
 
@@ -37,11 +39,10 @@ table_host_cache::m_share=
 {
   { C_STRING_WITH_LEN("host_cache") },
   &pfs_truncatable_acl,
-  &table_host_cache::create,
+  table_host_cache::create,
   NULL, /* write_row */
   table_host_cache::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_host_cache::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE host_cache("
@@ -73,7 +74,8 @@ table_host_cache::m_share=
                       "FIRST_SEEN TIMESTAMP(0) NOT NULL default 0 comment 'Timestamp of the first connection attempt by the IP.',"
                       "LAST_SEEN TIMESTAMP(0) NOT NULL default 0 comment 'Timestamp of the most recent connection attempt by the IP.',"
                       "FIRST_ERROR_SEEN TIMESTAMP(0) null default 0 comment 'Timestamp of the first error seen from the IP.',"
-                      "LAST_ERROR_SEEN TIMESTAMP(0) null default 0 comment 'Timestamp of the most recent error seen from the IP.')") }
+                      "LAST_ERROR_SEEN TIMESTAMP(0) null default 0 comment 'Timestamp of the most recent error seen from the IP.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_host_cache::create(void)
@@ -82,7 +84,7 @@ PFS_engine_table* table_host_cache::create(void)
   if (t != NULL)
   {
     THD *thd= current_thd;
-    DBUG_ASSERT(thd != NULL);
+    assert(thd != NULL);
     t->materialize(thd);
   }
   return t;
@@ -100,6 +102,16 @@ table_host_cache::delete_all_rows(void)
   return 0;
 }
 
+/** Row count: the value of hostname_cache_size(), read under the host cache lock. */
+ha_rows table_host_cache::get_row_count(void)
+{
+  hostname_cache_lock();
+  /* Sample the size between lock and unlock, then report the sampled value. */
+  const ha_rows cached_entries= hostname_cache_size();
+  hostname_cache_unlock();
+  return cached_entries;
+}
+
 table_host_cache::table_host_cache()
   : PFS_engine_table(&m_share, &m_pos),
     m_all_rows(NULL), m_row_count(0),
@@ -115,8 +127,8 @@ void table_host_cache::materialize(THD *thd)
   row_host_cache *rows;
   row_host_cache *row;
 
-  DBUG_ASSERT(m_all_rows == NULL);
-  DBUG_ASSERT(m_row_count == 0);
+  assert(m_all_rows == NULL);
+  assert(m_row_count == 0);
 
   hostname_cache_lock();
 
@@ -229,7 +241,7 @@ int table_host_cache::rnd_next(void)
 int table_host_cache::rnd_pos(const void *pos)
 {
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < m_row_count);
+  assert(m_pos.m_index < m_row_count);
   m_row= &m_all_rows[m_pos.m_index];
   return 0;
 }
@@ -241,10 +253,10 @@ int table_host_cache::read_row_values(TABLE *table,
 {
   Field *f;
 
-  DBUG_ASSERT(m_row);
+  assert(m_row);
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -350,7 +362,7 @@ int table_host_cache::read_row_values(TABLE *table,
           f->set_null();
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_host_cache.h b/storage/perfschema/table_host_cache.h
index 8add0b5049c..5680a49675f 100644
--- a/storage/perfschema/table_host_cache.h
+++ b/storage/perfschema/table_host_cache.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -111,6 +111,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_hosts.cc b/storage/perfschema/table_hosts.cc
index b65b3889fef..7c6f6212bb8 100644
--- a/storage/perfschema/table_hosts.cc
+++ b/storage/perfschema/table_hosts.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,13 +21,16 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_hosts.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_account.h"
 #include "pfs_host.h"
 #include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_status.h"
+#include "field.h"
 
 THR_LOCK table_hosts::m_table_lock;
 
@@ -36,17 +39,17 @@ table_hosts::m_share=
 {
   { C_STRING_WITH_LEN("hosts") },
   &pfs_truncatable_acl,
-  &table_hosts::create,
+  table_hosts::create,
   NULL, /* write_row */
   table_hosts::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  cursor_by_host::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE hosts("
                       "HOST CHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") collate utf8_bin default null comment 'Host name used by the client to connect, NULL for internal threads or user sessions that failed to authenticate.',"
                       "CURRENT_CONNECTIONS bigint not null comment 'Current number of the host''s connections.',"
-                      "TOTAL_CONNECTIONS bigint not null comment 'Total number of the host''s connections')") }
+                      "TOTAL_CONNECTIONS bigint not null comment 'Total number of the host''s connections')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_hosts::create()
@@ -66,6 +69,15 @@ table_hosts::delete_all_rows(void)
   reset_events_statements_by_thread();
   reset_events_statements_by_account();
   reset_events_statements_by_host();
+  reset_events_transactions_by_thread();
+  reset_events_transactions_by_account();
+  reset_events_transactions_by_host();
+  reset_memory_by_thread();
+  reset_memory_by_account();
+  reset_memory_by_host();
+  reset_status_by_thread();
+  reset_status_by_account();
+  reset_status_by_host();
   purge_all_account();
   purge_all_host();
   return 0;
@@ -78,7 +90,7 @@ table_hosts::table_hosts()
 
 void table_hosts::make_row(PFS_host *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
   pfs->m_lock.begin_optimistic_lock(&lock);
@@ -87,7 +99,11 @@ void table_hosts::make_row(PFS_host *pfs)
     return;
 
   PFS_connection_stat_visitor visitor;
-  PFS_connection_iterator::visit_host(pfs, true, true, & visitor);
+  PFS_connection_iterator::visit_host(pfs,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! pfs->m_lock.end_optimistic_lock(& lock))
     return;
@@ -107,7 +123,7 @@ int table_hosts::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -124,7 +140,7 @@ int table_hosts::read_row_values(TABLE *table,
         m_row.m_connection_stat.set_field(f->field_index - 1, f);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_hosts.h b/storage/perfschema/table_hosts.h
index 422b6449b25..51bc7f2f2a7 100644
--- a/storage/perfschema/table_hosts.h
+++ b/storage/perfschema/table_hosts.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_md_locks.cc b/storage/perfschema/table_md_locks.cc
new file mode 100644
index 00000000000..d7e3423d9bc
--- /dev/null
+++ b/storage/perfschema/table_md_locks.cc
@@ -0,0 +1,207 @@
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_md_locks.cc
+  Table METADATA_LOCKS (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_md_locks.h"
+#include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_metadata_locks::m_table_lock;
+
+PFS_engine_table_share
+table_metadata_locks::m_share=
+{
+  { C_STRING_WITH_LEN("metadata_locks") },
+  &pfs_readonly_acl,
+  table_metadata_locks::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_metadata_locks::get_row_count,
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE metadata_locks("
+  "OBJECT_TYPE VARCHAR(64) not null comment 'Object type. One of BACKUP, COMMIT, EVENT, FUNCTION, GLOBAL, LOCKING SERVICE, PROCEDURE, SCHEMA, TABLE, TABLESPACE, TRIGGER (unused) or USER LEVEL LOCK.',"
+  "OBJECT_SCHEMA VARCHAR(64) comment 'Object schema.',"
+  "OBJECT_NAME VARCHAR(64) comment 'Object name.',"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the instrumented object.',"
+  "LOCK_TYPE VARCHAR(32) not null comment 'Lock type. One of BACKUP_FTWRL1, BACKUP_START, BACKUP_TRANS_DML, EXCLUSIVE, INTENTION_EXCLUSIVE, SHARED, SHARED_HIGH_PRIO, SHARED_NO_READ_WRITE, SHARED_NO_WRITE, SHARED_READ, SHARED_UPGRADABLE or SHARED_WRITE.',"
+  "LOCK_DURATION VARCHAR(32) not null comment 'Lock duration. One of EXPLICIT (locks released by explicit action, for example a global lock acquired with FLUSH TABLES WITH READ LOCK) , STATEMENT (locks implicitly released at statement end) or TRANSACTION (locks implicitly released at transaction end).',"
+  "LOCK_STATUS VARCHAR(32) not null comment 'Lock status. One of GRANTED, KILLED, PENDING, POST_RELEASE_NOTIFY, PRE_ACQUIRE_NOTIFY, TIMEOUT or VICTIM.',"
+  "SOURCE VARCHAR(64) comment 'Source file containing the instrumented code that produced the event, as well as the line number where the instrumentation occurred. This allows one to examine the source code involved.',"
+  "OWNER_THREAD_ID BIGINT unsigned comment 'Thread that requested the lock.',"
+  "OWNER_EVENT_ID BIGINT unsigned comment 'Event that requested the lock.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_metadata_locks::create(void)
+{
+  return new table_metadata_locks();
+}
+
+ha_rows
+table_metadata_locks::get_row_count(void)
+{
+  return global_mdl_container.get_row_count();
+}
+
+table_metadata_locks::table_metadata_locks()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_metadata_locks::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_metadata_locks::rnd_next(void)
+{
+  PFS_metadata_lock *pfs;
+
+  m_pos.set_at(&m_next_pos); /* continue from the position left by the previous call */
+  PFS_mdl_iterator it= global_mdl_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index); /* presumably stores the index of the record found in m_pos */
+  if (pfs != NULL)
+  {
+    make_row(pfs); /* may leave m_row_exists false; read_row_values() then returns HA_ERR_RECORD_DELETED */
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE; /* the scan found no further record */
+}
+
+int table_metadata_locks::rnd_pos(const void *pos)
+{
+  PFS_metadata_lock *pfs;
+
+  set_position(pos); /* presumably copies the saved position into m_pos -- confirm in PFS_engine_table */
+
+  pfs= global_mdl_container.get(m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs); /* may leave m_row_exists false; read_row_values() then returns HA_ERR_RECORD_DELETED */
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED; /* the container has no record at that index */
+}
+
+void table_metadata_locks::make_row(PFS_metadata_lock *pfs)
+{
+  pfs_optimistic_state lock;
+
+  m_row_exists= false; /* only set back to true once the whole copy has been validated */
+
+  /* Protect this reader against a metadata lock destroy */
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_identity= pfs->m_identity;
+  m_row.m_mdl_type= pfs->m_mdl_type;
+  m_row.m_mdl_duration= pfs->m_mdl_duration;
+  m_row.m_mdl_status= pfs->m_mdl_status;
+
+  /* Disable source file and line to avoid stale __FILE__ pointers. */
+  m_row.m_source_length= 0; /* m_row.m_source itself is left unwritten here */
+
+  m_row.m_owner_thread_id= static_cast<ulong>(pfs->m_owner_thread_id);
+  m_row.m_owner_event_id= static_cast<ulong>(pfs->m_owner_event_id);
+
+  if (m_row.m_object.make_row(& pfs->m_mdl_key))
+    return; /* object columns could not be built: m_row_exists stays false */
+
+  if (pfs->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= true; /* otherwise m_row_exists stays false and the copy is not exposed */
+}
+
+int table_metadata_locks::read_row_values(TABLE *table,
+                                          unsigned char *buf,
+                                          Field **fields,
+                                          bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED; /* make_row() did not produce a validated row */
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+      case 1: /* OBJECT_SCHEMA */
+      case 2: /* OBJECT_NAME */
+        m_row.m_object.set_nullable_field(f->field_index, f);
+        break;
+      case 3: /* OBJECT_INSTANCE_BEGIN */
+        set_field_ulonglong(f, (intptr) m_row.m_identity);
+        break;
+      case 4: /* LOCK_TYPE */
+        set_field_mdl_type(f, m_row.m_mdl_type, m_row.m_object.m_object_type == OBJECT_TYPE_BACKUP);
+        break;
+      case 5: /* LOCK_DURATION */
+        set_field_mdl_duration(f, m_row.m_mdl_duration);
+        break;
+      case 6: /* LOCK_STATUS */
+        set_field_mdl_status(f, m_row.m_mdl_status);
+        break;
+      case 7: /* SOURCE */
+        set_field_varchar_utf8(f, m_row.m_source, m_row.m_source_length); /* length is always 0, see make_row() */
+        break;
+      case 8: /* OWNER_THREAD_ID */
+        if (m_row.m_owner_thread_id != 0)
+          set_field_ulonglong(f, m_row.m_owner_thread_id);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      case 9: /* OWNER_EVENT_ID */
+        if (m_row.m_owner_event_id != 0)
+          set_field_ulonglong(f, m_row.m_owner_event_id);
+        else
+          f->set_null(); /* 0 is reported as NULL */
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_md_locks.h b/storage/perfschema/table_md_locks.h
new file mode 100644
index 00000000000..c6a4a2eca8d
--- /dev/null
+++ b/storage/perfschema/table_md_locks.h
@@ -0,0 +1,106 @@
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_METADATA_LOCK_H
+#define TABLE_METADATA_LOCK_H
+
+/**
+  @file storage/perfschema/table_md_locks.h
+  Table METADATA_LOCKS (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "table_helper.h"
+
+struct PFS_metadata_lock;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of table PERFORMANCE_SCHEMA.METADATA_LOCKS. */
+struct row_metadata_lock
+{
+  /** Column OBJECT_INSTANCE_BEGIN. */
+  const void *m_identity;
+  opaque_mdl_type m_mdl_type; /**< Column LOCK_TYPE. */
+  opaque_mdl_duration m_mdl_duration; /**< Column LOCK_DURATION. */
+  opaque_mdl_status m_mdl_status; /**< Column LOCK_STATUS. */
+  /** Column SOURCE. */
+  char m_source[COL_SOURCE_SIZE];
+  /** Length in bytes of @c m_source. */
+  uint m_source_length;
+  /** Column OWNER_THREAD_ID. */
+  ulong m_owner_thread_id;
+  /** Column OWNER_EVENT_ID. */
+  ulong m_owner_event_id;
+  /** Columns OBJECT_TYPE, OBJECT_SCHEMA, OBJECT_NAME. */
+  PFS_object_row m_object;
+};
+
+/** Table PERFORMANCE_SCHEMA.METADATA_LOCKS. */
+class table_metadata_locks : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_metadata_locks();
+
+public:
+  ~table_metadata_locks()
+  {}
+
+private:
+  void make_row(PFS_metadata_lock *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_metadata_lock m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_mems_by_account_by_event_name.cc b/storage/perfschema/table_mems_by_account_by_event_name.cc
new file mode 100644
index 00000000000..c4217641f6a
--- /dev/null
+++ b/storage/perfschema/table_mems_by_account_by_event_name.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_mems_by_account_by_event_name.cc
+  Table MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_mems_by_account_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_mems_by_account_by_event_name::m_table_lock;
+
+PFS_engine_table_share
+table_mems_by_account_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("memory_summary_by_account_by_event_name") },
+  &pfs_readonly_acl,
+  table_mems_by_account_by_event_name::create,
+  NULL, /* write_row */
+  table_mems_by_account_by_event_name::delete_all_rows,
+  table_mems_by_account_by_event_name::get_row_count,
+  sizeof(pos_mems_by_account_by_event_name),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE memory_summary_by_account_by_event_name("
+  "USER CHAR(32) collate utf8_bin default null comment 'User portion of the account.',"
+  "HOST CHAR(60) collate utf8_bin default null comment 'Host portion of the account.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name.',"
+  "COUNT_ALLOC BIGINT unsigned not null comment 'Total number of allocations to memory.',"
+  "COUNT_FREE BIGINT unsigned not null comment 'Total number of attempts to free the allocated memory.',"
+  "SUM_NUMBER_OF_BYTES_ALLOC BIGINT unsigned not null comment 'Total number of bytes allocated.',"
+  "SUM_NUMBER_OF_BYTES_FREE BIGINT unsigned not null comment 'Total number of bytes freed',"
+  "LOW_COUNT_USED BIGINT not null comment 'Lowest number of allocated blocks (lowest value of CURRENT_COUNT_USED).',"
+  "CURRENT_COUNT_USED BIGINT not null comment 'Currently allocated blocks that have not been freed (COUNT_ALLOC minus COUNT_FREE).',"
+  "HIGH_COUNT_USED BIGINT not null comment 'Highest number of allocated blocks (highest value of CURRENT_COUNT_USED).',"
+  "LOW_NUMBER_OF_BYTES_USED BIGINT not null comment 'Lowest number of bytes used.',"
+  "CURRENT_NUMBER_OF_BYTES_USED BIGINT not null comment 'Current number of bytes used (total allocated minus total freed).',"
+  "HIGH_NUMBER_OF_BYTES_USED BIGINT not null comment 'Highest number of bytes used.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_mems_by_account_by_event_name::create(void)
+{
+  return new table_mems_by_account_by_event_name();
+}
+
+int
+table_mems_by_account_by_event_name::delete_all_rows(void)
+{
+  reset_memory_by_thread(); /* make_row() also visits threads, so thread stats are reset as well */
+  reset_memory_by_account();
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_mems_by_account_by_event_name::get_row_count(void)
+{
+  return global_account_container.get_row_count() * memory_class_max;
+}
+
+table_mems_by_account_by_event_name::table_mems_by_account_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_mems_by_account_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_mems_by_account_by_event_name::rnd_next(void)
+{
+  PFS_account *account;
+  PFS_memory_class *memory_class;
+  bool has_more_account= true;
+
+  for (m_pos.set_at(&m_next_pos); /* continue from the position left by the previous call */
+       has_more_account; /* updated by the container lookup below */
+       m_pos.next_account()) /* next account, class index back to 1 */
+  {
+    account= global_account_container.get(m_pos.m_index_1, & has_more_account);
+    if (account != NULL)
+    {
+      do
+      {
+        memory_class= find_memory_class(m_pos.m_index_2);
+        if (memory_class != NULL)
+        {
+          if (! memory_class->is_global())
+          {
+            make_row(account, memory_class); /* may leave m_row_exists false */
+            m_next_pos.set_after(&m_pos);
+            return 0;
+          }
+
+          m_pos.next_class(); /* classes flagged is_global() produce no row in this table */
+        }
+      }
+      while (memory_class != NULL); /* NULL: no memory class at this index, move to the next account */
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_mems_by_account_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_account *account;
+  PFS_memory_class *memory_class;
+
+  set_position(pos); /* presumably copies the saved position into m_pos -- confirm in PFS_engine_table */
+
+  account= global_account_container.get(m_pos.m_index_1);
+  if (account != NULL)
+  {
+    memory_class= find_memory_class(m_pos.m_index_2);
+    if (memory_class != NULL)
+    {
+      if (! memory_class->is_global()) /* same filter as rnd_next() */
+      {
+        make_row(account, memory_class); /* may leave m_row_exists false */
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED; /* account or class missing, or class is global */
+}
+
+void table_mems_by_account_by_event_name
+::make_row(PFS_account *account, PFS_memory_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false; /* only set to true once the read below has been validated */
+
+  account->m_lock.begin_optimistic_lock(&lock); /* checked by end_optimistic_lock() further down */
+
+  if (m_row.m_account.make_row(account))
+    return; /* USER/HOST columns could not be built: no row */
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_memory_visitor visitor(klass);
+  PFS_connection_iterator::visit_account(account,
+                                         true,  /* threads */
+                                         false, /* THDs */
+                                         & visitor);
+
+  if (! account->m_lock.end_optimistic_lock(&lock))
+    return; /* validation failed: m_row_exists stays false */
+
+  m_row_exists= true;
+  m_row.m_stat.set(& visitor.m_stat); /* publish the statistics collected by the visitor */
+}
+
+int table_mems_by_account_by_event_name::read_row_values(TABLE *table,
+                                                    unsigned char *buf,
+                                                    Field **fields,
+                                                    bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED; /* make_row() did not produce a validated row */
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 3, ... HIGH_NUMBER_OF_BYTES_USED */
+        m_row.m_stat.set_field(f->field_index - 3, f); /* stat index is relative to column 3 (COUNT_ALLOC) */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_mems_by_account_by_event_name.h b/storage/perfschema/table_mems_by_account_by_event_name.h
new file mode 100644
index 00000000000..9d43c4cf404
--- /dev/null
+++ b/storage/perfschema/table_mems_by_account_by_event_name.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME_H
+#define TABLE_MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_mems_by_account_by_event_name.h
+  Table MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_account.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME. */
+struct row_mems_by_account_by_event_name
+{
+  /** Column USER, HOST. */
+  PFS_account_row m_account;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_ALLOC, ... */
+  PFS_memory_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+  Index 1 on account (0 based)
+  Index 2 on memory class (1 based)
+*/
+struct pos_mems_by_account_by_event_name
+: public PFS_double_index
+{
+  pos_mems_by_account_by_event_name()
+    : PFS_double_index(0, 1) /* first account, first memory class */
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline void next_account(void)
+  {
+    m_index_1++;
+    m_index_2= 1; /* restart at the first memory class for the new account */
+  }
+
+  inline void next_class(void)
+  {
+    m_index_2++;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME. */
+class table_mems_by_account_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_mems_by_account_by_event_name(); /* private: instances are allocated by create() */
+
+public:
+  ~table_mems_by_account_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_account *account, PFS_memory_class *klass);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_mems_by_account_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_mems_by_account_by_event_name m_pos;
+  /** Next position. */
+  pos_mems_by_account_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_mems_by_host_by_event_name.cc b/storage/perfschema/table_mems_by_host_by_event_name.cc
new file mode 100644
index 00000000000..b86fe70da28
--- /dev/null
+++ b/storage/perfschema/table_mems_by_host_by_event_name.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_mems_by_host_by_event_name.cc
+  Table MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_mems_by_host_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_mems_by_host_by_event_name::m_table_lock;
+
+PFS_engine_table_share
+table_mems_by_host_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("memory_summary_by_host_by_event_name") },
+  &pfs_readonly_acl,
+  table_mems_by_host_by_event_name::create,
+  NULL, /* write_row */
+  table_mems_by_host_by_event_name::delete_all_rows,
+  table_mems_by_host_by_event_name::get_row_count,
+  sizeof(pos_mems_by_host_by_event_name), /* position size: m_pos is a two-level (host, memory class) index */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE memory_summary_by_host_by_event_name("
+  "HOST CHAR(60) collate utf8_bin default null comment 'Host portion of the account.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name.',"
+  "COUNT_ALLOC BIGINT unsigned not null comment 'Total number of allocations to memory.',"
+  "COUNT_FREE BIGINT unsigned not null comment 'Total number of attempts to free the allocated memory.',"
+  "SUM_NUMBER_OF_BYTES_ALLOC BIGINT unsigned not null comment 'Total number of bytes allocated.',"
+  "SUM_NUMBER_OF_BYTES_FREE BIGINT unsigned not null comment 'Total number of bytes freed',"
+  "LOW_COUNT_USED BIGINT not null comment 'Lowest number of allocated blocks (lowest value of CURRENT_COUNT_USED).',"
+  "CURRENT_COUNT_USED BIGINT not null comment 'Currently allocated blocks that have not been freed (COUNT_ALLOC minus COUNT_FREE).',"
+  "HIGH_COUNT_USED BIGINT not null comment 'Highest number of allocated blocks (highest value of CURRENT_COUNT_USED).',"
+  "LOW_NUMBER_OF_BYTES_USED BIGINT not null comment 'Lowest number of bytes used.',"
+  "CURRENT_NUMBER_OF_BYTES_USED BIGINT not null comment 'Current number of bytes used (total allocated minus total freed).',"
+  "HIGH_NUMBER_OF_BYTES_USED BIGINT not null comment 'Highest number of bytes used.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_mems_by_host_by_event_name::create(void)
+{
+  return new table_mems_by_host_by_event_name();
+}
+
+int
+table_mems_by_host_by_event_name::delete_all_rows(void)
+{
+  reset_memory_by_thread(); /* make_row() also visits threads and accounts, so both are reset as well */
+  reset_memory_by_account();
+  reset_memory_by_host();
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_mems_by_host_by_event_name::get_row_count(void)
+{
+  return global_host_container.get_row_count() * memory_class_max;
+}
+
+table_mems_by_host_by_event_name::table_mems_by_host_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_mems_by_host_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_mems_by_host_by_event_name::rnd_next(void)
+{
+  PFS_host *host;
+  PFS_memory_class *memory_class;
+  bool has_more_host= true;
+
+  for (m_pos.set_at(&m_next_pos); /* continue from the position left by the previous call */
+       has_more_host; /* updated by the container lookup below */
+       m_pos.next_host()) /* next host, class index back to 1 */
+  {
+    host= global_host_container.get(m_pos.m_index_1, & has_more_host);
+    if (host != NULL)
+    {
+      do
+      {
+        memory_class= find_memory_class(m_pos.m_index_2);
+        if (memory_class != NULL)
+        {
+          if (! memory_class->is_global())
+          {
+            make_row(host, memory_class); /* may leave m_row_exists false */
+            m_next_pos.set_after(&m_pos);
+            return 0;
+          }
+
+          m_pos.next_class(); /* classes flagged is_global() produce no row in this table */
+        }
+      }
+      while (memory_class != NULL); /* NULL: no memory class at this index, move to the next host */
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_mems_by_host_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_host *host;
+  PFS_memory_class *memory_class;
+
+  set_position(pos); /* presumably copies the saved position into m_pos -- confirm in PFS_engine_table */
+
+  host= global_host_container.get(m_pos.m_index_1);
+  if (host != NULL)
+  {
+    memory_class= find_memory_class(m_pos.m_index_2);
+    if (memory_class != NULL)
+    {
+      if (! memory_class->is_global()) /* same filter as rnd_next() */
+      {
+        make_row(host, memory_class); /* may leave m_row_exists false */
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED; /* host or class missing, or class is global */
+}
+
+void table_mems_by_host_by_event_name
+::make_row(PFS_host *host, PFS_memory_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false; /* only set to true once the read below has been validated */
+
+  host->m_lock.begin_optimistic_lock(&lock); /* checked by end_optimistic_lock() further down */
+
+  if (m_row.m_host.make_row(host))
+    return; /* HOST column could not be built: no row */
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_memory_visitor visitor(klass);
+  PFS_connection_iterator::visit_host(host,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
+
+  if (! host->m_lock.end_optimistic_lock(&lock))
+    return; /* validation failed: m_row_exists stays false */
+
+  m_row_exists= true;
+  m_row.m_stat.set(& visitor.m_stat); /* publish the statistics collected by the visitor */
+}
+
+int table_mems_by_host_by_event_name::read_row_values(TABLE *table,
+                                                    unsigned char *buf,
+                                                    Field **fields,
+                                                    bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED; /* make_row() did not produce a validated row */
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... HIGH_NUMBER_OF_BYTES_USED */
+        m_row.m_stat.set_field(f->field_index - 2, f); /* stat index is relative to column 2 (COUNT_ALLOC) */
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_mems_by_host_by_event_name.h b/storage/perfschema/table_mems_by_host_by_event_name.h
new file mode 100644
index 00000000000..a4fdde24fbd
--- /dev/null
+++ b/storage/perfschema/table_mems_by_host_by_event_name.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME_H
+#define TABLE_MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_mems_by_host_by_event_name.h
+  Table MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME. */
+struct row_mems_by_host_by_event_name
+{
+  /** Column HOST */
+  PFS_host_row m_host;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_ALLOC, ... */
+  PFS_memory_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME.
+  Index 1 on host (0 based)
+  Index 2 on memory class (1 based)
+*/
+struct pos_mems_by_host_by_event_name
+: public PFS_double_index
+{
+  pos_mems_by_host_by_event_name()
+    : PFS_double_index(0, 1) /* first host, first memory class */
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline void next_host(void)
+  {
+    m_index_1++;
+    m_index_2= 1; /* restart at the first memory class for the new host */
+  }
+
+  inline void next_class(void)
+  {
+    m_index_2++;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_HOST_BY_EVENT_NAME. */
+class table_mems_by_host_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_mems_by_host_by_event_name(); /* private: instances are allocated by create() */
+
+public:
+  ~table_mems_by_host_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_host *host, PFS_memory_class *klass);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_mems_by_host_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_mems_by_host_by_event_name m_pos;
+  /** Next position. */
+  pos_mems_by_host_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_mems_by_thread_by_event_name.cc b/storage/perfschema/table_mems_by_thread_by_event_name.cc
new file mode 100644
index 00000000000..8c79f8cc8d8
--- /dev/null
+++ b/storage/perfschema/table_mems_by_thread_by_event_name.cc
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_mems_by_thread_by_event_name.cc
+  Table MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_mems_by_thread_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_mems_by_thread_by_event_name::m_table_lock;
+/* Table share. The position size must cover both indexes held in m_pos. */
+PFS_engine_table_share
+table_mems_by_thread_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("memory_summary_by_thread_by_event_name") },
+  &pfs_readonly_acl,
+  table_mems_by_thread_by_event_name::create,
+  NULL, /* write_row */
+  table_mems_by_thread_by_event_name::delete_all_rows,
+  table_mems_by_thread_by_event_name::get_row_count,
+  sizeof(pos_mems_by_thread_by_event_name),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE memory_summary_by_thread_by_event_name("
+  "THREAD_ID BIGINT unsigned not null comment 'Thread id.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name.',"
+  "COUNT_ALLOC BIGINT unsigned not null comment 'Total number of allocations to memory.',"
+  "COUNT_FREE BIGINT unsigned not null comment 'Total number of attempts to free the allocated memory.',"
+  "SUM_NUMBER_OF_BYTES_ALLOC BIGINT unsigned not null comment 'Total number of bytes allocated.',"
+  "SUM_NUMBER_OF_BYTES_FREE BIGINT unsigned not null comment 'Total number of bytes freed',"
+  "LOW_COUNT_USED BIGINT not null comment 'Lowest number of allocated blocks (lowest value of CURRENT_COUNT_USED).',"
+  "CURRENT_COUNT_USED BIGINT not null comment 'Currently allocated blocks that have not been freed (COUNT_ALLOC minus COUNT_FREE).',"
+  "HIGH_COUNT_USED BIGINT not null comment 'Highest number of allocated blocks (highest value of CURRENT_COUNT_USED).',"
+  "LOW_NUMBER_OF_BYTES_USED BIGINT not null comment 'Lowest number of bytes used.',"
+  "CURRENT_NUMBER_OF_BYTES_USED BIGINT not null comment 'Current number of bytes used (total allocated minus total freed).',"
+  "HIGH_NUMBER_OF_BYTES_USED BIGINT not null comment 'Highest number of bytes used.')")},
+  false  /* perpetual */
+};
+
+/** Allocate a new cursor object for this table
+    (the function installed as the create entry of m_share). */
+PFS_engine_table* table_mems_by_thread_by_event_name::create(void)
+{ return new table_mems_by_thread_by_event_name(); }
+
+/** Reset the per-thread memory statistics; always returns 0. */
+int table_mems_by_thread_by_event_name::delete_all_rows(void)
+{
+  reset_memory_by_thread();
+  return 0;
+}
+
+/** Row count reported for the table: thread rows times memory_class_max. */
+ha_rows table_mems_by_thread_by_event_name::get_row_count(void)
+{
+  return global_thread_container.get_row_count() * memory_class_max;
+}
+
+table_mems_by_thread_by_event_name::table_mems_by_thread_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos), /* base gets the share and &m_pos */
+    m_row_exists(false), m_pos(), m_next_pos() /* no row built yet */
+{}
+
+/**
+  Rewind both the current and the next scan position to their start values.
+*/
+void table_mems_by_thread_by_event_name::reset_position(void)
+{ m_pos.reset(); m_next_pos.reset(); }
+
+/** Advance the scan to the next thread / non-global memory class pair.
+    @return 0 after make_row() ran for it, HA_ERR_END_OF_FILE when exhausted. */
+int table_mems_by_thread_by_event_name::rnd_next(void)
+{
+  bool more_threads= true;
+
+  m_pos.set_at(&m_next_pos);
+  while (more_threads)
+  {
+    PFS_thread *pfs_thread=
+      global_thread_container.get(m_pos.m_index_1, & more_threads);
+
+    if (pfs_thread != NULL)
+    {
+      PFS_memory_class *klass;
+      /* Try each class index until find_memory_class() returns NULL. */
+      for (klass= find_memory_class(m_pos.m_index_2);
+           klass != NULL;
+           klass= find_memory_class(m_pos.m_index_2))
+      {
+        if (! klass->is_global())
+        {
+          make_row(pfs_thread, klass);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+        m_pos.next_class();
+      }
+    }
+    m_pos.next_thread();
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+/**
+  Position the cursor on @c pos and rebuild that row.
+  @return 0 once make_row() was called, HA_ERR_RECORD_DELETED otherwise.
+*/
+int table_mems_by_thread_by_event_name::rnd_pos(const void *pos)
+{
+  set_position(pos);
+
+  PFS_thread *pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  if (pfs_thread == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  PFS_memory_class *klass= find_memory_class(m_pos.m_index_2);
+  if (klass == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  /* Global classes have no row in this table (rnd_next() skips them too). */
+  if (klass->is_global())
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(pfs_thread, klass);
+  return 0;
+}
+
+void table_mems_by_thread_by_event_name
+::make_row(PFS_thread *thread, PFS_memory_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  /* Build m_row for (thread, klass); m_row_exists is set only at the end. */
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+
+  m_row.m_event_name.make_row(klass);
+  /* Visit this thread only; the visitor's m_stat is read back below. */
+  PFS_connection_memory_visitor visitor(klass);
+  PFS_connection_iterator::visit_thread(thread, & visitor);
+  /* If the check fails m_row_exists stays false (row reported as deleted). */
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(& visitor.m_stat);
+}
+
+/** Copy the current row into the requested fields.
+    @return 0, or HA_ERR_RECORD_DELETED when m_row_exists is false. */
+int table_mems_by_thread_by_event_name::read_row_values(TABLE *table,
+                                                    unsigned char *,
+                                                    Field **fields,
+                                                    bool read_all)
+{
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Every column is declared not null: no null-bits byte to initialize. */
+  assert(table->s->null_bytes == 0);
+
+  for (Field *field= *fields; field != NULL; field= *++fields)
+  {
+    /* Unless read_all is set, skip the fields absent from the read set. */
+    if (! read_all && ! bitmap_is_set(table->read_set, field->field_index))
+      continue;
+    switch(field->field_index)
+    {
+    case 0: /* THREAD_ID */
+      set_field_ulonglong(field, m_row.m_thread_internal_id);
+      break;
+    case 1: /* EVENT_NAME */
+      m_row.m_event_name.set_field(field);
+      break;
+    default: /* 2, ... HIGH_NUMBER_OF_BYTES_USED */
+      m_row.m_stat.set_field(field->field_index - 2, field);
+      break;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_mems_by_thread_by_event_name.h b/storage/perfschema/table_mems_by_thread_by_event_name.h
new file mode 100644
index 00000000000..b89172669e6
--- /dev/null
+++ b/storage/perfschema/table_mems_by_thread_by_event_name.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME_H
+#define TABLE_MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_mems_by_thread_by_event_name.h
+  Table MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
+struct row_mems_by_thread_by_event_name
+{
+  /** Column THREAD_ID (the thread's internal id). */
+  ulonglong m_thread_internal_id;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_ALLOC, ... HIGH_NUMBER_OF_BYTES_USED. */
+  PFS_memory_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+  Index 1 on thread (0 based).
+  Index 2 on memory class (1 based).
+*/
+struct pos_mems_by_thread_by_event_name
+: public PFS_double_index
+{
+  pos_mems_by_thread_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  /* Rewind to thread 0, memory class 1 (the same values the constructor uses). */
+  inline void reset(void)
+  { m_index_1= 0; m_index_2= 1; }
+
+  /* Move to the next thread and restart the class index at 1. */
+  inline void next_thread(void)
+  {
+    ++m_index_1;
+    m_index_2= 1;
+  }
+
+  /* Move to the next memory class, keeping the current thread. */
+  inline void next_class(void)
+  {
+    ++m_index_2;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
+class table_mems_by_thread_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_mems_by_thread_by_event_name();
+
+public:
+  ~table_mems_by_thread_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_thread *thread, PFS_memory_class *klass);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_mems_by_thread_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position (thread index, memory class index). */
+  pos_mems_by_thread_by_event_name m_pos;
+  /** Next position: where the next rnd_next() call starts scanning. */
+  pos_mems_by_thread_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_mems_by_user_by_event_name.cc b/storage/perfschema/table_mems_by_user_by_event_name.cc
new file mode 100644
index 00000000000..4bae383166a
--- /dev/null
+++ b/storage/perfschema/table_mems_by_user_by_event_name.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_mems_by_user_by_event_name.cc
+  Table MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_mems_by_user_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_mems_by_user_by_event_name::m_table_lock;
+/* Table share. The position size must cover both indexes held in m_pos. */
+PFS_engine_table_share
+table_mems_by_user_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("memory_summary_by_user_by_event_name") },
+  &pfs_readonly_acl,
+  table_mems_by_user_by_event_name::create,
+  NULL, /* write_row */
+  table_mems_by_user_by_event_name::delete_all_rows,
+  table_mems_by_user_by_event_name::get_row_count,
+  sizeof(pos_mems_by_user_by_event_name),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE memory_summary_by_user_by_event_name("
+  "USER CHAR(32) collate utf8_bin default null comment 'User portion of the account.',"
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name.',"
+  "COUNT_ALLOC BIGINT unsigned not null comment 'Total number of allocations to memory.',"
+  "COUNT_FREE BIGINT unsigned not null comment 'Total number of attempts to free the allocated memory.',"
+  "SUM_NUMBER_OF_BYTES_ALLOC BIGINT unsigned not null comment 'Total number of bytes allocated.',"
+  "SUM_NUMBER_OF_BYTES_FREE BIGINT unsigned not null comment 'Total number of bytes freed',"
+  "LOW_COUNT_USED BIGINT not null comment 'Lowest number of allocated blocks (lowest value of CURRENT_COUNT_USED).',"
+  "CURRENT_COUNT_USED BIGINT not null comment 'Currently allocated blocks that have not been freed (COUNT_ALLOC minus COUNT_FREE).',"
+  "HIGH_COUNT_USED BIGINT not null comment 'Highest number of allocated blocks (highest value of CURRENT_COUNT_USED).',"
+  "LOW_NUMBER_OF_BYTES_USED BIGINT not null comment 'Lowest number of bytes used.',"
+  "CURRENT_NUMBER_OF_BYTES_USED BIGINT not null comment 'Current number of bytes used (total allocated minus total freed).',"
+  "HIGH_NUMBER_OF_BYTES_USED BIGINT not null comment 'Highest number of bytes used.')")},
+  false  /* perpetual */
+};
+
+/** Allocate a new cursor object for this table
+    (the function installed as the create entry of m_share). */
+PFS_engine_table* table_mems_by_user_by_event_name::create(void)
+{ return new table_mems_by_user_by_event_name(); }
+
+/** Reset the per-thread, per-account and per-user memory statistics. */
+int table_mems_by_user_by_event_name::delete_all_rows(void)
+{
+  reset_memory_by_thread();
+  reset_memory_by_account();
+  reset_memory_by_user();
+  return 0;
+}
+
+/** Row count reported for the table: user rows times memory_class_max. */
+ha_rows table_mems_by_user_by_event_name::get_row_count(void)
+{
+  return global_user_container.get_row_count() * memory_class_max;
+}
+
+table_mems_by_user_by_event_name::table_mems_by_user_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos), /* base gets the share and &m_pos */
+    m_row_exists(false), m_pos(), m_next_pos() /* no row built yet */
+{}
+
+/**
+  Rewind both the current and the next scan position to their start values.
+*/
+void table_mems_by_user_by_event_name::reset_position(void)
+{ m_pos.reset(); m_next_pos.reset(); }
+
+/** Advance the scan to the next user / non-global memory class pair.
+    @return 0 after make_row() ran for it, HA_ERR_END_OF_FILE when exhausted. */
+int table_mems_by_user_by_event_name::rnd_next(void)
+{
+  bool more_users= true;
+
+  m_pos.set_at(&m_next_pos);
+  while (more_users)
+  {
+    PFS_user *pfs_user=
+      global_user_container.get(m_pos.m_index_1, & more_users);
+
+    if (pfs_user != NULL)
+    {
+      PFS_memory_class *klass;
+      /* Try each class index until find_memory_class() returns NULL. */
+      for (klass= find_memory_class(m_pos.m_index_2);
+           klass != NULL;
+           klass= find_memory_class(m_pos.m_index_2))
+      {
+        if (! klass->is_global())
+        {
+          make_row(pfs_user, klass);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+        m_pos.next_class();
+      }
+    }
+    m_pos.next_user();
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+/**
+  Position the cursor on @c pos and rebuild that row.
+  @return 0 once make_row() was called, HA_ERR_RECORD_DELETED otherwise.
+*/
+int table_mems_by_user_by_event_name::rnd_pos(const void *pos)
+{
+  set_position(pos);
+
+  PFS_user *pfs_user= global_user_container.get(m_pos.m_index_1);
+  if (pfs_user == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  PFS_memory_class *klass= find_memory_class(m_pos.m_index_2);
+  if (klass == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  /* Global classes have no row in this table (rnd_next() skips them too). */
+  if (klass->is_global())
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(pfs_user, klass);
+  return 0;
+}
+
+void table_mems_by_user_by_event_name
+::make_row(PFS_user *user, PFS_memory_class *klass)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  /* Build m_row for (user, klass); m_row_exists is set only at the end. */
+  user->m_lock.begin_optimistic_lock(&lock);
+  /* Fill the USER column; a non-zero result abandons the row. */
+  if (m_row.m_user.make_row(user))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+  /* Visit the user with the accounts and threads flags set, THDs cleared. */
+  PFS_connection_memory_visitor visitor(klass);
+  PFS_connection_iterator::visit_user(user,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
+  /* If the check fails m_row_exists stays false (row reported as deleted). */
+  if (! user->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(& visitor.m_stat);
+}
+
+/** Copy the current row into the requested fields.
+    @return 0, or HA_ERR_RECORD_DELETED when m_row_exists is false. */
+int table_mems_by_user_by_event_name::read_row_values(TABLE *table,
+                                                    unsigned char *buf,
+                                                    Field **fields,
+                                                    bool read_all)
+{
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* One null-bits byte (USER is the only nullable column): clear it. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (Field *field= *fields; field != NULL; field= *++fields)
+  {
+    /* Unless read_all is set, skip the fields absent from the read set. */
+    if (! read_all && ! bitmap_is_set(table->read_set, field->field_index))
+      continue;
+    switch(field->field_index)
+    {
+    case 0: /* USER */
+      m_row.m_user.set_field(field);
+      break;
+    case 1: /* EVENT_NAME */
+      m_row.m_event_name.set_field(field);
+      break;
+    default: /* 2, ... HIGH_NUMBER_OF_BYTES_USED */
+      m_row.m_stat.set_field(field->field_index - 2, field);
+      break;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_mems_by_user_by_event_name.h b/storage/perfschema/table_mems_by_user_by_event_name.h
new file mode 100644
index 00000000000..34c4cd518ae
--- /dev/null
+++ b/storage/perfschema/table_mems_by_user_by_event_name.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME_H
+#define TABLE_MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_mems_by_user_by_event_name.h
+  Table MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME. */
+struct row_mems_by_user_by_event_name
+{
+  /** Column USER. */
+  PFS_user_row m_user;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_ALLOC, ... HIGH_NUMBER_OF_BYTES_USED. */
+  PFS_memory_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME.
+  Index 1 on user (0 based)
+  Index 2 on memory class (1 based)
+*/
+struct pos_mems_by_user_by_event_name
+: public PFS_double_index
+{
+  pos_mems_by_user_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  /* Rewind to user 0, memory class 1 (the same values the constructor uses). */
+  inline void reset(void)
+  { m_index_1= 0; m_index_2= 1; }
+
+  /* Move to the next user and restart the class index at 1. */
+  inline void next_user(void)
+  {
+    ++m_index_1;
+    m_index_2= 1;
+  }
+
+  /* Move to the next memory class, keeping the current user. */
+  inline void next_class(void)
+  {
+    ++m_index_2;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.MEMORY_SUMMARY_BY_USER_BY_EVENT_NAME. */
+class table_mems_by_user_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_mems_by_user_by_event_name();
+
+public:
+  ~table_mems_by_user_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_user *user, PFS_memory_class *klass);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_mems_by_user_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position (user index, memory class index). */
+  pos_mems_by_user_by_event_name m_pos;
+  /** Next position: where the next rnd_next() call starts scanning. */
+  pos_mems_by_user_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_mems_global_by_event_name.cc b/storage/perfschema/table_mems_global_by_event_name.cc
new file mode 100644
index 00000000000..e4b681072c8
--- /dev/null
+++ b/storage/perfschema/table_mems_global_by_event_name.cc
@@ -0,0 +1,241 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_mems_global_by_event_name.cc
+  Table MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_mems_global_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_builtin_memory.h"
+#include "pfs_memory.h"
+#include "field.h"
+
+THR_LOCK table_mems_global_by_event_name::m_table_lock;
+/* Table share of memory_summary_global_by_event_name (no write_row hook). */
+PFS_engine_table_share
+table_mems_global_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("memory_summary_global_by_event_name") },
+  &pfs_readonly_acl,
+  table_mems_global_by_event_name::create,
+  NULL, /* write_row */
+  table_mems_global_by_event_name::delete_all_rows,
+  table_mems_global_by_event_name::get_row_count,
+  sizeof(pos_t),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE memory_summary_global_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null comment 'Event name.',"
+  "COUNT_ALLOC BIGINT unsigned not null comment 'Total number of allocations to memory.',"
+  "COUNT_FREE BIGINT unsigned not null comment 'Total number of attempts to free the allocated memory.',"
+  "SUM_NUMBER_OF_BYTES_ALLOC BIGINT unsigned not null comment 'Total number of bytes allocated.',"
+  "SUM_NUMBER_OF_BYTES_FREE BIGINT unsigned not null comment 'Total number of bytes freed',"
+  "LOW_COUNT_USED BIGINT not null comment 'Lowest number of allocated blocks (lowest value of CURRENT_COUNT_USED).',"
+  "CURRENT_COUNT_USED BIGINT not null comment 'Currently allocated blocks that have not been freed (COUNT_ALLOC minus COUNT_FREE).',"
+  "HIGH_COUNT_USED BIGINT not null comment 'Highest number of allocated blocks (highest value of CURRENT_COUNT_USED).',"
+  "LOW_NUMBER_OF_BYTES_USED BIGINT not null comment 'Lowest number of bytes used.',"
+  "CURRENT_NUMBER_OF_BYTES_USED BIGINT not null comment 'Current number of bytes used (total allocated minus total freed).',"
+  "HIGH_NUMBER_OF_BYTES_USED BIGINT not null comment 'Highest number of bytes used.')")},
+  false  /* perpetual */
+};
+
+/** Allocate a new cursor object for this table
+    (the function installed as the create entry of m_share). */
+PFS_engine_table* table_mems_global_by_event_name::create(void)
+{ return new table_mems_global_by_event_name(); }
+
+/** Reset the thread, account, user, host and global memory statistics. */
+int table_mems_global_by_event_name::delete_all_rows(void)
+{
+  reset_memory_by_thread();
+  reset_memory_by_account();
+  reset_memory_by_user();
+  reset_memory_by_host();
+  reset_memory_global();
+  return 0;
+}
+
+/** Row count reported for the table: memory_class_max. */
+ha_rows table_mems_global_by_event_name::get_row_count(void)
+{
+  return memory_class_max;
+}
+
+table_mems_global_by_event_name::table_mems_global_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos), /* base gets the share and &m_pos */
+    m_row_exists(false), m_pos(), m_next_pos() /* no row built yet */
+{}
+
+/**
+  Rewind both the current and the next scan position to their start values.
+*/
+void table_mems_global_by_event_name::reset_position(void)
+{ m_pos.reset(); m_next_pos.reset(); }
+
+/** Advance to the next builtin or instrumented memory class, building its row.
+    @return 0 once make_row() ran, HA_ERR_END_OF_FILE when nothing is left. */
+int table_mems_global_by_event_name::rnd_next(void)
+{
+  PFS_builtin_memory_class *builtin_klass;
+  PFS_memory_class *klass;
+
+  /* Do not advertise hard coded instruments when disabled. */
+  if (! pfs_initialized)
+    return HA_ERR_END_OF_FILE;
+
+  m_pos.set_at(&m_next_pos);
+  while (m_pos.has_more_view())
+  {
+    switch (m_pos.m_index_1)
+    {
+    case pos_mems_global_by_event_name::VIEW_BUILTIN_MEMORY:
+      builtin_klass= find_builtin_memory_class(m_pos.m_index_2);
+      if (builtin_klass == NULL)
+        break;
+      make_row(builtin_klass);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    case pos_mems_global_by_event_name::VIEW_MEMORY:
+      klass= find_memory_class(m_pos.m_index_2);
+      if (klass == NULL)
+        break;
+      make_row(klass);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+
+    /* No class at this index of the current view: move on to the next view. */
+    m_pos.next_view();
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+/**
+  Position the cursor on @c pos and rebuild that row.
+  @return 0 once make_row() was called, otherwise an HA_ERR_* code.
+*/
+int table_mems_global_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_memory_class *klass;
+  PFS_builtin_memory_class *builtin_klass;
+
+  /* Do not advertise hard coded instruments when disabled. */
+  if (! pfs_initialized)
+    return HA_ERR_END_OF_FILE;
+
+  set_position(pos);
+
+  switch(m_pos.m_index_1)
+  {
+  case pos_mems_global_by_event_name::VIEW_BUILTIN_MEMORY:
+    builtin_klass= find_builtin_memory_class(m_pos.m_index_2);
+    if (builtin_klass == NULL)
+      break;
+    make_row(builtin_klass);
+    return 0;
+  case pos_mems_global_by_event_name::VIEW_MEMORY:
+    klass= find_memory_class(m_pos.m_index_2);
+    if (klass == NULL)
+      break;
+    make_row(klass);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+/**
+  Build the current row for an instrumented memory class.
+  For a global class visit_global() is asked to visit no host, account or
+  thread; for any other class hosts, accounts and threads are visited.
+  Users and THDs are never visited.
+*/
+void table_mems_global_by_event_name::make_row(PFS_memory_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_memory_visitor visitor(klass);
+  /* The host, account and thread flags are all the negation of is_global(). */
+  const bool visit_connections= ! klass->is_global();
+
+  PFS_connection_iterator::visit_global(visit_connections, /* hosts */
+                                        false,             /* users */
+                                        visit_connections, /* accounts */
+                                        visit_connections, /* threads */
+                                        false,             /* THDs */
+                                        &visitor);
+
+  m_row.m_stat.set(& visitor.m_stat);
+  m_row_exists= true;
+}
+
+/**
+  Build the current row for a builtin memory class, whose statistics are
+  read from the class object itself (klass->m_stat).
+*/
+void table_mems_global_by_event_name::make_row(PFS_builtin_memory_class *klass)
+{
+  m_row.m_event_name.make_row(& klass->m_class);
+  m_row.m_stat.set(& klass->m_stat);
+  m_row_exists= true;
+}
+
+/** Copy the current row into the requested fields.
+    @return 0, or HA_ERR_RECORD_DELETED when m_row_exists is false. */
+int table_mems_global_by_event_name::read_row_values(TABLE *table,
+                                                    unsigned char *,
+                                                    Field **fields,
+                                                    bool read_all)
+{
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Every column is declared not null: no null-bits byte to initialize. */
+  assert(table->s->null_bytes == 0);
+
+  for (Field *field= *fields; field != NULL; field= *++fields)
+  {
+    /* Unless read_all is set, skip the fields absent from the read set. */
+    if (! read_all && ! bitmap_is_set(table->read_set, field->field_index))
+      continue;
+    switch(field->field_index)
+    {
+    case 0: /* EVENT_NAME */
+      m_row.m_event_name.set_field(field);
+      break;
+    default: /* 1, ... HIGH_NUMBER_OF_BYTES_USED */
+      m_row.m_stat.set_field(field->field_index - 1, field);
+      break;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_mems_global_by_event_name.h b/storage/perfschema/table_mems_global_by_event_name.h
new file mode 100644
index 00000000000..46dbccbd414
--- /dev/null
+++ b/storage/perfschema/table_mems_global_by_event_name.h
@@ -0,0 +1,132 @@
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME_H
+#define TABLE_MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_mems_global_by_event_name.h
+  Table MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_builtin_memory.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME. */
+struct row_mems_global_by_event_name
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_ALLOC, ... HIGH_NUMBER_OF_BYTES_USED. */
+  PFS_memory_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME.
+  Index 1 selects the view (builtin memory first, then memory).
+  Index 2 is the instrument key within that view (1 based).
+*/
+struct pos_mems_global_by_event_name : public PFS_double_index
+{
+  static const uint FIRST_VIEW= 1;          /* lowest valid value of m_index_1 */
+  static const uint VIEW_BUILTIN_MEMORY= 1;
+  static const uint VIEW_MEMORY= 2;
+  static const uint LAST_VIEW= 2;           /* highest valid value of m_index_1 */
+
+  pos_mems_global_by_event_name()
+    : PFS_double_index(FIRST_VIEW, 1)       /* first view, first (1 based) key */
+  {}
+
+  inline void reset(void)                   /* back to the first view, first key */
+  {
+    m_index_1= FIRST_VIEW;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_view(void)           /* false once m_index_1 is past LAST_VIEW */
+  { return (m_index_1 <= LAST_VIEW); }
+
+  inline void next_view(void)               /* next view, restarting at its first key */
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.MEMORY_SUMMARY_GLOBAL_BY_EVENT_NAME. */
+class table_mems_global_by_event_name : public PFS_engine_table
+{
+  typedef pos_mems_global_by_event_name pos_t; /* cursor position type */
+
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_mems_global_by_event_name(); /* private: presumably instances come from create() -- confirm */
+
+public:
+  ~table_mems_global_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_builtin_memory_class *klass); /* fills m_row from a builtin memory class */
+  void make_row(PFS_memory_class *klass);         /* fills m_row from a memory class */
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_mems_global_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_os_global_by_type.cc b/storage/perfschema/table_os_global_by_type.cc
index 4cb947e2fe9..b82ce9c2a7d 100644
--- a/storage/perfschema/table_os_global_by_type.cc
+++ b/storage/perfschema/table_os_global_by_type.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_os_global_by_type.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_os_global_by_type::m_table_lock;
 
@@ -43,8 +45,7 @@ table_os_global_by_type::m_share=
   table_os_global_by_type::create,
   NULL, /* write_row */
   table_os_global_by_type::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_os_global_by_type::get_row_count,
   sizeof(pos_os_global_by_type),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE objects_summary_global_by_type("
@@ -55,7 +56,8 @@ table_os_global_by_type::m_share=
                       "SUM_TIMER_WAIT BIGINT unsigned not null comment 'Total wait time of the summarized events that are timed.',"
                       "MIN_TIMER_WAIT BIGINT unsigned not null comment 'Minimum wait time of the summarized events that are timed.',"
                       "AVG_TIMER_WAIT BIGINT unsigned not null comment 'Average wait time of the summarized events that are timed.',"
-                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") }
+                      "MAX_TIMER_WAIT BIGINT unsigned not null comment 'Maximum wait time of the summarized events that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -72,6 +74,13 @@ table_os_global_by_type::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_os_global_by_type::get_row_count(void)
+{ /* Row count for this table: table share rows plus program rows. */
+  return global_table_share_container.get_row_count() +
+    global_program_container.get_row_count();
+}
+
 table_os_global_by_type::table_os_global_by_type()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -85,22 +94,46 @@ void table_os_global_by_type::reset_position(void)
 
 int table_os_global_by_type::rnd_next(void)
 {
-  PFS_table_share *table_share;
-
   for (m_pos.set_at(&m_next_pos);
        m_pos.has_more_view();
        m_pos.next_view())
   {
     switch (m_pos.m_index_1) {
     case pos_os_global_by_type::VIEW_TABLE:
-      for ( ; m_pos.m_index_2 < table_share_max; m_pos.m_index_2++)
       {
-        table_share= &table_share_array[m_pos.m_index_2];
-        if (table_share->m_lock.is_populated())
+        PFS_table_share *table_share;
+        bool has_more_share= true;
+
+        for (;
+             has_more_share;
+             m_pos.m_index_2++)
         {
-          make_row(table_share);
-          m_next_pos.set_after(&m_pos);
-          return 0;
+          table_share= global_table_share_container.get(m_pos.m_index_2, & has_more_share);
+          if (table_share != NULL)
+          {
+            make_table_row(table_share);
+            m_next_pos.set_after(&m_pos);
+            return 0;
+          }
+        }
+      }
+      break;
+    case pos_os_global_by_type::VIEW_PROGRAM:
+      {
+        PFS_program *pfs_program;
+        bool has_more_program= true;
+
+        for (;
+             has_more_program;
+             m_pos.m_index_2++)
+        {
+          pfs_program= global_program_container.get(m_pos.m_index_2, & has_more_program);
+          if (pfs_program != NULL)
+          {
+            make_program_row(pfs_program);
+            m_next_pos.set_after(&m_pos);
+            return 0;
+          }
         }
       }
       break;
@@ -115,18 +148,29 @@ int table_os_global_by_type::rnd_next(void)
 int
 table_os_global_by_type::rnd_pos(const void *pos)
 {
-  PFS_table_share *table_share;
-
   set_position(pos);
 
   switch (m_pos.m_index_1) {
   case pos_os_global_by_type::VIEW_TABLE:
-    DBUG_ASSERT(m_pos.m_index_2 < table_share_max);
-    table_share= &table_share_array[m_pos.m_index_2];
-    if (table_share->m_lock.is_populated())
     {
-      make_row(table_share);
-      return 0;
+      PFS_table_share *table_share;
+      table_share= global_table_share_container.get(m_pos.m_index_2);
+      if (table_share != NULL)
+      {
+        make_table_row(table_share);
+        return 0;
+      }
+    }
+    break;
+  case pos_os_global_by_type::VIEW_PROGRAM:
+    {
+      PFS_program *pfs_program;
+      pfs_program= global_program_container.get(m_pos.m_index_2);
+      if (pfs_program != NULL)
+      {
+        make_program_row(pfs_program);
+        return 0;
+      }
     }
     break;
   default:
@@ -136,9 +180,29 @@ table_os_global_by_type::rnd_pos(const void *pos)
   return HA_ERR_RECORD_DELETED;
 }
 
-void table_os_global_by_type::make_row(PFS_table_share *share)
+void table_os_global_by_type::make_program_row(PFS_program *pfs_program)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
+  PFS_single_stat cumulated_stat; /* NOTE(review): not referenced in this function */
+
+  m_row_exists= false; /* stays false unless the optimistic read below is validated */
+
+  pfs_program->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_object.make_row(pfs_program); /* columns OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME */
+
+  time_normalizer *normalizer= time_normalizer::get(wait_timer);
+  m_row.m_stat.set(normalizer, &pfs_program->m_sp_stat.m_timer1_stat);
+
+  if (! pfs_program->m_lock.end_optimistic_lock(&lock))
+    return; /* optimistic read not validated: the row stays marked as missing */
+
+  m_row_exists= true;
+}
+
+void table_os_global_by_type::make_table_row(PFS_table_share *share)
+{
+  pfs_optimistic_state lock;
   PFS_single_stat cumulated_stat;
   uint safe_key_count;
 
@@ -146,16 +210,12 @@ void table_os_global_by_type::make_row(PFS_table_share *share)
 
   share->m_lock.begin_optimistic_lock(&lock);
 
-  m_row.m_object_type= share->get_object_type();
-  memcpy(m_row.m_schema_name, share->m_schema_name, share->m_schema_name_length);
-  m_row.m_schema_name_length= share->m_schema_name_length;
-  memcpy(m_row.m_object_name, share->m_table_name, share->m_table_name_length);
-  m_row.m_object_name_length= share->m_table_name_length;
+  m_row.m_object.make_row(share);
 
   /* This is a dirty read, some thread can write data while we are reading it */
   safe_key_count= sanitize_index_count(share->m_key_count);
 
-  share->m_table_stat.sum(& cumulated_stat, safe_key_count);
+  share->sum(& cumulated_stat, safe_key_count);
 
   if (! share->m_lock.end_optimistic_lock(&lock))
     return;
@@ -165,11 +225,12 @@ void table_os_global_by_type::make_row(PFS_table_share *share)
   if (share->get_refcount() > 0)
   {
     /* For all the table handles still opened ... */
-    PFS_table *table= table_array;
-    PFS_table *table_last= table_array + table_max;
-    for ( ; table < table_last ; table++)
+    PFS_table_iterator it= global_table_container.iterate();
+    PFS_table *table= it.scan_next();
+
+    while (table != NULL)
     {
-      if ((table->m_share == share) && (table->m_lock.is_populated()))
+      if (table->m_share == share)
       {
         /*
           If the opened table handle is for this table share,
@@ -177,6 +238,7 @@ void table_os_global_by_type::make_row(PFS_table_share *share)
         */
         table->m_table_stat.sum(& cumulated_stat, safe_key_count);
       }
+      table= it.scan_next();
     }
   }
 
@@ -195,7 +257,7 @@ int table_os_global_by_type::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -205,15 +267,15 @@ int table_os_global_by_type::read_row_values(TABLE *table,
       switch(f->field_index)
       {
       case 0: /* OBJECT_TYPE */
-        set_field_object_type(f, m_row.m_object_type);
+        set_field_object_type(f, m_row.m_object.m_object_type);
         break;
       case 1: /* SCHEMA_NAME */
-        set_field_varchar_utf8(f, m_row.m_schema_name,
-                               m_row.m_schema_name_length);
+        set_field_varchar_utf8(f, m_row.m_object.m_schema_name,
+                               m_row.m_object.m_schema_name_length);
         break;
       case 2: /* OBJECT_NAME */
-        set_field_varchar_utf8(f, m_row.m_object_name,
-                               m_row.m_object_name_length);
+        set_field_varchar_utf8(f, m_row.m_object.m_object_name,
+                               m_row.m_object.m_object_name_length);
         break;
       case 3: /* COUNT */
         set_field_ulonglong(f, m_row.m_stat.m_count);
@@ -231,7 +293,7 @@ int table_os_global_by_type::read_row_values(TABLE *table,
         set_field_ulonglong(f, m_row.m_stat.m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_os_global_by_type.h b/storage/perfschema/table_os_global_by_type.h
index 2b9293ece06..9cb3b47b964 100644
--- a/storage/perfschema/table_os_global_by_type.h
+++ b/storage/perfschema/table_os_global_by_type.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -32,6 +32,7 @@
 #include "pfs_engine_table.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
+#include "pfs_program.h"
 #include "table_helper.h"
 
 /**
@@ -45,16 +46,9 @@
 */
 struct row_os_global_by_type
 {
-  /** Column OBJECT_TYPE. */
-  enum_object_type m_object_type;
-  /** Column SCHEMA_NAME. */
-  char m_schema_name[NAME_LEN];
-  /** Length in bytes of @c m_schema_name. */
-  uint m_schema_name_length;
-  /** Column OBJECT_NAME. */
-  char m_object_name[NAME_LEN];
-  /** Length in bytes of @c m_object_name. */
-  uint m_object_name_length;
+  /** Columns OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME. */
+  PFS_object_row m_object;
+
   /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
   PFS_stat_row m_stat;
 };
@@ -96,6 +90,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -114,7 +109,8 @@ public:
   {}
 
 protected:
-  void make_row(PFS_table_share *table_share);
+  void make_table_row(PFS_table_share *table_share);
+  void make_program_row(PFS_program *pfs_program);
 
 private:
   /** Table share lock. */
diff --git a/storage/perfschema/table_performance_timers.cc b/storage/perfschema/table_performance_timers.cc
index 20d893bac14..68878919e2b 100644
--- a/storage/perfschema/table_performance_timers.cc
+++ b/storage/perfschema/table_performance_timers.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,10 +26,11 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_performance_timers.h"
 #include "pfs_timer.h"
 #include "pfs_global.h"
+#include "field.h"
 
 THR_LOCK table_performance_timers::m_table_lock;
 
@@ -38,18 +39,18 @@ table_performance_timers::m_share=
 {
   { C_STRING_WITH_LEN("performance_timers") },
   &pfs_readonly_acl,
-  &table_performance_timers::create,
+  table_performance_timers::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  COUNT_TIMER_NAME, /* records */
+  table_performance_timers::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE performance_timers("
                       "TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null comment 'Time name, used in the setup_timers table.',"
                       "TIMER_FREQUENCY BIGINT comment 'Number of timer units per second. Dependent on the processor speed.',"
                       "TIMER_RESOLUTION BIGINT comment 'Number of timer units by which timed values increase each time.',"
-                      "TIMER_OVERHEAD BIGINT comment 'Minimum timer overhead, determined during initialization by calling the timer 20 times and selecting the smallest value. Total overhead will be at least double this, as the timer is called at the beginning and end of each timed event.')") }
+                      "TIMER_OVERHEAD BIGINT comment 'Minimum timer overhead, determined during initialization by calling the timer 20 times and selecting the smallest value. Total overhead will be at least double this, as the timer is called at the beginning and end of each timed event.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_performance_timers::create(void)
@@ -57,6 +58,12 @@ PFS_engine_table* table_performance_timers::create(void)
   return new table_performance_timers();
 }
 
+ha_rows
+table_performance_timers::get_row_count(void)
+{
+  return COUNT_TIMER_NAME; /* same constant that bounds m_pos.m_index in rnd_pos() */
+}
+
 table_performance_timers::table_performance_timers()
   : PFS_engine_table(&m_share, &m_pos),
     m_row(NULL), m_pos(0), m_next_pos(0)
@@ -114,7 +121,7 @@ int table_performance_timers::rnd_next(void)
 int table_performance_timers::rnd_pos(const void *pos)
 {
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < COUNT_TIMER_NAME);
+  assert(m_pos.m_index < COUNT_TIMER_NAME);
   m_row= &m_data[m_pos.m_index];
   return 0;
 }
@@ -126,10 +133,10 @@ int table_performance_timers::read_row_values(TABLE *table,
 {
   Field *f;
 
-  DBUG_ASSERT(m_row);
+  assert(m_row);
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -160,7 +167,7 @@ int table_performance_timers::read_row_values(TABLE *table,
           f->set_null();
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_performance_timers.h b/storage/perfschema/table_performance_timers.h
index 93210ac9882..a2e12a4f824 100644
--- a/storage/perfschema/table_performance_timers.h
+++ b/storage/perfschema/table_performance_timers.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -56,6 +56,7 @@ public:
   /** Table share. */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_prepared_stmt_instances.cc b/storage/perfschema/table_prepared_stmt_instances.cc
new file mode 100644
index 00000000000..592386db4bb
--- /dev/null
+++ b/storage/perfschema/table_prepared_stmt_instances.cc
@@ -0,0 +1,294 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_prepared_stmt_instances.cc
+  Table PREPARED_STATEMENTS_INSTANCES (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
+#include "pfs_prepared_stmt.h"
+#include "table_prepared_stmt_instances.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_prepared_stmt_instances::m_table_lock;
+
+PFS_engine_table_share
+table_prepared_stmt_instances::m_share=
+{
+  { C_STRING_WITH_LEN("prepared_statements_instances") },
+  &pfs_truncatable_acl,
+  table_prepared_stmt_instances::create,
+  NULL, /* write_row */
+  table_prepared_stmt_instances::delete_all_rows, /* delete_all_rows */
+  table_prepared_stmt_instances::get_row_count, /* get_row_count */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE prepared_statements_instances(" /* NOTE(review): "occurend" in the SUM_ERRORS and SUM_WARNINGS comments below looks like a typo for "occurred" */
+  "OBJECT_INSTANCE_BEGIN bigint(20) unsigned NOT NULL comment 'The address in memory of the instrumented prepared statement.',"
+  "STATEMENT_ID bigint(20) unsigned NOT NULL comment 'The internal statement ID assigned by the server.',"
+  "STATEMENT_NAME varchar(64) default NULL comment 'For the binary protocol, this column is NULL. For the text protocol, this column is the external statement name assigned by the user.',"
+  "SQL_TEXT longtext NOT NULL comment 'The prepared statement text, with ? placeholder markers.',"
+  "OWNER_THREAD_ID bigint(20) unsigned NOT NULL comment 'Event thread id that created the prepared statement.',"
+  "OWNER_EVENT_ID bigint(20) unsigned NOT NULL comment 'Event id that created the prepared statement.',"
+  "OWNER_OBJECT_TYPE enum('EVENT','FUNCTION','PROCEDURE','TABLE','TRIGGER') DEFAULT NULL comment 'NULL for a prepared statement created by a client session. Type of the stored program that created the prepared statement.',"
+  "OWNER_OBJECT_SCHEMA varchar(64) DEFAULT NULL comment 'NULL for a prepared statement created by a client session. Schema of the stored program that created the prepared statement.',"
+  "OWNER_OBJECT_NAME varchar(64) DEFAULT NULL comment 'NULL for a prepared statement created by a client session. Name of the stored program that created the prepared statement.',"
+  "TIMER_PREPARE bigint(20) unsigned NOT NULL comment 'The time spent executing the statement preparation itself.',"
+  "COUNT_REPREPARE bigint(20) unsigned NOT NULL comment 'The number of times the statement was reprepared internally.',"
+  "COUNT_EXECUTE bigint(20) unsigned NOT NULL comment 'Total times the prepared statement was executed.',"
+  "SUM_TIMER_EXECUTE bigint(20) unsigned NOT NULL comment 'Total time spent executing all prepared statements.',"
+  "MIN_TIMER_EXECUTE bigint(20) unsigned NOT NULL comment 'Minimum time spent executing any of the prepared statements.',"
+  "AVG_TIMER_EXECUTE bigint(20) unsigned NOT NULL comment 'Average time spent executing any of the prepared statements.',"
+  "MAX_TIMER_EXECUTE bigint(20) unsigned NOT NULL comment 'Maximum time spent executing any of the prepared statements.',"
+  "SUM_LOCK_TIME bigint(20) unsigned NOT NULL comment 'The total time spent (in picoseconds) waiting for table locks for the prepared statements.',"
+  "SUM_ERRORS bigint(20) unsigned NOT NULL comment 'The total number of errors that occurend for the prepared statements.',"
+  "SUM_WARNINGS bigint(20) unsigned NOT NULL comment 'The total number of warnings that occurend for the prepared statements.',"
+  "SUM_ROWS_AFFECTED bigint(20) unsigned NOT NULL comment 'The total number of affected rows by the prepared statements.',"
+  "SUM_ROWS_SENT bigint(20) unsigned NOT NULL comment 'The total number of rows returned by the prepared statements.',"
+  "SUM_ROWS_EXAMINED bigint(20) unsigned NOT NULL comment 'The total number of rows examined by the prepared statements.',"
+  "SUM_CREATED_TMP_DISK_TABLES bigint(20) unsigned NOT NULL comment 'The total number of on-disk temporary tables created by the prepared statements.',"
+  "SUM_CREATED_TMP_TABLES bigint(20) unsigned NOT NULL comment 'The total number of in-memory temporary tables created by the prepared statements.',"
+  "SUM_SELECT_FULL_JOIN bigint(20) unsigned NOT NULL comment 'The total number of full joins executed by the prepared statements.',"
+  "SUM_SELECT_FULL_RANGE_JOIN bigint(20) unsigned NOT NULL comment 'The total number of range search joins executed by the prepared statements.',"
+  "SUM_SELECT_RANGE bigint(20) unsigned NOT NULL comment 'The total number of joins that used ranges on the first table executed by the prepared statements.',"
+  "SUM_SELECT_RANGE_CHECK bigint(20) unsigned NOT NULL comment 'The total number of joins that check for key usage after each row executed by the prepared statements.',"
+  "SUM_SELECT_SCAN bigint(20) unsigned NOT NULL comment 'The total number of joins that did a full scan of the first table executed by the prepared statements.',"
+  "SUM_SORT_MERGE_PASSES bigint(20) unsigned NOT NULL comment 'The total number of merge passes that the sort algorithm has had to do for the prepared statements.',"
+  "SUM_SORT_RANGE bigint(20) unsigned NOT NULL comment 'The total number of sorts that were done using ranges for the prepared statements.',"
+  "SUM_SORT_ROWS bigint(20) unsigned NOT NULL comment 'The total number of sorted rows that were sorted by the prepared statements.',"
+  "SUM_SORT_SCAN bigint(20) unsigned NOT NULL comment 'The total number of sorts that were done by scanning the table by the prepared statements.',"
+  "SUM_NO_INDEX_USED bigint(20) unsigned NOT NULL comment 'The total number of statements that performed a table scan without using an index.',"
+  "SUM_NO_GOOD_INDEX_USED bigint(20) unsigned NOT NULL comment 'The total number of statements where no good index was found.')")},
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_prepared_stmt_instances::create(void)
+{ /* Factory referenced from m_share: returns a newly allocated cursor on this table. */
+  return new table_prepared_stmt_instances();
+}
+
+int
+table_prepared_stmt_instances::delete_all_rows(void)
+{ /* delete_all_rows entry of m_share. */
+  reset_prepared_stmt_instances(); /* defined outside this file */
+  return 0; /* always reports success */
+}
+
+ha_rows
+table_prepared_stmt_instances::get_row_count(void)
+{ /* get_row_count entry of m_share: delegated to the prepared statement container. */
+  return global_prepared_stmt_container.get_row_count();
+}
+
+table_prepared_stmt_instances::table_prepared_stmt_instances()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{} /* starts with no current row and both positions at index 0 */
+
+void table_prepared_stmt_instances::reset_position(void)
+{ /* Rewind the cursor: current and next positions both go back to index 0. */
+  m_pos= 0;
+  m_next_pos= 0;
+}
+
+int table_prepared_stmt_instances::rnd_next(void)
+{
+  /* Continue the scan from the position recorded for the next row. */
+  m_pos.set_at(&m_next_pos);
+  PFS_prepared_stmt_iterator scan=
+    global_prepared_stmt_container.iterate(m_pos.m_index);
+  PFS_prepared_stmt *stmt= scan.scan_next(& m_pos.m_index);
+
+  /* The iterator found nothing more: the scan is over. */
+  if (stmt == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  make_row(stmt);
+  m_next_pos.set_after(&m_pos);
+  return 0;
+}
+
+int
+table_prepared_stmt_instances::rnd_pos(const void *pos)
+{
+  /* Move the cursor to the requested position. */
+  set_position(pos);
+
+  PFS_prepared_stmt *stmt=
+    global_prepared_stmt_container.get(m_pos.m_index);
+
+  /* No statement at that index: report the row as deleted. */
+  if (stmt == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(stmt);
+  return 0;
+}
+
+
+void table_prepared_stmt_instances::make_row(PFS_prepared_stmt* prepared_stmt)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false; /* stays false unless the optimistic read below is validated */
+
+  prepared_stmt->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_identity= prepared_stmt->m_identity; /* column OBJECT_INSTANCE_BEGIN */
+
+  m_row.m_stmt_id= prepared_stmt->m_stmt_id; /* column STATEMENT_ID */
+
+  m_row.m_owner_thread_id= prepared_stmt->m_owner_thread_id;
+  m_row.m_owner_event_id= prepared_stmt->m_owner_event_id;
+  /* NOTE(review): the lengths below are used unchecked; presumably they never exceed the m_row buffers -- confirm. */
+  m_row.m_stmt_name_length= prepared_stmt->m_stmt_name_length;
+  if(m_row.m_stmt_name_length > 0)
+    memcpy(m_row.m_stmt_name, prepared_stmt->m_stmt_name,
+           m_row.m_stmt_name_length);
+
+  m_row.m_sql_text_length= prepared_stmt->m_sqltext_length;
+  if(m_row.m_sql_text_length > 0)
+    memcpy(m_row.m_sql_text, prepared_stmt->m_sqltext,
+           m_row.m_sql_text_length);
+
+  m_row.m_owner_object_type= prepared_stmt->m_owner_object_type;
+
+  m_row.m_owner_object_name_length= prepared_stmt->m_owner_object_name_length;
+  if(m_row.m_owner_object_name_length > 0)
+    memcpy(m_row.m_owner_object_name, prepared_stmt->m_owner_object_name,
+           m_row.m_owner_object_name_length);
+
+  m_row.m_owner_object_schema_length= prepared_stmt->m_owner_object_schema_length;
+  if(m_row.m_owner_object_schema_length > 0)
+    memcpy(m_row.m_owner_object_schema, prepared_stmt->m_owner_object_schema,
+           m_row.m_owner_object_schema_length);
+
+  time_normalizer *normalizer= time_normalizer::get(statement_timer);
+  /* Get prepared statement prepare stats. */
+  m_row.m_prepare_stat.set(normalizer, & prepared_stmt->m_prepare_stat);
+  /* Get prepared statement reprepare stats. */
+  m_row.m_reprepare_stat.set(normalizer, & prepared_stmt->m_reprepare_stat);
+  /* Get prepared statement execute stats. */
+  m_row.m_execute_stat.set(normalizer, & prepared_stmt->m_execute_stat);
+
+  if (! prepared_stmt->m_lock.end_optimistic_lock(&lock))
+    return; /* optimistic read not validated: the row stays marked as missing */
+
+  m_row_exists= true;
+}
+
+int table_prepared_stmt_instances
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED; /* make_row() did not produce a row */
+
+  /*
+    Set the null bits.
+  */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0; /* clear the single null byte */
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_INSTANCE_BEGIN */
+        set_field_ulonglong(f, (intptr)m_row.m_identity);
+        break;
+      case 1: /* STATEMENT_ID */
+        set_field_ulonglong(f, m_row.m_stmt_id);
+        break;
+      case 2: /* STATEMENT_NAME */
+        if(m_row.m_stmt_name_length > 0)
+          set_field_varchar_utf8(f, m_row.m_stmt_name,
+                                 m_row.m_stmt_name_length);
+        else
+          f->set_null();
+        break;
+      case 3: /* SQL_TEXT */
+        if(m_row.m_sql_text_length > 0)
+          set_field_longtext_utf8(f, m_row.m_sql_text,
+                                 m_row.m_sql_text_length);
+        else
+          f->set_null(); /* NOTE(review): SQL_TEXT is declared NOT NULL in m_share -- confirm this is intended */
+        break;
+      case 4: /* OWNER_THREAD_ID */
+        set_field_ulonglong(f, m_row.m_owner_thread_id);
+        break;
+      case 5: /* OWNER_EVENT_ID */
+        if(m_row.m_owner_event_id > 0)
+          set_field_ulonglong(f, m_row.m_owner_event_id);
+        else
+          f->set_null(); /* NOTE(review): OWNER_EVENT_ID is declared NOT NULL in m_share -- confirm this is intended */
+        break;
+      case 6: /* OWNER_OBJECT_TYPE */
+        if(m_row.m_owner_object_type != 0)
+          set_field_enum(f, m_row.m_owner_object_type);
+        else
+          f->set_null();
+        break;
+      case 7: /* OWNER_OBJECT_SCHEMA */
+        if(m_row.m_owner_object_schema_length > 0)
+          set_field_varchar_utf8(f, m_row.m_owner_object_schema,
+                                 m_row.m_owner_object_schema_length);
+        else
+          f->set_null();
+        break;
+      case 8: /* OWNER_OBJECT_NAME */
+        if(m_row.m_owner_object_name_length > 0)
+          set_field_varchar_utf8(f, m_row.m_owner_object_name,
+                                 m_row.m_owner_object_name_length);
+        else
+          f->set_null();
+        break;
+      case 9:    /* TIMER_PREPARE */
+        m_row.m_prepare_stat.set_field(1, f); /* index 1 presumably selects the summed timer -- TODO confirm */
+        break;
+      case 10:   /* COUNT_REPREPARE */
+        m_row.m_reprepare_stat.set_field(0, f); /* index 0 presumably selects the count -- TODO confirm */
+        break;
+      default: /* 11, ... COUNT_EXECUTE through SUM_NO_GOOD_INDEX_USED */
+        m_row.m_execute_stat.set_field(f->field_index - 11, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_prepared_stmt_instances.h b/storage/perfschema/table_prepared_stmt_instances.h
new file mode 100644
index 00000000000..a061e0d9883
--- /dev/null
+++ b/storage/perfschema/table_prepared_stmt_instances.h
@@ -0,0 +1,132 @@
+/* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_PREPARED_STMT_INSTANCES
+#define TABLE_PREPARED_STMT_INSTANCES
+
+/**
+  @file storage/perfschema/table_prepared_stmt_instances.h
+  Table PREPARED_STATEMENT_INSTANCES (declarations).
+*/
+
+#include "table_helper.h"
+#include "pfs_prepared_stmt.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.PREPARED_STATEMENT_INSTANCES.
+*/
+struct row_prepared_stmt_instances
+{
+  /** Column OBJECT_INSTANCE_BEGIN. */
+  const void *m_identity;
+
+  /** Column STMT_ID. */
+  ulonglong m_stmt_id;
+
+  /** Column STMT_NAME: text buffer and the number of bytes in use. */
+  char m_stmt_name[COL_INFO_SIZE];
+  int m_stmt_name_length;
+
+  /** Column SQL_TEXT: buffer and bytes in use (length 0 is shown as NULL). */
+  char m_sql_text[COL_INFO_SIZE];
+  int m_sql_text_length;
+
+  /** Column OWNER_THREAD_ID. */
+  ulonglong m_owner_thread_id;
+
+  /** Column OWNER_EVENT_ID (0 is shown as NULL). */
+  ulonglong m_owner_event_id;
+
+  /** Column OWNER_OBJECT_TYPE (0 is shown as NULL). */
+  enum_object_type m_owner_object_type;
+
+  /** Column OWNER_OBJECT_SCHEMA: buffer and bytes in use (length 0 is shown as NULL). */
+  char m_owner_object_schema[COL_OBJECT_SCHEMA_SIZE];
+  int m_owner_object_schema_length;
+
+  /** Column OWNER_OBJECT_NAME: buffer and bytes in use (length 0 is shown as NULL). */
+  char m_owner_object_name[COL_OBJECT_NAME_SIZE];
+  int m_owner_object_name_length;
+
+  /** Column TIMER_PREPARE (read with set_field(1, ...)). */
+  PFS_stat_row m_prepare_stat;
+
+  /** Column COUNT_REPREPARE (read with set_field(0, ...)). */
+  PFS_stat_row m_reprepare_stat;
+
+  /** Execution statistics: every column after COUNT_REPREPARE is read from here. */
+  PFS_statement_stat_row m_execute_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.PREPARED_STATEMENT_INSTANCES. */
+class table_prepared_stmt_instances : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+  /* Scan (rnd_next), positioned read (rnd_pos) and cursor rewind. */
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  /* Protected constructor; presumably instances come from create() -- confirm in the .cc. */
+  table_prepared_stmt_instances();
+
+public:
+  ~table_prepared_stmt_instances()
+  {}
+
+protected:
+  void make_row(PFS_prepared_stmt*);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_prepared_stmt_instances m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_applier_configuration.cc b/storage/perfschema/table_replication_applier_configuration.cc
new file mode 100644
index 00000000000..809dfead9a2
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_configuration.cc
@@ -0,0 +1,194 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_applier_configuration.cc
+  Table replication_applier_configuration (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+
+#ifdef HAVE_REPLICATION /* also guards the replication-only headers below */
+#include "table_replication_applier_configuration.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "slave.h"
+//#include "rpl_info.h"
+#include "rpl_rli.h"
+#include "rpl_mi.h"
+#include "sql_parse.h"
+//#include "rpl_msr.h"   /* Multisource replication */
+THR_LOCK table_replication_applier_configuration::m_table_lock;
+
+PFS_engine_table_share
+table_replication_applier_configuration::m_share=
+{
+  { C_STRING_WITH_LEN("replication_applier_configuration") }, /* table name */
+  &pfs_readonly_acl,
+  table_replication_applier_configuration::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_applier_configuration::get_row_count, /* records */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock, /* table lock defined above */
+  { C_STRING_WITH_LEN("CREATE TABLE replication_applier_configuration("
+  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null comment 'Replication channel name.',"
+  "DESIRED_DELAY INTEGER not null comment 'Desired replica delay functionality not supported by MariaDB. Always 0.')") },
+  false  /* perpetual */
+};
+/** Allocate a new table object; this function is the create callback listed in m_share. */
+PFS_engine_table* table_replication_applier_configuration::create(void)
+{
+  return new table_replication_applier_configuration();
+}
+/** Bind the base class to m_share and m_pos; no row yet, both positions built from 0. */
+table_replication_applier_configuration
+  ::table_replication_applier_configuration()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+/** Empty destructor: no cleanup is performed here. */
+table_replication_applier_configuration
+  ::~table_replication_applier_configuration()
+{}
+
+void table_replication_applier_configuration::reset_position(void)
+{
+  /* Rewind both the current and the next scan position to index 0. */
+  m_pos.m_index= m_next_pos.m_index= 0;
+}
+
+/** Row count callback: number of entries in the master_info hash (read here without LOCK_active_mi). */
+ha_rows table_replication_applier_configuration::get_row_count()
+{
+ return master_info_index->master_info_hash.records;
+}
+
+
+int table_replication_applier_configuration::rnd_next(void)
+{
+  int result= HA_ERR_END_OF_FILE;
+
+  mysql_mutex_lock(&LOCK_active_mi);
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < master_info_index->master_info_hash.records;
+       m_pos.next())
+  {
+    Master_info *channel= (Master_info *)
+      my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+    /* Entries that are missing or have an empty host are not reported. */
+    if (channel == NULL || channel->host[0] == '\0')
+      continue;
+
+    make_row(channel);
+    m_next_pos.set_after(&m_pos);
+    result= 0;
+    break;
+  }
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return result;
+}
+
+int table_replication_applier_configuration::rnd_pos(const void *pos)
+{
+  /* m_pos.m_index is read below; set_position() is expected to load pos into m_pos. */
+  set_position(pos);
+
+  mysql_mutex_lock(&LOCK_active_mi);
+  Master_info *channel= (Master_info *)
+    my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+  const bool found= (channel != NULL);
+  if (found)
+    make_row(channel);
+  mysql_mutex_unlock(&LOCK_active_mi);
+
+  /* No entry at that index: report the record as deleted. */
+  if (!found)
+    return HA_ERR_RECORD_DELETED;
+  return 0;
+}
+
+void table_replication_applier_configuration::make_row(Master_info *mi)
+{
+  m_row_exists= false;
+  /* The row stays marked absent until every field below has been copied. */
+  DBUG_ASSERT(mi != NULL);
+  /* Lock order: mi->data_lock first, then mi->rli.data_lock; released in reverse. */
+  mysql_mutex_lock(&mi->data_lock);
+  mysql_mutex_lock(&mi->rli.data_lock);
+  /* NOTE(review): no bound check against sizeof(m_row.channel_name); assumes the name fits -- confirm. */
+  m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
+  memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
+  m_row.desired_delay= 0; // always 0 here; the original source was mi->rli->get_sql_delay()
+
+  mysql_mutex_unlock(&mi->rli.data_lock);
+  mysql_mutex_unlock(&mi->data_lock);
+
+  m_row_exists= true;
+}
+/** Copy m_row into the requested fields; returns HA_ERR_RECORD_DELETED when no row has been built. */
+int table_replication_applier_configuration::read_row_values(TABLE *table,
+                                                             unsigned char *buf,
+                                                             Field **fields,
+                                                             bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /*
+    Note:
+    There are no NULL columns in this table,
+    so there are no null bits reserved for NULL flags per column.
+    There are no VARCHAR columns either, so the record is not
+    in HA_OPTION_PACK_RECORD format as most other performance_schema tables.
+    When HA_OPTION_PACK_RECORD is not set,
+    the table record reserves an extra null byte, see open_binary_frm().
+  */
+
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* Walk the NULL-terminated field list; skip columns absent from read_set unless read_all. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* CHANNEL_NAME */
+        set_field_char_utf8(f, m_row.channel_name, m_row.channel_name_length);
+        break;
+      case 1: /* DESIRED_DELAY */
+        set_field_ulong(f, static_cast<ulong>(m_row.desired_delay));
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+  return 0;
+}
+#endif
diff --git a/storage/perfschema/table_replication_applier_configuration.h b/storage/perfschema/table_replication_applier_configuration.h
new file mode 100644
index 00000000000..7baa9a6d990
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_configuration.h
@@ -0,0 +1,107 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_APPLIER_CONFIGURATION_H
+#define TABLE_REPLICATION_APPLIER_CONFIGURATION_H
+
+/**
+  @file storage/perfschema/table_replication_applier_configuration.h
+  Table replication_applier_configuration (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "rpl_mi.h"
+#include "mysql_com.h"
+#include "my_thread.h"
+//#include "rpl_msr.h"
+//#include "rpl_info.h"  /*CHANNEL_NAME_LENGTH*/
+
+class Master_info;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of replication_applier_configuration, filled by make_row(). */
+struct st_row_applier_config {
+  char channel_name[CHANNEL_NAME_LENGTH]; /* CHANNEL_NAME bytes; stored without a terminating NUL */
+  uint channel_name_length;               /* number of bytes used in channel_name */
+  time_t desired_delay;                   /* DESIRED_DELAY; make_row() always stores 0 */
+  bool desired_delay_is_set;              /* not referenced by the accompanying .cc */
+};
+
+/** Table PERFORMANCE_SCHEMA.replication_applier_configuration. */
+class table_replication_applier_configuration: public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t; /* position = index into the master_info hash */
+
+private:
+  void make_row(Master_info *mi);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition (declared only; no definition in the accompanying .cc). */
+  static TABLE_FIELD_DEF m_field_def;
+  /** Current row. */
+  st_row_applier_config m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED if no row has been built.
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_applier_configuration();
+
+public:
+  ~table_replication_applier_configuration();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_applier_status.cc b/storage/perfschema/table_replication_applier_status.cc
new file mode 100644
index 00000000000..ebe922902e7
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_status.cc
@@ -0,0 +1,222 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_applier_status.cc
+  Table replication_applier_status (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+
+#ifdef HAVE_REPLICATION
+#include "table_replication_applier_status.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "slave.h"
+//#include "rpl_info.h"
+#include "rpl_rli.h"
+#include "rpl_mi.h"
+#include "sql_parse.h"
+//#include "rpl_msr.h"    /*Multi source replication */
+
+THR_LOCK table_replication_applier_status::m_table_lock;
+
+PFS_engine_table_share
+table_replication_applier_status::m_share=
+{
+  { C_STRING_WITH_LEN("replication_applier_status") }, /* table name */
+  &pfs_readonly_acl,
+  table_replication_applier_status::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_applier_status::get_row_count,    /* records */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock, /* table lock defined above */
+  { C_STRING_WITH_LEN("CREATE TABLE replication_applier_status("
+  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null comment 'The replication channel name.',"
+  "SERVICE_STATE ENUM('ON','OFF') not null comment 'Shows ON when the replication channel''s applier threads are active or idle, OFF means that the applier threads are not active.',"
+  "REMAINING_DELAY INTEGER unsigned comment 'Desired replica delay functionality not supported by MariaDB. Always 0.',"
+  "COUNT_TRANSACTIONS_RETRIES BIGINT unsigned not null comment 'The number of retries that were made because the replication SQL thread failed to apply a transaction.')") },
+  false  /* perpetual */
+};
+
+/** Allocate a new table object; this function is the create callback listed in m_share. */
+PFS_engine_table* table_replication_applier_status::create(void)
+{
+  return new table_replication_applier_status();
+}
+/** Bind the base class to m_share and m_pos; no row yet, both positions built from 0. */
+table_replication_applier_status::table_replication_applier_status()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+/** Empty destructor: no cleanup is performed here. */
+table_replication_applier_status::~table_replication_applier_status()
+{}
+
+void table_replication_applier_status::reset_position(void)
+{
+  /* Rewind both the current and the next scan position to index 0. */
+  m_pos.m_index= m_next_pos.m_index= 0;
+}
+/** Row count callback: number of entries in the master_info hash (read here without LOCK_active_mi). */
+ha_rows table_replication_applier_status::get_row_count()
+{
+ return master_info_index->master_info_hash.records;
+}
+
+
+int table_replication_applier_status::rnd_next(void)
+{
+  int result= HA_ERR_END_OF_FILE;
+
+  mysql_mutex_lock(&LOCK_active_mi);
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < master_info_index->master_info_hash.records;
+       m_pos.next())
+  {
+    Master_info *channel= (Master_info *)
+      my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+    /* Entries that are missing or have an empty host are not reported. */
+    if (channel == NULL || channel->host[0] == '\0')
+      continue;
+
+    make_row(channel);
+    m_next_pos.set_after(&m_pos);
+    result= 0;
+    break;
+  }
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return result;
+}
+
+
+int table_replication_applier_status::rnd_pos(const void *pos)
+{
+  /* m_pos.m_index is read below; set_position() is expected to load pos into m_pos. */
+  set_position(pos);
+
+  mysql_mutex_lock(&LOCK_active_mi);
+  Master_info *channel= (Master_info *)
+    my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+  const bool found= (channel != NULL);
+  if (found)
+    make_row(channel);
+  mysql_mutex_unlock(&LOCK_active_mi);
+
+  /* No entry at that index: report the record as deleted. */
+  if (!found)
+    return HA_ERR_RECORD_DELETED;
+  return 0;
+}
+
+void table_replication_applier_status::make_row(Master_info *mi)
+{
+  char *slave_sql_running_state= NULL;
+
+  m_row_exists= false;
+
+  DBUG_ASSERT(mi != NULL);
+  /* NOTE(review): the name is copied before any mi lock is taken and without a bound check -- confirm. */
+  m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
+  memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
+
+  //mysql_mutex_lock(&mi->rli->info_thd_lock);
+  /* NOTE(review): sql_driver_thd is tested and dereferenced with the lock calls commented out -- confirm this is safe. */
+  slave_sql_running_state= const_cast<char *>
+                           (mi->rli.sql_driver_thd ?
+                            mi->rli.sql_driver_thd->get_proc_info() : "");
+  //mysql_mutex_unlock(&mi->rli->info_thd_lock);
+
+  /* Lock order: mi->data_lock first, then mi->rli.data_lock; released in reverse. */
+  mysql_mutex_lock(&mi->data_lock);
+  mysql_mutex_lock(&mi->rli.data_lock);
+
+  if (mi->rli.slave_running)
+    m_row.service_state= PS_RPL_YES;
+  else
+    m_row.service_state= PS_RPL_NO;
+  /* The stage name below is compared by pointer identity, not by string content. */
+  m_row.remaining_delay= 0;
+  if (slave_sql_running_state == stage_sql_thd_waiting_until_delay.m_name)
+  {
+    time_t t= my_time(0), sql_delay_end= 0; // end time fixed at 0 (was mi->rli->get_sql_delay_end())
+    m_row.remaining_delay= (uint)(t < sql_delay_end ?
+                                      sql_delay_end - t : 0);
+    m_row.remaining_delay_is_set= true;
+  }
+  else
+    m_row.remaining_delay_is_set= false;
+
+  m_row.count_transactions_retries= mi->rli.retried_trans;
+
+  mysql_mutex_unlock(&mi->rli.data_lock);
+  mysql_mutex_unlock(&mi->data_lock);
+
+  m_row_exists= true;
+}
+/** Copy m_row into the requested fields; returns HA_ERR_RECORD_DELETED when no row has been built. */
+int table_replication_applier_status::read_row_values(TABLE *table,
+                                       unsigned char *buf,
+                                       Field **fields,
+                                       bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+  /* Exactly one null byte is expected; it is cleared before any column is written. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* Walk the NULL-terminated field list; skip columns absent from read_set unless read_all. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* CHANNEL_NAME */
+         set_field_char_utf8(f, m_row.channel_name, m_row.channel_name_length);
+         break;
+      case 1: /* SERVICE_STATE */
+        set_field_enum(f, m_row.service_state);
+        break;
+      case 2: /* REMAINING_DELAY; NULL unless make_row() flagged it as set */
+        if (m_row.remaining_delay_is_set)
+          set_field_ulong(f, m_row.remaining_delay);
+        else
+          f->set_null();
+        break;
+      case 3: /* COUNT_TRANSACTIONS_RETRIES */
+        set_field_ulonglong(f, m_row.count_transactions_retries);
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+  return 0;
+}
+#endif
diff --git a/storage/perfschema/table_replication_applier_status.h b/storage/perfschema/table_replication_applier_status.h
new file mode 100644
index 00000000000..6278186bd53
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_status.h
@@ -0,0 +1,118 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_APPLIER_STATUS_H
+#define TABLE_REPLICATION_APPLIER_STATUS_H
+
+/**
+  @file storage/perfschema/table_replication_applier_status.h
+  Table replication_applier_status (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "rpl_mi.h"
+#include "mysql_com.h"
+//#include "rpl_msr.h"
+//#include "rpl_info.h" /*CHANNEL_NAME_LENGTH*/
+#include "my_thread.h"
+
+class Master_info;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+#ifndef ENUM_RPL_YES_NO
+#define ENUM_RPL_YES_NO
+/** Values for SERVICE_STATE columns; guarded because the coordinator table header defines the same enum. */
+enum enum_rpl_yes_no {
+  PS_RPL_YES= 1, /* stored when rli.slave_running is set */
+  PS_RPL_NO      /* stored otherwise */
+};
+#endif
+
+/** A row of replication_applier_status, filled by make_row(). */
+struct st_row_applier_status {
+  char channel_name[CHANNEL_NAME_LENGTH]; /* CHANNEL_NAME bytes; stored without a terminating NUL */
+  uint channel_name_length;               /* number of bytes used in channel_name */
+  enum_rpl_yes_no service_state;          /* SERVICE_STATE */
+  uint remaining_delay;                   /* REMAINING_DELAY; meaningful only when the flag below is true */
+  bool remaining_delay_is_set;            /* false makes REMAINING_DELAY read as NULL */
+  ulong count_transactions_retries;       /* COUNT_TRANSACTIONS_RETRIES, copied from rli.retried_trans */
+};
+
+/** Table PERFORMANCE_SCHEMA.replication_applier_status. */
+class table_replication_applier_status: public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t; /* position = index into the master_info hash */
+
+private:
+  void make_row(Master_info *mi);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition (declared only; no definition in the accompanying .cc). */
+  static TABLE_FIELD_DEF m_field_def;
+  /** Current row. */
+  st_row_applier_status m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED if no row has been built.
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_applier_status();
+
+public:
+  ~table_replication_applier_status();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_applier_status_by_coordinator.cc b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
new file mode 100644
index 00000000000..906b69540a8
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
@@ -0,0 +1,249 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_applier_status_by_coordinator.cc
+  Table replication_applier_status_by_coordinator (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+
+#ifdef HAVE_REPLICATION
+#include "table_replication_applier_status_by_coordinator.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "slave.h"
+//#include "rpl_info.h"
+#include "rpl_rli.h"
+#include "rpl_mi.h"
+#include "sql_parse.h"
+//#include "rpl_msr.h"       /* Multisource replication */
+
+THR_LOCK table_replication_applier_status_by_coordinator::m_table_lock;
+
+PFS_engine_table_share
+table_replication_applier_status_by_coordinator::m_share=
+{
+  { C_STRING_WITH_LEN("replication_applier_status_by_coordinator") }, /* table name */
+  &pfs_readonly_acl,
+  table_replication_applier_status_by_coordinator::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_applier_status_by_coordinator::get_row_count, /* records */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock, /* table lock defined above */
+  { C_STRING_WITH_LEN("CREATE TABLE replication_applier_status_by_coordinator("
+  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null comment 'Replication channel name.',"
+  "THREAD_ID BIGINT UNSIGNED comment 'The SQL/coordinator thread ID.',"
+  "SERVICE_STATE ENUM('ON','OFF') not null comment 'ON (thread exists and is active or idle) or OFF (thread no longer exists).',"
+  "LAST_ERROR_NUMBER INTEGER not null comment 'Last error number that caused the SQL/coordinator thread to stop.',"
+  "LAST_ERROR_MESSAGE VARCHAR(1024) not null comment 'Last error message that caused the SQL/coordinator thread to stop.',"
+  "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null comment 'Timestamp that shows when the most recent SQL/coordinator error occured.')") },
+  false  /* perpetual */
+};
+/** Allocate a new table object; this function is the create callback listed in m_share. */
+PFS_engine_table* table_replication_applier_status_by_coordinator::create(void)
+{
+  return new table_replication_applier_status_by_coordinator();
+}
+/** Bind the base class to m_share and m_pos; no row yet, both positions built from 0. */
+table_replication_applier_status_by_coordinator
+  ::table_replication_applier_status_by_coordinator()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+/** Empty destructor: no cleanup is performed here. */
+table_replication_applier_status_by_coordinator
+  ::~table_replication_applier_status_by_coordinator()
+{}
+
+void table_replication_applier_status_by_coordinator::reset_position(void)
+{
+  /* Rewind both the current and the next scan position to index 0. */
+  m_pos.m_index= m_next_pos.m_index= 0;
+}
+/** Row count callback: number of entries in the master_info hash (read here without LOCK_active_mi). */
+ha_rows table_replication_applier_status_by_coordinator::get_row_count()
+{
+ return master_info_index->master_info_hash.records;
+}
+
+
+int table_replication_applier_status_by_coordinator::rnd_next(void)
+{
+  Master_info *mi;
+
+  mysql_mutex_lock(&LOCK_active_mi);
+
+  /* Scan resumes at m_next_pos and walks the master_info hash by index. */
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < master_info_index->master_info_hash.records;
+       m_pos.next())
+  {
+    mi= (Master_info *)my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+    /* NOTE: the condition below ends in a literal 0, so this scan never returns a row. */
+    /*
+      Construct and display SQL Thread's (Coordinator) information in
+      'replication_applier_status_by_coordinator' table only in the case of
+      multi threaded slave mode. Code should do nothing in the case of single
+      threaded slave mode. In case of single threaded slave mode SQL Thread's
+      status will be reported as part of
+      'replication_applier_status_by_worker' table.
+    */
+    if (mi && mi->host[0] && /*mi->rli.get_worker_count() > */ 0)
+    {
+      make_row(mi);
+      m_next_pos.set_after(&m_pos);
+      mysql_mutex_unlock(&LOCK_active_mi);
+      return 0;
+    }
+  }
+
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_replication_applier_status_by_coordinator::rnd_pos(const void *pos)
+{
+  /* m_pos.m_index is read below; set_position() is expected to load pos into m_pos. */
+  set_position(pos);
+
+  mysql_mutex_lock(&LOCK_active_mi);
+  Master_info *channel= (Master_info *)
+    my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+  const bool found= (channel != NULL);
+  if (found)
+    make_row(channel);
+  mysql_mutex_unlock(&LOCK_active_mi);
+
+  /* No entry at that index: report the record as deleted. */
+  if (!found)
+    return HA_ERR_RECORD_DELETED;
+  return 0;
+}
+
+void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
+{
+  m_row_exists= false;
+
+  DBUG_ASSERT(mi != NULL);
+  /* Only mi->rli.data_lock is taken here; mi->data_lock is not. */
+  mysql_mutex_lock(&mi->rli.data_lock);
+  /* NOTE(review): no bound check against sizeof(m_row.channel_name); assumes the name fits -- confirm. */
+  m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
+  memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
+  /* THREAD_ID: NULL unless the slave is running and thd_get_psi() yields a non-NULL handle. */
+  if (mi->rli.slave_running)
+  {
+    PSI_thread *psi= thd_get_psi(mi->rli.sql_driver_thd);
+    PFS_thread *pfs= reinterpret_cast<PFS_thread *> (psi);
+    if(pfs)
+    {
+      m_row.thread_id= pfs->m_thread_internal_id;
+      m_row.thread_id_is_null= false;
+    }
+    else
+      m_row.thread_id_is_null= true;
+  }
+  else
+    m_row.thread_id_is_null= true;
+
+  if (mi->rli.slave_running)
+    m_row.service_state= PS_RPL_YES;
+  else
+    m_row.service_state= PS_RPL_NO;
+  /* err_lock is taken while rli.data_lock is still held, and released first. */
+  mysql_mutex_lock(&mi->rli.err_lock);
+
+  m_row.last_error_number= (long int) mi->rli.last_error().number;
+  m_row.last_error_message_length= 0;
+  m_row.last_error_timestamp= 0;
+
+  /* On error, copy the message: length comes from strlen, no terminator is stored. */
+  if (m_row.last_error_number)
+  {
+    char *temp_store= (char*) mi->rli.last_error().message;
+    m_row.last_error_message_length= static_cast<uint>(strlen(temp_store));
+    memcpy(m_row.last_error_message, temp_store,
+           m_row.last_error_message_length);
+
+    /* The timestamp is not populated: it stays 0 (the original computation is commented out). */
+    m_row.last_error_timestamp= 0;//(ulonglong)mi->rli.last_error().skr*1000000;
+  }
+
+  mysql_mutex_unlock(&mi->rli.err_lock);
+  mysql_mutex_unlock(&mi->rli.data_lock);
+
+  m_row_exists= true;
+}
+/** Copy m_row into the requested fields; returns HA_ERR_RECORD_DELETED when no row has been built. */
+int table_replication_applier_status_by_coordinator
+  ::read_row_values(TABLE *table, unsigned char *buf,
+                    Field **fields, bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+  /* Exactly one null byte is expected; it is cleared before any column is written. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* Walk the NULL-terminated field list; skip columns absent from read_set unless read_all. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* CHANNEL_NAME */
+         set_field_char_utf8(f, m_row.channel_name, m_row.channel_name_length);
+         break;
+      case 1: /* THREAD_ID; NULL when make_row() found no thread id */
+        if (!m_row.thread_id_is_null)
+          set_field_ulonglong(f, m_row.thread_id);
+        else
+          f->set_null();
+        break;
+      case 2: /* SERVICE_STATE */
+        set_field_enum(f, m_row.service_state);
+        break;
+      case 3: /* LAST_ERROR_NUMBER */
+        set_field_ulong(f, m_row.last_error_number);
+        break;
+      case 4: /* LAST_ERROR_MESSAGE */
+        set_field_varchar_utf8(f, m_row.last_error_message,
+                               m_row.last_error_message_length);
+        break;
+      case 5: /* LAST_ERROR_TIMESTAMP */
+        set_field_timestamp(f, m_row.last_error_timestamp);
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+  return 0;
+}
+#endif
diff --git a/storage/perfschema/table_replication_applier_status_by_coordinator.h b/storage/perfschema/table_replication_applier_status_by_coordinator.h
new file mode 100644
index 00000000000..36427e83e66
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_status_by_coordinator.h
@@ -0,0 +1,124 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_APPLIER_STATUS_BY_COORDINATOR_H
+#define TABLE_REPLICATION_APPLIER_STATUS_BY_COORDINATOR_H
+
+/**
+  @file storage/perfschema/table_replication_applier_status_by_coordinator.h
+  Table replication_applier_status_by_coordinator(declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "rpl_mi.h"
+#include "mysql_com.h"
+//#include "rpl_msr.h"
+//#include "rpl_info.h" /*CHANNEL_NAME_LENGTH*/
+#include "my_thread.h"
+
+class Master_info;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+#ifndef ENUM_RPL_YES_NO
+#define ENUM_RPL_YES_NO
+/** Two-state value for Service_State of coordinator thread; the guard lets other table headers declare the same enum. */
+enum enum_rpl_yes_no {
+  PS_RPL_YES= 1, /* Service_State= on; numbering starts at 1 */
+  PS_RPL_NO /* Service_State= off; implicitly 2 */
+};
+#endif
+
+/*
+  One row of the coordinator table. Each string buffer is paired with a
+  <field_name>_length member that is passed along with it when read.
+*/
+struct st_row_coordinator {
+  char channel_name[CHANNEL_NAME_LENGTH];     /* field 0 */
+  uint channel_name_length;                   /* bytes used in channel_name */
+  ulonglong thread_id;                        /* field 1, valid unless thread_id_is_null */
+  bool thread_id_is_null;                     /* true: field 1 is returned as NULL */
+  enum_rpl_yes_no service_state;              /* field 2 */
+  uint last_error_number;                     /* field 3; 0 means no error text is copied */
+  char last_error_message[MAX_SLAVE_ERRMSG];  /* field 4 */
+  uint last_error_message_length;             /* bytes used in last_error_message */
+  ulonglong last_error_timestamp;             /* field 5, handed to set_field_timestamp() */
+};
+
+/** Table PERFORMANCE_SCHEMA.replication_applier_status_by_coordinator */
+class table_replication_applier_status_by_coordinator: public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;
+
+private:
+  void make_row(Master_info *mi);  /* fills m_row from mi and sets m_row_exists */
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+  /** Current row */
+  st_row_coordinator m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED when no row is cached.
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_applier_status_by_coordinator();  /* protected: not constructible from outside the class */
+
+public:
+  ~table_replication_applier_status_by_coordinator();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_applier_status_by_worker.cc b/storage/perfschema/table_replication_applier_status_by_worker.cc
new file mode 100644
index 00000000000..74db4ee658c
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_status_by_worker.cc
@@ -0,0 +1,412 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_applier_status_by_worker.cc
+  Table replication_applier_status_by_worker (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+#include "table_replication_applier_status_by_worker.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "slave.h"
+//#include "rpl_info.h"
+#include "rpl_rli.h"
+#include "rpl_mi.h"
+#include "sql_parse.h"
+//#include "rpl_rli_pdb.h"
+//#include "rpl_msr.h"    /*Multi source replication */
+
+THR_LOCK table_replication_applier_status_by_worker::m_table_lock;
+/* Share descriptor; the DDL column order is the field_index order used by read_row_values(). */
+PFS_engine_table_share
+table_replication_applier_status_by_worker::m_share=
+{
+  { C_STRING_WITH_LEN("replication_applier_status_by_worker") },
+  &pfs_readonly_acl,
+  table_replication_applier_status_by_worker::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_applier_status_by_worker::get_row_count, /*records*/
+  sizeof(pos_t), /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE replication_applier_status_by_worker("
+  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null comment 'Name of replication channel through which the transaction is received.',"
+  "WORKER_ID BIGINT UNSIGNED not null comment 'Worker identifier.',"
+  "THREAD_ID BIGINT UNSIGNED comment 'Thread_Id as displayed in the performance_schema.threads table for thread with name ''thread/sql/rpl_parallel_thread''. THREAD_ID will be NULL when worker threads are stopped due to error/force stop.',"
+  "SERVICE_STATE ENUM('ON','OFF') not null comment 'Whether or not the thread is running.',"
+  "LAST_SEEN_TRANSACTION CHAR(57) not null comment 'Last GTID executed by worker',"
+  "LAST_ERROR_NUMBER INTEGER not null comment 'Last Error that occurred on a particular worker.',"
+  "LAST_ERROR_MESSAGE VARCHAR(1024) not null comment 'Last error specific message.',"
+  "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null comment 'Time stamp of last error.')") },
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_replication_applier_status_by_worker::create(void)
+{
+  return new table_replication_applier_status_by_worker();  /* factory referenced from m_share */
+}
+
+table_replication_applier_status_by_worker
+  ::table_replication_applier_status_by_worker()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{} /* starts with no cached row and default-constructed positions */
+
+table_replication_applier_status_by_worker
+  ::~table_replication_applier_status_by_worker()
+{} /* empty body: nothing is released here */
+
+void table_replication_applier_status_by_worker::reset_position(void)
+{
+  m_next_pos.reset();  /* the next scan starts over from the beginning */
+  m_pos.reset();       /* and so does the current position */
+}
+
+ha_rows table_replication_applier_status_by_worker::get_row_count()
+{
+  /*
+    Estimate only: number of master infos multiplied by a fixed 32 workers.
+  */
+ return  master_info_index->master_info_hash.records*32;
+}
+
+
+int table_replication_applier_status_by_worker::rnd_next(void)
+{
+  Slave_worker *worker;
+  Master_info *mi;
+  size_t wc;
+  /* LOCK_active_mi is taken before walking master_info_hash; every exit below releases it. */
+  mysql_mutex_lock(&LOCK_active_mi);
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_channels(master_info_index->master_info_hash.records);
+       m_pos.next_channel())
+  {
+    mi= (Master_info *)my_hash_element(&master_info_index->master_info_hash, m_pos.m_index_1);
+    /* Channels without a configured host produce no rows. */
+    if (mi && mi->host[0])
+    {
+      wc= mi->rli->get_worker_count();
+
+      if (wc == 0)
+      {
+        /* Single Thread Slave */
+        make_row(mi);
+        m_next_pos.set_channel_after(&m_pos);
+        mysql_mutex_unlock(&LOCK_active_mi);
+        return 0;
+      }
+
+      for (; m_pos.m_index_2 < wc; m_pos.next_worker())
+      {
+        /* Multi Thread Slave */
+        /* Slots for which get_worker() returns NULL are skipped. */
+        worker = mi->rli->get_worker(m_pos.m_index_2);
+        if (worker)
+        {
+          make_row(worker);
+          m_next_pos.set_after(&m_pos);
+          mysql_mutex_unlock(&LOCK_active_mi);
+          return 0;
+        }
+       }
+    }
+  }
+
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_replication_applier_status_by_worker::rnd_pos(const void *pos)
+{
+  /*
+    Re-read the row stored at 'pos'. The result stays HA_ERR_RECORD_DELETED
+    unless a row can be rebuilt for that position.
+  */
+  int res= HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+
+  mysql_mutex_lock(&LOCK_active_mi);
+
+  Master_info *mi=
+    (Master_info *)my_hash_element(&master_info_index->master_info_hash,
+                                   m_pos.m_index_1);
+
+  /* Only channels with a configured host can produce a row. */
+  if (mi && mi->host[0])
+  {
+    const size_t worker_count= mi->rli->get_worker_count();
+
+    if (worker_count == 0)
+    {
+      /* Single Thread Slave: the row comes from the Master_info itself. */
+      make_row(mi);
+      res= 0;
+    }
+    else if (m_pos.m_index_2 < worker_count)
+    {
+      /* Multi Thread Slave: the row comes from the addressed worker. */
+      Slave_worker *worker= mi->rli->get_worker(m_pos.m_index_2);
+      if (worker != NULL)
+      {
+        make_row(worker);
+        res= 0;
+      }
+    }
+  }
+
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return res;
+}
+
+/**
+   Function to display SQL Thread's status as part of
+   'replication_applier_status_by_worker' in single threaded slave mode.
+
+   @param[in] mi  Master_info whose rli member supplies the row values
+
+   @retval void
+*/
+void table_replication_applier_status_by_worker::make_row(Master_info *mi)
+{
+  m_row_exists= false;  /* the row counts as missing until fully rebuilt */
+
+  m_row.worker_id= 0;
+
+  m_row.thread_id= 0;
+
+  assert(mi != NULL);
+  assert(mi->rli != NULL);
+
+  mysql_mutex_lock(&mi->rli->data_lock);  /* held until the end; err_lock is nested inside it */
+
+  m_row.channel_name_length= strlen(mi->get_channel());
+  memcpy(m_row.channel_name, (char*)mi->get_channel(), m_row.channel_name_length);
+  /* THREAD_ID: set only while slave_running and a PSI thread is attached, otherwise NULL. */
+  if (mi->rli->slave_running)
+  {
+    PSI_thread *psi= thd_get_psi(mi->rli->info_thd);
+    PFS_thread *pfs= reinterpret_cast<PFS_thread *> (psi);
+    if(pfs)
+    {
+      m_row.thread_id= pfs->m_thread_internal_id;
+      m_row.thread_id_is_null= false;
+    }
+    else
+      m_row.thread_id_is_null= true;
+  }
+  else
+    m_row.thread_id_is_null= true;
+  /* SERVICE_STATE mirrors slave_running. */
+  if (mi->rli->slave_running)
+    m_row.service_state= PS_RPL_YES;
+  else
+    m_row.service_state= PS_RPL_NO;
+  /* LAST_SEEN_TRANSACTION: textual form of currently_executing_gtid. */
+  if (mi->rli->currently_executing_gtid.type == GTID_GROUP)
+  {
+    global_sid_lock->rdlock();
+    m_row.last_seen_transaction_length=
+      mi->rli->currently_executing_gtid.to_string(global_sid_map,
+                                            m_row.last_seen_transaction);
+    global_sid_lock->unlock();
+  }
+  else if (mi->rli->currently_executing_gtid.type == ANONYMOUS_GROUP)
+  {
+    m_row.last_seen_transaction_length=
+      mi->rli->currently_executing_gtid.to_string((rpl_sid *)NULL,
+                                            m_row.last_seen_transaction);
+  }
+  else
+  {
+    /*
+      For SQL thread currently_executing_gtid, type is set to
+      AUTOMATIC_GROUP when the SQL thread is not executing any
+      transaction.  For this case, the field should be empty.
+    */
+    assert(mi->rli->currently_executing_gtid.type == AUTOMATIC_GROUP);
+    m_row.last_seen_transaction_length= 0;
+    memcpy(m_row.last_seen_transaction, "", 1);
+  }
+
+  mysql_mutex_lock(&mi->rli->err_lock);
+
+  m_row.last_error_number= (long int) mi->rli->last_error().number;
+  m_row.last_error_message_length= 0;
+  m_row.last_error_timestamp= 0;
+
+  /** if error, set error message and timestamp */
+  if (m_row.last_error_number)
+  {
+    char *temp_store= (char*) mi->rli->last_error().message;
+    m_row.last_error_message_length= strlen(temp_store);
+    memcpy(m_row.last_error_message, temp_store,
+           m_row.last_error_message_length);
+
+    /** skr scaled by 1000000; presumably seconds to microseconds, not milliseconds - TODO confirm */
+    m_row.last_error_timestamp= (ulonglong)mi->rli->last_error().skr*1000000;
+  }
+
+  mysql_mutex_unlock(&mi->rli->err_lock);
+  mysql_mutex_unlock(&mi->rli->data_lock);
+  m_row_exists= true;
+}
+
+void table_replication_applier_status_by_worker::make_row(Slave_worker *w)
+{
+  m_row_exists= false;  /* the row counts as missing until fully rebuilt */
+  /* Multi threaded case: the row is built from one Slave_worker. */
+  m_row.worker_id= w->get_internal_id();
+
+  m_row.thread_id= 0;
+
+  m_row.channel_name_length= strlen(w->get_channel());
+  memcpy(m_row.channel_name, (char*)w->get_channel(), m_row.channel_name_length);
+
+  mysql_mutex_lock(&w->jobs_lock);
+  if (w->running_status == Slave_worker::RUNNING)
+  {
+    PSI_thread *psi= thd_get_psi(w->info_thd);
+    PFS_thread *pfs= reinterpret_cast<PFS_thread *> (psi);
+    if(pfs)
+    {
+      m_row.thread_id= pfs->m_thread_internal_id;
+      m_row.thread_id_is_null= false;
+    }
+    else /* no instrumentation found */
+      m_row.thread_id_is_null= true;
+  }
+  else
+    m_row.thread_id_is_null= true;
+
+  if (w->running_status == Slave_worker::RUNNING)
+    m_row.service_state= PS_RPL_YES;
+  else
+    m_row.service_state= PS_RPL_NO;
+
+  /* LAST_SEEN_TRANSACTION: textual form of currently_executing_gtid; */
+  /* the error number is stored once, further down, with the other error fields. */
+  if (w->currently_executing_gtid.type == GTID_GROUP)
+  {
+    global_sid_lock->rdlock();
+    m_row.last_seen_transaction_length=
+      w->currently_executing_gtid.to_string(global_sid_map,
+                                            m_row.last_seen_transaction);
+    global_sid_lock->unlock();
+  }
+  else if (w->currently_executing_gtid.type == ANONYMOUS_GROUP)
+  {
+    m_row.last_seen_transaction_length=
+      w->currently_executing_gtid.to_string((rpl_sid *)NULL,
+                                            m_row.last_seen_transaction);
+  }
+  else
+  {
+    /*
+      For worker->currently_executing_gtid, type is set to
+      AUTOMATIC_GROUP when the worker is not executing any
+      transaction.  For this case, the field should be empty.
+    */
+    assert(w->currently_executing_gtid.type == AUTOMATIC_GROUP);
+    m_row.last_seen_transaction_length= 0;
+    memcpy(m_row.last_seen_transaction, "", 1);
+  }
+
+  m_row.last_error_number= (unsigned int) w->last_error().number;
+  m_row.last_error_message_length= 0;
+  m_row.last_error_timestamp= 0;
+
+  /** if error, set error message and timestamp */
+  if (m_row.last_error_number)
+  {
+    char * temp_store= (char*)w->last_error().message;
+    m_row.last_error_message_length= strlen(temp_store);
+    memcpy(m_row.last_error_message, w->last_error().message,
+           m_row.last_error_message_length);
+
+    /** skr scaled by 1000000; presumably seconds to microseconds, not milliseconds - TODO confirm */
+    m_row.last_error_timestamp= (ulonglong)w->last_error().skr*1000000;
+  }
+  mysql_mutex_unlock(&w->jobs_lock);
+
+  m_row_exists= true;
+}
+
+int table_replication_applier_status_by_worker
+  ::read_row_values(TABLE *table, unsigned char *buf,  Field **fields,
+                    bool read_all)
+{
+  /* Nothing to emit unless make_row() completed for the current position. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* The single null byte is reset before any column is filled in. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (Field *col= *fields; col != NULL; col= *++fields)
+  {
+    /* Skip columns that are not part of the read set. */
+    if (!read_all && !bitmap_is_set(table->read_set, col->field_index))
+      continue;
+    switch(col->field_index)
+    {
+    case 0: /* channel_name */
+      set_field_char_utf8(col, m_row.channel_name, m_row.channel_name_length);
+      break;
+    case 1: /* worker_id */
+      set_field_ulonglong(col, m_row.worker_id);
+      break;
+    case 2: /* thread_id (nullable) */
+      if (!m_row.thread_id_is_null)
+        set_field_ulonglong(col, m_row.thread_id);
+      else
+        col->set_null();
+      break;
+    case 3: /* service_state */
+      set_field_enum(col, m_row.service_state);
+      break;
+    case 4: /* last_seen_transaction */
+      set_field_char_utf8(col, m_row.last_seen_transaction, m_row.last_seen_transaction_length);
+      break;
+    case 5: /* last_error_number */
+      set_field_ulong(col, m_row.last_error_number);
+      break;
+    case 6: /* last_error_message */
+      set_field_varchar_utf8(col, m_row.last_error_message, m_row.last_error_message_length);
+      break;
+    case 7: /* last_error_timestamp */
+      set_field_timestamp(col, m_row.last_error_timestamp);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
diff --git a/storage/perfschema/table_replication_applier_status_by_worker.h b/storage/perfschema/table_replication_applier_status_by_worker.h
new file mode 100644
index 00000000000..3298d953158
--- /dev/null
+++ b/storage/perfschema/table_replication_applier_status_by_worker.h
@@ -0,0 +1,182 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_APPLIER_STATUS_BY_WORKER_H
+#define TABLE_REPLICATION_APPLIER_STATUS_BY_WORKER_H
+
+/**
+  @file storage/perfschema/table_replication_applier_status_by_worker.h
+  Table replication_applier_status_by_worker (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "rpl_mi.h"
+#include "mysql_com.h"
+//#include "rpl_rli_pdb.h"
+//#include "rpl_msr.h"
+//#include "rpl_info.h" /*CHANNEL_NAME_LENGTH*/
+#include "my_thread.h"
+
+class Slave_worker;
+class Master_info;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+#ifndef ENUM_RPL_YES_NO
+#define ENUM_RPL_YES_NO
+/** enumerated values for service_state of worker thread; the guard lets other table headers declare the same enum */
+enum enum_rpl_yes_no {
+  PS_RPL_YES= 1, /* service_state= on; numbering starts at 1 */
+  PS_RPL_NO /* service_state= off; implicitly 2 */
+};
+#endif
+
+/*
+  One row of the worker table. Each string buffer is paired with a
+  <field_name>_length member that is passed along with it when read.
+*/
+struct st_row_worker {
+
+  char channel_name[CHANNEL_NAME_LENGTH];  /* field 0 */
+  uint channel_name_length;                /* bytes used in channel_name */
+  /*
+    worker_id is added to the table because thread is killed at STOP SLAVE
+    but the status needs to show up, so worker_id is used as a permanent
+    identifier.
+  */
+  ulonglong worker_id;                     /* field 1 */
+  ulonglong thread_id;                     /* field 2, valid unless thread_id_is_null */
+  uint thread_id_is_null;                  /* used as a boolean: non-zero means field 2 is NULL */
+  enum_rpl_yes_no service_state;           /* field 3 */
+  char last_seen_transaction[GTID_MAX_STR_LENGTH + 1];  /* field 4 */
+  uint last_seen_transaction_length;       /* bytes used in last_seen_transaction */
+  uint last_error_number;                  /* field 5; 0 means no error text is copied */
+  char last_error_message[MAX_SLAVE_ERRMSG];  /* field 6 */
+  uint last_error_message_length;          /* bytes used in last_error_message */
+  ulonglong last_error_timestamp;          /* field 7, handed to set_field_timestamp() */
+};
+
+/**
+  Cursor position for table replication_applier_status_by_worker.
+  m_index_1 selects the replication channel, m_index_2 the worker within
+  it; moving to another channel restarts the worker index at 0.
+*/
+struct pos_replication_applier_status_by_worker : public PFS_double_index
+{
+  /** Start at channel 0, worker 0. */
+  pos_replication_applier_status_by_worker() : PFS_double_index(0, 0)
+  {}
+
+  /** Rewind to channel 0, worker 0. */
+  void reset()
+  {
+    m_index_2= 0;
+    m_index_1= 0;
+  }
+
+  /** True while the channel index is below @a num. */
+  bool has_more_channels(uint num)
+  { return num > m_index_1; }
+
+  /** Step to the following channel and restart at its first worker. */
+  void next_channel()
+  {
+    ++m_index_1;
+    m_index_2= 0;
+  }
+
+  /** Step to the following worker of the current channel. */
+  void next_worker()
+  { ++m_index_2; }
+
+  /** Point at the first worker of the channel that follows @a other. */
+  void set_channel_after(const pos_replication_applier_status_by_worker *other)
+  {
+    m_index_2= 0;
+    m_index_1= other->m_index_1 + 1;
+  }
+};
+
+
+/** Table PERFORMANCE_SCHEMA.replication_applier_status_by_worker */
+class table_replication_applier_status_by_worker: public PFS_engine_table
+{
+  typedef pos_replication_applier_status_by_worker pos_t;
+
+private:
+  void make_row(Slave_worker *);  /* builds m_row from one worker (multi threaded case) */
+  /*
+    Master_info to construct a row to display SQL Thread's status
+    information in STS (single threaded slave) mode
+  */
+  void make_row(Master_info *);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+  /** Current row. */
+  st_row_worker m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED when no row is cached.
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_applier_status_by_worker();  /* protected: instances come from create() */
+
+public:
+  ~table_replication_applier_status_by_worker();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();  /* allocates a new table object */
+  static ha_rows get_row_count();  /* row estimate, not an exact count */
+  virtual int rnd_next();  /* advances to the next row or returns HA_ERR_END_OF_FILE */
+  virtual int rnd_pos(const void *pos);  /* rebuilds the row at a saved position */
+  virtual void reset_position(void);  /* rewinds m_pos and m_next_pos */
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_connection_configuration.cc b/storage/perfschema/table_replication_connection_configuration.cc
new file mode 100644
index 00000000000..6af8641c7a7
--- /dev/null
+++ b/storage/perfschema/table_replication_connection_configuration.cc
@@ -0,0 +1,333 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_connection_configuration.cc
+  Table replication_connection_configuration (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+#include "table_replication_connection_configuration.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "slave.h"
+//#include "rpl_info.h"
+#include "rpl_rli.h"
+#include "rpl_mi.h"
+#include "sql_parse.h"
+//#include "rpl_msr.h"             /* Multisource replciation */
+
+#ifdef HAVE_REPLICATION
+THR_LOCK table_replication_connection_configuration::m_table_lock;
+/* Share descriptor; the DDL column order is the field_index order used by read_row_values(). */
+PFS_engine_table_share
+table_replication_connection_configuration::m_share=
+{
+  { C_STRING_WITH_LEN("replication_connection_configuration") },
+  &pfs_readonly_acl,
+  table_replication_connection_configuration::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_connection_configuration::get_row_count, /* records */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE replication_connection_configuration(" /* NOTE(review): column comments below contain typos ('Wether', 'encription'); runtime text, left untouched */
+  "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null comment 'The replication channel used.',"
+  "HOST CHAR(60) collate utf8_bin not null comment 'The host name of the source that the replica is connected to.',"
+  "PORT INTEGER not null comment 'The port used to connect to the source.',"
+  "USER CHAR(32) collate utf8_bin not null comment 'The user name of the replication user account used to connect to the source.',"
+  "NETWORK_INTERFACE CHAR(60) collate utf8_bin not null comment 'The network interface that the replica is bound to, if any.',"
+  "AUTO_POSITION ENUM('1','0') not null comment '1 if GTID auto-positioning is in use; otherwise 0.',"
+  "SSL_ALLOWED ENUM('YES','NO','IGNORED') not null comment 'Wether SSL is allowed for the replica connection.',"
+  "SSL_CA_FILE VARCHAR(512) not null comment 'Path to the file that contains one or more certificates for trusted Certificate Authorities (CA) to use for TLS.',"
+  "SSL_CA_PATH VARCHAR(512) not null comment 'Path to a directory that contains one or more PEM files that contain X509 certificates for a trusted Certificate Authority (CA) to use for TLS.',"
+  "SSL_CERTIFICATE VARCHAR(512) not null comment 'Path to the certificate used to authenticate the master.',"
+  "SSL_CIPHER VARCHAR(512) not null comment 'Which cipher is used for encription.',"
+  "SSL_KEY VARCHAR(512) not null comment 'Path to the private key used for TLS.',"
+  "SSL_VERIFY_SERVER_CERTIFICATE ENUM('YES','NO') not null comment 'Wether the server certificate is verified as part of the SSL connection',"
+  "SSL_CRL_FILE VARCHAR(255) not null comment 'Path to the PEM file containing one or more revoked X.509 certificates.',"
+  "SSL_CRL_PATH VARCHAR(255) not null comment 'PATH to a folder containing PEM files containing one or more revoked X.509 certificates.',"
+  "CONNECTION_RETRY_INTERVAL INTEGER not null comment 'The number of seconds between connect retries.',"
+  "CONNECTION_RETRY_COUNT BIGINT unsigned not null comment 'The number of times the replica can attempt to reconnect to the source in the event of a lost connection.',"
+  "HEARTBEAT_INTERVAL DOUBLE(10,3) unsigned not null COMMENT 'Number of seconds after which a heartbeat will be sent.',"
+  "TLS_VERSION VARCHAR(255) not null comment 'Not implemented, always blank.')") },
+  false  /* perpetual */
+};
+
+
+PFS_engine_table* table_replication_connection_configuration::create(void)
+{
+  return new table_replication_connection_configuration();  /* factory referenced from m_share */
+}
+
+table_replication_connection_configuration
+  ::table_replication_connection_configuration()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{} /* starts with no cached row and both positions at index 0 */
+
+table_replication_connection_configuration
+  ::~table_replication_connection_configuration()
+{} /* empty body: nothing is released here */
+
+void table_replication_connection_configuration::reset_position(void)
+{
+  m_next_pos.m_index= 0;  /* the next scan starts over from index 0 */
+  m_pos.m_index= 0;       /* and so does the current position */
+}
+
+ha_rows table_replication_connection_configuration::get_row_count()
+{
+  /*
+     Row estimate: the number of entries currently in master_info_hash
+     (rnd_next() may return fewer, it skips entries without a host).
+  */
+
+ return master_info_index->master_info_hash.records;
+}
+
+int table_replication_connection_configuration::rnd_next(void)
+{
+  int res= HA_ERR_END_OF_FILE;
+  mysql_mutex_lock(&LOCK_active_mi);
+  /* Resume at m_next_pos and stop at the first master with a host set. */
+  m_pos.set_at(&m_next_pos);
+  while (m_pos.m_index < master_info_index->master_info_hash.records)
+  {
+    Master_info *mi=
+      (Master_info *)my_hash_element(&master_info_index->master_info_hash,
+                                     m_pos.m_index);
+    if (mi && mi->host[0])
+    {
+      make_row(mi);
+      m_next_pos.set_after(&m_pos);
+      res= 0;
+      break;
+    }
+    m_pos.next();
+  }
+
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return res;
+}
+
+int table_replication_connection_configuration::rnd_pos(const void *pos)
+{
+  Master_info *mi;
+  int res= HA_ERR_RECORD_DELETED;  /* kept unless a row is rebuilt below */
+
+  mysql_mutex_lock(&LOCK_active_mi);
+  set_position(pos);
+  mi= (Master_info *)my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
+  /* Same filter as rnd_next(): a master without a host never yields a row. */
+  if (mi && mi->host[0])
+  {
+    make_row(mi);
+    res= 0;
+  }
+
+  mysql_mutex_unlock(&LOCK_active_mi);
+  return res;
+}
+
+void table_replication_connection_configuration::make_row(Master_info *mi)
+{
+  char * temp_store;
+  /* Snapshot the connection settings of mi into m_row while holding both data locks. */
+  m_row_exists= false;
+
+  /* The row counts as missing until every field below has been copied. */
+  assert(mi != NULL);
+
+  mysql_mutex_lock(&mi->data_lock);
+  mysql_mutex_lock(&mi->rli.data_lock);
+
+  m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
+  memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
+
+  m_row.host_length= static_cast<uint>(strlen(mi->host));
+  memcpy(m_row.host, mi->host, m_row.host_length);
+
+  m_row.port= (unsigned int) mi->port;
+
+  /* NOTE(review): assumes mi->user is never NULL, strlen() below would crash otherwise - confirm */
+  temp_store= (char*)mi->user;
+  m_row.user_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.user, temp_store, m_row.user_length);
+  /* NETWORK_INTERFACE is always reported empty here (the bind_addr source is commented out). */
+  temp_store= const_cast<char*>(""); //(char*)mi->bind_addr;
+  m_row.network_interface_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.network_interface, temp_store, m_row.network_interface_length);
+
+  if (mi->using_gtid)
+    m_row.auto_position= PS_RPL_YES;
+  else
+    m_row.auto_position= PS_RPL_NO;
+  /* Without HAVE_OPENSSL a requested SSL setting is reported as IGNORED rather than YES. */
+#ifdef HAVE_OPENSSL
+  m_row.ssl_allowed= mi->ssl? PS_SSL_ALLOWED_YES:PS_SSL_ALLOWED_NO;
+#else
+  m_row.ssl_allowed= mi->ssl? PS_SSL_ALLOWED_IGNORED:PS_SSL_ALLOWED_NO;
+#endif
+
+  temp_store= (char*)mi->ssl_ca;
+  m_row.ssl_ca_file_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_ca_file, temp_store, m_row.ssl_ca_file_length);
+
+  temp_store= (char*)mi->ssl_capath;
+  m_row.ssl_ca_path_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_ca_path, temp_store, m_row.ssl_ca_path_length);
+
+  temp_store= (char*)mi->ssl_cert;
+  m_row.ssl_certificate_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_certificate, temp_store, m_row.ssl_certificate_length);
+
+  temp_store= (char*)mi->ssl_cipher;
+  m_row.ssl_cipher_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_cipher, temp_store, m_row.ssl_cipher_length);
+
+  temp_store= (char*)mi->ssl_key;
+  m_row.ssl_key_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_key, temp_store, m_row.ssl_key_length);
+
+  if (mi->ssl_verify_server_cert)
+    m_row.ssl_verify_server_certificate= PS_RPL_YES;
+  else
+    m_row.ssl_verify_server_certificate= PS_RPL_NO;
+
+  temp_store= (char*)mi->ssl_crl;
+  m_row.ssl_crl_file_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_crl_file, temp_store, m_row.ssl_crl_file_length);
+
+  temp_store= (char*)mi->ssl_crlpath;
+  m_row.ssl_crl_path_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.ssl_crl_path, temp_store, m_row.ssl_crl_path_length);
+
+  m_row.connection_retry_interval= (unsigned int) mi->connect_retry;
+  /* CONNECTION_RETRY_COUNT is always reported as 0 here (the retry_count source is commented out). */
+  m_row.connection_retry_count= 0; //(ulong) mi->retry_count;
+
+  m_row.heartbeat_interval= (double)mi->heartbeat_period;
+  /* TLS_VERSION is always reported empty here. */
+  temp_store= (char*)""; //mi->tls_version;
+  m_row.tls_version_length= static_cast<uint>(strlen(temp_store));
+  memcpy(m_row.tls_version, temp_store, m_row.tls_version_length);
+
+  mysql_mutex_unlock(&mi->rli.data_lock);
+  mysql_mutex_unlock(&mi->data_lock);
+
+  m_row_exists= true;
+}
+
+int table_replication_connection_configuration::read_row_values(TABLE *table,
+                                                                unsigned char *,
+                                                                Field **fields,
+                                                                bool read_all)
+{
+  /* Nothing to emit unless make_row() completed for the current position. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* No null bytes are expected, so the row buffer itself is not touched. */
+  assert(table->s->null_bytes == 0);
+
+  for (Field *col= *fields; col != NULL; col= *++fields)
+  {
+    /* Skip columns that are not part of the read set. */
+    if (!read_all && !bitmap_is_set(table->read_set, col->field_index))
+      continue;
+    switch(col->field_index)
+    {
+    case 0: /* channel_name */
+      set_field_char_utf8(col, m_row.channel_name, m_row.channel_name_length);
+      break;
+    case 1: /* host */
+      set_field_char_utf8(col, m_row.host, m_row.host_length);
+      break;
+    case 2: /* port */
+      set_field_ulong(col, m_row.port);
+      break;
+    case 3: /* user */
+      set_field_char_utf8(col, m_row.user, m_row.user_length);
+      break;
+    case 4: /* network_interface */
+      set_field_char_utf8(col, m_row.network_interface,
+                          m_row.network_interface_length);
+      break;
+    case 5: /* auto_position */
+      set_field_enum(col, m_row.auto_position);
+      break;
+    case 6: /* ssl_allowed */
+      set_field_enum(col, m_row.ssl_allowed);
+      break;
+    case 7: /* ssl_ca_file */
+      set_field_varchar_utf8(col, m_row.ssl_ca_file,
+                             m_row.ssl_ca_file_length);
+      break;
+    case 8: /* ssl_ca_path */
+      set_field_varchar_utf8(col, m_row.ssl_ca_path,
+                             m_row.ssl_ca_path_length);
+      break;
+    case 9: /* ssl_certificate */
+      set_field_varchar_utf8(col, m_row.ssl_certificate,
+                             m_row.ssl_certificate_length);
+      break;
+    case 10: /* ssl_cipher */
+      set_field_varchar_utf8(col, m_row.ssl_cipher, m_row.ssl_cipher_length);
+      break;
+    case 11: /* ssl_key */
+      set_field_varchar_utf8(col, m_row.ssl_key, m_row.ssl_key_length);
+      break;
+    case 12: /* ssl_verify_server_certificate */
+      set_field_enum(col, m_row.ssl_verify_server_certificate);
+      break;
+    case 13: /* ssl_crl_file */
+      set_field_varchar_utf8(col, m_row.ssl_crl_file,
+                             m_row.ssl_crl_file_length);
+      break;
+    case 14: /* ssl_crl_path */
+      set_field_varchar_utf8(col, m_row.ssl_crl_path,
+                             m_row.ssl_crl_path_length);
+      break;
+    case 15: /* connection_retry_interval */
+      set_field_ulong(col, m_row.connection_retry_interval);
+      break;
+    case 16: /* connection_retry_count */
+      set_field_ulonglong(col, m_row.connection_retry_count);
+      break;
+    case 17: /* heartbeat_interval */
+      set_field_double(col, m_row.heartbeat_interval);
+      break;
+    case 18: /* tls_version */
+      set_field_varchar_utf8(col, m_row.tls_version,
+                             m_row.tls_version_length);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
+#endif
diff --git a/storage/perfschema/table_replication_connection_configuration.h b/storage/perfschema/table_replication_connection_configuration.h
new file mode 100644
index 00000000000..4eb4726f7a6
--- /dev/null
+++ b/storage/perfschema/table_replication_connection_configuration.h
@@ -0,0 +1,152 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_CONFIGURATION_H
+#define TABLE_REPLICATION_CONFIGURATION_H
+
+/**
+  @file storage/perfschema/table_replication_connection_configuration.h
+  Table replication_connection_configuration (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "rpl_mi.h"
+#include "mysql_com.h"
+#include "my_thread.h"
+//#include "rpl_msr.h"
+//#include "rpl_info.h"  /* CHANNEL_NAME_LENGTH*/
+
+class Master_info;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+#ifndef ENUM_RPL_YES_NO
+#define ENUM_RPL_YES_NO
+enum enum_rpl_yes_no {  /* two-state value; numbering starts at 1 */
+  PS_RPL_YES= 1,
+  PS_RPL_NO= 2
+};
+#endif
+
+/** Enum values for SSL_Allowed; numbering starts at 1. */
+enum enum_ssl_allowed {
+    PS_SSL_ALLOWED_YES= 1,
+    PS_SSL_ALLOWED_NO= 2,
+    PS_SSL_ALLOWED_IGNORED= 3
+};
+
+/**
+  A row in the table. The fields with string values have an additional
+  length field denoted by <field_name>_length.
+*/
+struct st_row_connect_config {  /* row buffer of the table class below */
+  char channel_name[CHANNEL_NAME_LENGTH];
+  uint channel_name_length;
+  char host[HOSTNAME_LENGTH];
+  uint host_length;
+  uint port;
+  char user[USERNAME_LENGTH];
+  uint user_length;
+  char network_interface[HOSTNAME_LENGTH];
+  uint network_interface_length;
+  enum_rpl_yes_no auto_position;
+  enum_ssl_allowed ssl_allowed;
+  char ssl_ca_file[FN_REFLEN];
+  uint ssl_ca_file_length;
+  char ssl_ca_path[FN_REFLEN];
+  uint ssl_ca_path_length;
+  char ssl_certificate[FN_REFLEN];
+  uint ssl_certificate_length;
+  char ssl_cipher[FN_REFLEN];
+  uint ssl_cipher_length;
+  char ssl_key[FN_REFLEN];
+  uint ssl_key_length;
+  enum_rpl_yes_no ssl_verify_server_certificate; /* emitted as an enum column */
+  char ssl_crl_file[FN_REFLEN];
+  uint ssl_crl_file_length;
+  char ssl_crl_path[FN_REFLEN];
+  uint ssl_crl_path_length;
+  uint connection_retry_interval;   /* emitted via set_field_ulong */
+  ulong connection_retry_count;     /* emitted via set_field_ulonglong */
+  double heartbeat_interval;        /* seconds after which heartbeat is sent */
+  char tls_version[FN_REFLEN];
+  uint tls_version_length;
+};
+
+/** Table PERFORMANCE_SCHEMA.REPLICATION_CONNECTION_CONFIGURATION. */
+class table_replication_connection_configuration: public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;
+
+private:
+  void make_row(Master_info *); /* presumably fills m_row; defined in the .cc */
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current row. */
+  st_row_connect_config m_row;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 once the requested columns have been copied
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_connection_configuration(); /* protected: see create() */
+
+public:
+  ~table_replication_connection_configuration();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();      /* static factory */
+  static ha_rows get_row_count();
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_connection_status.cc b/storage/perfschema/table_replication_connection_status.cc
new file mode 100644
index 00000000000..fd0b5309f7e
--- /dev/null
+++ b/storage/perfschema/table_replication_connection_status.cc
@@ -0,0 +1,440 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_connection_status.cc
+  Table replication_connection_status (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+#include "table_replication_connection_status.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "slave.h"
+//#include "rpl_info.h"
+#include "rpl_rli.h"
+#include "rpl_mi.h"
+#include "sql_parse.h"
+//#include "rpl_msr.h"           /* Multi source replication */
+#include "log.h"
+//#include "rpl_group_replication.h"
+
+/*
+  Callbacks implementation for GROUP_REPLICATION_CONNECTION_STATUS_CALLBACKS.
+*/
+static void set_channel_name(void* const context, const char& value,
+                             size_t length)
+{  /* no-op: the body is empty, so all three arguments are ignored */
+}
+
+static void set_group_name(void* const context, const char& value,
+                           size_t length)
+{
+  /* Callback: copy at most UUID_LENGTH bytes of the group name into the row. */
+  st_row_connect_status* const target=
+      static_cast<st_row_connect_status*>(context);
+  const size_t copy_len= std::min<size_t>(length, UUID_LENGTH);
+
+  target->group_name_is_null= false;
+  memcpy(target->group_name, &value, copy_len);
+}
+
+static void set_source_uuid(void* const context, const char& value,
+                            size_t length)
+{
+  /* Callback: copy at most UUID_LENGTH bytes of the source UUID into the row. */
+  st_row_connect_status* const target=
+      static_cast<st_row_connect_status*>(context);
+  const size_t copy_len= std::min<size_t>(length, UUID_LENGTH);
+
+  target->source_uuid_is_null= false;
+  memcpy(target->source_uuid, &value, copy_len);
+}
+
+static void set_service_state(void* const context, bool value)
+{
+  /* Callback: record the service state as YES when value is true, else NO. */
+  st_row_connect_status* const target=
+      static_cast<st_row_connect_status*>(context);
+  if (value) target->service_state= PS_RPL_CONNECT_SERVICE_STATE_YES;
+  else       target->service_state= PS_RPL_CONNECT_SERVICE_STATE_NO;
+}
+
+
+THR_LOCK table_replication_connection_status::m_table_lock;  /* definition */
+
+
+/* Columns in field_index order; char/varchar sizes count utf8 characters. */
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    {C_STRING_WITH_LEN("CHANNEL_NAME")},
+    {C_STRING_WITH_LEN("char(64)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("GROUP_NAME")},
+    {C_STRING_WITH_LEN("char(36)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("SOURCE_UUID")},
+    {C_STRING_WITH_LEN("char(36)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("THREAD_ID")},
+    {C_STRING_WITH_LEN("bigint(20)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("SERVICE_STATE")},
+    {C_STRING_WITH_LEN("enum('ON','OFF','CONNECTING')")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("COUNT_RECEIVED_HEARTBEATS")},
+    {C_STRING_WITH_LEN("bigint(20)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("LAST_HEARTBEAT_TIMESTAMP")},
+    {C_STRING_WITH_LEN("timestamp")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("RECEIVED_TRANSACTION_SET")},
+    {C_STRING_WITH_LEN("longtext")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("LAST_ERROR_NUMBER")},
+    {C_STRING_WITH_LEN("int(11)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("LAST_ERROR_MESSAGE")},
+    {C_STRING_WITH_LEN("varchar(1024)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("LAST_ERROR_TIMESTAMP")},
+    {C_STRING_WITH_LEN("timestamp")},
+    {NULL, 0}
+  }
+};
+/* The count below (11) must equal the number of entries in field_types. */
+TABLE_FIELD_DEF
+table_replication_connection_status::m_field_def= { 11, field_types };
+
+PFS_engine_table_share
+table_replication_connection_status::m_share=
+{
+  { C_STRING_WITH_LEN("replication_connection_status") }, /* table name */
+  &pfs_readonly_acl, /* ACL */
+  table_replication_connection_status::create, /* cursor factory */
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_connection_status::get_row_count, /* records */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock, /* table share lock */
+  &m_field_def, /* fields definition */
+  false, /* checked */
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_replication_connection_status::create(void)
+{  /* Factory referenced by m_share: returns a new heap-allocated cursor. */
+  return new table_replication_connection_status;
+}
+
+table_replication_connection_status::table_replication_connection_status()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false),
+    m_pos(0), m_next_pos(0)
+{}  /* starts with no row built and both positions at index 0 */
+
+table_replication_connection_status::~table_replication_connection_status()
+{  /* empty body: the destructor itself releases nothing */
+}
+
+void table_replication_connection_status::reset_position(void)
+{
+  /* Rewind the scan: current and next position both return to index 0. */
+  m_next_pos.m_index= m_pos.m_index= 0;
+}
+
+ha_rows table_replication_connection_status::get_row_count()
+{ /* Only an estimate is needed, so the channel map is read without a lock. */
+  const ha_rows estimate= channel_map.get_max_channels();
+  return estimate;
+}
+
+
+
+int table_replication_connection_status::rnd_next(void)
+{
+  /* Advance to the next channel whose Master_info has a non-empty host and
+     build its row; 0 on success, HA_ERR_END_OF_FILE when slots run out. */
+  int rc= HA_ERR_END_OF_FILE;
+
+  channel_map.rdlock();
+  m_pos.set_at(&m_next_pos);
+  while (rc != 0 && m_pos.m_index < channel_map.get_max_channels())
+  {
+    Master_info *candidate= channel_map.get_mi_at_pos(m_pos.m_index);
+    if (candidate == NULL || !candidate->host[0])
+    {
+      m_pos.next();
+      continue;
+    }
+    make_row(candidate);
+    m_next_pos.set_after(&m_pos);
+    rc= 0;
+  }
+  channel_map.unlock();
+  return rc;
+}
+
+int table_replication_connection_status::rnd_pos(const void *pos)
+{
+  /*
+    Position the cursor at 'pos' and build the row for the channel found
+    there. Returns 0 when the slot holds a Master_info, otherwise
+    HA_ERR_RECORD_DELETED.
+  */
+  set_position(pos);
+
+  channel_map.rdlock();
+  Master_info *const found= channel_map.get_mi_at_pos(m_pos.m_index);
+  const int rc= (found != NULL) ? 0 : HA_ERR_RECORD_DELETED;
+  if (found != NULL)
+    make_row(found);
+  channel_map.unlock();
+
+  return rc;
+}
+
+void table_replication_connection_status::make_row(Master_info *mi)
+{
+  /*
+    Fill m_row from 'mi' under its data locks; m_row_exists is set only when
+    the received GTID set could be rendered as text. */
+  DBUG_ENTER("table_replication_connection_status::make_row");
+  m_row_exists= false;
+  bool error= false;
+  m_row.cleanup();  /* free GTID text of a built-but-unread row */
+  /* Default values */
+  m_row.group_name_is_null= true;
+  m_row.source_uuid_is_null= true;
+  m_row.thread_id_is_null= true;
+  m_row.service_state= PS_RPL_CONNECT_SERVICE_STATE_NO;
+
+  assert(mi != NULL);
+  assert(mi->rli != NULL);
+
+  mysql_mutex_lock(&mi->data_lock);
+  mysql_mutex_lock(&mi->rli->data_lock);
+
+  m_row.channel_name_length= mi->get_channel() ? strlen(mi->get_channel()):0;
+  memcpy(m_row.channel_name, mi->get_channel(), m_row.channel_name_length);
+
+  if (is_group_replication_plugin_loaded() &&
+      channel_map.is_group_replication_channel_name(mi->get_channel(), true))
+  {
+    /*
+      Group Replication applier channel.
+      Set callbacks on GROUP_REPLICATION_CONNECTION_STATUS_CALLBACKS.
+    */
+    const GROUP_REPLICATION_CONNECTION_STATUS_CALLBACKS callbacks=
+    {
+      &m_row,
+      &set_channel_name,
+      &set_group_name,
+      &set_source_uuid,
+      &set_service_state,
+    };
+
+    // Query plugin and let callbacks do their job.
+    if (get_group_replication_connection_status_info(callbacks))
+    {
+      DBUG_PRINT("info", ("Group Replication stats not available!"));
+    }
+  }
+  else
+  {
+    /* Slave channel. */
+    if (mi->master_uuid[0] != 0)
+    {
+      memcpy(m_row.source_uuid, mi->master_uuid, UUID_LENGTH);
+      m_row.source_uuid_is_null= false;
+    }
+
+    if (mi->slave_running == MYSQL_SLAVE_RUN_CONNECT)
+      m_row.service_state= PS_RPL_CONNECT_SERVICE_STATE_YES;
+    else if (mi->slave_running == MYSQL_SLAVE_RUN_NOT_CONNECT)
+      m_row.service_state= PS_RPL_CONNECT_SERVICE_STATE_CONNECTING;
+    else
+      m_row.service_state= PS_RPL_CONNECT_SERVICE_STATE_NO;
+  }
+
+  if (mi->slave_running == MYSQL_SLAVE_RUN_CONNECT)
+  {
+    PSI_thread *psi= thd_get_psi(mi->info_thd);
+    PFS_thread *pfs= reinterpret_cast<PFS_thread *> (psi);
+    if(pfs)
+    {
+      m_row.thread_id= pfs->m_thread_internal_id;
+      m_row.thread_id_is_null= false;
+    }
+  }
+
+  m_row.count_received_heartbeats= mi->received_heartbeats;
+  /*
+    Time in microseconds since epoch. mi->last_heartbeat contains
+    number of seconds so we multiply by 1000000.
+  */
+  m_row.last_heartbeat_timestamp= (ulonglong)mi->last_heartbeat*1000000;
+
+  {
+    global_sid_lock->wrlock();
+    const Gtid_set* io_gtid_set= mi->rli->get_gtid_set();
+
+    if ((m_row.received_transaction_set_length=
+         io_gtid_set->to_string(&m_row.received_transaction_set)) < 0)
+    {
+      m_row.cleanup();  /* frees and NULLs, so no dangling pointer remains */
+      m_row.received_transaction_set_length= 0;
+      global_sid_lock->unlock();
+      error= true;
+      goto end;
+    }
+    global_sid_lock->unlock();
+  }
+
+  /* Errors */
+  mysql_mutex_lock(&mi->err_lock);
+  mysql_mutex_lock(&mi->rli->err_lock);
+  m_row.last_error_number= (unsigned int) mi->last_error().number;
+  m_row.last_error_message_length= 0;
+  m_row.last_error_timestamp= 0;
+
+  /** If error, set error message and timestamp */
+  if (m_row.last_error_number)
+  {
+    char* temp_store= (char*)mi->last_error().message;
+    m_row.last_error_message_length= strlen(temp_store);
+    memcpy(m_row.last_error_message, temp_store,
+           m_row.last_error_message_length);
+
+    /*
+      Time in microseconds since epoch. mi->last_error().skr contains
+      number of seconds so we multiply by 1000000. */
+    m_row.last_error_timestamp= (ulonglong)mi->last_error().skr*1000000;
+  }
+  mysql_mutex_unlock(&mi->rli->err_lock);
+  mysql_mutex_unlock(&mi->err_lock);
+
+end:
+  mysql_mutex_unlock(&mi->rli->data_lock);
+  mysql_mutex_unlock(&mi->data_lock);
+
+  if (!error)
+    m_row_exists= true;
+  DBUG_VOID_RETURN;
+}
+
+int table_replication_connection_status::read_row_values(TABLE *table,
+                                                         unsigned char *buf,
+                                                         Field **fields,
+                                                         bool read_all)
+{
+  /* Copy the requested columns of the row built by make_row() into 'fields',
+     then free the row's GTID text. HA_ERR_RECORD_DELETED if no row exists. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (Field *fld= *fields; fld != NULL; fld= *(++fields))
+  {
+    if (!read_all && !bitmap_is_set(table->read_set, fld->field_index))
+      continue;
+
+    switch(fld->field_index)
+    {
+    case 0: /** channel_name */
+      set_field_char_utf8(fld, m_row.channel_name, m_row.channel_name_length);
+      break;
+    case 1: /** group_name */
+      if (m_row.group_name_is_null)
+        fld->set_null();
+      else
+        set_field_char_utf8(fld, m_row.group_name, UUID_LENGTH);
+      break;
+    case 2: /** source_uuid */
+      if (m_row.source_uuid_is_null)
+        fld->set_null();
+      else
+        set_field_char_utf8(fld, m_row.source_uuid, UUID_LENGTH);
+      break;
+    case 3: /** thread_id */
+      if (m_row.thread_id_is_null)
+        fld->set_null();
+      else
+        set_field_ulonglong(fld, m_row.thread_id);
+      break;
+    case 4: /** service_state */
+      set_field_enum(fld, m_row.service_state);
+      break;
+    case 5: /** number of heartbeat events received */
+      set_field_ulonglong(fld, m_row.count_received_heartbeats);
+      break;
+    case 6: /** time of receipt of last heartbeat event */
+      set_field_timestamp(fld, m_row.last_heartbeat_timestamp);
+      break;
+    case 7: /** received_transaction_set */
+      set_field_longtext_utf8(fld, m_row.received_transaction_set,
+                              m_row.received_transaction_set_length);
+      break;
+    case 8: /** last_error_number */
+      set_field_ulong(fld, m_row.last_error_number);
+      break;
+    case 9: /** last_error_message */
+      set_field_varchar_utf8(fld, m_row.last_error_message,
+                             m_row.last_error_message_length);
+      break;
+    case 10: /** last_error_timestamp */
+      set_field_timestamp(fld, m_row.last_error_timestamp);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  m_row.cleanup();
+
+  return 0;
+}
diff --git a/storage/perfschema/table_replication_connection_status.h b/storage/perfschema/table_replication_connection_status.h
new file mode 100644
index 00000000000..596a4e9ed49
--- /dev/null
+++ b/storage/perfschema/table_replication_connection_status.h
@@ -0,0 +1,150 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_CONNECTION_STATUS_H
+#define TABLE_REPLICATION_CONNECTION_STATUS_H
+
+/**
+  @file storage/perfschema/table_replication_connection_status.h
+  Table replication_connection_status (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "rpl_mi.h"
+#include "rpl_reporting.h" /* MAX_SLAVE_ERRMSG */
+#include "mysql_com.h"
+//#include "rpl_msr.h"
+//#include "rpl_info.h"  /*CHANNEL_NAME_LENGTH */
+#include "my_thread.h"
+
+class Master_info;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+#ifndef ENUM_RPL_YES_NO
+#define ENUM_RPL_YES_NO
+enum enum_rpl_yes_no {  /* two-state value; numbering starts at 1 */
+  PS_RPL_YES= 1,
+  PS_RPL_NO= 2
+};
+#endif
+
+enum enum_rpl_connect_status_service_state {  /* values start at 1 */
+  PS_RPL_CONNECT_SERVICE_STATE_YES= 1,
+  PS_RPL_CONNECT_SERVICE_STATE_NO= 2,
+  PS_RPL_CONNECT_SERVICE_STATE_CONNECTING= 3
+};
+
+/*
+  A row in the table. The fields with string values have an additional
+  length field denoted by <field_name>_length.
+*/
+struct st_row_connect_status {  /* row buffer; strings are not NUL-terminated */
+  char group_name[NAME_LEN];
+  bool group_name_is_null;
+  char channel_name[CHANNEL_NAME_LENGTH];
+  uint channel_name_length;
+  char source_uuid[NAME_LEN];   // sized like group_name: UUID_LENGTH bytes used
+  bool source_uuid_is_null;
+  ulonglong thread_id;
+  bool thread_id_is_null;
+  enum_rpl_connect_status_service_state service_state;
+  ulonglong count_received_heartbeats;
+  ulonglong last_heartbeat_timestamp;
+  char* received_transaction_set;       // heap text, released by cleanup()
+  int received_transaction_set_length;
+  uint last_error_number;
+  char last_error_message[MAX_SLAVE_ERRMSG];
+  uint last_error_message_length;
+  ulonglong last_error_timestamp;
+
+  st_row_connect_status() : received_transaction_set(NULL) {}
+
+  void cleanup()  // frees received_transaction_set and resets it to NULL
+  {
+    if (received_transaction_set != NULL)
+    {
+      my_free(received_transaction_set);
+      received_transaction_set= NULL;
+    }
+  }
+};
+
+
+/** Table PERFORMANCE_SCHEMA.REPLICATION_CONNECTION_STATUS. */
+class table_replication_connection_status: public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;
+
+private:
+  void make_row(Master_info *mi);  /* fills m_row from mi, sets m_row_exists */
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current row. */
+  st_row_connect_status m_row;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED when no row exists
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_connection_status();  /* protected: see create() */
+
+public:
+  ~table_replication_connection_status();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();      /* static factory */
+  static ha_rows get_row_count();         /* estimate, read without a lock */
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_replication_group_member_stats.cc b/storage/perfschema/table_replication_group_member_stats.cc
new file mode 100644
index 00000000000..7d15e383d70
--- /dev/null
+++ b/storage/perfschema/table_replication_group_member_stats.cc
@@ -0,0 +1,372 @@
+/*
+  Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_group_member_stats.cc
+  Table replication_group_member_stats (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+#include "table_replication_group_member_stats.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "log.h"
+#include "rpl_group_replication.h"
+
+/*
+  Callbacks implementation for GROUP_REPLICATION_GROUP_MEMBER_STATS_CALLBACKS.
+*/
+static void set_channel_name(void* const context, const char& value,
+                             size_t length)
+{
+  /* Callback: copy the channel name, cut to CHANNEL_NAME_LENGTH bytes. */
+  st_row_group_member_stats* const target=
+      static_cast<st_row_group_member_stats*>(context);
+  const size_t copy_len= std::min<size_t>(length, CHANNEL_NAME_LENGTH);
+
+  memcpy(target->channel_name, &value, copy_len);
+  target->channel_name_length= copy_len;
+}
+
+static void set_view_id(void* const context, const char& value, size_t length)
+{
+  /* Callback: copy the view id, cut to HOSTNAME_LENGTH bytes. */
+  st_row_group_member_stats* const target=
+      static_cast<st_row_group_member_stats*>(context);
+  const size_t copy_len= std::min<size_t>(length, HOSTNAME_LENGTH);
+
+  memcpy(target->view_id, &value, copy_len);
+  target->view_id_length= copy_len;
+}
+
+static void set_member_id(void* const context, const char& value, size_t length)
+{
+  /* Callback: copy the member id without overrunning row->member_id. */
+  struct st_row_group_member_stats* row=
+      static_cast<struct st_row_group_member_stats*>(context);
+  const size_t max= std::min<size_t>(UUID_LENGTH, sizeof(row->member_id));
+  length= std::min(length, max);  /* member_id[] may be shorter than a UUID */
+  row->member_id_length= length;
+  memcpy(row->member_id, &value, length);
+}
+
+static void set_transactions_committed(void* const context, const char& value,
+                                       size_t length)
+{
+  /* Callback: store a heap copy of the committed-transactions text in the row. */
+  struct st_row_group_member_stats* row=
+      static_cast<struct st_row_group_member_stats*>(context);
+  if (row->trx_committed != NULL)
+    my_free(row->trx_committed);   /* drop the copy from a previous call */
+
+  /* On allocation failure leave the column empty instead of copying to NULL. */
+  row->trx_committed= (char*) my_malloc(PSI_NOT_INSTRUMENTED, length, MYF(0));
+  row->trx_committed_length= row->trx_committed != NULL ? length : 0;
+  if (row->trx_committed != NULL)
+    memcpy(row->trx_committed, &value, length);
+}
+
+static void set_last_conflict_free_transaction(void* const context,
+                                               const char& value, size_t length)
+{
+  /* Callback: copy the text, cut to Gtid::MAX_TEXT_LENGTH+1 bytes. */
+  st_row_group_member_stats* const target=
+      static_cast<st_row_group_member_stats*>(context);
+  const size_t copy_len= std::min<size_t>(length, Gtid::MAX_TEXT_LENGTH+1);
+
+  memcpy(target->last_cert_trx, &value, copy_len);
+  target->last_cert_trx_length= copy_len;
+}
+
+static void set_transactions_in_queue(void* const context,
+                                      unsigned long long int value)
+{
+  /* Callback: store the value later shown as
+     COUNT_TRANSACTIONS_IN_QUEUE. */
+  static_cast<st_row_group_member_stats*>(context)->trx_in_queue= value;
+}
+
+static void set_transactions_certified(void* const context,
+                                       unsigned long long int value)
+{
+  /* Callback: store the value later shown as
+     COUNT_TRANSACTIONS_CHECKED. */
+  static_cast<st_row_group_member_stats*>(context)->trx_checked= value;
+}
+
+static void set_transactions_conflicts_detected(void* const context,
+                                                unsigned long long int value)
+{
+  /* Callback: store the value later shown as
+     COUNT_CONFLICTS_DETECTED. */
+  static_cast<st_row_group_member_stats*>(context)->trx_conflicts= value;
+}
+
+static void set_transactions_rows_in_validation(void* const context,
+                                                unsigned long long int value)
+{
+  /* Callback: store the value later shown as
+     COUNT_TRANSACTIONS_ROWS_VALIDATING. */
+  static_cast<st_row_group_member_stats*>(context)->trx_rows_validating= value;
+}
+
+
+THR_LOCK table_replication_group_member_stats::m_table_lock;  /* definition */
+
+static const TABLE_FIELD_TYPE field_types[]=  /* columns in field_index order */
+{
+  {
+    {C_STRING_WITH_LEN("CHANNEL_NAME")},
+    {C_STRING_WITH_LEN("char(64)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("VIEW_ID")},
+    {C_STRING_WITH_LEN("char(60)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("MEMBER_ID")},
+    {C_STRING_WITH_LEN("char(36)")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("COUNT_TRANSACTIONS_IN_QUEUE")},
+    {C_STRING_WITH_LEN("bigint")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("COUNT_TRANSACTIONS_CHECKED")},
+    {C_STRING_WITH_LEN("bigint")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("COUNT_CONFLICTS_DETECTED")},
+    {C_STRING_WITH_LEN("bigint")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("COUNT_TRANSACTIONS_ROWS_VALIDATING")},
+    {C_STRING_WITH_LEN("bigint")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("TRANSACTIONS_COMMITTED_ALL_MEMBERS")},
+    {C_STRING_WITH_LEN("longtext")},
+    {NULL, 0}
+  },
+  {
+    {C_STRING_WITH_LEN("LAST_CONFLICT_FREE_TRANSACTION")},
+    {C_STRING_WITH_LEN("text")},
+    {NULL, 0}
+  }
+};
+/* The count below (9) must equal the number of entries in field_types. */
+TABLE_FIELD_DEF
+table_replication_group_member_stats::m_field_def=
+{ 9, field_types };
+
+PFS_engine_table_share
+table_replication_group_member_stats::m_share=
+{
+  { C_STRING_WITH_LEN("replication_group_member_stats") }, /* table name */
+  &pfs_readonly_acl, /* ACL */
+  &table_replication_group_member_stats::create, /* cursor factory */
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_group_member_stats::get_row_count, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock, /* table share lock */
+  &m_field_def, /* fields definition */
+  false, /* checked */
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_replication_group_member_stats::create(void)
+{  /* Factory referenced by m_share: returns a new heap-allocated cursor. */
+  return new table_replication_group_member_stats;
+}
+
+table_replication_group_member_stats::table_replication_group_member_stats()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false),
+    m_pos(0),
+    m_next_pos(0)
+{ m_row.trx_committed= NULL; }  /* no committed-set buffer allocated yet */
+
+table_replication_group_member_stats::~table_replication_group_member_stats()
+{
+  /* Release the heap copy made by set_transactions_committed(), if any. */
+  if (m_row.trx_committed == NULL)
+    return;
+  my_free(m_row.trx_committed);
+  m_row.trx_committed= NULL;
+}
+
+void table_replication_group_member_stats::reset_position(void)
+{
+  /* Rewind the scan: current and next position both return to index 0. */
+  m_next_pos.m_index= m_pos.m_index= 0;
+}
+
+ha_rows table_replication_group_member_stats::get_row_count()
+{
+  /*
+    The table exposes at most one row: 1 when the Group Replication plugin
+    is loaded, 0 otherwise.
+  */
+  const uint row_count= is_group_replication_plugin_loaded() ? 1 : 0;
+  return row_count;
+}
+
+int table_replication_group_member_stats::rnd_next(void)
+{
+  /* Single-row scan: index 0 yields the row, any other index ends the scan.
+     Without the Group Replication plugin the scan ends immediately. */
+  if (!is_group_replication_plugin_loaded())
+    return HA_ERR_END_OF_FILE;
+
+  m_pos.set_at(&m_next_pos);
+  if (m_pos.m_index != 0)
+    return HA_ERR_END_OF_FILE;
+
+  make_row();
+  m_next_pos.set_after(&m_pos);
+  return 0;
+}
+
+int table_replication_group_member_stats::rnd_pos(const void *pos)
+{
+  /* Rebuild the row at a saved position; the assert expects index 0. */
+  const bool table_is_empty= (get_row_count() == 0);
+  if (table_is_empty)
+    return HA_ERR_END_OF_FILE;
+  set_position(pos);
+  assert(m_pos.m_index < 1);
+  make_row();
+  return 0;
+}
+
+void table_replication_group_member_stats::make_row()
+{
+  DBUG_ENTER("table_replication_group_member_stats::make_row");
+  /*
+    Start from an empty row: every counter and length is zeroed and the row
+    is flagged as missing until the plugin query below succeeds. */
+  m_row_exists= false;
+  m_row.trx_in_queue= 0;
+  m_row.trx_checked= 0;
+  m_row.trx_conflicts= 0;
+  m_row.trx_rows_validating= 0;
+  m_row.channel_name_length= 0;
+  m_row.view_id_length= 0;
+  m_row.member_id_length= 0;
+  m_row.trx_committed_length= 0;
+  m_row.last_cert_trx_length= 0;
+
+  /* Callback table for the plugin; &m_row is presumably handed back as context. */
+  const GROUP_REPLICATION_GROUP_MEMBER_STATS_CALLBACKS callbacks=
+  {
+    &m_row,
+    &set_channel_name,
+    &set_view_id,
+    &set_member_id,
+    &set_transactions_committed,
+    &set_last_conflict_free_transaction,
+    &set_transactions_in_queue,
+    &set_transactions_certified,
+    &set_transactions_conflicts_detected,
+    &set_transactions_rows_in_validation,
+  };
+
+  /* Query the plugin; a true result means the stats are unavailable. */
+  const bool failed= get_group_replication_group_member_stats_info(callbacks);
+  if (failed)
+  {
+    DBUG_PRINT("info", ("Group Replication stats not available!"));
+  }
+  m_row_exists= !failed;
+
+  DBUG_VOID_RETURN;
+}
+
+
+int table_replication_group_member_stats::read_row_values(TABLE *table,
+                                                   unsigned char *buf,
+                                                   Field **fields,
+                                                   bool read_all)
+{
+  /* Copy the requested columns of the row built by make_row() into 'fields'.
+     Returns HA_ERR_RECORD_DELETED when no row exists, 0 otherwise. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  assert(table->s->null_bytes == 0);
+  buf[0]= 0;
+
+  for (Field *fld= *fields; fld != NULL; fld= *(++fields))
+  {
+    if (!read_all && !bitmap_is_set(table->read_set, fld->field_index))
+      continue;
+
+    switch(fld->field_index)
+    {
+    case 0: /** channel_name */
+      set_field_char_utf8(fld, m_row.channel_name,
+                          m_row.channel_name_length);
+      break;
+    case 1: /** view id */
+      set_field_char_utf8(fld, m_row.view_id, m_row.view_id_length);
+      break;
+    case 2: /** member_id */
+      set_field_char_utf8(fld, m_row.member_id, m_row.member_id_length);
+      break;
+    case 3: /** transaction_in_queue */
+      set_field_ulonglong(fld, m_row.trx_in_queue);
+      break;
+    case 4: /** transactions_certified */
+      set_field_ulonglong(fld, m_row.trx_checked);
+      break;
+    case 5: /** negatively_certified_transaction */
+      set_field_ulonglong(fld, m_row.trx_conflicts);
+      break;
+    case 6: /** certification_db_size */
+      set_field_ulonglong(fld, m_row.trx_rows_validating);
+      break;
+    case 7: /** stable_set */
+      set_field_longtext_utf8(fld, m_row.trx_committed,
+                              m_row.trx_committed_length);
+      break;
+    case 8: /** last_certified_transaction */
+      /* Fixed-size buffer in the row, emitted through the longtext setter. */
+      set_field_longtext_utf8(fld, m_row.last_cert_trx,
+                              m_row.last_cert_trx_length);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
diff --git a/storage/perfschema/table_replication_group_member_stats.h b/storage/perfschema/table_replication_group_member_stats.h
new file mode 100644
index 00000000000..67a05cb8e53
--- /dev/null
+++ b/storage/perfschema/table_replication_group_member_stats.h
@@ -0,0 +1,116 @@
+/*
+   Copyright (c) 2014, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_GROUP_MEMBER_STATS_H
+#define TABLE_REPLICATION_GROUP_MEMBER_STATS_H
+
+/**
+  @file storage/perfschema/table_replication_group_member_stats.h
+  Table replication_group_member_stats (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "mysql_com.h"
+//#include "rpl_info.h"
+//#include "rpl_gtid.h"
+//#include <mysql/plugin_group_replication.h>
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table replication_group_member_stats. Each string-valued field has
+  a companion length field named <field_name>_length.
+*/
+
+struct st_row_group_member_stats {
+  char channel_name[CHANNEL_NAME_LENGTH];  // column 0, channel_name
+  uint channel_name_length;
+  char view_id[HOSTNAME_LENGTH];           // column 1, view id
+  uint view_id_length;
+  char member_id[11];   // column 2; typeof(server_id) == uint32, writers must clamp to sizeof(member_id)
+  uint member_id_length;
+  ulonglong trx_in_queue;                  // column 3, transaction_in_queue
+  ulonglong trx_checked;                   // column 4, transactions_certified
+  ulonglong trx_conflicts;                 // column 5, negatively_certified_transaction
+  ulonglong trx_rows_validating;           // column 6, certification_db_size
+  char *trx_committed;                     // column 7, stable_set; NOTE(review): owner of this buffer is not visible here
+  size_t trx_committed_length;
+  char last_cert_trx[GTID_MAX_STR_LENGTH + 1];  // column 8, last_certified_transaction
+  int last_cert_trx_length;
+};
+
+/** Table PERFORMANCE_SCHEMA.REPLICATION_GROUP_MEMBER_STATS. */
+class table_replication_group_member_stats: public PFS_engine_table
+{
+private:
+  void make_row();
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+  /** True if the current row exists; read_row_values() fails otherwise. */
+  bool m_row_exists;
+  /** Current row, copied into the record by read_row_values(). */
+  st_row_group_member_stats m_row;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED if there is no current row.
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_group_member_stats();
+
+public:
+  ~table_replication_group_member_stats();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/table_replication_group_members.cc b/storage/perfschema/table_replication_group_members.cc
new file mode 100644
index 00000000000..2a870e58ce3
--- /dev/null
+++ b/storage/perfschema/table_replication_group_members.cc
@@ -0,0 +1,281 @@
+/*
+      Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU General Public License, version 2.0,
+      as published by the Free Software Foundation.
+
+      This program is also distributed with certain software (including
+      but not limited to OpenSSL) that is licensed under separate terms,
+      as designated in a particular file or component or in included license
+      documentation.  The authors of MySQL hereby grant you an additional
+      permission to link the program and your derivative works with the
+      separately licensed software that they have included with MySQL.
+
+      This program is distributed in the hope that it will be useful,
+      but WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+      GNU General Public License, version 2.0, for more details.
+
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, write to the Free Software
+      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_replication_group_members.cc
+  Table replication_group_members (implementation).
+*/
+
+//#define HAVE_REPLICATION
+
+#include "my_global.h"
+#include "table_replication_group_members.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "log.h"
+//#include "rpl_group_replication.h"
+
+/*
+  Callbacks implementation for GROUP_REPLICATION_GROUP_MEMBERS_CALLBACKS.
+*/
+static void set_channel_name(void* const context, const char& value,
+                             size_t length)
+{
+  /* Copy the channel name into the row, cut to CHANNEL_NAME_LENGTH bytes. */
+  st_row_group_members* const dest=
+      static_cast<st_row_group_members*>(context);
+  const size_t capacity= CHANNEL_NAME_LENGTH;
+  const size_t copied= std::min(length, capacity);
+  memcpy(dest->channel_name, &value, copied);
+  dest->channel_name_length= copied;
+}
+
+static void set_member_id(void* const context, const char& value,
+                          size_t length)
+{
+  struct st_row_group_members* row=
+      static_cast<struct st_row_group_members*>(context);
+  // Clamp to the destination array too: member_id may be smaller than a UUID.
+  const size_t max= std::min<size_t>(UUID_LENGTH, sizeof(row->member_id));
+  length= std::min(length, max);
+  row->member_id_length= length;
+  memcpy(row->member_id, &value, length);
+}
+
+static void set_member_host(void* const context, const char& value,
+                            size_t length)
+{
+  /* Copy the host name into the row, cut to HOSTNAME_LENGTH bytes. */
+  st_row_group_members* const dest=
+      static_cast<st_row_group_members*>(context);
+  const size_t capacity= HOSTNAME_LENGTH;
+  const size_t copied= std::min(length, capacity);
+  memcpy(dest->member_host, &value, copied);
+  dest->member_host_length= copied;
+}
+
+static void set_member_port(void* const context, unsigned int value)
+{
+  /* Callback: record the member's port in the row passed as context. */
+  st_row_group_members* const dest= static_cast<st_row_group_members*>(context);
+  dest->member_port= value;
+}
+
+static void set_member_state(void* const context, const char& value,
+                             size_t length)
+{
+  /* Copy the member state text into the row, cut to NAME_LEN bytes. */
+  st_row_group_members* const dest=
+      static_cast<st_row_group_members*>(context);
+  const size_t capacity= NAME_LEN;
+  const size_t copied= std::min(length, capacity);
+  memcpy(dest->member_state, &value, copied);
+  dest->member_state_length= copied;
+}
+
+
+THR_LOCK table_replication_group_members::m_table_lock;
+
+/* Column name/type pairs in field_index order; char(N) counts utf8 characters. */
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  { /* field_index 0 */
+    {C_STRING_WITH_LEN("CHANNEL_NAME")},
+    {C_STRING_WITH_LEN("char(64)")},
+    {NULL, 0}
+  },
+  { /* field_index 1 */
+    {C_STRING_WITH_LEN("MEMBER_ID")},
+    {C_STRING_WITH_LEN("char(36)")},
+    {NULL, 0}
+  },
+  { /* field_index 2 */
+    {C_STRING_WITH_LEN("MEMBER_HOST")},
+    {C_STRING_WITH_LEN("char(60)")},
+    {NULL, 0}
+  },
+  { /* field_index 3; read_row_values() may store NULL here */
+    {C_STRING_WITH_LEN("MEMBER_PORT")},
+    {C_STRING_WITH_LEN("int(11)")},
+    {NULL, 0}
+  },
+  { /* field_index 4 */
+    {C_STRING_WITH_LEN("MEMBER_STATE")},
+    {C_STRING_WITH_LEN("char(64)")},
+    {NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_replication_group_members::m_field_def=
+{ 5, field_types }; /* 5 == number of entries in field_types above */
+
+PFS_engine_table_share
+table_replication_group_members::m_share=
+{ /* positional initializer: entries must stay in this order */
+  { C_STRING_WITH_LEN("replication_group_members") }, /* table name */
+  &pfs_readonly_acl,
+  &table_replication_group_members::create, /* cursor factory */
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_replication_group_members::get_row_count,
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def, /* the five columns listed in field_types */
+  false, /* checked */
+  false  /* perpetual */
+};
+
+PFS_engine_table* table_replication_group_members::create(void)
+{ /* Factory registered in m_share: allocate a new cursor for this table. */
+  return static_cast<PFS_engine_table*>(new table_replication_group_members());
+}
+
+table_replication_group_members::table_replication_group_members()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{} /* m_row is left unset here; make_row() resets it before each use */
+
+table_replication_group_members::~table_replication_group_members()
+{} /* no pointer members declared in this class, nothing to release */
+
+void table_replication_group_members::reset_position(void)
+{
+  /* Rewind the scan: current and next position both go back to index 0. */
+  m_next_pos.m_index= m_pos.m_index= 0;
+}
+
+ha_rows table_replication_group_members::get_row_count()
+{ /* Row count is the member count reported by the group replication layer. */
+  return static_cast<ha_rows>(get_group_replication_members_number_info());
+}
+
+int table_replication_group_members::rnd_next(void)
+{
+  /* Plugin not loaded: the table reads as empty. */
+  if (!is_group_replication_plugin_loaded())
+    return HA_ERR_END_OF_FILE;
+
+  /* Resume at the saved position; stop once it is past the last member. */
+  m_pos.set_at(&m_next_pos);
+  if (!(m_pos.m_index < get_row_count()))
+    return HA_ERR_END_OF_FILE;
+
+  /* Build the row, then remember where the next call should start. */
+  make_row(m_pos.m_index);
+  m_next_pos.set_after(&m_pos);
+  return 0;
+}
+
+int table_replication_group_members::rnd_pos(const void *pos)
+{
+  /* Positioned read; without the plugin there is nothing to position on. */
+  const bool plugin_ready= is_group_replication_plugin_loaded();
+  if (!plugin_ready)
+    return HA_ERR_END_OF_FILE;
+  set_position(pos);   /* load the saved position */
+  assert(m_pos.m_index < get_row_count());
+  make_row(m_pos.m_index);
+  return 0;
+}
+
+void table_replication_group_members::make_row(uint index)
+{
+  DBUG_ENTER("table_replication_group_members::make_row");
+  /* Start from an empty row; it only becomes valid if the query succeeds. */
+  m_row_exists= false;
+  m_row.member_port= 0;
+  m_row.channel_name_length= 0;
+  m_row.member_id_length= 0;
+  m_row.member_host_length= 0;
+  m_row.member_state_length= 0;
+
+  /*
+    Callback table for the group replication layer: the context pointer
+    is m_row, and each setter fills in one of its fields.
+  */
+  const GROUP_REPLICATION_GROUP_MEMBERS_CALLBACKS callbacks=
+  {
+    &m_row,
+    &set_channel_name,
+    &set_member_id,
+    &set_member_host,
+    &set_member_port,
+    &set_member_state,
+  };
+
+  /* Ask for member number `index`; a non-zero result means no data. */
+  const bool failed= get_group_replication_group_members_info(index, callbacks);
+  if (failed)
+  {
+    DBUG_PRINT("info", ("Group Replication stats not available!"));
+  }
+  m_row_exists= !failed;
+  DBUG_VOID_RETURN;
+}
+
+
+int table_replication_group_members::read_row_values(TABLE *table,
+                                                     unsigned char *buf,
+                                                     Field **fields,
+                                                     bool read_all)
+{
+  /* Nothing to copy out unless make_row() produced a row. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* One null byte per record (MEMBER_PORT may be NULL): start all-clear. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (Field *col= *fields; col != NULL; col= *++fields)
+  {
+    if (!read_all && !bitmap_is_set(table->read_set, col->field_index))
+      continue;
+
+    switch (col->field_index)
+    {
+    case 0: /** channel_name */
+      set_field_char_utf8(col, m_row.channel_name, m_row.channel_name_length);
+      break;
+    case 1: /** member_id */
+      set_field_char_utf8(col, m_row.member_id, m_row.member_id_length);
+      break;
+    case 2: /** member_host */
+      set_field_char_utf8(col, m_row.member_host, m_row.member_host_length);
+      break;
+    case 3: /** member_port: 0 (the make_row() default) is shown as NULL */
+      if (m_row.member_port > 0)
+        set_field_ulong(col, m_row.member_port);
+      else
+        col->set_null();
+      break;
+    case 4: /** member_state */
+      set_field_char_utf8(col, m_row.member_state, m_row.member_state_length);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
diff --git a/storage/perfschema/table_replication_group_members.h b/storage/perfschema/table_replication_group_members.h
new file mode 100644
index 00000000000..2df013fa35b
--- /dev/null
+++ b/storage/perfschema/table_replication_group_members.h
@@ -0,0 +1,108 @@
+/*
+   Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License, version 2.0,
+   as published by the Free Software Foundation.
+
+   This program is also distributed with certain software (including
+   but not limited to OpenSSL) that is licensed under separate terms,
+   as designated in a particular file or component or in included license
+   documentation.  The authors of MySQL hereby grant you an additional
+   permission to link the program and your derivative works with the
+   separately licensed software that they have included with MySQL.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License, version 2.0, for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#ifndef TABLE_REPLICATION_GROUP_MEMBERS_H
+#define TABLE_REPLICATION_GROUP_MEMBERS_H
+
+/**
+  @file storage/perfschema/table_replication_group_members.h
+  Table replication_group_members (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "mysql_com.h"
+//#include "rpl_info.h"
+//#include <mysql/plugin_group_replication.h>
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table replication_group_members. Each string-valued field has a
+  companion length field named <field_name>_length.
+*/
+struct st_row_group_members {
+  char channel_name[CHANNEL_NAME_LENGTH];  // column 0, CHANNEL_NAME; no NUL terminator is stored
+  uint channel_name_length;
+  char member_id[11];   // typeof(server_id) == uint32; smaller than the char(36) column, writers must clamp to sizeof(member_id)
+  uint member_id_length;
+  char member_host[HOSTNAME_LENGTH];       // column 2, MEMBER_HOST
+  uint member_host_length;
+  uint member_port;                        // column 3, MEMBER_PORT; 0 is reported as NULL
+  char member_state[NAME_LEN];             // column 4, MEMBER_STATE
+  uint member_state_length;
+};
+
+/** Table PERFORMANCE_SCHEMA.replication_group_members. */
+class table_replication_group_members: public PFS_engine_table
+{
+private:
+  void make_row(uint index); /* fills m_row for member number index */
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+  /** True if the current row exists; set by make_row(). */
+  bool m_row_exists;
+  /** Current row, filled through the set_* callbacks. */
+  st_row_group_members m_row;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+
+protected:
+  /**
+    Read the current row values.
+    @param table            Table handle
+    @param buf              row buffer
+    @param fields           Table fields
+    @param read_all         true if all columns are read.
+    @return 0 on success, HA_ERR_RECORD_DELETED if there is no current row.
+  */
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_replication_group_members();
+
+public:
+  ~table_replication_group_members();
+
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_session_account_connect_attrs.cc b/storage/perfschema/table_session_account_connect_attrs.cc
index bc673d8baba..563ce03d638 100644
--- a/storage/perfschema/table_session_account_connect_attrs.cc
+++ b/storage/perfschema/table_session_account_connect_attrs.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -29,12 +29,11 @@ PFS_engine_table_share
 table_session_account_connect_attrs::m_share=
 {
   { C_STRING_WITH_LEN("session_account_connect_attrs") },
-  &pfs_readonly_acl,
-  &table_session_account_connect_attrs::create,
+  &pfs_readonly_world_acl,
+  table_session_account_connect_attrs::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  cursor_by_thread_connect_attr::get_row_count,
   sizeof(pos_connect_attr_by_thread_by_attr), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE session_account_connect_attrs("
@@ -42,7 +41,8 @@ table_session_account_connect_attrs::m_share=
                       "ATTR_NAME VARCHAR(32) NOT NULL comment 'Attribute name.',"
                       "ATTR_VALUE VARCHAR(1024) comment 'Attribute value.',"
                       "ORDINAL_POSITION INT comment 'Order in which attribute was added to the connection attributes.'"
-                      ") CHARACTER SET utf8 COLLATE utf8_bin") }
+                      ") CHARACTER SET utf8 COLLATE utf8_bin") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_session_account_connect_attrs::create()
@@ -63,7 +63,7 @@ table_session_account_connect_attrs::thread_fits(PFS_thread *thread)
     return false;
 
   /* The thread we compare to, by definition, has some instrumentation. */
-  DBUG_ASSERT(thread != NULL);
+  assert(thread != NULL);
 
   uint username_length= current_thread->m_username_length;
   uint hostname_length= current_thread->m_hostname_length;
diff --git a/storage/perfschema/table_session_account_connect_attrs.h b/storage/perfschema/table_session_account_connect_attrs.h
index 483001fcb91..dec5295808d 100644
--- a/storage/perfschema/table_session_account_connect_attrs.h
+++ b/storage/perfschema/table_session_account_connect_attrs.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_session_connect.cc b/storage/perfschema/table_session_connect.cc
index 0ef233ffd9e..dfb203c5d46 100644
--- a/storage/perfschema/table_session_connect.cc
+++ b/storage/perfschema/table_session_connect.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -22,13 +22,14 @@
 
 #include <my_global.h>
 #include "table_session_connect.h"
+#include "field.h"
 
 table_session_connect::table_session_connect(const PFS_engine_table_share *share)
  : cursor_by_thread_connect_attr(share)
 {
   if (session_connect_attrs_size_per_thread > 0)
   {
-    m_copy_session_connect_attrs= (char *) my_malloc(/* 5.7: PSI_INSTRUMENT_ME, */
+    m_copy_session_connect_attrs= (char *) my_malloc(PSI_INSTRUMENT_ME,
                                              session_connect_attrs_size_per_thread,
                                              MYF(0));
   }
@@ -80,7 +81,7 @@ bool parse_length_encoded_string(const char **ptr,
   if (*ptr - start_ptr + data_length > input_length)
     return true;
 
-  copy_length= copier.well_formed_copy(&my_charset_utf8_bin, dest, dest_size,
+  copy_length= copier.well_formed_copy(&my_charset_utf8mb3_bin, dest, dest_size,
                                        from_cs, *ptr, data_length, nchars_max);
   *copied_len= copy_length;
   (*ptr)+= data_length;
@@ -146,7 +147,7 @@ bool read_nth_attr(const char *connect_attrs,
 
     if (idx == ordinal)
       *attr_name_length= copy_length;
-      
+
     /* read the value */
     if (parse_length_encoded_string(&ptr,
                                     attr_value, max_attr_value, &copy_length,
@@ -168,8 +169,8 @@ bool read_nth_attr(const char *connect_attrs,
 
 void table_session_connect::make_row(PFS_thread *pfs, uint ordinal)
 {
-  pfs_lock lock;
-  pfs_lock session_lock;
+  pfs_optimistic_state lock;
+  pfs_optimistic_state session_lock;
   PFS_thread_class *safe_class;
   const CHARSET_INFO *cs;
 
@@ -249,7 +250,7 @@ int table_session_connect::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -279,7 +280,7 @@ int table_session_connect::read_row_values(TABLE *table,
         set_field_ulong(f, m_row.m_ordinal_position);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_session_connect.h b/storage/perfschema/table_session_connect.h
index ee1019fe7f2..b9be1ce304c 100644
--- a/storage/perfschema/table_session_connect.h
+++ b/storage/perfschema/table_session_connect.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -59,6 +59,7 @@ struct row_session_connect_attrs
   ulong m_ordinal_position;
 };
 
+/** Abstract table PERFORMANCE_SCHEMA.SESSION_CONNECT_ATTRS. */
 class table_session_connect : public cursor_by_thread_connect_attr
 {
 protected:
diff --git a/storage/perfschema/table_session_connect_attrs.cc b/storage/perfschema/table_session_connect_attrs.cc
index 79c9a260210..034ff12c91e 100644
--- a/storage/perfschema/table_session_connect_attrs.cc
+++ b/storage/perfschema/table_session_connect_attrs.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -30,11 +30,10 @@ table_session_connect_attrs::m_share=
 {
   { C_STRING_WITH_LEN("session_connect_attrs") },
   &pfs_readonly_acl,
-  &table_session_connect_attrs::create,
+  table_session_connect_attrs::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  cursor_by_thread_connect_attr::get_row_count,
   sizeof(pos_connect_attr_by_thread_by_attr), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE session_connect_attrs("
@@ -42,7 +41,8 @@ table_session_connect_attrs::m_share=
                       "ATTR_NAME VARCHAR(32) NOT NULL comment 'Attribute name.',"
                       "ATTR_VALUE VARCHAR(1024) comment 'Attribute value.',"
                       "ORDINAL_POSITION INT comment 'Order in which attribute was added to the connection attributes.'"
-                      ") CHARACTER SET utf8 COLLATE utf8_bin") }
+                      ") CHARACTER SET utf8 COLLATE utf8_bin") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_session_connect_attrs::create()
diff --git a/storage/perfschema/table_session_connect_attrs.h b/storage/perfschema/table_session_connect_attrs.h
index 927c3a92af2..823207726f3 100644
--- a/storage/perfschema/table_session_connect_attrs.h
+++ b/storage/perfschema/table_session_connect_attrs.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_session_status.cc b/storage/perfschema/table_session_status.cc
new file mode 100644
index 00000000000..05795af4f3c
--- /dev/null
+++ b/storage/perfschema/table_session_status.cc
@@ -0,0 +1,185 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_session_status.cc
+  Table SESSION_STATUS (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "table_session_status.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+
+THR_LOCK table_session_status::m_table_lock;
+
+PFS_engine_table_share
+table_session_status::m_share=
+{ /* positional initializer: entries must stay in this order */
+  { C_STRING_WITH_LEN("session_status") }, /* table name */
+  &pfs_readonly_world_acl,
+  table_session_status::create, /* cursor factory */
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_session_status::get_row_count,
+  sizeof(pos_t), /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE session_status("
+  "VARIABLE_NAME VARCHAR(64) not null comment 'The session status variable name.',"
+  "VARIABLE_VALUE VARCHAR(1024) comment 'The session status variable value.')") },
+  true   /* perpetual */
+};
+
+PFS_engine_table*
+table_session_status::create(void)
+{ /* Factory registered in m_share: allocate a new cursor for this table. */
+  return static_cast<PFS_engine_table*>(new table_session_status());
+}
+
+ha_rows table_session_status::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_status);      /* read the count under LOCK_status */
+  const ha_rows snapshot= all_status_vars.elements;
+  mysql_mutex_unlock(&LOCK_status);
+  return snapshot;
+}
+
+table_session_status::table_session_status()
+  : PFS_engine_table(&m_share, &m_pos), m_status_cache(false),
+    m_row_exists(false), m_pos(0), m_next_pos(0), m_context(NULL)
+{ /* m_context stays NULL until rnd_init() allocates the context. */ }
+
+void table_session_status::reset_position(void)
+{
+  /* Rewind the scan: current and next position both go back to index 0. */
+  m_next_pos.m_index= m_pos.m_index= 0;
+}
+
+int table_session_status::rnd_init(bool scan)
+{
+  /* Build a cache of all status variables for this thread. */
+  m_status_cache.materialize_all(current_thd);
+  /* Version of the status array at this point, to detect later changes. */
+  ulonglong status_version= m_status_cache.get_status_array_version();
+
+  /*
+    The context is always allocated from the THD. Its restore flag is !scan:
+    a scan records this version, a non-scan restores the recorded one.
+  */
+  m_context= (table_session_status_context *)current_thd->alloc(sizeof(table_session_status_context));
+  if (m_context == NULL)
+    return HA_ERR_OUT_OF_MEM;  /* never placement-new into a null pointer */
+  new(m_context) table_session_status_context(status_version, !scan);
+  return 0;
+}
+
+int table_session_status::rnd_next(void)
+{
+  /* Walk the cache from the saved position to the first usable variable. */
+  m_pos.set_at(&m_next_pos);
+  while (m_pos.m_index < m_status_cache.size())
+  {
+    const Status_variable *entry= NULL;
+    if (m_status_cache.is_materialized())
+      entry= m_status_cache.get(m_pos.m_index);
+    if (entry != NULL)
+    {
+      make_row(entry);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+    m_pos.next();
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_session_status::rnd_pos(const void *pos)
+{
+  /* If global status array has changed, do nothing. */ // TODO: warning
+  if (!m_context->versions_match())
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+  assert(m_pos.m_index < m_status_cache.size());
+
+  /* A row can only be built when the cache is materialized and still
+     holds an entry at this index. */
+  const Status_variable *entry= NULL;
+  if (m_status_cache.is_materialized())
+    entry= m_status_cache.get(m_pos.m_index);
+
+  if (entry == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(entry);
+  return 0;
+}
+
+void table_session_status
+::make_row(const Status_variable *status_var)
+{
+  m_row_exists= false;  /* row is flagged invalid while its columns are filled */
+  m_row.m_variable_name.make_row(status_var->m_name, status_var->m_name_length);
+  m_row.m_variable_value.make_row(status_var);
+  m_row_exists= true;   /* both columns set: read_row_values() may use the row */
+}
+
+int table_session_status
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{
+  /* No row has been built by make_row(): report it as deleted. */
+  if (unlikely(!m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (Field *col= *fields; col != NULL; col= *++fields)
+  {
+    /* Only fill columns that are requested, unless everything is read. */
+    if (!read_all && !bitmap_is_set(table->read_set, col->field_index))
+      continue;
+
+    switch (col->field_index)
+    {
+    case 0: /* VARIABLE_NAME */
+      set_field_varchar_utf8(col, m_row.m_variable_name.m_str,
+                             m_row.m_variable_name.m_length);
+      break;
+    case 1: /* VARIABLE_VALUE */
+      m_row.m_variable_value.set_field(col);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
+
diff --git a/storage/perfschema/table_session_status.h b/storage/perfschema/table_session_status.h
new file mode 100644
index 00000000000..ed42d3fc57d
--- /dev/null
+++ b/storage/perfschema/table_session_status.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_SESSION_STATUS_H
+#define TABLE_SESSION_STATUS_H
+
+/**
+  @file storage/perfschema/table_session_status.h
+  Table SESSION_STATUS (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.SESSION_STATUS.
+*/
+struct row_session_status
+{
+  /** Thread internal id. NOTE(review): SESSION_STATUS has no THREAD_ID column; looks unused, confirm. */
+  ulonglong m_thread_internal_id;
+  /** Column VARIABLE_NAME. */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE. */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Store and retrieve table state information for queries that reinstantiate
+  the table object.
+*/
+class table_session_status_context : public PFS_table_context
+{
+public: /* only a constructor: forwards its arguments plus THR_PFS_SS to the base */
+  table_session_status_context(ulonglong current_version, bool restore) :
+    PFS_table_context(current_version, restore, THR_PFS_SS) { }
+};
+
+/** Table PERFORMANCE_SCHEMA.SESSION_STATUS. */
+class table_session_status : public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);   /* materializes m_status_cache, sets m_context */
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_session_status();
+
+public:
+  ~table_session_status()
+  {}
+
+protected:
+  void make_row(const Status_variable *status_var); /* fills m_row from one cache entry */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Cache of status variables, materialized for the current THD in rnd_init(). */
+  PFS_status_variable_cache m_status_cache;
+  /** Current row. */
+  row_session_status m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with global status array version; allocated in rnd_init(). */
+  table_session_status_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_session_variables.cc b/storage/perfschema/table_session_variables.cc
new file mode 100644
index 00000000000..dd6914d81c8
--- /dev/null
+++ b/storage/perfschema/table_session_variables.cc
@@ -0,0 +1,187 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_session_variables.cc
+  Table SESSION_VARIABLES (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "table_session_variables.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+
+THR_LOCK table_session_variables::m_table_lock;  /* referenced by m_share below */
+
+PFS_engine_table_share  /* Share describing PERFORMANCE_SCHEMA.SESSION_VARIABLES. */
+table_session_variables::m_share=
+{
+  { C_STRING_WITH_LEN("session_variables") },  /* table name */
+  &pfs_readonly_world_acl,
+  table_session_variables::create,
+  NULL, /* write_row: none supplied */
+  NULL, /* delete_all_rows: none supplied */
+  table_session_variables::get_row_count,  /* get_row_count */
+  sizeof(pos_t),  /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE session_variables("
+  "VARIABLE_NAME VARCHAR(64) not null,"
+  "VARIABLE_VALUE VARCHAR(1024))") },  /* table definition; field order matches read_row_values() */
+  true   /* perpetual */
+};
+
+PFS_engine_table*  /* Instantiate the SESSION_VARIABLES table object; this function is the create entry of m_share. */
+table_session_variables::create(void)
+{
+  return new table_session_variables();  /* allocated with new; the code that deletes it is not in this file */
+}
+
+ha_rows table_session_variables::get_row_count(void)  /* Row count = value of get_system_variable_hash_records(), read under both locks. */
+{
+  mysql_mutex_lock(&LOCK_plugin_delete);  /* acquired first, released last */
+  mysql_prlock_rdlock(&LOCK_system_variables_hash);  /* read lock held only while the hash is counted */
+  ha_rows system_var_count= get_system_variable_hash_records();
+  mysql_prlock_unlock(&LOCK_system_variables_hash);
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return system_var_count;  /* value sampled above; both locks are already released */
+}
+
+table_session_variables::table_session_variables()  /* Start at position 0 with no current row and no table context. */
+  : PFS_engine_table(&m_share, &m_pos), m_sysvar_cache(false), m_row_exists(false),
+    m_pos(0), m_next_pos(0), m_context(NULL) /* NULL until rnd_init() assigns it, instead of an indeterminate pointer */
+{}
+
+void table_session_variables::reset_position(void)  /* Rewind both cursors to index 0. */
+{
+  m_pos.m_index = 0;  /* current position */
+  m_next_pos.m_index = 0;  /* position rnd_next() resumes from */
+}
+
+int table_session_variables::rnd_init(bool scan)  /* Materialize the variable cache and set up the table context; returns 0, or HA_ERR_OUT_OF_MEM if the context cannot be allocated. */
+{
+  /* Build a cache of system variables for this thread. */
+  m_sysvar_cache.materialize_all(current_thd);
+
+  /* Record the version of the system variable hash. */
+  ulonglong hash_version= m_sysvar_cache.get_sysvar_hash_version();
+
+  /* The table context holds that hash version. Its memory always comes from
+     the THD allocator; the context is stored in TLS when scan == true and
+     restored from TLS when scan == false (the restore flag is !scan). */
+  void *mem= current_thd->alloc(sizeof(table_session_variables_context));
+  if (mem == NULL)  /* never placement-construct into a failed allocation */
+    return HA_ERR_OUT_OF_MEM;
+  m_context= new(mem) table_session_variables_context(hash_version, !scan);
+  return 0;
+}
+
+int table_session_variables::rnd_next(void)  /* Advance to the next cached variable; returns 0 when one is found, HA_ERR_END_OF_FILE when the cache is exhausted. */
+{
+  for (m_pos.set_at(&m_next_pos);  /* resume from the saved next position */
+       m_pos.m_index < m_sysvar_cache.size();
+       m_pos.next())
+  {
+    if (m_sysvar_cache.is_materialized())
+    {
+      const System_variable *system_var= m_sysvar_cache.get(m_pos.m_index);
+      if (system_var != NULL)  /* NULL cache entries are skipped */
+      {
+        make_row(system_var);  /* may leave m_row_exists false; read_row_values() then reports HA_ERR_RECORD_DELETED */
+        m_next_pos.set_after(&m_pos);  /* the following call starts after this entry */
+        return 0;
+      }
+    }
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_session_variables::rnd_pos(const void *pos)  /* Read the row at a previously saved position; returns 0 on success, otherwise HA_ERR_RECORD_DELETED. */
+{
+  /* If the system variable hash version no longer matches the context, report the row as deleted. */
+  if (!m_context->versions_match())
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+  assert(m_pos.m_index < m_sysvar_cache.size());  /* bounds are checked only where assert() is enabled */
+
+  if (m_sysvar_cache.is_materialized())
+  {
+    const System_variable *system_var= m_sysvar_cache.get(m_pos.m_index);
+    if (system_var != NULL)
+    {
+      make_row(system_var);  /* may leave m_row_exists false; read_row_values() then reports HA_ERR_RECORD_DELETED */
+      return 0;
+    }
+  }
+  return HA_ERR_RECORD_DELETED;  /* cache not materialized, or no entry at this index */
+}
+
+void table_session_variables  /* Fill m_row from system_var and record in m_row_exists whether a row was produced. */
+::make_row(const System_variable *system_var)
+{
+  m_row_exists= false;  /* stays false when the variable is rejected just below */
+  if (system_var->is_null() || system_var->is_ignored())
+    return;
+  m_row.m_variable_name.make_row(system_var->m_name, system_var->m_name_length);  /* column VARIABLE_NAME */
+  m_row.m_variable_value.make_row(system_var);  /* column VARIABLE_VALUE */
+  m_row_exists= true;
+}
+
+int table_session_variables  /* Write the current row (m_row) into the requested fields; returns 0, or HA_ERR_RECORD_DELETED when no row exists. */
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(!m_row_exists))  /* make_row() rejected the variable, or was never called */
+    return HA_ERR_RECORD_DELETED;
+
+  /* Clear the null bits: the record is expected to carry exactly one null byte. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)  /* the fields array is terminated by a NULL pointer */
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))  /* skip columns the query did not ask for */
+    {
+      switch(f->field_index)
+      {
+      case 0: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 1: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:  /* the table definition in m_share has only two columns */
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_session_variables.h b/storage/perfschema/table_session_variables.h
new file mode 100644
index 00000000000..a32b7483bce
--- /dev/null
+++ b/storage/perfschema/table_session_variables.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_SESSION_VARIABLES_H
+#define TABLE_SESSION_VARIABLES_H
+
+/**
+  @file storage/perfschema/table_session_variables.h
+  Table SESSION_VARIABLES (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  Table context for SESSION_VARIABLES: stores and retrieves table state
+  information during queries that reinstantiate the table object.
+*/
+class table_session_variables_context : public PFS_table_context
+{
+public:
+  table_session_variables_context(ulonglong hash_version, bool restore) :
+    PFS_table_context(hash_version, restore, THR_PFS_SV) {} /* forwards both arguments plus THR_PFS_SV to the base class */
+};
+
+/**
+  A row of table PERFORMANCE_SCHEMA.SESSION_VARIABLES,
+  one member per column.
+*/
+struct row_session_variables
+{
+  /** Column VARIABLE_NAME (declared VARCHAR(64) not null in the table definition). */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE (declared VARCHAR(1024) in the table definition). */
+  PFS_variable_value_row m_variable_value;
+};
+
+/** Table PERFORMANCE_SCHEMA.SESSION_VARIABLES: rows are served from a cache of system variables. */
+class table_session_variables : public PFS_engine_table
+{
+  typedef PFS_simple_index pos_t;  /* type of m_pos and m_next_pos; sizeof(pos_t) is the ref length in m_share */
+
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();  /* returns a new table_session_variables */
+  static ha_rows get_row_count();  /* number of records in the system variable hash */
+
+  virtual int rnd_init(bool scan);  /* materializes the cache and creates m_context */
+  virtual int rnd_next();  /* 0, or HA_ERR_END_OF_FILE */
+  virtual int rnd_pos(const void *pos);  /* 0, or HA_ERR_RECORD_DELETED */
+  virtual void reset_position(void);  /* rewinds m_pos and m_next_pos to index 0 */
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_session_variables();  /* protected; instances are made through create() */
+
+public:
+  ~table_session_variables()
+  {}
+
+protected:
+  void make_row(const System_variable *system_var);  /* fills m_row and sets m_row_exists */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition (declared here; table_session_variables.cc contains no definition of it). */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Cache of system variables, materialized for current_thd in rnd_init(). */
+  PFS_system_variable_cache m_sysvar_cache;
+  /** Current row. */
+  row_session_variables m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with system variable hash version; assigned in rnd_init(), dereferenced in rnd_pos(). */
+  table_session_variables_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_setup_actors.cc b/storage/perfschema/table_setup_actors.cc
index bc56f8e8714..03f42cb6433 100644
--- a/storage/perfschema/table_setup_actors.cc
+++ b/storage/perfschema/table_setup_actors.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "pfs_setup_actor.h"
 #include "table_setup_actors.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_setup_actors::m_table_lock;
 
@@ -45,13 +47,15 @@ table_setup_actors::m_share=
   table_setup_actors::write_row,
   table_setup_actors::delete_all_rows,
   table_setup_actors::get_row_count,
-  1000, /* records */
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE setup_actors("
-                      "HOST CHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") collate utf8_bin default '%' not null comment 'Host name, either a literal, or the % wildcard representing any host.',"
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default '%' not null comment 'User name, either a literal or the % wildcard representing any name.',"
-                      "ROLE CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default '%' not null comment 'Unused')") }
+                      "HOST CHAR(" HOSTNAME_LENGTH_STR ") collate utf8_bin default '%' not null comment 'Host name, either a literal, or the % wildcard representing any host.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default '%' not null comment 'User name, either a literal or the % wildcard representing any name.',"
+                      "ROLE CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default '%' not null comment 'Unused',"
+                      "ENABLED ENUM('YES', 'NO') not null default 'YES' comment 'Whether to enable instrumentation for foreground threads matched by the row.',"
+                      "HISTORY ENUM('YES', 'NO') not null default 'YES' comment 'Whether to log historical events for foreground threads matched by the row.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_setup_actors::create()
@@ -63,12 +67,16 @@ int table_setup_actors::write_row(TABLE *table, const unsigned char *buf,
                                   Field **fields)
 {
   Field *f;
-  String user_data("%", 1, &my_charset_utf8_bin);
-  String host_data("%", 1, &my_charset_utf8_bin);
-  String role_data("%", 1, &my_charset_utf8_bin);
+  String user_data("%", 1, &my_charset_utf8mb3_bin);
+  String host_data("%", 1, &my_charset_utf8mb3_bin);
+  String role_data("%", 1, &my_charset_utf8mb3_bin);
   String *user= &user_data;
   String *host= &host_data;
   String *role= &role_data;
+  enum_yes_no enabled_value= ENUM_YES;
+  enum_yes_no history_value= ENUM_YES;
+  bool enabled;
+  bool history;
 
   for (; (f= *fields) ; fields++)
   {
@@ -85,16 +93,34 @@ int table_setup_actors::write_row(TABLE *table, const unsigned char *buf,
       case 2: /* ROLE */
         role= get_field_char_utf8(f, &role_data);
         break;
+      case 3: /* ENABLED */
+        enabled_value= (enum_yes_no) get_field_enum(f);
+        break;
+      case 4: /* HISTORY */
+        history_value= (enum_yes_no) get_field_enum(f);
+        break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
 
+  /* Reject illegal enum values in ENABLED */
+  if ((enabled_value != ENUM_YES) && (enabled_value != ENUM_NO))
+    return HA_ERR_NO_REFERENCED_ROW;
+
+  /* Reject illegal enum values in HISTORY */
+  if ((history_value != ENUM_YES) && (history_value != ENUM_NO))
+    return HA_ERR_NO_REFERENCED_ROW;
+
+  /* Reject if any of user/host/role is not provided */
   if (user->length() == 0 || host->length() == 0 || role->length() == 0)
     return HA_ERR_WRONG_COMMAND;
 
-  return insert_setup_actor(user, host, role);
+  enabled= (enabled_value == ENUM_YES) ? true : false;
+  history= (history_value == ENUM_YES) ? true : false;
+
+  return insert_setup_actor(user, host, role, enabled, history);
 }
 
 int table_setup_actors::delete_all_rows(void)
@@ -104,7 +130,7 @@ int table_setup_actors::delete_all_rows(void)
 
 ha_rows table_setup_actors::get_row_count(void)
 {
-  return setup_actor_count();
+  return global_setup_actor_container.get_row_count();
 }
 
 table_setup_actors::table_setup_actors()
@@ -122,17 +148,14 @@ int table_setup_actors::rnd_next()
 {
   PFS_setup_actor *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < setup_actor_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_setup_actor_iterator it= global_setup_actor_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &setup_actor_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -144,9 +167,8 @@ int table_setup_actors::rnd_pos(const void *pos)
 
   set_position(pos);
 
-  DBUG_ASSERT(m_pos.m_index < setup_actor_max);
-  pfs= &setup_actor_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+  pfs= global_setup_actor_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
@@ -157,7 +179,7 @@ int table_setup_actors::rnd_pos(const void *pos)
 
 void table_setup_actors::make_row(PFS_setup_actor *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
 
@@ -181,6 +203,9 @@ void table_setup_actors::make_row(PFS_setup_actor *pfs)
     return;
   memcpy(m_row.m_rolename, pfs->m_rolename, m_row.m_rolename_length);
 
+  m_row.m_enabled_ptr= &pfs->m_enabled;
+  m_row.m_history_ptr= &pfs->m_history;
+
   if (pfs->m_lock.end_optimistic_lock(&lock))
     m_row_exists= true;
 }
@@ -196,7 +221,7 @@ int table_setup_actors::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
 
   for (; (f= *fields) ; fields++)
   {
@@ -213,8 +238,14 @@ int table_setup_actors::read_row_values(TABLE *table,
       case 2: /* ROLE */
         set_field_char_utf8(f, m_row.m_rolename, m_row.m_rolename_length);
         break;
+      case 3: /* ENABLED */
+        set_field_enum(f, (*m_row.m_enabled_ptr) ? ENUM_YES : ENUM_NO);
+        break;
+      case 4: /* HISTORY */
+        set_field_enum(f, (*m_row.m_history_ptr) ? ENUM_YES : ENUM_NO);
+        break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -227,7 +258,9 @@ int table_setup_actors::update_row_values(TABLE *table,
                                           const unsigned char *new_buf,
                                           Field **fields)
 {
+  int result;
   Field *f;
+  enum_yes_no value;
 
   for (; (f= *fields) ; fields++)
   {
@@ -239,23 +272,37 @@ int table_setup_actors::update_row_values(TABLE *table,
       case 1: /* USER */
       case 2: /* ROLE */
         return HA_ERR_WRONG_COMMAND;
+      case 3: /* ENABLED */
+        value= (enum_yes_no) get_field_enum(f);
+        /* Reject illegal enum values in ENABLED */
+        if ((value != ENUM_YES) && (value != ENUM_NO))
+          return HA_ERR_NO_REFERENCED_ROW;
+        *m_row.m_enabled_ptr= (value == ENUM_YES) ? true : false;
+        break;
+      case 4: /* HISTORY */
+        value= (enum_yes_no) get_field_enum(f);
+        /* Reject illegal enum values in HISTORY */
+        if ((value != ENUM_YES) && (value != ENUM_NO))
+          return HA_ERR_NO_REFERENCED_ROW;
+        *m_row.m_history_ptr= (value == ENUM_YES) ? true : false;
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
 
-  return 0;
+  result= update_setup_actors_derived_flags();
+  return result;
 }
 
 int table_setup_actors::delete_row_values(TABLE *table,
                                           const unsigned char *buf,
                                           Field **fields)
 {
-  DBUG_ASSERT(m_row_exists);
+  assert(m_row_exists);
 
-  CHARSET_INFO *cs= &my_charset_utf8_bin;
+  CHARSET_INFO *cs= &my_charset_utf8mb3_bin;
   String user(m_row.m_username, m_row.m_username_length, cs);
   String role(m_row.m_rolename, m_row.m_rolename_length, cs);
   String host(m_row.m_hostname, m_row.m_hostname_length, cs);
diff --git a/storage/perfschema/table_setup_actors.h b/storage/perfschema/table_setup_actors.h
index 6bfc480a9c5..f9c316e3396 100644
--- a/storage/perfschema/table_setup_actors.h
+++ b/storage/perfschema/table_setup_actors.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -52,6 +52,10 @@ struct row_setup_actors
   char m_rolename[16];
   /** Length in bytes of @c m_rolename. */
   uint m_rolename_length;
+  /** Column ENABLED. */
+  bool *m_enabled_ptr;
+  /** Column HISTORY. */
+  bool *m_history_ptr;
 };
 
 /** Table PERFORMANCE_SCHEMA.SETUP_ACTORS. */
diff --git a/storage/perfschema/table_setup_consumers.cc b/storage/perfschema/table_setup_consumers.cc
index 00db0178219..88e689b9ff8 100644
--- a/storage/perfschema/table_setup_consumers.cc
+++ b/storage/perfschema/table_setup_consumers.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,73 +26,105 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_setup_consumers.h"
 #include "pfs_instr.h"
 #include "pfs_events_waits.h"
 #include "pfs_digest.h"
+#include "field.h"
+
+#define COUNT_SETUP_CONSUMERS 15
 
-#define COUNT_SETUP_CONSUMERS 12
 static row_setup_consumers all_setup_consumers_data[COUNT_SETUP_CONSUMERS]=
 {
   {
     { C_STRING_WITH_LEN("events_stages_current") },
     &flag_events_stages_current,
+    false,
     false
   },
   {
     { C_STRING_WITH_LEN("events_stages_history") },
     &flag_events_stages_history,
-    false
+    false,
+    true
   },
   {
     { C_STRING_WITH_LEN("events_stages_history_long") },
     &flag_events_stages_history_long,
-    false
+    false,
+    true
   },
   {
     { C_STRING_WITH_LEN("events_statements_current") },
     &flag_events_statements_current,
+    false,
     false
   },
   {
     { C_STRING_WITH_LEN("events_statements_history") },
     &flag_events_statements_history,
-    false
+    false,
+    true
   },
   {
     { C_STRING_WITH_LEN("events_statements_history_long") },
     &flag_events_statements_history_long,
+    false,
+    true
+  },
+  {
+    { C_STRING_WITH_LEN("events_transactions_current") },
+    &flag_events_transactions_current,
+    false,
     false
   },
   {
+    { C_STRING_WITH_LEN("events_transactions_history") },
+    &flag_events_transactions_history,
+    false,
+    true
+  },
+  {
+    { C_STRING_WITH_LEN("events_transactions_history_long") },
+    &flag_events_transactions_history_long,
+    false,
+    true
+  },
+  {
     { C_STRING_WITH_LEN("events_waits_current") },
     &flag_events_waits_current,
+    false,
     false
   },
   {
     { C_STRING_WITH_LEN("events_waits_history") },
     &flag_events_waits_history,
-    false
+    false,
+    true
   },
   {
     { C_STRING_WITH_LEN("events_waits_history_long") },
     &flag_events_waits_history_long,
-    false
+    false,
+    true
   },
   {
     { C_STRING_WITH_LEN("global_instrumentation") },
     &flag_global_instrumentation,
+    true,
     true
   },
   {
     { C_STRING_WITH_LEN("thread_instrumentation") },
     &flag_thread_instrumentation,
-    false
+    false,
+    true
   },
   {
     { C_STRING_WITH_LEN("statements_digest") },
     &flag_statements_digest,
+    false,
     false
   }
 };
@@ -104,16 +136,16 @@ table_setup_consumers::m_share=
 {
   { C_STRING_WITH_LEN("setup_consumers") },
   &pfs_updatable_acl,
-  &table_setup_consumers::create,
+  table_setup_consumers::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  COUNT_SETUP_CONSUMERS, /* records */
+  table_setup_consumers::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE setup_consumers("
                       "NAME VARCHAR(64) not null comment 'Consumer name',"
-                      "ENABLED ENUM ('YES', 'NO') not null comment 'YES or NO for whether or not the consumer is enabled. You can modify this column to ensure that event information is added, or is not added.')") }
+                      "ENABLED ENUM ('YES', 'NO') not null comment 'YES or NO for whether or not the consumer is enabled. You can modify this column to ensure that event information is added, or is not added.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_setup_consumers::create(void)
@@ -121,6 +153,12 @@ PFS_engine_table* table_setup_consumers::create(void)
   return new table_setup_consumers();
 }
 
+ha_rows  /* Row count callback referenced by m_share. */
+table_setup_consumers::get_row_count(void)
+{
+  return COUNT_SETUP_CONSUMERS;  /* fixed: all_setup_consumers_data[] is dimensioned with this same constant */
+}
+
 table_setup_consumers::table_setup_consumers()
   : PFS_engine_table(&m_share, &m_pos),
     m_row(NULL), m_pos(0), m_next_pos(0)
@@ -156,7 +194,7 @@ int table_setup_consumers::rnd_next(void)
 int table_setup_consumers::rnd_pos(const void *pos)
 {
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < COUNT_SETUP_CONSUMERS);
+  assert(m_pos.m_index < COUNT_SETUP_CONSUMERS);
   m_row= &all_setup_consumers_data[m_pos.m_index];
   return 0;
 }
@@ -168,11 +206,11 @@ int table_setup_consumers::read_row_values(TABLE *table,
 {
   Field *f;
 
-  DBUG_ASSERT(m_row);
+  assert(m_row);
 
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -187,7 +225,7 @@ int table_setup_consumers::read_row_values(TABLE *table,
         set_field_enum(f, (*m_row->m_enabled_ptr) ? ENUM_YES : ENUM_NO);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -203,7 +241,7 @@ int table_setup_consumers::update_row_values(TABLE *table,
   Field *f;
   enum_yes_no value;
 
-  DBUG_ASSERT(m_row);
+  assert(m_row);
 
   for (; (f= *fields) ; fields++)
   {
@@ -220,14 +258,17 @@ int table_setup_consumers::update_row_values(TABLE *table,
         break;
       }
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
 
-  if (m_row->m_refresh)
+  if (m_row->m_instrument_refresh)
     update_instruments_derived_flags();
 
+  if (m_row->m_thread_refresh)
+    update_thread_derived_flags();
+
   return 0;
 }
 
diff --git a/storage/perfschema/table_setup_consumers.h b/storage/perfschema/table_setup_consumers.h
index 90da55920c6..77cd09dcfbb 100644
--- a/storage/perfschema/table_setup_consumers.h
+++ b/storage/perfschema/table_setup_consumers.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -43,8 +43,10 @@ struct row_setup_consumers
   LEX_STRING m_name;
   /** Column ENABLED. */
   bool *m_enabled_ptr;
-  /** Hidden column, refresh. */
-  bool m_refresh;
+  /** Hidden column, instrument refresh. */
+  bool m_instrument_refresh;
+  /** Hidden column, thread refresh. */
+  bool m_thread_refresh;
 };
 
 /** Table PERFORMANCE_SCHEMA.SETUP_CONSUMERS. */
@@ -54,6 +56,7 @@ public:
   /** Table share. */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_setup_instruments.cc b/storage/perfschema/table_setup_instruments.cc
index e3656511cad..86682b8a1d0 100644
--- a/storage/perfschema/table_setup_instruments.cc
+++ b/storage/perfschema/table_setup_instruments.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,14 +26,16 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
+#include "pfs_builtin_memory.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_setup_instruments.h"
 #include "pfs_global.h"
 #include "pfs_setup_object.h"
+#include "field.h"
 
 THR_LOCK table_setup_instruments::m_table_lock;
 
@@ -42,17 +44,17 @@ table_setup_instruments::m_share=
 {
   { C_STRING_WITH_LEN("setup_instruments") },
   &pfs_updatable_acl,
-  &table_setup_instruments::create,
+  table_setup_instruments::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_setup_instruments::get_row_count,
   sizeof(pos_setup_instruments),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE setup_instruments("
                       "NAME VARCHAR(128) not null comment 'Instrument name',"
                       "ENABLED ENUM ('YES', 'NO') not null comment 'Whether or not the instrument is enabled. It can be disabled, and the instrument will produce no events.',"
-                      "TIMED ENUM ('YES', 'NO') not null comment 'Whether or not the instrument is timed. It can be set, but if disabled, events produced by the instrument will have NULL values for the corresponding TIMER_START, TIMER_END, and TIMER_WAIT values.')") }
+                      "TIMED ENUM ('YES', 'NO') not null comment 'Whether or not the instrument is timed. It can be set, but if disabled, events produced by the instrument will have NULL values for the corresponding TIMER_START, TIMER_END, and TIMER_WAIT values.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_setup_instruments::create(void)
@@ -60,6 +62,16 @@ PFS_engine_table* table_setup_instruments::create(void)
   return new table_setup_instruments();
 }
 
+ha_rows  /* Row count callback referenced by m_share: the sum of the class maxima below. */
+table_setup_instruments::get_row_count(void)
+{
+  return wait_class_max
+    + stage_class_max
+    + statement_class_max
+    + transaction_class_max
+    + memory_class_max;  /* NOTE(review): builtin memory classes listed by rnd_next() are not an explicit term of this sum -- confirm this is intended */
+}
+
 table_setup_instruments::table_setup_instruments()
   : PFS_engine_table(&m_share, &m_pos),
     m_pos(), m_next_pos()
@@ -74,6 +86,9 @@ void table_setup_instruments::reset_position(void)
 int table_setup_instruments::rnd_next(void)
 {
   PFS_instr_class *instr_class= NULL;
+  PFS_builtin_memory_class *pfs_builtin;
+  bool update_enabled;
+  bool update_timed;
 
   /* Do not advertise hard coded instruments when disabled. */
   if (! pfs_initialized)
@@ -83,6 +98,9 @@ int table_setup_instruments::rnd_next(void)
        m_pos.has_more_view();
        m_pos.next_view())
   {
+    update_enabled= true;
+    update_timed= true;
+
     switch (m_pos.m_index_1)
     {
     case pos_setup_instruments::VIEW_MUTEX:
@@ -109,16 +127,35 @@ int table_setup_instruments::rnd_next(void)
     case pos_setup_instruments::VIEW_STATEMENT:
       instr_class= find_statement_class(m_pos.m_index_2);
       break;
+    case pos_setup_instruments::VIEW_TRANSACTION:
+      instr_class= find_transaction_class(m_pos.m_index_2);
+      break;
     case pos_setup_instruments::VIEW_SOCKET:
       instr_class= find_socket_class(m_pos.m_index_2);
       break;
     case pos_setup_instruments::VIEW_IDLE:
       instr_class= find_idle_class(m_pos.m_index_2);
       break;
+    case pos_setup_instruments::VIEW_BUILTIN_MEMORY:
+      update_enabled= false;
+      update_timed= false;
+      pfs_builtin= find_builtin_memory_class(m_pos.m_index_2);
+      if (pfs_builtin != NULL)
+        instr_class= & pfs_builtin->m_class;
+      else
+        instr_class= NULL;
+      break;
+    case pos_setup_instruments::VIEW_MEMORY:
+      update_timed= false;
+      instr_class= find_memory_class(m_pos.m_index_2);
+      break;
+    case pos_setup_instruments::VIEW_METADATA:
+      instr_class= find_metadata_class(m_pos.m_index_2);
+      break;
     }
     if (instr_class)
     {
-      make_row(instr_class);
+      make_row(instr_class, update_enabled, update_timed);
       m_next_pos.set_after(&m_pos);
       return 0;
     }
@@ -130,6 +167,9 @@ int table_setup_instruments::rnd_next(void)
 int table_setup_instruments::rnd_pos(const void *pos)
 {
   PFS_instr_class *instr_class= NULL;
+  PFS_builtin_memory_class *pfs_builtin;
+  bool update_enabled;
+  bool update_timed;
 
   /* Do not advertise hard coded instruments when disabled. */
   if (! pfs_initialized)
@@ -137,6 +177,9 @@ int table_setup_instruments::rnd_pos(const void *pos)
 
   set_position(pos);
 
+  update_enabled= true;
+  update_timed= true;
+
   switch (m_pos.m_index_1)
   {
   case pos_setup_instruments::VIEW_MUTEX:
@@ -163,25 +206,46 @@ int table_setup_instruments::rnd_pos(const void *pos)
   case pos_setup_instruments::VIEW_STATEMENT:
     instr_class= find_statement_class(m_pos.m_index_2);
     break;
+  case pos_setup_instruments::VIEW_TRANSACTION:
+    instr_class= find_transaction_class(m_pos.m_index_2);
+    break;
   case pos_setup_instruments::VIEW_SOCKET:
     instr_class= find_socket_class(m_pos.m_index_2);
     break;
   case pos_setup_instruments::VIEW_IDLE:
     instr_class= find_idle_class(m_pos.m_index_2);
     break;
+  case pos_setup_instruments::VIEW_BUILTIN_MEMORY:
+    update_enabled= false;
+    update_timed= false;
+    pfs_builtin= find_builtin_memory_class(m_pos.m_index_2);
+    if (pfs_builtin != NULL)
+      instr_class= & pfs_builtin->m_class;
+    else
+      instr_class= NULL;
+    break;
+  case pos_setup_instruments::VIEW_MEMORY:
+    update_timed= false;
+    instr_class= find_memory_class(m_pos.m_index_2);
+    break;
+  case pos_setup_instruments::VIEW_METADATA:
+    instr_class= find_metadata_class(m_pos.m_index_2);
+    break;
   }
   if (instr_class)
   {
-    make_row(instr_class);
+    make_row(instr_class, update_enabled, update_timed);
     return 0;
   }
 
   return HA_ERR_RECORD_DELETED;
 }
 
-void table_setup_instruments::make_row(PFS_instr_class *klass)
+void table_setup_instruments::make_row(PFS_instr_class *klass, bool update_enabled, bool update_timed)
 {
   m_row.m_instr_class= klass;
+  m_row.m_update_enabled= update_enabled;  /* When false, update_row_values() silently ignores writes to ENABLED. */
+  m_row.m_update_timed= update_timed;  /* When false, update_row_values() silently ignores writes to TIMED. */
 }
 
 int table_setup_instruments::read_row_values(TABLE *table,
@@ -191,7 +255,7 @@ int table_setup_instruments::read_row_values(TABLE *table,
 {
   Field *f;
 
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   /*
     The row always exist, the instrument classes
@@ -214,7 +278,7 @@ int table_setup_instruments::read_row_values(TABLE *table,
         set_field_enum(f, m_row.m_instr_class->m_timed ? ENUM_YES : ENUM_NO);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -239,15 +303,23 @@ int table_setup_instruments::update_row_values(TABLE *table,
       case 0: /* NAME */
         return HA_ERR_WRONG_COMMAND;
       case 1: /* ENABLED */
-        value= (enum_yes_no) get_field_enum(f);
-        m_row.m_instr_class->m_enabled= (value == ENUM_YES) ? true : false;
+        /* Do not raise error if m_update_enabled is false, silently ignore. */
+        if (m_row.m_update_enabled)
+        {
+          value= (enum_yes_no) get_field_enum(f);
+          m_row.m_instr_class->m_enabled= (value == ENUM_YES) ? true : false;
+        }
         break;
       case 2: /* TIMED */
-        value= (enum_yes_no) get_field_enum(f);
-        m_row.m_instr_class->m_timed= (value == ENUM_YES) ? true : false;
+        /* Do not raise error if m_update_timed is false, silently ignore. */
+        if (m_row.m_update_timed)
+        {
+          value= (enum_yes_no) get_field_enum(f);
+          m_row.m_instr_class->m_timed= (value == ENUM_YES) ? true : false;
+        }
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -274,6 +346,7 @@ int table_setup_instruments::update_row_values(TABLE *table,
       break;
     case pos_setup_instruments::VIEW_STAGE:
     case pos_setup_instruments::VIEW_STATEMENT:
+    case pos_setup_instruments::VIEW_TRANSACTION:
       /* No flag to update. */
       break;
     case pos_setup_instruments::VIEW_SOCKET:
@@ -282,8 +355,15 @@ int table_setup_instruments::update_row_values(TABLE *table,
     case pos_setup_instruments::VIEW_IDLE:
       /* No flag to update. */
       break;
+    case pos_setup_instruments::VIEW_BUILTIN_MEMORY:
+    case pos_setup_instruments::VIEW_MEMORY:
+      /* No flag to update. */
+      break;
+    case pos_setup_instruments::VIEW_METADATA:
+      update_metadata_derived_flags();
+      break;
     default:
-      DBUG_ASSERT(false);
+      assert(false);
       break;
   }
 
diff --git a/storage/perfschema/table_setup_instruments.h b/storage/perfschema/table_setup_instruments.h
index cd3715cfe55..59378824236 100644
--- a/storage/perfschema/table_setup_instruments.h
+++ b/storage/perfschema/table_setup_instruments.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -41,6 +41,10 @@ struct row_setup_instruments
 {
   /** Columns NAME, ENABLED, TIMED. */
   PFS_instr_class *m_instr_class;
+  /** True if column ENABLED can be updated. */
+  bool m_update_enabled;
+  /** True if column TIMED can be updated. */
+  bool m_update_timed;
 };
 
 /** Position of a cursor on PERFORMANCE_SCHEMA.SETUP_INSTRUMENTS. */
@@ -55,9 +59,13 @@ struct pos_setup_instruments : public PFS_double_index
   static const uint VIEW_TABLE= 6;
   static const uint VIEW_STAGE= 7;
   static const uint VIEW_STATEMENT= 8;
-  static const uint VIEW_SOCKET= 9;
-  static const uint VIEW_IDLE= 10;
-  static const uint LAST_VIEW= 10;
+  static const uint VIEW_TRANSACTION=9;
+  static const uint VIEW_SOCKET= 10;
+  static const uint VIEW_IDLE= 11;
+  static const uint VIEW_BUILTIN_MEMORY= 12;
+  static const uint VIEW_MEMORY= 13;
+  static const uint VIEW_METADATA= 14;
+  static const uint LAST_VIEW= 14;
 
   pos_setup_instruments()
     : PFS_double_index(FIRST_VIEW, 1)
@@ -86,6 +94,7 @@ public:
   /** Table share. */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -109,7 +118,7 @@ public:
   {}
 
 private:
-  void make_row(PFS_instr_class *klass);
+  void make_row(PFS_instr_class *klass, bool update_enabled, bool update_timed);
 
   /** Table share lock. */
   static THR_LOCK m_table_lock;
diff --git a/storage/perfschema/table_setup_objects.cc b/storage/perfschema/table_setup_objects.cc
index 00ef7f329f3..b71d3b7afa9 100644
--- a/storage/perfschema/table_setup_objects.cc
+++ b/storage/perfschema/table_setup_objects.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
@@ -34,6 +34,8 @@
 #include "table_setup_objects.h"
 #include "table_helper.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_setup_objects::m_table_lock;
 
@@ -46,15 +48,15 @@ table_setup_objects::m_share=
   table_setup_objects::write_row,
   table_setup_objects::delete_all_rows,
   table_setup_objects::get_row_count,
-  1000, /* records */
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE setup_objects("
-                      "OBJECT_TYPE ENUM ('TABLE') not null default 'TABLE' comment 'Type of object to instrument. Currently, only TABLE, for base table.',"
+                      "OBJECT_TYPE ENUM ('EVENT','FUNCTION','PROCEDURE','TABLE','TRIGGER') not null default 'TABLE' comment 'Type of object to instrument.',"
                       "OBJECT_SCHEMA VARCHAR(64) default '%' comment 'Schema containing the object, either the literal or % for any schema.',"
                       "OBJECT_NAME VARCHAR(64) not null default '%' comment 'Name of the instrumented object, either the literal or % for any object.',"
                       "ENABLED ENUM ('YES', 'NO') not null default 'YES' comment 'Whether the object''s events are instrumented or not. Can be disabled, in which case monitoring is not enabled for those objects.',"
-                      "TIMED ENUM ('YES', 'NO') not null default 'YES' comment 'Whether the object''s events are timed or not. Can be modified.')") }
+                      "TIMED ENUM ('YES', 'NO') not null default 'YES' comment 'Whether the object''s events are timed or not. Can be modified.')") },
+  false  /* perpetual */
 };
 
 int update_derived_flags()
@@ -64,6 +66,7 @@ int update_derived_flags()
     return HA_ERR_OUT_OF_MEM;
 
   update_table_share_derived_flags(thread);
+  update_program_share_derived_flags(thread);
   update_table_derived_flags();
   return 0;
 }
@@ -79,8 +82,8 @@ int table_setup_objects::write_row(TABLE *table, const unsigned char *buf,
   int result;
   Field *f;
   enum_object_type object_type= OBJECT_TYPE_TABLE;
-  String object_schema_data("%", 1, &my_charset_utf8_bin);
-  String object_name_data("%", 1, &my_charset_utf8_bin);
+  String object_schema_data("%", 1, &my_charset_utf8mb3_bin);
+  String object_name_data("%", 1, &my_charset_utf8mb3_bin);
   String *object_schema= &object_schema_data;
   String *object_name= &object_name_data;
   enum_yes_no enabled_value= ENUM_YES;
@@ -110,13 +113,15 @@ int table_setup_objects::write_row(TABLE *table, const unsigned char *buf,
         timed_value= (enum_yes_no) get_field_enum(f);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
 
   /* Reject illegal enum values in OBJECT_TYPE */
-  if (object_type != OBJECT_TYPE_TABLE)
+  if (object_type < FIRST_OBJECT_TYPE ||
+      object_type > LAST_OBJECT_TYPE  ||
+      object_type == OBJECT_TYPE_TEMPORARY_TABLE)
     return HA_ERR_NO_REFERENCED_ROW;
 
   /* Reject illegal enum values in ENABLED */
@@ -147,7 +152,7 @@ int table_setup_objects::delete_all_rows(void)
 
 ha_rows table_setup_objects::get_row_count(void)
 {
-  return setup_object_count();
+  return global_setup_object_container.get_row_count();
 }
 
 table_setup_objects::table_setup_objects()
@@ -165,17 +170,14 @@ int table_setup_objects::rnd_next(void)
 {
   PFS_setup_object *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < setup_object_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_setup_object_iterator it= global_setup_object_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &setup_object_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -187,9 +189,8 @@ int table_setup_objects::rnd_pos(const void *pos)
 
   set_position(pos);
 
-  DBUG_ASSERT(m_pos.m_index < setup_object_max);
-  pfs= &setup_object_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+  pfs= global_setup_object_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
@@ -200,7 +201,7 @@ int table_setup_objects::rnd_pos(const void *pos)
 
 void table_setup_objects::make_row(PFS_setup_object *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
 
@@ -229,7 +230,7 @@ int table_setup_objects::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -262,7 +263,7 @@ int table_setup_objects::read_row_values(TABLE *table,
         set_field_enum(f, (*m_row.m_timed_ptr) ? ENUM_YES : ENUM_NO);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -304,7 +305,7 @@ int table_setup_objects::update_row_values(TABLE *table,
         *m_row.m_timed_ptr= (value == ENUM_YES) ? true : false;
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -317,9 +318,9 @@ int table_setup_objects::delete_row_values(TABLE *table,
                                            const unsigned char *buf,
                                            Field **fields)
 {
-  DBUG_ASSERT(m_row_exists);
+  assert(m_row_exists);
 
-  CHARSET_INFO *cs= &my_charset_utf8_bin;
+  CHARSET_INFO *cs= &my_charset_utf8mb3_bin;
   enum_object_type object_type= OBJECT_TYPE_TABLE;
   String object_schema(m_row.m_schema_name, m_row.m_schema_name_length, cs);
   String object_name(m_row.m_object_name, m_row.m_object_name_length, cs);
diff --git a/storage/perfschema/table_setup_objects.h b/storage/perfschema/table_setup_objects.h
index 570acc865ad..feca8fc2f74 100644
--- a/storage/perfschema/table_setup_objects.h
+++ b/storage/perfschema/table_setup_objects.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_setup_timers.cc b/storage/perfschema/table_setup_timers.cc
index 413b970749c..b8b70981a36 100644
--- a/storage/perfschema/table_setup_timers.cc
+++ b/storage/perfschema/table_setup_timers.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_setup_timers.h"
 #include "pfs_column_values.h"
 #include "pfs_timer.h"
+#include "field.h"
+#include "derror.h" /* ER_THD */
 
-#define COUNT_SETUP_TIMERS 4
+#define COUNT_SETUP_TIMERS 5
 
 static row_setup_timers all_setup_timers_data[COUNT_SETUP_TIMERS]=
 {
@@ -50,6 +52,10 @@ static row_setup_timers all_setup_timers_data[COUNT_SETUP_TIMERS]=
   {
     { C_STRING_WITH_LEN("statement") },
     &statement_timer
+  },
+  {
+    { C_STRING_WITH_LEN("transaction") },
+    &transaction_timer
   }
 };
 
@@ -60,23 +66,36 @@ table_setup_timers::m_share=
 {
   { C_STRING_WITH_LEN("setup_timers") },
   &pfs_updatable_acl,
-  &table_setup_timers::create,
+  table_setup_timers::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  COUNT_SETUP_TIMERS,
+  table_setup_timers::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE setup_timers("
                       "NAME VARCHAR(64) not null comment 'Type of instrument the timer is used for.',"
-                      "TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null comment 'Timer applying to the instrument type. Can be modified.')") }
+                      "TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null comment 'Timer applying to the instrument type. Can be modified.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_setup_timers::create(void)
 {
+  THD *thd = current_thd;  /* NOTE(review): assumed non-NULL here; confirm for all callers. */
+  push_warning_printf(thd,  /* Each create() call pushes a deprecation warning naming this table. */
+                      Sql_condition::WARN_LEVEL_WARN,
+                      ER_WARN_DEPRECATED_SYNTAX_NO_REPLACEMENT,
+                      ER_THD(thd, ER_WARN_DEPRECATED_SYNTAX_NO_REPLACEMENT),
+                      "performance_schema.setup_timers");
+
   return new table_setup_timers();
 }
 
+ha_rows
+table_setup_timers::get_row_count(void)
+{
+  return COUNT_SETUP_TIMERS;  /* Fixed-size table: one row per entry of all_setup_timers_data. */
+}
+
 table_setup_timers::table_setup_timers()
   : PFS_engine_table(&m_share, &m_pos),
     m_row(NULL), m_pos(0), m_next_pos(0)
@@ -112,7 +131,7 @@ int table_setup_timers::rnd_next(void)
 int table_setup_timers::rnd_pos(const void *pos)
 {
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < COUNT_SETUP_TIMERS);
+  assert(m_pos.m_index < COUNT_SETUP_TIMERS);
   m_row= &all_setup_timers_data[m_pos.m_index];
   return 0;
 }
@@ -124,10 +143,10 @@ int table_setup_timers::read_row_values(TABLE *table,
 {
   Field *f;
 
-  DBUG_ASSERT(m_row);
+  assert(m_row);
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -142,7 +161,7 @@ int table_setup_timers::read_row_values(TABLE *table,
         set_field_enum(f, *(m_row->m_timer_name_ptr));
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -158,7 +177,7 @@ int table_setup_timers::update_row_values(TABLE *table,
   Field *f;
   longlong value;
 
-  DBUG_ASSERT(m_row);
+  assert(m_row);
 
   for (; (f= *fields) ; fields++)
   {
@@ -176,7 +195,7 @@ int table_setup_timers::update_row_values(TABLE *table,
           return HA_ERR_WRONG_COMMAND;
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_setup_timers.h b/storage/perfschema/table_setup_timers.h
index d591f3e0b67..9237fa2a059 100644
--- a/storage/perfschema/table_setup_timers.h
+++ b/storage/perfschema/table_setup_timers.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -52,6 +52,7 @@ public:
   /** Table share. */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_socket_instances.cc b/storage/perfschema/table_socket_instances.cc
index fcfc6bb17c7..ee1818b900b 100644
--- a/storage/perfschema/table_socket_instances.cc
+++ b/storage/perfschema/table_socket_instances.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_socket_instances.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_socket_instances::m_table_lock;
 
@@ -40,11 +42,10 @@ table_socket_instances::m_share=
 {
   { C_STRING_WITH_LEN("socket_instances") },
   &pfs_readonly_acl,
-  &table_socket_instances::create,
+  table_socket_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_socket_instances::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE socket_instances("
@@ -54,7 +55,8 @@ table_socket_instances::m_share=
                       "SOCKET_ID INTEGER not null comment 'The socket''s internal file handle.',"
                       "IP VARCHAR(64) not null comment 'Client IP address. Blank for Unix socket file, otherwise an IPv4 or IPv6 address. Together with the PORT identifies the connection.',"
                       "PORT INTEGER not null comment 'TCP/IP port number, from 0 to 65535. Together with the IP identifies the connection.',"
-                      "STATE ENUM('IDLE','ACTIVE') not null comment 'Socket status, either IDLE if waiting to receive a request from a client, or ACTIVE')") }
+                      "STATE ENUM('IDLE','ACTIVE') not null comment 'Socket status, either IDLE if waiting to receive a request from a client, or ACTIVE')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_socket_instances::create(void)
@@ -62,6 +64,12 @@ PFS_engine_table* table_socket_instances::create(void)
   return new table_socket_instances();
 }
 
+ha_rows
+table_socket_instances::get_row_count(void)
+{
+  return global_socket_container.get_row_count();  /* Same container that rnd_next() and rnd_pos() read rows from. */
+}
+
 table_socket_instances::table_socket_instances()
   : PFS_engine_table(&m_share, &m_pos),
   m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -77,17 +85,14 @@ int table_socket_instances::rnd_next(void)
 {
   PFS_socket *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < socket_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_socket_iterator it= global_socket_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &socket_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -98,19 +103,20 @@ int table_socket_instances::rnd_pos(const void *pos)
   PFS_socket *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < socket_max);
-  pfs= &socket_array[m_pos.m_index];
 
-  if (! pfs->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  pfs= global_socket_container.get(m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs);
+    return 0;
+  }
 
-  make_row(pfs);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 void table_socket_instances::make_row(PFS_socket *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_socket_class *safe_class;
 
   m_row_exists= false;
@@ -158,7 +164,7 @@ int table_socket_instances::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -192,7 +198,7 @@ int table_socket_instances::read_row_values(TABLE *table,
         set_field_enum(f, m_row.m_state);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_socket_instances.h b/storage/perfschema/table_socket_instances.h
index 41b5ee3fd21..30f18086c3a 100644
--- a/storage/perfschema/table_socket_instances.h
+++ b/storage/perfschema/table_socket_instances.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -70,6 +70,7 @@ public:
   /** Table share */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_socket_summary_by_event_name.cc b/storage/perfschema/table_socket_summary_by_event_name.cc
index 2db0ea2d6ff..6efab95d5b1 100644
--- a/storage/perfschema/table_socket_summary_by_event_name.cc
+++ b/storage/perfschema/table_socket_summary_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_socket_summary_by_event_name.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "field.h"
 
 THR_LOCK table_socket_summary_by_event_name::m_table_lock;
 
@@ -41,11 +42,10 @@ table_socket_summary_by_event_name::m_share=
 {
   { C_STRING_WITH_LEN("socket_summary_by_event_name") },
   &pfs_readonly_acl,
-  &table_socket_summary_by_event_name::create,
+  table_socket_summary_by_event_name::create,
   NULL, /* write_row */
   table_socket_summary_by_event_name::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_socket_summary_by_event_name::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE socket_summary_by_event_name("
@@ -71,7 +71,8 @@ table_socket_summary_by_event_name::m_share=
                       "SUM_TIMER_MISC BIGINT unsigned not null comment 'Total wait time of all miscellaneous operations that are timed.',"
                       "MIN_TIMER_MISC BIGINT unsigned not null comment 'Minimum wait time of all miscellaneous operations that are timed.',"
                       "AVG_TIMER_MISC BIGINT unsigned not null comment 'Average wait time of all miscellaneous operations that are timed.',"
-                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") }
+                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_socket_summary_by_event_name::create(void)
@@ -91,6 +92,12 @@ int table_socket_summary_by_event_name::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_socket_summary_by_event_name::get_row_count(void)
+{
+  return socket_class_max;  /* NOTE(review): presumably an upper bound of one row per socket class; confirm. */
+}
+
 void table_socket_summary_by_event_name::reset_position(void)
 {
   m_pos.m_index= 1;
@@ -138,7 +145,7 @@ void table_socket_summary_by_event_name::make_row(PFS_socket_class *socket_class
   PFS_instance_iterator::visit_socket_instances(socket_class, &visitor);
 
   time_normalizer *normalizer= time_normalizer::get(wait_timer);
-  
+
   /* Collect timer and byte count stats */
   m_row.m_io_stat.set(normalizer, &visitor.m_socket_io_stat);
   m_row_exists= true;
@@ -155,7 +162,7 @@ int table_socket_summary_by_event_name::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -237,7 +244,7 @@ int table_socket_summary_by_event_name::read_row_values(TABLE *table,
         break;
 
       default:
-        DBUG_ASSERT(false);
+        assert(false);
         break;
       }
     } // if
diff --git a/storage/perfschema/table_socket_summary_by_event_name.h b/storage/perfschema/table_socket_summary_by_event_name.h
index 4f679d89b09..39361b54bb5 100644
--- a/storage/perfschema/table_socket_summary_by_event_name.h
+++ b/storage/perfschema/table_socket_summary_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -60,6 +60,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_socket_summary_by_instance.cc b/storage/perfschema/table_socket_summary_by_instance.cc
index 8244d2c8bae..4bb7a2af773 100644
--- a/storage/perfschema/table_socket_summary_by_instance.cc
+++ b/storage/perfschema/table_socket_summary_by_instance.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,12 +26,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_socket_summary_by_instance.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_socket_summary_by_instance::m_table_lock;
 
@@ -40,11 +42,10 @@ table_socket_summary_by_instance::m_share=
 {
   { C_STRING_WITH_LEN("socket_summary_by_instance") },
   &pfs_readonly_acl,
-  &table_socket_summary_by_instance::create,
+  table_socket_summary_by_instance::create,
   NULL, /* write_row */
   table_socket_summary_by_instance::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_socket_summary_by_instance::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE socket_summary_by_instance("
@@ -71,7 +72,8 @@ table_socket_summary_by_instance::m_share=
                       "SUM_TIMER_MISC BIGINT unsigned not null comment 'Total wait time of all miscellaneous operations that are timed.',"
                       "MIN_TIMER_MISC BIGINT unsigned not null comment 'Minimum wait time of all miscellaneous operations that are timed.',"
                       "AVG_TIMER_MISC BIGINT unsigned not null comment 'Average wait time of all miscellaneous operations that are timed.',"
-                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") }
+                      "MAX_TIMER_MISC BIGINT unsigned not null comment 'Maximum wait time of all miscellaneous operations that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_socket_summary_by_instance::create(void)
@@ -90,6 +92,12 @@ int table_socket_summary_by_instance::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_socket_summary_by_instance::get_row_count(void)
+{
+  return global_socket_container.get_row_count();  /* Same container that rnd_next() and rnd_pos() read rows from. */
+}
+
 void table_socket_summary_by_instance::reset_position(void)
 {
   m_pos.m_index= 0;
@@ -100,17 +108,14 @@ int table_socket_summary_by_instance::rnd_next(void)
 {
   PFS_socket *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < socket_max;
-       m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_socket_iterator it= global_socket_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &socket_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -121,19 +126,20 @@ int table_socket_summary_by_instance::rnd_pos(const void *pos)
   PFS_socket *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < socket_max);
-  pfs= &socket_array[m_pos.m_index];
 
-  if (! pfs->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
+  pfs= global_socket_container.get(m_pos.m_index);
+  if (pfs != NULL)
+  {
+    make_row(pfs);
+    return 0;
+  }
 
-  make_row(pfs);
-  return 0;
+  return HA_ERR_RECORD_DELETED;
 }
 
 void table_socket_summary_by_instance::make_row(PFS_socket *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_socket_class *safe_class;
 
   m_row_exists= false;
@@ -170,7 +176,7 @@ int table_socket_summary_by_instance::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -255,7 +261,7 @@ int table_socket_summary_by_instance::read_row_values(TABLE *table,
         set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
         break;
       }
     }
diff --git a/storage/perfschema/table_socket_summary_by_instance.h b/storage/perfschema/table_socket_summary_by_instance.h
index a9eab8d18b6..39a02c939b5 100644
--- a/storage/perfschema/table_socket_summary_by_instance.h
+++ b/storage/perfschema/table_socket_summary_by_instance.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -63,6 +63,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_status_by_account.cc b/storage/perfschema/table_status_by_account.cc
new file mode 100644
index 00000000000..347012bc064
--- /dev/null
+++ b/storage/perfschema/table_status_by_account.cc
@@ -0,0 +1,246 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_status_by_account.cc
+  Table STATUS_BY_ACCOUNT (implementation).
+*/
+
+#include "my_global.h"
+#include "table_status_by_account.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+
+THR_LOCK table_status_by_account::m_table_lock;
+
+PFS_engine_table_share
+table_status_by_account::m_share=
+{
+  { C_STRING_WITH_LEN("status_by_account") },
+  &pfs_truncatable_acl,
+  table_status_by_account::create,
+  NULL, /* write_row */
+  table_status_by_account::delete_all_rows,
+  table_status_by_account::get_row_count,
+  sizeof(pos_t),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE status_by_account("
+  "USER CHAR(32) collate utf8_bin default null comment 'User for which the status variable is reported.',"
+  "HOST CHAR(60) collate utf8_bin default null comment 'Host for which the status variable is reported.',"
+  "VARIABLE_NAME VARCHAR(64) not null comment 'Status variable name.',"
+  "VARIABLE_VALUE VARCHAR(1024) comment 'Aggregated status variable value.' )") },
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_status_by_account::create(void)
+{
+  return new table_status_by_account();
+}
+
+int table_status_by_account::delete_all_rows(void)
+{
+  mysql_mutex_lock(&LOCK_status);
+  reset_status_by_thread();
+  reset_status_by_account();
+  mysql_mutex_unlock(&LOCK_status);
+  return 0;
+}
+
+ha_rows table_status_by_account::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_status);
+  size_t status_var_count= all_status_vars.elements;
+  mysql_mutex_unlock(&LOCK_status);
+  return (global_account_container.get_row_count() * status_var_count);
+}
+
+table_status_by_account::table_status_by_account()
+  : PFS_engine_table(&m_share, &m_pos), m_status_cache(true),
+    m_row_exists(false), m_pos(), m_next_pos(), m_context(NULL)
+{} /* m_context stays NULL until rnd_init() assigns it. */
+
+void table_status_by_account::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_status_by_account::rnd_init(bool scan)
+{
+  if (show_compatibility_56)
+    return 0;
+
+  /*
+    Build array of SHOW_VARs from the global status array prior to materializing
+    accounts in rnd_next() or rnd_pos().
+  */
+  m_status_cache.initialize_client_session();
+
+  /* Use the current number of status variables to detect changes. */
+  ulonglong status_version= m_status_cache.get_status_array_version();
+
+  /*
+    The table context holds the current version of the global status array
+    and a record of which accounts were materialized. If scan == true, then
+    allocate a new context from mem_root and store in TLS. If scan == false,
+    then restore from TLS. A failed allocation is reported, not constructed on.
+  */
+  void *context_mem= current_thd->alloc(sizeof(table_status_by_account_context));
+  m_context= context_mem ? new(context_mem) table_status_by_account_context(status_version, !scan) : NULL;
+  return m_context ? 0 : HA_ERR_OUT_OF_MEM;
+}
+
+int table_status_by_account::rnd_next(void)
+{
+  if (show_compatibility_56)
+    return HA_ERR_END_OF_FILE;
+
+  /* If status array changes, exit with warning. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  /*
+    For each account, build a cache of status variables using totals from all
+    threads associated with the account.
+  */
+  bool has_more_account= true;
+
+  for (m_pos.set_at(&m_next_pos);
+       has_more_account;
+       m_pos.next_account())
+  {
+    PFS_account *pfs_account= global_account_container.get(m_pos.m_index_1, &has_more_account);
+
+    if (m_status_cache.materialize_account(pfs_account) == 0)
+    {
+      /* Mark this account as materialized. */
+      m_context->set_item(m_pos.m_index_1);
+
+      /* Get the next status variable. */
+      const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+      if (stat_var != NULL)
+      {
+        make_row(pfs_account, stat_var);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_status_by_account::rnd_pos(const void *pos)
+{
+  if (show_compatibility_56)
+    return HA_ERR_RECORD_DELETED;
+
+  /* If status array changes, exit with warning. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  set_position(pos);
+  assert(m_pos.m_index_1 < global_account_container.get_row_count());
+
+  PFS_account *pfs_account= global_account_container.get(m_pos.m_index_1);
+
+  /*
+    Only materialize accounts that were previously materialized by rnd_next().
+    If an account cannot be rematerialized, then do nothing.
+  */
+  if (m_context->is_item_set(m_pos.m_index_1) &&
+      m_status_cache.materialize_account(pfs_account) == 0)
+  {
+    const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+    if (stat_var != NULL)
+    {
+      make_row(pfs_account, stat_var);
+      return 0;
+    }
+  }
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_status_by_account
+::make_row(PFS_account *pfs_account, const Status_variable *status_var)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  pfs_account->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_account.make_row(pfs_account))
+    return;
+
+  m_row.m_variable_name.make_row(status_var->m_name, status_var->m_name_length);
+  m_row.m_variable_value.make_row(status_var);
+
+  if (!pfs_account->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_status_by_account
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 3: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_status_by_account.h b/storage/perfschema/table_status_by_account.h
new file mode 100644
index 00000000000..19a4f1db1de
--- /dev/null
+++ b/storage/perfschema/table_status_by_account.h
@@ -0,0 +1,156 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_STATUS_BY_ACCOUNT_H
+#define TABLE_STATUS_BY_ACCOUNT_H
+
+/**
+  @file storage/perfschema/table_status_by_account.h
+  Table STATUS_BY_ACCOUNT (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+#include "pfs_buffer_container.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.STATUS_BY_ACCOUNT.
+*/
+struct row_status_by_account
+{
+  /** Column USER, HOST. */
+  PFS_account_row m_account;
+  /** Column VARIABLE_NAME. */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE. */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.STATUS_BY_ACCOUNT.
+  Index 1 on account (0 based)
+  Index 2 on status variable (0 based)
+*/
+struct pos_status_by_account
+: public PFS_double_index
+{
+  pos_status_by_account()
+    : PFS_double_index(0, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+
+  inline bool has_more_account(void)
+  { return (m_index_1 < global_account_container.get_row_count()); }
+
+  inline void next_account(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Store and retrieve table state information for queries that reinstantiate the table object.
+  NOTE(review): passes THR_PFS_SBH, the key table_status_by_host_context also uses -- confirm this sharing is intended.
+*/
+class table_status_by_account_context : public PFS_table_context
+{
+public:
+  table_status_by_account_context(ulonglong current_version, bool restore) :
+    PFS_table_context(current_version, global_account_container.get_row_count(), restore, THR_PFS_SBH) { }
+};
+
+/** Table PERFORMANCE_SCHEMA.STATUS_BY_ACCOUNT. */
+class table_status_by_account : public PFS_engine_table
+{
+  typedef pos_status_by_account pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_status_by_account();
+
+public:
+  ~table_status_by_account()
+  {}
+
+protected:
+  int materialize(PFS_thread *pfs_thread);
+  void make_row(PFS_account *pfs_account, const Status_variable *status_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Status variable cache for one account. */
+  PFS_status_variable_cache m_status_cache;
+
+  /** Current row. */
+  row_status_by_account m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with global status array version and map of materialized accounts. */
+  table_status_by_account_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_status_by_host.cc b/storage/perfschema/table_status_by_host.cc
new file mode 100644
index 00000000000..6b280011565
--- /dev/null
+++ b/storage/perfschema/table_status_by_host.cc
@@ -0,0 +1,245 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_status_by_host.cc
+  Table STATUS_BY_HOST (implementation).
+*/
+
+#include "my_global.h"
+#include "table_status_by_host.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+
+THR_LOCK table_status_by_host::m_table_lock;
+
+PFS_engine_table_share
+table_status_by_host::m_share=
+{
+  { C_STRING_WITH_LEN("status_by_host") },
+  &pfs_truncatable_acl,
+  table_status_by_host::create,
+  NULL, /* write_row */
+  table_status_by_host::delete_all_rows,
+  table_status_by_host::get_row_count,
+  sizeof(pos_t),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE status_by_host("
+  "HOST CHAR(60) collate utf8_bin default null comment 'Host for which the status variable is reported.',"
+  "VARIABLE_NAME VARCHAR(64) not null comment 'Status variable name.',"
+  "VARIABLE_VALUE VARCHAR(1024) comment 'Aggregated status variable value.' )") },
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_status_by_host::create(void)
+{
+  return new table_status_by_host();
+}
+
+int table_status_by_host::delete_all_rows(void)
+{
+  mysql_mutex_lock(&LOCK_status);
+  reset_status_by_thread();
+  reset_status_by_account();
+  reset_status_by_host();
+  mysql_mutex_unlock(&LOCK_status);
+  return 0;
+}
+
+ha_rows table_status_by_host::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_status);
+  size_t status_var_count= all_status_vars.elements;
+  mysql_mutex_unlock(&LOCK_status);
+  return (global_host_container.get_row_count() * status_var_count);
+}
+
+table_status_by_host::table_status_by_host()
+  : PFS_engine_table(&m_share, &m_pos), m_status_cache(true),
+    m_row_exists(false), m_pos(), m_next_pos(), m_context(NULL)
+{} /* m_context stays NULL until rnd_init() assigns it. */
+
+void table_status_by_host::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_status_by_host::rnd_init(bool scan)
+{
+  if (show_compatibility_56)
+    return 0;
+
+  /*
+    Build array of SHOW_VARs from the global status array prior to materializing
+    hosts in rnd_next() or rnd_pos().
+  */
+  m_status_cache.initialize_client_session();
+
+  /* Use the current number of status variables to detect changes. */
+  ulonglong status_version= m_status_cache.get_status_array_version();
+
+  /*
+    The table context holds the current version of the global status array
+    and a record of which hosts were materialized. If scan == true, then
+    allocate a new context from mem_root and store in TLS. If scan == false,
+    then restore from TLS. A failed allocation is reported, not constructed on.
+  */
+  void *context_mem= current_thd->alloc(sizeof(table_status_by_host_context));
+  m_context= context_mem ? new(context_mem) table_status_by_host_context(status_version, !scan) : NULL;
+  return m_context ? 0 : HA_ERR_OUT_OF_MEM;
+}
+
+int table_status_by_host::rnd_next(void)
+{
+  if (show_compatibility_56)
+    return HA_ERR_END_OF_FILE;
+
+  /* If status array changes, exit with warning. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  /*
+    For each host, build a cache of status variables using totals from all
+    threads associated with the host.
+  */
+  bool has_more_host= true;
+
+  for (m_pos.set_at(&m_next_pos);
+       has_more_host;
+       m_pos.next_host())
+  {
+    PFS_host *pfs_host= global_host_container.get(m_pos.m_index_1, &has_more_host);
+
+    if (m_status_cache.materialize_host(pfs_host) == 0)
+    {
+      /* Mark this host as materialized. */
+      m_context->set_item(m_pos.m_index_1);
+
+      /* Get the next status variable. */
+      const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+      if (stat_var != NULL)
+      {
+        make_row(pfs_host, stat_var);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_status_by_host::rnd_pos(const void *pos)
+{
+  if (show_compatibility_56)
+    return HA_ERR_RECORD_DELETED; /* no row is built in this mode, so never report success */
+
+  /* If status array changes, exit with warning. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  set_position(pos);
+  assert(m_pos.m_index_1 < global_host_container.get_row_count());
+
+  PFS_host *pfs_host= global_host_container.get(m_pos.m_index_1);
+
+  /*
+    Only materialize hosts that were previously materialized by rnd_next().
+    If a host cannot be rematerialized, then do nothing.
+  */
+  if (m_context->is_item_set(m_pos.m_index_1) &&
+      m_status_cache.materialize_host(pfs_host) == 0)
+  {
+    const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+    if (stat_var != NULL)
+    {
+      make_row(pfs_host, stat_var);
+      return 0;
+    }
+  }
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_status_by_host
+::make_row(PFS_host *pfs_host, const Status_variable *status_var)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  pfs_host->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_host.make_row(pfs_host))
+    return;
+
+  m_row.m_variable_name.make_row(status_var->m_name, status_var->m_name_length);
+  m_row.m_variable_value.make_row(status_var);
+
+  if (!pfs_host->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_status_by_host
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 2: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_status_by_host.h b/storage/perfschema/table_status_by_host.h
new file mode 100644
index 00000000000..f1c92268b02
--- /dev/null
+++ b/storage/perfschema/table_status_by_host.h
@@ -0,0 +1,154 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_STATUS_BY_HOST_H
+#define TABLE_STATUS_BY_HOST_H
+
+/**
+  @file storage/perfschema/table_status_by_host.h
+  Table STATUS_BY_HOST (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+#include "pfs_buffer_container.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.STATUS_BY_HOST.
+*/
+struct row_status_by_host
+{
+  /** Column HOST */
+  PFS_host_row m_host;
+  /** Column VARIABLE_NAME. */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE. */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.STATUS_BY_HOST.
+  Index 1 on host (0 based)
+  Index 2 on status variable (0 based)
+*/
+struct pos_status_by_host
+: public PFS_double_index
+{
+  pos_status_by_host()
+    : PFS_double_index(0, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+
+  inline bool has_more_host(void)
+  { return (m_index_1 < global_host_container.get_row_count()); }
+
+  inline void next_host(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Store and retrieve table state information for queries that reinstantiate
+  the table object.
+*/
+class table_status_by_host_context : public PFS_table_context
+{
+public:
+  table_status_by_host_context(ulonglong current_version, bool restore) :
+    PFS_table_context(current_version, global_host_container.get_row_count(), restore, THR_PFS_SBH) { }
+};
+
+/** Table PERFORMANCE_SCHEMA.STATUS_BY_HOST. */
+class table_status_by_host : public PFS_engine_table
+{
+  typedef pos_status_by_host pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_status_by_host();
+
+public:
+  ~table_status_by_host()
+  {}
+
+protected:
+  int materialize(PFS_thread *thread);
+  void make_row(PFS_host *pfs_host, const Status_variable *status_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Status variable cache for one host. */
+  PFS_status_variable_cache m_status_cache;
+
+  /** Current row. */
+  row_status_by_host m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with global status array version and map of materialized hosts. */
+  table_status_by_host_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_status_by_thread.cc b/storage/perfschema/table_status_by_thread.cc
new file mode 100644
index 00000000000..984a15b26eb
--- /dev/null
+++ b/storage/perfschema/table_status_by_thread.cc
@@ -0,0 +1,239 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_status_by_thread.cc
+  Table STATUS_BY_THREAD (implementation).
+*/
+
+#include "my_global.h"
+#include "table_status_by_thread.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+
+THR_LOCK table_status_by_thread::m_table_lock;
+
+PFS_engine_table_share
+table_status_by_thread::m_share=
+{
+  { C_STRING_WITH_LEN("status_by_thread") },
+  &pfs_truncatable_acl,
+  table_status_by_thread::create,
+  NULL, /* write_row */
+  table_status_by_thread::delete_all_rows,
+  table_status_by_thread::get_row_count,
+  sizeof(pos_t),
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE status_by_thread("
+  "THREAD_ID BIGINT unsigned not null comment 'The thread identifier of the session in which the status variable is defined.',"
+  "VARIABLE_NAME VARCHAR(64) not null comment 'Status variable name.',"
+  "VARIABLE_VALUE VARCHAR(1024) comment 'Aggregated status variable value.' )") },
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_status_by_thread::create(void)
+{
+  return new table_status_by_thread();
+}
+
+int table_status_by_thread::delete_all_rows(void)
+{
+  /* Lock required to aggregate to global_status_vars. */
+  mysql_mutex_lock(&LOCK_status);
+
+  reset_status_by_thread();
+
+  mysql_mutex_unlock(&LOCK_status);
+  return 0;
+}
+
+ha_rows table_status_by_thread::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_status);
+  size_t status_var_count= all_status_vars.elements;
+  mysql_mutex_unlock(&LOCK_status);
+  return (global_thread_container.get_row_count() * status_var_count);
+}
+
+table_status_by_thread::table_status_by_thread()
+  : PFS_engine_table(&m_share, &m_pos), m_status_cache(true),
+    m_row_exists(false), m_pos(), m_next_pos(), m_context(NULL)
+{} /* m_context stays NULL until rnd_init() assigns it. */
+
+void table_status_by_thread::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_status_by_thread::rnd_init(bool scan)
+{
+  if (show_compatibility_56)
+    return 0;
+
+  /*
+    Build array of SHOW_VARs from the global status array prior to materializing
+    threads in rnd_next() or rnd_pos().
+  */
+  m_status_cache.initialize_session();
+
+  /* Record the current number of status variables to detect subsequent changes. */
+  ulonglong status_version= m_status_cache.get_status_array_version();
+
+  /*
+    The table context holds the current version of the global status array
+    and a record of which threads were materialized. If scan == true, then
+    allocate a new context from mem_root and store in TLS. If scan == false,
+    then restore from TLS. A failed allocation is reported, not constructed on.
+  */
+  void *context_mem= current_thd->alloc(sizeof(table_status_by_thread_context));
+  m_context= context_mem ? new(context_mem) table_status_by_thread_context(status_version, !scan) : NULL;
+  return m_context ? 0 : HA_ERR_OUT_OF_MEM;
+}
+
+int table_status_by_thread::rnd_next(void)
+{
+  if (show_compatibility_56)
+    return HA_ERR_END_OF_FILE;
+
+  /* If global status array changes, exit with warning. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  bool has_more_thread= true;
+
+  for (m_pos.set_at(&m_next_pos);
+       has_more_thread;
+       m_pos.next_thread())
+  {
+    PFS_thread *pfs_thread= global_thread_container.get(m_pos.m_index_1, &has_more_thread);
+    if (m_status_cache.materialize_session(pfs_thread) == 0)
+    {
+      /* Mark this thread as materialized. */
+      m_context->set_item(m_pos.m_index_1);
+      const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+      if (stat_var != NULL)
+      {
+        make_row(pfs_thread, stat_var);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_status_by_thread::rnd_pos(const void *pos)
+{
+  if (show_compatibility_56)
+    return HA_ERR_RECORD_DELETED;
+
+  /* If global status array has changed, do nothing. */
+  if (!m_context->versions_match())
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+  assert(m_pos.m_index_1 < global_thread_container.get_row_count());
+
+  PFS_thread *pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  /*
+    Only materialize threads that were previously materialized by rnd_next().
+    If a thread cannot be rematerialized, then do nothing.
+  */
+  if (m_context->is_item_set(m_pos.m_index_1) &&
+      m_status_cache.materialize_session(pfs_thread) == 0)
+  {
+    const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+    if (stat_var != NULL)
+    {
+      make_row(pfs_thread, stat_var);
+      return 0;
+    }
+  }
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_status_by_thread
+::make_row(PFS_thread *thread, const Status_variable *status_var)
+{
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  if (status_var->is_null())
+    return;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+  m_row.m_variable_name.make_row(status_var->m_name, status_var->m_name_length);
+  m_row.m_variable_value.make_row(status_var);
+
+  if (!thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_status_by_thread
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulonglong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 2: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_status_by_thread.h b/storage/perfschema/table_status_by_thread.h
new file mode 100644
index 00000000000..90a1735d1e2
--- /dev/null
+++ b/storage/perfschema/table_status_by_thread.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_STATUS_BY_THREAD_H
+#define TABLE_STATUS_BY_THREAD_H
+
+/**
+  @file storage/perfschema/table_status_by_thread.h
+  Table STATUS_BY_THREAD (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+#include "pfs_buffer_container.h"
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.STATUS_BY_THREAD.
+*/
+struct row_status_by_thread
+{
+  /** Column THREAD_ID (copied from PFS_thread::m_thread_internal_id). */
+  ulonglong m_thread_internal_id;
+  /** Column VARIABLE_NAME (name of the status variable). */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE (value of the status variable). */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.STATUS_BY_THREAD.
+  Index 1 on thread (0 based)
+  Index 2 on status variable (0 based)
+*/
+struct pos_status_by_thread
+: public PFS_double_index
+{
+  pos_status_by_thread()  /* Starts at thread 0, status variable 0. */
+    : PFS_double_index(0, 0)
+  {}
+  /** Rewind both indexes to 0. */
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+  /** True while the thread index is below the thread container's row count. */
+  inline bool has_more_thread(void)
+  { return (m_index_1 < global_thread_container.get_row_count()); }
+  /** Move to the next thread and restart at its first status variable. */
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Store and retrieve table state information for queries that reinstantiate
+  the table object. The base class receives the thread container row count (presumably the item map size - confirm).
+*/
+class table_status_by_thread_context : public PFS_table_context
+{
+public:
+  table_status_by_thread_context(ulonglong current_version, bool restore) :
+    PFS_table_context(current_version, global_thread_container.get_row_count(), restore, THR_PFS_SBT) { }
+};
+
+/** Table PERFORMANCE_SCHEMA.STATUS_BY_THREAD. */
+class table_status_by_thread : public PFS_engine_table
+{
+  typedef pos_status_by_thread pos_t;
+
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+  /* Scan interface: rnd_next() walks rows in order, rnd_pos() re-reads a saved position. */
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_status_by_thread();
+
+public:
+  ~table_status_by_thread()
+  {}
+
+protected:
+  int materialize(PFS_thread *thread);
+  void make_row(PFS_thread *thread, const Status_variable *status_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Status variable cache for the thread being read. */
+  PFS_status_variable_cache m_status_cache;
+  /** Current row. */
+  row_status_by_thread m_row;
+  /** True if the current row exists (set by make_row(), checked by read_row_values()). */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with global status array version and map of materialized threads. */
+  table_status_by_thread_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_status_by_user.cc b/storage/perfschema/table_status_by_user.cc
new file mode 100644
index 00000000000..84f81402e1c
--- /dev/null
+++ b/storage/perfschema/table_status_by_user.cc
@@ -0,0 +1,246 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_status_by_user.cc
+  Table STATUS_BY_USER (implementation).
+*/
+
+#include "my_global.h"
+#include "table_status_by_user.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_status_by_user::m_table_lock;
+/* Share descriptor for PERFORMANCE_SCHEMA.STATUS_BY_USER: name, ACL, callbacks, position size, lock, DDL. */
+PFS_engine_table_share
+table_status_by_user::m_share=
+{
+  { C_STRING_WITH_LEN("status_by_user") }, /* table name */
+  &pfs_truncatable_acl,
+  table_status_by_user::create,
+  NULL, /* write_row */
+  table_status_by_user::delete_all_rows,
+  table_status_by_user::get_row_count,
+  sizeof(pos_t), /* size of pos_t, this table's cursor position type */
+  &m_table_lock, /* table share lock defined above */
+  { C_STRING_WITH_LEN("CREATE TABLE status_by_user("
+  "USER CHAR(32) collate utf8_bin default null comment 'User for which the status variable is reported.',"
+  "VARIABLE_NAME VARCHAR(64) not null comment 'Status variable name.',"
+  "VARIABLE_VALUE VARCHAR(1024) comment 'Aggregated status variable value.' )") },
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_status_by_user::create(void)
+{
+  return new table_status_by_user();  /* New table object, returned through its PFS_engine_table base. */
+}
+
+int table_status_by_user::delete_all_rows(void)
+{
+  mysql_mutex_lock(&LOCK_status);  /* LOCK_status is held across all three resets. */
+  reset_status_by_thread();
+  reset_status_by_account();
+  reset_status_by_user();  /* NOTE(review): thread and account status are reset as well, presumably because they feed the per-user totals - confirm. */
+  mysql_mutex_unlock(&LOCK_status);
+  return 0;  /* Always reports success. */
+}
+
+ha_rows table_status_by_user::get_row_count(void)
+{
+  mysql_mutex_lock(&LOCK_status);  /* all_status_vars.elements is read under LOCK_status. */
+  size_t status_var_count= all_status_vars.elements;
+  mysql_mutex_unlock(&LOCK_status);
+  return (global_user_container.get_row_count() * status_var_count);  /* user rows times status variables */
+}
+
+table_status_by_user::table_status_by_user()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_status_cache(true), m_row_exists(false), m_pos(), m_next_pos(), m_context(NULL)
+{} /* m_context stays NULL until rnd_init() allocates it, instead of holding an indeterminate pointer. */
+
+void table_status_by_user::reset_position(void)
+{
+  m_pos.reset();  /* Current position back to user 0, status variable 0. */
+  m_next_pos.reset();  /* Next position likewise. */
+}
+
+int table_status_by_user::rnd_init(bool scan)
+{
+  /* Prepare the status cache and the table context; returns 0, or HA_ERR_OUT_OF_MEM. */
+  if (show_compatibility_56)
+    return 0;
+
+  /* Build array of SHOW_VARs from the global status array before materializing users. */
+  m_status_cache.initialize_client_session();
+
+  /* Use the current number of status variables to detect changes. */
+  ulonglong status_version= m_status_cache.get_status_array_version();
+
+  /*
+    The table context holds the current version of the global status array
+    and a record of which users were materialized. If scan == true, then
+    allocate a new context from mem_root and store in TLS. If scan == false,
+    then restore from TLS. The context is never constructed at a NULL address.
+  */
+  m_context= (table_status_by_user_context *)current_thd->alloc(sizeof(table_status_by_user_context));
+  if (m_context == NULL)
+    return HA_ERR_OUT_OF_MEM;
+  new(m_context) table_status_by_user_context(status_version, !scan);
+  return 0;
+}
+
+int table_status_by_user::rnd_next(void)
+{
+  if (show_compatibility_56)
+    return HA_ERR_END_OF_FILE;
+
+  /* If the status array version no longer matches, end the scan. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  /*
+    For each user, build a cache of status variables using totals from all
+    threads associated with the user.
+  */
+  bool has_more_user= true;
+  /* has_more_user is handed to global_user_container.get() on each pass and ends the loop once false. */
+  for (m_pos.set_at(&m_next_pos);
+       has_more_user;
+       m_pos.next_user())
+  {
+    PFS_user *pfs_user= global_user_container.get(m_pos.m_index_1, &has_more_user);
+    /* NOTE(review): pfs_user is passed on without a NULL check; presumably materialize_user() rejects NULL - confirm. */
+    if (m_status_cache.materialize_user(pfs_user) == 0)
+    {
+      /* Mark this user as materialized. */
+      m_context->set_item(m_pos.m_index_1);
+
+      /* Get the next status variable. */
+      const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+      if (stat_var != NULL)
+      {
+        make_row(pfs_user, stat_var);
+        m_next_pos.set_after(&m_pos);  /* The next call resumes after this position. */
+        return 0;
+      }
+    }
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_status_by_user::rnd_pos(const void *pos)
+{
+  if (show_compatibility_56)
+    return HA_ERR_RECORD_DELETED;
+
+  /* If the status array version no longer matches, stop. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  set_position(pos);
+  assert(m_pos.m_index_1 < global_user_container.get_row_count());
+
+  PFS_user *pfs_user= global_user_container.get(m_pos.m_index_1);
+
+  /*
+    Only materialize users that were previously materialized by rnd_next().
+    If a user cannot be rematerialized, then do nothing.
+  */
+  if (m_context->is_item_set(m_pos.m_index_1) &&
+      m_status_cache.materialize_user(pfs_user) == 0)
+  {
+    const Status_variable *stat_var= m_status_cache.get(m_pos.m_index_2);
+    if (stat_var != NULL)
+    {
+      make_row(pfs_user, stat_var);
+      return 0;
+    }
+  }
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_status_by_user
+::make_row(PFS_user *user, const Status_variable *status_var)
+{ /* Fill m_row from one user and one status variable; m_row_exists is set only on success. */
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  user->m_lock.begin_optimistic_lock(&lock);
+  /* A non-zero result from m_user.make_row() abandons the row. */
+  if (m_row.m_user.make_row(user))
+    return;
+
+  m_row.m_variable_name.make_row(status_var->m_name, status_var->m_name_length);
+  m_row.m_variable_value.make_row(status_var);
+  /* A failed end_optimistic_lock() leaves m_row_exists false, so no row is reported. */
+  if (!user->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_status_by_user
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{
+  /* Without a row built by make_row() there is nothing to copy out. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Clear the single byte of null bits before filling the columns. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  /* The field array is NULL-terminated. */
+  for (Field *field= *fields; field != NULL; field= *++fields)
+  {
+    /* Skip columns that were not requested. */
+    if (!read_all && !bitmap_is_set(table->read_set, field->field_index))
+      continue;
+
+    switch (field->field_index)
+    {
+    case 0: /* USER */
+      m_row.m_user.set_field(field);
+      break;
+    case 1: /* VARIABLE_NAME */
+      set_field_varchar_utf8(field, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+      break;
+    case 2: /* VARIABLE_VALUE */
+      m_row.m_variable_value.set_field(field);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
+
diff --git a/storage/perfschema/table_status_by_user.h b/storage/perfschema/table_status_by_user.h
new file mode 100644
index 00000000000..2fe2cc24407
--- /dev/null
+++ b/storage/perfschema/table_status_by_user.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_STATUS_BY_USER_H
+#define TABLE_STATUS_BY_USER_H
+
+/**
+  @file storage/perfschema/table_status_by_user.h
+  Table STATUS_BY_USER (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+#include "pfs_buffer_container.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.STATUS_BY_USER.
+*/
+struct row_status_by_user
+{
+  /** Column USER (filled from the PFS_user by PFS_user_row::make_row()). */
+  PFS_user_row m_user;
+  /** Column VARIABLE_NAME (name of the status variable). */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE (value of the status variable). */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.STATUS_BY_USER.
+  Index 1 on user (0 based)
+  Index 2 on status variable (0 based)
+*/
+struct pos_status_by_user
+: public PFS_double_index
+{
+  pos_status_by_user()  /* Starts at user 0, status variable 0. */
+    : PFS_double_index(0, 0)
+  {}
+  /** Rewind both indexes to 0. */
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+  /** True while the user index is below the user container's row count. */
+  inline bool has_more_user(void)
+  { return (m_index_1 < global_user_container.get_row_count()); }
+  /** Move to the next user and restart at its first status variable. */
+  inline void next_user(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Store and retrieve table state information for queries that reinstantiate
+  the table object. The base class receives the user container row count (presumably the item map size - confirm).
+*/
+class table_status_by_user_context : public PFS_table_context
+{
+public:
+  table_status_by_user_context(ulonglong current_version, bool restore) :
+    PFS_table_context(current_version, global_user_container.get_row_count(), restore, THR_PFS_SBU) { }
+};
+
+/** Table PERFORMANCE_SCHEMA.STATUS_BY_USER. */
+class table_status_by_user : public PFS_engine_table
+{
+  typedef pos_status_by_user pos_t;
+
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+  /* Scan interface: rnd_next() walks rows in order, rnd_pos() re-reads a saved position. */
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_status_by_user();
+
+public:
+  ~table_status_by_user() { }
+
+protected:
+  int materialize(PFS_thread *thread);  /* NOTE(review): no definition in table_status_by_user.cc; looks unused - confirm. */
+  void make_row(PFS_user *user, const Status_variable *status_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): not defined in table_status_by_user.cc - confirm it is unused. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Status variable cache for one user. */
+  PFS_status_variable_cache m_status_cache;
+
+  /** Current row. */
+  row_status_by_user m_row;
+  /** True if the current row exists (set by make_row(), checked by read_row_values()). */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with global status array version and map of materialized users. */
+  table_status_by_user_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_sync_instances.cc b/storage/perfschema/table_sync_instances.cc
index aa9b004fc02..a85dea21ba2 100644
--- a/storage/perfschema/table_sync_instances.cc
+++ b/storage/perfschema/table_sync_instances.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -27,12 +27,14 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_sync_instances.h"
 #include "pfs_global.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_mutex_instances::m_table_lock;
 
@@ -41,17 +43,17 @@ table_mutex_instances::m_share=
 {
   { C_STRING_WITH_LEN("mutex_instances") },
   &pfs_readonly_acl,
-  &table_mutex_instances::create,
+  table_mutex_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_mutex_instances::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE mutex_instances("
                       "NAME VARCHAR(128) not null comment 'Instrument name associated with the mutex.',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Memory address of the instrumented mutex.',"
-                      "LOCKED_BY_THREAD_ID BIGINT unsigned comment 'The THREAD_ID of the locking thread if a thread has a mutex locked, otherwise NULL.')") }
+                      "LOCKED_BY_THREAD_ID BIGINT unsigned comment 'The THREAD_ID of the locking thread if a thread has a mutex locked, otherwise NULL.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_mutex_instances::create(void)
@@ -59,6 +61,12 @@ PFS_engine_table* table_mutex_instances::create(void)
   return new table_mutex_instances();
 }
 
+ha_rows
+table_mutex_instances::get_row_count(void)
+{
+  return global_mutex_container.get_row_count();  /* Row count as reported by the mutex container. */
+}
+
 table_mutex_instances::table_mutex_instances()
   : PFS_engine_table(&m_share, &m_pos),
   m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -74,15 +82,14 @@ int table_mutex_instances::rnd_next(void)
 {
   PFS_mutex *pfs;
 
-  for (m_pos.set_at(&m_next_pos); m_pos.m_index < mutex_max; m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_mutex_iterator it= global_mutex_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &mutex_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -93,9 +100,9 @@ int table_mutex_instances::rnd_pos(const void *pos)
   PFS_mutex *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < mutex_max);
-  pfs= &mutex_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_mutex_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
@@ -106,7 +113,7 @@ int table_mutex_instances::rnd_pos(const void *pos)
 
 void table_mutex_instances::make_row(PFS_mutex *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_mutex_class *safe_class;
 
   m_row_exists= false;
@@ -147,7 +154,7 @@ int table_mutex_instances::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -169,7 +176,7 @@ int table_mutex_instances::read_row_values(TABLE *table,
           f->set_null();
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -184,18 +191,18 @@ table_rwlock_instances::m_share=
 {
   { C_STRING_WITH_LEN("rwlock_instances") },
   &pfs_readonly_acl,
-  &table_rwlock_instances::create,
+  table_rwlock_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_rwlock_instances::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE rwlock_instances("
                       "NAME VARCHAR(128) not null comment 'Instrument name associated with the read write lock',"
                       "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the instrumented lock',"
                       "WRITE_LOCKED_BY_THREAD_ID BIGINT unsigned comment 'THREAD_ID of the locking thread if locked in write (exclusive) mode, otherwise NULL.',"
-                      "READ_LOCKED_BY_COUNT INTEGER unsigned not null comment 'Count of current read locks held')") }
+                      "READ_LOCKED_BY_COUNT INTEGER unsigned not null comment 'Count of current read locks held')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_rwlock_instances::create(void)
@@ -203,6 +210,12 @@ PFS_engine_table* table_rwlock_instances::create(void)
   return new table_rwlock_instances();
 }
 
+ha_rows
+table_rwlock_instances::get_row_count(void)
+{
+  return global_rwlock_container.get_row_count();  /* Row count as reported by the rwlock container. */
+}
+
 table_rwlock_instances::table_rwlock_instances()
   : PFS_engine_table(&m_share, &m_pos),
   m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -218,15 +231,14 @@ int table_rwlock_instances::rnd_next(void)
 {
   PFS_rwlock *pfs;
 
-  for (m_pos.set_at(&m_next_pos); m_pos.m_index < rwlock_max; m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_rwlock_iterator it= global_rwlock_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &rwlock_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -237,9 +249,9 @@ int table_rwlock_instances::rnd_pos(const void *pos)
   PFS_rwlock *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < rwlock_max);
-  pfs= &rwlock_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_rwlock_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
@@ -250,7 +262,7 @@ int table_rwlock_instances::rnd_pos(const void *pos)
 
 void table_rwlock_instances::make_row(PFS_rwlock *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_rwlock_class *safe_class;
 
   m_row_exists= false;
@@ -295,7 +307,7 @@ int table_rwlock_instances::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -320,7 +332,7 @@ int table_rwlock_instances::read_row_values(TABLE *table,
         set_field_ulong(f, m_row.m_readers);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -335,16 +347,16 @@ table_cond_instances::m_share=
 {
   { C_STRING_WITH_LEN("cond_instances") },
   &pfs_readonly_acl,
-  &table_cond_instances::create,
+  table_cond_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_cond_instances::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE cond_instances("
                       "NAME VARCHAR(128) not null comment 'Client user name for the connection, or NULL if an internal thread.',"
-                      "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the instrumented condition.')") }
+                      "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'Address in memory of the instrumented condition.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_cond_instances::create(void)
@@ -352,6 +364,12 @@ PFS_engine_table* table_cond_instances::create(void)
   return new table_cond_instances();
 }
 
+ha_rows
+table_cond_instances::get_row_count(void)
+{
+  return global_cond_container.get_row_count();  /* Row count as reported by the condition container. */
+}
+
 table_cond_instances::table_cond_instances()
   : PFS_engine_table(&m_share, &m_pos),
   m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -367,15 +385,14 @@ int table_cond_instances::rnd_next(void)
 {
   PFS_cond *pfs;
 
-  for (m_pos.set_at(&m_next_pos); m_pos.m_index < cond_max; m_pos.next())
+  m_pos.set_at(&m_next_pos);
+  PFS_cond_iterator it= global_cond_container.iterate(m_pos.m_index);
+  pfs= it.scan_next(& m_pos.m_index);
+  if (pfs != NULL)
   {
-    pfs= &cond_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
+    make_row(pfs);
+    m_next_pos.set_after(&m_pos);
+    return 0;
   }
 
   return HA_ERR_END_OF_FILE;
@@ -386,9 +403,9 @@ int table_cond_instances::rnd_pos(const void *pos)
   PFS_cond *pfs;
 
   set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < cond_max);
-  pfs= &cond_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
+
+  pfs= global_cond_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
     make_row(pfs);
     return 0;
@@ -399,7 +416,7 @@ int table_cond_instances::rnd_pos(const void *pos)
 
 void table_cond_instances::make_row(PFS_cond *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
   PFS_cond_class *safe_class;
 
   m_row_exists= false;
@@ -430,7 +447,7 @@ int table_cond_instances::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
+  assert(table->s->null_bytes == 0);
 
   for (; (f= *fields) ; fields++)
   {
@@ -445,7 +462,7 @@ int table_cond_instances::read_row_values(TABLE *table,
         set_field_ulonglong(f, (intptr) m_row.m_identity);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_sync_instances.h b/storage/perfschema/table_sync_instances.h
index 6f7e1bf5523..f1f56248a12 100644
--- a/storage/perfschema/table_sync_instances.h
+++ b/storage/perfschema/table_sync_instances.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -62,6 +62,7 @@ public:
   /** Table share. */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -119,6 +120,7 @@ public:
   /** Table share */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
@@ -170,6 +172,7 @@ public:
   /** Table share. */
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
+  static ha_rows get_row_count();
 
   virtual int rnd_next();
   virtual int rnd_pos(const void *pos);
diff --git a/storage/perfschema/table_table_handles.cc b/storage/perfschema/table_table_handles.cc
new file mode 100644
index 00000000000..91ab7f889e4
--- /dev/null
+++ b/storage/perfschema/table_table_handles.cc
@@ -0,0 +1,214 @@
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_table_handles.cc
+  Table TABLE_HANDLES (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_table_handles.h"
+#include "pfs_global.h"
+#include "pfs_stat.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
+
+THR_LOCK table_table_handles::m_table_lock;
+/* Share descriptor for PERFORMANCE_SCHEMA.TABLE_HANDLES: name, ACL, callbacks, position size, lock, DDL. */
+PFS_engine_table_share
+table_table_handles::m_share=
+{
+  { C_STRING_WITH_LEN("table_handles") }, /* table name */
+  &pfs_readonly_acl,
+  table_table_handles::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_table_handles::get_row_count,
+  sizeof(PFS_simple_index), /* size of the single-index cursor position */
+  &m_table_lock, /* table share lock defined above */
+  { C_STRING_WITH_LEN("CREATE TABLE table_handles("
+  "OBJECT_TYPE VARCHAR(64) not null comment 'The table opened by a table handle.',"
+  "OBJECT_SCHEMA VARCHAR(64) not null comment 'The schema that contains the object.',"
+  "OBJECT_NAME VARCHAR(64) not null comment 'The name of the instrumented object.',"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null comment 'The table handle address in memory.',"
+  "OWNER_THREAD_ID BIGINT unsigned comment 'The thread owning the table handle.',"
+  "OWNER_EVENT_ID BIGINT unsigned comment 'The event which caused the table handle to be opened.',"
+  "INTERNAL_LOCK VARCHAR(64) comment 'The table lock used at the SQL level.',"
+  "EXTERNAL_LOCK VARCHAR(64) comment 'The table lock used at the storage engine level.')") },
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_table_handles::create(void)
+{
+  return new table_table_handles();  /* New table object, returned through its PFS_engine_table base. */
+}
+
+ha_rows
+table_table_handles::get_row_count(void)
+{
+  return global_table_container.get_row_count();  /* Row count as reported by the table handle container. */
+}
+
+table_table_handles::table_table_handles()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{} /* Starts with no current row and both positions at index 0. */
+
+void table_table_handles::reset_position(void)
+{
+  m_pos.m_index= 0;  /* Current position back to the first index. */
+  m_next_pos.m_index= 0;  /* Next position likewise. */
+}
+
+int table_table_handles::rnd_init(bool scan)
+{
+  return 0;  /* No per-scan setup; the scan argument is not used. */
+}
+
+int table_table_handles::rnd_next(void)
+{
+  /* Continue from where the previous call stopped. */
+  m_pos.set_at(&m_next_pos);
+
+  /* scan_next() is given &m_pos.m_index, presumably to record the index of the handle it returns. */
+  PFS_table_iterator scan= global_table_container.iterate(m_pos.m_index);
+  PFS_table *handle= scan.scan_next(& m_pos.m_index);
+  if (handle == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  /* Build the row and remember the position that follows it. */
+  make_row(handle);
+  m_next_pos.set_after(&m_pos);
+  return 0;
+}
+
+int
+table_table_handles::rnd_pos(const void *pos)
+{
+  /* Restore the cursor position saved for this row. */
+  set_position(pos);
+
+  /* Look the handle up again by its index. */
+  PFS_table *handle= global_table_container.get(m_pos.m_index);
+
+  /* No handle is returned for this index: report the record as deleted. */
+  if (handle == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(handle);
+  return 0;
+}
+
+void table_table_handles::make_row(PFS_table *table)
+{
+  pfs_optimistic_state lock;
+  PFS_table_share *share;
+  PFS_thread *thread;
+  /* Start with no row; every early return below leaves it that way. */
+  m_row_exists= false;
+  /* Open the optimistic read section; it is validated by end_optimistic_lock() at the end. */
+  table->m_lock.begin_optimistic_lock(&lock);
+  /* No row is built when sanitize_table_share() returns NULL. */
+  share= sanitize_table_share(table->m_share);
+  if (share == NULL)
+    return;
+
+  if (m_row.m_object.make_row(share))
+    return;
+
+  m_row.m_identity= table->m_identity;
+  /* Without a sanitized owner thread, both owner columns are reported as 0. */
+  thread= sanitize_thread(table->m_thread_owner);
+  if (thread != NULL)
+  {
+    m_row.m_owner_thread_id= thread->m_thread_internal_id;
+    m_row.m_owner_event_id= table->m_owner_event_id;
+  }
+  else
+  {
+    m_row.m_owner_thread_id= 0;
+    m_row.m_owner_event_id= 0;
+  }
+
+  m_row.m_internal_lock= table->m_internal_lock;
+  m_row.m_external_lock= table->m_external_lock;
+  /* A failed end_optimistic_lock() leaves m_row_exists false, so no row is reported. */
+  if (! table->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_table_handles::read_row_values(TABLE *table,
+                                         unsigned char *buf,
+                                         Field **fields,
+                                         bool read_all)
+{
+  /* Without a row built by make_row() there is nothing to copy out. */
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Clear the single byte of null bits before filling the columns. */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  /* The field array is NULL-terminated. */
+  for (Field *field= *fields; field != NULL; field= *++fields)
+  {
+    /* Skip columns that were not requested. */
+    if (!read_all && !bitmap_is_set(table->read_set, field->field_index))
+      continue;
+
+    switch (field->field_index)
+    {
+    case 0: /* OBJECT_TYPE */
+    case 1: /* SCHEMA_NAME */
+    case 2: /* OBJECT_NAME */
+      m_row.m_object.set_field(field->field_index, field);
+      break;
+    case 3: /* OBJECT_INSTANCE_BEGIN */
+      set_field_ulonglong(field, (intptr) m_row.m_identity);
+      break;
+    case 4: /* OWNER_THREAD_ID */
+      set_field_ulonglong(field, m_row.m_owner_thread_id);
+      break;
+    case 5: /* OWNER_EVENT_ID */
+      set_field_ulonglong(field, m_row.m_owner_event_id);
+      break;
+    case 6: /* INTERNAL_LOCK */
+      set_field_lock_type(field, m_row.m_internal_lock);
+      break;
+    case 7: /* EXTERNAL_LOCK */
+      set_field_lock_type(field, m_row.m_external_lock);
+      break;
+    default:
+      assert(false);
+    }
+  }
+  return 0;
+}
+
diff --git a/storage/perfschema/table_table_handles.h b/storage/perfschema/table_table_handles.h
new file mode 100644
index 00000000000..3b738d89413
--- /dev/null
+++ b/storage/perfschema/table_table_handles.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2012, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_TABLE_HANDLES_H
+#define TABLE_TABLE_HANDLES_H
+
+/**
+  @file storage/perfschema/table_table_handles.h
+  Table TABLE_HANDLES (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table PERFORMANCE_SCHEMA.TABLE_HANDLES.
+  Holds the values that read_row_values() copies into the output fields.
+*/
+struct row_table_handles
+{
+  /** Columns OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME. */
+  PFS_object_row m_object;
+  /** Column OBJECT_INSTANCE_BEGIN (a pointer, reported as an integer). */
+  const void *m_identity;
+  /** Column OWNER_THREAD_ID (0 when there is no owner thread). */
+  ulonglong m_owner_thread_id;
+  /** Column OWNER_EVENT_ID (0 when there is no owner thread). */
+  ulonglong m_owner_event_id;
+  /** Column INTERNAL_LOCK (copied from PFS_table::m_internal_lock). */
+  PFS_TL_LOCK_TYPE m_internal_lock;
+  /** Column EXTERNAL_LOCK (copied from PFS_table::m_external_lock). */
+  PFS_TL_LOCK_TYPE m_external_lock;
+};
+
+/** Table PERFORMANCE_SCHEMA.TABLE_HANDLES; each row is built from a PFS_table. */
+class table_table_handles : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all); /* HA_ERR_RECORD_DELETED when m_row_exists is false */
+
+  table_table_handles();
+
+public:
+  ~table_table_handles()
+  {}
+
+protected:
+  void make_row(PFS_table *table); /* fills m_row from the given PFS_table */
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_table_handles m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_threads.cc b/storage/perfschema/table_threads.cc
index a0e4dd849e0..5528056c022 100644
--- a/storage/perfschema/table_threads.cc
+++ b/storage/perfschema/table_threads.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,7 +21,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_threads.h"
 #include "sql_parse.h"
 #include "pfs_instr_class.h"
@@ -34,11 +34,10 @@ table_threads::m_share=
 {
   { C_STRING_WITH_LEN("threads") },
   &pfs_updatable_acl,
-  &table_threads::create,
+  table_threads::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
-  NULL, /* get_row_count */
-  1000, /* records */
+  cursor_by_thread::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE threads("
@@ -46,8 +45,8 @@ table_threads::m_share=
                       "NAME VARCHAR(128) not null comment 'Name associated with the server''s thread instrumentation code, for example thread/sql/main for the server''s main() function, and thread/sql/one_connection for a user connection.',"
                       "TYPE VARCHAR(10) not null comment 'FOREGROUND or BACKGROUND, depending on the thread type. User connection threads are FOREGROUND, internal server threads are BACKGROUND.',"
                       "PROCESSLIST_ID BIGINT unsigned comment 'The PROCESSLIST.ID value for threads displayed in the INFORMATION_SCHEMA.PROCESSLIST table, or 0 for background threads. Also corresponds with the CONNECTION_ID() return value for the thread.',"
-                      "PROCESSLIST_USER VARCHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") comment 'Foreground thread user, or NULL for a background thread.',"
-                      "PROCESSLIST_HOST VARCHAR(" STRINGIFY_ARG(HOSTNAME_LENGTH) ") comment 'Foreground thread host, or NULL for a background thread.',"
+                      "PROCESSLIST_USER VARCHAR(" USERNAME_CHAR_LENGTH_STR ") comment 'Foreground thread user, or NULL for a background thread.',"
+                      "PROCESSLIST_HOST VARCHAR(" HOSTNAME_LENGTH_STR ") comment 'Foreground thread host, or NULL for a background thread.',"
                       "PROCESSLIST_DB VARCHAR(64) comment 'Thread''s default database, or NULL if none exists.',"
                       "PROCESSLIST_COMMAND VARCHAR(16) comment 'Type of command executed by the thread. These correspond to the the COM_xxx client/server protocol commands, and the Com_xxx status variables. See Thread Command Values.',"
                       "PROCESSLIST_TIME BIGINT comment 'Time in seconds the thread has been in its current state.',"
@@ -55,7 +54,11 @@ table_threads::m_share=
                       "PROCESSLIST_INFO LONGTEXT comment 'Statement being executed by the thread, or NULL if a statement is not being executed. If a statement results in calling other statements, such as for a stored procedure, the innermost statement from the stored procedure is shown here.',"
                       "PARENT_THREAD_ID BIGINT unsigned comment 'THREAD_ID of the parent thread, if any. Subthreads can for example be spawned as a result of INSERT DELAYED statements.',"
                       "ROLE VARCHAR(64) comment 'Unused.',"
-                      "INSTRUMENTED ENUM ('YES', 'NO') not null comment 'YES or NO for Whether the thread is instrumented or not. For foreground threads, the initial value is determined by whether there''s a user/host match in the setup_actors table. Subthreads are again matched, while for background threads, this will be set to YES by default. To monitor events that the thread executes, INSTRUMENTED must be YES and the thread_instrumentation consumer in the setup_consumers table must also be YES.')") }
+                      "INSTRUMENTED ENUM ('YES', 'NO') not null comment 'YES or NO for Whether the thread is instrumented or not. For foreground threads, the initial value is determined by whether there''s a user/host match in the setup_actors table. Subthreads are again matched, while for background threads, this will be set to YES by default. To monitor events that the thread executes, INSTRUMENTED must be YES and the thread_instrumentation consumer in the setup_consumers table must also be YES.',"
+                      "HISTORY ENUM ('YES', 'NO') not null comment 'Whether to log historical events for the thread.',"
+                      "CONNECTION_TYPE VARCHAR(16) comment 'The protocol used to establish the connection, or NULL for background threads.',"
+                      "THREAD_OS_ID BIGINT unsigned comment 'The thread or task identifier as defined by the underlying operating system, if there is one.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_threads::create()
@@ -70,9 +73,9 @@ table_threads::table_threads()
 
 void table_threads::make_row(PFS_thread *pfs)
 {
-  pfs_lock lock;
-  pfs_lock session_lock;
-  pfs_lock stmt_lock;
+  pfs_optimistic_state lock;
+  pfs_optimistic_state session_lock;
+  pfs_optimistic_state stmt_lock;
   PFS_stage_class *stage_class;
   PFS_thread_class *safe_class;
 
@@ -88,6 +91,7 @@ void table_threads::make_row(PFS_thread *pfs)
   m_row.m_thread_internal_id= pfs->m_thread_internal_id;
   m_row.m_parent_thread_internal_id= pfs->m_parent_thread_internal_id;
   m_row.m_processlist_id= pfs->m_processlist_id;
+  m_row.m_thread_os_id= pfs->m_thread_os_id;
   m_row.m_name= safe_class->m_name;
   m_row.m_name_length= safe_class->m_name_length;
 
@@ -165,8 +169,12 @@ void table_threads::make_row(PFS_thread *pfs)
   {
     m_row.m_processlist_state_length= 0;
   }
+  m_row.m_connection_type = pfs->m_connection_type;
 
-  m_row.m_enabled_ptr= &pfs->m_enabled;
+
+  m_row.m_enabled= pfs->m_enabled;
+  m_row.m_history= pfs->m_history;
+  m_row.m_psi= pfs;
 
   if (pfs->m_lock.end_optimistic_lock(& lock))
     m_row_exists= true;
@@ -178,12 +186,14 @@ int table_threads::read_row_values(TABLE *table,
                                    bool read_all)
 {
   Field *f;
+  const char *str= NULL;
+  size_t len= 0;
 
   if (unlikely(! m_row_exists))
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 2);
+  assert(table->s->null_bytes == 2);
   buf[0]= 0;
   buf[1]= 0;
 
@@ -250,23 +260,16 @@ int table_threads::read_row_values(TABLE *table,
           f->set_null();
         break;
       case 9: /* PROCESSLIST_STATE */
+        /* This column is declared as varchar(64), so a thread's state message
+           must not be longer than 64 characters. Otherwise we would end up
+           with a 'data truncated' warning/error (depending on the sql_mode
+           setting) when the server fills this column for such a thread. The
+           assert below is there to catch over-long state messages.
+         */
+        assert(m_row.m_processlist_state_length <= f->char_length());
         if (m_row.m_processlist_state_length > 0)
-        {
-          /* This column's datatype is declared as varchar(64). But in current
-             code, there are few process state messages which are greater than
-             64 characters(Eg:stage_slave_has_read_all_relay_log).
-             In those cases, we will end up in 'data truncated'
-             warning/error (depends sql_mode setting) when server is updating
-             this column for those threads. Since 5.6 is GAed, neither the
-             metadata of this column can be changed, nor those state messages.
-             So server will silently truncate the state message to 64 characters
-             if it is longer. In Upper versions(5.7+), these state messages are
-             changed to less than or equal to 64 characters.
-           */
           set_field_varchar_utf8(f, m_row.m_processlist_state_ptr,
-                                 MY_MIN(m_row.m_processlist_state_length,
-                                        f->char_length()));
-        }
+                                 m_row.m_processlist_state_length);
         else
           f->set_null();
         break;
@@ -287,10 +290,26 @@ int table_threads::read_row_values(TABLE *table,
         f->set_null();
         break;
       case 13: /* INSTRUMENTED */
-        set_field_enum(f, (*m_row.m_enabled_ptr) ? ENUM_YES : ENUM_NO);
+        set_field_enum(f, m_row.m_enabled ? ENUM_YES : ENUM_NO);
+        break;
+      case 14: /* HISTORY */
+        set_field_enum(f, m_row.m_history ? ENUM_YES : ENUM_NO);
+        break;
+      case 15: /* CONNECTION_TYPE */
+        str= vio_type_name(m_row.m_connection_type, & len);
+        if (len > 0)
+          set_field_varchar_utf8(f, str, (uint)len);
+        else
+          f->set_null();
+        break;
+      case 16: /* THREAD_OS_ID */
+        if (m_row.m_thread_os_id > 0)
+          set_field_ulonglong(f, m_row.m_thread_os_id);
+        else
+          f->set_null();
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
@@ -327,10 +346,17 @@ int table_threads::update_row_values(TABLE *table,
         return HA_ERR_WRONG_COMMAND;
       case 13: /* INSTRUMENTED */
         value= (enum_yes_no) get_field_enum(f);
-        *m_row.m_enabled_ptr= (value == ENUM_YES) ? true : false;
+        m_row.m_psi->set_enabled((value == ENUM_YES) ? true : false);
         break;
+      case 14: /* HISTORY */
+        value= (enum_yes_no) get_field_enum(f);
+        m_row.m_psi->set_history((value == ENUM_YES) ? true : false);
+        break;
+      case 15: /* CONNECTION_TYPE */
+      case 16: /* THREAD_OS_ID */
+        return HA_ERR_WRONG_COMMAND;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_threads.h b/storage/perfschema/table_threads.h
index 841b8102bca..a7ce83dad72 100644
--- a/storage/perfschema/table_threads.h
+++ b/storage/perfschema/table_threads.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -70,10 +70,18 @@ struct row_threads
   const char* m_processlist_info_ptr;
   /** Length in bytes of @c m_processlist_info_ptr. */
   uint m_processlist_info_length;
-  /** Column INSTRUMENTED. */
-  bool *m_enabled_ptr;
+  /** Column INSTRUMENTED (read). */
+  bool m_enabled;
+  /** Column HISTORY (read). */
+  bool m_history;
+  /** INSTRUMENTED and HISTORY (write). */
+  PFS_thread *m_psi;
   /** Column PARENT_THREAD_ID. */
   ulonglong m_parent_thread_internal_id;
+  /** Column CONNECTION_TYPE. */
+  enum_vio_type m_connection_type;
+  /** Column THREAD_OS_ID. */
+  my_thread_os_id_t m_thread_os_id;
 };
 
 /** Table PERFORMANCE_SCHEMA.THREADS. */
diff --git a/storage/perfschema/table_tiws_by_index_usage.cc b/storage/perfschema/table_tiws_by_index_usage.cc
index 034c91af653..7375c62ae38 100644
--- a/storage/perfschema/table_tiws_by_index_usage.cc
+++ b/storage/perfschema/table_tiws_by_index_usage.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_tiws_by_index_usage.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_tiws_by_index_usage::m_table_lock;
 
@@ -44,8 +46,7 @@ table_tiws_by_index_usage::m_share=
   table_tiws_by_index_usage::create,
   NULL, /* write_row */
   table_tiws_by_index_usage::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_tiws_by_index_usage::get_row_count,
   sizeof(pos_tiws_by_index_usage),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE table_io_waits_summary_by_index_usage("
@@ -87,7 +88,8 @@ table_tiws_by_index_usage::m_share=
                       "SUM_TIMER_DELETE BIGINT unsigned not null comment 'Total wait time of all delete operations that are timed.',"
                       "MIN_TIMER_DELETE BIGINT unsigned not null comment 'Minimum wait time of all delete operations that are timed.',"
                       "AVG_TIMER_DELETE BIGINT unsigned not null comment 'Average wait time of all delete operations that are timed.',"
-                      "MAX_TIMER_DELETE BIGINT unsigned not null comment 'Maximum wait time of all delete operations that are timed.')") }
+                      "MAX_TIMER_DELETE BIGINT unsigned not null comment 'Maximum wait time of all delete operations that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -104,6 +106,12 @@ table_tiws_by_index_usage::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_tiws_by_index_usage::get_row_count(void)
+{
+  return global_table_share_index_container.get_row_count(); /* registered in m_share; forwards to the index container */
+}
+
 table_tiws_by_index_usage::table_tiws_by_index_usage()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(), m_next_pos()
@@ -124,27 +132,31 @@ int table_tiws_by_index_usage::rnd_init(bool scan)
 int table_tiws_by_index_usage::rnd_next(void)
 {
   PFS_table_share *table_share;
+  bool has_more_table= true;
 
   for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_table();
+       has_more_table;
        m_pos.next_table())
   {
-    table_share= &table_share_array[m_pos.m_index_1];
-    if (table_share->m_lock.is_populated())
+    table_share= global_table_share_container.get(m_pos.m_index_1, & has_more_table);
+    if (table_share != NULL)
     {
-      uint safe_key_count= sanitize_index_count(table_share->m_key_count);
-      if (m_pos.m_index_2 < safe_key_count)
+      if (table_share->m_enabled)
       {
-        make_row(table_share, m_pos.m_index_2);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
-      if (m_pos.m_index_2 <= MAX_INDEXES)
-      {
-        m_pos.m_index_2= MAX_INDEXES;
-        make_row(table_share, m_pos.m_index_2);
-        m_next_pos.set_after(&m_pos);
-        return 0;
+        uint safe_key_count= sanitize_index_count(table_share->m_key_count);
+        if (m_pos.m_index_2 < safe_key_count)
+        {
+          make_row(table_share, m_pos.m_index_2);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+        if (m_pos.m_index_2 <= MAX_INDEXES)
+        {
+          m_pos.m_index_2= MAX_INDEXES;
+          make_row(table_share, m_pos.m_index_2);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
       }
     }
   }
@@ -159,40 +171,58 @@ table_tiws_by_index_usage::rnd_pos(const void *pos)
 
   set_position(pos);
 
-  table_share= &table_share_array[m_pos.m_index_1];
-  if (table_share->m_lock.is_populated())
+  table_share= global_table_share_container.get(m_pos.m_index_1);
+  if (table_share != NULL)
   {
-    uint safe_key_count= sanitize_index_count(table_share->m_key_count);
-    if (m_pos.m_index_2 < safe_key_count)
-    {
-      make_row(table_share, m_pos.m_index_2);
-      return 0;
-    }
-    if (m_pos.m_index_2 == MAX_INDEXES)
+    if (table_share->m_enabled)
     {
-      make_row(table_share, m_pos.m_index_2);
-      return 0;
+      uint safe_key_count= sanitize_index_count(table_share->m_key_count);
+      if (m_pos.m_index_2 < safe_key_count)
+      {
+        make_row(table_share, m_pos.m_index_2);
+        return 0;
+      }
+      if (m_pos.m_index_2 == MAX_INDEXES)
+      {
+        make_row(table_share, m_pos.m_index_2);
+        return 0;
+      }
     }
   }
 
   return HA_ERR_RECORD_DELETED;
 }
 
-void table_tiws_by_index_usage::make_row(PFS_table_share *share, uint index)
+void table_tiws_by_index_usage::make_row(PFS_table_share *pfs_share,
+                                         uint index)
 {
-  pfs_lock lock;
+  PFS_table_share_index *pfs_index;
+  pfs_optimistic_state lock;
 
-  m_row_exists= false;
+  assert(index <= MAX_INDEXES);
 
-  share->m_lock.begin_optimistic_lock(&lock);
+  m_row_exists= false;
 
-  if (m_row.m_index.make_row(share, index))
-    return;
+  pfs_share->m_lock.begin_optimistic_lock(&lock);
 
   PFS_index_io_stat_visitor visitor;
-  PFS_object_iterator::visit_table_indexes(share, index, & visitor);
+  PFS_object_iterator::visit_table_indexes(pfs_share, index, & visitor);
+
+  if (! visitor.m_stat.m_has_data)
+  {
+    pfs_index= pfs_share->find_index_stat(index);
+    if (pfs_index == NULL)
+      return;
+  }
+  else
+  {
+    pfs_index= pfs_share->find_index_stat(index);
+  }
+
+  if (m_row.m_index.make_row(pfs_share, pfs_index, index))
+    return;
 
-  if (! share->m_lock.end_optimistic_lock(&lock))
+  if (! pfs_share->m_lock.end_optimistic_lock(&lock))
     return;
 
   m_row_exists= true;
@@ -210,7 +240,7 @@ int table_tiws_by_index_usage::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -331,7 +361,7 @@ int table_tiws_by_index_usage::read_row_values(TABLE *table,
         set_field_ulonglong(f, m_row.m_stat.m_delete.m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_tiws_by_index_usage.h b/storage/perfschema/table_tiws_by_index_usage.h
index a284bc7f0bc..990f2dede66 100644
--- a/storage/perfschema/table_tiws_by_index_usage.h
+++ b/storage/perfschema/table_tiws_by_index_usage.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -54,7 +54,7 @@ struct row_tiws_by_index_usage
 /**
   Position of a cursor on
   PERFORMANCE_SCHEMA.TABLE_IO_WAIT_SUMMARY_BY_INDEX.
-  Index 1 on table_share_array (0 based)
+  Index 1 on global_table_share_container (0 based)
   Index 2 on index (0 based)
 */
 struct pos_tiws_by_index_usage : public PFS_double_index
@@ -69,11 +69,6 @@ struct pos_tiws_by_index_usage : public PFS_double_index
     m_index_2= 0;
   }
 
-  inline bool has_more_table(void)
-  {
-    return (m_index_1 < table_share_max);
-  }
-
   inline void next_table(void)
   {
     m_index_1++;
@@ -89,6 +84,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_tiws_by_table.cc b/storage/perfschema/table_tiws_by_table.cc
index 4061164dbf4..cbd754b3342 100644
--- a/storage/perfschema/table_tiws_by_table.cc
+++ b/storage/perfschema/table_tiws_by_table.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_tiws_by_table.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_tiws_by_table::m_table_lock;
 
@@ -44,8 +46,7 @@ table_tiws_by_table::m_share=
   table_tiws_by_table::create,
   NULL, /* write_row */
   table_tiws_by_table::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_tiws_by_table::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE table_io_waits_summary_by_table("
@@ -86,7 +87,8 @@ table_tiws_by_table::m_share=
                       "SUM_TIMER_DELETE BIGINT unsigned not null comment 'Total wait time of all delete operations that are timed.',"
                       "MIN_TIMER_DELETE BIGINT unsigned not null comment 'Minimum wait time of all delete operations that are timed.',"
                       "AVG_TIMER_DELETE BIGINT unsigned not null comment 'Average wait time of all delete operations that are timed.',"
-                      "MAX_TIMER_DELETE BIGINT unsigned not null comment 'Maximum wait time of all delete operations that are timed.')") }
+                      "MAX_TIMER_DELETE BIGINT unsigned not null comment 'Maximum wait time of all delete operations that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -103,6 +105,12 @@ table_tiws_by_table::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_tiws_by_table::get_row_count(void)
+{
+  return global_table_share_container.get_row_count(); /* registered in m_share; forwards to the share container */
+}
+
 table_tiws_by_table::table_tiws_by_table()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -122,20 +130,23 @@ int table_tiws_by_table::rnd_init(bool scan)
 
 int table_tiws_by_table::rnd_next(void)
 {
-  PFS_table_share *table_share;
+  PFS_table_share *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < table_share_max;
-       m_pos.m_index++)
+  m_pos.set_at(&m_next_pos); /* start from the position saved in m_next_pos */
+  PFS_table_share_iterator it= global_table_share_container.iterate(m_pos.m_index);
+  do
   {
-    table_share= &table_share_array[m_pos.m_index];
-    if (table_share->m_lock.is_populated())
+    pfs= it.scan_next(& m_pos.m_index); /* presumably also stores the share's index in m_pos -- confirm */
+    if (pfs != NULL)
     {
-      make_row(table_share);
-      m_next_pos.set_after(&m_pos);
-      return 0;
+      if (pfs->m_enabled) /* shares that are not enabled produce no row */
+      {
+        make_row(pfs);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
     }
-  }
+  } while (pfs != NULL); /* scan_next() returning NULL ends the loop */
 
   return HA_ERR_END_OF_FILE;
 }
@@ -143,15 +154,18 @@ int table_tiws_by_table::rnd_next(void)
 int
 table_tiws_by_table::rnd_pos(const void *pos)
 {
-  PFS_table_share *table_share;
+  PFS_table_share *pfs;
 
   set_position(pos);
 
-  table_share= &table_share_array[m_pos.m_index];
-  if (table_share->m_lock.is_populated())
+  pfs= global_table_share_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
-    make_row(table_share);
-    return 0;
+    if (pfs->m_enabled)
+    {
+      make_row(pfs);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -159,7 +173,7 @@ table_tiws_by_table::rnd_pos(const void *pos)
 
 void table_tiws_by_table::make_row(PFS_table_share *share)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
 
@@ -189,7 +203,7 @@ int table_tiws_by_table::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -309,7 +323,7 @@ int table_tiws_by_table::read_row_values(TABLE *table,
         set_field_ulonglong(f, m_row.m_stat.m_delete.m_max);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_tiws_by_table.h b/storage/perfschema/table_tiws_by_table.h
index 7427ca797fa..693e07f0e65 100644
--- a/storage/perfschema/table_tiws_by_table.h
+++ b/storage/perfschema/table_tiws_by_table.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -59,6 +59,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_tlws_by_table.cc b/storage/perfschema/table_tlws_by_table.cc
index 759fdd88e12..ebb01567adf 100644
--- a/storage/perfschema/table_tlws_by_table.cc
+++ b/storage/perfschema/table_tlws_by_table.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,13 +26,15 @@
 */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "pfs_instr_class.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_tlws_by_table.h"
 #include "pfs_global.h"
 #include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+#include "field.h"
 
 THR_LOCK table_tlws_by_table::m_table_lock;
 
@@ -44,8 +46,7 @@ table_tlws_by_table::m_share=
   table_tlws_by_table::create,
   NULL, /* write_row */
   table_tlws_by_table::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  table_tlws_by_table::get_row_count,
   sizeof(PFS_simple_index),
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE table_lock_waits_summary_by_table("
@@ -121,7 +122,8 @@ table_tlws_by_table::m_share=
                       "SUM_TIMER_WRITE_EXTERNAL BIGINT unsigned not null comment 'Total wait time of all external write locks that are timed.',"
                       "MIN_TIMER_WRITE_EXTERNAL BIGINT unsigned not null comment 'Minimum wait time of all external write locks that are timed.',"
                       "AVG_TIMER_WRITE_EXTERNAL BIGINT unsigned not null comment 'Average wait time of all external write locks that are timed.',"
-                      "MAX_TIMER_WRITE_EXTERNAL BIGINT unsigned not null comment 'Maximum wait time of all external write locks that are timed.')") }
+                      "MAX_TIMER_WRITE_EXTERNAL BIGINT unsigned not null comment 'Maximum wait time of all external write locks that are timed.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table*
@@ -138,6 +140,12 @@ table_tlws_by_table::delete_all_rows(void)
   return 0;
 }
 
+ha_rows
+table_tlws_by_table::get_row_count(void)
+{
+  return global_table_share_container.get_row_count(); /* registered in m_share; forwards to the share container */
+}
+
 table_tlws_by_table::table_tlws_by_table()
   : PFS_engine_table(&m_share, &m_pos),
     m_row_exists(false), m_pos(0), m_next_pos(0)
@@ -157,20 +165,23 @@ int table_tlws_by_table::rnd_init(bool scan)
 
 int table_tlws_by_table::rnd_next(void)
 {
-  PFS_table_share *table_share;
+  PFS_table_share *pfs;
 
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < table_share_max;
-       m_pos.m_index++)
+  m_pos.set_at(&m_next_pos); /* start from the position saved in m_next_pos */
+  PFS_table_share_iterator it= global_table_share_container.iterate(m_pos.m_index);
+  do
   {
-    table_share= &table_share_array[m_pos.m_index];
-    if (table_share->m_lock.is_populated())
+    pfs= it.scan_next(& m_pos.m_index); /* presumably also stores the share's index in m_pos -- confirm */
+    if (pfs != NULL)
     {
-      make_row(table_share);
-      m_next_pos.set_after(&m_pos);
-      return 0;
+      if (pfs->m_enabled) /* shares that are not enabled produce no row */
+      {
+        make_row(pfs);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
     }
-  }
+  } while (pfs != NULL); /* scan_next() returning NULL ends the loop */
 
   return HA_ERR_END_OF_FILE;
 }
@@ -178,15 +189,18 @@ int table_tlws_by_table::rnd_next(void)
 int
 table_tlws_by_table::rnd_pos(const void *pos)
 {
-  PFS_table_share *table_share;
+  PFS_table_share *pfs;
 
   set_position(pos);
 
-  table_share= &table_share_array[m_pos.m_index];
-  if (table_share->m_lock.is_populated())
+  pfs= global_table_share_container.get(m_pos.m_index);
+  if (pfs != NULL)
   {
-    make_row(table_share);
-    return 0;
+    if (pfs->m_enabled)
+    {
+      make_row(pfs);
+      return 0;
+    }
   }
 
   return HA_ERR_RECORD_DELETED;
@@ -194,7 +208,7 @@ table_tlws_by_table::rnd_pos(const void *pos)
 
 void table_tlws_by_table::make_row(PFS_table_share *share)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
 
@@ -224,7 +238,7 @@ int table_tlws_by_table::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -461,7 +475,7 @@ int table_tlws_by_table::read_row_values(TABLE *table,
         break;
 
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_tlws_by_table.h b/storage/perfschema/table_tlws_by_table.h
index b5872a07762..5bcf89d3a24 100644
--- a/storage/perfschema/table_tlws_by_table.h
+++ b/storage/perfschema/table_tlws_by_table.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -59,6 +59,7 @@ public:
   static PFS_engine_table_share m_share;
   static PFS_engine_table* create();
   static int delete_all_rows();
+  static ha_rows get_row_count();
 
   virtual int rnd_init(bool scan);
   virtual int rnd_next();
diff --git a/storage/perfschema/table_users.cc b/storage/perfschema/table_users.cc
index ae738e47d81..d56c90c0270 100644
--- a/storage/perfschema/table_users.cc
+++ b/storage/perfschema/table_users.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,13 +21,16 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include "my_global.h"
-#include "my_pthread.h"
+#include "my_thread.h"
 #include "table_users.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_account.h"
 #include "pfs_user.h"
 #include "pfs_visitor.h"
+#include "pfs_memory.h"
+#include "pfs_status.h"
+#include "field.h"
 
 THR_LOCK table_users::m_table_lock;
 
@@ -36,17 +39,17 @@ table_users::m_share=
 {
   { C_STRING_WITH_LEN("users") },
   &pfs_truncatable_acl,
-  &table_users::create,
+  table_users::create,
   NULL, /* write_row */
   table_users::delete_all_rows,
-  NULL, /* get_row_count */
-  1000, /* records */
+  cursor_by_user::get_row_count,
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
   { C_STRING_WITH_LEN("CREATE TABLE users("
-                      "USER CHAR(" STRINGIFY_ARG(USERNAME_CHAR_LENGTH) ") collate utf8_bin default null comment 'The connection''s client user name for the connection, or NULL if an internal thread.',"
+                      "USER CHAR(" USERNAME_CHAR_LENGTH_STR ") collate utf8_bin default null comment 'The connection''s client user name for the connection, or NULL if an internal thread.',"
                       "CURRENT_CONNECTIONS bigint not null comment 'Current connections for the user.',"
-                      "TOTAL_CONNECTIONS bigint not null comment 'Total connections for the user.')") }
+                      "TOTAL_CONNECTIONS bigint not null comment 'Total connections for the user.')") },
+  false  /* perpetual */
 };
 
 PFS_engine_table* table_users::create()
@@ -66,6 +69,15 @@ table_users::delete_all_rows(void)
   reset_events_statements_by_thread();
   reset_events_statements_by_account();
   reset_events_statements_by_user();
+  reset_events_transactions_by_thread();
+  reset_events_transactions_by_account();
+  reset_events_transactions_by_user();
+  reset_memory_by_thread();
+  reset_memory_by_account();
+  reset_memory_by_user();
+  reset_status_by_thread();
+  reset_status_by_account();
+  reset_status_by_user();
   purge_all_account();
   purge_all_user();
   return 0;
@@ -78,7 +90,7 @@ table_users::table_users()
 
 void table_users::make_row(PFS_user *pfs)
 {
-  pfs_lock lock;
+  pfs_optimistic_state lock;
 
   m_row_exists= false;
   pfs->m_lock.begin_optimistic_lock(&lock);
@@ -87,7 +99,11 @@ void table_users::make_row(PFS_user *pfs)
     return;
 
   PFS_connection_stat_visitor visitor;
-  PFS_connection_iterator::visit_user(pfs, true, true, & visitor);
+  PFS_connection_iterator::visit_user(pfs,
+                                      true,  /* accounts */
+                                      true,  /* threads */
+                                      false, /* THDs */
+                                      & visitor);
 
   if (! pfs->m_lock.end_optimistic_lock(& lock))
     return;
@@ -107,7 +123,7 @@ int table_users::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  assert(table->s->null_bytes == 1);
   buf[0]= 0;
 
   for (; (f= *fields) ; fields++)
@@ -124,7 +140,7 @@ int table_users::read_row_values(TABLE *table,
         m_row.m_connection_stat.set_field(f->field_index - 1, f);
         break;
       default:
-        DBUG_ASSERT(false);
+        assert(false);
       }
     }
   }
diff --git a/storage/perfschema/table_users.h b/storage/perfschema/table_users.h
index 912f0f43714..c2476c71707 100644
--- a/storage/perfschema/table_users.h
+++ b/storage/perfschema/table_users.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/table_uvar_by_thread.cc b/storage/perfschema/table_uvar_by_thread.cc
new file mode 100644
index 00000000000..b80ae2beefe
--- /dev/null
+++ b/storage/perfschema/table_uvar_by_thread.cc
@@ -0,0 +1,329 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_uvar_by_thread.cc
+  Table USER_VARIABLES_BY_THREAD (implementation).
+*/
+
+#include "my_global.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_uvar_by_thread.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+#include "pfs_buffer_container.h"
+
+/* Iteration on THD from the sql layer. */
+#include "sql_class.h"
+#include "mysqld_thd_manager.h"
+
+class Find_thd_user_var : public Find_THD_Impl  /* THD matcher used by table_uvar_by_thread::materialize() */
+{
+public:
+  Find_thd_user_var(THD *unsafe_thd)
+    : m_unsafe_thd(unsafe_thd)
+  {}
+  /* Match only the THD given at construction, and only if it has user variables. */
+  virtual bool operator()(THD *thd)
+  {
+    if (thd != m_unsafe_thd)
+      return false;
+    /* NOTE(review): records is read before LOCK_thd_data is taken -- confirm this is safe. */
+    if (thd->user_vars.records == 0)
+      return false;
+    /* On a match, return with LOCK_thd_data held; materialize() releases it. */
+    mysql_mutex_lock(&thd->LOCK_thd_data);
+    return true;
+  }
+
+private:
+  THD *m_unsafe_thd;  /* Only compared against candidate THDs. */
+};
+
+void User_variables::materialize(PFS_thread *pfs, THD *thd)
+{
+  reset();
+  /* Rebuild the cache: one User_variable per entry of thd->user_vars, tagged with pfs. */
+  m_pfs= pfs;
+  m_thread_internal_id= pfs->m_thread_internal_id;
+  m_array.reserve(thd->user_vars.records);
+
+  user_var_entry *sql_uvar;
+
+  uint index= 0;
+  User_variable empty;
+
+  /* The caller must hold LOCK_thd_data, which protects thd->user_vars. */
+  mysql_mutex_assert_owner(&thd->LOCK_thd_data);
+  /* Walk the hash by position until my_hash_element() returns NULL. */
+  for (;;)
+  {
+    sql_uvar= reinterpret_cast<user_var_entry*> (my_hash_element(& thd->user_vars, index));
+    if (sql_uvar == NULL)
+      break;
+
+    /*
+      m_array is a container of objects (not pointers)
+
+      Naive code can:
+      - build locally a new entry
+      - add it to the container
+      but this causes useless object construction, destruction, and deep copies.
+
+      What we do here:
+      - add a dummy (empty) entry
+      - the container does a deep copy on something empty,
+        so that there is nothing to copy.
+      - get a reference to the entry added in the container
+      - complete -- in place -- the entry initialization
+    */
+    m_array.push(empty);
+    User_variable & pfs_uvar= *m_array.back();
+
+    /* Copy VARIABLE_NAME. NOTE(review): the bound below is sizeof the whole row object, not of its buffer -- confirm. */
+    const char *name= sql_uvar->name.str;
+    size_t name_length= sql_uvar->name.length;
+    DBUG_ASSERT(name_length <= sizeof(pfs_uvar.m_name));
+    pfs_uvar.m_name.make_row(name, name_length);
+
+    /* Copy VARIABLE_VALUE from val_str(); a NULL result is stored via make_row(NULL, 0). */
+    bool null_value;
+    String *str_value;
+    String str_buffer;
+    uint decimals= 0;
+    str_value= sql_uvar->val_str(& null_value, & str_buffer, decimals);
+    if (str_value != NULL)
+    {
+      pfs_uvar.m_value.make_row(str_value->ptr(), str_value->length());
+    }
+    else
+    {
+      pfs_uvar.m_value.make_row(NULL, 0);
+    }
+
+    index++;
+  }
+}
+
+THR_LOCK table_uvar_by_thread::m_table_lock;
+
+PFS_engine_table_share  /* Share descriptor for table user_variables_by_thread. */
+table_uvar_by_thread::m_share=
+{
+  { C_STRING_WITH_LEN("user_variables_by_thread") },
+  &pfs_readonly_acl,
+  table_uvar_by_thread::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_uvar_by_thread::get_row_count, /* get_row_count */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock,
+  { C_STRING_WITH_LEN("CREATE TABLE user_variables_by_thread("
+  "THREAD_ID BIGINT unsigned not null comment 'The thread identifier of the session in which the variable is defined.',"
+  "VARIABLE_NAME VARCHAR(64) not null comment 'The variable name, without the leading @ character.',"
+  "VARIABLE_VALUE LONGBLOB comment 'The variable value')") },
+  false  /* perpetual */
+};
+
+PFS_engine_table*
+table_uvar_by_thread::create(void)
+{ /* Factory referenced from m_share: returns a newly allocated table_uvar_by_thread. */
+  return new table_uvar_by_thread();
+}
+
+ha_rows
+table_uvar_by_thread::get_row_count(void)
+{
+  /*
+    This is an estimate only, not a hard limit.
+    The row count is given as a multiple (x10) of the thread container
+    row count, so that a join between:
+    - table performance_schema.threads
+    - table performance_schema.user_variables_by_thread
+    will still evaluate relative table sizes correctly
+    when deciding a join order.
+  */
+  return global_thread_container.get_row_count() * 10;
+}
+
+table_uvar_by_thread::table_uvar_by_thread()
+  : PFS_engine_table(&m_share, &m_pos),  /* the base class receives the share and the current position */
+    m_row_exists(false), m_pos(), m_next_pos()
+{}  /* m_row is not initialized here; m_row_exists guards it until make_row() fills it. */
+
+void table_uvar_by_thread::reset_position(void)
+{ /* Rewind both the current and the next position to (thread 0, variable 0). */
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_uvar_by_thread::rnd_next(void)
+{
+  PFS_thread *thread;
+  bool has_more_thread= true;
+  /* Resume at m_next_pos; move on to the next thread whenever the current one yields no row. */
+  for (m_pos.set_at(&m_next_pos);
+       has_more_thread;
+       m_pos.next_thread())
+  {
+    thread= global_thread_container.get(m_pos.m_index_1, & has_more_thread);  /* get() also updates has_more_thread */
+    if (thread != NULL)
+    {
+      if (materialize(thread) == 0)  /* 0: m_THD_cache now holds this thread's variables */
+      {
+        const User_variable *uvar= m_THD_cache.get(m_pos.m_index_2);
+        if (uvar != NULL)
+        {
+          make_row(thread, uvar);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+    }
+  }
+  /* The loop ended because has_more_thread became false. */
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_uvar_by_thread::rnd_pos(const void *pos)
+{
+  PFS_thread *thread;
+  /* Build the row at a given position; set_position() presumably loads pos into m_pos -- confirm. */
+  set_position(pos);
+
+  thread= global_thread_container.get(m_pos.m_index_1);
+  if (thread != NULL)
+  {
+    if (materialize(thread) == 0)
+    {
+      const User_variable *uvar= m_THD_cache.get(m_pos.m_index_2);
+      if (uvar != NULL)
+      {
+        make_row(thread, uvar);
+        return 0;
+      }
+    }
+  }
+  /* No thread, no materialized variables, or no variable at index 2. */
+  return HA_ERR_RECORD_DELETED;
+}
+
+int table_uvar_by_thread::materialize(PFS_thread *thread)
+{ /* Fill m_THD_cache for thread. Returns 0 on success, 1 when no matching THD with user variables is found. */
+  if (m_THD_cache.is_materialized(thread))
+    return 0;
+  /* Skip PFS_thread entries whose lock reports them as not populated. */
+  if (! thread->m_lock.is_populated())
+    return 1;
+  /* m_thd is only a candidate pointer; it is looked up through the THD manager below. */
+  THD *unsafe_thd= thread->m_thd;
+  if (unsafe_thd == NULL)
+    return 1;
+  /* On a match, Find_thd_user_var has locked LOCK_thd_data of the returned THD. */
+  Find_thd_user_var finder(unsafe_thd);
+  THD *safe_thd= Global_THD_manager::get_instance()->find_thd(&finder);
+  if (safe_thd == NULL)
+    return 1;
+  /* Copy the variables while the lock is held, then release the lock taken by the finder. */
+  m_THD_cache.materialize(thread, safe_thd);
+  mysql_mutex_unlock(&safe_thd->LOCK_thd_data);
+  return 0;
+}
+
+void table_uvar_by_thread
+::make_row(PFS_thread *thread, const User_variable *uvar)
+{ /* Build m_row for (thread, uvar); m_row_exists stays false if the optimistic lock check fails. */
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+
+  /* uvar comes from m_THD_cache, so the row stores pointers to it instead of copies. */
+  m_row.m_variable_name= & uvar->m_name;
+  m_row.m_variable_value= & uvar->m_value;
+
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_uvar_by_thread
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{ /* Store the current row (m_row) into the requested fields. Returns 0, or HA_ERR_RECORD_DELETED if there is no row. */
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* make_row() sets both pointers before it sets m_row_exists. */
+  assert(m_row.m_variable_name != NULL);
+  assert(m_row.m_variable_value != NULL);
+  /* fields is a NULL-terminated array; a field is filled if read_all is set or it is in the read set. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulonglong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f,
+                               m_row.m_variable_name->m_str,
+                               m_row.m_variable_name->m_length);
+        break;
+      case 2: /* VARIABLE_VALUE */
+        if (m_row.m_variable_value->get_value_length() > 0)
+        {
+          set_field_blob(f,
+                         m_row.m_variable_value->get_value(),
+              static_cast<uint>(m_row.m_variable_value->get_value_length()));
+        }
+        else
+        {
+          f->set_null();  /* a zero-length value is reported as NULL */
+        }
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_uvar_by_thread.h b/storage/perfschema/table_uvar_by_thread.h
new file mode 100644
index 00000000000..cd20897743d
--- /dev/null
+++ b/storage/perfschema/table_uvar_by_thread.h
@@ -0,0 +1,195 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_UVAR_BY_THREAD_H
+#define TABLE_UVAR_BY_THREAD_H
+
+/**
+  @file storage/perfschema/table_uvar_by_thread.h
+  Table USER_VARIABLES_BY_THREAD (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+struct User_variable  /* One materialized user variable: its name and its value. */
+{
+public:
+  User_variable()
+  {}
+  /* Member-wise copy of name and value. */
+  User_variable(const User_variable& uv)
+    : m_name(uv.m_name), m_value(uv.m_value)
+  {}
+
+  ~User_variable()
+  {}
+
+  PFS_variable_name_row m_name;         /* shown as column VARIABLE_NAME */
+  PFS_user_variable_value_row m_value;  /* shown as column VARIABLE_VALUE */
+};
+
+class User_variables  /* Cache of the user variables materialized for one thread. */
+{
+  typedef Dynamic_array<User_variable> User_variable_array;
+
+public:
+  User_variables()
+    : m_pfs(NULL), m_thread_internal_id(0), m_array(PSI_INSTRUMENT_MEM)
+  {
+  }
+  /* Drop the cached variables and forget the owning thread. */
+  void reset()
+  {
+    m_pfs= NULL;
+    m_thread_internal_id= 0;
+    for (uint i=0; i < m_array.elements(); i++)
+      m_array.at(i).~User_variable();  /* NOTE(review): presumably clear() does not destroy elements -- confirm */
+    m_array.clear();
+  }
+  /* Rebuild the cache from thd->user_vars (defined in table_uvar_by_thread.cc). */
+  void materialize(PFS_thread *pfs, THD *thd);
+  /* True if the cache was built for this pfs, compared by pointer and by internal thread id. */
+  bool is_materialized(PFS_thread *pfs)
+  {
+    assert(pfs != NULL);
+    if (m_pfs != pfs)
+      return false;
+    if (m_thread_internal_id != pfs->m_thread_internal_id)
+      return false;
+    return true;
+  }
+  /* Return the cached variable at index, or NULL when index is out of range. */
+  const User_variable *get(uint index) const
+  {
+    if (index >= m_array.elements())
+      return NULL;
+
+    const User_variable *p= & m_array.at(index);
+    return p;
+  }
+
+private:
+  PFS_thread *m_pfs;               /* thread the cache was built for, NULL when empty */
+  ulonglong m_thread_internal_id;  /* internal id of that thread, 0 when empty */
+  User_variable_array m_array;     /* the cached variables */
+};
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.USER_VARIABLES_BY_THREAD.
+*/
+struct row_uvar_by_thread
+{
+  /** Column THREAD_ID. */
+  ulonglong m_thread_internal_id;
+  /** Column VARIABLE_NAME. Points into the table's User_variables cache, not a copy. */
+  const PFS_variable_name_row *m_variable_name;
+  /** Column VARIABLE_VALUE. Points into the table's User_variables cache, not a copy. */
+  const PFS_user_variable_value_row *m_variable_value;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.USER_VARIABLES_BY_THREAD.
+  Index 1 on thread (0 based)
+  Index 2 on user variable (0 based)
+*/
+struct pos_uvar_by_thread
+: public PFS_double_index
+{
+  pos_uvar_by_thread()
+    : PFS_double_index(0, 0)
+  {}
+  /** Back to the first thread, first user variable. */
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+  /** Advance to the first user variable of the next thread. */
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.USER_VARIABLES_BY_THREAD. */
+class table_uvar_by_thread : public PFS_engine_table
+{
+  typedef pos_uvar_by_thread pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  /* Scan interface: next row, row at a saved position, rewind. */
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  /* Protected constructor; instances are created through create(). */
+  table_uvar_by_thread();
+
+public:
+  ~table_uvar_by_thread()
+  { m_THD_cache.reset(); }
+
+protected:
+  int materialize(PFS_thread *thread);
+  void make_row(PFS_thread *thread, const User_variable *uvar);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): not defined in table_uvar_by_thread.cc -- appears unused. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current THD user variables. */
+  User_variables m_THD_cache;
+  /** Current row. */
+  row_uvar_by_thread m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_variables_by_thread.cc b/storage/perfschema/table_variables_by_thread.cc
new file mode 100644
index 00000000000..bdf1aaf5227
--- /dev/null
+++ b/storage/perfschema/table_variables_by_thread.cc
@@ -0,0 +1,229 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_variables_by_thread.cc
+  Table VARIABLES_BY_THREAD (implementation).
+*/
+
+#include "my_global.h"
+#include "table_variables_by_thread.h"
+#include "my_thread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_global.h"
+
+THR_LOCK table_variables_by_thread::m_table_lock;
+
+PFS_engine_table_share  /* Share descriptor for table variables_by_thread. */
+table_variables_by_thread::m_share=
+{
+  { C_STRING_WITH_LEN("variables_by_thread") },
+  &pfs_readonly_acl,
+  table_variables_by_thread::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  table_variables_by_thread::get_row_count, /* get_row_count */
+  sizeof(pos_t), /* ref length */
+  &m_table_lock, /* the DDL below must name the same table as the share name above */
+  { C_STRING_WITH_LEN("CREATE TABLE variables_by_thread("
+  "THREAD_ID BIGINT unsigned not null,"
+  "VARIABLE_NAME VARCHAR(64) not null,"
+  "VARIABLE_VALUE VARCHAR(1024))") },
+  true   /* perpetual */
+};
+
+PFS_engine_table*
+table_variables_by_thread::create(void)
+{ /* Factory referenced from m_share: returns a newly allocated table_variables_by_thread. */
+  return new table_variables_by_thread();
+}
+
+ha_rows table_variables_by_thread::get_row_count(void)
+{ /* Returns (thread container row count) x (system variable hash record count). */
+  mysql_mutex_lock(&LOCK_plugin_delete);  /* taken first, released last */
+  mysql_prlock_rdlock(&LOCK_system_variables_hash);  /* read lock around the record count */
+  ulong system_var_count= get_system_variable_hash_records();
+  mysql_prlock_unlock(&LOCK_system_variables_hash);
+  mysql_mutex_unlock(&LOCK_plugin_delete);
+  return (global_thread_container.get_row_count() * system_var_count);
+}
+
+table_variables_by_thread::table_variables_by_thread()
+  : PFS_engine_table(&m_share, &m_pos),  /* the base class receives the share and the current position */
+    m_sysvar_cache(true), m_row_exists(false), m_pos(), m_next_pos(), m_context(NULL)
+{}  /* m_context stays NULL until rnd_init() allocates it. */
+
+void table_variables_by_thread::reset_position(void)
+{ /* Rewind both the current and the next position to (thread 0, variable 0). */
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_variables_by_thread::rnd_init(bool scan)
+{ /* Prepares the system variable cache and a new table context; always returns 0. */
+  /*
+    Build array of SHOW_VARs from system variable hash prior to materializing
+    threads in rnd_next() or rnd_pos().
+  */
+  m_sysvar_cache.initialize_session();
+
+  /* Record the version of the system variable hash. */
+  ulonglong hash_version= m_sysvar_cache.get_sysvar_hash_version();
+  /* NOTE(review): the alloc() result below is not checked before the placement new. */
+  /*
+    The table context holds the current version of the system variable hash and
+    a record of which threads were materialized.
+    If scan == true, then allocate a new context from mem_root and store in TLS.
+    If scan == false, then restore from TLS.
+  */
+  m_context= (table_variables_by_thread_context *)current_thd->alloc(sizeof(table_variables_by_thread_context));
+  new(m_context) table_variables_by_thread_context(hash_version, !scan);
+  return 0;
+}
+
+int table_variables_by_thread::rnd_next(void)
+{
+  /* If system variable hash changes, exit with warning. */ // TODO: Issue warning
+  if (!m_context->versions_match())
+    return HA_ERR_END_OF_FILE;
+
+  bool has_more_thread= true;
+  /* Resume at m_next_pos; move on to the next thread whenever the current one yields no row. */
+  for (m_pos.set_at(&m_next_pos);
+       has_more_thread;
+       m_pos.next_thread())
+  {
+    PFS_thread *pfs_thread= global_thread_container.get(m_pos.m_index_1, &has_more_thread);
+    /* NOTE(review): pfs_thread is passed on without a NULL check -- presumably materialize_session() rejects NULL; confirm. */
+    /* Materialize all variables for the current thread. Assign a dedicated mem_root. */
+    if (m_sysvar_cache.materialize_session(pfs_thread, true) == 0)
+    {
+      /* Mark this thread as materialized. */
+      m_context->set_item(m_pos.m_index_1);
+
+      const System_variable *system_var= m_sysvar_cache.get(m_pos.m_index_2);
+      if (system_var != NULL)
+      {
+        make_row(pfs_thread, system_var);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_variables_by_thread::rnd_pos(const void *pos)
+{
+  /* If system variable hash changes, do nothing. */
+  if (!m_context->versions_match())
+    return HA_ERR_RECORD_DELETED;
+  /* set_position() presumably loads pos into m_pos -- confirm. */
+  set_position(pos);
+  assert(m_pos.m_index_1 < global_thread_container.get_row_count());
+
+  PFS_thread *pfs_thread= global_thread_container.get(m_pos.m_index_1);
+  /*
+    Only materialize threads that were previously materialized by rnd_next().
+    If a thread cannot be rematerialized, then do nothing.
+    Only materialize the requested system variable to avoid repeated
+    materialization of each thread, such as with ORDER BY variable_name.
+  */
+  if (m_context->is_item_set(m_pos.m_index_1) &&
+      /* Materialize only the requested variable. */
+      m_sysvar_cache.materialize_session(pfs_thread, m_pos.m_index_2) == 0)
+  {
+    /* Get the first (and only) element from the cache. */
+    const System_variable *system_var= m_sysvar_cache.get();
+    if (system_var != NULL)
+    {
+      make_row(pfs_thread, system_var);
+      m_next_pos.set_after(&m_pos);  /* note: this also moves the scan position used by rnd_next() */
+      return 0;
+    }
+  }
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_variables_by_thread
+::make_row(PFS_thread *thread, const System_variable *system_var)
+{ /* Build m_row for (thread, system_var); m_row_exists stays false on any early return. */
+  pfs_optimistic_state lock;
+  m_row_exists= false;
+  if (system_var->is_null() || system_var->is_ignored())
+    return;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+  /* The row keeps its own name and value objects, filled through their make_row() calls. */
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+  m_row.m_variable_name.make_row(system_var->m_name, system_var->m_name_length);
+  m_row.m_variable_value.make_row(system_var);
+
+  if (!thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_variables_by_thread
+::read_row_values(TABLE *table,
+                  unsigned char *buf,
+                  Field **fields,
+                  bool read_all)
+{ /* Store the current row (m_row) into the requested fields. Returns 0, or HA_ERR_RECORD_DELETED if there is no row. */
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  assert(table->s->null_bytes == 1);
+  buf[0]= 0;
+  /* fields is a NULL-terminated array; a field is filled if read_all is set or it is in the read set. */
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulonglong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* VARIABLE_NAME */
+        set_field_varchar_utf8(f, m_row.m_variable_name.m_str, m_row.m_variable_name.m_length);
+        break;
+      case 2: /* VARIABLE_VALUE */
+        m_row.m_variable_value.set_field(f);
+        break;
+      default:
+        assert(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_variables_by_thread.h b/storage/perfschema/table_variables_by_thread.h
new file mode 100644
index 00000000000..5143067adbb
--- /dev/null
+++ b/storage/perfschema/table_variables_by_thread.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_VARIABLES_BY_THREAD_H
+#define TABLE_VARIABLES_BY_THREAD_H
+
+/**
+  @file storage/perfschema/table_variables_by_thread.h
+  Table VARIABLES_BY_THREAD (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+#include "pfs_variable.h"
+#include "pfs_buffer_container.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.VARIABLES_BY_THREAD.
+*/
+struct row_variables_by_thread  /* Name and value are held by value in the row. */
+{
+  /** Column THREAD_ID. */
+  ulonglong m_thread_internal_id;
+  /** Column VARIABLE_NAME. */
+  PFS_variable_name_row m_variable_name;
+  /** Column VARIABLE_VALUE. */
+  PFS_variable_value_row m_variable_value;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.VARIABLES_BY_THREAD.
+  Index 1 on thread (0 based)
+  Index 2 on system variable (0 based)
+*/
+struct pos_variables_by_thread
+: public PFS_double_index
+{
+  pos_variables_by_thread()
+    : PFS_double_index(0, 0)
+  {}
+  /** Back to the first thread, first system variable. */
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+  /** True while the thread index is below the thread container row count. Not called from table_variables_by_thread.cc. */
+  inline bool has_more_thread(void)
+  { return (m_index_1 < global_thread_container.get_row_count()); }
+  /** Advance to the first system variable of the next thread. */
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Store and retrieve table state information during queries that reinstantiate
+  the table object.
+*/
+class table_variables_by_thread_context : public PFS_table_context
+{ /* Adds no state: the constructor forwards to PFS_table_context with the thread container row count and THR_PFS_VBT. */
+public:
+  table_variables_by_thread_context(ulonglong hash_version, bool restore) :
+    PFS_table_context(hash_version, global_thread_container.get_row_count(), restore, THR_PFS_VBT) { }
+};
+
+/** Table PERFORMANCE_SCHEMA.VARIABLES_BY_THREAD. */
+class table_variables_by_thread : public PFS_engine_table
+{
+  typedef pos_variables_by_thread pos_t;
+
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static ha_rows get_row_count();
+  /* Scan interface: start a scan, next row, row at a saved position, rewind. */
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+  table_variables_by_thread();
+  /* The constructor is protected; instances are created through create(). */
+public:
+  ~table_variables_by_thread()
+  {}
+  /* NOTE(review): materialize() below has no definition in table_variables_by_thread.cc -- appears unused. */
+protected:
+  int materialize(PFS_thread *thread);
+  void make_row(PFS_thread *thread, const System_variable *system_var);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. NOTE(review): not defined in table_variables_by_thread.cc -- appears unused. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current THD variables. */
+  PFS_system_variable_cache m_sysvar_cache;
+  /** Current row. */
+  row_variables_by_thread m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_t m_pos;
+  /** Next position. */
+  pos_t m_next_pos;
+
+  /** Table context with system variable hash version and map of materialized threads. */
+  table_variables_by_thread_context *m_context;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/unittest/CMakeLists.txt b/storage/perfschema/unittest/CMakeLists.txt
index 7b8c906a4e6..712f22242c3 100644
--- a/storage/perfschema/unittest/CMakeLists.txt
+++ b/storage/perfschema/unittest/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2009, 2021, Oracle and/or its affiliates.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License, version 2.0,
@@ -35,5 +35,6 @@ ADD_CONVENIENCE_LIBRARY(pfs_server_stubs pfs_server_stubs.cc)
 ADD_DEPENDENCIES(pfs_server_stubs GenError)
 
 MY_ADD_TESTS(pfs_instr_class pfs_instr_class-oom pfs_instr pfs_instr-oom
-             pfs_account-oom pfs_host-oom pfs_timer pfs_user-oom pfs pfs_misc
+             pfs_account-oom pfs_host-oom pfs_timer pfs_user-oom pfs_noop pfs
+             pfs_misc
   EXT "cc" LINK_LIBRARIES perfschema mysys pfs_server_stubs)
diff --git a/storage/perfschema/unittest/conf.txt b/storage/perfschema/unittest/conf.txt
index f84ba3317b1..db9cf980c33 100644
--- a/storage/perfschema/unittest/conf.txt
+++ b/storage/perfschema/unittest/conf.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2009, 2010, Oracle and/or its affiliates.
+# Copyright (c) 2009, 2021, Oracle and/or its affiliates.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/unittest/pfs-t.cc b/storage/perfschema/unittest/pfs-t.cc
index f5f38367691..2240c2917e8 100644
--- a/storage/perfschema/unittest/pfs-t.cc
+++ b/storage/perfschema/unittest/pfs-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,11 +21,12 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_server.h>
 #include <pfs_instr_class.h>
 #include <pfs_instr.h>
 #include <pfs_global.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
 #include <string.h>
@@ -33,6 +34,7 @@
 
 #include "stub_print_error.h"
 #include "stub_pfs_defaults.h"
+#include "stub_global_status_var.h"
 
 void unload_performance_schema();
 
@@ -48,30 +50,30 @@ void setup_thread(PSI_thread *t, bool enabled)
 
 PFS_file* lookup_file_by_name(const char* name)
 {
-  uint i;
   PFS_file *pfs;
   size_t len= strlen(name);
   size_t dirlen;
   const char *filename;
-  size_t filename_length;;
+  size_t filename_length;
 
-  for (i= 0; i < file_max; i++)
+  PFS_file_iterator it= global_file_container.iterate();
+  pfs= it.scan_next();
+
+  while (pfs != NULL)
   {
-    pfs= & file_array[i];
-    if (pfs->m_lock.is_populated())
-    {
-      /*
-        When a file "foo" is instrumented, the name is normalized
-        to "/path/to/current/directory/foo", so we remove the
-        directory name here to find it back.
-      */
-      dirlen= dirname_length(pfs->m_filename);
-      filename= pfs->m_filename + dirlen;
-      filename_length= pfs->m_filename_length - dirlen;
-      if ((len == filename_length) &&
-          (strncmp(name, filename, filename_length) == 0))
-        return pfs;
-    }
+    /*
+      When a file "foo" is instrumented, the name is normalized
+      to "/path/to/current/directory/foo", so we remove the
+      directory name here to find it back.
+    */
+    dirlen= dirname_length(pfs->m_filename);
+    filename= pfs->m_filename + dirlen;
+    filename_length= pfs->m_filename_length - dirlen;
+    if ((len == filename_length) &&
+        (strncmp(name, filename, filename_length) == 0))
+      return pfs;
+
+    pfs= it.scan_next();
   }
 
   return NULL;
@@ -118,10 +120,24 @@ void test_bootstrap()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
   param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
+  param.m_program_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 0;
+  param.m_metadata_lock_sizing= 0;
   param.m_max_digest_length= 0;
+  param.m_max_sql_text_length= 0;
+
+  param.m_hints.m_table_definition_cache = 100;
+  param.m_hints.m_table_open_cache       = 100;
+  param.m_hints.m_max_connections        = 100;
+  param.m_hints.m_open_files_limit       = 100;
+  param.m_hints.m_max_prepared_stmt_count= 100;
 
+  pre_initialize_performance_schema();
   boot= initialize_performance_schema(& param);
   ok(boot != NULL, "boot");
   ok(boot->get_interface != NULL, "boot->get_interface");
@@ -143,7 +159,7 @@ void test_bootstrap()
 */
 PSI * load_perfschema()
 {
-  void *psi;
+  PSI *psi;
   PSI_bootstrap *boot;
   PFS_global_param param;
 
@@ -177,13 +193,27 @@ PSI * load_perfschema()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
   param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
+  param.m_program_sizing= 0;
+  param.m_statement_stack_sizing= 10;
+  param.m_memory_class_sizing= 10;
+  param.m_metadata_lock_sizing= 10;
   param.m_max_digest_length= 0;
+  param.m_max_sql_text_length= 1000;
+
+  param.m_hints.m_table_definition_cache = 100;
+  param.m_hints.m_table_open_cache       = 100;
+  param.m_hints.m_max_connections        = 100;
+  param.m_hints.m_open_files_limit       = 100;
+  param.m_hints.m_max_prepared_stmt_count= 100;
 
+  pre_initialize_performance_schema();
   /* test_bootstrap() covered this, assuming it just works */
   boot= initialize_performance_schema(& param);
-  psi= boot->get_interface(PSI_VERSION_1);
+  psi= (PSI *)boot->get_interface(PSI_VERSION_1);
 
   /* Reset every consumer to a known state */
   flag_global_instrumentation= true;
@@ -335,6 +365,20 @@ void test_bad_registration()
   ok(dummy_rwlock_key == 0, "zero key");
 
   dummy_rwlock_key= 9999;
+  PSI_rwlock_info bad_rwlock_2_sx[]=
+  {
+    { & dummy_rwlock_key,
+      /* 109 chars name */
+      "12345678901234567890123456789012345678901234567890"
+      "12345678901234567890123456789012345678901234567890"
+      "123456789",
+      PSI_RWLOCK_FLAG_SX}
+  };
+
+  psi->register_rwlock("Y", bad_rwlock_2_sx, 1);
+  ok(dummy_rwlock_key == 0, "zero key SX");
+
+  dummy_rwlock_key= 9999;
   PSI_rwlock_info bad_rwlock_3[]=
   {
     { & dummy_rwlock_key,
@@ -351,6 +395,23 @@ void test_bad_registration()
   psi->register_rwlock("X", bad_rwlock_3, 1);
   ok(dummy_rwlock_key == 2, "assigned key");
 
+  dummy_rwlock_key= 9999;
+  PSI_rwlock_info bad_rwlock_3_sx[]=
+  {
+    { & dummy_rwlock_key,
+      /* 108 chars name */
+      "12345678901234567890123456789012345678901234567890"
+      "12345678901234567890123456789012345678901234567890"
+      "12345678",
+      PSI_RWLOCK_FLAG_SX}
+  };
+
+  psi->register_rwlock("YY", bad_rwlock_3_sx, 1);
+  ok(dummy_rwlock_key == 0, "zero key SX");
+
+  psi->register_rwlock("Y", bad_rwlock_3_sx, 1);
+  ok(dummy_rwlock_key == 3, "assigned key SX");
+
   /*
     Test that length('wait/synch/cond/' (16) + category + '/' (1)) < 32
     --> category can be up to 14 chars for a cond.
@@ -698,23 +759,27 @@ void test_init_disabled()
   socket_class_A= find_socket_class(socket_key_A);
   ok(socket_class_A != NULL, "socket class A");
 
-  /* Pretend thread T-1 is running, and disabled, with thread_instrumentation */
+  /*
+    Pretend thread T-1 is running, and disabled, with thread_instrumentation.
+    Disabled instruments are still created so they can be enabled later.
+  */
+
   /* ------------------------------------------------------------------------ */
 
   psi->set_thread(thread_1);
   setup_thread(thread_1, false);
 
-  /* disabled M-A + disabled T-1: no instrumentation */
+  /* disabled M-A + disabled T-1: instrumentation */
 
   mutex_class_A->m_enabled= false;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "mutex_A1 not instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 disabled, instrumented");
 
   /* enabled M-A + disabled T-1: instrumentation (for later) */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 != NULL, "mutex_A1 instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 enabled, instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
@@ -728,13 +793,13 @@ void test_init_disabled()
 
   rwlock_class_A->m_enabled= false;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "rwlock_A1 not instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 disabled, instrumented");
 
   /* enabled RW-A + disabled T-1: instrumentation (for later) */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 != NULL, "rwlock_A1 instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 enabled, instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
@@ -748,13 +813,13 @@ void test_init_disabled()
 
   cond_class_A->m_enabled= false;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "cond_A1 not instrumented");
+  ok(cond_A1 != NULL, "cond_A1 disabled, instrumented");
 
   /* enabled C-A + disabled T-1: instrumentation (for later) */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 != NULL, "cond_A1 instrumented");
+  ok(cond_A1 != NULL, "cond_A1 enabled, instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
@@ -783,22 +848,22 @@ void test_init_disabled()
   file_class_A->m_enabled= true;
   psi->create_file(0, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
   psi->create_file(99, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
 
   /* disabled S-A + disabled T-1: no instrumentation */
 
   socket_class_A->m_enabled= false;
   socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0);
-  ok(socket_A1 == NULL, "socket_A1 not instrumented");
+  ok(socket_A1 != NULL, "socket_A1 disabled, instrumented");
 
   /* enabled S-A + disabled T-1: instrumentation (for later) */
 
   socket_class_A->m_enabled= true;
   socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0);
-  ok(socket_A1 != NULL, "socket_A1 instrumented");
+  ok(socket_A1 != NULL, "socket_A1 enabled, instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
@@ -807,7 +872,7 @@ void test_init_disabled()
   ok(socket_A1 == NULL, "socket key 0 not instrumented");
   socket_A1= psi->init_socket(99, NULL, NULL, 0);
   ok(socket_A1 == NULL, "broken socket key not instrumented");
-  
+
   /* Pretend thread T-1 is enabled */
   /* ----------------------------- */
 
@@ -817,85 +882,85 @@ void test_init_disabled()
 
   mutex_class_A->m_enabled= false;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 disabled, instrumented");
 
   /* enabled M-A + enabled T-1: instrumentation */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 != NULL, "instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 enabled, instrumented");
   psi->destroy_mutex(mutex_A1);
 
   /* broken key + enabled T-1: no instrumentation */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(0, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "mutex_A1 not instrumented");
   mutex_A1= psi->init_mutex(99, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "mutex_A1 not instrumented");
 
   /* disabled RW-A + enabled T-1: no instrumentation */
 
   rwlock_class_A->m_enabled= false;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 disabled, instrumented");
 
   /* enabled RW-A + enabled T-1: instrumentation */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 != NULL, "instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 enabled, instrumented");
   psi->destroy_rwlock(rwlock_A1);
 
   /* broken key + enabled T-1: no instrumentation */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(0, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "rwlock_A1 not instrumented");
   rwlock_A1= psi->init_rwlock(99, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "rwlock_A1 not instrumented");
 
   /* disabled C-A + enabled T-1: no instrumentation */
 
   cond_class_A->m_enabled= false;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 != NULL, "cond_A1 disabled, instrumented");
 
   /* enabled C-A + enabled T-1: instrumentation */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 != NULL, "instrumented");
+  ok(cond_A1 != NULL, "cond_A1 enabled, instrumented");
   psi->destroy_cond(cond_A1);
 
   /* broken key + enabled T-1: no instrumentation */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(0, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "cond_A1 not instrumented");
   cond_A1= psi->init_cond(99, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "cond_A1 not instrumented");
 
   /* disabled F-A + enabled T-1: no instrumentation */
 
   file_class_A->m_enabled= false;
   psi->create_file(file_key_A, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
 
   /* enabled F-A + open failed + enabled T-1: no instrumentation */
 
   file_class_A->m_enabled= true;
   psi->create_file(file_key_A, "foo", (File) -1);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
 
   /* enabled F-A + out-of-descriptors + enabled T-1: no instrumentation */
 
   file_class_A->m_enabled= true;
   psi->create_file(file_key_A, "foo", (File) 65000);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
   ok(file_handle_lost == 1, "lost a file handle");
   file_handle_lost= 0;
 
@@ -920,22 +985,22 @@ void test_init_disabled()
   /* disabled S-A + enabled T-1: no instrumentation */
 
   socket_class_A->m_enabled= false;
-  ok(socket_A1 == NULL, "not instrumented");
+  ok(socket_A1 == NULL, "socket_A1 not instrumented");
 
   /* enabled S-A + enabled T-1: instrumentation */
 
   socket_class_A->m_enabled= true;
   socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0);
-  ok(socket_A1 != NULL, "instrumented");
+  ok(socket_A1 != NULL, "socket_A1 instrumented");
   psi->destroy_socket(socket_A1);
 
   /* broken key + enabled T-1: no instrumentation */
 
   socket_class_A->m_enabled= true;
   socket_A1= psi->init_socket(0, NULL, NULL, 0);
-  ok(socket_A1 == NULL, "not instrumented");
+  ok(socket_A1 == NULL, "socket_A1 not instrumented");
   socket_A1= psi->init_socket(99, NULL, NULL, 0);
-  ok(socket_A1 == NULL, "not instrumented");
+  ok(socket_A1 == NULL, "socket_A1 not instrumented");
 
   /* Pretend the running thread is not instrumented */
   /* ---------------------------------------------- */
@@ -946,13 +1011,13 @@ void test_init_disabled()
 
   mutex_class_A->m_enabled= false;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "mutex_A1 not instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 disabled, instrumented");
 
   /* enabled M-A + unknown thread: instrumentation (for later) */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 != NULL, "mutex_A1 instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 enabled, instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
@@ -966,13 +1031,13 @@ void test_init_disabled()
 
   rwlock_class_A->m_enabled= false;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "rwlock_A1 not instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 disabled, instrumented");
 
   /* enabled RW-A + unknown thread: instrumentation (for later) */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 != NULL, "rwlock_A1 instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 enabled, instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
@@ -986,13 +1051,13 @@ void test_init_disabled()
 
   cond_class_A->m_enabled= false;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "cond_A1 not instrumented");
+  ok(cond_A1 != NULL, "cond_A1 disabled, instrumented");
 
   /* enabled C-A + unknown thread: instrumentation (for later) */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 != NULL, "cond_A1 instrumented");
+  ok(cond_A1 != NULL, "cond_A1 enabled, instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
@@ -1007,14 +1072,14 @@ void test_init_disabled()
   file_class_A->m_enabled= false;
   psi->create_file(file_key_A, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
 
   /* enabled F-A + unknown thread: no instrumentation */
 
   file_class_A->m_enabled= true;
   psi->create_file(file_key_A, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file_A1 not instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
@@ -1030,13 +1095,13 @@ void test_init_disabled()
 
   socket_class_A->m_enabled= false;
   socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0);
-  ok(socket_A1 == NULL, "socket_A1 not instrumented");
+  ok(socket_A1 != NULL, "socket_A1 disabled, instrumented");
 
   /* enabled S-A + unknown thread: instrumentation (for later) */
 
   socket_class_A->m_enabled= true;
   socket_A1= psi->init_socket(socket_key_A, NULL, NULL, 0);
-  ok(socket_A1 != NULL, "socket_A1 instrumented");
+  ok(socket_A1 != NULL, "socket_A1 enabled, instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
@@ -1521,7 +1586,7 @@ void test_event_name_index()
   memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
 
-  /* NOTE: Need to add 3 to each index: table io, table lock, idle */
+  /* NOTE: Need to add 4 to each index: table io, table lock, idle, metadata lock */
 
   /* Per mutex info waits should be at [0..9] */
   param.m_mutex_class_sizing= 10;
@@ -1546,9 +1611,16 @@ void test_event_name_index()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
   param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
+  param.m_program_sizing= 0;
+  param.m_statement_stack_sizing= 10;
+  param.m_memory_class_sizing= 12;
+  param.m_metadata_lock_sizing= 10;
   param.m_max_digest_length= 0;
+  param.m_max_sql_text_length= 1000;
 
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
@@ -1563,6 +1635,13 @@ void test_event_name_index()
   param.m_setup_actor_sizing= 0;
   param.m_setup_object_sizing= 0;
 
+  param.m_hints.m_table_definition_cache = 100;
+  param.m_hints.m_table_open_cache       = 100;
+  param.m_hints.m_max_connections        = 100;
+  param.m_hints.m_open_files_limit       = 100;
+  param.m_hints.m_max_prepared_stmt_count= 100;
+
+  pre_initialize_performance_schema();
   boot= initialize_performance_schema(& param);
   ok(boot != NULL, "bootstrap");
   psi= (PSI*) boot->get_interface(PSI_VERSION_1);
@@ -1580,10 +1659,10 @@ void test_event_name_index()
   psi->register_mutex("X", dummy_mutexes, 2);
   mutex_class= find_mutex_class(dummy_mutex_key_1);
   ok(mutex_class != NULL, "mutex class 1");
-  ok(mutex_class->m_event_name_index == 3, "index 3");
+  ok(mutex_class->m_event_name_index == 4, "index 4");
   mutex_class= find_mutex_class(dummy_mutex_key_2);
   ok(mutex_class != NULL, "mutex class 2");
-  ok(mutex_class->m_event_name_index == 4, "index 4");
+  ok(mutex_class->m_event_name_index == 5, "index 5");
 
   PFS_rwlock_class *rwlock_class;
   PSI_rwlock_key dummy_rwlock_key_1;
@@ -1597,10 +1676,10 @@ void test_event_name_index()
   psi->register_rwlock("X", dummy_rwlocks, 2);
   rwlock_class= find_rwlock_class(dummy_rwlock_key_1);
   ok(rwlock_class != NULL, "rwlock class 1");
-  ok(rwlock_class->m_event_name_index == 13, "index 13");
+  ok(rwlock_class->m_event_name_index == 14, "index 14");
   rwlock_class= find_rwlock_class(dummy_rwlock_key_2);
   ok(rwlock_class != NULL, "rwlock class 2");
-  ok(rwlock_class->m_event_name_index == 14, "index 14");
+  ok(rwlock_class->m_event_name_index == 15, "index 15");
 
   PFS_cond_class *cond_class;
   PSI_cond_key dummy_cond_key_1;
@@ -1614,10 +1693,10 @@ void test_event_name_index()
   psi->register_cond("X", dummy_conds, 2);
   cond_class= find_cond_class(dummy_cond_key_1);
   ok(cond_class != NULL, "cond class 1");
-  ok(cond_class->m_event_name_index == 33, "index 33");
+  ok(cond_class->m_event_name_index == 34, "index 34");
   cond_class= find_cond_class(dummy_cond_key_2);
   ok(cond_class != NULL, "cond class 2");
-  ok(cond_class->m_event_name_index == 34, "index 34");
+  ok(cond_class->m_event_name_index == 35, "index 35");
 
   PFS_file_class *file_class;
   PSI_file_key dummy_file_key_1;
@@ -1631,10 +1710,10 @@ void test_event_name_index()
   psi->register_file("X", dummy_files, 2);
   file_class= find_file_class(dummy_file_key_1);
   ok(file_class != NULL, "file class 1");
-  ok(file_class->m_event_name_index == 73, "index 73");
+  ok(file_class->m_event_name_index == 74, "index 74");
   file_class= find_file_class(dummy_file_key_2);
   ok(file_class != NULL, "file class 2");
-  ok(file_class->m_event_name_index == 74, "index 74");
+  ok(file_class->m_event_name_index == 75, "index 75");
 
   PFS_socket_class *socket_class;
   PSI_socket_key dummy_socket_key_1;
@@ -1648,14 +1727,162 @@ void test_event_name_index()
   psi->register_socket("X", dummy_sockets, 2);
   socket_class= find_socket_class(dummy_socket_key_1);
   ok(socket_class != NULL, "socket class 1");
-  ok(socket_class->m_event_name_index == 153, "index 153");
+  ok(socket_class->m_event_name_index == 154, "index 154");
   socket_class= find_socket_class(dummy_socket_key_2);
   ok(socket_class != NULL, "socket class 2");
-  ok(socket_class->m_event_name_index == 154, "index 154");
+  ok(socket_class->m_event_name_index == 155, "index 155");
 
   ok(global_table_io_class.m_event_name_index == 0, "index 0");
   ok(global_table_lock_class.m_event_name_index == 1, "index 1");
-  ok(wait_class_max= 313, "313 event names"); // 3 global classes
+  ok(wait_class_max == 314, "314 event names"); // 4 global classes; compare, do not assign
+
+  shutdown_performance_schema();
+}
+
+void test_memory_instruments()
+{
+  PSI *psi;
+  PSI_thread *owner; /* passed by address to memory_alloc/memory_realloc, then by value to memory_free */
+  /* Drives the PSI memory_alloc / memory_realloc / memory_free calls for one registered memory key. */
+  diag("test_memory_instruments");
+
+  psi= load_perfschema();
+
+  PSI_memory_key memory_key_A;
+  PSI_memory_info all_memory[]=
+  {
+    { & memory_key_A, "M-A", 0}
+  };
+
+  PSI_thread_key thread_key_1;
+  PSI_thread_info all_thread[]=
+  {
+    { & thread_key_1, "T-1", 0}
+  };
+
+  psi->register_memory("test", all_memory, 1);
+  psi->register_thread("test", all_thread, 1);
+
+  PFS_memory_class *memory_class_A;
+  PSI_thread *thread_1;
+  PSI_memory_key key;
+
+  /* Preparation: create thread T-1 and look up the class registered for M-A */
+
+  thread_1= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread_1 != NULL, "T-1");
+  psi->set_thread_id(thread_1, 1);
+
+  memory_class_A= find_memory_class(memory_key_A);
+  ok(memory_class_A != NULL, "memory info A");
+
+  /* Pretend thread T-1 is running, and enabled */
+  /* ------------------------------------------ */
+
+  psi->set_thread(thread_1);
+  setup_thread(thread_1, true);
+
+  /* Enable all instruments (only M-A is registered in this test) */
+
+  memory_class_A->m_enabled= true;
+
+  /* For coverage: only the returned key is checked; the collected stats would still need to be printed. */
+
+  key= psi->memory_alloc(memory_key_A, 100, & owner);
+  ok(key == memory_key_A, "alloc memory info A");
+  key= psi->memory_realloc(memory_key_A, 100, 200, & owner);
+  ok(key == memory_key_A, "realloc memory info A");
+  key= psi->memory_realloc(memory_key_A, 200, 300, & owner);
+  ok(key == memory_key_A, "realloc up memory info A");
+  key= psi->memory_realloc(memory_key_A, 300, 50, & owner);
+  ok(key == memory_key_A, "realloc down memory info A");
+  psi->memory_free(memory_key_A, 50, owner);
+
+  /* Use global instrumentation only */
+  /* ------------------------------- */
+
+  flag_thread_instrumentation= false; /* same call sequence as above, with this flag cleared */
+
+  key= psi->memory_alloc(memory_key_A, 100, & owner);
+  ok(key == memory_key_A, "alloc memory info A");
+  key= psi->memory_realloc(memory_key_A, 100, 200, & owner);
+  ok(key == memory_key_A, "realloc memory info A");
+  key= psi->memory_realloc(memory_key_A, 200, 300, & owner);
+  ok(key == memory_key_A, "realloc up memory info A");
+  key= psi->memory_realloc(memory_key_A, 300, 50, & owner);
+  ok(key == memory_key_A, "realloc down memory info A");
+  psi->memory_free(memory_key_A, 50, owner);
+
+  /* Garbage, for robustness: key 9999 is never registered here, then PSI_NOT_INSTRUMENTED */
+  /* ----------------------- */
+
+  key= psi->memory_alloc(9999, 100, & owner);
+  ok(key == PSI_NOT_INSTRUMENTED, "alloc with unknown key");
+  key= psi->memory_realloc(PSI_NOT_INSTRUMENTED, 100, 200, & owner);
+  ok(key == PSI_NOT_INSTRUMENTED, "realloc with unknown key");
+  psi->memory_free(PSI_NOT_INSTRUMENTED, 200, owner);
+
+  shutdown_performance_schema();
+}
+
+void test_leaks()
+{
+  PSI_bootstrap *boot;
+  PFS_global_param param;
+
+  /* Allocate everything, to make sure cleanup does not forget anything. */
+  /* The struct is first filled with 0xFF bytes; any field not assigned below keeps that pattern. */
+  memset(& param, 0xFF, sizeof(param));
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 10;
+  param.m_rwlock_class_sizing= 10;
+  param.m_cond_class_sizing= 10;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 10;
+  param.m_file_class_sizing= 10;
+  param.m_socket_class_sizing= 10;
+  param.m_mutex_sizing= 1000;
+  param.m_rwlock_sizing= 1000;
+  param.m_cond_sizing= 1000;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 1000;
+  param.m_file_sizing= 1000;
+  param.m_file_handle_sizing= 1000;
+  param.m_socket_sizing= 1000;
+  param.m_events_waits_history_sizing= 10;
+  param.m_events_waits_history_long_sizing= 1000;
+  param.m_setup_actor_sizing= 1000;
+  param.m_setup_object_sizing= 1000;
+  param.m_host_sizing= 1000;
+  param.m_user_sizing= 1000;
+  param.m_account_sizing= 1000;
+  param.m_stage_class_sizing= 10;
+  param.m_events_stages_history_sizing= 10;
+  param.m_events_stages_history_long_sizing= 1000;
+  param.m_statement_class_sizing= 10;
+  param.m_events_statements_history_sizing= 10;
+  param.m_events_statements_history_long_sizing= 1000;
+  param.m_session_connect_attrs_sizing= 1000;
+  param.m_memory_class_sizing= 10;
+  param.m_metadata_lock_sizing= 1000;
+  param.m_digest_sizing= 1000;
+  param.m_program_sizing= 1000;
+  param.m_statement_stack_sizing= 10;
+  param.m_max_digest_length= 1000;
+  param.m_max_sql_text_length= 1000;
+
+  param.m_hints.m_table_definition_cache = 100;
+  param.m_hints.m_table_open_cache       = 100;
+  param.m_hints.m_max_connections        = 100;
+  param.m_hints.m_open_files_limit       = 100;
+  param.m_hints.m_max_prepared_stmt_count= 100;
+
+  pre_initialize_performance_schema();
+  boot= initialize_performance_schema(& param);
+  ok(boot != NULL, "bootstrap");
+  shutdown_performance_schema(); /* called right after a successful bootstrap; nothing is instrumented in between */
+
+  /* Leaks will be reported with valgrind; the only assertion made here is that bootstrap succeeds. */
 }
 
 void do_all_tests()
@@ -1668,11 +1895,13 @@ void do_all_tests()
   test_locker_disabled();
   test_file_instrumentation_leak();
   test_event_name_index();
+  test_memory_instruments();
+  test_leaks();
 }
 
 int main(int argc, char **argv)
 {
-  plan(216);
+  plan(232);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
diff --git a/storage/perfschema/unittest/pfs_account-oom-t.cc b/storage/perfschema/unittest/pfs_account-oom-t.cc
index 1ca66445e0c..ea39903484c 100644
--- a/storage/perfschema/unittest/pfs_account-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_account-oom-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,87 +21,142 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
+#include <pfs_defaults.h>
+#include <pfs_user.h>
+#include <pfs_host.h>
 #include <pfs_account.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
 #include "stub_pfs_global.h"
+#include "stub_global_status_var.h"
 
 #include <string.h> /* memset */
 
-void test_oom()
-{
-  int rc;
-  PFS_global_param param;
-
-  memset(& param, 0xFF, sizeof(param));
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 10;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 1000;
-  param.m_stage_class_sizing= 50;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 50;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  /* Setup */
+PFS_thread pfs_thread; /* Thread record handed to find_or_create_account() by test_oom(). */
 
+void initialize_performance_schema_helper(PFS_global_param *param)
+{ /* Bring up the performance schema by calling the individual init_* steps directly. */
   stub_alloc_always_fails= false;
   stub_alloc_fails_after_count= 1000;
 
-  init_event_name_sizing(& param);
-  rc= init_stage_class(param.m_stage_class_sizing);
-  ok(rc == 0, "init stage class");
-  rc= init_statement_class(param.m_statement_class_sizing);
-  ok(rc == 0, "init statement class");
+  param->m_enabled= true; /* m_enabled and the three sizings below are forced here, overriding whatever the caller set. */
+  param->m_thread_class_sizing= 10;
+  param->m_thread_sizing= 1000;
+  param->m_account_sizing= 1000;
+  transaction_class_max= 0; /* test_oom() raises this after the call for its transactions scenario. */
+  /* pfs_thread is a file-level object; only its account hash pins are reset here. */
+  pfs_thread.m_account_hash_pins= NULL;
+  /* Any return codes from the init_* calls below are ignored. */
+  init_event_name_sizing(param);
+  init_sync_class(param->m_mutex_class_sizing, param->m_rwlock_class_sizing, param->m_cond_class_sizing);
+  init_thread_class(param->m_thread_class_sizing);
+  init_table_share(param->m_table_share_sizing);
+  init_table_share_lock_stat(param->m_table_lock_stat_sizing);
+  init_table_share_index_stat(param->m_index_stat_sizing);
+  init_file_class(param->m_file_class_sizing);
+  init_stage_class(param->m_stage_class_sizing);
+  init_statement_class(param->m_statement_class_sizing);
+  init_socket_class(param->m_socket_class_sizing);
+  init_memory_class(param->m_memory_class_sizing);
+  init_instruments(param);
+  init_events_waits_history_long(param->m_events_waits_history_long_sizing);
+  init_events_stages_history_long(param->m_events_stages_history_long_sizing);
+  init_events_statements_history_long(param->m_events_statements_history_long_sizing);
+  init_events_transactions_history_long(param->m_events_transactions_history_long_sizing);
+  init_file_hash(param);
+  init_table_share_hash(param);
+  init_setup_actor(param);
+  init_setup_actor_hash(param);
+  init_setup_object(param);
+  init_setup_object_hash(param);
+  init_host(param);
+  init_host_hash(param);
+  init_user(param);
+  init_user_hash(param);
+  init_account(param);
+  init_account_hash(param);
+  init_digest(param);
+  init_digest_hash(param);
+  init_program(param);
+  init_program_hash(param);
+  init_prepared_stmt(param);
+  pfs_initialized= true; /* Set last, once every init_* step above has been called. */
+}
+
+void test_oom()
+{ /* Six scenarios: each re-runs the helper, sets stub_alloc_fails_after_count, and expects find_or_create_account() to return NULL with m_lost == 1. */
+  PFS_global_param param;
+  PFS_account *pfs_account;
+  const char *username= "username";
+  const char *hostname= "hostname";
 
-  /* Tests */
+  uint user_len= (uint)strlen(username);
+  uint host_len= (uint)strlen(hostname);
 
+  /* Account: param left zeroed apart from what the helper forces; stub count 1. */
+  memset(&param, 0, sizeof(param));
+  initialize_performance_schema_helper(&param);
   stub_alloc_fails_after_count= 1;
-  rc= init_account(& param);
-  ok(rc == 1, "oom (account)");
-  cleanup_account();
-
+  pfs_account= find_or_create_account(&pfs_thread, username, user_len, hostname, host_len);
+  ok(pfs_account == NULL, "oom (account)");
+  ok(global_account_container.m_lost == 1, "lost (account)");
+  shutdown_performance_schema();
+
+  /* Account waits: m_mutex_class_sizing= 10, stub count 2. */
+  memset(&param, 0, sizeof(param));
+  param.m_mutex_class_sizing= 10;
+  initialize_performance_schema_helper(&param);
   stub_alloc_fails_after_count= 2;
-  rc= init_account(& param);
-  ok(rc == 1, "oom (account waits)");
-  cleanup_account();
+  pfs_account= find_or_create_account(&pfs_thread, username, user_len, hostname, host_len);
+  ok(pfs_account == NULL, "oom (account waits)");
+  ok(global_account_container.m_lost == 1, "lost (account waits)");
+  shutdown_performance_schema();
 
-  stub_alloc_fails_after_count= 3;
-  rc= init_account(& param);
-  ok(rc == 1, "oom (account stages)");
-  cleanup_account();
 
-  stub_alloc_fails_after_count= 4;
-  rc= init_account(& param);
-  ok(rc == 1, "oom (account statements)");
-  cleanup_account();
-
-  cleanup_statement_class();
-  cleanup_stage_class();
+  /* Account stages: m_stage_class_sizing= 10, stub count 3. */
+  memset(&param, 0, sizeof(param));
+  param.m_stage_class_sizing= 10;
+  initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  pfs_account= find_or_create_account(&pfs_thread, username, user_len, hostname, host_len);
+  ok(pfs_account == NULL, "oom (account stages)");
+  ok(global_account_container.m_lost == 1, "lost (account stages)");
+  shutdown_performance_schema();
+
+  /* Account statements: m_statement_class_sizing= 10, stub count 3. */
+  memset(&param, 0, sizeof(param));
+  param.m_statement_class_sizing= 10;
+  initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  pfs_account= find_or_create_account(&pfs_thread, username, user_len, hostname, host_len);
+  ok(pfs_account == NULL, "oom (account statements)");
+  ok(global_account_container.m_lost == 1, "lost (account statements)");
+  shutdown_performance_schema();
+
+  /* Account transactions: transaction_class_max is set after the helper, which resets it to 0; stub count 3. */
+  memset(&param, 0, sizeof(param));
+  initialize_performance_schema_helper(&param);
+  transaction_class_max= 1;
+  stub_alloc_fails_after_count= 3;
+  pfs_account= find_or_create_account(&pfs_thread, username, user_len, hostname, host_len);
+  ok(pfs_account == NULL, "oom (account transactions)");
+  ok(global_account_container.m_lost == 1, "lost (account transactions)");
+  shutdown_performance_schema();
+
+  /* Account memory: m_memory_class_sizing= 10, stub count 3. */
+  memset(&param, 0, sizeof(param));
+  param.m_memory_class_sizing= 10;
+  initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  pfs_account= find_or_create_account(&pfs_thread, username, user_len, hostname, host_len);
+  ok(pfs_account == NULL, "oom (account memory)");
+  ok(global_account_container.m_lost == 1, "lost (account memory)");
+  shutdown_performance_schema();
 }
 
 void do_all_tests()
@@ -111,10 +166,9 @@ void do_all_tests()
 
 int main(int, char **)
 {
-  plan(6);
+  plan(12); /* test_oom() makes 12 ok() checks: 6 scenarios x 2. */
   MY_INIT("pfs_account-oom-t");
   do_all_tests();
   my_end(0);
   return (exit_status());
 }
-
diff --git a/storage/perfschema/unittest/pfs_connect_attr-t.cc b/storage/perfschema/unittest/pfs_connect_attr-t.cc
index 7ebfb0c8294..e414a17a1ef 100644
--- a/storage/perfschema/unittest/pfs_connect_attr-t.cc
+++ b/storage/perfschema/unittest/pfs_connect_attr-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,7 +21,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_server.h>
 #include <pfs_instr_class.h>
 #include <pfs_instr.h>
@@ -47,7 +47,7 @@ void test_blob_parser()
   unsigned char packet[10000], *ptr;
   uint name_len, value_len, idx, packet_length;
   bool result;
-  const CHARSET_INFO *cs= &my_charset_utf8_bin;
+  const CHARSET_INFO *cs= &my_charset_utf8mb3_bin;
 
   diag("test_blob_parser");
 
@@ -164,7 +164,7 @@ void test_multibyte_lengths()
   char name[100], value[4096];
   uint name_len, value_len;
   bool result;
-  const CHARSET_INFO *cs= &my_charset_utf8_bin;
+  const CHARSET_INFO *cs= &my_charset_utf8mb3_bin;
 
   unsigned char var_len_packet[] = {
     252, 2, 0, 'k', '1',
@@ -197,7 +197,7 @@ void test_utf8_parser()
   char name[33 * 6], value[1024 * 6], packet[1500 * 6], *ptr;
   uint name_len, value_len;
   bool result;
-  const CHARSET_INFO *cs= &my_charset_utf8_bin;
+  const CHARSET_INFO *cs= &my_charset_utf8mb3_bin;
 
   /* note : this is encoded in utf-8 */
   const char *attr1= "Георги";
@@ -249,7 +249,7 @@ void test_utf8_parser_bad_encoding()
   char name[33 * 3], value[1024 * 3], packet[1500 * 3], *ptr;
   uint name_len, value_len;
   bool result;
-  const CHARSET_INFO *cs= &my_charset_utf8_bin;
+  const CHARSET_INFO *cs= &my_charset_utf8mb3_bin;
 
   /* note : this is encoded in utf-8 */
   const char *attr= "Георги";
@@ -258,7 +258,7 @@ void test_utf8_parser_bad_encoding()
   ptr= packet;
   *ptr++= strlen(attr);
   memcpy(ptr, attr, strlen(attr));
-  ptr[0]= 0xFA; // invalid UTF-8 char
+  ptr[0]= (char)0xFA; // invalid UTF-8 char
   ptr+= strlen(attr);
   *ptr++= strlen(val);
   memcpy(ptr, val, strlen(val));
diff --git a/storage/perfschema/unittest/pfs_host-oom-t.cc b/storage/perfschema/unittest/pfs_host-oom-t.cc
index 14af1af0352..0dc77f9e307 100644
--- a/storage/perfschema/unittest/pfs_host-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_host-oom-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,21 +21,26 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
 #include <pfs_host.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
 #include "stub_pfs_global.h"
+#include "stub_global_status_var.h"
 
 #include <string.h> /* memset */
 
+extern struct PSI_bootstrap PFS_bootstrap;
+
 void test_oom()
 {
-  int rc;
+  PSI *psi;
   PFS_global_param param;
+  PSI_bootstrap *boot;
 
   memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
@@ -45,6 +50,7 @@ void test_oom()
   param.m_thread_class_sizing= 10;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -52,12 +58,13 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 10;
   param.m_events_waits_history_long_sizing= 0;
   param.m_setup_actor_sizing= 0;
   param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 1000;
   param.m_user_sizing= 0;
+  param.m_host_sizing= 1000;
   param.m_account_sizing= 0;
   param.m_stage_class_sizing= 50;
   param.m_events_stages_history_sizing= 0;
@@ -65,43 +72,64 @@ void test_oom()
   param.m_statement_class_sizing= 50;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
+  param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
+  param.m_program_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 10;
+  param.m_metadata_lock_sizing= 0;
+  param.m_max_digest_length= 0;
+  param.m_max_sql_text_length= 0;
 
   /* Setup */
 
   stub_alloc_always_fails= false;
   stub_alloc_fails_after_count= 1000;
 
-  init_event_name_sizing(& param);
-  rc= init_stage_class(param.m_stage_class_sizing);
-  ok(rc == 0, "init stage class");
-  rc= init_statement_class(param.m_statement_class_sizing);
-  ok(rc == 0, "init statement class");
+  pre_initialize_performance_schema();
+  boot= initialize_performance_schema(&param);
+  psi= (PSI *)boot->get_interface(PSI_VERSION_1);
+
+  PSI_thread_key thread_key_1;
+  PSI_thread_info all_thread[]=
+  {
+    {&thread_key_1, "T-1", 0}
+  };
+  psi->register_thread("test", all_thread, 1);
+
+  PSI_thread *thread_1= psi->new_thread(thread_key_1, NULL, 0);
+  psi->set_thread(thread_1);
 
   /* Tests */
 
-  stub_alloc_fails_after_count= 1;
-  rc= init_host(& param);
-  ok(rc == 1, "oom (host)");
-  cleanup_host();
+  int first_fail= 1;
+  stub_alloc_fails_after_count= first_fail;
+  psi->set_thread_account("", 0, "host1", 5);
+  ok(global_host_container.m_lost == 1, "oom (host)");
+
+  stub_alloc_fails_after_count= first_fail + 1;
+  psi->set_thread_account("", 0, "host2", 5);
+  ok(global_host_container.m_lost == 2, "oom (host waits)");
+
+  stub_alloc_fails_after_count= first_fail + 2;
+  psi->set_thread_account("", 0, "host3", 5);
+  ok(global_host_container.m_lost == 3, "oom (host stages)");
 
-  stub_alloc_fails_after_count= 2;
-  rc= init_host(& param);
-  ok(rc == 1, "oom (host waits)");
-  cleanup_host();
+  stub_alloc_fails_after_count= first_fail + 3;
+  psi->set_thread_account("", 0, "host4", 5);
+  ok(global_host_container.m_lost == 4, "oom (host statements)");
 
-  stub_alloc_fails_after_count= 3;
-  rc= init_host(& param);
-  ok(rc == 1, "oom (host stages)");
-  cleanup_host();
+  stub_alloc_fails_after_count= first_fail + 4;
+  psi->set_thread_account("", 0, "host5", 5);
+  ok(global_host_container.m_lost == 5, "oom (host transactions)");
 
-  stub_alloc_fails_after_count= 4;
-  rc= init_host(& param);
-  ok(rc == 1, "oom (host statements)");
-  cleanup_host();
+  stub_alloc_fails_after_count= first_fail + 5;
+  psi->set_thread_account("", 0, "host6", 5);
+  ok(global_host_container.m_lost == 6, "oom (host memory)");
 
-  cleanup_statement_class();
-  cleanup_stage_class();
+  shutdown_performance_schema();
 }
 
 void do_all_tests()
diff --git a/storage/perfschema/unittest/pfs_instr-oom-t.cc b/storage/perfschema/unittest/pfs_instr-oom-t.cc
index 231fd1e100b..5f0d97e6be4 100644
--- a/storage/perfschema/unittest/pfs_instr-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_instr-oom-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -18,46 +18,139 @@
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
-  51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
+#include <pfs_user.h>
+#include <pfs_host.h>
+#include <pfs_account.h>
 #include <pfs_instr_class.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
 #include "stub_pfs_global.h"
+#include "stub_global_status_var.h"
 
 #include <string.h> /* memset */
 
+extern struct PSI_bootstrap PFS_bootstrap;
+
+PSI_thread_key thread_key_1; /* Key for the "T-1" entry; passed to psi->new_thread() by test_oom(). */
+PSI_thread_info all_thread[]= /* One-entry table registered by initialize_performance_schema_helper(). */
+{
+  {&thread_key_1, "T-1", 0}
+};
+
+/** Simulate initialize_performance_schema(): call the individual init_* steps by hand,
+    then register all_thread under "test" and return the PSI_VERSION_1 interface. */
+PSI * initialize_performance_schema_helper(PFS_global_param *param)
+{
+  PSI *psi;
+  /* Reset the allocation stub; test_oom() lowers stub_alloc_fails_after_count after this call. */
+  stub_alloc_always_fails= false;
+  stub_alloc_fails_after_count= 1000;
+
+  param->m_enabled= true; /* m_enabled and the two thread sizings below are forced, overriding the caller's values. */
+  param->m_thread_class_sizing= 10;
+  param->m_thread_sizing= 1000;
+
+  pre_initialize_performance_schema();
+  /* Any return codes from the init_* calls below are ignored. */
+  init_event_name_sizing(param);
+  init_sync_class(param->m_mutex_class_sizing, param->m_rwlock_class_sizing, param->m_cond_class_sizing);
+  init_thread_class(param->m_thread_class_sizing);
+  init_table_share(param->m_table_share_sizing);
+  init_table_share_lock_stat(param->m_table_lock_stat_sizing);
+  init_table_share_index_stat(param->m_index_stat_sizing);
+  init_file_class(param->m_file_class_sizing);
+  init_stage_class(param->m_stage_class_sizing);
+  init_statement_class(param->m_statement_class_sizing);
+  init_socket_class(param->m_socket_class_sizing);
+  init_memory_class(param->m_memory_class_sizing);
+  init_instruments(param);
+  init_events_waits_history_long(param->m_events_waits_history_long_sizing);
+  init_events_stages_history_long(param->m_events_stages_history_long_sizing);
+  init_events_statements_history_long(param->m_events_statements_history_long_sizing);
+  init_events_transactions_history_long(param->m_events_transactions_history_long_sizing);
+  init_file_hash(param);
+  init_table_share_hash(param);
+  init_setup_actor(param);
+  init_setup_actor_hash(param);
+  init_setup_object(param);
+  init_setup_object_hash(param);
+  init_host(param);
+  init_host_hash(param);
+  init_user(param);
+  init_user_hash(param);
+  init_account(param);
+  init_account_hash(param);
+  init_digest(param);
+  init_digest_hash(param);
+  init_program(param);
+  init_program_hash(param);
+  init_prepared_stmt(param);
+  pfs_initialized= true; /* Set once every init_* step above has been called. */
+  /* Take the interface from the global PFS_bootstrap and register the one-entry all_thread table. */
+  PSI_bootstrap *boot= &PFS_bootstrap;
+  psi= (PSI *)boot->get_interface(PSI_VERSION_1);
+  psi->register_thread("test", all_thread, 1);
+  return (psi);
+}
+
 void test_oom()
 {
   int rc;
+  PSI *psi;
   PFS_global_param param;
 
-  stub_alloc_always_fails= true;
+  stub_alloc_always_fails= false;
+  stub_alloc_fails_after_count= 1000;
+
+  PFS_mutex_class dummy_mutex_class;
+  PFS_rwlock_class dummy_rwlock_class;
+  PFS_cond_class dummy_cond_class;
+  PFS_thread_class dummy_thread_class;
+  PFS_file_class dummy_file_class;
+  PFS_socket_class dummy_socket_class;
+  PFS_table_share dummy_table_share;
+  PFS_mutex *mutex_1;
+  PFS_mutex *mutex_2;
+  PFS_rwlock *rwlock_1;
+  PFS_rwlock *rwlock_2;
+  PFS_cond *cond_1;
+  PFS_cond *cond_2;
+  PFS_thread *thread_1;
+  PFS_thread *thread_2;
+  PFS_file *file_1;
+  PFS_file *file_2;
+  PFS_socket *socket_1;
+  PFS_socket *socket_2;
+  PFS_table *table_1;
+  PFS_table *table_2;
 
   memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
-  param.m_mutex_class_sizing= 10;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 1000;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
+  param.m_mutex_class_sizing= 1;
+  param.m_rwlock_class_sizing= 1;
+  param.m_cond_class_sizing= 1;
+  param.m_thread_class_sizing= 1;
+  param.m_table_share_sizing= 1;
+  param.m_file_class_sizing= 1;
+  param.m_socket_class_sizing= 1;
+  param.m_mutex_sizing= 1;
+  param.m_rwlock_sizing= 1;
+  param.m_cond_sizing= 1;
+  param.m_thread_sizing= 1;
+  param.m_table_sizing= 1;
+  param.m_file_sizing= 1;
+  param.m_file_handle_sizing= 100;
+  param.m_socket_sizing= 2;
+  param.m_events_waits_history_sizing= 10;
+  param.m_events_waits_history_long_sizing= 10000;
   param.m_setup_actor_sizing= 0;
   param.m_setup_object_sizing= 0;
   param.m_host_sizing= 0;
@@ -69,614 +162,285 @@ void test_oom()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
+  param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
-
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (mutex)");
+  param.m_program_sizing= 0;
+  param.m_prepared_stmt_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 1;
+  param.m_metadata_lock_sizing= 0;
+  param.m_max_digest_length= 0;
+  param.m_max_sql_text_length= 0;
+
+  init_event_name_sizing(&param);
+  rc= init_instruments(&param);
+  ok(rc == 0, "instances init");
+
+  dummy_mutex_class.m_event_name_index= 0;
+  dummy_mutex_class.m_flags= 0;
+  dummy_mutex_class.m_enabled= true;
+  dummy_mutex_class.m_volatility= PSI_VOLATILITY_UNKNOWN;
+  dummy_rwlock_class.m_event_name_index= 1;
+  dummy_rwlock_class.m_flags= 0;
+  dummy_rwlock_class.m_enabled= true;
+  dummy_rwlock_class.m_volatility= PSI_VOLATILITY_UNKNOWN;
+  dummy_cond_class.m_event_name_index= 2;
+  dummy_cond_class.m_flags= 0;
+  dummy_cond_class.m_enabled= true;
+  dummy_cond_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_file_class.m_event_name_index= 3;
+  dummy_file_class.m_flags= 0;
+  dummy_file_class.m_enabled= true;
+  dummy_file_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_socket_class.m_event_name_index= 4;
+  dummy_socket_class.m_flags= 0;
+  dummy_socket_class.m_enabled= true;
+  dummy_socket_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_table_share.m_enabled= true;
+  dummy_table_share.m_timed= true;
+
+  /* Create mutex. */
+  stub_alloc_always_fails= false;
+  mutex_1= create_mutex(&dummy_mutex_class, NULL);
+  ok(mutex_1 != NULL, "create mutex");
+  destroy_mutex(mutex_1);
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 10;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 1000;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (rwlock)");
+  stub_alloc_always_fails= true;
+  mutex_2= create_mutex(&dummy_mutex_class, NULL);
+  ok(mutex_2 == NULL, "oom (create mutex)");
+
+  /* Create rwlock. */
+  stub_alloc_always_fails = false;
+  rc = init_instruments(&param);
+  ok(rc == 0, "instances init");
+  rwlock_1= create_rwlock(&dummy_rwlock_class, NULL);
+  ok(rwlock_1 != NULL, "create rwlock");
+  destroy_rwlock(rwlock_1);
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 10;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 1000;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (cond)");
+  stub_alloc_always_fails= true;
+  rwlock_2= create_rwlock(&dummy_rwlock_class, NULL);
+  ok(rwlock_2 == NULL, "oom (create rwlock)");
+
+  /* Create cond. */
+  stub_alloc_always_fails = false;
+  rc = init_instruments(&param);
+  ok(rc == 0, "instances init");
+  cond_1= create_cond(&dummy_cond_class, NULL);
+  ok(cond_1 != NULL, "create cond");
+  destroy_cond(cond_1);
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 10;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 1000;
-  param.m_file_handle_sizing= 1000;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
+  stub_alloc_always_fails= true;
+  cond_2= create_cond(&dummy_cond_class, NULL);
+  ok(cond_2 == NULL, "oom (create cond)");
 
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (file)");
-  cleanup_instruments();
+  /* Create file. */
+  PFS_thread fake_thread;
+  rc = init_instruments(&param);
+  fake_thread.m_filename_hash_pins= NULL;
+  init_file_hash(&param);
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 1000;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
+  stub_alloc_always_fails = true;
+  file_2 = find_or_create_file(&fake_thread, &dummy_file_class, "dummy", 5, true);
+  ok(file_2 == NULL, "oom (create file)");
 
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (file handle)");
+  stub_alloc_always_fails= false;
+  file_1= find_or_create_file(&fake_thread, &dummy_file_class, "dummy", 5, true);
+  ok(file_1 != NULL, "create file");
+  release_file(file_1);
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 10;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 1000;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (table)");
+  /* Create socket. */
+  stub_alloc_always_fails = false;
+  rc = init_instruments(&param);
+  ok(rc == 0, "instances init");
+  socket_1= create_socket(&dummy_socket_class, NULL, NULL, 0);
+  ok(socket_1 != NULL, "create socket");
+  destroy_socket(socket_1);
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
+  stub_alloc_always_fails= true;
+  socket_2= create_socket(&dummy_socket_class, NULL, NULL, 0);
+  ok(socket_2 == NULL, "oom (create socket)");
 
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (thread)");
+  /* Create table. */
+  stub_alloc_always_fails= false;
+  rc = init_instruments(&param);
+  table_1= create_table(&dummy_table_share, &fake_thread, NULL);
+  ok(table_1 != NULL, "create table");
+  destroy_table(table_1);
   cleanup_instruments();
 
+  stub_alloc_always_fails= true;
+  table_2= create_table(&dummy_table_share, &fake_thread, NULL);
+  ok(table_2 == NULL, "oom (create table)");
+
+  /* Create thread. */
   stub_alloc_always_fails= false;
+  rc = init_instruments(&param);
+  thread_1= create_thread(&dummy_thread_class, NULL, 0);
+  ok(thread_1 != NULL, "create thread");
+  destroy_thread(thread_1);
+  cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 10;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
+  stub_alloc_always_fails= true;
+  thread_2= create_thread(&dummy_thread_class, NULL, 0);
+  ok(thread_2 == NULL, "oom (create thread)");
 
-  stub_alloc_fails_after_count= 2;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (thread waits history sizing)");
-  cleanup_instruments();
+  PSI_thread *thread;
 
-  param.m_enabled= true;
+  /* Per thread wait. */
+  memset(&param, 0, sizeof(param));
   param.m_mutex_class_sizing= 50;
   param.m_rwlock_class_sizing= 50;
   param.m_cond_class_sizing= 50;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 50;
   param.m_socket_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
+  psi= initialize_performance_schema_helper(&param);
   stub_alloc_fails_after_count= 2;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (per thread wait)");
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (per thread wait)");
 
   cleanup_sync_class();
   cleanup_thread_class();
   cleanup_file_class();
   cleanup_instruments();
 
+  /* Thread waits history sizing. */
+  memset(&param, 0, sizeof(param));
   param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_socket_class_sizing= 10;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_socket_sizing= 1000;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (socket)");
+  param.m_events_waits_history_sizing= 10;
+  psi= initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (thread waits history sizing)");
 
+  cleanup_thread_class();
   cleanup_instruments();
 
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
+  /* Per thread stages. */
+  memset(&param, 0, sizeof(param));
+  param.m_stage_class_sizing= 50;
+  psi= initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (per thread stages)");
 
-  stub_alloc_fails_after_count= 1;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (per thread waits)");
+  cleanup_stage_class();
+  cleanup_thread_class();
   cleanup_instruments();
+  cleanup_stage_class();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
+  /* Thread stages history sizing. */
+  memset(&param, 0, sizeof(param));
   param.m_events_stages_history_sizing= 10;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
+  psi= initialize_performance_schema_helper(&param);
   stub_alloc_fails_after_count= 3;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (thread stages history sizing)");
-
-  cleanup_thread_class();
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (thread stages history sizing)");
+  
   cleanup_instruments();
+  cleanup_thread_class();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
+  /* Per thread statements. */
+  memset(&param, 0, sizeof(param));
   param.m_stage_class_sizing= 50;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
+  psi= initialize_performance_schema_helper(&param);
+  init_statement_class(param.m_statement_class_sizing);
+  stub_alloc_fails_after_count= 3;
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (per thread statements)");
 
-  stub_alloc_fails_after_count= 2;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (per thread stages)");
-  
   cleanup_stage_class();
+  cleanup_statement_class();
   cleanup_thread_class();
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
+  /* Thread statements history sizing. */
+  memset(&param, 0, sizeof(param));
   param.m_events_statements_history_sizing= 10;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  stub_alloc_fails_after_count= 2;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (thread statements history sizing)");
-
+  psi= initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (thread statements history sizing)");
+  
   cleanup_thread_class();
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 0;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 10;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 1000;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 50;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  stub_alloc_fails_after_count= 2;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (per thread statements)");
+  /* Per thread transactions. */
+  memset(&param, 0, sizeof(param));
+  psi= initialize_performance_schema_helper(&param);
+  transaction_class_max= 1; // set by register_global_classes();
+  stub_alloc_fails_after_count= 3;
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (per thread transactions)");
+  transaction_class_max= 0;
 
-  cleanup_statement_class();
   cleanup_thread_class();
   cleanup_instruments();
 
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= 10;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
-
-  stub_alloc_fails_after_count= 1;
-  init_event_name_sizing(& param);
-  rc= init_instruments(& param);
-  ok(rc == 1, "oom (global waits)");
+  /* Thread transactions history sizing. */
+  memset(&param, 0, sizeof(param));
+  param.m_events_transactions_history_sizing= 10;
+  psi= initialize_performance_schema_helper(&param);
+  stub_alloc_fails_after_count= 3;
+  thread= psi->new_thread(thread_key_1, NULL, 0);
+  ok(thread == NULL, "oom (thread transactions history sizing)");
 
-  cleanup_sync_class();
+  cleanup_thread_class();
   cleanup_instruments();
 
+  /* Global stages. */
+  memset(&param, 0, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 10;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
   param.m_stage_class_sizing= 20;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
-  param.m_statement_class_sizing= 0;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
 
-  stub_alloc_fails_after_count= 3;
-  init_event_name_sizing(& param);
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(&param);
   rc= init_stage_class(param.m_stage_class_sizing);
   ok(rc == 0, "init stage class");
   rc= init_instruments(& param);
   ok(rc == 1, "oom (global stages)");
 
-  cleanup_sync_class();
   cleanup_stage_class();
   cleanup_instruments();
 
+  /* Global statements. */
+  memset(&param, 0, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 10;
-  param.m_rwlock_class_sizing= 0;
-  param.m_cond_class_sizing= 0;
-  param.m_thread_class_sizing= 0;
-  param.m_table_share_sizing= 0;
-  param.m_file_class_sizing= 0;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 0;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 0;
-  param.m_events_waits_history_long_sizing= 0;
-  param.m_setup_actor_sizing= 0;
-  param.m_setup_object_sizing= 0;
-  param.m_host_sizing= 0;
-  param.m_user_sizing= 0;
-  param.m_account_sizing= 0;
-  param.m_stage_class_sizing= 0;
-  param.m_events_stages_history_sizing= 0;
-  param.m_events_stages_history_long_sizing= 0;
   param.m_statement_class_sizing= 20;
-  param.m_events_statements_history_sizing= 0;
-  param.m_events_statements_history_long_sizing= 0;
-  param.m_session_connect_attrs_sizing= 0;
 
-  stub_alloc_fails_after_count= 3;
-  init_event_name_sizing(& param);
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(&param);
   rc= init_statement_class(param.m_statement_class_sizing);
   ok(rc == 0, "init statement class");
-  rc= init_instruments(& param);
+  rc= init_instruments(&param);
   ok(rc == 1, "oom (global statements)");
 
-  cleanup_sync_class();
   cleanup_statement_class();
   cleanup_instruments();
+
+  /* Global memory. */
+  memset(&param, 0, sizeof(param));
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 10;
+  param.m_memory_class_sizing= 20;
+
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(&param);
+  rc= init_memory_class(param.m_memory_class_sizing);
+  ok(rc == 0, "init memory class");
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (global memory)");
+
+  cleanup_memory_class();
+  cleanup_instruments();
 }
 
 void do_all_tests()
@@ -686,10 +450,10 @@ void do_all_tests()
 
 int main(int argc, char **argv)
 {
-  plan(20);
+  plan(32);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
-  return (exit_status());
+  return exit_status();
 }
 
diff --git a/storage/perfschema/unittest/pfs_instr-t.cc b/storage/perfschema/unittest/pfs_instr-t.cc
index c9f4bac1171..95971214c83 100644
--- a/storage/perfschema/unittest/pfs_instr-t.cc
+++ b/storage/perfschema/unittest/pfs_instr-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,13 +21,16 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
 #include <pfs_instr_class.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
+#include "stub_global_status_var.h"
+
 #include <memory.h>
 
 PFS_global_param param;
@@ -66,9 +69,15 @@ void test_no_instruments()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
   param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
-  param.m_max_digest_length= 0;
+  param.m_program_sizing= 0;
+  param.m_prepared_stmt_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 0;
+  param.m_metadata_lock_sizing= 0;
 
   init_event_name_sizing(& param);
   rc= init_instruments(& param);
@@ -95,6 +104,27 @@ void test_no_instances()
   PFS_socket *socket;
   PFS_table *table;
 
+  dummy_mutex_class.m_event_name_index = 0;
+  dummy_mutex_class.m_flags = 0;
+  dummy_mutex_class.m_enabled = true;
+  dummy_mutex_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_rwlock_class.m_event_name_index = 1;
+  dummy_rwlock_class.m_flags = 0;
+  dummy_rwlock_class.m_enabled = true;
+  dummy_rwlock_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_cond_class.m_event_name_index = 2;
+  dummy_cond_class.m_flags = 0;
+  dummy_cond_class.m_enabled = true;
+  dummy_cond_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_file_class.m_event_name_index = 3;
+  dummy_file_class.m_flags = 0;
+  dummy_file_class.m_enabled = true;
+  dummy_file_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+  dummy_socket_class.m_event_name_index = 4;
+  dummy_socket_class.m_flags = 0;
+  dummy_socket_class.m_enabled = true;
+  dummy_socket_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+
   memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 1;
@@ -125,9 +155,15 @@ void test_no_instances()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
   param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
-  param.m_max_digest_length= 0;
+  param.m_program_sizing= 0;
+  param.m_prepared_stmt_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 1;
+  param.m_metadata_lock_sizing= 0;
 
   init_event_name_sizing(& param);
   rc= init_instruments(& param);
@@ -135,50 +171,50 @@ void test_no_instances()
 
   mutex= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex == NULL, "no mutex");
-  ok(mutex_lost == 1, "lost 1");
+  ok(global_mutex_container.get_lost_counter() == 1, "lost 1");
   mutex= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex == NULL, "no mutex");
-  ok(mutex_lost == 2, "lost 2");
+  ok(global_mutex_container.get_lost_counter() == 2, "lost 2");
 
   rwlock= create_rwlock(& dummy_rwlock_class, NULL);
   ok(rwlock == NULL, "no rwlock");
-  ok(rwlock_lost == 1, "lost 1");
+  ok(global_rwlock_container.m_lost == 1, "lost 1");
   rwlock= create_rwlock(& dummy_rwlock_class, NULL);
   ok(rwlock == NULL, "no rwlock");
-  ok(rwlock_lost == 2, "lost 2");
+  ok(global_rwlock_container.m_lost == 2, "lost 2");
 
   cond= create_cond(& dummy_cond_class, NULL);
   ok(cond == NULL, "no cond");
-  ok(cond_lost == 1, "lost 1");
+  ok(global_cond_container.m_lost == 1, "lost 1");
   cond= create_cond(& dummy_cond_class, NULL);
   ok(cond == NULL, "no cond");
-  ok(cond_lost == 2, "lost 2");
+  ok(global_cond_container.m_lost == 2, "lost 2");
 
   thread= create_thread(& dummy_thread_class, NULL, 0);
   ok(thread == NULL, "no thread");
-  ok(thread_lost == 1, "lost 1");
+  ok(global_thread_container.m_lost == 1, "lost 1");
   thread= create_thread(& dummy_thread_class, NULL, 0);
   ok(thread == NULL, "no thread");
-  ok(thread_lost == 2, "lost 2");
+  ok(global_thread_container.m_lost == 2, "lost 2");
 
   PFS_thread fake_thread;
   fake_thread.m_filename_hash_pins= NULL;
 
   file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true);
   ok(file == NULL, "no file");
-  ok(file_lost == 1, "lost 1");
+  ok(global_file_container.m_lost == 1, "lost 1");
   file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true);
   ok(file == NULL, "no file");
-  ok(file_lost == 2, "lost 2");
+  ok(global_file_container.m_lost == 2, "lost 2");
 
-  init_file_hash();
+  init_file_hash(& param);
 
   file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true);
   ok(file == NULL, "no file");
-  ok(file_lost == 3, "lost 3");
+  ok(global_file_container.m_lost == 3, "lost 3");
   file= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true);
   ok(file == NULL, "no file");
-  ok(file_lost == 4, "lost 4");
+  ok(global_file_container.m_lost == 4, "lost 4");
 
   char long_file_name[10000];
   int size= sizeof(long_file_name);
@@ -186,21 +222,21 @@ void test_no_instances()
 
   file= find_or_create_file(& fake_thread, & dummy_file_class, long_file_name, size, true);
   ok(file == NULL, "no file");
-  ok(file_lost == 5, "lost 5");
+  ok(global_file_container.m_lost == 5, "lost 5");
 
   table= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table == NULL, "no table");
-  ok(table_lost == 1, "lost 1");
+  ok(global_table_container.m_lost == 1, "lost 1");
   table= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table == NULL, "no table");
-  ok(table_lost == 2, "lost 2");
+  ok(global_table_container.m_lost == 2, "lost 2");
 
   socket= create_socket(& dummy_socket_class, NULL, NULL, 0);
   ok(socket == NULL, "no socket");
-  ok(socket_lost == 1, "lost 1");
+  ok(global_socket_container.m_lost == 1, "lost 1");
   socket= create_socket(& dummy_socket_class, NULL, NULL, 0);
   ok(socket == NULL, "no socket");
-  ok(socket_lost == 2, "lost 2");
+  ok(global_socket_container.m_lost == 2, "lost 2");
 
   /* No result to test, just make sure it does not crash */
   reset_events_waits_by_instance();
@@ -265,139 +301,163 @@ void test_with_instances()
   param.m_statement_class_sizing= 0;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
   param.m_digest_sizing= 0;
   param.m_session_connect_attrs_sizing= 0;
-  param.m_max_digest_length= 0;
+  param.m_program_sizing= 0;
+  param.m_prepared_stmt_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 1;
+  param.m_metadata_lock_sizing= 0;
 
   init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 0, "instances init");
 
   dummy_mutex_class.m_event_name_index= 0;
+  dummy_mutex_class.m_flags= 0;
+  dummy_mutex_class.m_enabled= true;
+  dummy_mutex_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
   dummy_rwlock_class.m_event_name_index= 1;
+  dummy_rwlock_class.m_flags= 0;
+  dummy_rwlock_class.m_enabled= true;
+  dummy_rwlock_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
   dummy_cond_class.m_event_name_index= 2;
+  dummy_cond_class.m_flags= 0;
+  dummy_cond_class.m_enabled= true;
+  dummy_cond_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
   dummy_file_class.m_event_name_index= 3;
+  dummy_file_class.m_flags= 0;
+  dummy_file_class.m_enabled= true;
+  dummy_file_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
   dummy_socket_class.m_event_name_index= 4;
+  dummy_socket_class.m_flags= 0;
+  dummy_socket_class.m_enabled= true;
+  dummy_socket_class.m_volatility = PSI_VOLATILITY_UNKNOWN;
+
+  dummy_table_share.m_enabled= true;
+  dummy_table_share.m_timed= true;
 
   mutex_1= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex_1 != NULL, "mutex");
-  ok(mutex_lost == 0, "not lost");
+  ok(global_mutex_container.get_lost_counter() == 0, "not lost");
   mutex_2= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex_2 != NULL, "mutex");
-  ok(mutex_lost == 0, "not lost");
+  ok(global_mutex_container.get_lost_counter() == 0, "not lost");
   mutex_2= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex_2 == NULL, "no mutex");
-  ok(mutex_lost == 1, "lost 1");
+  ok(global_mutex_container.get_lost_counter() == 1, "lost 1");
   destroy_mutex(mutex_1);
   mutex_2= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex_2 != NULL, "mutex");
-  ok(mutex_lost == 1, "no new loss");
+  ok(global_mutex_container.get_lost_counter() == 1, "no new loss");
 
   rwlock_1= create_rwlock(& dummy_rwlock_class, NULL);
   ok(rwlock_1 != NULL, "rwlock");
-  ok(rwlock_lost == 0, "not lost");
+  ok(global_rwlock_container.m_lost == 0, "not lost");
   rwlock_2= create_rwlock(& dummy_rwlock_class, NULL);
   ok(rwlock_2 != NULL, "rwlock");
-  ok(rwlock_lost == 0, "not lost");
+  ok(global_rwlock_container.m_lost == 0, "not lost");
   rwlock_2= create_rwlock(& dummy_rwlock_class, NULL);
   ok(rwlock_2 == NULL, "no rwlock");
-  ok(rwlock_lost == 1, "lost 1");
+  ok(global_rwlock_container.m_lost == 1, "lost 1");
   destroy_rwlock(rwlock_1);
   rwlock_2= create_rwlock(& dummy_rwlock_class, NULL);
   ok(rwlock_2 != NULL, "rwlock");
-  ok(rwlock_lost == 1, "no new loss");
+  ok(global_rwlock_container.m_lost == 1, "no new loss");
 
   cond_1= create_cond(& dummy_cond_class, NULL);
   ok(cond_1 != NULL, "cond");
-  ok(cond_lost == 0, "not lost");
+  ok(global_cond_container.m_lost == 0, "not lost");
   cond_2= create_cond(& dummy_cond_class, NULL);
   ok(cond_2 != NULL, "cond");
-  ok(cond_lost == 0, "not lost");
+  ok(global_cond_container.m_lost == 0, "not lost");
   cond_2= create_cond(& dummy_cond_class, NULL);
   ok(cond_2 == NULL, "no cond");
-  ok(cond_lost == 1, "lost 1");
+  ok(global_cond_container.m_lost == 1, "lost 1");
   destroy_cond(cond_1);
   cond_2= create_cond(& dummy_cond_class, NULL);
   ok(cond_2 != NULL, "cond");
-  ok(cond_lost == 1, "no new loss");
+  ok(global_cond_container.m_lost == 1, "no new loss");
 
   thread_1= create_thread(& dummy_thread_class, NULL, 0);
   ok(thread_1 != NULL, "thread");
-  ok(thread_lost == 0, "not lost");
+  ok(global_thread_container.m_lost == 0, "not lost");
   thread_2= create_thread(& dummy_thread_class, NULL, 0);
   ok(thread_2 != NULL, "thread");
-  ok(thread_lost == 0, "not lost");
+  ok(global_thread_container.m_lost == 0, "not lost");
   thread_2= create_thread(& dummy_thread_class, NULL, 0);
   ok(thread_2 == NULL, "no thread");
-  ok(thread_lost == 1, "lost 1");
+  ok(global_thread_container.m_lost == 1, "lost 1");
   destroy_thread(thread_1);
   thread_2= create_thread(& dummy_thread_class, NULL, 0);
   ok(thread_2 != NULL, "thread");
-  ok(thread_lost == 1, "no new loss");
+  ok(global_thread_container.m_lost == 1, "no new loss");
 
   PFS_thread fake_thread;
   fake_thread.m_filename_hash_pins= NULL;
 
   file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true);
   ok(file_1 == NULL, "no file");
-  ok(file_lost == 1, "lost 1");
+  ok(global_file_container.m_lost == 1, "lost 1");
   file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy", 5, true);
   ok(file_1 == NULL, "no file");
-  ok(file_lost == 2, "lost 2");
+  ok(global_file_container.m_lost == 2, "lost 2");
 
-  init_file_hash();
-  file_lost= 0;
+  init_file_hash(& param);
+  global_file_container.m_lost= 0;
 
   file_1= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_A", 7, true);
   ok(file_1 != NULL, "file");
   ok(file_1->m_file_stat.m_open_count == 1, "open count 1");
-  ok(file_lost == 0, "not lost");
+  ok(global_file_container.m_lost == 0, "not lost");
   file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_A", 7, true);
   ok(file_1 == file_2, "same file");
   ok(file_1->m_file_stat.m_open_count == 2, "open count 2");
-  ok(file_lost == 0, "not lost");
+  ok(global_file_container.m_lost == 0, "not lost");
   release_file(file_2);
   ok(file_1->m_file_stat.m_open_count == 1, "open count 1");
   file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_B", 7, true);
   ok(file_2 != NULL, "file");
-  ok(file_lost == 0, "not lost");
+  ok(global_file_container.m_lost == 0, "not lost");
   file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_C", 7, true);
   ok(file_2 == NULL, "no file");
-  ok(file_lost == 1, "lost");
+  ok(global_file_container.m_lost == 1, "lost");
   release_file(file_1);
   /* the file still exists, not destroyed */
   ok(file_1->m_file_stat.m_open_count == 0, "open count 0");
   file_2= find_or_create_file(& fake_thread, & dummy_file_class, "dummy_D", 7, true);
   ok(file_2 == NULL, "no file");
-  ok(file_lost == 2, "lost");
+  ok(global_file_container.m_lost == 2, "lost");
 
   socket_1= create_socket(& dummy_socket_class, NULL, NULL, 0);
   ok(socket_1 != NULL, "socket");
-  ok(socket_lost == 0, "not lost");
+  ok(global_socket_container.m_lost == 0, "not lost");
   socket_2= create_socket(& dummy_socket_class, NULL, NULL, 0);
   ok(socket_2 != NULL, "socket");
-  ok(socket_lost == 0, "not lost");
+  ok(global_socket_container.m_lost == 0, "not lost");
   socket_2= create_socket(& dummy_socket_class, NULL, NULL, 0);
   ok(socket_2 == NULL, "no socket");
-  ok(socket_lost == 1, "lost 1");
+  ok(global_socket_container.m_lost == 1, "lost 1");
   destroy_socket(socket_1);
   socket_2= create_socket(& dummy_socket_class, NULL, NULL, 0);
   ok(socket_2 != NULL, "socket");
-  ok(socket_lost == 1, "no new loss");
+  ok(global_socket_container.m_lost == 1, "no new loss");
 
   table_1= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_1 != NULL, "table");
-  ok(table_lost == 0, "not lost");
+  ok(global_table_container.m_lost == 0, "not lost");
   table_2= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_2 != NULL, "table");
-  ok(table_lost == 0, "not lost");
+  ok(global_table_container.m_lost == 0, "not lost");
   table_2= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_2 == NULL, "no table");
-  ok(table_lost == 1, "lost 1");
+  ok(global_table_container.m_lost == 1, "lost 1");
   destroy_table(table_1);
   table_2= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_2 != NULL, "table");
-  ok(table_lost == 1, "no new loss");
+  ok(global_table_container.m_lost == 1, "no new loss");
 
   //TODO: test that cleanup works
   reset_events_waits_by_instance();
@@ -409,6 +469,9 @@ void test_with_instances()
 
 void do_all_tests()
 {
+  flag_global_instrumentation= true;
+  flag_thread_instrumentation= true;
+
   test_no_instruments();
   test_no_instances();
   test_with_instances();
diff --git a/storage/perfschema/unittest/pfs_instr_class-oom-t.cc b/storage/perfschema/unittest/pfs_instr_class-oom-t.cc
index 7eb21a33bd7..da80a658f99 100644
--- a/storage/perfschema/unittest/pfs_instr_class-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_instr_class-oom-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,16 +21,24 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_instr_class.h>
+#include <pfs_instr.h>
 #include <pfs_global.h>
 #include <tap.h>
+#include <sql_class.h>
+#include <pfs_buffer_container.h>
 
 #include "stub_pfs_global.h"
+#include "stub_global_status_var.h"
 
 void test_oom()
 {
   int rc;
+  PFS_global_param param;
+  TABLE_SHARE table_share;
+  PFS_thread pfs_thread;
+  PFS_table_share *pfs_table_share;
 
   rc= init_sync_class(1000, 0, 0);
   ok(rc == 1, "oom (mutex)");
@@ -42,14 +50,14 @@ void test_oom()
   ok(rc == 1, "oom (thread)");
   rc= init_file_class(1000);
   ok(rc == 1, "oom (file)");
-  rc= init_table_share(1000);
-  ok(rc == 1, "oom (cond)");
   rc= init_socket_class(1000);
   ok(rc == 1, "oom (socket)");
   rc= init_stage_class(1000);
   ok(rc == 1, "oom (stage)");
   rc= init_statement_class(1000);
   ok(rc == 1, "oom (statement)");
+  rc= init_memory_class(1000);
+  ok(rc == 1, "oom (memory)");
 
   cleanup_sync_class();
   cleanup_thread_class();
@@ -58,6 +66,36 @@ void test_oom()
   cleanup_socket_class();
   cleanup_stage_class();
   cleanup_statement_class();
+  cleanup_memory_class();
+
+  /* Table share classes. */
+  memset(&param, 0, sizeof(param));
+  param.m_enabled= true;
+  param.m_table_share_sizing= 100;
+  param.m_setup_object_sizing= 100;
+
+  pfs_thread.m_table_share_hash_pins= NULL;
+  pfs_thread.m_setup_object_hash_pins= NULL;
+  
+  char db_name[]= "schema 1";
+  char table_name[]= "table 1";
+  table_share.db.str= db_name;
+  table_share.db.length= strlen(db_name);
+  table_share.table_name.str= table_name;
+  table_share.table_name.length= strlen(table_name);
+
+  init_table_share(param.m_table_share_sizing);
+  init_table_share_hash(&param);
+  init_setup_object_hash(&param);
+
+  stub_alloc_always_fails= false;
+  pfs_table_share= find_or_create_table_share(&pfs_thread, false, &table_share);
+  ok(pfs_table_share == NULL, "oom (pfs table share)");
+  ok(global_table_share_container.m_lost == 1, "oom (table share)");
+
+  cleanup_table_share();
+  cleanup_table_share_hash();
+  cleanup_setup_object_hash();
 }
 
 void do_all_tests()
@@ -67,7 +105,7 @@ void do_all_tests()
 
 int main(int argc, char **argv)
 {
-  plan(9);
+  plan(11);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
diff --git a/storage/perfschema/unittest/pfs_instr_class-t.cc b/storage/perfschema/unittest/pfs_instr_class-t.cc
index f1e246dc387..09a38706b86 100644
--- a/storage/perfschema/unittest/pfs_instr_class-t.cc
+++ b/storage/perfschema/unittest/pfs_instr_class-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,13 +21,15 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <string.h>                             // strncpy
 #include <pfs_instr_class.h>
 #include <pfs_instr.h>
 #include <pfs_global.h>
 #include <tap.h>
 
+#include "stub_global_status_var.h"
+
 void test_no_registration()
 {
   int rc;
@@ -35,12 +37,14 @@ void test_no_registration()
   PFS_thread_key thread_key;
   PFS_file_key file_key;
   PFS_socket_key socket_key;
+  PFS_memory_key memory_key;
   PFS_mutex_class *mutex;
   PFS_rwlock_class *rwlock;
   PFS_cond_class *cond;
   PFS_thread_class *thread;
   PFS_file_class *file;
   PFS_socket_class *socket;
+  PFS_memory_class *memory;
   /* PFS_table_share *table; */
 
   rc= init_sync_class(0, 0, 0);
@@ -53,6 +57,8 @@ void test_no_registration()
   ok(rc == 0, "zero init (socket)");
   rc= init_table_share(0);
   ok(rc == 0, "zero init (table)");
+  rc= init_memory_class(0);
+  ok(rc == 0, "zero init (memory)");
 
   key= register_mutex_class("FOO", 3, 0);
   ok(key == 0, "no mutex registered");
@@ -96,6 +102,13 @@ void test_no_registration()
   socket_key= register_socket_class("FOO", 3, 0);
   ok(socket_key == 0, "no socket registered");
 
+  memory_key= register_memory_class("FOO", 3, 0);
+  ok(memory_key == 0, "no memory registered");
+  memory_key= register_memory_class("BAR", 3, 0);
+  ok(memory_key == 0, "no memory registered");
+  memory_key= register_memory_class("FOO", 3, 0);
+  ok(memory_key == 0, "no memory registered");
+
 #ifdef LATER
   PFS_thread fake_thread;
   fake_thread.m_table_share_hash_pins= NULL;
@@ -150,11 +163,19 @@ void test_no_registration()
   socket= find_socket_class(9999);
   ok(socket == NULL, "no socket key 9999");
 
+  memory= find_memory_class(0);
+  ok(memory == NULL, "no memory key 0");
+  memory= find_memory_class(1);
+  ok(memory == NULL, "no memory key 1");
+  memory= find_memory_class(9999);
+  ok(memory == NULL, "no memory key 9999");
+
   cleanup_sync_class();
   cleanup_thread_class();
   cleanup_file_class();
   cleanup_socket_class();
   cleanup_table_share();
+  cleanup_memory_class();
 }
 
 void test_mutex_registration()
@@ -480,6 +501,53 @@ void test_table_registration()
 #endif
 }
 
+void test_memory_registration()
+{
+  /* Expect keys 1..5 for 5 distinct names, the same key on re-registration, and key 0 plus a lost count once full. */
+  int rc;
+  PFS_memory_key key;
+  PFS_memory_class *memory;
+  rc= init_memory_class(5);
+  ok(rc == 0, "room for 5 memory");
+
+  key= register_memory_class("FOO", 3, 0);
+  ok(key == 1, "foo registered");
+  key= register_memory_class("BAR", 3, 0);
+  ok(key == 2, "bar registered");
+  key= register_memory_class("FOO", 3, 0);
+  ok(key == 1, "foo re registered");
+  key= register_memory_class("Memory-3", 8, 0);
+  ok(key == 3, "Memory-3 registered");
+  key= register_memory_class("Memory-4", 8, 0);
+  ok(key == 4, "Memory-4 registered");
+  key= register_memory_class("Memory-5", 8, 0);
+  ok(key == 5, "Memory-5 registered");
+  ok(memory_class_lost == 0, "lost nothing");
+  key= register_memory_class("Memory-6", 8, 0);
+  ok(key == 0, "Memory-6 not registered");
+  ok(memory_class_lost == 1, "lost 1 memory");
+  key= register_memory_class("Memory-7", 8, 0);
+  ok(key == 0, "Memory-7 not registered");
+  ok(memory_class_lost == 2, "lost 2 memory");
+  key= register_memory_class("Memory-3", 8, 0);
+  ok(key == 3, "Memory-3 re registered");
+  ok(memory_class_lost == 2, "lost 2 memory");
+  key= register_memory_class("Memory-5", 8, 0);
+  ok(key == 5, "Memory-5 re registered");
+  ok(memory_class_lost == 2, "lost 2 memory");
+  /* Lookups by key. Dereferences are NULL-guarded so a failed lookup is reported by ok() instead of crashing. */
+  memory= find_memory_class(0);
+  ok(memory == NULL, "no key 0");
+  memory= find_memory_class(3);
+  ok(memory != NULL, "found key 3");
+  ok(memory != NULL && strncmp(memory->m_name, "Memory-3", 8) == 0, "key 3 is Memory-3");
+  ok(memory != NULL && memory->m_name_length == 8, "name length 3");
+  memory= find_memory_class(9999);
+  ok(memory == NULL, "no key 9999");
+
+  cleanup_memory_class();
+}
+
 #ifdef LATER
 void set_wait_stat(PFS_instr_class *klass)
 {
@@ -668,12 +736,13 @@ void do_all_tests()
   test_file_registration();
   test_socket_registration();
   test_table_registration();
+  test_memory_registration();
   test_instruments_reset();
 }
 
 int main(int argc, char **argv)
 {
-  plan(181);
+  plan(209);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
diff --git a/storage/perfschema/unittest/pfs_misc-t.cc b/storage/perfschema/unittest/pfs_misc-t.cc
index 7d274c0820d..e2522a053e1 100644
--- a/storage/perfschema/unittest/pfs_misc-t.cc
+++ b/storage/perfschema/unittest/pfs_misc-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,22 +21,26 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
+#include <my_thread.h>
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
 #include <pfs_instr_class.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
+#include "stub_global_status_var.h"
+
 #include <memory.h>
 
 void test_digest_length_overflow()
 {
   if (sizeof(size_t) != 4)
   {
-    skip(2, "digest length overflow requires a 32-bit environment");
+    skip(3, "digest length overflow requires a 32-bit environment");
     return;
   }
-  
+
   PFS_global_param param;
   memset(&param, 0, sizeof(param));
   param.m_enabled= true;
@@ -45,19 +49,32 @@ void test_digest_length_overflow()
      parameters. The Performance Schema should detect the overflow, free
      allocated memory and abort initialization with a warning.
   */
-  
+
   /* Max digest length, events_statements_history_long. */
   param.m_events_statements_history_long_sizing= 10000;
   param.m_digest_sizing= 1000;
   param.m_max_digest_length= (1024 * 1024);
+  param.m_max_sql_text_length= 0;
   pfs_max_digest_length= param.m_max_digest_length;
+  pfs_max_sqltext= param.m_max_sql_text_length;
 
   int rc = init_events_statements_history_long(param.m_events_statements_history_long_sizing);
   ok(rc == 1, "digest length overflow (init_events_statements_history_long");
 
+  /* Max sql text length, events_statements_history_long. */
+  param.m_max_sql_text_length= (1024 * 1024);
+  param.m_max_digest_length= 0;
+  pfs_max_digest_length= param.m_max_digest_length;
+  pfs_max_sqltext= param.m_max_sql_text_length;
+
+  rc = init_events_statements_history_long(param.m_events_statements_history_long_sizing);
+  ok(rc == 1, "sql text length overflow (init_events_statements_history_long");
+
   /* Max digest length, events_statements_summary_by_digest. */
   param.m_max_digest_length= (1024 * 1024);
   param.m_digest_sizing= 10000;
+  pfs_max_digest_length= param.m_max_digest_length;
+  pfs_max_sqltext= param.m_max_sql_text_length;
 
   rc = init_digest(&param);
   ok(rc == 1, "digest length overflow (init_digest)");
@@ -70,10 +87,9 @@ void do_all_tests()
 
 int main(int, char **)
 {
-  plan(2);
+  plan(3);
   MY_INIT("pfs_misc-t");
   do_all_tests();
   my_end(0);
   return (exit_status());
 }
-
diff --git a/storage/perfschema/unittest/pfs_noop-t.cc b/storage/perfschema/unittest/pfs_noop-t.cc
new file mode 100644
index 00000000000..e577a7cc509
--- /dev/null
+++ b/storage/perfschema/unittest/pfs_noop-t.cc
@@ -0,0 +1,242 @@
+/* Copyright (c) 2013, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <my_thread.h>
+#include <pfs_server.h>
+#include <pfs_instr_class.h>
+#include <pfs_instr.h>
+#include <pfs_global.h>
+#include <tap.h>
+
+#include <string.h>
+#include <memory.h>
+
+#include "stub_print_error.h"
+#include "stub_pfs_defaults.h"
+
+void test_noop()
+{
+  /* Calls the PSI_server entry points with NULL or dummy arguments and checks that every
+     returned handle or locker is NULL (PSI_NOT_INSTRUMENTED for memory keys).
+     NOTE(review): presumably PSI_server is still the default no-op implementation here - confirm. */
+  PSI_mutex *mutex;
+  PSI_rwlock *rwlock;
+  PSI_cond *cond;
+  PSI_socket *socket;
+  PSI_table_share *table_share;
+  PSI_table *table;
+  PSI_file *file;
+  PSI_thread *thread;
+  PSI_file_locker *file_locker;
+  PSI_idle_locker *idle_locker;
+  PSI_mutex_locker *mutex_locker;
+  PSI_rwlock_locker *rwlock_locker;
+  PSI_cond_locker *cond_locker;
+  PSI_table_locker *table_locker;
+  PSI_statement_locker *statement_locker;
+  PSI_transaction_locker *transaction_locker;
+  PSI_socket_locker *socket_locker;
+  PSI_digest_locker *digest_locker;
+  PSI_sp_locker *sp_locker;
+  PSI_sp_share *sp_share;
+  PSI_memory_key memory_key;
+  PSI_metadata_lock *metadata_lock;
+  PSI_metadata_locker *metadata_locker;
+  PSI_thread *owner;
+
+  diag("test_noop");
+  PSI_server->register_mutex(NULL, NULL, 0);
+  PSI_server->register_rwlock(NULL, NULL, 0);
+  PSI_server->register_cond(NULL, NULL, 0);
+  PSI_server->register_thread(NULL, NULL, 0);
+  PSI_server->register_file(NULL, NULL, 0);
+  PSI_server->register_stage(NULL, NULL, 0);
+  PSI_server->register_statement(NULL, NULL, 0);
+  PSI_server->register_socket(NULL, NULL, 0);
+  ok(true, "register");
+  mutex= PSI_server->init_mutex(1, NULL);
+  ok(mutex == NULL, "no mutex");
+  PSI_server->destroy_mutex(NULL);
+  rwlock= PSI_server->init_rwlock(1, NULL);
+  ok(rwlock == NULL, "no rwlock");
+  PSI_server->destroy_rwlock(NULL);
+  cond= PSI_server->init_cond(1, NULL);
+  ok(cond == NULL, "no cond");
+  PSI_server->destroy_cond(NULL);
+  socket= PSI_server->init_socket(1, NULL, NULL, 0);
+  ok(socket == NULL, "no socket");
+  PSI_server->destroy_socket(NULL);
+  table_share= PSI_server->get_table_share(false, NULL);
+  ok(table_share == NULL, "no table_share");
+  PSI_server->release_table_share(NULL);
+  PSI_server->drop_table_share(false, NULL, 0, NULL, 0);
+  table= PSI_server->open_table(NULL, NULL);
+  ok(table == NULL, "no table");
+  PSI_server->unbind_table(NULL);
+  table= PSI_server->rebind_table(NULL, NULL, NULL);
+  ok(table == NULL, "no table");
+  PSI_server->close_table(NULL, NULL);
+  PSI_server->create_file(1, NULL, 2);
+  /* TODO: spawn thread */
+  thread= PSI_server->new_thread(1, NULL, 2);
+  ok(thread == NULL, "no thread");
+  PSI_server->set_thread_id(NULL, 1);
+  thread= PSI_server->get_thread();
+  ok(thread == NULL, "no thread");
+  PSI_server->set_thread_user(NULL, 0);
+  PSI_server->set_thread_account(NULL, 0, NULL, 0);
+  PSI_server->set_thread_db(NULL, 0);
+  PSI_server->set_thread_command(1);
+  PSI_server->set_thread_start_time(1);
+  PSI_server->set_thread_state(NULL);
+  PSI_server->set_thread_info(NULL, 0);
+  PSI_server->set_thread(NULL);
+  PSI_server->delete_current_thread();
+  PSI_server->delete_thread(NULL);
+  file_locker= PSI_server->get_thread_file_name_locker(NULL, 1, PSI_FILE_OPEN, NULL, NULL);
+  ok(file_locker == NULL, "no file_locker");
+  file_locker= PSI_server->get_thread_file_stream_locker(NULL, NULL, PSI_FILE_OPEN);
+  ok(file_locker == NULL, "no file_locker");
+  file_locker= PSI_server->get_thread_file_descriptor_locker(NULL, 0, PSI_FILE_OPEN);
+  ok(file_locker == NULL, "no file_locker");
+  PSI_server->unlock_mutex(NULL);
+  PSI_server->unlock_rwlock(NULL);
+  PSI_server->signal_cond(NULL);
+  PSI_server->broadcast_cond(NULL);
+  idle_locker= PSI_server->start_idle_wait(NULL, NULL, 0);
+  ok(idle_locker == NULL, "no idle_locker");
+  PSI_server->end_idle_wait(NULL);
+  mutex_locker= PSI_server->start_mutex_wait(NULL, NULL, PSI_MUTEX_LOCK, NULL, 0);
+  ok(mutex_locker == NULL, "no mutex_locker");
+  PSI_server->end_mutex_wait(NULL, 0);
+  rwlock_locker= PSI_server->start_rwlock_rdwait(NULL, NULL, PSI_RWLOCK_READLOCK, NULL, 0);
+  ok(rwlock_locker == NULL, "no rwlock_locker");
+  PSI_server->end_rwlock_rdwait(NULL, 0);
+  rwlock_locker= PSI_server->start_rwlock_wrwait(NULL, NULL, PSI_RWLOCK_WRITELOCK, NULL, 0);
+  ok(rwlock_locker == NULL, "no rwlock_locker");
+  PSI_server->end_rwlock_wrwait(NULL, 0);
+  cond_locker= PSI_server->start_cond_wait(NULL, NULL, NULL, PSI_COND_WAIT, NULL, 0);
+  ok(cond_locker == NULL, "no cond_locker");
+  PSI_server->end_cond_wait(NULL, 0);
+  table_locker= PSI_server->start_table_io_wait(NULL, NULL, PSI_TABLE_FETCH_ROW, 0, NULL, 0);
+  ok(table_locker == NULL, "no table_locker");
+  PSI_server->end_table_io_wait(NULL, 0);
+  table_locker= PSI_server->start_table_lock_wait(NULL, NULL, PSI_TABLE_LOCK, 0, NULL, 0);
+  ok(table_locker == NULL, "no table_locker");
+  PSI_server->end_table_lock_wait(NULL);
+  PSI_server->start_file_open_wait(NULL, NULL, 0);
+  file= PSI_server->end_file_open_wait(NULL, NULL);
+  ok(file == NULL, "no file");
+  PSI_server->end_file_open_wait_and_bind_to_descriptor(NULL, 0);
+  PSI_server->start_file_wait(NULL, 0, NULL, 0);
+  PSI_server->end_file_wait(NULL, 0);
+  PSI_server->start_file_close_wait(NULL, NULL, 0);
+  PSI_server->end_file_close_wait(NULL, 0);
+  PSI_server->end_file_rename_wait(NULL, NULL, NULL, 0);
+  PSI_server->start_stage(1, NULL, 0);
+
+  PSI_stage_progress *progress;
+  progress= PSI_server->get_current_stage_progress();
+  ok(progress == NULL, "no progress");
+
+  PSI_server->end_stage();
+  statement_locker= PSI_server->get_thread_statement_locker(NULL, 1, NULL, NULL);
+  ok(statement_locker == NULL, "no statement_locker");
+  statement_locker= PSI_server->refine_statement(NULL, 1);
+  ok(statement_locker == NULL, "no statement_locker");
+  PSI_server->start_statement(NULL, NULL, 0, NULL, 0);
+  PSI_server->set_statement_text(NULL, NULL, 0);
+  PSI_server->set_statement_lock_time(NULL, 0);
+  PSI_server->set_statement_rows_sent(NULL, 0);
+  PSI_server->set_statement_rows_examined(NULL, 0);
+  PSI_server->inc_statement_created_tmp_disk_tables(NULL, 0);
+  PSI_server->inc_statement_created_tmp_tables(NULL, 0);
+  PSI_server->inc_statement_select_full_join(NULL, 0);
+  PSI_server->inc_statement_select_full_range_join(NULL, 0);
+  PSI_server->inc_statement_select_range(NULL, 0);
+  PSI_server->inc_statement_select_range_check(NULL, 0);
+  PSI_server->inc_statement_select_scan(NULL, 0);
+  PSI_server->inc_statement_sort_merge_passes(NULL, 0);
+  PSI_server->inc_statement_sort_range(NULL, 0);
+  PSI_server->inc_statement_sort_rows(NULL, 0);
+  PSI_server->inc_statement_sort_scan(NULL, 0);
+  PSI_server->set_statement_no_index_used(NULL);
+  PSI_server->set_statement_no_good_index_used(NULL);
+  PSI_server->end_statement(NULL, NULL);
+  socket_locker= PSI_server->start_socket_wait(NULL, NULL, PSI_SOCKET_SEND, 1, NULL, 0);
+  ok(socket_locker == NULL, "no socket_locker");
+  PSI_server->end_socket_wait(NULL, 0);
+  PSI_server->set_socket_state(NULL, PSI_SOCKET_STATE_IDLE);
+  PSI_server->set_socket_info(NULL, NULL, NULL, 0);
+  PSI_server->set_socket_thread_owner(NULL);
+  digest_locker= PSI_server->digest_start(NULL);
+  ok(digest_locker == NULL, "no digest_locker");
+  PSI_server->digest_end(NULL, NULL);
+  sp_locker= PSI_server->start_sp(NULL, NULL);
+  ok(sp_locker == NULL, "no sp_locker");
+  PSI_server->end_sp(NULL);
+  PSI_server->drop_sp(0, NULL, 0, NULL, 0);
+  sp_share= PSI_server->get_sp_share(0, NULL, 0, NULL, 0);
+  ok(sp_share == NULL, "no sp_share");
+  PSI_server->release_sp_share(NULL);
+  PSI_server->register_memory(NULL, NULL, 0);
+  memory_key= PSI_server->memory_alloc(0, 0, & owner);
+  ok(memory_key == PSI_NOT_INSTRUMENTED, "no memory_key");
+  memory_key= PSI_server->memory_realloc(0, 0, 0, & owner);
+  ok(memory_key == PSI_NOT_INSTRUMENTED, "no memory_key");
+  PSI_server->memory_free(0, 0, NULL);
+  PSI_server->unlock_table(NULL);
+  metadata_lock= PSI_server->create_metadata_lock(NULL, NULL, 1, 2, 3, NULL, 0);
+  ok(metadata_lock == NULL, "no metadata_lock");
+  PSI_server->set_metadata_lock_status(NULL, 0);
+  PSI_server->destroy_metadata_lock(NULL);
+  metadata_locker= PSI_server->start_metadata_wait(NULL, NULL, NULL, 0);
+  ok(metadata_locker == NULL, "no metadata_locker");
+  PSI_server->end_metadata_wait(NULL, 0);
+  /* Transaction instrumentation entry points. */
+  transaction_locker= PSI_server->get_thread_transaction_locker(NULL, NULL, 0, 1, false, 1);
+  ok(transaction_locker == NULL, "no transaction_locker");
+  PSI_server->start_transaction(NULL, NULL, 0);
+  PSI_server->end_transaction(NULL, true);
+
+  PSI_server->set_transaction_gtid(NULL, NULL, NULL);
+  PSI_server->set_transaction_trxid(NULL, NULL);
+  PSI_server->set_transaction_xa_state(NULL, 1);
+  PSI_server->set_transaction_xid(NULL, NULL, 1);
+  PSI_server->inc_transaction_release_savepoint(NULL, 1);
+  PSI_server->inc_transaction_rollback_to_savepoint(NULL, 1);
+  PSI_server->inc_transaction_savepoints(NULL, 1);
+
+  PSI_server->set_thread_THD(NULL, NULL);
+  ok(true, "all noop api called");
+}
+
+int main(int, char **)
+{
+  plan(34); /* test_noop() issues exactly 34 ok() checks */
+  MY_INIT("pfs_noop-t");
+  test_noop();
+  my_end(0); /* pair MY_INIT with my_end(0), as the other unit tests in this directory do */
+  return (exit_status());
+}
+
diff --git a/storage/perfschema/unittest/pfs_server_stubs.cc b/storage/perfschema/unittest/pfs_server_stubs.cc
index 0cabce37e51..9fc3b2fb143 100644
--- a/storage/perfschema/unittest/pfs_server_stubs.cc
+++ b/storage/perfschema/unittest/pfs_server_stubs.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates. All rights
+   reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -29,6 +30,7 @@
 #include "sql_class.h"
 #include "sql_show.h"
 
+struct system_status_var global_status_var;
 struct sql_digest_storage;
 
 uint lower_case_table_names= 0;
@@ -39,8 +41,17 @@ void compute_digest_md5(const sql_digest_storage *, unsigned char *)
 {
 }
 
+/* Stub: intentionally empty; the unit tests only need the symbol to exist. */
+void reset_status_vars()
+{}
+
+void sql_print_warning(const char *, ...)
+{
+  /* Stub: warnings are discarded so they do not pollute the unit test output. */
+}
+
 class sys_var { public: enum where { AUTO }; };
-void set_sys_var_value_origin(void *ptr, enum sys_var::where here)
+void set_sys_var_value_origin(void *, enum sys_var::where, const char *)
 {
 }
 
diff --git a/storage/perfschema/unittest/pfs_timer-t.cc b/storage/perfschema/unittest/pfs_timer-t.cc
index 139454b8649..22fdb163aed 100644
--- a/storage/perfschema/unittest/pfs_timer-t.cc
+++ b/storage/perfschema/unittest/pfs_timer-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -22,7 +22,6 @@
 
 #include <my_global.h>
 #include <my_pthread.h>
-#include <pfs_atomic.h>
 #include <pfs_timer.h>
 #include "my_sys.h"
 #include <tap.h>
diff --git a/storage/perfschema/unittest/pfs_user-oom-t.cc b/storage/perfschema/unittest/pfs_user-oom-t.cc
index ca451f3e457..4cc23a018d2 100644
--- a/storage/perfschema/unittest/pfs_user-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_user-oom-t.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2011, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -21,21 +21,24 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_thread.h>
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
 #include <pfs_user.h>
+#include <pfs_buffer_container.h>
 #include <tap.h>
 
 #include "stub_pfs_global.h"
+#include "stub_global_status_var.h"
 
 #include <string.h> /* memset */
 
 void test_oom()
 {
-  int rc;
+  PSI *psi;
   PFS_global_param param;
+  PSI_bootstrap *boot;
 
   memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
@@ -45,6 +48,7 @@ void test_oom()
   param.m_thread_class_sizing= 10;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -52,6 +56,7 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 10;
   param.m_events_waits_history_long_sizing= 0;
   param.m_setup_actor_sizing= 0;
@@ -65,42 +70,64 @@ void test_oom()
   param.m_statement_class_sizing= 50;
   param.m_events_statements_history_sizing= 0;
   param.m_events_statements_history_long_sizing= 0;
+  param.m_events_transactions_history_sizing= 0;
+  param.m_events_transactions_history_long_sizing= 0;
+  param.m_digest_sizing= 0;
+  param.m_session_connect_attrs_sizing= 0;
+  param.m_program_sizing= 0;
+  param.m_statement_stack_sizing= 0;
+  param.m_memory_class_sizing= 10;
+  param.m_metadata_lock_sizing= 0;
+  param.m_max_digest_length= 0;
+  param.m_max_sql_text_length= 0;
 
   /* Setup */
 
   stub_alloc_always_fails= false;
   stub_alloc_fails_after_count= 1000;
 
-  init_event_name_sizing(& param);
-  rc= init_stage_class(param.m_stage_class_sizing);
-  ok(rc == 0, "init stage class");
-  rc= init_statement_class(param.m_statement_class_sizing);
-  ok(rc == 0, "init statement class");
+  pre_initialize_performance_schema();
+  boot= initialize_performance_schema(&param);
+  psi= (PSI *)boot->get_interface(PSI_VERSION_1);
+
+  PSI_thread_key thread_key_1;
+  PSI_thread_info all_thread[]=
+  {
+    {&thread_key_1, "T-1", 0}
+  };
+  psi->register_thread("test", all_thread, 1);
+
+  PSI_thread *thread_1= psi->new_thread(thread_key_1, NULL, 0);
+  psi->set_thread(thread_1);
 
   /* Tests */
 
-  stub_alloc_fails_after_count= 1;
-  rc= init_user(& param);
-  ok(rc == 1, "oom (user)");
-  cleanup_user();
+  int first_fail= 1;
+  stub_alloc_fails_after_count= first_fail;
+  psi->set_thread_account("user1", 5, "", 0);
+  ok(global_user_container.m_lost == 1, "oom (user)");
+
+  stub_alloc_fails_after_count= first_fail + 1;
+  psi->set_thread_account("user2", 5, "", 0);
+  ok(global_user_container.m_lost == 2, "oom (user waits)");
+
+  stub_alloc_fails_after_count= first_fail + 2;
+  psi->set_thread_account("user3", 5, "", 0);
+  ok(global_user_container.m_lost == 3, "oom (user stages)");
 
-  stub_alloc_fails_after_count= 2;
-  rc= init_user(& param);
-  ok(rc == 1, "oom (user waits)");
-  cleanup_user();
+  stub_alloc_fails_after_count= first_fail + 3;
+  psi->set_thread_account("user4", 5, "", 0);
+  ok(global_user_container.m_lost == 4, "oom (user statements)");
 
-  stub_alloc_fails_after_count= 3;
-  rc= init_user(& param);
-  ok(rc == 1, "oom (user stages)");
-  cleanup_user();
+  stub_alloc_fails_after_count= first_fail + 4;
+  psi->set_thread_account("user5", 5, "", 0);
+  ok(global_user_container.m_lost == 5, "oom (user transactions)");
 
-  stub_alloc_fails_after_count= 4;
-  rc= init_user(& param);
-  ok(rc == 1, "oom (user statements)");
-  cleanup_user();
+  stub_alloc_fails_after_count= first_fail + 5;
+  psi->set_thread_account("user6", 5, "", 0);
+  ok(global_user_container.m_lost == 6, "oom (user memory)");
 
-  cleanup_statement_class();
-  cleanup_stage_class();
+  shutdown_performance_schema();
 }
 
 void do_all_tests()
diff --git a/storage/perfschema/unittest/stub_global_status_var.h b/storage/perfschema/unittest/stub_global_status_var.h
new file mode 100644
index 00000000000..c2785d2eac1
--- /dev/null
+++ b/storage/perfschema/unittest/stub_global_status_var.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2.0,
+  as published by the Free Software Foundation.
+
+  This program is also distributed with certain software (including
+  but not limited to OpenSSL) that is licensed under separate terms,
+  as designated in a particular file or component or in included license
+  documentation.  The authors of MySQL hereby grant you an additional
+  permission to link the program and your derivative works with the
+  separately licensed software that they have included with MySQL.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License, version 2.0, for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <pfs_global.h>
+#include <string.h>
+
+
+/* Stub: intentionally empty; neither argument is read or modified. */
+void add_to_status(STATUS_VAR *, STATUS_VAR *)
+{}
diff --git a/storage/perfschema/unittest/stub_pfs_defaults.h b/storage/perfschema/unittest/stub_pfs_defaults.h
index 951508733e2..338fd9f8324 100644
--- a/storage/perfschema/unittest/stub_pfs_defaults.h
+++ b/storage/perfschema/unittest/stub_pfs_defaults.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
diff --git a/storage/perfschema/unittest/stub_pfs_global.h b/storage/perfschema/unittest/stub_pfs_global.h
index b7adbe33504..6d10e29161d 100644
--- a/storage/perfschema/unittest/stub_pfs_global.h
+++ b/storage/perfschema/unittest/stub_pfs_global.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
    Copyright (c) 2022, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -30,16 +30,18 @@
 #endif
 
 bool pfs_initialized= false;
+size_t pfs_allocated_memory_size= 0;
+size_t pfs_allocated_memory_count= 0;
 
 bool stub_alloc_always_fails= true;
 int stub_alloc_fails_after_count= 0;
 
-void *pfs_malloc(size_t size, myf)
+void *pfs_malloc(PFS_builtin_memory_class *klass, size_t size, myf)
 {
   /*
     Catch non initialized sizing parameter in the unit tests.
   */
-  DBUG_ASSERT(size <= 100*1024*1024);
+  assert(size <= 100*1024*1024);
 
   if (stub_alloc_always_fails)
     return NULL;
@@ -63,19 +65,27 @@ void *pfs_malloc(size_t size, myf)
   return ptr;
 }
 
-void pfs_free(void *ptr)
+void pfs_free(PFS_builtin_memory_class *, size_t, void *ptr)
 {
   if (ptr != NULL)
     free(ptr);
 }
 
-void *pfs_malloc_array(size_t n, size_t size, myf flags)
+void *pfs_malloc_array(PFS_builtin_memory_class *klass, size_t n, size_t size, myf flags)
 {
   size_t array_size= n * size;
   /* Check for overflow before allocating. */
   if (is_overflow(array_size, n, size))
     return NULL;
-  return pfs_malloc(array_size, flags);
+  return pfs_malloc(klass, array_size, flags);
+}
+
+void pfs_free_array(PFS_builtin_memory_class *klass, size_t n, size_t size, void *ptr)
+{
+  /* Forwards to pfs_free() with the byte size n * size; a NULL ptr is ignored. */
+  const size_t total_bytes= n * size;
+  if (ptr != NULL)
+    pfs_free(klass, total_bytes, ptr);
+}
 
 bool is_overflow(size_t product, size_t n1, size_t n2)
@@ -90,3 +100,4 @@ void pfs_print_error(const char *format, ...)
 {
 }
 
+
diff --git a/storage/perfschema/unittest/stub_print_error.h b/storage/perfschema/unittest/stub_print_error.h
index 2cd2ad5b03c..ac1ecf1f637 100644
--- a/storage/perfschema/unittest/stub_print_error.h
+++ b/storage/perfschema/unittest/stub_print_error.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2021, Oracle and/or its affiliates.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
@@ -26,7 +26,7 @@
 
 bool pfs_initialized= false;
 
-void *pfs_malloc(size_t size, myf flags)
+void *pfs_malloc(PFS_builtin_memory_class *klass, size_t size, myf flags)
 {
   void *ptr= malloc(size);
   if (ptr && (flags & MY_ZEROFILL))
@@ -34,19 +34,27 @@ void *pfs_malloc(size_t size, myf flags)
   return ptr;
 }
 
-void pfs_free(void *ptr)
+void pfs_free(PFS_builtin_memory_class *, size_t, void *ptr)
 {
   if (ptr != NULL)
     free(ptr);
 }
 
-void *pfs_malloc_array(size_t n, size_t size, myf flags)
+void *pfs_malloc_array(PFS_builtin_memory_class *klass, size_t n, size_t size, myf flags)
 {
   size_t array_size= n * size;
   /* Check for overflow before allocating. */
   if (is_overflow(array_size, n, size))
     return NULL;
-  return pfs_malloc(array_size, flags);
+  return pfs_malloc(klass, array_size, flags);
+}
+
+void pfs_free_array(PFS_builtin_memory_class *klass, size_t n, size_t size, void *ptr)
+{
+  /* Forwards to pfs_free() with the byte size n * size; a NULL ptr is ignored. */
+  const size_t total_bytes= n * size;
+  if (ptr != NULL)
+    pfs_free(klass, total_bytes, ptr);
+}
 
 bool is_overflow(size_t product, size_t n1, size_t n2)
diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt
index de0f7a111fc..3a7922515a7 100644
--- a/storage/rocksdb/CMakeLists.txt
+++ b/storage/rocksdb/CMakeLists.txt
@@ -251,8 +251,8 @@ ADD_LIBRARY(rocksdb_tools STATIC
 MYSQL_ADD_EXECUTABLE(sst_dump rocksdb/tools/sst_dump.cc COMPONENT rocksdb-engine)
 TARGET_LINK_LIBRARIES(sst_dump rocksdblib)
 
-MYSQL_ADD_EXECUTABLE(mysql_ldb tools/mysql_ldb.cc COMPONENT rocksdb-engine)
-TARGET_LINK_LIBRARIES(mysql_ldb rocksdb_tools rocksdb_aux_lib dbug)
+MYSQL_ADD_EXECUTABLE(mariadb-ldb tools/mysql_ldb.cc COMPONENT rocksdb-engine)
+TARGET_LINK_LIBRARIES(mariadb-ldb rocksdb_tools rocksdb_aux_lib dbug)
 
 CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/myrocks_hotbackup.py
   ${CMAKE_CURRENT_BINARY_DIR}/myrocks_hotbackup @ONLY)
@@ -271,7 +271,7 @@ IF(MSVC)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267")
   ENDIF()
 ELSEIF(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  SET_TARGET_PROPERTIES(rocksdb_tools sst_dump mysql_ldb PROPERTIES COMPILE_FLAGS "-Wno-error")
+  SET_TARGET_PROPERTIES(rocksdb_tools sst_dump mariadb-ldb PROPERTIES COMPILE_FLAGS "-Wno-error")
 ENDIF()
 
 IF(GIT_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/rocksdb/.git)
diff --git a/storage/rocksdb/build_rocksdb.cmake b/storage/rocksdb/build_rocksdb.cmake
index e23862ee659..762368b4a80 100644
--- a/storage/rocksdb/build_rocksdb.cmake
+++ b/storage/rocksdb/build_rocksdb.cmake
@@ -112,8 +112,8 @@ if(NOT WIN32)
 endif()
 
 include(CheckCCompilerFlag)
-# ppc64 or ppc64le
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
+# ppc64 or ppc64le or powerpc64 (BSD)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64")
   CHECK_C_COMPILER_FLAG("-maltivec" HAS_ALTIVEC)
   if(HAS_ALTIVEC)
     message(STATUS " HAS_ALTIVEC yes")
@@ -127,7 +127,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8")
   endif()
   ADD_DEFINITIONS(-DHAVE_POWER8 -DHAS_ALTIVEC)
-endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64")
 
 if(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
  set(SYSTEM_LIBS ${SYSTEM_LIBS} -latomic)
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index ce9bb7d49e2..79398a49648 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -3133,6 +3133,8 @@ protected:
     s_tx_list.erase(this);
     RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
   }
+  virtual bool is_prepared()        { return false; }; /* default: never reports a prepared state */
+  virtual void detach_prepared_tx() {}; /* default: no-op */
 };
 
 /*
@@ -3169,7 +3171,16 @@ class Rdb_transaction_impl : public Rdb_transaction {
 
   virtual bool is_writebatch_trx() const override { return false; }
 
- private:
+  bool is_prepared() override { /* true only when m_rocksdb_tx is set and its state is PREPARED */
+    return m_rocksdb_tx && rocksdb::Transaction::PREPARED == m_rocksdb_tx->GetState();
+  }
+
+  void detach_prepared_tx() override { /* the caller in rocksdb_close_connection checks is_prepared() first */
+    DBUG_ASSERT(rocksdb::Transaction::PREPARED == m_rocksdb_tx->GetState()); /* dereferences m_rocksdb_tx without a null check */
+    m_rocksdb_tx = nullptr; /* pointer is cleared, not deleted; presumably so the prepared transaction outlives this object -- confirm against the destructor */
+  }
+
+private:
   void release_tx(void) {
     // We are done with the current active transaction object.  Preserve it
     // for later reuse.
@@ -3810,7 +3821,8 @@ static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
           "disconnecting",
           rc);
     }
-
+    if (tx->is_prepared())
+      tx->detach_prepared_tx();
     delete tx;
   }
   return HA_EXIT_SUCCESS;
@@ -4112,15 +4124,14 @@ static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
   MariaRocks just flushes everything right away ATM
 */
 
-static void rocksdb_checkpoint_request(handlerton *hton,
-                                       void *cookie)
+static void rocksdb_checkpoint_request(void *cookie)
 {
-  const rocksdb::Status s= rdb->SyncWAL();
+  const rocksdb::Status s= rdb->FlushWAL(true); // true: presumably requests a sync as well as a flush -- confirm against DB::FlushWAL(bool sync)
   //TODO: what to do on error?
   if (s.ok())
   {
     rocksdb_wal_group_syncs++;
-    commit_checkpoint_notify_ha(hton, cookie);
+    commit_checkpoint_notify_ha(cookie); // reached only on success; a failed flush leaves the cookie unacknowledged here
   }
 }
 
@@ -4800,10 +4811,10 @@ static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
                                        Rdb_transaction *const tx) {
   DBUG_ASSERT(tx != nullptr);
 
-  trans_register_ha(thd, FALSE, rocksdb_hton);
+  trans_register_ha(thd, FALSE, rocksdb_hton, 0);
   if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
     tx->start_stmt();
-    trans_register_ha(thd, TRUE, rocksdb_hton);
+    trans_register_ha(thd, TRUE, rocksdb_hton, 0);
   }
 }
 
@@ -5288,7 +5299,6 @@ static int rocksdb_init_func(void *const p) {
                    &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST);
   Rdb_transaction::init_mutex();
 
-  rocksdb_hton->state = SHOW_OPTION_YES;
   rocksdb_hton->create = rocksdb_create_handler;
   rocksdb_hton->close_connection = rocksdb_close_connection;
 
@@ -5321,7 +5331,7 @@ static int rocksdb_init_func(void *const p) {
 #ifdef MARIAROCKS_NOT_YET
   rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
 #endif // MARIAROCKS_NOT_YET
-  
+
   /*
   Not needed in MariaDB:
   rocksdb_hton->flush_logs = rocksdb_flush_wal;
@@ -5990,6 +6000,7 @@ Rdb_table_handler *Rdb_open_tables_map::get_table_handler(
     // Since we did not find it in the hash map, attempt to create and add it
     // to the hash map.
     if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
+              PSI_INSTRUMENT_ME,
               MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
               &tmp_name, table_name_str.length() + 1, NullS)))) {
       // Allocating a new Rdb_table_handler and a new table name failed.
@@ -6342,6 +6353,7 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
       m_dup_sk_packed_tuple(nullptr),
       m_dup_sk_packed_tuple_old(nullptr),
       m_pack_buffer(nullptr),
+      m_record_buffer(nullptr),
       m_lock_rows(RDB_LOCK_NONE),
       m_keyread_only(false),
       m_insert_with_update(false),
@@ -6556,6 +6568,7 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
   uint key_len = 0;
   uint max_packed_sk_len = 0;
   uint pack_key_len = 0;
+  uint record_len = table->s->reclength + table->s->null_bytes;
 
   m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
   if (has_hidden_pk(table_arg)) {
@@ -6569,11 +6582,11 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
   // move this into get_table_handler() ??
   m_pk_descr->setup(table_arg, tbl_def_arg);
 
-  m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0)));
+  m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, key_len, MYF(0)));
 
   pack_key_len = m_pk_descr->max_storage_fmt_length();
   m_pk_packed_tuple =
-      reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, pack_key_len, MYF(0)));
 
   /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
   max_packed_sk_len = pack_key_len;
@@ -6591,20 +6604,22 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
   }
 
   m_sk_packed_tuple =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
   m_sk_match_prefix_buf =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
   m_sk_packed_tuple_old =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
   m_end_key_packed_tuple =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
   m_pack_buffer =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
+  m_record_buffer =
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, record_len, MYF(0)));
 
   m_scan_it_lower_bound =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
   m_scan_it_upper_bound =
-      reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+      reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
 
   /*
     If inplace alter is happening, allocate special buffers for unique
@@ -6612,15 +6627,16 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
   */
   if (alloc_alter_buffers) {
     m_dup_sk_packed_tuple =
-        reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+        reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
     m_dup_sk_packed_tuple_old =
-        reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
+        reinterpret_cast<uchar *>(my_malloc(PSI_INSTRUMENT_ME, max_packed_sk_len, MYF(0)));
   }
 
   if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr ||
       m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr ||
       m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr ||
       m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr ||
+      m_record_buffer == nullptr ||
       (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
                                m_dup_sk_packed_tuple_old == nullptr))) {
     // One or more of the above allocations failed.  Clean up and exit
@@ -6654,6 +6670,9 @@ void ha_rocksdb::free_key_buffers() {
   my_free(m_pack_buffer);
   m_pack_buffer = nullptr;
 
+  my_free(m_record_buffer);
+  m_record_buffer = nullptr;
+
   my_free(m_dup_sk_packed_tuple);
   m_dup_sk_packed_tuple = nullptr;
 
@@ -8085,7 +8104,8 @@ int ha_rocksdb::position_to_correct_key(
           rc = HA_ERR_KEY_NOT_FOUND;
         } else if (find_flag == HA_READ_PREFIX_LAST) {
           uint size = kd.pack_index_tuple(table, m_pack_buffer,
-                                          m_sk_packed_tuple, key, keypart_map);
+                                          m_sk_packed_tuple, m_record_buffer,
+                                          key, keypart_map);
           rocksdb::Slice lookup_tuple(
               reinterpret_cast<char *>(m_sk_packed_tuple), size);
 
@@ -8125,7 +8145,7 @@ int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
   if (end_key) {
     *end_key_packed_size =
         kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
-                            end_key->key, end_key->keypart_map);
+                            m_record_buffer, end_key->key, end_key->keypart_map);
 
     /*
       Calculating length of the equal conditions here. 4 byte index id is
@@ -8383,11 +8403,9 @@ int ha_rocksdb::read_range_first(const key_range *const start_key,
 #endif
     increment_statistics(&SSV::ha_read_key_count);
 
-    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
-      result =
-          index_read_map_impl(table->record[0], start_key->key,
-                              start_key->keypart_map, start_key->flag, end_key);
-    })
+    result =
+        index_read_map_impl(table->record[0], start_key->key,
+                            start_key->keypart_map, start_key->flag, end_key);
   }
   if (result) {
     DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);
@@ -8498,7 +8516,8 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
       This is a special case, use DB::Get.
     */
     const uint size = kd.pack_index_tuple(table, m_pack_buffer,
-                                          m_pk_packed_tuple, key, keypart_map);
+                                          m_pk_packed_tuple, m_record_buffer,
+                                          key, keypart_map);
     bool skip_lookup = is_blind_delete_enabled();
 
     rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false);
@@ -8524,14 +8543,14 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                                    .user_defined_key_parts) -
                            1;
     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
-                                      key, tmp_map);
+                                      m_record_buffer, key, tmp_map);
     if (table->key_info[active_index].user_defined_key_parts !=
         kd.get_key_parts()) {
       using_full_key = false;
     }
   } else {
     packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
-                                      key, keypart_map);
+                                      m_record_buffer, key, keypart_map);
   }
 
   if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
@@ -9003,6 +9022,7 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
   DBUG_ASSERT(tx != nullptr);
 
+#ifdef ENABLED_DEBUG_SYNC
   DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
   DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
     THD *thd = ha_thd();
@@ -9012,6 +9032,7 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
     DBUG_ASSERT(opt_debug_sync_timeout > 0);
     DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
   };);
+#endif /* ENABLED_DEBUG_SYNC */
 
   bool found;
   rocksdb::Status s;
@@ -11373,7 +11394,6 @@ ulonglong ha_rocksdb::table_flags() const
 }
 
 
-
 /**
   @return
     HA_EXIT_SUCCESS  OK
@@ -11946,8 +11966,9 @@ int ha_rocksdb::extra(enum ha_extra_function operation) {
   Given a starting key and an ending key, estimate the number of rows that
   will exist between the two keys.
 */
-ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
-                                     key_range *const max_key) {
+ha_rows ha_rocksdb::records_in_range(uint inx, const key_range *const min_key,
+                                     const key_range *const max_key,
+                                     page_range *pages) {
   DBUG_ENTER_FUNC();
 
   ha_rows ret = THDVAR(ha_thd(), records_in_range);
@@ -11966,6 +11987,7 @@ ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
   uint size1 = 0;
   if (min_key) {
     size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
+                                m_record_buffer,
                                 min_key->key, min_key->keypart_map);
     if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
         min_key->flag == HA_READ_PREFIX_LAST ||
@@ -11979,6 +12001,7 @@ ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
   uint size2 = 0;
   if (max_key) {
     size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
+                                m_record_buffer,
                                 max_key->key, max_key->keypart_map);
     if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
         max_key->flag == HA_READ_PREFIX_LAST ||
@@ -12457,6 +12480,7 @@ my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
         ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
         ALTER_PARTITIONED |
         ALTER_ADD_UNIQUE_INDEX |
+        ALTER_INDEX_ORDER |
         ALTER_CHANGE_CREATE_OPTION)) {
     DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
   }
@@ -14177,7 +14201,7 @@ void rocksdb_set_collation_exception_list(THD *const thd,
   rdb_set_collation_exception_list(val == nullptr ? "" : val);
 
   //psergey-todo: what is the purpose of the below??
-  const char *val_copy= val? my_strdup(val, MYF(0)): nullptr;
+  const char *val_copy= val? my_strdup(PSI_INSTRUMENT_ME, val, MYF(0)): nullptr;
   my_free(*static_cast<char**>(var_ptr));
   *static_cast<const char**>(var_ptr) = val_copy;
 }
@@ -14390,7 +14414,7 @@ static int rocksdb_validate_update_cf_options(
   // This can cause invalid memory access after validation is finished.
   // To avoid this kind case, let's alway duplicate the str if str is not
   // nullptr
-  *(const char **)save = (str == nullptr) ? nullptr : my_strdup(str, MYF(0));
+  *(const char **)save = (str == nullptr) ? nullptr : my_strdup(PSI_INSTRUMENT_ME, str, MYF(0));
 
   if (str == nullptr) {
     return HA_EXIT_SUCCESS;
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 766b04ac7d2..00dd0ff4624 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -235,6 +235,11 @@ class ha_rocksdb : public my_core::handler {
   */
   uchar *m_pack_buffer;
 
+  /*
+    A buffer long enough to store a table record; passed to pack_index_tuple()
+   */
+  uchar *m_record_buffer;
+
   /* class to convert between Mysql format and RocksDB format*/
   std::shared_ptr<Rdb_converter> m_converter;
 
@@ -499,12 +504,6 @@ public:
     DBUG_RETURN(&key_map_full);
   }
 
-  bool primary_key_is_clustered() override {
-    DBUG_ENTER_FUNC();
-
-    DBUG_RETURN(true);
-  }
-
   bool should_store_row_debug_checksums() const {
     return m_store_row_debug_checksums && (rand() % 100 < m_checksums_pct);
   }
@@ -906,8 +905,10 @@ public:
   int check(THD *const thd, HA_CHECK_OPT *const check_opt) override
       MY_ATTRIBUTE((__warn_unused_result__));
   int remove_rows(Rdb_tbl_def *const tbl);
-  ha_rows records_in_range(uint inx, key_range *const min_key,
-                           key_range *const max_key) override
+  ha_rows records_in_range(uint inx,
+                           const key_range *const min_key,
+                           const key_range *const max_key,
+                           page_range *pages) override
       MY_ATTRIBUTE((__warn_unused_result__));
 
   int delete_table(Rdb_tbl_def *const tbl);
diff --git a/storage/rocksdb/mysql-test/rocksdb/include/index_merge1.inc b/storage/rocksdb/mysql-test/rocksdb/include/index_merge1.inc
index b5cf7bff763..c1462e7817a 100644
--- a/storage/rocksdb/mysql-test/rocksdb/include/index_merge1.inc
+++ b/storage/rocksdb/mysql-test/rocksdb/include/index_merge1.inc
@@ -676,7 +676,7 @@ drop table t1;
 if ($merge_table_support)
 {
 #
-# BUG#17314: Index_merge/intersection not choosen by the optimizer for MERGE tables
+# BUG#17314: Index_merge/intersection not chosen by the optimizer for MERGE tables
 #
 create table t0 (a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/binlog_rotate_crash.result b/storage/rocksdb/mysql-test/rocksdb/r/binlog_rotate_crash.result
new file mode 100644
index 00000000000..1b22141813a
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/binlog_rotate_crash.result
@@ -0,0 +1,19 @@
+#
+# MDEV-25305: MyRocks: Killing server during RESET MASTER can lose last transactions
+#
+set global rocksdb_flush_log_at_trx_commit=1;
+create table t1 (a int, b int, key(a)) engine=rocksdb;
+insert into t1 values (1,1),(2,2);
+select * from t1;
+a	b
+1	1
+2	2
+flush tables;
+set @@debug_dbug="+d,crash_after_reset_master";
+RESET MASTER;
+# Must show the inserted rows:
+select * from t1;
+a	b
+1	1
+2	2
+drop table t1;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/group_min_max.result b/storage/rocksdb/mysql-test/rocksdb/r/group_min_max.result
index 6507aa43ae1..b8cb4157d37 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/group_min_max.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/group_min_max.result
@@ -1808,10 +1808,10 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	index	NULL	idx_t1_2	147	NULL	128	Using where; Using index
 explain select distinct a1 from t1 where a1 in ('a', 'd') and a2 = 'b';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	130	NULL	5	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	130	NULL	5	Using where; Using index for group-by
 explain select distinct a1 from t1 where a1 in ('a', 'd') and a2 = 'e';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	130	NULL	5	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	130	NULL	5	Using where; Using index for group-by
 explain select distinct a1,a2,b from t2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	NULL	idx_t2_2	146	NULL	#	Using index for group-by
@@ -1978,10 +1978,10 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	NULL	idx_t1_2	147	NULL	17	Using where; Using index for group-by; Using temporary; Using filesort
 explain select distinct a1 from t1 where a1 in ('a', 'd') and a2 = 'b' group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	130	NULL	5	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	130	NULL	5	Using where; Using index for group-by
 explain select distinct a1 from t1 where a1 in ('a', 'd') and a2 = 'e' group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	130	NULL	5	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	130	NULL	5	Using where; Using index for group-by
 explain select distinct a1,a2,b from t2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	NULL	idx_t2_2	146	NULL	#	Using index for group-by
@@ -2131,7 +2131,7 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	17	Using where; Using index for group-by
 explain select concat(ord(min(b)),ord(max(b))),min(b),max(b) from t1 group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	9	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_2	147	NULL	9	Using index for group-by
 select a1,a2,b, concat(min(c), max(c)) from t1 where a1 < 'd' group by a1,a2,b;
 a1	a2	b	concat(min(c), max(c))
 a	a	a	a111d111
@@ -2285,7 +2285,7 @@ c
 d
 explain select a1 from t1 where a2 = 'b' group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	5	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_2	130	NULL	5	Using where; Using index for group-by
 select a1 from t1 where a2 = 'b' group by a1;
 a1
 a
@@ -2294,7 +2294,7 @@ c
 d
 explain select distinct a1 from t1 where a2 = 'b';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	5	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_2	130	NULL	5	Using where; Using index for group-by
 select distinct a1 from t1 where a2 = 'b';
 a1
 a
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/handler_basic.result b/storage/rocksdb/mysql-test/rocksdb/r/handler_basic.result
index efc6ccde500..4f285134c00 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/handler_basic.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/handler_basic.result
@@ -34,6 +34,7 @@ Handler_read_rnd	0
 Handler_read_rnd_deleted	0
 Handler_read_rnd_next	0
 FLUSH STATUS;
+SET GLOBAL rocksdb_force_flush_memtable_and_lzero_now=1;
 SELECT * FROM t1 WHERE b=6;
 id	a	b
 6	NULL	6
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/i_s.result b/storage/rocksdb/mysql-test/rocksdb/r/i_s.result
new file mode 100644
index 00000000000..6e7368ac5bc
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/i_s.result
@@ -0,0 +1,159 @@
+SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES
+WHERE TABLE_SCHEMA='INFORMATION_SCHEMA'
+    AND TABLE_NAME LIKE 'ROCKSDB%'
+  ORDER BY TABLE_NAME;
+TABLE_NAME
+ROCKSDB_CFSTATS
+ROCKSDB_CF_OPTIONS
+ROCKSDB_COMPACTION_STATS
+ROCKSDB_DBSTATS
+ROCKSDB_DDL
+ROCKSDB_DEADLOCK
+ROCKSDB_GLOBAL_INFO
+ROCKSDB_INDEX_FILE_MAP
+ROCKSDB_LOCKS
+ROCKSDB_PERF_CONTEXT
+ROCKSDB_PERF_CONTEXT_GLOBAL
+ROCKSDB_SST_PROPS
+ROCKSDB_TRX
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_CF_OPTIONS;
+Table	Create Table
+ROCKSDB_CF_OPTIONS	CREATE TEMPORARY TABLE `ROCKSDB_CF_OPTIONS` (
+  `CF_NAME` varchar(193) NOT NULL,
+  `OPTION_TYPE` varchar(193) NOT NULL,
+  `VALUE` varchar(193) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_CFSTATS;
+Table	Create Table
+ROCKSDB_CFSTATS	CREATE TEMPORARY TABLE `ROCKSDB_CFSTATS` (
+  `CF_NAME` varchar(193) NOT NULL,
+  `STAT_TYPE` varchar(193) NOT NULL,
+  `VALUE` bigint(21) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_COMPACTION_STATS;
+Table	Create Table
+ROCKSDB_COMPACTION_STATS	CREATE TEMPORARY TABLE `ROCKSDB_COMPACTION_STATS` (
+  `CF_NAME` varchar(193) NOT NULL,
+  `LEVEL` varchar(513) NOT NULL,
+  `TYPE` varchar(513) NOT NULL,
+  `VALUE` double NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_DBSTATS;
+Table	Create Table
+ROCKSDB_DBSTATS	CREATE TEMPORARY TABLE `ROCKSDB_DBSTATS` (
+  `STAT_TYPE` varchar(193) NOT NULL,
+  `VALUE` bigint(21) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_DDL;
+Table	Create Table
+ROCKSDB_DDL	CREATE TEMPORARY TABLE `ROCKSDB_DDL` (
+  `TABLE_SCHEMA` varchar(193) NOT NULL,
+  `TABLE_NAME` varchar(193) NOT NULL,
+  `PARTITION_NAME` varchar(193),
+  `INDEX_NAME` varchar(193) NOT NULL,
+  `COLUMN_FAMILY` int(11) NOT NULL,
+  `INDEX_NUMBER` int(11) NOT NULL,
+  `INDEX_TYPE` smallint(6) NOT NULL,
+  `KV_FORMAT_VERSION` smallint(6) NOT NULL,
+  `TTL_DURATION` bigint(21) NOT NULL,
+  `INDEX_FLAGS` bigint(21) NOT NULL,
+  `CF` varchar(193) NOT NULL,
+  `AUTO_INCREMENT` bigint(21) unsigned
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_DEADLOCK;
+Table	Create Table
+ROCKSDB_DEADLOCK	CREATE TEMPORARY TABLE `ROCKSDB_DEADLOCK` (
+  `DEADLOCK_ID` bigint(21) NOT NULL,
+  `TIMESTAMP` bigint(21) NOT NULL,
+  `TRANSACTION_ID` bigint(21) NOT NULL,
+  `CF_NAME` varchar(193) NOT NULL,
+  `WAITING_KEY` varchar(513) NOT NULL,
+  `LOCK_TYPE` varchar(193) NOT NULL,
+  `INDEX_NAME` varchar(193) NOT NULL,
+  `TABLE_NAME` varchar(193) NOT NULL,
+  `ROLLED_BACK` bigint(21) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_GLOBAL_INFO;
+Table	Create Table
+ROCKSDB_GLOBAL_INFO	CREATE TEMPORARY TABLE `ROCKSDB_GLOBAL_INFO` (
+  `TYPE` varchar(513) NOT NULL,
+  `NAME` varchar(513) NOT NULL,
+  `VALUE` varchar(513) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_INDEX_FILE_MAP;
+Table	Create Table
+ROCKSDB_INDEX_FILE_MAP	CREATE TEMPORARY TABLE `ROCKSDB_INDEX_FILE_MAP` (
+  `COLUMN_FAMILY` int(11) NOT NULL,
+  `INDEX_NUMBER` int(11) NOT NULL,
+  `SST_NAME` varchar(193) NOT NULL,
+  `NUM_ROWS` bigint(21) NOT NULL,
+  `DATA_SIZE` bigint(21) NOT NULL,
+  `ENTRY_DELETES` bigint(21) NOT NULL,
+  `ENTRY_SINGLEDELETES` bigint(21) NOT NULL,
+  `ENTRY_MERGES` bigint(21) NOT NULL,
+  `ENTRY_OTHERS` bigint(21) NOT NULL,
+  `DISTINCT_KEYS_PREFIX` varchar(800) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_LOCKS;
+Table	Create Table
+ROCKSDB_LOCKS	CREATE TEMPORARY TABLE `ROCKSDB_LOCKS` (
+  `COLUMN_FAMILY_ID` int(11) NOT NULL,
+  `TRANSACTION_ID` int(11) NOT NULL,
+  `KEY` varchar(513) NOT NULL,
+  `MODE` varchar(32) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_PERF_CONTEXT;
+Table	Create Table
+ROCKSDB_PERF_CONTEXT	CREATE TEMPORARY TABLE `ROCKSDB_PERF_CONTEXT` (
+  `TABLE_SCHEMA` varchar(193) NOT NULL,
+  `TABLE_NAME` varchar(193) NOT NULL,
+  `PARTITION_NAME` varchar(193),
+  `STAT_TYPE` varchar(193) NOT NULL,
+  `VALUE` bigint(21) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_PERF_CONTEXT_GLOBAL;
+Table	Create Table
+ROCKSDB_PERF_CONTEXT_GLOBAL	CREATE TEMPORARY TABLE `ROCKSDB_PERF_CONTEXT_GLOBAL` (
+  `STAT_TYPE` varchar(193) NOT NULL,
+  `VALUE` bigint(21) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_SST_PROPS;
+Table	Create Table
+ROCKSDB_SST_PROPS	CREATE TEMPORARY TABLE `ROCKSDB_SST_PROPS` (
+  `SST_NAME` varchar(193) NOT NULL,
+  `COLUMN_FAMILY` int(11) NOT NULL,
+  `DATA_BLOCKS` bigint(21) NOT NULL,
+  `ENTRIES` bigint(21) NOT NULL,
+  `RAW_KEY_SIZE` bigint(21) NOT NULL,
+  `RAW_VALUE_SIZE` bigint(21) NOT NULL,
+  `DATA_BLOCK_SIZE` bigint(21) NOT NULL,
+  `INDEX_BLOCK_SIZE` bigint(21) NOT NULL,
+  `INDEX_PARTITIONS` int(11) NOT NULL,
+  `TOP_LEVEL_INDEX_SIZE` bigint(21) NOT NULL,
+  `FILTER_BLOCK_SIZE` bigint(21) NOT NULL,
+  `COMPRESSION_ALGO` varchar(193) NOT NULL,
+  `CREATION_TIME` bigint(21) NOT NULL,
+  `FILE_CREATION_TIME` bigint(21) NOT NULL,
+  `OLDEST_KEY_TIME` bigint(21) NOT NULL,
+  `FILTER_POLICY` varchar(193) NOT NULL,
+  `COMPRESSION_OPTIONS` varchar(193) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_TRX;
+Table	Create Table
+ROCKSDB_TRX	CREATE TEMPORARY TABLE `ROCKSDB_TRX` (
+  `TRANSACTION_ID` bigint(21) NOT NULL,
+  `STATE` varchar(193) NOT NULL,
+  `NAME` varchar(193) NOT NULL,
+  `WRITE_COUNT` bigint(21) NOT NULL,
+  `LOCK_COUNT` bigint(21) NOT NULL,
+  `TIMEOUT_SEC` int(11) NOT NULL,
+  `WAITING_KEY` varchar(513) NOT NULL,
+  `WAITING_COLUMN_FAMILY_ID` int(11) NOT NULL,
+  `IS_REPLICATION` int(11) NOT NULL,
+  `SKIP_TRX_API` int(11) NOT NULL,
+  `READ_ONLY` int(11) NOT NULL,
+  `HAS_DEADLOCK_DETECTION` int(11) NOT NULL,
+  `NUM_ONGOING_BULKLOAD` int(11) NOT NULL,
+  `THREAD_ID` int(11) NOT NULL,
+  `QUERY` varchar(193) NOT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/i_s_deadlock.result b/storage/rocksdb/mysql-test/rocksdb/r/i_s_deadlock.result
index 7824839808e..4591d64ee46 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/i_s_deadlock.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/i_s_deadlock.result
@@ -13,15 +13,15 @@ connection default;
 show create table information_schema.rocksdb_deadlock;
 Table	Create Table
 ROCKSDB_DEADLOCK	CREATE TEMPORARY TABLE `ROCKSDB_DEADLOCK` (
-  `DEADLOCK_ID` bigint(8) NOT NULL,
-  `TIMESTAMP` bigint(8) NOT NULL,
-  `TRANSACTION_ID` bigint(8) NOT NULL,
+  `DEADLOCK_ID` bigint(21) NOT NULL,
+  `TIMESTAMP` bigint(21) NOT NULL,
+  `TRANSACTION_ID` bigint(21) NOT NULL,
   `CF_NAME` varchar(193) NOT NULL,
   `WAITING_KEY` varchar(513) NOT NULL,
   `LOCK_TYPE` varchar(193) NOT NULL,
   `INDEX_NAME` varchar(193) NOT NULL,
   `TABLE_NAME` varchar(193) NOT NULL,
-  `ROLLED_BACK` bigint(8) NOT NULL
+  `ROLLED_BACK` bigint(21) NOT NULL
 ) ENGINE=MEMORY DEFAULT CHARSET=utf8
 create table t (i int primary key) engine=rocksdb;
 insert into t values (1), (2), (3);
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/index_merge_rocksdb2.result b/storage/rocksdb/mysql-test/rocksdb/r/index_merge_rocksdb2.result
index efe4eaf2141..9de77014593 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/index_merge_rocksdb2.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/index_merge_rocksdb2.result
@@ -287,7 +287,7 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 NULL	UNION RESULT	<union1,2>	ALL	NULL	NULL	NULL	NULL	NULL	
 explain select * from (select * from t1 where key1 = 3 or key2 =3) as Z where key8 >5;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index_merge	i1,i2,i8	i1,i2	4,4	NULL	4	Using union(i1,i2); Using where
+1	SIMPLE	t1	range	i1,i2,i8	i8	4	NULL	2	Using index condition; Using where
 create table t3 like t0;
 insert into t3 select * from t0;
 alter table t3 add key9 int not null, add index i9(key9);
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result
index 578a26b3e4f..ac6e2bcc633 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result
@@ -17,257 +17,241 @@ page_size	buffer_pool_instance	pages_used	pages_free	relocation_ops	relocation_t
 SELECT * FROM INFORMATION_SCHEMA.INNODB_CMPMEM_RESET;
 page_size	buffer_pool_instance	pages_used	pages_free	relocation_ops	relocation_time
 SELECT * FROM INFORMATION_SCHEMA.INNODB_METRICS;
-NAME	SUBSYSTEM	COUNT	MAX_COUNT	MIN_COUNT	AVG_COUNT	COUNT_RESET	MAX_COUNT_RESET	MIN_COUNT_RESET	AVG_COUNT_RESET	TIME_ENABLED	TIME_DISABLED	TIME_ELAPSED	TIME_RESET	STATUS	TYPE	COMMENT
-metadata_table_handles_opened	metadata	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of table handles opened
-metadata_table_handles_closed	metadata	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of table handles closed
-metadata_table_reference_count	metadata	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Table reference counter
-lock_deadlocks	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of deadlocks
-lock_timeouts	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of lock timeouts
-lock_rec_lock_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times enqueued into record lock wait queue
-lock_table_lock_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times enqueued into table lock wait queue
-lock_rec_lock_requests	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of record locks requested
-lock_rec_lock_created	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of record locks created
-lock_rec_lock_removed	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of record locks removed from the lock queue
-lock_rec_locks	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Current number of record locks on tables
-lock_table_lock_created	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of table locks created
-lock_table_lock_removed	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of table locks removed from the lock queue
-lock_table_locks	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Current number of table locks on tables
-lock_row_lock_current_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of row locks currently being waited for (innodb_row_lock_current_waits)
-lock_row_lock_time	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Time spent in acquiring row locks, in milliseconds (innodb_row_lock_time)
-lock_row_lock_time_max	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	The maximum time to acquire a row lock, in milliseconds (innodb_row_lock_time_max)
-lock_row_lock_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of times a row lock had to be waited for (innodb_row_lock_waits)
-lock_row_lock_time_avg	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	The average time to acquire a row lock, in milliseconds (innodb_row_lock_time_avg)
-buffer_pool_size	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Server buffer pool size (all buffer pools) in bytes
-buffer_pool_reads	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of reads directly from disk (innodb_buffer_pool_reads)
-buffer_pool_read_requests	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of logical read requests (innodb_buffer_pool_read_requests)
-buffer_pool_write_requests	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of write requests (innodb_buffer_pool_write_requests)
-buffer_pool_wait_free	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of times waited for free buffer (innodb_buffer_pool_wait_free)
-buffer_pool_read_ahead	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pages read as read ahead (innodb_buffer_pool_read_ahead)
-buffer_pool_read_ahead_evicted	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Read-ahead pages evicted without being accessed (innodb_buffer_pool_read_ahead_evicted)
-buffer_pool_pages_total	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Total buffer pool size in pages (innodb_buffer_pool_pages_total)
-buffer_pool_pages_misc	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Buffer pages for misc use such as row locks or the adaptive hash index (innodb_buffer_pool_pages_misc)
-buffer_pool_pages_data	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Buffer pages containing data (innodb_buffer_pool_pages_data)
-buffer_pool_bytes_data	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Buffer bytes containing data (innodb_buffer_pool_bytes_data)
-buffer_pool_pages_dirty	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)
-buffer_pool_bytes_dirty	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)
-buffer_pool_pages_free	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Buffer pages currently free (innodb_buffer_pool_pages_free)
-buffer_pages_created	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pages created (innodb_pages_created)
-buffer_pages_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pages written (innodb_pages_written)
-buffer_index_pages_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of index pages written (innodb_index_pages_written)
-buffer_non_index_pages_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of non index pages written (innodb_non_index_pages_written)
-buffer_pages_read	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pages read (innodb_pages_read)
-buffer_index_sec_rec_cluster_reads	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of secondary record reads triggered cluster read
-buffer_index_sec_rec_cluster_reads_avoided	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of secondary record reads avoided triggering cluster read
-buffer_data_reads	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Amount of data read in bytes (innodb_data_reads)
-buffer_data_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Amount of data written in bytes (innodb_data_written)
-buffer_flush_batch_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages scanned as part of flush batch
-buffer_flush_batch_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of times buffer flush list flush is called
-buffer_flush_batch_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages scanned per flush batch scan
-buffer_flush_batch_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages flushed as part of flush batch
-buffer_flush_batches	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of flush batches
-buffer_flush_batch_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as a flush batch
-buffer_flush_neighbor_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total neighbors flushed as part of neighbor flush
-buffer_flush_neighbor	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of times neighbors flushing is invoked
-buffer_flush_neighbor_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as a neighbor batch
-buffer_flush_n_to_flush_requested	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages requested for flushing.
-buffer_flush_n_to_flush_by_age	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages target by LSN Age for flushing.
-buffer_flush_adaptive_avg_time_slot	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Avg time (ms) spent for adaptive flushing recently per slot.
-buffer_LRU_batch_flush_avg_time_slot	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Avg time (ms) spent for LRU batch flushing recently per slot.
-buffer_flush_adaptive_avg_time_thread	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Avg time (ms) spent for adaptive flushing recently per thread.
-buffer_LRU_batch_flush_avg_time_thread	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Avg time (ms) spent for LRU batch flushing recently per thread.
-buffer_flush_adaptive_avg_time_est	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Estimated time (ms) spent for adaptive flushing recently.
-buffer_LRU_batch_flush_avg_time_est	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Estimated time (ms) spent for LRU batch flushing recently.
-buffer_flush_avg_time	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Avg time (ms) spent for flushing recently.
-buffer_flush_adaptive_avg_pass	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of adaptive flushes passed during the recent Avg period.
-buffer_LRU_batch_flush_avg_pass	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of LRU batch flushes passed during the recent Avg period.
-buffer_flush_avg_pass	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of flushes passed during the recent Avg period.
-buffer_LRU_get_free_loops	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Total loops in LRU get free.
-buffer_LRU_get_free_waits	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Total sleep waits in LRU get free.
-buffer_flush_avg_page_rate	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Average number of pages at which flushing is happening
-buffer_flush_lsn_avg_rate	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Average redo generation rate
-buffer_flush_pct_for_dirty	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Percent of IO capacity used to avoid max dirty page limit
-buffer_flush_pct_for_lsn	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Percent of IO capacity used to avoid reusable redo space limit
-buffer_flush_sync_waits	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times a wait happens due to sync flushing
-buffer_flush_adaptive_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages flushed as part of adaptive flushing
-buffer_flush_adaptive	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of adaptive batches
-buffer_flush_adaptive_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as an adaptive batch
-buffer_flush_sync_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages flushed as part of sync batches
-buffer_flush_sync	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of sync batches
-buffer_flush_sync_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as a sync batch
-buffer_flush_background_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages flushed as part of background batches
-buffer_flush_background	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of background batches
-buffer_flush_background_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as a background batch
-buffer_LRU_batch_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages scanned as part of LRU batch
-buffer_LRU_batch_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of times LRU batch is called
-buffer_LRU_batch_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages scanned per LRU batch call
-buffer_LRU_batch_flush_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages flushed as part of LRU batches
-buffer_LRU_batches_flush	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of LRU batches
-buffer_LRU_batch_flush_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as an LRU batch
-buffer_LRU_batch_evict_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages evicted as part of LRU batches
-buffer_LRU_batches_evict	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of LRU batches
-buffer_LRU_batch_evict_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Pages queued as an LRU batch
-buffer_LRU_single_flush_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages scanned as part of single page LRU flush
-buffer_LRU_single_flush_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of times single page LRU flush is called
-buffer_LRU_single_flush_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Page scanned per single LRU flush
-buffer_LRU_single_flush_failure_count	Buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times attempt to flush a single page from LRU failed
-buffer_LRU_get_free_search	Buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of searches performed for a clean page
-buffer_LRU_search_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages scanned as part of LRU search
-buffer_LRU_search_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of times LRU search is performed
-buffer_LRU_search_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Page scanned per single LRU search
-buffer_LRU_unzip_search_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_owner	Total pages scanned as part of LRU unzip search
-buffer_LRU_unzip_search_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Number of times LRU unzip search is performed
-buffer_LRU_unzip_search_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	set_member	Page scanned per single LRU unzip search
-buffer_page_read_index_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Index Leaf Pages read
-buffer_page_read_index_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Index Non-leaf Pages read
-buffer_page_read_index_ibuf_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Index Leaf Pages read
-buffer_page_read_index_ibuf_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Index Non-Leaf Pages read
-buffer_page_read_undo_log	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Undo Log Pages read
-buffer_page_read_index_inode	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Index Inode Pages read
-buffer_page_read_ibuf_free_list	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Free List Pages read
-buffer_page_read_ibuf_bitmap	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Bitmap Pages read
-buffer_page_read_system_page	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of System Pages read
-buffer_page_read_trx_system	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Transaction System Pages read
-buffer_page_read_fsp_hdr	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of File Space Header Pages read
-buffer_page_read_xdes	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Extent Descriptor Pages read
-buffer_page_read_blob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Uncompressed BLOB Pages read
-buffer_page_read_zblob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of First Compressed BLOB Pages read
-buffer_page_read_zblob2	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Subsequent Compressed BLOB Pages read
-buffer_page_read_other	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of other/unknown (old version of InnoDB) Pages read
-buffer_page_written_index_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Index Leaf Pages written
-buffer_page_written_index_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Index Non-leaf Pages written
-buffer_page_written_index_ibuf_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Index Leaf Pages written
-buffer_page_written_index_ibuf_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Index Non-Leaf Pages written
-buffer_page_written_undo_log	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Undo Log Pages written
-buffer_page_written_index_inode	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Index Inode Pages written
-buffer_page_written_ibuf_free_list	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Free List Pages written
-buffer_page_written_ibuf_bitmap	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Insert Buffer Bitmap Pages written
-buffer_page_written_system_page	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of System Pages written
-buffer_page_written_trx_system	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Transaction System Pages written
-buffer_page_written_fsp_hdr	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of File Space Header Pages written
-buffer_page_written_xdes	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Extent Descriptor Pages written
-buffer_page_written_blob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Uncompressed BLOB Pages written
-buffer_page_written_zblob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of First Compressed BLOB Pages written
-buffer_page_written_zblob2	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Subsequent Compressed BLOB Pages written
-buffer_page_written_other	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of other/unknown (old version InnoDB) Pages written
-os_data_reads	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of reads initiated (innodb_data_reads)
-os_data_writes	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of writes initiated (innodb_data_writes)
-os_data_fsyncs	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of fsync() calls (innodb_data_fsyncs)
-os_pending_reads	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of reads pending
-os_pending_writes	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of writes pending
-os_log_bytes_written	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Bytes of log written (innodb_os_log_written)
-os_log_fsyncs	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of fsync log writes (innodb_os_log_fsyncs)
-os_log_pending_fsyncs	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pending fsync write (innodb_os_log_pending_fsyncs)
-os_log_pending_writes	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pending log file writes (innodb_os_log_pending_writes)
-trx_rw_commits	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of read-write transactions  committed
-trx_ro_commits	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of read-only transactions committed
-trx_nl_ro_commits	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of non-locking auto-commit read-only transactions committed
-trx_commits_insert_update	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of transactions committed with inserts and updates
-trx_rollbacks	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of transactions rolled back
-trx_rollbacks_savepoint	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of transactions rolled back to savepoint
-trx_active_transactions	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of active transactions
-trx_rseg_history_len	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Length of the TRX_RSEG_HISTORY list
-trx_undo_slots_used	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of undo slots used
-trx_undo_slots_cached	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of undo slots cached
-trx_rseg_current_size	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Current rollback segment size in pages
-purge_del_mark_records	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of delete-marked rows purged
-purge_upd_exist_or_extern_records	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of purges on updates of existing records and updates on delete marked record with externally stored field
-purge_invoked	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times purge was invoked
-purge_undo_log_pages	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of undo log pages handled by the purge
-purge_dml_delay_usec	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Microseconds DML to be delayed due to purge lagging
-purge_stop_count	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Number of times purge was stopped
-purge_resume_count	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Number of times purge was resumed
-log_checkpoints	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of checkpoints
-log_lsn_last_flush	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	LSN of Last flush
-log_lsn_last_checkpoint	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	LSN at last checkpoint
-log_lsn_current	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Current LSN value
-log_lsn_checkpoint_age	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Current LSN value minus LSN at last checkpoint
-log_lsn_buf_pool_oldest	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	The oldest modified block LSN in the buffer pool
-log_max_modified_age_async	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Maximum LSN difference; when exceeded, start asynchronous preflush
-log_max_modified_age_sync	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Maximum LSN difference; when exceeded, start synchronous preflush
-log_pending_log_flushes	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Pending log flushes
-log_pending_checkpoint_writes	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Pending checkpoints
-log_num_log_io	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Number of log I/Os
-log_waits	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of log waits due to small log buffer (innodb_log_waits)
-log_write_requests	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of log write requests (innodb_log_write_requests)
-log_writes	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of log writes (innodb_log_writes)
-log_padded	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Bytes of log padded for log write ahead
-compress_pages_compressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages compressed
-compress_pages_decompressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages decompressed
-compression_pad_increments	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times padding is incremented to avoid compression failures
-compression_pad_decrements	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times padding is decremented due to good compressibility
-compress_saved	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of bytes saved by page compression
-compress_pages_page_compressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages compressed by page compression
-compress_page_compressed_trim_op	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of TRIM operation performed by page compression
-compress_pages_page_decompressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages decompressed by page compression
-compress_pages_page_compression_error	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of page compression errors
-compress_pages_encrypted	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages encrypted
-compress_pages_decrypted	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of pages decrypted
-index_page_splits	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of index page splits
-index_page_merge_attempts	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of index page merge attempts
-index_page_merge_successful	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of successful index page merges
-index_page_reorg_attempts	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of index page reorganization attempts
-index_page_reorg_successful	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of successful index page reorganizations
-index_page_discards	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of index pages discarded
-adaptive_hash_searches	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of successful searches using Adaptive Hash Index
-adaptive_hash_searches_btree	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of searches using B-tree on an index search
-adaptive_hash_pages_added	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of index pages on which the Adaptive Hash Index is built
-adaptive_hash_pages_removed	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of index pages whose corresponding Adaptive Hash Index entries were removed
-adaptive_hash_rows_added	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Adaptive Hash Index rows added
-adaptive_hash_rows_removed	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Adaptive Hash Index rows removed
-adaptive_hash_rows_deleted_no_hash_entry	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of rows deleted that did not have corresponding Adaptive Hash Index entries
-adaptive_hash_rows_updated	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of Adaptive Hash Index rows updated
-file_num_open_files	file_system	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	Number of files currently open (innodb_num_open_files)
-ibuf_merges_insert	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of inserted records merged by change buffering
-ibuf_merges_delete_mark	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of deleted records merged by change buffering
-ibuf_merges_delete	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of purge records merged by change buffering
-ibuf_merges_discard_insert	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of insert merged operations discarded
-ibuf_merges_discard_delete_mark	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of deleted merged operations discarded
-ibuf_merges_discard_delete	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of purge merged  operations discarded
-ibuf_merges	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of change buffer merges
-ibuf_size	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Change buffer size in pages
-innodb_master_thread_sleeps	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times (seconds) master thread sleeps
-innodb_activity_count	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Current server activity count
-innodb_master_active_loops	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times master thread performs its tasks when server is active
-innodb_master_idle_loops	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of times master thread performs its tasks when server is idle
-innodb_background_drop_table_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent to process drop table list
-innodb_ibuf_merge_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent to process change buffer merge
-innodb_log_flush_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent to flush log records
-innodb_mem_validate_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent to do memory validation
-innodb_master_purge_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent by master thread to purge records
-innodb_dict_lru_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent to process DICT LRU list
-innodb_dict_lru_count_active	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of tables evicted from DICT LRU list in the active loop
-innodb_dict_lru_count_idle	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of tables evicted from DICT LRU list in the idle loop
-innodb_checkpoint_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Time (in microseconds) spent by master thread to do checkpoint
-innodb_dblwr_writes	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of doublewrite operations that have been performed (innodb_dblwr_writes)
-innodb_dblwr_pages_written	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of pages that have been written for doublewrite operations (innodb_dblwr_pages_written)
-innodb_page_size	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	value	InnoDB page size in bytes (innodb_page_size)
-innodb_rwlock_s_spin_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rwlock spin waits due to shared latch request
-innodb_rwlock_x_spin_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rwlock spin waits due to exclusive latch request
-innodb_rwlock_sx_spin_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rwlock spin waits due to sx latch request
-innodb_rwlock_s_spin_rounds	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rwlock spin loop rounds due to shared latch request
-innodb_rwlock_x_spin_rounds	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rwlock spin loop rounds due to exclusive latch request
-innodb_rwlock_sx_spin_rounds	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rwlock spin loop rounds due to sx latch request
-innodb_rwlock_s_os_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of OS waits due to shared latch request
-innodb_rwlock_x_os_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of OS waits due to exclusive latch request
-innodb_rwlock_sx_os_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of OS waits due to sx latch request
-dml_reads	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rows read
-dml_inserts	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rows inserted
-dml_deletes	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rows deleted
-dml_updates	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of rows updated
-dml_system_reads	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of system rows read
-dml_system_inserts	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of system rows inserted
-dml_system_deletes	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of system rows deleted
-dml_system_updates	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	status_counter	Number of system rows updated
-ddl_background_drop_indexes	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of indexes waiting to be dropped after failed index creation
-ddl_background_drop_tables	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of tables in background drop table list
-ddl_online_create_index	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of indexes being created online
-ddl_pending_alter_table	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress
-ddl_sort_file_alter_table	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of sort files created during alter table
-ddl_log_file_alter_table	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of log files created during alter table
-icp_attempts	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Number of attempts for index push-down condition checks
-icp_no_match	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Index push-down condition does not match
-icp_out_of_range	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Index push-down condition out of range
-icp_match	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	disabled	counter	Index push-down condition matches
+NAME	SUBSYSTEM	COUNT	MAX_COUNT	MIN_COUNT	AVG_COUNT	COUNT_RESET	MAX_COUNT_RESET	MIN_COUNT_RESET	AVG_COUNT_RESET	TIME_ENABLED	TIME_DISABLED	TIME_ELAPSED	TIME_RESET	ENABLED	TYPE	COMMENT
+metadata_table_handles_opened	metadata	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of table handles opened
+metadata_table_handles_closed	metadata	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of table handles closed
+metadata_table_reference_count	metadata	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Table reference counter
+lock_deadlocks	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of deadlocks
+lock_timeouts	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of lock timeouts
+lock_rec_lock_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times enqueued into record lock wait queue
+lock_table_lock_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times enqueued into table lock wait queue
+lock_rec_lock_requests	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of record locks requested
+lock_rec_lock_created	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of record locks created
+lock_rec_lock_removed	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of record locks removed from the lock queue
+lock_rec_locks	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Current number of record locks on tables
+lock_table_lock_created	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of table locks created
+lock_table_lock_removed	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of table locks removed from the lock queue
+lock_table_locks	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Current number of table locks on tables
+lock_row_lock_current_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of row locks currently being waited for (innodb_row_lock_current_waits)
+lock_row_lock_time	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Time spent in acquiring row locks, in milliseconds (innodb_row_lock_time)
+lock_row_lock_time_max	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	The maximum time to acquire a row lock, in milliseconds (innodb_row_lock_time_max)
+lock_row_lock_waits	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of times a row lock had to be waited for (innodb_row_lock_waits)
+lock_row_lock_time_avg	lock	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	The average time to acquire a row lock, in milliseconds (innodb_row_lock_time_avg)
+buffer_pool_size	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Server buffer pool size (all buffer pools) in bytes
+buffer_pool_reads	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of reads directly from disk (innodb_buffer_pool_reads)
+buffer_pool_read_requests	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of logical read requests (innodb_buffer_pool_read_requests)
+buffer_pool_write_requests	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of write requests (innodb_buffer_pool_write_requests)
+buffer_pool_wait_free	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of times waited for free buffer (innodb_buffer_pool_wait_free)
+buffer_pool_read_ahead	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pages read as read ahead (innodb_buffer_pool_read_ahead)
+buffer_pool_read_ahead_evicted	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Read-ahead pages evicted without being accessed (innodb_buffer_pool_read_ahead_evicted)
+buffer_pool_pages_total	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Total buffer pool size in pages (innodb_buffer_pool_pages_total)
+buffer_pool_pages_misc	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Buffer pages for misc use such as row locks or the adaptive hash index (innodb_buffer_pool_pages_misc)
+buffer_pool_pages_data	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Buffer pages containing data (innodb_buffer_pool_pages_data)
+buffer_pool_bytes_data	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Buffer bytes containing data (innodb_buffer_pool_bytes_data)
+buffer_pool_pages_dirty	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)
+buffer_pool_bytes_dirty	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)
+buffer_pool_pages_free	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Buffer pages currently free (innodb_buffer_pool_pages_free)
+buffer_pages_created	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pages created (innodb_pages_created)
+buffer_pages_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pages written (innodb_pages_written)
+buffer_index_pages_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of index pages written (innodb_index_pages_written)
+buffer_non_index_pages_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of non index pages written (innodb_non_index_pages_written)
+buffer_pages_read	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pages read (innodb_pages_read)
+buffer_index_sec_rec_cluster_reads	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of secondary record reads triggered cluster read
+buffer_index_sec_rec_cluster_reads_avoided	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of secondary record reads avoided triggering cluster read
+buffer_data_reads	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Amount of data read in bytes (innodb_data_reads)
+buffer_data_written	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Amount of data written in bytes (innodb_data_written)
+buffer_flush_batch_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages scanned as part of flush batch
+buffer_flush_batch_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of times buffer flush list flush is called
+buffer_flush_batch_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages scanned per flush batch scan
+buffer_flush_batch_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages flushed as part of flush batch
+buffer_flush_batches	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of flush batches
+buffer_flush_batch_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as a flush batch
+buffer_flush_neighbor_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total neighbors flushed as part of neighbor flush
+buffer_flush_neighbor	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of times neighbors flushing is invoked
+buffer_flush_neighbor_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as a neighbor batch
+buffer_flush_n_to_flush_requested	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages requested for flushing.
+buffer_flush_n_to_flush_by_age	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages target by LSN Age for flushing.
+buffer_flush_adaptive_avg_time	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Avg time (ms) spent for adaptive flushing recently.
+buffer_flush_adaptive_avg_pass	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of adaptive flushes passed during the recent Avg period.
+buffer_LRU_get_free_loops	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Total loops in LRU get free.
+buffer_LRU_get_free_waits	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Total sleep waits in LRU get free.
+buffer_flush_avg_page_rate	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Average number of pages at which flushing is happening
+buffer_flush_lsn_avg_rate	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Average redo generation rate
+buffer_flush_pct_for_dirty	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Percent of IO capacity used to avoid max dirty page limit
+buffer_flush_pct_for_lsn	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Percent of IO capacity used to avoid reusable redo space limit
+buffer_flush_sync_waits	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times a wait happens due to sync flushing
+buffer_flush_adaptive_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages flushed as part of adaptive flushing
+buffer_flush_adaptive	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of adaptive batches
+buffer_flush_adaptive_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as an adaptive batch
+buffer_flush_sync_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages flushed as part of sync batches
+buffer_flush_sync	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of sync batches
+buffer_flush_sync_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as a sync batch
+buffer_flush_background_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages flushed as part of background batches
+buffer_flush_background	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of background batches
+buffer_flush_background_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as a background batch
+buffer_LRU_batch_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages scanned as part of LRU batch
+buffer_LRU_batch_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of times LRU batch is called
+buffer_LRU_batch_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages scanned per LRU batch call
+buffer_LRU_batch_flush_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages flushed as part of LRU batches
+buffer_LRU_batches_flush	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of LRU batches
+buffer_LRU_batch_flush_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as an LRU batch
+buffer_LRU_batch_evict_total_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages evicted as part of LRU batches
+buffer_LRU_batches_evict	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of LRU batches
+buffer_LRU_batch_evict_pages	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Pages queued as an LRU batch
+buffer_LRU_single_flush_failure_count	Buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times attempt to flush a single page from LRU failed
+buffer_LRU_get_free_search	Buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of searches performed for a clean page
+buffer_LRU_search_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages scanned as part of LRU search
+buffer_LRU_search_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of times LRU search is performed
+buffer_LRU_search_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Page scanned per single LRU search
+buffer_LRU_unzip_search_scanned	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_owner	Total pages scanned as part of LRU unzip search
+buffer_LRU_unzip_search_num_scan	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Number of times LRU unzip search is performed
+buffer_LRU_unzip_search_scanned_per_call	buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	set_member	Page scanned per single LRU unzip search
+buffer_page_read_index_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Index Leaf Pages read
+buffer_page_read_index_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Index Non-leaf Pages read
+buffer_page_read_index_ibuf_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Index Leaf Pages read
+buffer_page_read_index_ibuf_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Index Non-Leaf Pages read
+buffer_page_read_undo_log	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Undo Log Pages read
+buffer_page_read_index_inode	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Index Inode Pages read
+buffer_page_read_ibuf_free_list	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Free List Pages read
+buffer_page_read_ibuf_bitmap	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Bitmap Pages read
+buffer_page_read_system_page	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of System Pages read
+buffer_page_read_trx_system	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Transaction System Pages read
+buffer_page_read_fsp_hdr	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of File Space Header Pages read
+buffer_page_read_xdes	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Extent Descriptor Pages read
+buffer_page_read_blob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Uncompressed BLOB Pages read
+buffer_page_read_zblob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of First Compressed BLOB Pages read
+buffer_page_read_zblob2	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Subsequent Compressed BLOB Pages read
+buffer_page_read_other	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of other/unknown (old version of InnoDB) Pages read
+buffer_page_written_index_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Index Leaf Pages written
+buffer_page_written_index_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Index Non-leaf Pages written
+buffer_page_written_index_ibuf_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Index Leaf Pages written
+buffer_page_written_index_ibuf_non_leaf	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Index Non-Leaf Pages written
+buffer_page_written_undo_log	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Undo Log Pages written
+buffer_page_written_index_inode	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Index Inode Pages written
+buffer_page_written_ibuf_free_list	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Free List Pages written
+buffer_page_written_ibuf_bitmap	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Insert Buffer Bitmap Pages written
+buffer_page_written_system_page	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of System Pages written
+buffer_page_written_trx_system	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Transaction System Pages written
+buffer_page_written_fsp_hdr	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of File Space Header Pages written
+buffer_page_written_xdes	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Extent Descriptor Pages written
+buffer_page_written_blob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Uncompressed BLOB Pages written
+buffer_page_written_zblob	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of First Compressed BLOB Pages written
+buffer_page_written_zblob2	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Subsequent Compressed BLOB Pages written
+buffer_page_written_other	buffer_page_io	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of other/unknown (old version InnoDB) Pages written
+os_data_reads	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of reads initiated (innodb_data_reads)
+os_data_writes	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of writes initiated (innodb_data_writes)
+os_data_fsyncs	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of fsync() calls (innodb_data_fsyncs)
+os_pending_reads	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of reads pending
+os_pending_writes	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of writes pending
+os_log_bytes_written	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Bytes of log written (innodb_os_log_written)
+os_log_fsyncs	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of fsync log writes (innodb_os_log_fsyncs)
+os_log_pending_fsyncs	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pending fsync write (innodb_os_log_pending_fsyncs)
+os_log_pending_writes	os	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pending log file writes (innodb_os_log_pending_writes)
+trx_rw_commits	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of read-write transactions  committed
+trx_ro_commits	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of read-only transactions committed
+trx_nl_ro_commits	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of non-locking auto-commit read-only transactions committed
+trx_commits_insert_update	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of transactions committed with inserts and updates
+trx_rollbacks	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of transactions rolled back
+trx_rollbacks_savepoint	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of transactions rolled back to savepoint
+trx_active_transactions	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of active transactions
+trx_rseg_history_len	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Length of the TRX_RSEG_HISTORY list
+trx_undo_slots_used	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of undo slots used
+trx_undo_slots_cached	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of undo slots cached
+trx_rseg_current_size	transaction	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Current rollback segment size in pages
+purge_del_mark_records	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of delete-marked rows purged
+purge_upd_exist_or_extern_records	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of purges on updates of existing records and updates on delete marked record with externally stored field
+purge_invoked	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times purge was invoked
+purge_undo_log_pages	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of undo log pages handled by the purge
+purge_dml_delay_usec	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Microseconds DML to be delayed due to purge lagging
+purge_stop_count	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Number of times purge was stopped
+purge_resume_count	purge	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Number of times purge was resumed
+log_checkpoints	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of checkpoints
+log_lsn_last_flush	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	LSN of Last flush
+log_lsn_last_checkpoint	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	LSN at last checkpoint
+log_lsn_current	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Current LSN value
+log_lsn_checkpoint_age	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Current LSN value minus LSN at last checkpoint
+log_lsn_buf_pool_oldest	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	The oldest modified block LSN in the buffer pool
+log_max_modified_age_async	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Maximum LSN difference; when exceeded, start asynchronous preflush
+log_pending_log_flushes	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Pending log flushes
+log_pending_checkpoint_writes	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Pending checkpoints
+log_num_log_io	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Number of log I/Os
+log_waits	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of log waits due to small log buffer (innodb_log_waits)
+log_write_requests	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of log write requests (innodb_log_write_requests)
+log_writes	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of log writes (innodb_log_writes)
+log_padded	recovery	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Bytes of log padded for log write ahead
+compress_pages_compressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages compressed
+compress_pages_decompressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages decompressed
+compression_pad_increments	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times padding is incremented to avoid compression failures
+compression_pad_decrements	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times padding is decremented due to good compressibility
+compress_saved	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of bytes saved by page compression
+compress_pages_page_compressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages compressed by page compression
+compress_page_compressed_trim_op	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of TRIM operation performed by page compression
+compress_pages_page_decompressed	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages decompressed by page compression
+compress_pages_page_compression_error	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of page compression errors
+compress_pages_encrypted	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages encrypted
+compress_pages_decrypted	compression	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of pages decrypted
+index_page_splits	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of index page splits
+index_page_merge_attempts	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of index page merge attempts
+index_page_merge_successful	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of successful index page merges
+index_page_reorg_attempts	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of index page reorganization attempts
+index_page_reorg_successful	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of successful index page reorganizations
+index_page_discards	index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of index pages discarded
+adaptive_hash_searches	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of successful searches using Adaptive Hash Index
+adaptive_hash_searches_btree	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of searches using B-tree on an index search
+adaptive_hash_pages_added	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of index pages on which the Adaptive Hash Index is built
+adaptive_hash_pages_removed	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of index pages whose corresponding Adaptive Hash Index entries were removed
+adaptive_hash_rows_added	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Adaptive Hash Index rows added
+adaptive_hash_rows_removed	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Adaptive Hash Index rows removed
+adaptive_hash_rows_deleted_no_hash_entry	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of rows deleted that did not have corresponding Adaptive Hash Index entries
+adaptive_hash_rows_updated	adaptive_hash_index	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of Adaptive Hash Index rows updated
+file_num_open_files	file_system	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	Number of files currently open (innodb_num_open_files)
+ibuf_merges_insert	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of inserted records merged by change buffering
+ibuf_merges_delete_mark	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of deleted records merged by change buffering
+ibuf_merges_delete	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of purge records merged by change buffering
+ibuf_merges_discard_insert	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of insert merged operations discarded
+ibuf_merges_discard_delete_mark	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of deleted merged operations discarded
+ibuf_merges_discard_delete	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of purge merged  operations discarded
+ibuf_merges	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of change buffer merges
+ibuf_size	change_buffer	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Change buffer size in pages
+innodb_master_thread_sleeps	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times (seconds) master thread sleeps
+innodb_activity_count	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Current server activity count
+innodb_master_active_loops	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times master thread performs its tasks when server is active
+innodb_master_idle_loops	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of times master thread performs its tasks when server is idle
+innodb_background_drop_table_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Time (in microseconds) spent to process drop table list
+innodb_log_flush_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Time (in microseconds) spent to flush log records
+innodb_dict_lru_usec	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Time (in microseconds) spent to process DICT LRU list
+innodb_dict_lru_count_active	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of tables evicted from DICT LRU list in the active loop
+innodb_dict_lru_count_idle	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of tables evicted from DICT LRU list in the idle loop
+innodb_dblwr_writes	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of doublewrite operations that have been performed (innodb_dblwr_writes)
+innodb_dblwr_pages_written	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of pages that have been written for doublewrite operations (innodb_dblwr_pages_written)
+innodb_page_size	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	value	InnoDB page size in bytes (innodb_page_size)
+innodb_rwlock_s_spin_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rwlock spin waits due to shared latch request
+innodb_rwlock_x_spin_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rwlock spin waits due to exclusive latch request
+innodb_rwlock_sx_spin_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rwlock spin waits due to sx latch request
+innodb_rwlock_s_spin_rounds	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rwlock spin loop rounds due to shared latch request
+innodb_rwlock_x_spin_rounds	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rwlock spin loop rounds due to exclusive latch request
+innodb_rwlock_sx_spin_rounds	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rwlock spin loop rounds due to sx latch request
+innodb_rwlock_s_os_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of OS waits due to shared latch request
+innodb_rwlock_x_os_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of OS waits due to exclusive latch request
+innodb_rwlock_sx_os_waits	server	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of OS waits due to sx latch request
+dml_reads	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rows read
+dml_inserts	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rows inserted
+dml_deletes	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rows deleted
+dml_updates	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of rows updated
+dml_system_reads	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of system rows read
+dml_system_inserts	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of system rows inserted
+dml_system_deletes	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of system rows deleted
+dml_system_updates	dml	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	status_counter	Number of system rows updated
+ddl_background_drop_indexes	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of indexes waiting to be dropped after failed index creation
+ddl_background_drop_tables	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of tables in background drop table list
+ddl_online_create_index	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of indexes being created online
+ddl_pending_alter_table	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress
+ddl_sort_file_alter_table	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of sort files created during alter table
+ddl_log_file_alter_table	ddl	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of log files created during alter table
+icp_attempts	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Number of attempts for index push-down condition checks
+icp_no_match	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Index push-down condition does not match
+icp_out_of_range	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Index push-down condition out of range
+icp_match	icp	0	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0	counter	Index push-down condition matches
 SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD;
 value
 a
@@ -337,6 +321,6 @@ ID	FOR_NAME	REF_NAME	N_COLS	TYPE
 SELECT * FROM INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS;
 ID	FOR_COL_NAME	REF_COL_NAME	POS
 SELECT * FROM INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES;
-SPACE	NAME	FLAG	ROW_FORMAT	PAGE_SIZE	ZIP_PAGE_SIZE	SPACE_TYPE	FS_BLOCK_SIZE	FILE_SIZE	ALLOCATED_SIZE
+SPACE	NAME	FLAG	ROW_FORMAT	PAGE_SIZE	ZIP_PAGE_SIZE	FS_BLOCK_SIZE	FILE_SIZE	ALLOCATED_SIZE
 SELECT * FROM INFORMATION_SCHEMA.INNODB_SYS_DATAFILES;
 SPACE	PATH
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/records_in_range.result b/storage/rocksdb/mysql-test/rocksdb/r/records_in_range.result
index ce3d7d9147e..feeabd7ee79 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/records_in_range.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/records_in_range.result
@@ -67,7 +67,7 @@ Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 700
 explain extended select a,b from t1 where a = 700;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ref	ka	ka	5	const	15000	100.00	
+1	SIMPLE	t1	ALL	ka	NULL	NULL	NULL	20000	75.00	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` where `test`.`t1`.`a` = 700
 explain extended select a from t1 where a in (700, 800);
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result b/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result
index 4f0d68845a3..2dc3d02d4b1 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result
@@ -2634,4 +2634,11 @@ DELETE t2, t1 FROM t2 LEFT JOIN t1 ON t2.id2 = t1.id2 AND t2.id1 = t1.id1 WHERE
 SET rocksdb_bulk_load_size= @save_rocksdb_bulk_load_size;
 SET rocksdb_commit_in_the_middle=0;
 DROP TABLE t1, t2;
+#
+# MDEV-21831: Assertion `length == pack_length()' failed in Field_inet6::sort_string upon
+# INSERT into RocksDB table
+#
+CREATE TABLE t1 (a INET6 NOT NULL, KEY (a)) ENGINE=RocksDB;
+INSERT INTO t1 VALUES ('41::1'),('61::1');
+DROP TABLE t1;
 SET GLOBAL ROCKSDB_PAUSE_BACKGROUND_WORK = @ORIG_PAUSE_BACKGROUND_WORK;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/ttl_rows_examined.result b/storage/rocksdb/mysql-test/rocksdb/r/ttl_rows_examined.result
index b4e718f0f9e..97a2eff5e2a 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/ttl_rows_examined.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/ttl_rows_examined.result
@@ -26,8 +26,8 @@ set debug_sync='now WAIT_FOR parked';
 affected rows: 0
 SHOW PROCESSLIST;
 Id	User	Host	db	Command	Time	State	Info	Progress
-###	###	###	###	Query	###	Init	SHOW PROCESSLIST	0.000
 ###	###	###	###	Query	###	debug sync point: rocksdb.ttl_rows_examined	SELECT * FROM t_re	0.000
+###	###	###	###	Query	###	starting	SHOW PROCESSLIST	0.000
 ###	###	###	###	Sleep	###		NULL	0.000
 affected rows: 3
 set debug_sync='now SIGNAL go';
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/type_float_indexes.result b/storage/rocksdb/mysql-test/rocksdb/r/type_float_indexes.result
index 99d6bbe45b9..ca6ed6d5d18 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/type_float_indexes.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/type_float_indexes.result
@@ -114,7 +114,7 @@ INSERT INTO t1 (f,r,d,dp,pk) VALUES
 (4644,1422.22,466664.999,0.5,5);
 EXPLAIN SELECT DISTINCT d FROM t1 ORDER BY d;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	NULL	d	9	NULL	#	Using index
+1	SIMPLE	t1	range	NULL	d	9	NULL	#	Using index for group-by
 SELECT DISTINCT d FROM t1 ORDER BY d;
 d
 -1
@@ -177,7 +177,7 @@ INSERT INTO t1 (f,r,d,dp,pk) VALUES
 (1.2345,0,0,0,6);
 EXPLAIN SELECT DISTINCT f FROM t1 ORDER BY f;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	NULL	f	5	NULL	#	Using index
+1	SIMPLE	t1	range	NULL	f	5	NULL	#	Using index for group-by
 SELECT DISTINCT f FROM t1 ORDER BY f;
 f
 -1
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/xa.result b/storage/rocksdb/mysql-test/rocksdb/r/xa.result
index 12ae2b474b6..8cb6f39bbac 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/xa.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/xa.result
@@ -1,6 +1,7 @@
-# 
-# MDEV-13155: XA recovery not supported for RocksDB (Just a testcase)
 #
+# MDEV-742 fixes
+#   MDEV-13155: XA recovery not supported for RocksDB
+# as well.
 call mtr.add_suppression("Found .* prepared XA transactions");
 connect  con1,localhost,root,,test;
 DROP TABLE IF EXISTS t1;
@@ -15,19 +16,55 @@ INSERT INTO t1 (a) VALUES (3);
 INSERT INTO t1 (a) VALUES (4);
 XA END 'xa2';
 XA PREPARE 'xa2';
+connect  con3,localhost,root,,test;
+XA START 'xa3';
+INSERT INTO t1 (a) VALUES (5);
+INSERT INTO t1 (a) VALUES (6);
+XA END 'xa3';
+XA PREPARE 'xa3';
+disconnect con3;
 connection default;
 SELECT * FROM t1;
 a
+Must be all three XA:s in
+XA RECOVER;
+formatID	gtrid_length	bqual_length	data
+1	3	0	xa3
+1	3	0	xa1
+1	3	0	xa2
 # restart
 connect  con3,localhost,root,,test;
 XA RECOVER;
 formatID	gtrid_length	bqual_length	data
+1	3	0	xa3
 1	3	0	xa1
 1	3	0	xa2
 XA ROLLBACK 'xa1';
 XA COMMIT 'xa2';
+XA ROLLBACK 'xa3';
+SELECT a FROM t1;
+a
+3
+4
+connect  con4,localhost,root,,test;
+XA START 'xa4';
+INSERT INTO t1 (a) VALUES (7);
+INSERT INTO t1 (a) VALUES (8);
+XA END 'xa4';
+XA PREPARE 'xa4';
+connection default;
+# Now restart through graceful shutdown
+# restart
+connect  con5,localhost,root,,test;
+Must have 'xa4'
+XA RECOVER;
+formatID	gtrid_length	bqual_length	data
+1	3	0	xa4
+XA COMMIT 'xa4';
 SELECT a FROM t1;
 a
 3
 4
+7
+8
 DROP TABLE t1;
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/binlog_rotate_crash.test b/storage/rocksdb/mysql-test/rocksdb/t/binlog_rotate_crash.test
new file mode 100644
index 00000000000..dd42001b89e
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/t/binlog_rotate_crash.test
@@ -0,0 +1,31 @@
+--source include/have_rocksdb.inc
+--source include/have_log_bin.inc
+--source include/have_debug_sync.inc
+
+--echo #
+--echo # MDEV-25305: MyRocks: Killing server during RESET MASTER can lose last transactions
+--echo #
+
+set global rocksdb_flush_log_at_trx_commit=1;
+
+create table t1 (a int, b int, key(a)) engine=rocksdb;
+insert into t1 values (1,1),(2,2);
+select * from t1;
+flush tables;
+
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart
+EOF
+
+set @@debug_dbug="+d,crash_after_reset_master";
+
+--disable_reconnect
+--error 0,2013
+RESET MASTER;
+--enable_reconnect
+--source include/wait_until_connected_again.inc
+--echo # Must show the inserted rows:
+select * from t1;
+
+drop table t1;
+
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/disabled.def b/storage/rocksdb/mysql-test/rocksdb/t/disabled.def
index 11c9c08c36e..627d7da4171 100644
--- a/storage/rocksdb/mysql-test/rocksdb/t/disabled.def
+++ b/storage/rocksdb/mysql-test/rocksdb/t/disabled.def
@@ -95,3 +95,4 @@ mysqlbinlog_gtid_skip_empty_trans_rocksdb : MariaRocks: requires GTIDs
 drop_table: Hangs on shutdown
 add_index_inplace: not stable result
 rocksdb_range2 : result difference, update after MDEV-16746 is fixed
+add_index_inplace: FORCE INDEX gives wrong count
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/handler_basic.test b/storage/rocksdb/mysql-test/rocksdb/t/handler_basic.test
index 7b1652c759b..22b5d69780d 100644
--- a/storage/rocksdb/mysql-test/rocksdb/t/handler_basic.test
+++ b/storage/rocksdb/mysql-test/rocksdb/t/handler_basic.test
@@ -28,6 +28,7 @@ SELECT * FROM t1 WHERE id=8;
 SHOW SESSION STATUS LIKE 'Handler_read%';
 
 FLUSH STATUS;
+SET GLOBAL rocksdb_force_flush_memtable_and_lzero_now=1;
 SELECT * FROM t1 WHERE b=6;
 SHOW SESSION STATUS LIKE 'Handler_read%';
 
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/i_s.test b/storage/rocksdb/mysql-test/rocksdb/t/i_s.test
new file mode 100644
index 00000000000..c9dc98c9253
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/t/i_s.test
@@ -0,0 +1,21 @@
+--source include/have_rocksdb.inc
+--source include/have_partition.inc
+
+SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES
+  WHERE TABLE_SCHEMA='INFORMATION_SCHEMA'
+    AND TABLE_NAME LIKE 'ROCKSDB%'
+  ORDER BY TABLE_NAME;
+
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_CF_OPTIONS;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_CFSTATS;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_COMPACTION_STATS;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_DBSTATS;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_DDL;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_DEADLOCK;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_GLOBAL_INFO;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_INDEX_FILE_MAP;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_LOCKS;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_PERF_CONTEXT;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_PERF_CONTEXT_GLOBAL;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_SST_PROPS;
+SHOW CREATE TABLE INFORMATION_SCHEMA.ROCKSDB_TRX;
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/issue900.test b/storage/rocksdb/mysql-test/rocksdb/t/issue900.test
index c420d418c20..ce52e0adbf1 100644
--- a/storage/rocksdb/mysql-test/rocksdb/t/issue900.test
+++ b/storage/rocksdb/mysql-test/rocksdb/t/issue900.test
@@ -8,6 +8,6 @@ INSERT INTO t1 VALUES('1','0','1');
 # Would segfault here
 --error ER_DUP_ENTRY
 ALTER TABLE t1 ADD INDEX(c3), ADD UNIQUE (c3);
---error ER_KEY_DOES_NOT_EXITS
+--error ER_KEY_DOES_NOT_EXISTS
 SELECT c3 FROM t1 FORCE INDEX(c3) ORDER BY c3;
 DROP TABLE t1;
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/rocksdb.test b/storage/rocksdb/mysql-test/rocksdb/t/rocksdb.test
index 84a9a5bbd5a..f7de167bd96 100644
--- a/storage/rocksdb/mysql-test/rocksdb/t/rocksdb.test
+++ b/storage/rocksdb/mysql-test/rocksdb/t/rocksdb.test
@@ -1962,5 +1962,13 @@ SET rocksdb_bulk_load_size= @save_rocksdb_bulk_load_size;
 SET rocksdb_commit_in_the_middle=0;
 DROP TABLE t1, t2;
 
+--echo #
+--echo # MDEV-21831: Assertion `length == pack_length()' failed in Field_inet6::sort_string upon
+--echo # INSERT into RocksDB table
+--echo #
+
+CREATE TABLE t1 (a INET6 NOT NULL, KEY (a)) ENGINE=RocksDB;
+INSERT INTO t1 VALUES ('41::1'),('61::1');
+DROP TABLE t1;
 
 SET GLOBAL ROCKSDB_PAUSE_BACKGROUND_WORK = @ORIG_PAUSE_BACKGROUND_WORK;
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/rqg.inc b/storage/rocksdb/mysql-test/rocksdb/t/rqg.inc
index 40154d9eaa7..0f3246de06f 100644
--- a/storage/rocksdb/mysql-test/rocksdb/t/rqg.inc
+++ b/storage/rocksdb/mysql-test/rocksdb/t/rqg.inc
@@ -25,8 +25,8 @@ foreach $grammar_file (split(/ /, $ENV{'GRAMMAR_FILES'})) {
 
   # Errors from the gentest.pl file will be captured in the results file
   my $cmd = "perl $ENV{'RQG_BASE'}/gentest.pl " .
-      "--dsn=dbi:mysql:host=:port=:user=root:database=$ENV{'TESTDB'}" .
-      ":mysql_socket=$ENV{'MYSQL_SOCKET'} " .
+      "--dsn=DBI:MariaDB:host=:port=:user=root:database=$ENV{'TESTDB'}" .
+      ":mariadb_socket=$ENV{'MYSQL_SOCKET'} " .
       "--gendata=$ENV{'RQG_BASE'}/conf/$ENV{'TESTDIR'}/$ENV{'DATA_FILE'} " .
       "--grammar=$ENV{'RQG_BASE'}/conf/$ENV{'TESTDIR'}/$grammar_file " .
       "--threads=5 --queries=10000 --duration=60 --sqltrace 2>&1 >> " .
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/xa.test b/storage/rocksdb/mysql-test/rocksdb/t/xa.test
index f8f381f0580..0c23e71df8c 100644
--- a/storage/rocksdb/mysql-test/rocksdb/t/xa.test
+++ b/storage/rocksdb/mysql-test/rocksdb/t/xa.test
@@ -1,6 +1,7 @@
---echo # 
---echo # MDEV-13155: XA recovery not supported for RocksDB (Just a testcase)
 --echo #
+--echo # MDEV-742 fixes
+--echo #   MDEV-13155: XA recovery not supported for RocksDB
+--echo # as well.
  
 call mtr.add_suppression("Found .* prepared XA transactions");
 
@@ -22,17 +23,51 @@ INSERT INTO t1 (a) VALUES (3);
 INSERT INTO t1 (a) VALUES (4);
 XA END 'xa2';
 XA PREPARE 'xa2';
- 
+
+--connect (con3,localhost,root,,test)
+XA START 'xa3';
+INSERT INTO t1 (a) VALUES (5);
+INSERT INTO t1 (a) VALUES (6);
+XA END 'xa3';
+XA PREPARE 'xa3';
+--disconnect con3
+
 --connection default
 SELECT * FROM t1;
 
+--echo Must be all three XA:s in
+XA RECOVER;
+
 --let $shutdown_timeout= 0
 --source include/restart_mysqld.inc
  
 --connect (con3,localhost,root,,test)
 --disable_abort_on_error
-XA RECOVER;
+XA RECOVER; # like above
 XA ROLLBACK 'xa1';
 XA COMMIT 'xa2';
+XA ROLLBACK 'xa3';
 SELECT a FROM t1;
+
+--connect (con4,localhost,root,,test)
+XA START 'xa4';
+INSERT INTO t1 (a) VALUES (7);
+INSERT INTO t1 (a) VALUES (8);
+XA END 'xa4';
+XA PREPARE 'xa4';
+
+--connection default
+--echo # Now restart through graceful shutdown
+--source include/restart_mysqld.inc
+
+
+--connect (con5,localhost,root,,test)
+--disable_abort_on_error
+
+--echo Must have 'xa4'
+XA RECOVER;
+XA COMMIT 'xa4';
+
+SELECT a FROM t1;
+
 DROP TABLE t1;
diff --git a/storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_xa.result b/storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_xa.result
new file mode 100644
index 00000000000..86f73f2fc9d
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_xa.result
@@ -0,0 +1,61 @@
+include/master-slave.inc
+[connection master]
+connection master;
+create table ti (a int, b int) engine=innodb;
+create table t1 (a int, b int) engine=rocksdb;
+insert into ti values(0, 0);
+insert into t1 values(0, 0);
+xa start 't';
+insert into ti values(1, 2);
+insert into t1 values(1, 2);
+xa end 't';
+xa prepare 't';
+xa commit 't';
+connection slave;
+include/diff_tables.inc [master:t1, slave:t1]
+connection master;
+xa start 't';
+insert into ti values(3, 4);
+insert into t1 values(3, 4);
+xa end 't';
+xa prepare 't';
+xa rollback 't';
+connection slave;
+include/diff_tables.inc [master:t1, slave:t1]
+connection master;
+SET pseudo_slave_mode=1;
+create table t2 (a int) engine=rocksdb;
+xa start 't';
+insert into ti values (5, 6);
+insert into t1 values (5, 6);
+xa end 't';
+xa prepare 't';
+xa start 's';
+insert into ti values (7, 8);
+insert into t2 values (0);
+xa end 's';
+xa prepare 's';
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+xa recover;
+formatID	gtrid_length	bqual_length	data
+1	1	0	t
+1	1	0	s
+connection master;
+xa commit 't';
+xa commit 's';
+SET pseudo_slave_mode=0;
+Warnings:
+Warning	1231	Slave applier execution mode not active, statement ineffective.
+xa start 'r';
+insert into t1 values(7, 8);
+xa end 'r';
+xa prepare 'r';
+xa commit 'r';
+connection slave;
+include/diff_tables.inc [master:t1, slave:t1]
+include/diff_tables.inc [master:t2, slave:t2]
+connection master;
+drop table ti, t1, t2;
+include/rpl_end.inc
diff --git a/storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_xa.inc b/storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_xa.inc
new file mode 100644
index 00000000000..253d9f16316
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_xa.inc
@@ -0,0 +1,84 @@
+#
+# This "body" file checks general properties of XA transaction replication
+# as of MDEV-7974, including XA of mixed engine branches.
+# Parameters:
+# --let rpl_xa_check= SELECT ...
+#
+connection master;
+create table ti (a int, b int) engine=innodb;
+create table t1 (a int, b int) engine=rocksdb;
+insert into ti values(0, 0);
+insert into t1 values(0, 0);
+xa start 't';
+insert into ti values(1, 2);
+insert into t1 values(1, 2);
+xa end 't';
+xa prepare 't';
+xa commit 't';
+
+sync_slave_with_master;
+let $diff_tables= master:t1, slave:t1;
+source include/diff_tables.inc;
+
+connection master;
+
+xa start 't';
+insert into ti values(3, 4);
+insert into t1 values(3, 4);
+xa end 't';
+xa prepare 't';
+xa rollback 't';
+
+sync_slave_with_master;
+let $diff_tables= master:t1, slave:t1;
+source include/diff_tables.inc;
+
+connection master;
+SET pseudo_slave_mode=1;
+create table t2 (a int) engine=rocksdb;
+xa start 't';
+insert into ti values (5, 6);
+insert into t1 values (5, 6);
+xa end 't';
+xa prepare 't';
+xa start 's';
+insert into ti values (7, 8);
+insert into t2 values (0);
+xa end 's';
+xa prepare 's';
+--source include/save_master_gtid.inc
+
+connection slave;
+source include/sync_with_master_gtid.inc;
+if ($rpl_xa_check)
+{
+  --eval $rpl_xa_check
+  if ($rpl_xa_verbose)
+  {
+    --eval SELECT $rpl_xa_check_lhs
+    --eval SELECT $rpl_xa_check_rhs
+  }
+}
+xa recover;
+
+connection master;
+xa commit 't';
+xa commit 's';
+SET pseudo_slave_mode=0;
+
+# pure rocksdb xa
+xa start 'r';
+insert into t1 values(7, 8);
+xa end 'r';
+xa prepare 'r';
+xa commit 'r';
+
+
+sync_slave_with_master;
+let $diff_tables= master:t1, slave:t1;
+source include/diff_tables.inc;
+let $diff_tables= master:t2, slave:t2;
+source include/diff_tables.inc;
+
+connection master;
+drop table ti, t1, t2;
diff --git a/storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_xa.test b/storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_xa.test
new file mode 100644
index 00000000000..34384c74ca9
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_xa.test
@@ -0,0 +1,7 @@
+source include/have_rocksdb.inc;
+source include/have_innodb.inc;
+source include/master-slave.inc;
+source include/have_binlog_format_row.inc;
+
+source rpl_xa.inc;
+source include/rpl_end.inc;
diff --git a/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/all_vars.test b/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/all_vars.test
index fc700357155..7943d3e4c0b 100644
--- a/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/all_vars.test
+++ b/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/all_vars.test
@@ -1,6 +1,5 @@
 --source include/have_rocksdb.inc
 --source include/not_embedded.inc
---source include/not_threadpool.inc
 
 # This test verifies that *all* MyRocks system variables are tested by the
 # rocksdb_sys_vars suite. For every MyRocks system variable there must be a
diff --git a/storage/rocksdb/rdb_converter.cc b/storage/rocksdb/rdb_converter.cc
index 677ff106753..65f0b81cc7f 100644
--- a/storage/rocksdb/rdb_converter.cc
+++ b/storage/rocksdb/rdb_converter.cc
@@ -436,7 +436,7 @@ void Rdb_converter::setup_field_encoders() {
   uchar cur_null_mask = 0x1;
 
   m_encoder_arr = static_cast<Rdb_field_encoder *>(
-      my_malloc(m_table->s->fields * sizeof(Rdb_field_encoder), MYF(0)));
+      my_malloc(PSI_INSTRUMENT_ME, m_table->s->fields * sizeof(Rdb_field_encoder), MYF(0)));
   if (m_encoder_arr == nullptr) {
     return;
   }
@@ -646,9 +646,9 @@ int Rdb_converter::verify_row_debug_checksum(
         rdb_netbuf_to_uint32((const uchar *)reader->read(RDB_CHECKSUM_SIZE));
 
     const uint32_t computed_key_chksum =
-        my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
+        my_core::my_checksum(0, rdb_slice_to_uchar_ptr(key), key->size());
     const uint32_t computed_val_chksum =
-        my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
+        my_core::my_checksum(0, rdb_slice_to_uchar_ptr(value),
                        value->size() - RDB_CHECKSUM_CHUNK_SIZE);
 
     DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1", stored_key_chksum++;);
@@ -816,10 +816,10 @@ int Rdb_converter::encode_value_slice(
   }
 
   if (store_row_debug_checksums) {
-    const uint32_t key_crc32 = my_core::crc32(
+    const uint32_t key_crc32 = my_core::my_checksum(
         0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size());
     const uint32_t val_crc32 =
-        my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
+        my_core::my_checksum(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
                        m_storage_record.length());
     uchar key_crc_buf[RDB_CHECKSUM_SIZE];
     uchar val_crc_buf[RDB_CHECKSUM_SIZE];
diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc
index cad6833e466..2094923edf4 100644
--- a/storage/rocksdb/rdb_datadic.cc
+++ b/storage/rocksdb/rdb_datadic.cc
@@ -1,5 +1,6 @@
 /*
    Copyright (c) 2012,2013 Monty Program Ab
+   Copyright (c) 2020, MariaDB Corporation.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -367,14 +368,14 @@ Rdb_key_def::Rdb_key_def(const Rdb_key_def &k)
                   m_total_index_flags_length == 0);
   if (k.m_pack_info) {
     const size_t size = sizeof(Rdb_field_packing) * k.m_key_parts;
-    void *pack_info= my_malloc(size, MYF(0));
+    void *pack_info= my_malloc(PSI_INSTRUMENT_ME, size, MYF(0));
     memcpy(pack_info, k.m_pack_info, size);
     m_pack_info = reinterpret_cast<Rdb_field_packing *>(pack_info);
   }
 
   if (k.m_pk_part_no) {
     const size_t size = sizeof(uint) * m_key_parts;
-    m_pk_part_no = reinterpret_cast<uint *>(my_malloc(size, MYF(0)));
+    m_pk_part_no = reinterpret_cast<uint *>(my_malloc(PSI_INSTRUMENT_ME, size, MYF(0)));
     memcpy(m_pk_part_no, k.m_pk_part_no, size);
   }
 }
@@ -448,14 +449,14 @@ void Rdb_key_def::setup(const TABLE *const tbl,
 
     if (secondary_key) {
       m_pk_part_no = reinterpret_cast<uint *>(
-          my_malloc(sizeof(uint) * m_key_parts, MYF(0)));
+          my_malloc(PSI_INSTRUMENT_ME, sizeof(uint) * m_key_parts, MYF(0)));
     } else {
       m_pk_part_no = nullptr;
     }
 
     const size_t size = sizeof(Rdb_field_packing) * m_key_parts;
     m_pack_info =
-        reinterpret_cast<Rdb_field_packing *>(my_malloc(size, MYF(0)));
+        reinterpret_cast<Rdb_field_packing *>(my_malloc(PSI_INSTRUMENT_ME, size, MYF(0)));
 
     /*
       Guaranteed not to error here as checks have been made already during
@@ -986,7 +987,7 @@ uint Rdb_key_def::get_memcmp_sk_parts(const TABLE *table,
   Convert index tuple into storage (i.e. mem-comparable) format
 
   @detail
-    Currently this is done by unpacking into table->record[0] and then
+    Currently this is done by unpacking into record_buffer and then
     packing index columns into storage format.
 
   @param pack_buffer Temporary area for packing varchar columns. Its
@@ -995,6 +996,7 @@ uint Rdb_key_def::get_memcmp_sk_parts(const TABLE *table,
 
 uint Rdb_key_def::pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
                                    uchar *const packed_tuple,
+                                   uchar *const record_buffer,
                                    const uchar *const key_tuple,
                                    const key_part_map &keypart_map) const {
   DBUG_ASSERT(tbl != nullptr);
@@ -1004,13 +1006,13 @@ uint Rdb_key_def::pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
 
   /* We were given a record in KeyTupleFormat. First, save it to record */
   const uint key_len = calculate_key_len(tbl, m_keyno, key_tuple, keypart_map);
-  key_restore(tbl->record[0], key_tuple, &tbl->key_info[m_keyno], key_len);
+  key_restore(record_buffer, key_tuple, &tbl->key_info[m_keyno], key_len);
 
   uint n_used_parts = my_count_bits(keypart_map);
   if (keypart_map == HA_WHOLE_KEY) n_used_parts = 0;  // Full key is used
 
   /* Then, convert the record into a mem-comparable form */
-  return pack_record(tbl, pack_buffer, tbl->record[0], packed_tuple, nullptr,
+  return pack_record(tbl, pack_buffer, record_buffer, packed_tuple, nullptr,
                      false, 0, n_used_parts);
 }
 
@@ -1431,9 +1433,10 @@ uint Rdb_key_def::pack_record(const TABLE *const tbl, uchar *const pack_buffer,
     // ha_rocksdb::convert_record_to_storage_format
     //
     if (should_store_row_debug_checksums) {
-      const uint32_t key_crc32 = crc32(0, packed_tuple, tuple - packed_tuple);
+      const uint32_t key_crc32 =
+          my_checksum(0, packed_tuple, tuple - packed_tuple);
       const uint32_t val_crc32 =
-          crc32(0, unpack_info->ptr(), unpack_info->get_current_pos());
+          my_checksum(0, unpack_info->ptr(), unpack_info->get_current_pos());
 
       unpack_info->write_uint8(RDB_CHECKSUM_DATA_TAG);
       unpack_info->write_uint32(key_crc32);
@@ -1689,9 +1692,9 @@ int Rdb_key_def::unpack_record(TABLE *const table, uchar *const buf,
           (const uchar *)unp_reader.read(RDB_CHECKSUM_SIZE));
 
       const uint32_t computed_key_chksum =
-          crc32(0, (const uchar *)packed_key->data(), packed_key->size());
+          my_checksum(0, packed_key->data(), packed_key->size());
       const uint32_t computed_val_chksum =
-          crc32(0, (const uchar *)unpack_info->data(),
+          my_checksum(0, unpack_info->data(),
                 unpack_info->size() - RDB_CHECKSUM_CHUNK_SIZE);
 
       DBUG_EXECUTE_IF("myrocks_simulate_bad_key_checksum1",
@@ -2123,14 +2126,14 @@ int Rdb_key_def::unpack_utf8_str(
   while (src < src_end) {
     my_wc_t wc = (src[0] << 8) | src[1];
     src += 2;
-    int res = cset->cset->wc_mb(cset, wc, dst, dst_end);
+    int res = cset->wc_mb(wc, dst, dst_end);
     DBUG_ASSERT(res > 0 && res <= 3);
     if (res < 0) return UNPACK_FAILURE;
     dst += res;
   }
 
-  cset->cset->fill(cset, reinterpret_cast<char *>(dst), dst_end - dst,
-                   cset->pad_char);
+  cset->fill(reinterpret_cast<char *>(dst), dst_end - dst,
+             cset->pad_char);
   return UNPACK_SUCCESS;
 }
 
@@ -2253,8 +2256,8 @@ void Rdb_key_def::pack_with_varchar_encoding(
   const size_t value_length = (field_var->length_bytes == 1)
                                   ? (uint)*field->ptr
                                   : uint2korr(field->ptr);
-  size_t xfrm_len = charset->coll->strnxfrm(
-      charset, buf, fpi->m_max_image_len, field_var->char_length(),
+  size_t xfrm_len = charset->strnxfrm(
+      buf, fpi->m_max_image_len, field_var->char_length(),
       field_var->ptr + field_var->length_bytes, value_length, 0);
 
   /* Got a mem-comparable image in 'buf'. Now, produce varlength encoding */
@@ -2365,11 +2368,11 @@ void Rdb_key_def::pack_with_varchar_space_pad(
                                   ? (uint)*field->ptr
                                   : uint2korr(field->ptr);
 
-  const size_t trimmed_len = charset->cset->lengthsp(
-      charset, (const char *)field_var->ptr + field_var->length_bytes,
+  const size_t trimmed_len = charset->lengthsp(
+      (const char *)field_var->ptr + field_var->length_bytes,
       value_length);
-  const size_t xfrm_len = charset->coll->strnxfrm(
-      charset, buf, fpi->m_max_image_len, field_var->char_length(),
+  const size_t xfrm_len = charset->strnxfrm(
+      buf, fpi->m_max_image_len, field_var->char_length(),
       field_var->ptr + field_var->length_bytes, trimmed_len, 0);
 
   /* Got a mem-comparable image in 'buf'. Now, produce varlength encoding */
@@ -2501,7 +2504,7 @@ static int unpack_charset(
 
   for (uint ii = 0; ii < src_len; ii += 2) {
     my_wc_t wc = (src[ii] << 8) | src[ii + 1];
-    int res = cset->cset->wc_mb(cset, wc, dst + used, dst_end);
+    int res = cset->wc_mb(wc, dst + used, dst_end);
     DBUG_ASSERT(res > 0 && res <= 3);
     if (res < 0) {
       return UNPACK_FAILURE;
@@ -2657,7 +2660,7 @@ int Rdb_key_def::unpack_binary_or_utf8_varchar_space_pad(
         my_wc_t wc = (src[0] << 8) | src[1];
         src += 2;
         const CHARSET_INFO *cset = fpi->m_varchar_charset;
-        int res = cset->cset->wc_mb(cset, wc, dst, dst_end);
+        int res = cset->wc_mb(wc, dst, dst_end);
         DBUG_ASSERT(res <= 3);
         if (res <= 0) return UNPACK_FAILURE;
         dst += res;
@@ -3055,14 +3058,14 @@ static void rdb_get_mem_comparable_space(const CHARSET_INFO *const cs,
       // multi-byte form of the ' ' (space) character
       uchar space_mb[MAX_MULTI_BYTE_CHAR_SIZE];
 
-      const size_t space_mb_len = cs->cset->wc_mb(
-          cs, (my_wc_t)cs->pad_char, space_mb, space_mb + sizeof(space_mb));
+      const size_t space_mb_len = cs->wc_mb(
+          (my_wc_t)cs->pad_char, space_mb, space_mb + sizeof(space_mb));
 
       // mem-comparable image of the space character
       std::array<uchar, 20> space;
 
-      const size_t space_len = cs->coll->strnxfrm(
-          cs, space.data(), sizeof(space), 1, space_mb, space_mb_len, 0);
+      const size_t space_len = cs->strnxfrm(
+          space.data(), sizeof(space), 1, space_mb, space_mb_len, 0);
       Rdb_charset_space_info *const info = new Rdb_charset_space_info;
       info->space_xfrm_len = space_len;
       info->space_mb_len = space_mb_len;
@@ -3120,7 +3123,7 @@ static const Rdb_collation_codec *rdb_init_collation_mapping(
           for (uint idx = 0; idx < p.second.size(); idx++) {
             uchar src = p.second[idx];
             uchar bits =
-                my_bit_log2(my_round_up_to_next_power(p.second.size()));
+                my_bit_log2_uint32(my_round_up_to_next_power(p.second.size()));
             cur->m_enc_idx[src] = idx;
             cur->m_enc_size[src] = bits;
             cur->m_dec_size[dst] = bits;
@@ -3314,7 +3317,9 @@ bool Rdb_field_packing::setup(const Rdb_key_def *const key_descr,
       field->field_length = field->char_length() * cs->mbmaxlen.
     */
     const CHARSET_INFO *cs = field->charset();
-    m_max_image_len = cs->coll->strnxfrmlen(cs, field->field_length);
+    m_max_image_len = cs->strnxfrmlen(type == MYSQL_TYPE_STRING ?
+                                      field->pack_length() :
+                                      field->field_length);
   }
   const bool is_varchar = (type == MYSQL_TYPE_VARCHAR);
   const CHARSET_INFO *cs = field->charset();
@@ -3788,8 +3793,7 @@ bool Rdb_validate_tbls::check_frm_file(const std::string &fullpath,
   */
   char eng_type_buf[NAME_CHAR_LEN+1];
   LEX_CSTRING eng_type_str = {eng_type_buf, 0}; 
-  bool is_sequence;
-  enum Table_type type = dd_frm_type(nullptr, fullfilename.c_ptr(), &eng_type_str, &is_sequence);
+  enum Table_type type = dd_frm_type(nullptr, fullfilename.c_ptr(), &eng_type_str);
   if (type == TABLE_TYPE_UNKNOWN) {
     // NO_LINT_DEBUG
     sql_print_warning("RocksDB: Failed to open/read .from file: %s",
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 7bcc45d3f62..903cecdc379 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -252,7 +252,8 @@ class Rdb_key_def {
  public:
   /* Convert a key from KeyTupleFormat to mem-comparable form */
   uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
-                        uchar *const packed_tuple, const uchar *const key_tuple,
+                        uchar *const packed_tuple, uchar *const record_buffer,
+                        const uchar *const key_tuple,
                         const key_part_map &keypart_map) const;
 
   uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc
index 01a2066ae26..5350ec3bce9 100644
--- a/storage/rocksdb/rdb_i_s.cc
+++ b/storage/rocksdb/rdb_i_s.cc
@@ -65,11 +65,14 @@ namespace RDB_CFSTATS_FIELD {
 enum { CF_NAME = 0, STAT_TYPE, VALUE };
 }  // namespace RDB_CFSTATS_FIELD
 
+using namespace Show;
+
 static ST_FIELD_INFO rdb_i_s_cfstats_fields_info[] = {
-    ROCKSDB_FIELD_INFO("CF_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("STAT_TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", sizeof(uint64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("CF_NAME",   Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("STAT_TYPE", Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("VALUE",     SLonglong(),           NOT_NULL),
+    CEnd()
+};
 
 static int rdb_i_s_cfstats_fill_table(
     my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -165,9 +168,9 @@ enum { STAT_TYPE = 0, VALUE };
 }  // namespace RDB_DBSTATS_FIELD
 
 static ST_FIELD_INFO rdb_i_s_dbstats_fields_info[] = {
-    ROCKSDB_FIELD_INFO("STAT_TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", sizeof(uint64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("STAT_TYPE", Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("VALUE",     SLonglong(),           NOT_NULL),
+    CEnd()};
 
 static int rdb_i_s_dbstats_fill_table(
     my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -261,13 +264,12 @@ enum { TABLE_SCHEMA = 0, TABLE_NAME, PARTITION_NAME, STAT_TYPE, VALUE };
 }  // namespace RDB_PERF_CONTEXT_FIELD
 
 static ST_FIELD_INFO rdb_i_s_perf_context_fields_info[] = {
-    ROCKSDB_FIELD_INFO("TABLE_SCHEMA", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("TABLE_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("PARTITION_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING,
-                       MY_I_S_MAYBE_NULL),
-    ROCKSDB_FIELD_INFO("STAT_TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", sizeof(uint64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("TABLE_SCHEMA",   Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("TABLE_NAME",     Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("PARTITION_NAME", Varchar(NAME_LEN + 1), NULLABLE),
+    Column("STAT_TYPE",      Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("VALUE",          SLonglong(),           NOT_NULL),
+    CEnd()};
 
 static int rdb_i_s_perf_context_fill_table(
     my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -364,9 +366,9 @@ enum { STAT_TYPE = 0, VALUE };
 }  // namespace RDB_PERF_CONTEXT_GLOBAL_FIELD
 
 static ST_FIELD_INFO rdb_i_s_perf_context_global_fields_info[] = {
-    ROCKSDB_FIELD_INFO("STAT_TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", sizeof(uint64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("STAT_TYPE", Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("VALUE",     SLonglong(),           NOT_NULL),
+    CEnd()};
 
 static int rdb_i_s_perf_context_global_fill_table(
     my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -434,10 +436,10 @@ enum { CF_NAME = 0, OPTION_TYPE, VALUE };
 }  // namespace RDB_CFOPTIONS_FIELD
 
 static ST_FIELD_INFO rdb_i_s_cfoptions_fields_info[] = {
-    ROCKSDB_FIELD_INFO("CF_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("OPTION_TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("CF_NAME",     Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("OPTION_TYPE", Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("VALUE",       Varchar(NAME_LEN + 1), NOT_NULL),
+    CEnd()};
 
 static int rdb_i_s_cfoptions_fill_table(
     my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -696,10 +698,10 @@ enum { TYPE = 0, NAME, VALUE };
 }
 
 static ST_FIELD_INFO rdb_i_s_global_info_fields_info[] = {
-    ROCKSDB_FIELD_INFO("TYPE", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("NAME", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("TYPE",  Varchar(FN_REFLEN + 1), NOT_NULL),
+    Column("NAME",  Varchar(FN_REFLEN + 1), NOT_NULL),
+    Column("VALUE", Varchar(FN_REFLEN + 1), NOT_NULL),
+    CEnd()};
 
 /*
  * helper function for rdb_i_s_global_info_fill_table
@@ -900,11 +902,11 @@ static int rdb_i_s_compact_stats_fill_table(
 }
 
 static ST_FIELD_INFO rdb_i_s_compact_stats_fields_info[] = {
-    ROCKSDB_FIELD_INFO("CF_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("LEVEL", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("TYPE", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("VALUE", sizeof(double), MYSQL_TYPE_DOUBLE, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("CF_NAME", Varchar(NAME_LEN + 1),               NOT_NULL),
+    Column("LEVEL",   Varchar(FN_REFLEN + 1),              NOT_NULL),
+    Column("TYPE",    Varchar(FN_REFLEN + 1),              NOT_NULL),
+    Column("VALUE",   Double(MY_INT64_NUM_DECIMAL_DIGITS), NOT_NULL),
+    CEnd()};
 
 namespace  // anonymous namespace = not visible outside this source file
 {
@@ -937,22 +939,19 @@ enum {
 }  // namespace RDB_DDL_FIELD
 
 static ST_FIELD_INFO rdb_i_s_ddl_fields_info[] = {
-    ROCKSDB_FIELD_INFO("TABLE_SCHEMA", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("TABLE_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("PARTITION_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING,
-                       MY_I_S_MAYBE_NULL),
-    ROCKSDB_FIELD_INFO("INDEX_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("COLUMN_FAMILY", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("INDEX_NUMBER", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("INDEX_TYPE", sizeof(uint16_t), MYSQL_TYPE_SHORT, 0),
-    ROCKSDB_FIELD_INFO("KV_FORMAT_VERSION", sizeof(uint16_t), MYSQL_TYPE_SHORT,
-                       0),
-    ROCKSDB_FIELD_INFO("TTL_DURATION", sizeof(uint64), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("INDEX_FLAGS", sizeof(uint64), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("CF", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("AUTO_INCREMENT", sizeof(uint64_t), MYSQL_TYPE_LONGLONG,
-                       MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED),
-    ROCKSDB_FIELD_INFO_END};
+    Column("TABLE_SCHEMA",      Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("TABLE_NAME",        Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("PARTITION_NAME",    Varchar(NAME_LEN + 1),  NULLABLE),
+    Column("INDEX_NAME",        Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("COLUMN_FAMILY",     SLong(),                NOT_NULL),
+    Column("INDEX_NUMBER",      SLong(),                NOT_NULL),
+    Column("INDEX_TYPE",        SShort(6),              NOT_NULL),
+    Column("KV_FORMAT_VERSION", SShort(6),              NOT_NULL),
+    Column("TTL_DURATION",      SLonglong(),            NOT_NULL),
+    Column("INDEX_FLAGS",       SLonglong(),            NOT_NULL),
+    Column("CF",                Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("AUTO_INCREMENT",    ULonglong(),            NULLABLE),
+    CEnd()};
 
 int Rdb_ddl_scanner::add_table(Rdb_tbl_def *tdef) {
   DBUG_ASSERT(tdef != nullptr);
@@ -1155,34 +1154,24 @@ enum {
 }  // namespace RDB_SST_PROPS_FIELD
 
 static ST_FIELD_INFO rdb_i_s_sst_props_fields_info[] = {
-    ROCKSDB_FIELD_INFO("SST_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("COLUMN_FAMILY", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("DATA_BLOCKS", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("ENTRIES", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("RAW_KEY_SIZE", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("RAW_VALUE_SIZE", sizeof(int64_t), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("DATA_BLOCK_SIZE", sizeof(int64_t), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("INDEX_BLOCK_SIZE", sizeof(int64_t), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("INDEX_PARTITIONS", sizeof(uint32_t), MYSQL_TYPE_LONG,
-                       0),
-    ROCKSDB_FIELD_INFO("TOP_LEVEL_INDEX_SIZE", sizeof(int64_t),
-                       MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("FILTER_BLOCK_SIZE", sizeof(int64_t),
-                       MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("COMPRESSION_ALGO", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("CREATION_TIME", sizeof(int64_t), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("FILE_CREATION_TIME", sizeof(int64_t),
-                       MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("OLDEST_KEY_TIME", sizeof(int64_t), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("FILTER_POLICY", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("COMPRESSION_OPTIONS", NAME_LEN + 1, MYSQL_TYPE_STRING,
-                       0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("SST_NAME",             Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("COLUMN_FAMILY",        SLong(),               NOT_NULL),
+    Column("DATA_BLOCKS",          SLonglong(),           NOT_NULL),
+    Column("ENTRIES",              SLonglong(),           NOT_NULL),
+    Column("RAW_KEY_SIZE",         SLonglong(),           NOT_NULL),
+    Column("RAW_VALUE_SIZE",       SLonglong(),           NOT_NULL),
+    Column("DATA_BLOCK_SIZE",      SLonglong(),           NOT_NULL),
+    Column("INDEX_BLOCK_SIZE",     SLonglong(),           NOT_NULL),
+    Column("INDEX_PARTITIONS",     SLong(),               NOT_NULL),
+    Column("TOP_LEVEL_INDEX_SIZE", SLonglong(),           NOT_NULL),
+    Column("FILTER_BLOCK_SIZE",    SLonglong(),           NOT_NULL),
+    Column("COMPRESSION_ALGO",     Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("CREATION_TIME",        SLonglong(),           NOT_NULL),
+    Column("FILE_CREATION_TIME",   SLonglong(),           NOT_NULL),
+    Column("OLDEST_KEY_TIME",      SLonglong(),           NOT_NULL),
+    Column("FILTER_POLICY",        Varchar(NAME_LEN + 1), NOT_NULL),
+    Column("COMPRESSION_OPTIONS",  Varchar(NAME_LEN + 1), NOT_NULL),
+    CEnd()};
 
 static int rdb_i_s_sst_props_fill_table(
     my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -1328,20 +1317,17 @@ static ST_FIELD_INFO rdb_i_s_index_file_map_fields_info[] = {
      *   SST_NAME => the name of the SST file containing some indexes
      *   NUM_ROWS => the number of entries of this index id in this SST file
      *   DATA_SIZE => the data size stored in this SST file for this index id */
-    ROCKSDB_FIELD_INFO("COLUMN_FAMILY", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("INDEX_NUMBER", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("SST_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("NUM_ROWS", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("DATA_SIZE", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("ENTRY_DELETES", sizeof(int64_t), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("ENTRY_SINGLEDELETES", sizeof(int64_t),
-                       MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("ENTRY_MERGES", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("ENTRY_OTHERS", sizeof(int64_t), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("DISTINCT_KEYS_PREFIX", MAX_REF_PARTS * 25,
-                       MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("COLUMN_FAMILY",       SLong(),                     NOT_NULL),
+    Column("INDEX_NUMBER",        SLong(),                     NOT_NULL),
+    Column("SST_NAME",            Varchar(NAME_LEN + 1),       NOT_NULL),
+    Column("NUM_ROWS",            SLonglong(),                 NOT_NULL),
+    Column("DATA_SIZE",           SLonglong(),                 NOT_NULL),
+    Column("ENTRY_DELETES",       SLonglong(),                 NOT_NULL),
+    Column("ENTRY_SINGLEDELETES", SLonglong(),                 NOT_NULL),
+    Column("ENTRY_MERGES",        SLonglong(),                 NOT_NULL),
+    Column("ENTRY_OTHERS",        SLonglong(),                 NOT_NULL),
+    Column("DISTINCT_KEYS_PREFIX",Varchar(MAX_REF_PARTS * 25), NOT_NULL),
+    CEnd()};
 
 /* Fill the information_schema.rocksdb_index_file_map virtual table */
 static int rdb_i_s_index_file_map_fill_table(
@@ -1475,12 +1461,11 @@ enum { COLUMN_FAMILY_ID = 0, TRANSACTION_ID, KEY, MODE };
 }  // namespace RDB_LOCKS_FIELD
 
 static ST_FIELD_INFO rdb_i_s_lock_info_fields_info[] = {
-    ROCKSDB_FIELD_INFO("COLUMN_FAMILY_ID", sizeof(uint32_t), MYSQL_TYPE_LONG,
-                       0),
-    ROCKSDB_FIELD_INFO("TRANSACTION_ID", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("KEY", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("MODE", 32, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("COLUMN_FAMILY_ID", SLong(),                NOT_NULL),
+    Column("TRANSACTION_ID",   SLong(),                NOT_NULL),
+    Column("KEY",              Varchar(FN_REFLEN + 1), NOT_NULL),
+    Column("MODE",             Varchar(32),            NOT_NULL),
+    CEnd()};
 
 /* Fill the information_schema.rocksdb_locks virtual table */
 static int rdb_i_s_lock_info_fill_table(
@@ -1577,27 +1562,22 @@ enum {
 }  // namespace RDB_TRX_FIELD
 
 static ST_FIELD_INFO rdb_i_s_trx_info_fields_info[] = {
-    ROCKSDB_FIELD_INFO("TRANSACTION_ID", sizeof(ulonglong), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("STATE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("WRITE_COUNT", sizeof(ulonglong), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("LOCK_COUNT", sizeof(ulonglong), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("TIMEOUT_SEC", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("WAITING_KEY", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("WAITING_COLUMN_FAMILY_ID", sizeof(uint32_t),
-                       MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("IS_REPLICATION", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("SKIP_TRX_API", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("READ_ONLY", sizeof(uint32_t), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("HAS_DEADLOCK_DETECTION", sizeof(uint32_t),
-                       MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("NUM_ONGOING_BULKLOAD", sizeof(uint32_t),
-                       MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("THREAD_ID", sizeof(ulong), MYSQL_TYPE_LONG, 0),
-    ROCKSDB_FIELD_INFO("QUERY", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO_END};
+  Column("TRANSACTION_ID",         SLonglong(),            NOT_NULL),
+  Column("STATE",                  Varchar(NAME_LEN + 1),  NOT_NULL),
+  Column("NAME",                   Varchar(NAME_LEN + 1),  NOT_NULL),
+  Column("WRITE_COUNT",            SLonglong(),            NOT_NULL),
+  Column("LOCK_COUNT",             SLonglong(),            NOT_NULL),
+  Column("TIMEOUT_SEC",            SLong(),                NOT_NULL),
+  Column("WAITING_KEY",            Varchar(FN_REFLEN + 1), NOT_NULL),
+  Column("WAITING_COLUMN_FAMILY_ID",SLong(),               NOT_NULL),
+  Column("IS_REPLICATION",         SLong(),                NOT_NULL),
+  Column("SKIP_TRX_API",           SLong(),                NOT_NULL),
+  Column("READ_ONLY",              SLong(),                NOT_NULL),
+  Column("HAS_DEADLOCK_DETECTION", SLong(),                NOT_NULL),
+  Column("NUM_ONGOING_BULKLOAD",   SLong(),                NOT_NULL),
+  Column("THREAD_ID",              SLong(),                NOT_NULL),
+  Column("QUERY",                  Varchar(NAME_LEN + 1),  NOT_NULL),
+  CEnd()};
 
 /* Fill the information_schema.rocksdb_trx virtual table */
 static int rdb_i_s_trx_info_fill_table(
@@ -1704,19 +1684,16 @@ enum {
 }  // namespace RDB_DEADLOCK_FIELD
 
 static ST_FIELD_INFO rdb_i_s_deadlock_info_fields_info[] = {
-    ROCKSDB_FIELD_INFO("DEADLOCK_ID", sizeof(ulonglong), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("TIMESTAMP", sizeof(ulonglong), MYSQL_TYPE_LONGLONG, 0),
-    ROCKSDB_FIELD_INFO("TRANSACTION_ID", sizeof(ulonglong), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO("CF_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("WAITING_KEY", FN_REFLEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("LOCK_TYPE", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("INDEX_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("TABLE_NAME", NAME_LEN + 1, MYSQL_TYPE_STRING, 0),
-    ROCKSDB_FIELD_INFO("ROLLED_BACK", sizeof(ulonglong), MYSQL_TYPE_LONGLONG,
-                       0),
-    ROCKSDB_FIELD_INFO_END};
+    Column("DEADLOCK_ID",    SLonglong(),            NOT_NULL),
+    Column("TIMESTAMP",      SLonglong(),            NOT_NULL),
+    Column("TRANSACTION_ID", SLonglong(),            NOT_NULL),
+    Column("CF_NAME",        Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("WAITING_KEY",    Varchar(FN_REFLEN + 1), NOT_NULL),
+    Column("LOCK_TYPE",      Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("INDEX_NAME",     Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("TABLE_NAME",     Varchar(NAME_LEN + 1),  NOT_NULL),
+    Column("ROLLED_BACK",    SLonglong(),            NOT_NULL),
+    CEnd()};
 
 /* Fill the information_schema.rocksdb_trx virtual table */
 static int rdb_i_s_deadlock_info_fill_table(
diff --git a/storage/sequence/mysql-test/sequence/group_by.result b/storage/sequence/mysql-test/sequence/group_by.result
index 7902fe6a348..795c32c780f 100644
--- a/storage/sequence/mysql-test/sequence/group_by.result
+++ b/storage/sequence/mysql-test/sequence/group_by.result
@@ -103,3 +103,33 @@ count(NULL)
 select count(NULL) from seq_1_to_3 limit 0;
 count(NULL)
 # End of 10.3 tests
+#
+# MDEV-16327: Server doesn't account for engines that supports
+# OFFSET on their own.
+#
+select count(NULL) from seq_1_to_3 limit 1;
+count(NULL)
+0
+explain format=json select count(NULL) from seq_1_to_3 limit 1;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "message": "Storage engine handles GROUP BY"
+    }
+  }
+}
+select count(NULL) from seq_1_to_3 limit 1 offset 1;
+count(NULL)
+explain format=json select count(NULL) from seq_1_to_3 limit 1 offset 1;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "message": "Storage engine handles GROUP BY"
+    }
+  }
+}
+# End of 10.5 tests
diff --git a/storage/sequence/mysql-test/sequence/group_by.test b/storage/sequence/mysql-test/sequence/group_by.test
index 18e44cd2ab1..ca43ba1a5d7 100644
--- a/storage/sequence/mysql-test/sequence/group_by.test
+++ b/storage/sequence/mysql-test/sequence/group_by.test
@@ -56,3 +56,15 @@ select count(NULL) from seq_1_to_3;
 select count(NULL) from seq_1_to_3 limit 0;
 
 --echo # End of 10.3 tests
+
+--echo #
+--echo # MDEV-16327: Server doesn't account for engines that supports
+--echo # OFFSET on their own.
+--echo #
+
+select count(NULL) from seq_1_to_3 limit 1;
+explain format=json select count(NULL) from seq_1_to_3 limit 1;
+select count(NULL) from seq_1_to_3 limit 1 offset 1;
+explain format=json select count(NULL) from seq_1_to_3 limit 1 offset 1;
+
+--echo # End of 10.5 tests
diff --git a/storage/sequence/sequence.cc b/storage/sequence/sequence.cc
index 8eae98955c3..f5a18094521 100644
--- a/storage/sequence/sequence.cc
+++ b/storage/sequence/sequence.cc
@@ -28,6 +28,7 @@
 #include <handler.h>
 #include <table.h>
 #include <field.h>
+#include <sql_limit.h>
 
 static handlerton *sequence_hton;
 
@@ -52,7 +53,7 @@ public:
   }
 };
 
-class ha_seq: public handler
+class ha_seq final : public handler
 {
 private:
   THR_LOCK_DATA lock;
@@ -68,10 +69,15 @@ public:
 
   /* open/close/locking */
   int create(const char *name, TABLE *table_arg,
-             HA_CREATE_INFO *create_info) { return HA_ERR_WRONG_COMMAND; }
+             HA_CREATE_INFO *create_info)
+  { return HA_ERR_WRONG_COMMAND; }
 
   int open(const char *name, int mode, uint test_if_locked);
   int close(void);
+  int delete_table(const char *name)
+  {
+    return 0;
+  }
   THR_LOCK_DATA **store_lock(THD *, THR_LOCK_DATA **, enum thr_lock_type);
 
   /* table scan */
@@ -92,9 +98,8 @@ public:
   int index_prev(uchar *buf);
   int index_first(uchar *buf);
   int index_last(uchar *buf);
-  ha_rows records_in_range(uint inx, key_range *min_key,
-                                   key_range *max_key);
-
+  ha_rows records_in_range(uint inx, const key_range *start_key,
+                           const key_range *end_key, page_range *pages);
   double scan_time() { return (double)nvalues(); }
   double read_time(uint index, uint ranges, ha_rows rows) { return (double)rows; }
   double keyread_time(uint index, uint ranges, ha_rows rows) { return (double)rows; }
@@ -233,8 +238,9 @@ int ha_seq::index_last(uchar *buf)
   return index_prev(buf);
 }
 
-ha_rows ha_seq::records_in_range(uint inx, key_range *min_key,
-                                 key_range *max_key)
+ha_rows ha_seq::records_in_range(uint inx, const key_range *min_key,
+                                 const key_range *max_key,
+                                 page_range *pages)
 {
   ulonglong kmin= min_key ? uint8korr(min_key->key) : seqs->from;
   ulonglong kmax= max_key ? uint8korr(max_key->key) : seqs->to - 1;
@@ -361,15 +367,21 @@ static int dummy_savepoint(handlerton *, THD *, void *) { return 0; }
 
 class ha_seq_group_by_handler: public group_by_handler
 {
+  Select_limit_counters limit;
   List<Item> *fields;
   TABLE_LIST *table_list;
   bool first_row;
 
 public:
   ha_seq_group_by_handler(THD *thd_arg, List<Item> *fields_arg,
-                          TABLE_LIST *table_list_arg)
-    : group_by_handler(thd_arg, sequence_hton), fields(fields_arg),
-      table_list(table_list_arg) {}
+                          TABLE_LIST *table_list_arg,
+                          Select_limit_counters *orig_lim)
+    : group_by_handler(thd_arg, sequence_hton),  limit(orig_lim[0]),
+      fields(fields_arg), table_list(table_list_arg)
+    {
+      // Reset limit because we are handling it now
+      orig_lim->set_unlimited();
+    }
   ~ha_seq_group_by_handler() {}
   int init_scan() { first_row= 1 ; return 0; }
   int next_row();
@@ -425,7 +437,8 @@ create_group_by_handler(THD *thd, Query *query)
   }
 
   /* Create handler and return it */
-  handler= new ha_seq_group_by_handler(thd, query->select, query->from);
+  handler= new ha_seq_group_by_handler(thd, query->select, query->from,
+                                       query->limit);
   return handler;
 }
 
@@ -440,7 +453,9 @@ int ha_seq_group_by_handler::next_row()
     Check if this is the first call to the function. If not, we have already
     returned all data.
   */
-  if (!first_row)
+  if (!first_row ||
+      limit.get_offset_limit() > 0 ||
+      limit.get_select_limit() == 0)
     DBUG_RETURN(HA_ERR_END_OF_FILE);
   first_row= 0;
 
@@ -482,11 +497,21 @@ int ha_seq_group_by_handler::next_row()
   Initialize the interface between the sequence engine and MariaDB
 *****************************************************************************/
 
+/* 0 if parse_table_name() returns 0 for the name part of path, else ENOENT */
+static int drop_table(handlerton *hton, const char *path)
+{
+  const char *sep= strrchr(path, FN_LIBCHAR);
+  const char *name= sep ? sep + 1 : path;  // no separator: path is the name
+  ulonglong from, to, step;
+  return parse_table_name(name, strlen(name), &from, &to, &step) ? ENOENT : 0;
+}
+
 static int init(void *p)
 {
   handlerton *hton= (handlerton *)p;
   sequence_hton= hton;
   hton->create= create_handler;
+  hton->drop_table= drop_table;
   hton->discover_table= discover_table;
   hton->discover_table_existence= discover_table_existence;
   hton->commit= hton->rollback= dummy_commit_rollback;
@@ -516,4 +541,3 @@ maria_declare_plugin(sequence)
   MariaDB_PLUGIN_MATURITY_STABLE
 }
 maria_declare_plugin_end;
-
diff --git a/storage/sphinx/ha_sphinx.cc b/storage/sphinx/ha_sphinx.cc
index f2bc24c47d4..0ca08bc3c47 100644
--- a/storage/sphinx/ha_sphinx.cc
+++ b/storage/sphinx/ha_sphinx.cc
@@ -199,7 +199,7 @@ enum ESphRankMode
 	SPH_RANK_PROXIMITY_BM25		= 0,	///< default mode, phrase proximity major factor and BM25 minor one
 	SPH_RANK_BM25				= 1,	///< statistical mode, BM25 ranking only (faster but worse quality)
 	SPH_RANK_NONE				= 2,	///< no ranking, all matches get a weight of 1
-	SPH_RANK_WORDCOUNT			= 3,	///< simple word-count weighting, rank is a weighted sum of per-field keyword occurence counts
+	SPH_RANK_WORDCOUNT			= 3,	///< simple word-count weighting, rank is a weighted sum of per-field keyword occurrence counts
 	SPH_RANK_PROXIMITY			= 4,	///< phrase proximity
 	SPH_RANK_MATCHANY			= 5,	///< emulate old match-any weighting
 	SPH_RANK_FIELDMASK			= 6,	///< sets bits where there were matches
@@ -596,6 +596,7 @@ private:
 
 	struct Override_t
 	{
+		Override_t() : m_dIds(PSI_INSTRUMENT_MEM), m_dValues(PSI_INSTRUMENT_MEM) {}
 		union Value_t
 		{
 			uint32		m_uValue;
@@ -695,7 +696,7 @@ handlerton sphinx_hton =
 	NULL,	// create_cursor_read_view
 	NULL,	// set_cursor_read_view
 	NULL,	// close_cursor_read_view
-	HTON_CAN_RECREATE
+	HTON_CAN_RECREATE | HTON_AUTOMATIC_DELETE_TABLE
 };
 #else
 static handlerton * sphinx_hton_ptr = NULL;
@@ -737,17 +738,18 @@ static int sphinx_init_func ( void * p )
 	{
 		sphinx_init = 1;
 		void ( pthread_mutex_init ( &sphinx_mutex, MY_MUTEX_INIT_FAST ) );
-		sphinx_hash_init ( &sphinx_open_tables, system_charset_info, 32, 0, 0,
-			sphinx_get_key, 0, 0 );
+                sphinx_hash_init ( PSI_NOT_INSTRUMENTED, &sphinx_open_tables,
+                                   system_charset_info, 32, 0, 0,
+                                   sphinx_get_key, 0, 0 );
 
 		#if MYSQL_VERSION_ID > 50100
 		handlerton * hton = (handlerton*) p;
-		hton->state = SHOW_OPTION_YES;
 		hton->db_type = DB_TYPE_AUTOASSIGN;
 		hton->create = sphinx_create_handler;
 		hton->close_connection = sphinx_close_connection;
 		hton->show_status = sphinx_show_status;
 		hton->panic = sphinx_panic;
+		hton->drop_table= [](handlerton *, const char*) { return -1; };
 		hton->flags = HTON_CAN_RECREATE;
 		#endif
 	}
@@ -769,10 +771,8 @@ static int sphinx_close_connection ( handlerton * hton, THD * thd )
 {
 	// deallocate common handler data
 	SPH_ENTER_FUNC();
-	void ** tmp = thd_ha_data ( thd, hton );
-	CSphTLS * pTls = (CSphTLS *) (*tmp);
+	CSphTLS * pTls = (CSphTLS *) thd_get_ha_data ( thd, hton );
 	SafeDelete ( pTls );
-	*tmp = NULL;
 	SPH_RET(0);
 }
 
@@ -844,7 +844,7 @@ bool sphinx_show_status ( THD * thd )
 
 #if MYSQL_VERSION_ID>50100
 	// 5.1.x style stats
-	CSphTLS * pTls = (CSphTLS*) ( *thd_ha_data ( thd, hton ) );
+	CSphTLS * pTls = (CSphTLS*) ( thd_get_ha_data ( thd, hton ) );
 
 #define LOC_STATS(_key,_keylen,_val,_vallen) \
 	stat_print ( thd, sphinx_hton_name, strlen(sphinx_hton_name), _key, _keylen, _val, _vallen );
@@ -1306,6 +1306,7 @@ CSphSEQuery::CSphSEQuery ( const char * sQuery, int iLength, const char * sIndex
 	, m_fGeoLongitude ( 0.0f )
 	, m_sComment ( (char*) "" )
 	, m_sSelect ( (char*) "*" )
+        , m_dOverrides (PSI_INSTRUMENT_MEM)
 
 	, m_pBuf ( NULL )
 	, m_pCur ( NULL )
@@ -2118,11 +2119,7 @@ int ha_sphinx::open ( const char * name, int, uint )
 
 	thr_lock_data_init ( &m_pShare->m_tLock, &m_tLock, NULL );
 
-	#if MYSQL_VERSION_ID>50100
-	*thd_ha_data ( table->in_use, ht ) = NULL;
-	#else
-	table->in_use->ha_data [ sphinx_hton.slot ] = NULL;
-	#endif
+	thd_set_ha_data ( table->in_use, ht, 0 );
 
 	SPH_RET(0);
 }
@@ -2805,23 +2802,16 @@ CSphSEThreadTable * ha_sphinx::GetTls()
 {
 	SPH_ENTER_METHOD()
 	// where do we store that pointer in today's version?
-	CSphTLS ** ppTls;
-#if MYSQL_VERSION_ID>50100
-	ppTls = (CSphTLS**) thd_ha_data ( table->in_use, ht );
-#else
-	ppTls = (CSphTLS**) &current_thd->ha_data[sphinx_hton.slot];
-#endif // >50100
+	CSphTLS * pTls = (CSphTLS*) thd_get_ha_data ( table->in_use, ht );
 
 	CSphSEThreadTable * pTable = NULL;
 	// allocate if needed
-	if ( !*ppTls )
-	{
-		*ppTls = new CSphTLS ( this );
-		pTable = (*ppTls)->m_pHeadTable;
-	} else
+	if ( !pTls )
 	{
-		pTable = (*ppTls)->m_pHeadTable;
+		pTls = new CSphTLS ( this );
+		thd_set_ha_data(table->in_use, ht, pTls);
 	}
+	pTable = pTls->m_pHeadTable;
 
 	while ( pTable && pTable->m_pHandler!=this )
 		pTable = pTable->m_pTableNext;
@@ -2829,8 +2819,8 @@ CSphSEThreadTable * ha_sphinx::GetTls()
 	if ( !pTable )
 	{
 		pTable = new CSphSEThreadTable ( this );
-		pTable->m_pTableNext = (*ppTls)->m_pHeadTable;
-		(*ppTls)->m_pHeadTable = pTable;
+		pTable->m_pTableNext = pTls->m_pHeadTable;
+		pTls->m_pHeadTable = pTable;
 	}
 
 	// errors will be handled by caller
@@ -3371,7 +3361,7 @@ int ha_sphinx::rename_table ( const char *, const char * )
 // if start_key matches any rows.
 //
 // Called from opt_range.cc by check_quick_keys().
-ha_rows ha_sphinx::records_in_range ( uint, key_range *, key_range * )
+ha_rows ha_sphinx::records_in_range ( uint, const key_range *, const key_range *, page_range *)
 {
 	SPH_ENTER_METHOD();
 	SPH_RET(3); // low number to force index usage
@@ -3532,7 +3522,7 @@ CSphSEStats * sphinx_get_stats ( THD * thd, SHOW_VAR * out )
 #if MYSQL_VERSION_ID>50100
 	if ( sphinx_hton_ptr )
 	{
-		CSphTLS * pTls = (CSphTLS *) *thd_ha_data ( thd, sphinx_hton_ptr );
+		CSphTLS * pTls = (CSphTLS *) thd_get_ha_data ( thd, sphinx_hton_ptr );
 
 		if ( pTls && pTls->m_pHeadTable && pTls->m_pHeadTable->m_bStats )
 			return &pTls->m_pHeadTable->m_tStats;
@@ -3597,7 +3587,7 @@ int sphinx_showfunc_words ( THD * thd, SHOW_VAR * out, char * sBuffer )
 #if MYSQL_VERSION_ID>50100
 	if ( sphinx_hton_ptr )
 	{
-		CSphTLS * pTls = (CSphTLS *) *thd_ha_data ( thd, sphinx_hton_ptr );
+		CSphTLS * pTls = (CSphTLS *) thd_get_ha_data ( thd, sphinx_hton_ptr );
 #else
 	{
 		CSphTLS * pTls = (CSphTLS *) thd->ha_data[sphinx_hton.slot];
diff --git a/storage/sphinx/ha_sphinx.h b/storage/sphinx/ha_sphinx.h
index decd88bad5a..f03e9d8c797 100644
--- a/storage/sphinx/ha_sphinx.h
+++ b/storage/sphinx/ha_sphinx.h
@@ -30,7 +30,7 @@ struct CSphSEStats;
 struct CSphSEThreadTable;
 
 /// Sphinx SE handler class
-class ha_sphinx : public handler
+class ha_sphinx final : public handler
 {
 protected:
 	THR_LOCK_DATA	m_tLock;				///< MySQL lock
@@ -119,7 +119,7 @@ public:
 	int				reset();
 	int				external_lock ( THD * thd, int lock_type );
 	int				delete_all_rows ();
-	ha_rows			records_in_range ( uint inx, key_range * min_key, key_range * max_key );
+	ha_rows			        records_in_range ( uint inx, const key_range * min_key, const key_range * max_key,  page_range *pages);
 
 	int				delete_table ( const char * from );
 	int				rename_table ( const char * from, const char * to );
diff --git a/storage/spider/CMakeLists.txt b/storage/spider/CMakeLists.txt
index 706b11ac141..397478bfc40 100644
--- a/storage/spider/CMakeLists.txt
+++ b/storage/spider/CMakeLists.txt
@@ -50,11 +50,14 @@ ELSEIF(PLUGIN_PARTITION MATCHES "^NO$")
 ELSE()
   INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/spider/hs_client)
 
-  INSTALL(FILES
-    ${CMAKE_SOURCE_DIR}/storage/spider/scripts/install_spider.sql
-    DESTINATION ${INSTALL_MYSQLSHAREDIR} COMPONENT Server
-  )
-  MYSQL_ADD_PLUGIN(spider ${SPIDER_SOURCES} STORAGE_ENGINE MODULE_ONLY MODULE_OUTPUT_NAME "ha_spider")
+  IF(DEB)
+    SET(extra_options COMPONENT spider-engine)
+  ELSE()
+    SET(extra_options CONFIG spider.cnf)
+  ENDIF()
+
+  MYSQL_ADD_PLUGIN(spider ${SPIDER_SOURCES} ${extra_options}
+    STORAGE_ENGINE MODULE_ONLY)
   IF(NOT TARGET spider)
     RETURN()
   ENDIF()
diff --git a/storage/spider/ha_spider.cc b/storage/spider/ha_spider.cc
index eb4052abdf5..48bf8f4c664 100644
--- a/storage/spider/ha_spider.cc
+++ b/storage/spider/ha_spider.cc
@@ -1,5 +1,5 @@
 /* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+   Copyright (C) 2019-2022 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -76,11 +76,8 @@ ha_spider::ha_spider(
   spider_alloc_calc_mem_init(mem_calc, 139);
   spider_alloc_calc_mem(spider_current_trx, mem_calc, sizeof(*this));
   share = NULL;
-  trx = NULL;
   conns = NULL;
   need_mons = NULL;
-  condition = NULL;
-  cond_check = FALSE;
   blob_buff = NULL;
   conn_keys = NULL;
   spider_thread_id = 0;
@@ -90,10 +87,8 @@ ha_spider::ha_spider(
   trx_hs_w_conn_adjustment = 0;
 #endif
   search_link_query_id = 0;
-  searched_bitmap = NULL;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  partition_handler_share = NULL;
-  pt_handler_share_creator = NULL;
+  partition_handler = NULL;
 #endif
 #ifdef HA_MRR_USE_DEFAULT_IMPL
   multi_range_keys = NULL;
@@ -102,7 +97,6 @@ ha_spider::ha_spider(
   append_tblnm_alias = NULL;
   use_index_merge = FALSE;
   is_clone = FALSE;
-  clone_bitmap_init = FALSE;
   pt_clone_source_handler = NULL;
   pt_clone_last_searcher = NULL;
   ft_handler = NULL;
@@ -116,6 +110,7 @@ ha_spider::ha_spider(
 #ifdef SPIDER_HAS_GROUP_BY_HANDLER
   use_fields = FALSE;
 #endif
+  dml_inited = FALSE;
   use_pre_call = FALSE;
   use_pre_action = FALSE;
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
@@ -128,10 +123,6 @@ ha_spider::ha_spider(
   hs_decrement = FALSE;
   hs_pushed_strref_num = 0;
 #endif
-  direct_update_fields = NULL;
-#endif
-#ifdef INFO_KIND_FORCE_LIMIT_BEGIN
-  info_limit = 9223372036854775807LL;
 #endif
 #ifdef HA_CAN_BULK_ACCESS
   is_bulk_access_clone = FALSE;
@@ -188,11 +179,8 @@ ha_spider::ha_spider(
   spider_alloc_calc_mem_init(mem_calc, 0);
   spider_alloc_calc_mem(spider_current_trx, mem_calc, sizeof(*this));
   share = NULL;
-  trx = NULL;
   conns = NULL;
   need_mons = NULL;
-  condition = NULL;
-  cond_check = FALSE;
   blob_buff = NULL;
   conn_keys = NULL;
   spider_thread_id = 0;
@@ -202,10 +190,8 @@ ha_spider::ha_spider(
   trx_hs_w_conn_adjustment = 0;
 #endif
   search_link_query_id = 0;
-  searched_bitmap = NULL;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  partition_handler_share = NULL;
-  pt_handler_share_creator = NULL;
+  partition_handler = NULL;
 #endif
 #ifdef HA_MRR_USE_DEFAULT_IMPL
   multi_range_keys = NULL;
@@ -214,7 +200,6 @@ ha_spider::ha_spider(
   append_tblnm_alias = NULL;
   use_index_merge = FALSE;
   is_clone = FALSE;
-  clone_bitmap_init = FALSE;
   pt_clone_source_handler = NULL;
   pt_clone_last_searcher = NULL;
   ft_handler = NULL;
@@ -228,6 +213,7 @@ ha_spider::ha_spider(
 #ifdef SPIDER_HAS_GROUP_BY_HANDLER
   use_fields = FALSE;
 #endif
+  dml_inited = FALSE;
   use_pre_call = FALSE;
   use_pre_action = FALSE;
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
@@ -240,10 +226,6 @@ ha_spider::ha_spider(
   hs_decrement = FALSE;
   hs_pushed_strref_num = 0;
 #endif
-  direct_update_fields = NULL;
-#endif
-#ifdef INFO_KIND_FORCE_LIMIT_BEGIN
-  info_limit = 9223372036854775807LL;
 #endif
 #ifdef HA_CAN_BULK_ACCESS
   is_bulk_access_clone = FALSE;
@@ -295,6 +277,14 @@ ha_spider::~ha_spider()
 {
   DBUG_ENTER("ha_spider::~ha_spider");
   DBUG_PRINT("info",("spider this=%p", this));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  partition_handler = NULL;
+#endif
+  if (wide_handler_owner)
+  {
+    spider_free(spider_current_trx, wide_handler, MYF(0));
+  }
+  wide_handler = NULL;
   spider_free_mem_calc(spider_current_trx, mem_calc_id, sizeof(*this));
   DBUG_VOID_RETURN;
 }
@@ -340,165 +330,148 @@ int ha_spider::open(
   THD *thd = ha_thd();
   int error_num, roop_count;
   int init_sql_alloc_size;
+  ha_spider *spider, *owner;
+  bool wide_handler_alloc = FALSE;
+  SPIDER_WIDE_SHARE *wide_share;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  SPIDER_PARTITION_SHARE *partition_share;
-  uchar *idx_read_bitmap, *idx_write_bitmap,
-    *rnd_read_bitmap, *rnd_write_bitmap;
   uint part_num;
-  bool create_pt_handler_share = FALSE, pt_handler_mutex = FALSE,
-    may_be_clone = FALSE;
-  ha_spider **pt_handler_share_handlers;
-#ifdef SPIDER_HAS_HASH_VALUE_TYPE
-  my_hash_value_type hash_value;
-#endif
+  bool partition_handler_alloc = FALSE;
+  ha_spider **wide_handler_handlers = NULL;
+  ha_partition *clone_source;
 #endif
   DBUG_ENTER("ha_spider::open");
   DBUG_PRINT("info",("spider this=%p", this));
 
   dup_key_idx = (uint) -1;
   conn_kinds = SPIDER_CONN_KIND_MYSQL;
-  if (!spider_get_share(name, table, thd, this, &error_num))
-    goto error_get_share;
-  thr_lock_data_init(&share->lock,&lock,NULL);
-
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  partition_share = share->partition_share;
   table->file->get_no_parts("", &part_num);
-  if (partition_share)
+  if (part_num)
   {
-    pt_handler_mutex = TRUE;
-    pthread_mutex_lock(&partition_share->pt_handler_mutex);
-/*
-    if (
-      !partition_share->partition_handler_share ||
-      partition_share->partition_handler_share->table != table
-    )
-      create_pt_handler_share = TRUE;
-*/
-#ifdef SPIDER_HAS_HASH_VALUE_TYPE
-    hash_value = my_calc_hash(&partition_share->pt_handler_hash,
-      (uchar*) &table, sizeof(TABLE *));
-    if (!(partition_handler_share = (SPIDER_PARTITION_HANDLER_SHARE*)
-      my_hash_search_using_hash_value(&partition_share->pt_handler_hash,
-      hash_value, (uchar*) &table, sizeof(TABLE *))))
-#else
-    if (!(partition_handler_share = (SPIDER_PARTITION_HANDLER_SHARE*)
-      my_hash_search(&partition_share->pt_handler_hash, (uchar*) &table,
-      sizeof(TABLE *))))
-#endif
+    wide_handler_handlers =
+      (ha_spider **) ((ha_partition *) table->file)->get_child_handlers();
+    spider = wide_handler_handlers[0];
+    owner = wide_handler_handlers[part_num - 1];
+    clone_source = ((ha_partition *) table->file)->get_clone_source();
+    if (clone_source)
     {
-      create_pt_handler_share = TRUE;
+      is_clone = TRUE;
     }
+  } else {
+#endif
+    spider = this;
+    owner = this;
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+    clone_source = NULL;
   }
-
-  if (create_pt_handler_share)
+#endif
+  if (!spider->wide_handler)
   {
-    if (!(searched_bitmap = (uchar *)
-      spider_bulk_malloc(spider_current_trx, 15, MYF(MY_WME),
-        &searched_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &ft_discard_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &position_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &partition_handler_share, sizeof(SPIDER_PARTITION_HANDLER_SHARE),
-        &idx_read_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &idx_write_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &rnd_read_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &rnd_write_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &pt_handler_share_handlers, sizeof(ha_spider *) * part_num,
-        NullS))
-    ) {
-      error_num = HA_ERR_OUT_OF_MEM;
-      goto error_searched_bitmap_alloc;
-    }
-    DBUG_PRINT("info",("spider create partition_handler_share"));
-    partition_handler_share->use_count = 1;
-/*
-    if (partition_handler_share->use_count < part_num)
-      partition_share->partition_handler_share = partition_handler_share;
-*/
-    DBUG_PRINT("info",("spider table=%p", table));
-    partition_handler_share->table = table;
-    partition_handler_share->searched_bitmap = NULL;
-    partition_handler_share->ft_discard_bitmap = NULL;
-    partition_handler_share->idx_read_bitmap = idx_read_bitmap;
-    partition_handler_share->idx_write_bitmap = idx_write_bitmap;
-    partition_handler_share->rnd_read_bitmap = rnd_read_bitmap;
-    partition_handler_share->rnd_write_bitmap = rnd_write_bitmap;
-    partition_handler_share->between_flg = FALSE;
-    partition_handler_share->idx_bitmap_is_set = FALSE;
-    partition_handler_share->rnd_bitmap_is_set = FALSE;
-    partition_handler_share->table_hash_value = hash_value;
-    partition_handler_share->creator = this;
-    partition_handler_share->parallel_search_query_id = 0;
-    pt_handler_share_creator = this;
-    if (part_num)
-    {
-      partition_handler_share->handlers = (void **) pt_handler_share_handlers;
-      partition_handler_share->handlers[0] = this;
-    } else
-      partition_handler_share->handlers = NULL;
-    uint old_elements = partition_share->pt_handler_hash.array.max_element;
-#ifdef HASH_UPDATE_WITH_HASH_VALUE
-    if (my_hash_insert_with_hash_value(&partition_share->pt_handler_hash,
-      hash_value, (uchar*) partition_handler_share))
+    uchar *searched_bitmap;
+    uchar *ft_discard_bitmap;
+    uchar *position_bitmap;
+    uchar *idx_read_bitmap;
+    uchar *idx_write_bitmap;
+    uchar *rnd_read_bitmap;
+    uchar *rnd_write_bitmap;
+    if (!(wide_handler = (SPIDER_WIDE_HANDLER *)
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+      spider_bulk_malloc(spider_current_trx, 16, MYF(MY_WME | MY_ZEROFILL),
+        &wide_handler, sizeof(SPIDER_WIDE_HANDLER),
+        &searched_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &ft_discard_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &position_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &idx_read_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &idx_write_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &rnd_read_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &rnd_write_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &partition_handler,
+          (uint) sizeof(SPIDER_PARTITION_HANDLER),
+        NullS)
 #else
-    if (my_hash_insert(&partition_share->pt_handler_hash,
-      (uchar*) partition_handler_share))
+      spider_bulk_malloc(spider_current_trx, 16, MYF(MY_WME | MY_ZEROFILL),
+        &wide_handler, sizeof(SPIDER_WIDE_HANDLER),
+        &searched_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &ft_discard_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &position_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &idx_read_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &idx_write_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &rnd_read_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        &rnd_write_bitmap,
+          (uint) sizeof(uchar) * no_bytes_in_map(table->read_set),
+        NullS)
 #endif
-    {
-      error_num = HA_ERR_OUT_OF_MEM;
-      goto error_hash_insert;
-    }
-    if (partition_share->pt_handler_hash.array.max_element > old_elements)
-    {
-      spider_alloc_calc_mem(spider_current_trx,
-        partition_share->pt_handler_hash,
-        (partition_share->pt_handler_hash.array.max_element - old_elements) *
-        partition_share->pt_handler_hash.array.size_of_element);
-    }
-    pthread_mutex_unlock(&partition_share->pt_handler_mutex);
-    pt_handler_mutex = FALSE;
-  } else {
-#endif
-    if (!(searched_bitmap = (uchar *)
-      spider_bulk_malloc(spider_current_trx, 16, MYF(MY_WME),
-        &searched_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &ft_discard_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        &position_bitmap, sizeof(uchar) * no_bytes_in_map(table->read_set),
-        NullS))
+        )
     ) {
       error_num = HA_ERR_OUT_OF_MEM;
-      goto error_searched_bitmap_alloc;
-    }
+      goto error_wide_handler_alloc;
+    }
+    spider->wide_handler = wide_handler;
+    owner->wide_handler = wide_handler;
+    wide_handler->searched_bitmap = searched_bitmap;
+    wide_handler->ft_discard_bitmap = ft_discard_bitmap;
+    wide_handler->position_bitmap = position_bitmap;
+    wide_handler->idx_read_bitmap = idx_read_bitmap;
+    wide_handler->idx_write_bitmap = idx_write_bitmap;
+    wide_handler->rnd_read_bitmap = rnd_read_bitmap;
+    wide_handler->rnd_write_bitmap = rnd_write_bitmap;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-    if (partition_share)
-    {
-      DBUG_PRINT("info",("spider copy partition_handler_share"));
-/*
-      partition_handler_share = (SPIDER_PARTITION_HANDLER_SHARE *)
-        partition_share->partition_handler_share;
-*/
-      if (part_num)
-      {
-        if (partition_handler_share->use_count >= part_num)
-          may_be_clone = TRUE;
-        else {
-          partition_handler_share->handlers[
-            partition_handler_share->use_count] = this;
-          partition_handler_share->use_count++;
-        }
-      }
-/*
-      if (partition_handler_share->use_count == part_num)
-        partition_share->partition_handler_share = NULL;
-*/
-      pthread_mutex_unlock(&partition_share->pt_handler_mutex);
-      pt_handler_mutex = FALSE;
-    }
+    wide_handler->partition_handler = partition_handler;
+#endif
+    wide_handler->owner = owner;
+    if (table_share->tmp_table == NO_TMP_TABLE)
+      wide_handler->top_share = table->s;
+    owner->wide_handler_owner = TRUE;
+    memset(wide_handler->ft_discard_bitmap, 0xFF,
+      no_bytes_in_map(table->read_set));
+    memset(wide_handler->searched_bitmap, 0,
+      no_bytes_in_map(table->read_set));
+    wide_handler_alloc = TRUE;
+
+  if (!share && !spider_get_share(name, table, thd, this, &error_num))
+    goto error_get_share;
+
+  wide_share = share->wide_share;
+
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+    DBUG_PRINT("info",("spider create partition_handler"));
+    DBUG_PRINT("info",("spider table=%p", table));
+    partition_handler->table = table;
+    partition_handler->no_parts = part_num;
+    partition_handler->owner = owner;
+    partition_handler->parallel_search_query_id = 0;
+    spider->partition_handler = partition_handler;
+    owner->partition_handler = partition_handler;
+    partition_handler->handlers = wide_handler_handlers;
+    partition_handler_alloc = TRUE;
+  } else {
+    wide_handler = spider->wide_handler;
+    partition_handler = wide_handler->partition_handler;
+
+    if (!share && !spider_get_share(name, table, thd, this, &error_num))
+      goto error_get_share;
+
+    wide_share= share->wide_share;
+  }
+  if (wide_handler_alloc)
+  {
+    thr_lock_data_init(&wide_share->lock, &wide_handler->lock, NULL);
   }
-#endif
-  memset(ft_discard_bitmap, 0xFF, no_bytes_in_map(table->read_set));
-  memset(searched_bitmap, 0, no_bytes_in_map(table->read_set));
 
+#endif
   init_sql_alloc_size =
     spider_param_init_sql_alloc_size(thd, share->init_sql_alloc_size);
 
@@ -557,10 +530,6 @@ int ha_spider::open(
     }
   }
 
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (may_be_clone && thd_sql_command(thd) != SQLCOM_ALTER_TABLE)
-    is_clone = TRUE;
-#endif
   if (is_clone)
   {
 #ifdef WITH_PARTITION_STORAGE_ENGINE
@@ -568,28 +537,38 @@ int ha_spider::open(
     {
       for (roop_count = 0; roop_count < (int) part_num; roop_count++)
       {
-        if (((ha_spider *) partition_handler_share->handlers[roop_count])->
-          share == share)
+        if (partition_handler->handlers[roop_count]->share == share)
         {
           pt_clone_source_handler =
-            (ha_spider *) partition_handler_share->handlers[roop_count];
+            partition_handler->handlers[roop_count];
           break;
         }
       }
     }
 #endif
 
-    sql_command = pt_clone_source_handler->sql_command;
-    result_list.lock_type = pt_clone_source_handler->result_list.lock_type;
-    lock_mode = pt_clone_source_handler->lock_mode;
+    wide_handler->external_lock_type =
+      pt_clone_source_handler->wide_handler->external_lock_type;
 
-    if (!pt_clone_source_handler->clone_bitmap_init)
+    if (wide_handler_alloc)
     {
-      pt_clone_source_handler->set_select_column_mode();
-      pt_clone_source_handler->clone_bitmap_init = TRUE;
+      wide_handler->lock_mode =
+        pt_clone_source_handler->wide_handler->lock_mode;
+      if (!partition_handler->clone_bitmap_init)
+      {
+        pt_clone_source_handler->set_select_column_mode();
+        partition_handler->clone_bitmap_init = TRUE;
+      }
+      set_clone_searched_bitmap();
+      wide_handler->position_bitmap_init = FALSE;
+      wide_handler->sql_command =
+        pt_clone_source_handler->wide_handler->sql_command;
+    }
+  } else {
+    if (share->semi_table_lock)
+    {
+      wide_handler->semi_table_lock = TRUE;
     }
-    set_clone_searched_bitmap();
-    position_bitmap_init = FALSE;
   }
 #ifdef HA_CAN_BULK_ACCESS
   external_lock_cnt = 0;
@@ -609,50 +588,37 @@ error_reset:
 error_init_blob_buff:
 error_init_result_list:
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (
-    partition_handler_share &&
-    pt_handler_share_creator == this
-  ) {
-    partition_share = share->partition_share;
-    if (!pt_handler_mutex)
-      pthread_mutex_lock(&partition_share->pt_handler_mutex);
-/*
-    if (partition_share->partition_handler_share == partition_handler_share)
-      partition_share->partition_handler_share = NULL;
-*/
-#ifdef HASH_UPDATE_WITH_HASH_VALUE
-    my_hash_delete_with_hash_value(&partition_share->pt_handler_hash,
-      partition_handler_share->table_hash_value,
-      (uchar*) partition_handler_share);
-#else
-    my_hash_delete(&partition_share->pt_handler_hash,
-      (uchar*) partition_handler_share);
-#endif
-    pthread_mutex_unlock(&partition_share->pt_handler_mutex);
-    pt_handler_mutex = FALSE;
-  }
-error_hash_insert:
-  partition_handler_share = NULL;
-  pt_handler_share_creator = NULL;
-#endif
-  if (searched_bitmap)
+  if (partition_handler_alloc)
   {
-    spider_free(spider_current_trx, searched_bitmap, MYF(0));
-    searched_bitmap = NULL;
+    wide_share = share->wide_share;
+    spider->partition_handler = NULL;
+    owner->partition_handler = NULL;
   }
-error_searched_bitmap_alloc:
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (pt_handler_mutex)
-    pthread_mutex_unlock(&partition_share->pt_handler_mutex);
+  partition_handler = NULL;
 #endif
   spider_free_share(share);
   share = NULL;
-error_get_share:
   if (conn_keys)
   {
     spider_free(spider_current_trx, conn_keys, MYF(0));
     conn_keys = NULL;
   }
+error_get_share:
+  if (wide_handler_alloc)
+  {
+    spider_free(spider_current_trx, wide_handler, MYF(0));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+    if (wide_handler_handlers)
+    {
+      wide_handler_handlers[0]->wide_handler = NULL;
+    }
+#endif
+    spider->wide_handler = NULL;
+    owner->wide_handler = NULL;
+    owner->wide_handler_owner = FALSE;
+  }
+  wide_handler = NULL;
+error_wide_handler_alloc:
   DBUG_RETURN(error_num);
 }
 
@@ -660,9 +626,6 @@ int ha_spider::close()
 {
   int error_num = 0, roop_count, error_num2;
   THD *thd = ha_thd();
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  SPIDER_PARTITION_SHARE *partition_share;
-#endif
   backup_error_status();
   DBUG_ENTER("ha_spider::close");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -688,8 +651,8 @@ int ha_spider::close()
         bulk_access_link_first->spider));
       DBUG_PRINT("info",("spider bulk_access_link->spider->dbton_handler=%p",
         bulk_access_link_first->spider->dbton_handler));
-      DBUG_PRINT("info",("spider ptr bulk_access_link->spider->dbton_handler=%p",
-        &bulk_access_link_first->spider->dbton_handler));
+      DBUG_PRINT("info",("spider ptr bulk_access_link->spider->dbton_handler="
+        "%p", &bulk_access_link_first->spider->dbton_handler));
       bulk_access_link_current = bulk_access_link_first->next;
       delete_bulk_access_link(bulk_access_link_first);
       bulk_access_link_first = bulk_access_link_current;
@@ -752,34 +715,14 @@ int ha_spider::close()
     conn_keys = NULL;
   }
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (
-    partition_handler_share &&
-    pt_handler_share_creator == this
-  ) {
-    partition_share = share->partition_share;
-    pthread_mutex_lock(&partition_share->pt_handler_mutex);
-/*
-    if (partition_share->partition_handler_share == partition_handler_share)
-      partition_share->partition_handler_share = NULL;
-*/
-#ifdef HASH_UPDATE_WITH_HASH_VALUE
-    my_hash_delete_with_hash_value(&partition_share->pt_handler_hash,
-      partition_handler_share->table_hash_value,
-      (uchar*) partition_handler_share);
-#else
-    my_hash_delete(&partition_share->pt_handler_hash,
-      (uchar*) partition_handler_share);
-#endif
-    pthread_mutex_unlock(&partition_share->pt_handler_mutex);
-  }
-  partition_handler_share = NULL;
-  pt_handler_share_creator = NULL;
+  partition_handler = NULL;
 #endif
-  if (searched_bitmap)
+  if (wide_handler_owner)
   {
-    spider_free(spider_current_trx, searched_bitmap, MYF(0));
-    searched_bitmap = NULL;
+    spider_free(spider_current_trx, wide_handler, MYF(0));
+    wide_handler_owner = FALSE;
   }
+  wide_handler = NULL;
   if (blob_buff)
   {
     delete [] blob_buff;
@@ -824,34 +767,23 @@ int ha_spider::close()
   }
 #endif
 #endif
-#ifdef HA_CAN_BULK_ACCESS
-/*
-  if (init_ha_mem_root)
-  {
-    free_root(&ha_mem_root, MYF(0));
-    init_ha_mem_root = FALSE;
-  }
-*/
-#endif
   is_clone = FALSE;
   pt_clone_source_handler = NULL;
   share = NULL;
-  trx = NULL;
   conns = NULL;
 
   DBUG_RETURN(error_num);
 }
 
-int ha_spider::check_access_kind(
+int ha_spider::check_access_kind_for_connection(
   THD *thd,
   bool write_request
 ) {
   int error_num, roop_count;
-  DBUG_ENTER("ha_spider::check_access_kind");
+  DBUG_ENTER("ha_spider::check_access_kind_for_connection");
   DBUG_PRINT("info",("spider this=%p", this));
-  sql_command = thd_sql_command(thd);
   conn_kinds = 0;
-  switch (sql_command)
+  switch (wide_handler->sql_command)
   {
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     case SQLCOM_HS_READ:
@@ -953,33 +885,60 @@ int ha_spider::check_access_kind(
   {
     DBUG_RETURN(error_num);
   }
-  DBUG_PRINT("info",("spider sql_command=%u", sql_command));
+  DBUG_PRINT("info",("spider wide_handler->semi_trx_isolation_chk = %s",
+    wide_handler->semi_trx_isolation_chk ? "TRUE" : "FALSE"));
+  if (wide_handler->semi_trx_isolation_chk)
+  {
+    SPIDER_SET_CONNS_PARAM(semi_trx_isolation_chk, TRUE, conns,
+      share->link_statuses, conn_link_idx, (int) share->link_count,
+      SPIDER_LINK_STATUS_RECOVERY);
+  }
+  DBUG_PRINT("info",("spider wide_handler->semi_trx_chk = %s",
+    wide_handler->semi_trx_chk ? "TRUE" : "FALSE"));
+  if (wide_handler->semi_trx_chk)
+  {
+    SPIDER_SET_CONNS_PARAM(semi_trx_chk, TRUE, conns, share->link_statuses,
+      conn_link_idx, (int) share->link_count, SPIDER_LINK_STATUS_RECOVERY);
+  } else {
+    SPIDER_SET_CONNS_PARAM(semi_trx_chk, FALSE, conns, share->link_statuses,
+      conn_link_idx, (int) share->link_count, SPIDER_LINK_STATUS_RECOVERY);
+  }
+  DBUG_RETURN(0);
+}
+
+void ha_spider::check_access_kind(
+  THD *thd
+) {
+  DBUG_ENTER("ha_spider::check_access_kind");
+  DBUG_PRINT("info",("spider this=%p", this));
+  wide_handler->sql_command = thd_sql_command(thd);
+  DBUG_PRINT("info",("spider sql_command=%u", wide_handler->sql_command));
   DBUG_PRINT("info",("spider thd->query_id=%lld", thd->query_id));
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
 #ifdef HS_HAS_SQLCOM
-  if (sql_command == SQLCOM_HS_UPDATE)
-    update_request = TRUE;
+  if (wide_handler->sql_command == SQLCOM_HS_UPDATE)
+    wide_handler->update_request = TRUE;
   else
 #endif
-    update_request = FALSE;
+    wide_handler->update_request = FALSE;
 #else
   if (
 #ifdef HS_HAS_SQLCOM
-    sql_command == SQLCOM_HS_UPDATE ||
+    wide_handler->sql_command == SQLCOM_HS_UPDATE ||
 #endif
-    sql_command == SQLCOM_UPDATE ||
-    sql_command == SQLCOM_UPDATE_MULTI ||
+    wide_handler->sql_command == SQLCOM_UPDATE ||
+    wide_handler->sql_command == SQLCOM_UPDATE_MULTI ||
     /* for triggers */
-    sql_command == SQLCOM_INSERT ||
-    sql_command == SQLCOM_INSERT_SELECT ||
-    sql_command == SQLCOM_DELETE ||
-    sql_command == SQLCOM_DELETE_MULTI
+    wide_handler->sql_command == SQLCOM_INSERT ||
+    wide_handler->sql_command == SQLCOM_INSERT_SELECT ||
+    wide_handler->sql_command == SQLCOM_DELETE ||
+    wide_handler->sql_command == SQLCOM_DELETE_MULTI
   )
-    update_request = TRUE;
+    wide_handler->update_request = TRUE;
   else
-    update_request = FALSE;
+    wide_handler->update_request = FALSE;
 #endif
-  DBUG_RETURN(0);
+  DBUG_VOID_RETURN;
 }
 
 #ifdef HA_CAN_BULK_ACCESS
@@ -993,14 +952,20 @@ int ha_spider::additional_lock(
   {
     if (is_bulk_access_clone)
     {
-      DBUG_RETURN(check_access_kind(thd, (lock_type >= TL_WRITE_ALLOW_WRITE)));
+      check_access_kind(thd);
+      DBUG_RETURN(check_access_kind_for_connection(thd,
+        (lock_type >= TL_WRITE_ALLOW_WRITE)));
     } else if (bulk_access_link_exec_tgt->called)
     {
-      DBUG_RETURN(bulk_access_link_exec_tgt->spider->check_access_kind(
-        thd, (lock_type >= TL_WRITE_ALLOW_WRITE)));
+      bulk_access_link_exec_tgt->spider->check_access_kind(thd);
+      DBUG_RETURN(bulk_access_link_exec_tgt->spider->
+        check_access_kind_for_connection(
+          thd, (lock_type >= TL_WRITE_ALLOW_WRITE)));
     }
   }
-  DBUG_RETURN(check_access_kind(thd, (lock_type >= TL_WRITE_ALLOW_WRITE)));
+  check_access_kind(thd);
+  DBUG_RETURN(check_access_kind_for_connection(thd,
+    (lock_type >= TL_WRITE_ALLOW_WRITE)));
 }
 #endif
 
@@ -1009,53 +974,37 @@ THR_LOCK_DATA **ha_spider::store_lock(
   THR_LOCK_DATA **to,
   enum thr_lock_type lock_type
 ) {
-  int error_num, roop_count;
   DBUG_ENTER("ha_spider::store_lock");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (lock_type == TL_IGNORE)
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  if (
+    wide_handler->stage == SPD_HND_STAGE_STORE_LOCK &&
+    wide_handler->stage_executor != this)
   {
-    *to++ = &lock;
     DBUG_RETURN(to);
   }
-  if ((error_num = check_access_kind(thd,
-    (lock_type >= TL_WRITE_ALLOW_WRITE))))
+  wide_handler->stage = SPD_HND_STAGE_STORE_LOCK;
+  wide_handler->stage_executor = this;
+#endif
+  wide_handler->lock_table_type = 0;
+  if (lock_type == TL_IGNORE)
   {
-    store_error_num = error_num;
+    *to++ = &wide_handler->lock;
     DBUG_RETURN(to);
   }
-  DBUG_PRINT("info",("spider sql_command=%u", sql_command));
+  check_access_kind(thd);
+  DBUG_PRINT("info",("spider sql_command=%u", wide_handler->sql_command));
   DBUG_PRINT("info",("spider lock_type=%d", lock_type));
   DBUG_PRINT("info",("spider thd->query_id=%lld", thd->query_id));
-  if (sql_command == SQLCOM_ALTER_TABLE)
-  {
-    if (trx->query_id != thd->query_id)
-    {
-      spider_free_trx_alter_table(trx);
-      trx->query_id = thd->query_id;
-      trx->tmp_flg = FALSE;
-    }
-    if (!(SPIDER_ALTER_TABLE*) my_hash_search(&trx->trx_alter_table_hash,
-      (uchar*) share->table_name, share->table_name_length))
-    {
-      if (spider_create_trx_alter_table(trx, share, FALSE))
-      {
-        store_error_num = HA_ERR_OUT_OF_MEM;
-        DBUG_RETURN(to);
-      }
-    }
-  }
 
-  this->lock_type = lock_type;
-  selupd_lock_mode = spider_param_selupd_lock_mode(thd,
-    share->selupd_lock_mode);
+  wide_handler->lock_type = lock_type;
   if (
-    sql_command != SQLCOM_DROP_TABLE &&
-    sql_command != SQLCOM_ALTER_TABLE
+    wide_handler->sql_command != SQLCOM_DROP_TABLE &&
+    wide_handler->sql_command != SQLCOM_ALTER_TABLE
   ) {
-    SPIDER_SET_CONNS_PARAM(semi_trx_chk, FALSE, conns, share->link_statuses,
-      conn_link_idx, (int) share->link_count, SPIDER_LINK_STATUS_RECOVERY);
+    wide_handler->semi_trx_chk = FALSE;
   }
-  switch (sql_command)
+  switch (wide_handler->sql_command)
   {
     case SQLCOM_SELECT:
     case SQLCOM_HA_READ:
@@ -1063,17 +1012,14 @@ THR_LOCK_DATA **ha_spider::store_lock(
     case SQLCOM_HS_READ:
 #endif
       if (lock_type == TL_READ_WITH_SHARED_LOCKS)
-        lock_mode = 1;
+        wide_handler->lock_mode = 1;
       else if (lock_type <= TL_READ_NO_INSERT)
       {
-        lock_mode = 0;
-        SPIDER_SET_CONNS_PARAM(semi_trx_isolation_chk, TRUE, conns,
-          share->link_statuses, conn_link_idx, (int) share->link_count,
-          SPIDER_LINK_STATUS_RECOVERY);
+        wide_handler->lock_mode = 0;
+        wide_handler->semi_trx_isolation_chk = TRUE;
       } else
-        lock_mode = -1;
-      SPIDER_SET_CONNS_PARAM(semi_trx_chk, TRUE, conns, share->link_statuses,
-        conn_link_idx, (int) share->link_count, SPIDER_LINK_STATUS_RECOVERY);
+        wide_handler->lock_mode = -1;
+      wide_handler->semi_trx_chk = TRUE;
       break;
     case SQLCOM_UPDATE:
     case SQLCOM_UPDATE_MULTI:
@@ -1094,40 +1040,38 @@ THR_LOCK_DATA **ha_spider::store_lock(
 #endif
       if (lock_type >= TL_READ && lock_type <= TL_READ_NO_INSERT)
       {
-        lock_mode = selupd_lock_mode;
-        SPIDER_SET_CONNS_PARAM(semi_trx_isolation_chk, TRUE, conns,
-          share->link_statuses, conn_link_idx, (int) share->link_count,
-          SPIDER_LINK_STATUS_RECOVERY);
+        wide_handler->lock_mode = -2;
+        wide_handler->semi_trx_isolation_chk = TRUE;
       } else
-        lock_mode = -1;
-      SPIDER_SET_CONNS_PARAM(semi_trx_chk, TRUE, conns, share->link_statuses,
-        conn_link_idx, (int) share->link_count, SPIDER_LINK_STATUS_RECOVERY);
+        wide_handler->lock_mode = -1;
+      wide_handler->semi_trx_chk = TRUE;
       break;
     default:
-        lock_mode = -1;
+        wide_handler->lock_mode = -1;
   }
   switch (lock_type)
   {
     case TL_READ_HIGH_PRIORITY:
-      high_priority = TRUE;
+      wide_handler->high_priority = TRUE;
       break;
     case TL_WRITE_DELAYED:
-      insert_delayed = TRUE;
+      wide_handler->insert_delayed = TRUE;
       break;
     case TL_WRITE_LOW_PRIORITY:
-      low_priority = TRUE;
+      wide_handler->low_priority = TRUE;
       break;
     default:
       break;
   }
 
-  if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK)
+  if (wide_handler->lock_type != TL_IGNORE &&
+    wide_handler->lock.type == TL_UNLOCK)
   {
     if (
-      sql_command == SQLCOM_DROP_TABLE ||
-      sql_command == SQLCOM_ALTER_TABLE ||
-      sql_command == SQLCOM_SHOW_CREATE ||
-      sql_command == SQLCOM_SHOW_FIELDS
+      wide_handler->sql_command == SQLCOM_DROP_TABLE ||
+      wide_handler->sql_command == SQLCOM_ALTER_TABLE ||
+      wide_handler->sql_command == SQLCOM_SHOW_CREATE ||
+      wide_handler->sql_command == SQLCOM_SHOW_FIELDS
     ) {
       if (
         lock_type == TL_READ_NO_INSERT &&
@@ -1140,80 +1084,77 @@ THR_LOCK_DATA **ha_spider::store_lock(
       )
         lock_type = TL_WRITE_ALLOW_WRITE;
     } else if (
-      sql_command == SQLCOM_LOCK_TABLES ||
-      (spider_param_lock_exchange(thd) == 1 && share->semi_table_lock))
+      wide_handler->sql_command == SQLCOM_LOCK_TABLES ||
+      (spider_param_lock_exchange(thd) == 1 && wide_handler->semi_table_lock))
     {
       DBUG_PRINT("info",("spider lock exchange route"));
-      DBUG_PRINT("info",("spider lock_type=%u", this->lock_type));
+      DBUG_PRINT("info",("spider lock_type=%u", wide_handler->lock_type));
       if (
         (
-          this->lock_type == TL_READ ||
-          this->lock_type == TL_READ_NO_INSERT ||
-          this->lock_type == TL_WRITE_LOW_PRIORITY ||
-          this->lock_type == TL_WRITE
+          wide_handler->lock_type == TL_READ ||
+          wide_handler->lock_type == TL_READ_NO_INSERT ||
+          wide_handler->lock_type == TL_WRITE_LOW_PRIORITY ||
+          wide_handler->lock_type == TL_WRITE
         ) &&
         !spider_param_local_lock_table(thd)
       ) {
-        for (
-          roop_count = spider_conn_link_idx_next(share->link_statuses,
-            conn_link_idx, -1, share->link_count,
-            SPIDER_LINK_STATUS_RECOVERY);
-          roop_count < (int) share->link_count;
-          roop_count = spider_conn_link_idx_next(share->link_statuses,
-            conn_link_idx, roop_count, share->link_count,
-            SPIDER_LINK_STATUS_RECOVERY)
-        ) {
-          SPIDER_CONN *conn = conns[roop_count];
-          int appended = 0;
-          if ((error_num = dbton_handler[conn->dbton_id]->
-            append_lock_tables_list(conn, roop_count, &appended)))
-          {
-            store_error_num = error_num;
-            DBUG_RETURN(to);
-          }
-          if (appended)
+        wide_handler->lock_table_type = 1;
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+        if (partition_handler && partition_handler->handlers)
+        {
+          uint roop_count;
+          for (roop_count = 0; roop_count < partition_handler->no_parts;
+            ++roop_count)
           {
-            conn->table_lock = 2;
+            if (unlikely((store_error_num =
+              partition_handler->handlers[roop_count]->
+                append_lock_tables_list())))
+            {
+              break;
+            }
           }
+        } else {
+#endif
+          store_error_num = append_lock_tables_list();
+#ifdef WITH_PARTITION_STORAGE_ENGINE
         }
+#endif
       }
     } else {
       DBUG_PRINT("info",("spider default lock route"));
-      DBUG_PRINT("info",("spider lock_type=%u", this->lock_type));
+      DBUG_PRINT("info",("spider lock_type=%u", wide_handler->lock_type));
       if (
-        this->lock_type == TL_READ ||
-        this->lock_type == TL_READ_NO_INSERT ||
-        this->lock_type == TL_WRITE_LOW_PRIORITY ||
-        this->lock_type == TL_WRITE
+        wide_handler->lock_type == TL_READ ||
+        wide_handler->lock_type == TL_READ_NO_INSERT ||
+        wide_handler->lock_type == TL_WRITE_LOW_PRIORITY ||
+        wide_handler->lock_type == TL_WRITE
       ) {
-        for (
-          roop_count = spider_conn_link_idx_next(share->link_statuses,
-            conn_link_idx, -1, share->link_count,
-            SPIDER_LINK_STATUS_RECOVERY);
-          roop_count < (int) share->link_count;
-          roop_count = spider_conn_link_idx_next(share->link_statuses,
-            conn_link_idx, roop_count, share->link_count,
-            SPIDER_LINK_STATUS_RECOVERY)
+        if (
+          !spider_param_local_lock_table(thd) &&
+          spider_param_semi_table_lock(thd, wide_handler->semi_table_lock)
         ) {
-          if (
-            conns[roop_count] &&
-            conns[roop_count]->table_lock != 1 &&
-            spider_param_semi_table_lock(thd, share->semi_table_lock) &&
-            !spider_param_local_lock_table(thd)
-          ) {
-            SPIDER_CONN *conn = conns[roop_count];
-            int appended = 0;
-            if ((error_num = dbton_handler[conn->dbton_id]->
-              append_lock_tables_list(conn, roop_count, &appended)))
-            {
-              store_error_num = error_num;
-              DBUG_RETURN(to);
-            }
-            if (appended)
+          wide_handler->lock_table_type = 2;
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+          if (partition_handler && partition_handler->handlers)
+          {
+            uint roop_count;
+            for (roop_count = 0;
+              roop_count < partition_handler->no_parts;
+              ++roop_count)
             {
-              conn->table_lock = 3;
+              if (unlikely((store_error_num =
+                partition_handler->handlers[roop_count]->
+                  append_lock_tables_list())))
+              {
+                break;
+              }
             }
+          } else {
+#endif
+            store_error_num = append_lock_tables_list();
+#ifdef WITH_PARTITION_STORAGE_ENGINE
           }
+#endif
         }
       }
       if (
@@ -1228,9 +1169,9 @@ THR_LOCK_DATA **ha_spider::store_lock(
       )
         lock_type = TL_WRITE_ALLOW_WRITE;
     }
-    lock.type = lock_type;
+    wide_handler->lock.type = lock_type;
   }
-  *to++ = &lock;
+  *to++ = &wide_handler->lock;
   DBUG_RETURN(to);
 }
 
@@ -1238,8 +1179,8 @@ int ha_spider::external_lock(
   THD *thd,
   int lock_type
 ) {
-  int error_num, roop_count;
-  bool sync_trx_isolation = spider_param_sync_trx_isolation(thd);
+  int error_num = 0;
+  SPIDER_TRX *trx;
   backup_error_status();
   DBUG_ENTER("ha_spider::external_lock");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -1247,41 +1188,49 @@ int ha_spider::external_lock(
 #if MYSQL_VERSION_ID < 50500
   DBUG_PRINT("info",("spider thd->options=%x", (int) thd->options));
 #endif
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  if (
+    wide_handler->stage == SPD_HND_STAGE_EXTERNAL_LOCK &&
+    wide_handler->stage_executor != this)
+  {
+    DBUG_RETURN(0);
+  }
+  wide_handler->stage = SPD_HND_STAGE_EXTERNAL_LOCK;
+  wide_handler->stage_executor = this;
+#endif
 #ifdef HANDLER_HAS_NEED_INFO_FOR_AUTO_INC
   info_auto_called = FALSE;
 #endif
 
-  sql_command = thd_sql_command(thd);
-  if (sql_command == SQLCOM_BEGIN)
-    sql_command = SQLCOM_UNLOCK_TABLES;
-  if (
-    sql_command == SQLCOM_UNLOCK_TABLES &&
-    (error_num = spider_check_trx_and_get_conn(thd, this,
-      FALSE))
-  ) {
+  wide_handler->sql_command = thd_sql_command(thd);
+  if (wide_handler->sql_command == SQLCOM_BEGIN)
+    wide_handler->sql_command = SQLCOM_UNLOCK_TABLES;
+
+  trx = spider_get_trx(thd, TRUE, &error_num);
+  if (error_num)
     DBUG_RETURN(error_num);
-  }
+  wide_handler->trx = trx;
 
-  DBUG_PRINT("info",("spider sql_command=%d", sql_command));
-  DBUG_ASSERT(trx == spider_get_trx(thd, TRUE, &error_num));
+  DBUG_PRINT("info",("spider sql_command=%d", wide_handler->sql_command));
 #ifdef HA_CAN_BULK_ACCESS
-  external_lock_cnt++;
+  wide_handler->external_lock_cnt++;
 #endif
   if (
     lock_type == F_UNLCK &&
-    sql_command != SQLCOM_UNLOCK_TABLES
+    wide_handler->sql_command != SQLCOM_UNLOCK_TABLES
   )
     DBUG_RETURN(0);
   if (store_error_num)
     DBUG_RETURN(store_error_num);
+  wide_handler->external_lock_type = lock_type;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   if ((conn_kinds & SPIDER_CONN_KIND_MYSQL))
   {
 #endif
     if (
       /* SQLCOM_RENAME_TABLE and SQLCOM_DROP_DB don't come here */
-      sql_command == SQLCOM_DROP_TABLE ||
-      sql_command == SQLCOM_ALTER_TABLE
+      wide_handler->sql_command == SQLCOM_DROP_TABLE ||
+      wide_handler->sql_command == SQLCOM_ALTER_TABLE
     ) {
       if (trx->locked_connections)
       {
@@ -1291,277 +1240,59 @@ int ha_spider::external_lock(
       }
       DBUG_RETURN(0);
     }
-    if (!conns[search_link_idx])
+    if (unlikely((error_num = spider_internal_start_trx(this))))
     {
-      my_message(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM,
-        ER_SPIDER_REMOTE_SERVER_GONE_AWAY_STR, MYF(0));
-      DBUG_RETURN(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM);
-    }
-    for (
-      roop_count = spider_conn_link_idx_next(share->link_statuses,
-        conn_link_idx, -1, share->link_count,
-        SPIDER_LINK_STATUS_RECOVERY);
-      roop_count < (int) share->link_count;
-      roop_count = spider_conn_link_idx_next(share->link_statuses,
-        conn_link_idx, roop_count, share->link_count,
-        SPIDER_LINK_STATUS_RECOVERY)
-    ) {
-      if (sql_command == SQLCOM_TRUNCATE)
-        DBUG_RETURN(0);
-      else if (sql_command != SQLCOM_UNLOCK_TABLES)
-      {
-        DBUG_PRINT("info",("spider conns[%d]->join_trx=%u",
-          roop_count, conns[roop_count]->join_trx));
-        if (
-          (!conns[roop_count]->join_trx &&
-            (error_num = spider_internal_start_trx(this, conns[roop_count],
-              roop_count)))
-        ) {
-          if (
-            share->monitoring_kind[roop_count] &&
-            need_mons[roop_count]
-          ) {
-            error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
-                share,
-                roop_count,
-                (uint32) share->monitoring_sid[roop_count],
-                share->table_name,
-                share->table_name_length,
-                conn_link_idx[roop_count],
-                NULL,
-                0,
-                share->monitoring_kind[roop_count],
-                share->monitoring_limit[roop_count],
-                share->monitoring_flag[roop_count],
-                TRUE
-              );
-          }
-          DBUG_RETURN(check_error_mode(error_num));
-        }
-        result_list.lock_type = lock_type;
-        reset_first_link_idx();
-        if (
-          conns[roop_count]->semi_trx_isolation == -2 &&
-          conns[roop_count]->semi_trx_isolation_chk == TRUE &&
-          sync_trx_isolation &&
-          spider_param_semi_trx_isolation(trx->thd) >= 0
-        ) {
-/*
-          if (conns[roop_count]->trx_isolation !=
-            spider_param_semi_trx_isolation(trx->thd))
-          {
-*/
-            spider_conn_queue_semi_trx_isolation(conns[roop_count],
-              spider_param_semi_trx_isolation(trx->thd));
-/*
-          }
-          conns[roop_count]->semi_trx_isolation =
-            spider_param_semi_trx_isolation(trx->thd);
-          conns[roop_count]->trx_isolation =
-            thd_tx_isolation(conns[roop_count]->thd);
-          DBUG_PRINT("info",("spider conn=%p", conns[roop_count]));
-          DBUG_PRINT("info",("spider conn->trx_isolation=%d",
-            conns[roop_count]->trx_isolation));
-*/
-        } else {
-          if (sync_trx_isolation)
-          {
-            if ((error_num = spider_check_and_set_trx_isolation(
-              conns[roop_count], &need_mons[roop_count])))
-            {
-              if (
-                share->monitoring_kind[roop_count] &&
-                need_mons[roop_count]
-              ) {
-                error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
-                    share,
-                    roop_count,
-                    (uint32) share->monitoring_sid[roop_count],
-                    share->table_name,
-                    share->table_name_length,
-                    conn_link_idx[roop_count],
-                    NULL,
-                    0,
-                    share->monitoring_kind[roop_count],
-                    share->monitoring_limit[roop_count],
-                    share->monitoring_flag[roop_count],
-                    TRUE
-                  );
-              }
-              DBUG_RETURN(check_error_mode(error_num));
-            }
-          }
-          conns[roop_count]->semi_trx_isolation = -1;
-        }
-      }
-      if (conns[roop_count]->table_lock >= 2)
-      {
-        if (
-          conns[roop_count]->db_conn->have_lock_table_list() &&
-          (error_num = spider_db_lock_tables(this, roop_count))
-        ) {
-          if (
-            share->monitoring_kind[roop_count] &&
-            need_mons[roop_count]
-          ) {
-            error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
-                share,
-                roop_count,
-                (uint32) share->monitoring_sid[roop_count],
-                share->table_name,
-                share->table_name_length,
-                conn_link_idx[roop_count],
-                NULL,
-                0,
-                share->monitoring_kind[roop_count],
-                share->monitoring_limit[roop_count],
-                share->monitoring_flag[roop_count],
-                TRUE
-              );
-          }
-          conns[roop_count]->table_lock = 0;
-          DBUG_RETURN(check_error_mode(error_num));
-        }
-        if (conns[roop_count]->table_lock == 2)
-          conns[roop_count]->table_lock = 1;
-      } else if (sql_command == SQLCOM_UNLOCK_TABLES ||
-        spider_param_internal_unlock(thd) == 1)
-      {
-        if (conns[roop_count]->table_lock == 1)
-        {
-          conns[roop_count]->table_lock = 0;
-          if (!conns[roop_count]->trx_start)
-            conns[roop_count]->disable_reconnect = FALSE;
-          if ((error_num = spider_db_unlock_tables(this, roop_count)))
-          {
-            if (
-              share->monitoring_kind[roop_count] &&
-              need_mons[roop_count]
-            ) {
-              error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
-                  share,
-                  roop_count,
-                  (uint32) share->monitoring_sid[roop_count],
-                  share->table_name,
-                  share->table_name_length,
-                  conn_link_idx[roop_count],
-                  NULL,
-                  0,
-                  share->monitoring_kind[roop_count],
-                  share->monitoring_limit[roop_count],
-                  share->monitoring_flag[roop_count],
-                  TRUE
-                );
-            }
-            DBUG_RETURN(check_error_mode(error_num));
-          }
-        }
-      }
+      DBUG_RETURN(error_num);
     }
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   } else {
-    result_list.lock_type = lock_type;
-    reset_first_link_idx();
     trans_register_ha(trx->thd, FALSE, spider_hton_ptr);
     if (thd_test_options(trx->thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
       trans_register_ha(trx->thd, TRUE, spider_hton_ptr);
   }
+#endif
 
-  if ((conn_kinds & SPIDER_CONN_KIND_HS_READ))
+  if (wide_handler->lock_table_type > 0 ||
+    wide_handler->sql_command == SQLCOM_UNLOCK_TABLES)
   {
-    SPIDER_CONN *hs_conn;
-    for (
-      roop_count = spider_conn_link_idx_next(share->link_statuses,
-        conn_link_idx, -1, share->link_count,
-        SPIDER_LINK_STATUS_RECOVERY);
-      roop_count < (int) share->link_count;
-      roop_count = spider_conn_link_idx_next(share->link_statuses,
-        conn_link_idx, roop_count, share->link_count,
-        SPIDER_LINK_STATUS_RECOVERY)
-    ) {
-      hs_conn = hs_r_conns[roop_count];
-      if (
-        hs_conn &&
-        hs_conn->hsc_query_id != thd->query_id &&
-        hs_conn->hs_pre_age == hs_conn->hs_age
-      ) {
-        double interval = spider_param_hs_ping_interval(thd);
-        time_t tmp_time = (time_t) time((time_t*) 0);
-        DBUG_PRINT("info",
-          ("spider difftime=%f", difftime(tmp_time, hs_conn->ping_time)));
-        DBUG_PRINT("info", ("spider interval=%f", interval));
-        if (
-          hs_conn->server_lost ||
-          difftime(tmp_time, hs_conn->ping_time) >= interval
-        ) {
-          DBUG_PRINT("info", ("spider hsr[%d] need reconnect", roop_count));
-          hs_conn->hs_pre_age++;
-          hs_conn->ping_time = tmp_time;
+    if (wide_handler->sql_command == SQLCOM_UNLOCK_TABLES)
+    {
+      /* lock tables does not call reset() */
+      /* unlock tables does not call store_lock() */
+      wide_handler->lock_table_type = 0;
+    }
+
+    /* lock/unlock tables */
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+    if (partition_handler && partition_handler->handlers)
+    {
+      uint roop_count;
+      for (roop_count = 0; roop_count < partition_handler->no_parts;
+        ++roop_count)
+      {
+        if (unlikely((error_num =
+          partition_handler->handlers[roop_count]->lock_tables())))
+        {
+          DBUG_RETURN(error_num);
         }
-        hs_conn->hsc_query_id = thd->query_id;
       }
-    }
-  }
-  if (
-#if defined(HS_HAS_SQLCOM) && defined(HANDLER_HAS_DIRECT_UPDATE_ROWS)
-    (
-#endif
-      conn_kinds & SPIDER_CONN_KIND_HS_WRITE
-#if defined(HS_HAS_SQLCOM) && defined(HANDLER_HAS_DIRECT_UPDATE_ROWS)
-    ) ||
-    /* for direct_update */
-    sql_command == SQLCOM_HS_UPDATE ||
-    sql_command == SQLCOM_HS_DELETE
+    } else {
 #endif
-  ) {
-    SPIDER_CONN *hs_conn;
-    for (
-      roop_count = spider_conn_link_idx_next(share->link_statuses,
-        conn_link_idx, -1, share->link_count,
-        SPIDER_LINK_STATUS_RECOVERY);
-      roop_count < (int) share->link_count;
-      roop_count = spider_conn_link_idx_next(share->link_statuses,
-        conn_link_idx, roop_count, share->link_count,
-        SPIDER_LINK_STATUS_RECOVERY)
-    ) {
-      hs_conn = hs_w_conns[roop_count];
-      if (
-        hs_conn &&
-        hs_conn->hsc_query_id != thd->query_id &&
-        hs_conn->hs_pre_age == hs_conn->hs_age
-      ) {
-        double interval = spider_param_hs_ping_interval(thd);
-        time_t tmp_time = (time_t) time((time_t*) 0);
-        DBUG_PRINT("info",
-          ("spider difftime=%f", difftime(tmp_time, hs_conn->ping_time)));
-        DBUG_PRINT("info", ("spider interval=%f", interval));
-        if (
-          hs_conn->server_lost ||
-          difftime(tmp_time, hs_conn->ping_time) >= interval
-        ) {
-          DBUG_PRINT("info", ("spider hsw[%d] need reconnect", roop_count));
-          hs_conn->hs_pre_age++;
-          hs_conn->ping_time = tmp_time;
-        }
-        hs_conn->hsc_query_id = thd->query_id;
+      if (unlikely((error_num = lock_tables())))
+      {
+        DBUG_RETURN(error_num);
       }
+#ifdef WITH_PARTITION_STORAGE_ENGINE
     }
-  }
 #endif
+  }
 
-  DBUG_PRINT("info",("spider trx_start=%s", trx->trx_start ? "TRUE" : "FALSE"));
+  DBUG_PRINT("info",("spider trx_start=%s",
+    trx->trx_start ? "TRUE" : "FALSE"));
   /* need to check after spider_internal_start_trx() */
   if (trx->trx_start)
   {
-    switch (sql_command)
+    switch (wide_handler->sql_command)
     {
       case SQLCOM_SELECT:
       case SQLCOM_HA_READ:
@@ -1596,6 +1327,24 @@ int ha_spider::external_lock(
   DBUG_RETURN(0);
 }
 
+/* Record this handler as executor of the START_STMT stage; returns 0. */
+int ha_spider::start_stmt(THD *thd, thr_lock_type lock_type)
+{
+  DBUG_ENTER("ha_spider::start_stmt");
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  /* Leave the state alone when a different handler holds this stage. */
+  const bool claimed_by_other =
+    wide_handler->stage == SPD_HND_STAGE_START_STMT &&
+    wide_handler->stage_executor != this;
+  if (!claimed_by_other)
+  {
+    wide_handler->stage = SPD_HND_STAGE_START_STMT;
+    wide_handler->stage_executor = this;
+  }
+#endif
+  DBUG_RETURN(0);
+}
+
 int ha_spider::reset()
 {
   int error_num = 0, error_num2, roop_count;
@@ -1665,25 +1414,49 @@ int ha_spider::reset()
 #endif
   result_list.direct_distinct = FALSE;
   store_error_num = 0;
-#ifdef WITH_PARTITION_STORAGE_ENGINE
   if (
-    partition_handler_share &&
-    partition_handler_share->searched_bitmap
+    wide_handler &&
+    wide_handler->sql_command != SQLCOM_END
   ) {
+    wide_handler->sql_command = SQLCOM_END;
+    wide_handler->between_flg = FALSE;
+    wide_handler->idx_bitmap_is_set = FALSE;
+    wide_handler->rnd_bitmap_is_set = FALSE;
+    wide_handler->quick_mode = FALSE;
+    wide_handler->keyread = FALSE;
+    wide_handler->ignore_dup_key = FALSE;
+    wide_handler->write_can_replace = FALSE;
+    wide_handler->insert_with_update = FALSE;
+    wide_handler->low_priority = FALSE;
+    wide_handler->high_priority = FALSE;
+    wide_handler->insert_delayed = FALSE;
+    wide_handler->lock_table_type = 0;
+    wide_handler->semi_trx_isolation_chk = FALSE;
+    wide_handler->semi_trx_chk = FALSE;
     if (!is_clone)
     {
-      partition_handler_share->searched_bitmap = NULL;
-      partition_handler_share->ft_discard_bitmap = NULL;
+      memset(wide_handler->ft_discard_bitmap, 0xFF,
+        no_bytes_in_map(table->read_set));
+      memset(wide_handler->searched_bitmap, 0,
+        no_bytes_in_map(table->read_set));
     }
-    partition_handler_share->between_flg = FALSE;
-    partition_handler_share->idx_bitmap_is_set = FALSE;
-    partition_handler_share->rnd_bitmap_is_set = FALSE;
-  }
+    while (wide_handler->condition)
+    {
+      tmp_cond = wide_handler->condition->next;
+      spider_free(spider_current_trx, wide_handler->condition, MYF(0));
+      wide_handler->condition = tmp_cond;
+    }
+    wide_handler->cond_check = FALSE;
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+    wide_handler->direct_update_fields = NULL;
+#endif
+#ifdef INFO_KIND_FORCE_LIMIT_BEGIN
+    wide_handler->info_limit = 9223372036854775807LL;
+#endif
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+    wide_handler->stage = SPD_HND_STAGE_NONE;
+    wide_handler->stage_executor = NULL;
 #endif
-  if (!is_clone)
-  {
-    memset(ft_discard_bitmap, 0xFF, no_bytes_in_map(table->read_set));
-    memset(searched_bitmap, 0, no_bytes_in_map(table->read_set));
   }
   if (!(tmp_trx = spider_get_trx(thd, TRUE, &error_num2)))
   {
@@ -1693,82 +1466,11 @@ int ha_spider::reset()
   }
   if (share)
   {
-    trx_bak = trx;
-    trx = tmp_trx;
+    trx_bak = wide_handler->trx;
+    wide_handler->trx = tmp_trx;
     if ((error_num2 = spider_db_free_result(this, FALSE)))
       error_num = error_num2;
-    trx = trx_bak;
-/*
-    int semi_table_lock_conn = spider_param_semi_table_lock_connection(thd,
-      share->semi_table_lock_conn);
-    if (semi_table_lock_conn)
-      first_byte = '0' +
-        spider_param_semi_table_lock(thd, share->semi_table_lock);
-    else
-      first_byte = '0';
-    DBUG_PRINT("info",("spider semi_table_lock_conn = %d",
-      semi_table_lock_conn));
-    DBUG_PRINT("info",("spider semi_table_lock = %d",
-      spider_param_semi_table_lock(thd, share->semi_table_lock)));
-    DBUG_PRINT("info",("spider first_byte = %d", first_byte));
-    if (tmp_trx->spider_thread_id != spider_thread_id ||
-      (tmp_trx->trx_conn_adjustment != trx_conn_adjustment &&
-        tmp_trx->trx_conn_adjustment - 1 != trx_conn_adjustment) ||
-      first_byte != *conn_keys[0]
-    ) {
-      DBUG_PRINT("info",(first_byte != *conn_keys[0] ?
-        "spider change conn type" : tmp_trx != trx ? "spider change thd" :
-        "spider next trx"));
-      trx = tmp_trx;
-      spider_thread_id = tmp_trx->spider_thread_id;
-      trx_conn_adjustment = tmp_trx->trx_conn_adjustment;
-
-      first_byte_bak = *conn_keys[0];
-      *conn_keys[0] = first_byte;
-      for (
-        roop_count = spider_conn_link_idx_next(share->link_statuses,
-          conn_link_idx, -1, share->link_count,
-          SPIDER_LINK_STATUS_RECOVERY);
-        roop_count < share->link_count;
-        roop_count = spider_conn_link_idx_next(share->link_statuses,
-          conn_link_idx, roop_count, share->link_count,
-          SPIDER_LINK_STATUS_RECOVERY)
-      ) {
-        *conn_keys[roop_count] = first_byte;
-        if (
-          !(conns[roop_count] =
-            spider_get_conn(share, roop_count, conn_keys[roop_count], trx,
-              this, FALSE, TRUE, SPIDER_CONN_KIND_MYSQL, &error_num))
-        ) {
-          if (
-            share->monitoring_kind[roop_count] &&
-            need_mons[roop_count]
-          ) {
-            error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
-                share,
-                roop_count,
-                (uint32) share->monitoring_sid[roop_count],
-                share->table_name,
-                share->table_name_length,
-                conn_link_idx[roop_count],
-                NULL,
-                0,
-                share->monitoring_kind[roop_count],
-                share->monitoring_limit[roop_count],
-                share->monitoring_flag[roop_count],
-                TRUE
-              );
-          }
-          DBUG_PRINT("info",("spider get conn error"));
-          *conn_keys[0] = first_byte_bak;
-          conns[0] = NULL;
-          DBUG_RETURN(error_num);
-        }
-      }
-    }
-*/
+    wide_handler->trx = trx_bak;
     memset(need_mons, 0, sizeof(int) * share->link_count);
     memset(result_list.casual_read, 0, sizeof(int) * share->link_count);
     rm_bulk_tmp_table();
@@ -1798,19 +1500,12 @@ int ha_spider::reset()
       }
     }
   }
-  quick_mode = FALSE;
-  keyread = FALSE;
-  ignore_dup_key = FALSE;
-  write_can_replace = FALSE;
-  insert_with_update = FALSE;
-  low_priority = FALSE;
-  high_priority = FALSE;
-  insert_delayed = FALSE;
+  dml_inited = FALSE;
   use_pre_call = FALSE;
   use_pre_action = FALSE;
   pre_bitmap_checked = FALSE;
   bulk_insert = FALSE;
-  clone_bitmap_init = FALSE;
+  partition_handler->clone_bitmap_init = FALSE;
   result_list.tmp_table_join = FALSE;
   result_list.use_union = FALSE;
   result_list.use_both_key = FALSE;
@@ -1818,13 +1513,6 @@ int ha_spider::reset()
   conn_kinds = SPIDER_CONN_KIND_MYSQL;
   use_index_merge = FALSE;
   init_rnd_handler = FALSE;
-  while (condition)
-  {
-    tmp_cond = condition->next;
-    spider_free(spider_current_trx, condition, MYF(0));
-    condition = tmp_cond;
-  }
-  cond_check = FALSE;
 #ifdef HA_MRR_USE_DEFAULT_IMPL
   if (multi_range_keys)
   {
@@ -1848,10 +1536,6 @@ int ha_spider::reset()
 #endif
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
   do_direct_update = FALSE;
-  direct_update_fields = NULL;
-#endif
-#ifdef INFO_KIND_FORCE_LIMIT_BEGIN
-  info_limit = 9223372036854775807LL;
 #endif
   prev_index_rnd_init = SPD_NONE;
   result_list.have_sql_kind_backup = FALSE;
@@ -1889,55 +1573,64 @@ int ha_spider::extra(
   DBUG_ENTER("ha_spider::extra");
   DBUG_PRINT("info",("spider this=%p", this));
   DBUG_PRINT("info",("spider operation=%d", (int) operation));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  if (
+    wide_handler->stage == SPD_HND_STAGE_EXTRA &&
+    wide_handler->stage_executor != this)
+  {
+    DBUG_RETURN(0);
+  }
+  wide_handler->stage = SPD_HND_STAGE_EXTRA;
+  wide_handler->stage_executor = this;
+#endif
   switch (operation)
   {
     case HA_EXTRA_QUICK:
-      quick_mode = TRUE;
+      wide_handler->quick_mode = TRUE;
       break;
     case HA_EXTRA_KEYREAD:
       if (!is_clone)
       {
-        keyread = TRUE;
+        wide_handler->keyread = TRUE;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-        if (update_request)
+        if (wide_handler->update_request)
         {
           if (check_partitioned())
-            keyread = FALSE;
+            wide_handler->keyread = FALSE;
         }
 #endif
       }
       break;
     case HA_EXTRA_NO_KEYREAD:
-      keyread = FALSE;
+      wide_handler->keyread = FALSE;
       break;
     case HA_EXTRA_IGNORE_DUP_KEY:
-      ignore_dup_key = TRUE;
+      wide_handler->ignore_dup_key = TRUE;
       break;
     case HA_EXTRA_NO_IGNORE_DUP_KEY:
-      ignore_dup_key = FALSE;
+      wide_handler->ignore_dup_key = FALSE;
       break;
     case HA_EXTRA_WRITE_CAN_REPLACE:
-      write_can_replace = TRUE;
+      wide_handler->write_can_replace = TRUE;
       break;
     case HA_EXTRA_WRITE_CANNOT_REPLACE:
-      write_can_replace = FALSE;
+      wide_handler->write_can_replace = FALSE;
       break;
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
     case HA_EXTRA_INSERT_WITH_UPDATE:
-      insert_with_update = TRUE;
-      check_insert_dup_update_pushdown();
+      wide_handler->insert_with_update = TRUE;
       break;
 #endif
     case HA_EXTRA_ATTACH_CHILDREN:
       DBUG_PRINT("info",("spider HA_EXTRA_ATTACH_CHILDREN"));
-      if (!(trx = spider_get_trx(ha_thd(), TRUE, &error_num)))
+      if (!(wide_handler->trx = spider_get_trx(ha_thd(), TRUE, &error_num)))
         DBUG_RETURN(error_num);
       break;
 #if MYSQL_VERSION_ID < 50500
 #else
     case HA_EXTRA_ADD_CHILDREN_LIST:
       DBUG_PRINT("info",("spider HA_EXTRA_ADD_CHILDREN_LIST"));
-      if (!(trx = spider_get_trx(ha_thd(), TRUE, &error_num)))
+      if (!(wide_handler->trx = spider_get_trx(ha_thd(), TRUE, &error_num)))
         DBUG_RETURN(error_num);
       break;
 #endif
@@ -1960,7 +1653,8 @@ int ha_spider::extra(
           part_num < spider_user_defined_key_parts(key_info);
           key_part++, part_num++
         ) {
-          spider_set_bit(searched_bitmap, key_part->field->field_index);
+          spider_set_bit(wide_handler->searched_bitmap,
+            key_part->field->field_index);
         }
       } else {
         DBUG_PRINT("info",("spider need all columns"));
@@ -1970,7 +1664,7 @@ int ha_spider::extra(
           *field;
           field++
         ) {
-          spider_set_bit(searched_bitmap, (*field)->field_index);
+          spider_set_bit(wide_handler->searched_bitmap, (*field)->field_index);
         }
       }
       break;
@@ -2002,6 +1696,13 @@ int ha_spider::index_init(
     }
   }
 #endif
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   pushed_pos = NULL;
   active_index = idx;
   result_list.sorted = sorted;
@@ -2013,20 +1714,21 @@ int ha_spider::index_init(
   if (pre_bitmap_checked)
     pre_bitmap_checked = FALSE;
   else {
-    if (result_list.lock_type == F_WRLCK)
+    if (wide_handler->external_lock_type == F_WRLCK)
     {
       pk_update = FALSE;
 /*
       check_and_start_bulk_update(SPD_BU_START_BY_INDEX_OR_RND_INIT);
 */
       if (
-        update_request &&
+        wide_handler->update_request &&
         share->have_recovery_link &&
         (pk_update = spider_check_pk_update(table))
       ) {
         bitmap_set_all(table->read_set);
         if (is_clone)
-          memset(searched_bitmap, 0xFF, no_bytes_in_map(table->read_set));
+          memset(wide_handler->searched_bitmap, 0xFF,
+            no_bytes_in_map(table->read_set));
       }
     }
 
@@ -2120,7 +1822,7 @@ int ha_spider::index_read_map_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_read_map_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -2165,7 +1867,7 @@ int ha_spider::index_read_map_internal(
   DBUG_PRINT("info",("spider result_list.finish_flg = FALSE"));
   result_list.finish_flg = FALSE;
   result_list.record_num = 0;
-  if (keyread)
+  if (wide_handler->keyread)
     result_list.keyread = TRUE;
   else
     result_list.keyread = FALSE;
@@ -2280,7 +1982,8 @@ int ha_spider::index_read_map_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
     if (result_list.bgs_phase > 0)
     {
-      if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+      if ((error_num = spider_check_and_init_casual_read(
+        wide_handler->trx->thd, this,
         roop_count)))
         DBUG_RETURN(error_num);
       if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -2292,8 +1995,8 @@ int ha_spider::index_read_map_internal(
           need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              trx,
-              trx->thd,
+              wide_handler->trx,
+              wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -2362,7 +2065,7 @@ int ha_spider::index_read_map_internal(
         conn_kind[roop_count] != SPIDER_CONN_KIND_MYSQL
       ) {
         connection_ids[roop_count] = conn->connection_id;
-        spider_trx_add_bulk_access_conn(trx, conn);
+        spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
         SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
         pthread_mutex_unlock(&conn->mta_conn_mutex);
       } else {
@@ -2386,8 +2089,8 @@ int ha_spider::index_read_map_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -2405,7 +2108,7 @@ int ha_spider::index_read_map_internal(
           DBUG_RETURN(check_error_mode_eof(error_num));
         }
         spider_conn_set_timeout_from_share(conn, roop_count,
-          trx->thd, share);
+          wide_handler->trx->thd, share);
         if (dbton_hdl->execute_sql(
           sql_type,
           conn,
@@ -2422,8 +2125,8 @@ int ha_spider::index_read_map_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -2455,8 +2158,8 @@ int ha_spider::index_read_map_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -2645,7 +2348,7 @@ int ha_spider::index_read_last_map_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_read_last_map_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -2699,7 +2402,7 @@ int ha_spider::index_read_last_map_internal(
   DBUG_PRINT("info",("spider result_list.finish_flg = FALSE"));
   result_list.finish_flg = FALSE;
   result_list.record_num = 0;
-  if (keyread)
+  if (wide_handler->keyread)
     result_list.keyread = TRUE;
   else
     result_list.keyread = FALSE;
@@ -2790,7 +2493,8 @@ int ha_spider::index_read_last_map_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
     if (result_list.bgs_phase > 0)
     {
-      if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+      if ((error_num = spider_check_and_init_casual_read(
+        wide_handler->trx->thd, this,
         roop_count)))
         DBUG_RETURN(error_num);
       if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -2802,8 +2506,8 @@ int ha_spider::index_read_last_map_internal(
           need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              trx,
-              trx->thd,
+              wide_handler->trx,
+              wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -2869,7 +2573,7 @@ int ha_spider::index_read_last_map_internal(
       if (is_bulk_access_clone)
       {
         connection_ids[roop_count] = conn->connection_id;
-        spider_trx_add_bulk_access_conn(trx, conn);
+        spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
         SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
         pthread_mutex_unlock(&conn->mta_conn_mutex);
       } else {
@@ -2893,8 +2597,8 @@ int ha_spider::index_read_last_map_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -2912,7 +2616,7 @@ int ha_spider::index_read_last_map_internal(
           DBUG_RETURN(check_error_mode_eof(error_num));
         }
         spider_conn_set_timeout_from_share(conn, roop_count,
-          trx->thd, share);
+          wide_handler->trx->thd, share);
         if (dbton_hdl->execute_sql(
           sql_type,
           conn,
@@ -2929,8 +2633,8 @@ int ha_spider::index_read_last_map_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -2962,8 +2666,8 @@ int ha_spider::index_read_last_map_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -3046,7 +2750,7 @@ int ha_spider::index_next(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_next");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -3092,7 +2796,7 @@ int ha_spider::index_prev(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_prev");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -3139,7 +2843,7 @@ int ha_spider::index_first_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_first_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -3182,7 +2886,7 @@ int ha_spider::index_first_internal(
     DBUG_PRINT("info",("spider result_list.finish_flg = FALSE"));
     result_list.finish_flg = FALSE;
     result_list.record_num = 0;
-    if (keyread)
+    if (wide_handler->keyread)
       result_list.keyread = TRUE;
     else
       result_list.keyread = FALSE;
@@ -3274,7 +2978,8 @@ int ha_spider::index_first_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
       if (result_list.bgs_phase > 0)
       {
-        if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+        if ((error_num = spider_check_and_init_casual_read(
+          wide_handler->trx->thd, this,
           roop_count)))
           DBUG_RETURN(error_num);
         if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -3286,8 +2991,8 @@ int ha_spider::index_first_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -3354,7 +3059,7 @@ int ha_spider::index_first_internal(
         if (is_bulk_access_clone)
         {
           connection_ids[roop_count] = conn->connection_id;
-          spider_trx_add_bulk_access_conn(trx, conn);
+          spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
           SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
           pthread_mutex_unlock(&conn->mta_conn_mutex);
         } else {
@@ -3378,8 +3083,8 @@ int ha_spider::index_first_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -3397,7 +3102,7 @@ int ha_spider::index_first_internal(
             DBUG_RETURN(check_error_mode_eof(error_num));
           }
           spider_conn_set_timeout_from_share(conn, roop_count,
-            trx->thd, share);
+            wide_handler->trx->thd, share);
           if (dbton_hdl->execute_sql(
             sql_type,
             conn,
@@ -3414,8 +3119,8 @@ int ha_spider::index_first_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -3447,8 +3152,8 @@ int ha_spider::index_first_internal(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -3541,7 +3246,7 @@ int ha_spider::index_last_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_last_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -3584,7 +3289,7 @@ int ha_spider::index_last_internal(
     DBUG_PRINT("info",("spider result_list.finish_flg = FALSE"));
     result_list.finish_flg = FALSE;
     result_list.record_num = 0;
-    if (keyread)
+    if (wide_handler->keyread)
       result_list.keyread = TRUE;
     else
       result_list.keyread = FALSE;
@@ -3676,7 +3381,8 @@ int ha_spider::index_last_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
       if (result_list.bgs_phase > 0)
       {
-        if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+        if ((error_num = spider_check_and_init_casual_read(
+          wide_handler->trx->thd, this,
           roop_count)))
           DBUG_RETURN(error_num);
         if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -3688,8 +3394,8 @@ int ha_spider::index_last_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -3756,7 +3462,7 @@ int ha_spider::index_last_internal(
         if (is_bulk_access_clone)
         {
           connection_ids[roop_count] = conn->connection_id;
-          spider_trx_add_bulk_access_conn(trx, conn);
+          spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
           SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
           pthread_mutex_unlock(&conn->mta_conn_mutex);
         } else {
@@ -3780,8 +3486,8 @@ int ha_spider::index_last_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -3799,7 +3505,7 @@ int ha_spider::index_last_internal(
             DBUG_RETURN(check_error_mode_eof(error_num));
           }
           spider_conn_set_timeout_from_share(conn, roop_count,
-            trx->thd, share);
+            wide_handler->trx->thd, share);
           if (dbton_hdl->execute_sql(
             sql_type,
             conn,
@@ -3816,8 +3522,8 @@ int ha_spider::index_last_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -3849,8 +3555,8 @@ int ha_spider::index_last_internal(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -3944,7 +3650,7 @@ int ha_spider::index_next_same(
   backup_error_status();
   DBUG_ENTER("ha_spider::index_next_same");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -3996,7 +3702,7 @@ int ha_spider::read_range_first_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::read_range_first_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -4046,7 +3752,7 @@ int ha_spider::read_range_first_internal(
   DBUG_PRINT("info",("spider result_list.finish_flg = FALSE"));
   result_list.finish_flg = FALSE;
   result_list.record_num = 0;
-  if (keyread)
+  if (wide_handler->keyread)
     result_list.keyread = TRUE;
   else
     result_list.keyread = FALSE;
@@ -4138,7 +3844,8 @@ int ha_spider::read_range_first_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
     if (result_list.bgs_phase > 0)
     {
-      if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+      if ((error_num = spider_check_and_init_casual_read(
+        wide_handler->trx->thd, this,
         roop_count)))
         DBUG_RETURN(error_num);
       if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -4150,8 +3857,8 @@ int ha_spider::read_range_first_internal(
           need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              trx,
-              trx->thd,
+              wide_handler->trx,
+              wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -4217,7 +3924,7 @@ int ha_spider::read_range_first_internal(
       if (is_bulk_access_clone)
       {
         connection_ids[roop_count] = conn->connection_id;
-        spider_trx_add_bulk_access_conn(trx, conn);
+        spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
         SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
         pthread_mutex_unlock(&conn->mta_conn_mutex);
       } else {
@@ -4241,8 +3948,8 @@ int ha_spider::read_range_first_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -4260,7 +3967,7 @@ int ha_spider::read_range_first_internal(
           DBUG_RETURN(check_error_mode_eof(error_num));
         }
         spider_conn_set_timeout_from_share(conn, roop_count,
-          trx->thd, share);
+          wide_handler->trx->thd, share);
         if (dbton_hdl->execute_sql(
           sql_type,
           conn,
@@ -4277,8 +3984,8 @@ int ha_spider::read_range_first_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -4310,8 +4017,8 @@ int ha_spider::read_range_first_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -4401,7 +4108,7 @@ int ha_spider::read_range_next()
   backup_error_status();
   DBUG_ENTER("ha_spider::read_range_next");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -4511,17 +4218,18 @@ ha_rows ha_spider::multi_range_read_info_const(
   DBUG_PRINT("info",("spider this=%p", this));
   if (!pre_bitmap_checked)
   {
-    if (result_list.lock_type == F_WRLCK)
+    if (wide_handler->external_lock_type == F_WRLCK)
     {
       pk_update = FALSE;
       if (
-        update_request &&
+        wide_handler->update_request &&
         share->have_recovery_link &&
         (pk_update = spider_check_pk_update(table))
       ) {
         bitmap_set_all(table->read_set);
         if (is_clone)
-          memset(searched_bitmap, 0xFF, no_bytes_in_map(table->read_set));
+          memset(wide_handler->searched_bitmap, 0xFF,
+            no_bytes_in_map(table->read_set));
       }
     }
 
@@ -4575,17 +4283,18 @@ ha_rows ha_spider::multi_range_read_info(
   DBUG_PRINT("info",("spider this=%p", this));
   if (!pre_bitmap_checked)
   {
-    if (result_list.lock_type == F_WRLCK)
+    if (wide_handler->external_lock_type == F_WRLCK)
     {
       pk_update = FALSE;
       if (
-        update_request &&
+        wide_handler->update_request &&
         share->have_recovery_link &&
         (pk_update = spider_check_pk_update(table))
       ) {
         bitmap_set_all(table->read_set);
         if (is_clone)
-          memset(searched_bitmap, 0xFF, no_bytes_in_map(table->read_set));
+          memset(wide_handler->searched_bitmap, 0xFF,
+            no_bytes_in_map(table->read_set));
       }
     }
 
@@ -4620,7 +4329,7 @@ int ha_spider::multi_range_read_init(
   uint mode,
   HANDLER_BUFFER *buf
 ) {
-  bka_mode = spider_param_bka_mode(trx->thd, share->bka_mode);
+  bka_mode = spider_param_bka_mode(wide_handler->trx->thd, share->bka_mode);
   backup_error_status();
   DBUG_ENTER("ha_spider::multi_range_read_init");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -4668,12 +4377,12 @@ int ha_spider::read_multi_range_first_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::multi_range_read_next_first");
 #else
-  bka_mode = spider_param_bka_mode(trx->thd, share->bka_mode);
+  bka_mode = spider_param_bka_mode(wide_handler->trx->thd, share->bka_mode);
   backup_error_status();
   DBUG_ENTER("ha_spider::read_multi_range_first_internal");
 #endif
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -4728,7 +4437,7 @@ int ha_spider::read_multi_range_first_internal(
     result_list.multi_split_read <= 1 ||
     (sql_kinds & SPIDER_SQL_KIND_HANDLER)
   ) {
-    if (keyread)
+    if (wide_handler->keyread)
       result_list.keyread = TRUE;
     else
       result_list.keyread = FALSE;
@@ -4843,7 +4552,8 @@ int ha_spider::read_multi_range_first_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
         if (result_list.bgs_phase > 0)
         {
-          if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+          if ((error_num = spider_check_and_init_casual_read(
+            wide_handler->trx->thd, this,
             roop_count)))
             DBUG_RETURN(error_num);
           error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -4855,8 +4565,8 @@ int ha_spider::read_multi_range_first_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -4921,7 +4631,7 @@ int ha_spider::read_multi_range_first_internal(
           if (is_bulk_access_clone)
           {
             connection_ids[roop_count] = conn->connection_id;
-            spider_trx_add_bulk_access_conn(trx, conn);
+            spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
             SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
             pthread_mutex_unlock(&conn->mta_conn_mutex);
 #ifdef HA_MRR_USE_DEFAULT_IMPL
@@ -4951,8 +4661,8 @@ int ha_spider::read_multi_range_first_internal(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -4971,7 +4681,7 @@ int ha_spider::read_multi_range_first_internal(
             if (!error_num)
             {
               spider_conn_set_timeout_from_share(conn, roop_count,
-                trx->thd, share);
+                wide_handler->trx->thd, share);
               if (dbton_hdl->execute_sql(
                 sql_type,
                 conn,
@@ -4988,8 +4698,8 @@ int ha_spider::read_multi_range_first_internal(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -5023,8 +4733,8 @@ int ha_spider::read_multi_range_first_internal(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -5127,7 +4837,7 @@ int ha_spider::read_multi_range_first_internal(
     if (error_num)
       DBUG_RETURN(check_error_mode_eof(error_num));
   } else {
-    bool tmp_high_priority = high_priority;
+    bool tmp_high_priority = wide_handler->high_priority;
     bool have_multi_range;
 #ifdef HA_MRR_USE_DEFAULT_IMPL
     const uchar *first_mrr_start_key;
@@ -5136,7 +4846,7 @@ int ha_spider::read_multi_range_first_internal(
     uint first_mrr_end_key_length;
     have_second_range = FALSE;
 #endif
-    if (keyread)
+    if (wide_handler->keyread)
       result_list.keyread = TRUE;
     else
       result_list.keyread = FALSE;
@@ -5522,7 +5232,7 @@ int ha_spider::read_multi_range_first_internal(
             (error_num = set_union_table_name_pos_sql())
           )
             DBUG_RETURN(error_num);
-          high_priority = FALSE;
+          wide_handler->high_priority = FALSE;
           if (
             share->key_hint &&
             (error_num = append_hint_after_table_sql_part(
@@ -5606,7 +5316,7 @@ int ha_spider::read_multi_range_first_internal(
 #ifdef HA_MRR_USE_DEFAULT_IMPL
         while (!range_res);
 #endif
-        high_priority = tmp_high_priority;
+        wide_handler->high_priority = tmp_high_priority;
         if ((error_num = append_union_all_end_sql_part(
           SPIDER_SQL_TYPE_SELECT_SQL)))
           DBUG_RETURN(error_num);
@@ -5665,7 +5375,8 @@ int ha_spider::read_multi_range_first_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
         if (result_list.bgs_phase > 0)
         {
-          if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+          if ((error_num = spider_check_and_init_casual_read(
+            wide_handler->trx->thd, this,
             roop_count)))
             DBUG_RETURN(error_num);
           if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -5677,8 +5388,8 @@ int ha_spider::read_multi_range_first_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -5746,7 +5457,7 @@ int ha_spider::read_multi_range_first_internal(
           if (is_bulk_access_clone)
           {
             connection_ids[roop_count] = conn->connection_id;
-            spider_trx_add_bulk_access_conn(trx, conn);
+            spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
             SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
             pthread_mutex_unlock(&conn->mta_conn_mutex);
 #ifdef HA_MRR_USE_DEFAULT_IMPL
@@ -5776,8 +5487,8 @@ int ha_spider::read_multi_range_first_internal(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -5802,7 +5513,7 @@ int ha_spider::read_multi_range_first_internal(
               spider_set_bit(result_list.tmp_table_created, roop_count);
               result_list.tmp_tables_created = TRUE;
               spider_conn_set_timeout_from_share(conn, roop_count,
-                trx->thd, share);
+                wide_handler->trx->thd, share);
               if (dbton_hdl->execute_sql(
                 SPIDER_SQL_TYPE_TMP_SQL,
                 conn,
@@ -5819,8 +5530,8 @@ int ha_spider::read_multi_range_first_internal(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -5840,7 +5551,7 @@ int ha_spider::read_multi_range_first_internal(
               spider_db_discard_multiple_result(this, roop_count, conn);
             }
             spider_conn_set_timeout_from_share(conn, roop_count,
-              trx->thd, share);
+              wide_handler->trx->thd, share);
             if (dbton_hdl->execute_sql(
               sql_type,
               conn,
@@ -5857,8 +5568,8 @@ int ha_spider::read_multi_range_first_internal(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -5890,8 +5601,8 @@ int ha_spider::read_multi_range_first_internal(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -6179,7 +5890,7 @@ int ha_spider::read_multi_range_next(
   DBUG_ENTER("ha_spider::read_multi_range_next");
 #endif
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -6343,7 +6054,8 @@ int ha_spider::read_multi_range_next(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
         if (result_list.bgs_phase > 0)
         {
-          if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+          if ((error_num = spider_check_and_init_casual_read(
+            wide_handler->trx->thd, this,
             roop_count)))
             DBUG_RETURN(error_num);
           error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -6355,8 +6067,8 @@ int ha_spider::read_multi_range_next(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -6421,7 +6133,7 @@ int ha_spider::read_multi_range_next(
           if (is_bulk_access_clone)
           {
             connection_ids[roop_count] = conn->connection_id;
-            spider_trx_add_bulk_access_conn(trx, conn);
+            spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
             SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
             pthread_mutex_unlock(&conn->mta_conn_mutex);
 #ifdef HA_MRR_USE_DEFAULT_IMPL
@@ -6451,8 +6163,8 @@ int ha_spider::read_multi_range_next(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -6471,7 +6183,7 @@ int ha_spider::read_multi_range_next(
             if (!error_num)
             {
               spider_conn_set_timeout_from_share(conn, roop_count,
-                trx->thd, share);
+                wide_handler->trx->thd, share);
               if (dbton_hdl->execute_sql(
                 sql_type,
                 conn,
@@ -6488,8 +6200,8 @@ int ha_spider::read_multi_range_next(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -6523,8 +6235,8 @@ int ha_spider::read_multi_range_next(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -6724,7 +6436,7 @@ int ha_spider::read_multi_range_next(
     multi_range_ranges = multi_range_curr;
 #endif
 
-    bool tmp_high_priority = high_priority;
+    bool tmp_high_priority = wide_handler->high_priority;
     bool have_multi_range;
     multi_range_cnt = 0;
     error_num = 0;
@@ -7032,7 +6744,7 @@ int ha_spider::read_multi_range_next(
             (error_num = set_union_table_name_pos_sql())
           )
             DBUG_RETURN(error_num);
-          high_priority = FALSE;
+          wide_handler->high_priority = FALSE;
           if (
             share->key_hint &&
             (error_num = append_hint_after_table_sql_part(
@@ -7102,7 +6814,7 @@ int ha_spider::read_multi_range_next(
 #ifdef HA_MRR_USE_DEFAULT_IMPL
         while (!range_res);
 #endif
-        high_priority = tmp_high_priority;
+        wide_handler->high_priority = tmp_high_priority;
         if ((error_num =
           append_union_all_end_sql_part(SPIDER_SQL_TYPE_SELECT_SQL)))
           DBUG_RETURN(error_num);
@@ -7160,7 +6872,8 @@ int ha_spider::read_multi_range_next(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
         if (result_list.bgs_phase > 0)
         {
-          if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+          if ((error_num = spider_check_and_init_casual_read(
+            wide_handler->trx->thd, this,
             roop_count)))
             DBUG_RETURN(error_num);
           if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -7172,8 +6885,8 @@ int ha_spider::read_multi_range_next(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -7241,7 +6954,7 @@ int ha_spider::read_multi_range_next(
           if (is_bulk_access_clone)
           {
             connection_ids[roop_count] = conn->connection_id;
-            spider_trx_add_bulk_access_conn(trx, conn);
+            spider_trx_add_bulk_access_conn(wide_handler->trx, conn);
             SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
             pthread_mutex_unlock(&conn->mta_conn_mutex);
 #ifdef HA_MRR_USE_DEFAULT_IMPL
@@ -7271,8 +6984,8 @@ int ha_spider::read_multi_range_next(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -7297,7 +7010,7 @@ int ha_spider::read_multi_range_next(
               spider_set_bit(result_list.tmp_table_created, roop_count);
               result_list.tmp_tables_created = TRUE;
               spider_conn_set_timeout_from_share(conn, roop_count,
-                trx->thd, share);
+                wide_handler->trx->thd, share);
               if (dbton_hdl->execute_sql(
                 SPIDER_SQL_TYPE_TMP_SQL,
                 conn,
@@ -7314,8 +7027,8 @@ int ha_spider::read_multi_range_next(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -7335,7 +7048,7 @@ int ha_spider::read_multi_range_next(
               spider_db_discard_multiple_result(this, roop_count, conn);
             }
             spider_conn_set_timeout_from_share(conn, roop_count,
-              trx->thd, share);
+              wide_handler->trx->thd, share);
             if (dbton_hdl->execute_sql(
               sql_type,
               conn,
@@ -7352,8 +7065,8 @@ int ha_spider::read_multi_range_next(
                 need_mons[roop_count]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     roop_count,
                     (uint32) share->monitoring_sid[roop_count],
@@ -7385,8 +7098,8 @@ int ha_spider::read_multi_range_next(
                   need_mons[roop_count]
                 ) {
                   error_num = spider_ping_table_mon_from_table(
-                      trx,
-                      trx->thd,
+                      wide_handler->trx,
+                      wide_handler->trx->thd,
                       share,
                       roop_count,
                       (uint32) share->monitoring_sid[roop_count],
@@ -7549,15 +7262,22 @@ int ha_spider::rnd_init(
   DBUG_ENTER("ha_spider::rnd_init");
   DBUG_PRINT("info",("spider this=%p", this));
   DBUG_PRINT("info",("spider scan=%s", scan ? "TRUE" : "FALSE"));
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   pushed_pos = NULL;
 /*
-  if (result_list.lock_type == F_WRLCK)
+  if (wide_handler->external_lock_type == F_WRLCK)
     check_and_start_bulk_update(SPD_BU_START_BY_INDEX_OR_RND_INIT);
 */
   rnd_scan_and_first = scan;
   if (
     scan &&
-    sql_command != SQLCOM_ALTER_TABLE
+    wide_handler->sql_command != SQLCOM_ALTER_TABLE
   ) {
     spider_set_result_list_param(this);
     pk_update = FALSE;
@@ -7622,14 +7342,15 @@ int ha_spider::rnd_init(
       use_spatial_index = FALSE;
 
       if (
-        update_request &&
+        wide_handler->update_request &&
         share->have_recovery_link &&
-        result_list.lock_type == F_WRLCK &&
+        wide_handler->external_lock_type == F_WRLCK &&
         (pk_update = spider_check_pk_update(table))
       ) {
         bitmap_set_all(table->read_set);
         if (is_clone)
-          memset(searched_bitmap, 0xFF, no_bytes_in_map(table->read_set));
+          memset(wide_handler->searched_bitmap, 0xFF,
+            no_bytes_in_map(table->read_set));
       }
 
       set_select_column_mode();
@@ -7699,17 +7420,17 @@ int ha_spider::rnd_next_internal(
 ) {
   int error_num;
   ha_spider *direct_limit_offset_spider =
-    (ha_spider *) partition_handler_share->creator;
+    (ha_spider *) partition_handler->owner;
   backup_error_status();
   DBUG_ENTER("ha_spider::rnd_next_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
   }
   /* do not copy table data at alter table */
-  if (sql_command == SQLCOM_ALTER_TABLE)
+  if (wide_handler->sql_command == SQLCOM_ALTER_TABLE)
     DBUG_RETURN(HA_ERR_END_OF_FILE);
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
   do_direct_update = FALSE;
@@ -7739,7 +7460,7 @@ int ha_spider::rnd_next_internal(
         DBUG_RETURN(check_error_mode_eof(HA_ERR_END_OF_FILE));
       }
       if (
-        partition_handler_share->handlers &&
+        partition_handler->handlers &&
         direct_limit_offset_spider->direct_current_offset > 0
       ) {
         longlong table_count = this->records();
@@ -7855,7 +7576,8 @@ int ha_spider::rnd_next_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
       if (result_list.bgs_phase > 0)
       {
-        if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+        if ((error_num = spider_check_and_init_casual_read(
+          wide_handler->trx->thd, this,
           roop_count)))
           DBUG_RETURN(error_num);
         if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -7867,8 +7589,8 @@ int ha_spider::rnd_next_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -7937,8 +7659,8 @@ int ha_spider::rnd_next_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -7956,7 +7678,7 @@ int ha_spider::rnd_next_internal(
           DBUG_RETURN(check_error_mode_eof(error_num));
         }
         spider_conn_set_timeout_from_share(conn, roop_count,
-          trx->thd, share);
+          wide_handler->trx->thd, share);
         if (dbton_hdl->execute_sql(
           sql_type,
           conn,
@@ -7973,8 +7695,8 @@ int ha_spider::rnd_next_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8006,8 +7728,8 @@ int ha_spider::rnd_next_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -8124,15 +7846,15 @@ void ha_spider::position(
     DBUG_PRINT("info",("spider self position"));
     DBUG_PRINT("info",
       ("spider current_row_num=%lld", result_list.current_row_num));
-    if (!position_bitmap_init)
+    if (!wide_handler->position_bitmap_init)
     {
       if (select_column_mode)
       {
         spider_db_handler *dbton_hdl =
           dbton_handler[result_list.current->dbton_id];
-        dbton_hdl->copy_minimum_select_bitmap(position_bitmap);
+        dbton_hdl->copy_minimum_select_bitmap(wide_handler->position_bitmap);
       }
-      position_bitmap_init = TRUE;
+      wide_handler->position_bitmap_init = TRUE;
     }
     spider_db_create_position(this, (SPIDER_POSITION *) ref);
   }
@@ -8154,7 +7876,7 @@ int ha_spider::rnd_pos(
       ((uchar *) table->read_set->bitmap)[roop_count]));
   }
 #endif
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -8370,7 +8092,7 @@ int ha_spider::ft_read_internal(
   backup_error_status();
   DBUG_ENTER("ha_spider::ft_read_internal");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (trx->thd->killed)
+  if (wide_handler->trx->thd->killed)
   {
     my_error(ER_QUERY_INTERRUPTED, MYF(0));
     DBUG_RETURN(ER_QUERY_INTERRUPTED);
@@ -8390,7 +8112,7 @@ int ha_spider::ft_read_internal(
     DBUG_PRINT("info",("spider result_list.finish_flg = FALSE"));
     result_list.finish_flg = FALSE;
     result_list.record_num = 0;
-    if (keyread)
+    if (wide_handler->keyread)
       result_list.keyread = TRUE;
     else
       result_list.keyread = FALSE;
@@ -8491,7 +8213,8 @@ int ha_spider::ft_read_internal(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
       if (result_list.bgs_phase > 0)
       {
-        if ((error_num = spider_check_and_init_casual_read(trx->thd, this,
+        if ((error_num = spider_check_and_init_casual_read(
+          wide_handler->trx->thd, this,
           roop_count)))
           DBUG_RETURN(error_num);
         if ((error_num = spider_bg_conn_search(this, roop_count, roop_start,
@@ -8503,8 +8226,8 @@ int ha_spider::ft_read_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8568,8 +8291,8 @@ int ha_spider::ft_read_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8586,7 +8309,8 @@ int ha_spider::ft_read_internal(
           }
           DBUG_RETURN(check_error_mode_eof(error_num));
         }
-        spider_conn_set_timeout_from_share(conn, roop_count, trx->thd, share);
+        spider_conn_set_timeout_from_share(conn, roop_count,
+          wide_handler->trx->thd, share);
         if (dbton_hdl->execute_sql(
           SPIDER_SQL_TYPE_SELECT_SQL,
           conn,
@@ -8603,8 +8327,8 @@ int ha_spider::ft_read_internal(
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8636,8 +8360,8 @@ int ha_spider::ft_read_internal(
               need_mons[roop_count]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -8736,7 +8460,7 @@ int ha_spider::info(
 #ifdef HANDLER_HAS_CAN_USE_FOR_AUTO_INC_INIT
   auto_inc_temporary = FALSE;
 #endif
-  sql_command = thd_sql_command(thd);
+  wide_handler->sql_command = thd_sql_command(thd);
 /*
   if (
     sql_command == SQLCOM_DROP_TABLE ||
@@ -8757,8 +8481,8 @@ int ha_spider::info(
       }
     }
     if (
-      sql_command == SQLCOM_DROP_TABLE ||
-      sql_command == SQLCOM_ALTER_TABLE
+      wide_handler->sql_command == SQLCOM_DROP_TABLE ||
+      wide_handler->sql_command == SQLCOM_ALTER_TABLE
     )
       DBUG_RETURN(0);
 /*
@@ -8789,7 +8513,7 @@ int ha_spider::info(
         pthread_mutex_unlock(&share->sts_mutex);
       else {
         if ((spider_init_error_table =
-          spider_get_init_error_table(trx, share, FALSE)))
+          spider_get_init_error_table(wide_handler->trx, share, FALSE)))
         {
           DBUG_PRINT("info",("spider diff=%f",
             difftime(tmp_time, spider_init_error_table->init_error_time)));
@@ -8798,8 +8522,8 @@ int ha_spider::info(
             spider_param_table_init_error_interval())
           {
             pthread_mutex_unlock(&share->sts_mutex);
-            if (sql_command == SQLCOM_SHOW_CREATE ||
-                sql_command == SQLCOM_SHOW_FIELDS)
+            if (wide_handler->sql_command == SQLCOM_SHOW_CREATE ||
+                wide_handler->sql_command == SQLCOM_SHOW_FIELDS)
             {
               if (thd->is_error())
               {
@@ -8825,9 +8549,7 @@ int ha_spider::info(
     if (flag & HA_STATUS_AUTO)
     {
       if (
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-        share->partition_share &&
-#endif
+        share->wide_share &&
         tmp_auto_increment_mode == 1 &&
         !share->lgtm_tblhnd_share->auto_increment_init
       ) {
@@ -8860,7 +8582,8 @@ int ha_spider::info(
                 if (
                   spider_init_error_table ||
                   (spider_init_error_table =
-                    spider_get_init_error_table(trx, share, TRUE))
+                    spider_get_init_error_table(wide_handler->trx,
+                      share, TRUE))
                 ) {
                   spider_init_error_table->init_error = error_num;
                   if ((spider_init_error_table->init_error_with_message =
@@ -8873,8 +8596,8 @@ int ha_spider::info(
                 share->init_error = TRUE;
                 share->init = TRUE;
               }
-              if (sql_command == SQLCOM_SHOW_CREATE ||
-                  sql_command == SQLCOM_SHOW_FIELDS)
+              if (wide_handler->sql_command == SQLCOM_SHOW_CREATE ||
+                  wide_handler->sql_command == SQLCOM_SHOW_FIELDS)
               {
                 if (thd->is_error())
                 {
@@ -8899,8 +8622,8 @@ int ha_spider::info(
                 need_mons[search_link_idx]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     search_link_idx,
                     (uint32) share->monitoring_sid[search_link_idx],
@@ -8920,7 +8643,8 @@ int ha_spider::info(
                 if (
                   spider_init_error_table ||
                   (spider_init_error_table =
-                    spider_get_init_error_table(trx, share, TRUE))
+                    spider_get_init_error_table(wide_handler->trx,
+                      share, TRUE))
                 ) {
                   spider_init_error_table->init_error = error_num;
 /*
@@ -8937,8 +8661,8 @@ int ha_spider::info(
                 share->init_error = TRUE;
                 share->init = TRUE;
               }
-              if (sql_command == SQLCOM_SHOW_CREATE ||
-                  sql_command == SQLCOM_SHOW_FIELDS)
+              if (wide_handler->sql_command == SQLCOM_SHOW_CREATE ||
+                  wide_handler->sql_command == SQLCOM_SHOW_FIELDS)
               {
                 if (thd->is_error())
                 {
@@ -8967,8 +8691,8 @@ int ha_spider::info(
               if ((error_num = spider_create_sts_thread(share)))
               {
                 pthread_mutex_unlock(&share->sts_mutex);
-                if (sql_command == SQLCOM_SHOW_CREATE ||
-                    sql_command == SQLCOM_SHOW_FIELDS)
+                if (wide_handler->sql_command == SQLCOM_SHOW_CREATE ||
+                    wide_handler->sql_command == SQLCOM_SHOW_FIELDS)
                 {
                   if (thd->is_error())
                   {
@@ -8999,8 +8723,8 @@ int ha_spider::info(
     {
       if ((error_num = check_crd()))
       {
-        if (sql_command == SQLCOM_SHOW_CREATE ||
-            sql_command == SQLCOM_SHOW_FIELDS)
+        if (wide_handler->sql_command == SQLCOM_SHOW_CREATE ||
+            wide_handler->sql_command == SQLCOM_SHOW_FIELDS)
         {
           if (thd->is_error())
           {
@@ -9039,8 +8763,7 @@ int ha_spider::info(
 #ifdef HANDLER_HAS_CAN_USE_FOR_AUTO_INC_INIT
       auto_inc_temporary = FALSE;
 #endif
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-      if (share->partition_share && table->next_number_field)
+      if (share->wide_share && table->next_number_field)
       {
         ulonglong first_value, nb_reserved_values;
         if (
@@ -9077,12 +8800,9 @@ int ha_spider::info(
             share->lgtm_tblhnd_share->auto_increment_value;
         }
       } else {
-#endif
         stats.auto_increment_value =
           share->lgtm_tblhnd_share->auto_increment_value;
-#ifdef WITH_PARTITION_STORAGE_ENGINE
       }
-#endif
     }
   }
   if (flag & HA_STATUS_ERRKEY)
@@ -9092,9 +8812,10 @@ int ha_spider::info(
 
 ha_rows ha_spider::records_in_range(
   uint inx,
-  key_range *start_key,
-  key_range *end_key
-) {
+  const key_range *start_key,
+  const key_range *end_key,
+  page_range *pages)
+{
   int error_num;
   THD *thd = ha_thd();
   double crd_interval = spider_param_crd_interval(thd, share->crd_interval);
@@ -9120,7 +8841,7 @@ ha_rows ha_spider::records_in_range(
       pthread_mutex_unlock(&share->crd_mutex);
     else {
       if ((spider_init_error_table =
-        spider_get_init_error_table(trx, share, FALSE)))
+        spider_get_init_error_table(wide_handler->trx, share, FALSE)))
       {
         DBUG_PRINT("info",("spider diff=%f",
           difftime(tmp_time, spider_init_error_table->init_error_time)));
@@ -9158,6 +8879,15 @@ ha_rows ha_spider::records_in_range(
       share->static_key_cardinality[inx] == -1 &&
       difftime(tmp_time, share->crd_get_time) >= crd_interval
     ) {
+      if (!dml_inited)
+      {
+        if (unlikely((error_num = dml_init())))
+        {
+          if (check_error_mode(error_num))
+            my_errno = error_num;
+          DBUG_RETURN(HA_POS_ERROR);
+        }
+      }
       if (
         crd_interval == 0 ||
         !pthread_mutex_trylock(&share->crd_mutex)
@@ -9183,8 +8913,8 @@ ha_rows ha_spider::records_in_range(
                 need_mons[search_link_idx]
               ) {
                 error_num = spider_ping_table_mon_from_table(
-                    trx,
-                    trx->thd,
+                    wide_handler->trx,
+                    wide_handler->trx->thd,
                     share,
                     search_link_idx,
                     (uint32) share->monitoring_sid[search_link_idx],
@@ -9204,7 +8934,8 @@ ha_rows ha_spider::records_in_range(
                 if (
                   spider_init_error_table ||
                   (spider_init_error_table =
-                    spider_get_init_error_table(trx, share, TRUE))
+                    spider_get_init_error_table(wide_handler->trx,
+                      share, TRUE))
                 ) {
                   spider_init_error_table->init_error = error_num;
 /*
@@ -9392,6 +9123,15 @@ ha_rows ha_spider::records_in_range(
     DBUG_RETURN((ha_rows) rows);
   } else if (crd_mode == 3)
   {
+    if (!dml_inited)
+    {
+      if (unlikely((error_num = dml_init())))
+      {
+        if (check_error_mode(error_num))
+          my_errno = error_num;
+        DBUG_RETURN(HA_POS_ERROR);
+      }
+    }
     result_list.key_info = &table->key_info[inx];
     DBUG_RETURN(spider_db_explain_select(start_key, end_key, this,
       search_link_idx));
@@ -9424,7 +9164,7 @@ int ha_spider::check_crd()
       pthread_mutex_unlock(&share->crd_mutex);
     else {
       if ((spider_init_error_table =
-        spider_get_init_error_table(trx, share, FALSE)))
+        spider_get_init_error_table(wide_handler->trx, share, FALSE)))
       {
         DBUG_PRINT("info",("spider diff=%f",
           difftime(tmp_time, spider_init_error_table->init_error_time)));
@@ -9483,8 +9223,8 @@ int ha_spider::check_crd()
               need_mons[search_link_idx]
             ) {
               error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   search_link_idx,
                   (uint32) share->monitoring_sid[search_link_idx],
@@ -9504,7 +9244,7 @@ int ha_spider::check_crd()
               if (
                 spider_init_error_table ||
                 (spider_init_error_table =
-                  spider_get_init_error_table(trx, share, TRUE))
+                  spider_get_init_error_table(wide_handler->trx, share, TRUE))
               ) {
                 spider_init_error_table->init_error = error_num;
                 if ((spider_init_error_table->init_error_with_message =
@@ -9564,7 +9304,7 @@ int ha_spider::pre_records()
   backup_error_status();
   DBUG_ENTER("ha_spider::pre_records");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (sql_command == SQLCOM_ALTER_TABLE)
+  if (wide_handler->sql_command == SQLCOM_ALTER_TABLE)
   {
     DBUG_RETURN(0);
   }
@@ -9572,7 +9312,7 @@ int ha_spider::pre_records()
   {
     DBUG_RETURN(0);
   }
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   if (
     spider_param_sync_autocommit(thd) &&
     (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
@@ -9595,7 +9335,7 @@ ha_rows ha_spider::records()
   backup_error_status();
   DBUG_ENTER("ha_spider::records");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (sql_command == SQLCOM_ALTER_TABLE)
+  if (wide_handler->sql_command == SQLCOM_ALTER_TABLE)
   {
     use_pre_action = FALSE;
     DBUG_RETURN(0);
@@ -9606,7 +9346,7 @@ ha_rows ha_spider::records()
   }
   if (!use_pre_action && !this->result_list.direct_limit_offset)
   {
-    THD *thd = trx->thd;
+    THD *thd = wide_handler->trx->thd;
     if (
       spider_param_sync_autocommit(thd) &&
       (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
@@ -9634,7 +9374,14 @@ int ha_spider::pre_calculate_checksum()
   backup_error_status();
   DBUG_ENTER("ha_spider::pre_calculate_checksum");
   DBUG_PRINT("info",("spider this=%p", this));
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   if (
     spider_param_sync_autocommit(thd) &&
     (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
@@ -9658,9 +9405,16 @@ int ha_spider::calculate_checksum()
   backup_error_status();
   DBUG_ENTER("ha_spider::calculate_checksum");
   DBUG_PRINT("info",("spider this=%p", this));
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   if (!use_pre_action && !this->result_list.direct_limit_offset)
   {
-    THD *thd = trx->thd;
+    THD *thd = wide_handler->trx->thd;
     if (
       spider_param_sync_autocommit(thd) &&
       (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
@@ -9750,6 +9504,36 @@ ulonglong ha_spider::table_flags() const
   DBUG_RETURN(flags);
 }
 
+ulong ha_spider::table_flags_for_partition()
+{
+  DBUG_ENTER("ha_spider::table_flags_for_partition");
+  DBUG_PRINT("info",("spider this=%p", this));
+  ulong flags =
+#ifdef HA_PT_CALL_AT_ONCE_STORE_LOCK
+    HA_PT_CALL_AT_ONCE_STORE_LOCK |
+#endif
+#ifdef HA_PT_CALL_AT_ONCE_EXTERNAL_LOCK
+    HA_PT_CALL_AT_ONCE_EXTERNAL_LOCK |
+#endif
+#ifdef HA_PT_CALL_AT_ONCE_START_STMT
+    HA_PT_CALL_AT_ONCE_START_STMT |
+#endif
+#ifdef HA_PT_CALL_AT_ONCE_EXTRA
+    HA_PT_CALL_AT_ONCE_EXTRA |
+#endif
+#ifdef HA_PT_CALL_AT_ONCE_COND_PUSH
+    HA_PT_CALL_AT_ONCE_COND_PUSH |
+#endif
+#ifdef HA_PT_CALL_AT_ONCE_INFO_PUSH
+    HA_PT_CALL_AT_ONCE_INFO_PUSH |
+#endif
+#ifdef HA_PT_CALL_AT_ONCE_TOP_TABLE
+    HA_PT_CALL_AT_ONCE_TOP_TABLE |
+#endif
+    0;
+  DBUG_RETURN(flags);
+}
+
 const char *ha_spider::index_type(
   uint key_number
 ) {
@@ -9877,7 +9661,7 @@ int ha_spider::update_auto_increment()
 /*
   if (
     next_insert_id >= auto_inc_interval_for_cur_row.maximum() &&
-    trx->thd->auto_inc_intervals_forced.get_current()
+    wide_handler->trx->thd->auto_inc_intervals_forced.get_current()
   ) {
     force_auto_increment = TRUE;
     DBUG_PRINT("info",("spider force_auto_increment=TRUE"));
@@ -10101,6 +9885,13 @@ int ha_spider::write_row(
     DBUG_RETURN(error_num);
   }
 #endif
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
 #ifndef SPIDER_WITHOUT_HA_STATISTIC_INCREMENT
   ha_statistic_increment(&SSV::ha_write_count);
 #endif
@@ -10163,19 +9954,21 @@ int ha_spider::write_row(
   if (!bulk_insert || bulk_size < 0)
   {
     direct_dup_insert =
-      spider_param_direct_dup_insert(trx->thd, share->direct_dup_insert);
+      spider_param_direct_dup_insert(wide_handler->trx->thd,
+        share->direct_dup_insert);
     DBUG_PRINT("info",("spider direct_dup_insert=%d", direct_dup_insert));
     if ((error_num = spider_db_bulk_insert_init(this, table)))
       DBUG_RETURN(check_error_mode(error_num));
     if (bulk_insert)
       bulk_size =
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
-        (insert_with_update && !result_list.insert_dup_update_pushdown) ||
+        (wide_handler->insert_with_update &&
+          !result_list.insert_dup_update_pushdown) ||
 #else
-        insert_with_update ||
+        wide_handler->insert_with_update ||
 #endif
-        (!direct_dup_insert && ignore_dup_key) ?
-        0 : spider_param_bulk_size(trx->thd, share->bulk_size);
+        (!direct_dup_insert && wide_handler->ignore_dup_key) ?
+        0 : spider_param_bulk_size(wide_handler->trx->thd, share->bulk_size);
     else
       bulk_size = 0;
   }
@@ -10199,7 +9992,7 @@ int ha_spider::pre_write_row(
 ) {
   int error_num;
   ulonglong option_backup = 0;
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::pre_write_row");
   DBUG_PRINT("info",("spider this=%p", this));
 #if MYSQL_VERSION_ID < 50500
@@ -10411,6 +10204,28 @@ int ha_spider::update_row(
 }
 
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+bool ha_spider::check_direct_update_sql_part(
+  st_select_lex *select_lex,
+  longlong select_limit,
+  longlong offset_limit
+) {
+  uint roop_count, dbton_id;
+  spider_db_handler *dbton_hdl;
+  DBUG_ENTER("ha_spider::check_direct_update_sql_part");
+  for (roop_count = 0; roop_count < share->use_sql_dbton_count; roop_count++)
+  {
+    dbton_id = share->use_sql_dbton_ids[roop_count];
+    dbton_hdl = dbton_handler[dbton_id];
+    if (
+      dbton_hdl->first_link_idx >= 0 &&
+      dbton_hdl->check_direct_update(select_lex, select_limit, offset_limit)
+    ) {
+      DBUG_RETURN(TRUE);
+    }
+  }
+  DBUG_RETURN(FALSE);
+}
+
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS_WITH_HS
 #ifdef SPIDER_MDEV_16246
 int ha_spider::direct_update_rows_init(
@@ -10437,7 +10252,7 @@ int ha_spider::direct_update_rows_init(
   st_select_lex *select_lex;
   longlong select_limit;
   longlong offset_limit;
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::direct_update_rows_init");
   DBUG_PRINT("info",("spider this=%p", this));
 #ifdef HA_CAN_BULK_ACCESS
@@ -10466,6 +10281,13 @@ int ha_spider::direct_update_rows_init(
 #endif
   }
 #endif
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   direct_update_init(
     thd,
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
@@ -10494,6 +10316,7 @@ int ha_spider::direct_update_rows_init(
       !select_lex ||
       select_lex->table_list.elements != 1 ||
       check_update_columns_sql_part() ||
+      check_direct_update_sql_part(select_lex, select_limit, offset_limit) ||
       spider_db_append_condition(this, NULL, 0, TRUE)
     ) {
       DBUG_PRINT("info",("spider FALSE by condition"));
@@ -10515,7 +10338,7 @@ int ha_spider::direct_update_rows_init(
       }
       result_list.direct_order_limit = TRUE;
     }
-    trx->direct_update_count++;
+    wide_handler->trx->direct_update_count++;
     DBUG_PRINT("info",("spider OK"));
     DBUG_RETURN(0);
   }
@@ -10558,7 +10381,7 @@ int ha_spider::direct_update_rows_init(
       DBUG_RETURN(error_num);
     }
 #endif
-    trx->direct_update_count++;
+    wide_handler->trx->direct_update_count++;
     DBUG_PRINT("info",("spider OK"));
     DBUG_RETURN(0);
   }
@@ -10587,10 +10410,10 @@ int ha_spider::direct_update_rows_init()
   st_select_lex *select_lex;
   longlong select_limit;
   longlong offset_limit;
-  List_iterator<Item> it(*direct_update_fields);
+  List_iterator<Item> it(*wide_handler->direct_update_fields);
   Item *item;
   Field *field;
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::direct_update_rows_init");
   DBUG_PRINT("info",("spider this=%p", this));
   if (thd->variables.time_zone != UTC)
@@ -10639,14 +10462,21 @@ int ha_spider::direct_update_rows_init()
 #endif
   }
 #endif
+  if (!dml_inited)
+  {
+    if (unlikely(dml_init()))
+    {
+      DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+    }
+  }
   direct_update_init(
     thd,
     FALSE
   );
-  if (!condition)
-    cond_check = FALSE;
+  if (!wide_handler->condition)
+    wide_handler->cond_check = FALSE;
   spider_get_select_limit(this, &select_lex, &select_limit, &offset_limit);
-  if (direct_update_fields)
+  if (wide_handler->direct_update_fields)
   {
     if (
 #if MYSQL_VERSION_ID < 50500
@@ -10661,6 +10491,7 @@ int ha_spider::direct_update_rows_init()
       !select_lex ||
       select_lex->table_list.elements != 1 ||
       check_update_columns_sql_part() ||
+      check_direct_update_sql_part(select_lex, select_limit, offset_limit) ||
       spider_db_append_condition(this, NULL, 0, TRUE)
     ) {
       DBUG_PRINT("info",("spider FALSE by condition"));
@@ -10682,20 +10513,20 @@ int ha_spider::direct_update_rows_init()
       }
       result_list.direct_order_limit = TRUE;
     }
-    trx->direct_update_count++;
+    wide_handler->trx->direct_update_count++;
     DBUG_PRINT("info",("spider OK"));
     DBUG_RETURN(0);
   }
 
   DBUG_PRINT("info",("spider offset_limit=%lld", offset_limit));
-  DBUG_PRINT("info",("spider sql_command=%u", sql_command));
+  DBUG_PRINT("info",("spider sql_command=%u", wide_handler->sql_command));
   DBUG_PRINT("info",("spider do_direct_update=%s",
     do_direct_update ? "TRUE" : "FALSE"));
   if (
     !offset_limit &&
     do_direct_update
   ) {
-    trx->direct_update_count++;
+    wide_handler->trx->direct_update_count++;
     DBUG_PRINT("info",("spider OK"));
     DBUG_RETURN(0);
   }
@@ -10744,6 +10575,13 @@ int ha_spider::pre_direct_update_rows_init(
     bulk_access_link_current->called = TRUE;
     DBUG_RETURN(error_num);
   }
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
 #ifdef SPIDER_MDEV_16246
   pre_direct_init_result = direct_update_rows_init(
     update_fields, mode, ranges, range_count, sorted, new_data);
@@ -10788,6 +10626,13 @@ int ha_spider::pre_direct_update_rows_init()
     bulk_access_link_current->called = TRUE;
     DBUG_RETURN(error_num);
   }
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
 #ifdef SPIDER_MDEV_16246
   pre_direct_init_result = direct_update_rows_init(update_fields);
 #else
@@ -11004,6 +10849,28 @@ int ha_spider::delete_row(
 }
 
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+bool ha_spider::check_direct_delete_sql_part(
+  st_select_lex *select_lex,
+  longlong select_limit,
+  longlong offset_limit
+) {
+  uint roop_count, dbton_id;
+  spider_db_handler *dbton_hdl;
+  DBUG_ENTER("ha_spider::check_direct_delete_sql_part");
+  for (roop_count = 0; roop_count < share->use_sql_dbton_count; roop_count++)
+  {
+    dbton_id = share->use_sql_dbton_ids[roop_count];
+    dbton_hdl = dbton_handler[dbton_id];
+    if (
+      dbton_hdl->first_link_idx >= 0 &&
+      dbton_hdl->check_direct_delete(select_lex, select_limit, offset_limit)
+    ) {
+      DBUG_RETURN(TRUE);
+    }
+  }
+  DBUG_RETURN(FALSE);
+}
+
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS_WITH_HS
 int ha_spider::direct_delete_rows_init(
   uint mode,
@@ -11017,7 +10884,7 @@ int ha_spider::direct_delete_rows_init(
   st_select_lex *select_lex;
   longlong select_limit;
   longlong offset_limit;
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::direct_delete_rows_init");
   DBUG_PRINT("info",("spider this=%p", this));
 #ifdef HA_CAN_BULK_ACCESS
@@ -11039,6 +10906,13 @@ int ha_spider::direct_delete_rows_init(
       mode, ranges, range_count, sorted));
   }
 #endif
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   direct_update_init(
     thd,
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
@@ -11064,6 +10938,7 @@ int ha_spider::direct_delete_rows_init(
 #endif
       !select_lex ||
       select_lex->table_list.elements != 1 ||
+      check_direct_delete_sql_part(select_lex, select_limit, offset_limit) ||
       spider_db_append_condition(this, NULL, 0, TRUE)
     ) {
       DBUG_PRINT("info",("spider FALSE by condition"));
@@ -11085,7 +10960,7 @@ int ha_spider::direct_delete_rows_init(
       }
       result_list.direct_order_limit = TRUE;
     }
-    trx->direct_delete_count++;
+    wide_handler->trx->direct_delete_count++;
     DBUG_PRINT("info",("spider OK"));
     DBUG_RETURN(0);
   }
@@ -11108,7 +10983,7 @@ int ha_spider::direct_delete_rows_init(
       DBUG_RETURN(error_num);
     }
 #endif
-    trx->direct_delete_count++;
+    wide_handler->trx->direct_delete_count++;
     DBUG_PRINT("info",("spider OK"));
     DBUG_RETURN(0);
   }
@@ -11122,7 +10997,7 @@ int ha_spider::direct_delete_rows_init()
   st_select_lex *select_lex;
   longlong select_limit;
   longlong offset_limit;
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::direct_delete_rows_init");
   DBUG_PRINT("info",("spider this=%p", this));
 #ifdef HA_CAN_BULK_ACCESS
@@ -11143,12 +11018,19 @@ int ha_spider::direct_delete_rows_init()
     DBUG_RETURN(bulk_access_link_exec_tgt->spider->direct_delete_rows_init());
   }
 #endif
+  if (!dml_inited)
+  {
+    if (unlikely(dml_init()))
+    {
+      DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+    }
+  }
   direct_update_init(
     thd,
     FALSE
   );
-  if (!condition)
-    cond_check = FALSE;
+  if (!wide_handler->condition)
+    wide_handler->cond_check = FALSE;
   spider_get_select_limit(this, &select_lex, &select_limit, &offset_limit);
   if (
 #if MYSQL_VERSION_ID < 50500
@@ -11162,6 +11044,7 @@ int ha_spider::direct_delete_rows_init()
 #endif
     !select_lex ||
     select_lex->table_list.elements != 1 ||
+    check_direct_delete_sql_part(select_lex, select_limit, offset_limit) ||
     spider_db_append_condition(this, NULL, 0, TRUE)
   ) {
     DBUG_PRINT("info",("spider FALSE by condition"));
@@ -11183,7 +11066,7 @@ int ha_spider::direct_delete_rows_init()
     }
     result_list.direct_order_limit = TRUE;
   }
-  trx->direct_delete_count++;
+  wide_handler->trx->direct_delete_count++;
   DBUG_PRINT("info",("spider OK"));
   DBUG_RETURN(0);
 }
@@ -11209,6 +11092,13 @@ int ha_spider::pre_direct_delete_rows_init(
     bulk_access_link_current->called = TRUE;
     DBUG_RETURN(error_num);
   }
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   pre_direct_init_result = direct_delete_rows_init(
     mode, ranges, range_count, sorted);
   DBUG_RETURN(pre_direct_init_result);
@@ -11227,6 +11117,13 @@ int ha_spider::pre_direct_delete_rows_init()
     bulk_access_link_current->called = TRUE;
     DBUG_RETURN(error_num);
   }
+  if (!dml_inited)
+  {
+    if (unlikely((error_num = dml_init())))
+    {
+      DBUG_RETURN(error_num);
+    }
+  }
   pre_direct_init_result = direct_delete_rows_init();
   DBUG_RETURN(pre_direct_init_result);
 }
@@ -11390,7 +11287,8 @@ int ha_spider::delete_all_rows()
     sql_kind[roop_count] = SPIDER_SQL_KIND_SQL;
   if ((error_num = spider_db_delete_all_rows(this)))
     DBUG_RETURN(check_error_mode(error_num));
-  if (sql_command == SQLCOM_TRUNCATE && table->found_next_number_field)
+  if (wide_handler->sql_command == SQLCOM_TRUNCATE &&
+    table->found_next_number_field)
   {
     DBUG_PRINT("info",("spider reset auto increment"));
     pthread_mutex_lock(&share->lgtm_tblhnd_share->auto_increment_mutex);
@@ -11419,7 +11317,7 @@ int ha_spider::truncate()
       table_share->db.str, table_share->table_name.str);
     DBUG_RETURN(ER_SPIDER_READ_ONLY_NUM);
   }
-  sql_command = SQLCOM_TRUNCATE;
+  wide_handler->sql_command = SQLCOM_TRUNCATE;
   if ((error_num = spider_check_trx_and_get_conn(thd, this, FALSE)))
   {
     DBUG_RETURN(error_num);
@@ -11432,7 +11330,8 @@ int ha_spider::truncate()
     sql_kind[roop_count] = SPIDER_SQL_KIND_SQL;
   if ((error_num = spider_db_delete_all_rows(this)))
     DBUG_RETURN(check_error_mode(error_num));
-  if (sql_command == SQLCOM_TRUNCATE && table->found_next_number_field)
+  if (wide_handler->sql_command == SQLCOM_TRUNCATE &&
+    table->found_next_number_field)
   {
     DBUG_PRINT("info",("spider reset auto increment"));
     pthread_mutex_lock(&share->lgtm_tblhnd_share->auto_increment_mutex);
@@ -11452,10 +11351,10 @@ int ha_spider::truncate()
 void ha_spider::bulk_req_exec()
 {
   int need_mon;
-  SPIDER_CONN *conn = trx->bulk_access_conn_first;
+  SPIDER_CONN *conn = wide_handler->trx->bulk_access_conn_first;
   DBUG_ENTER("ha_spider::bulk_req_exec");
   DBUG_PRINT("info",("spider this=%p", this));
-  DBUG_PRINT("info",("spider trx=%p", trx));
+  DBUG_PRINT("info",("spider trx=%p", wide_handler->trx));
   DBUG_PRINT("info",("spider first_conn=%p", conn));
   while (conn)
   {
@@ -11497,7 +11396,7 @@ void ha_spider::bulk_req_exec()
     }
     conn = conn->bulk_access_next;
   }
-  trx->bulk_access_conn_first = NULL;
+  wide_handler->trx->bulk_access_conn_first = NULL;
   DBUG_VOID_RETURN;
 }
 #endif
@@ -11519,7 +11418,7 @@ double ha_spider::read_time(
 ) {
   DBUG_ENTER("ha_spider::read_time");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (keyread)
+  if (wide_handler->keyread)
   {
     DBUG_PRINT("info",("spider read_time(keyread) = %.6f",
       share->read_rate * table->key_info[index].key_length *
@@ -11602,11 +11501,7 @@ int ha_spider::create(
   uint sql_command = thd_sql_command(thd), roop_count;
   SPIDER_TRX *trx;
   TABLE *table_tables = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool need_lock = FALSE;
   DBUG_ENTER("ha_spider::create");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -11653,7 +11548,7 @@ int ha_spider::create(
     if (!(tmp_share.static_key_cardinality = (longlong *)
       spider_bulk_malloc(spider_current_trx, 246, MYF(MY_WME),
         &tmp_share.static_key_cardinality,
-          sizeof(*tmp_share.static_key_cardinality) * form->s->keys,
+          (uint) (sizeof(*tmp_share.static_key_cardinality) * form->s->keys),
         NullS))
     ) {
       error_num = HA_ERR_OUT_OF_MEM;
@@ -11802,6 +11697,27 @@ void ha_spider::update_create_info(
 ) {
   DBUG_ENTER("ha_spider::update_create_info");
   DBUG_PRINT("info",("spider this=%p", this));
+  if (wide_handler && wide_handler->sql_command == SQLCOM_ALTER_TABLE)
+  {
+    SPIDER_TRX *trx = wide_handler->trx;
+    THD *thd = trx->thd;
+    if (trx->query_id != thd->query_id)
+    {
+      spider_free_trx_alter_table(trx);
+      trx->query_id = thd->query_id;
+      trx->tmp_flg = FALSE;
+    }
+    if (!(SPIDER_ALTER_TABLE*) my_hash_search(&trx->trx_alter_table_hash,
+      (uchar*) share->table_name, share->table_name_length))
+    {
+      if (spider_create_trx_alter_table(trx, share, FALSE))
+      {
+        store_error_num = HA_ERR_OUT_OF_MEM;
+        DBUG_VOID_RETURN;
+      }
+    }
+  }
+
   if (!create_info->connect_string.str)
   {
     create_info->connect_string.str = table->s->connect_string.str;
@@ -11837,11 +11753,7 @@ int ha_spider::rename_table(
   TABLE *table_tables = NULL;
   SPIDER_ALTER_TABLE *alter_table_from, *alter_table_to;
   SPIDER_LGTM_TBLHND_SHARE *from_lgtm_tblhnd_share, *to_lgtm_tblhnd_share;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool need_lock = FALSE;
   DBUG_ENTER("ha_spider::rename_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -12065,11 +11977,7 @@ int ha_spider::delete_table(
   TABLE *table_tables = NULL;
   uint sql_command = thd_sql_command(thd);
   SPIDER_ALTER_TABLE *alter_table;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool need_lock = FALSE;
   DBUG_ENTER("ha_spider::delete_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -12320,32 +12228,6 @@ bool ha_spider::is_fatal_error(
   DBUG_RETURN(TRUE);
 }
 
-Field *ha_spider::get_top_table_field(
-  uint16 field_index
-) {
-  Field *field;
-  DBUG_ENTER("ha_spider::get_top_table_field");
-#ifdef HA_CAN_BULK_ACCESS
-  if (is_bulk_access_clone)
-  {
-    DBUG_RETURN(pt_clone_source_handler->get_top_table_field(field_index));
-  }
-#endif
-  DBUG_PRINT("info",("spider field_index=%u", field_index));
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  if (set_top_table_fields)
-  {
-    field = top_table->field[field_index];
-  } else {
-#endif
-    field = table->field[field_index];
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  }
-#endif
-  DBUG_PRINT("info",("spider out field=%p", field));
-  DBUG_RETURN(field);
-}
-
 Field *ha_spider::field_exchange(
   Field *field
 ) {
@@ -12358,22 +12240,9 @@ Field *ha_spider::field_exchange(
 #endif
   DBUG_PRINT("info",("spider in field=%p", field));
   DBUG_PRINT("info",("spider in field->table=%p", field->table));
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  if (set_top_table_fields)
-  {
-    DBUG_PRINT("info",("spider top_table=%p", top_table));
-    if (field->table != top_table)
-      DBUG_RETURN(NULL);
-    if (!(field = top_table_field[field->field_index]))
-      DBUG_RETURN(NULL);
-  } else {
-#endif
-    DBUG_PRINT("info",("spider table=%p", table));
-    if (field->table != table)
-      DBUG_RETURN(NULL);
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  }
-#endif
+  DBUG_PRINT("info",("spider table=%p", table));
+  if (field->table != table)
+    DBUG_RETURN(NULL);
   DBUG_PRINT("info",("spider out field=%p", field));
   DBUG_RETURN(field);
 }
@@ -12382,7 +12251,17 @@ const COND *ha_spider::cond_push(
   const COND *cond
 ) {
   DBUG_ENTER("ha_spider::cond_push");
-  cond_check = FALSE;
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  if (
+    wide_handler->stage == SPD_HND_STAGE_COND_PUSH &&
+    wide_handler->stage_executor != this)
+  {
+    DBUG_RETURN(NULL);
+  }
+  wide_handler->stage = SPD_HND_STAGE_COND_PUSH;
+  wide_handler->stage_executor = this;
+#endif
+  wide_handler->cond_check = FALSE;
   if (cond)
   {
     SPIDER_CONDITION *tmp_cond;
@@ -12391,8 +12270,8 @@ const COND *ha_spider::cond_push(
     )
       DBUG_RETURN(cond);
     tmp_cond->cond = (COND *) cond;
-    tmp_cond->next = condition;
-    condition = tmp_cond;
+    tmp_cond->next = wide_handler->condition;
+    wide_handler->condition = tmp_cond;
   }
   DBUG_RETURN(NULL);
 }
@@ -12400,11 +12279,21 @@ const COND *ha_spider::cond_push(
 void ha_spider::cond_pop()
 {
   DBUG_ENTER("ha_spider::cond_pop");
-  if (condition)
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  if (
+    wide_handler->stage == SPD_HND_STAGE_COND_POP &&
+    wide_handler->stage_executor != this)
+  {
+    DBUG_VOID_RETURN;
+  }
+  wide_handler->stage = SPD_HND_STAGE_COND_POP;
+  wide_handler->stage_executor = this;
+#endif
+  if (wide_handler->condition)
   {
-    SPIDER_CONDITION *tmp_cond = condition->next;
-    spider_free(spider_current_trx, condition, MYF(0));
-    condition = tmp_cond;
+    SPIDER_CONDITION *tmp_cond = wide_handler->condition->next;
+    spider_free(spider_current_trx, wide_handler->condition, MYF(0));
+    wide_handler->condition = tmp_cond;
   }
   DBUG_VOID_RETURN;
 }
@@ -12416,6 +12305,16 @@ int ha_spider::info_push(
   int error_num = 0;
   DBUG_ENTER("ha_spider::info_push");
   DBUG_PRINT("info",("spider this=%p", this));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  if (
+    wide_handler->stage == SPD_HND_STAGE_INFO_PUSH &&
+    wide_handler->stage_executor != this)
+  {
+    DBUG_RETURN(0);
+  }
+  wide_handler->stage = SPD_HND_STAGE_INFO_PUSH;
+  wide_handler->stage_executor = this;
+#endif
 #ifdef HA_CAN_BULK_ACCESS
   if (
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
@@ -12455,33 +12354,39 @@ int ha_spider::info_push(
       size_t roop_count;
       Field *field;
       SPIDER_HS_UINT32_INFO *tmp_info = (SPIDER_HS_UINT32_INFO *) info;
-      hs_pushed_ret_fields_num = tmp_info->info_size;
-      if (hs_pushed_ret_fields_size < hs_pushed_ret_fields_num)
+      wide_handler->hs_pushed_ret_fields_num = tmp_info->info_size;
+      if (wide_handler->hs_pushed_ret_fields_size <
+        wide_handler->hs_pushed_ret_fields_num)
       {
-        if (hs_pushed_ret_fields)
-          spider_free(spider_current_trx, hs_pushed_ret_fields, MYF(0));
-        if (!(hs_pushed_ret_fields = (uint32 *)
+        if (wide_handler->hs_pushed_ret_fields)
+          spider_free(spider_current_trx, wide_handler->hs_pushed_ret_fields,
+            MYF(0));
+        if (!(wide_handler->hs_pushed_ret_fields = (uint32 *)
           spider_bulk_malloc(spider_current_trx, 17, MYF(MY_WME),
-          &hs_pushed_ret_fields, sizeof(uint32) * hs_pushed_ret_fields_num,
+          &wide_handler->hs_pushed_ret_fields,
+            sizeof(uint32) * wide_handler->hs_pushed_ret_fields_num,
           NullS))
         ) {
           DBUG_RETURN(HA_ERR_OUT_OF_MEM);
         }
-        hs_pushed_ret_fields_size = hs_pushed_ret_fields_num;
+        wide_handler->hs_pushed_ret_fields_size =
+          wide_handler->hs_pushed_ret_fields_num;
       }
-      memcpy(hs_pushed_ret_fields, tmp_info->info,
-        sizeof(uint32) * hs_pushed_ret_fields_num);
+      memcpy(wide_handler->hs_pushed_ret_fields, tmp_info->info,
+        sizeof(uint32) * wide_handler->hs_pushed_ret_fields_num);
       bitmap_clear_all(table->read_set);
       bitmap_clear_all(table->write_set);
-      hs_pushed_lcl_fields_num = 0;
-      for (roop_count = 0; roop_count < hs_pushed_ret_fields_num; roop_count++)
+      wide_handler->hs_pushed_lcl_fields_num = 0;
+      for (roop_count = 0; roop_count < wide_handler->hs_pushed_ret_fields_num;
+        roop_count++)
       {
-        field = get_top_table_field(hs_pushed_ret_fields[roop_count]);
+        field = get_top_table_field(
+          wide_handler->hs_pushed_ret_fields[roop_count]);
         if ((field = field_exchange(field)))
         {
           if (!bitmap_is_set(table->read_set, field->field_index))
           {
-            ++hs_pushed_lcl_fields_num;
+            ++wide_handler->hs_pushed_lcl_fields_num;
             bitmap_set_bit(table->read_set, field->field_index);
             bitmap_set_bit(table->write_set, field->field_index);
           }
@@ -12503,172 +12408,168 @@ int ha_spider::info_push(
       }
 #endif
       Field *field;
-      if (hs_pushed_ret_fields)
+      if (wide_handler->hs_pushed_ret_fields)
       {
         field = get_top_table_field(
-          hs_pushed_ret_fields[hs_pushed_strref_num]);
+          wide_handler->hs_pushed_ret_fields[
+            wide_handler->hs_pushed_strref_num]);
       } else {
         field = get_top_table_field(
-          pt_clone_source_handler->hs_pushed_ret_fields[hs_pushed_strref_num]);
+          pt_clone_source_handler->wide_handler->hs_pushed_ret_fields[
+            wide_handler->hs_pushed_strref_num]);
       }
       if (!field_exchange(field))
       {
-        hs_pushed_strref_num++;
+        wide_handler->hs_pushed_strref_num++;
         break;
       }
-      hs_pushed_strref_num++;
-      if ((error_num = push_back_hs_upds(*((SPIDER_HS_STRING_REF*) info))))
+      wide_handler->hs_pushed_strref_num++;
+      if (partition_handler && partition_handler->handlers)
       {
-        DBUG_RETURN(error_num);
+        size_t roop_count;
+        ha_spider **handlers = partition_handler->handlers;
+        for (roop_count = 0; roop_count < partition_handler->no_parts;
+          ++roop_count)
+        {
+          if ((error_num = handlers[roop_count]->push_back_hs_upds(
+            *((SPIDER_HS_STRING_REF*) info))))
+          {
+            DBUG_RETURN(error_num);
+          }
+        }
+      } else {
+        if ((error_num = push_back_hs_upds(*((SPIDER_HS_STRING_REF*) info))))
+        {
+          DBUG_RETURN(error_num);
+        }
       }
       break;
     }
     case INFO_KIND_HS_CLEAR_STRING_REF:
       DBUG_PRINT("info",("spider INFO_KIND_HS_CLEAR_STRING_REF"));
-      hs_pushed_strref_num = 0;
-      if ((error_num = reset_hs_upds(SPIDER_SQL_TYPE_UPDATE_HS)))
+      wide_handler->hs_pushed_strref_num = 0;
+      if (partition_handler && partition_handler->handlers)
       {
-        DBUG_RETURN(error_num);
+        size_t roop_count;
+        ha_spider **handlers = partition_handler->handlers;
+        for (roop_count = 0; roop_count < partition_handler->no_parts;
+          ++roop_count)
+        {
+          if ((error_num = handlers[roop_count]->reset_hs_upds(
+            SPIDER_SQL_TYPE_UPDATE_HS)))
+          {
+            DBUG_RETURN(error_num);
+          }
+        }
+      } else {
+        if ((error_num = reset_hs_upds(SPIDER_SQL_TYPE_UPDATE_HS)))
+        {
+          DBUG_RETURN(error_num);
+        }
       }
       break;
     case INFO_KIND_HS_INCREMENT_BEGIN:
       DBUG_PRINT("info",("spider INFO_KIND_HS_INCREMENT_BEGIN"));
-      hs_increment = TRUE;
+      wide_handler->hs_increment = TRUE;
       break;
     case INFO_KIND_HS_INCREMENT_END:
       DBUG_PRINT("info",("spider INFO_KIND_HS_INCREMENT_END"));
-      hs_increment = FALSE;
+      wide_handler->hs_increment = FALSE;
       break;
     case INFO_KIND_HS_DECREMENT_BEGIN:
       DBUG_PRINT("info",("spider INFO_KIND_HS_DECREMENT_BEGIN"));
-      hs_decrement = TRUE;
+      wide_handler->hs_decrement = TRUE;
       break;
     case INFO_KIND_HS_DECREMENT_END:
       DBUG_PRINT("info",("spider INFO_KIND_HS_DECREMENT_END"));
-      hs_decrement = FALSE;
+      wide_handler->hs_decrement = FALSE;
       break;
 #endif
 #ifdef INFO_KIND_UPDATE_FIELDS
     case INFO_KIND_UPDATE_FIELDS:
       DBUG_PRINT("info",("spider INFO_KIND_UPDATE_FIELDS"));
-      direct_update_fields = (List<Item> *) info;
-      update_request = TRUE;
+      wide_handler->direct_update_fields = (List<Item> *) info;
+      wide_handler->update_request = TRUE;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-      if (keyread && check_partitioned())
-        keyread = FALSE;
+      if (wide_handler->keyread && check_partitioned())
+        wide_handler->keyread = FALSE;
 #endif
       break;
 #endif
 #ifdef INFO_KIND_UPDATE_VALUES
     case INFO_KIND_UPDATE_VALUES:
       DBUG_PRINT("info",("spider INFO_KIND_UPDATE_VALUES"));
-      direct_update_values = (List<Item> *) info;
+      wide_handler->direct_update_values = (List<Item> *) info;
       break;
 #endif
 #ifdef INFO_KIND_FORCE_LIMIT_BEGIN
     case INFO_KIND_FORCE_LIMIT_BEGIN:
       DBUG_PRINT("info",("spider INFO_KIND_FORCE_LIMIT_BEGIN"));
-      info_limit = *((longlong *) info);
-/*
-      trx->direct_aggregate_count++;
-*/
+      wide_handler->info_limit = *((longlong *) info);
       break;
     case INFO_KIND_FORCE_LIMIT_END:
       DBUG_PRINT("info",("spider INFO_KIND_FORCE_LIMIT_END"));
-      info_limit = 9223372036854775807LL;
+      wide_handler->info_limit = 9223372036854775807LL;
       break;
 #endif
 #endif
 #ifdef HA_CAN_BULK_ACCESS
     case INFO_KIND_BULK_ACCESS_BEGIN:
       DBUG_PRINT("info",("spider INFO_KIND_BULK_ACCESS_BEGIN"));
-      if (bulk_access_started)
+      if (partition_handler && partition_handler->handlers)
       {
-        if (!bulk_access_link_current->next)
+        size_t roop_count;
+        ha_spider **handlers = partition_handler->handlers;
+        for (roop_count = 0; roop_count < partition_handler->no_parts;
+          ++roop_count)
         {
-          if (!(bulk_access_link_current->next = create_bulk_access_link()))
+          if ((error_num = handlers[roop_count]->bulk_access_begin(info)))
           {
-            DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+            DBUG_RETURN(error_num);
           }
-          bulk_access_link_current->next->sequence_num =
-            bulk_access_link_current->sequence_num + 1;
         }
-        bulk_access_link_current = bulk_access_link_current->next;
       } else {
-        if (!bulk_access_link_first)
+        if ((error_num = bulk_access_begin(info)))
         {
-          if (!(bulk_access_link_first = create_bulk_access_link()))
-          {
-            DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-          }
-          bulk_access_link_first->sequence_num = 0;
+          DBUG_RETURN(error_num);
         }
-        bulk_access_link_current = bulk_access_link_first;
-        bulk_access_started = TRUE;
-        bulk_access_executing = FALSE;
-      }
-      if (
-        (error_num = bulk_access_link_current->spider->
-          sync_from_clone_source(this)) ||
-        (error_num = bulk_access_link_current->spider->
-          check_access_kind(trx->thd, (lock_type >= TL_WRITE_ALLOW_WRITE)))
-      ) {
-        DBUG_RETURN(error_num);
-      }
-#ifdef HA_CAN_BULK_ACCESS
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-      memset(
-        bulk_access_link_current->spider->result_list.hs_r_bulk_open_index, 0,
-        share->link_bitmap_size);
-      memset(
-        bulk_access_link_current->spider->result_list.hs_w_bulk_open_index, 0,
-        share->link_bitmap_size);
-#endif
-#endif
-/*
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-      if ((error_num = bulk_access_link_current->spider->reset_hs_strs_pos(
-        SPIDER_SQL_TYPE_UPDATE_HS)))
-      {
-        DBUG_RETURN(error_num);
       }
-#endif
-*/
-      bulk_access_link_current->spider->bulk_access_executing = FALSE;
-      bulk_access_link_current->spider->bulk_access_pre_called = FALSE;
-      bulk_access_link_current->used = TRUE;
-      bulk_access_link_current->called = FALSE;
-      *((void **) info) = bulk_access_link_current;
       break;
     case INFO_KIND_BULK_ACCESS_CURRENT:
       DBUG_PRINT("info",("spider INFO_KIND_BULK_ACCESS_CURRENT"));
-      bulk_access_executing = TRUE;
-      bulk_access_link_exec_tgt = (SPIDER_BULK_ACCESS_LINK *) info;
-      if (bulk_access_link_exec_tgt->spider->pt_clone_source_handler != this)
-      {
-        DBUG_PRINT("info",("spider this=%p", this));
-        DBUG_PRINT("info",("spider pt_clone_source_handler=%p",
-          bulk_access_link_exec_tgt->spider->pt_clone_source_handler));
-        /* partitioned */
-        uint sequence_num = bulk_access_link_exec_tgt->sequence_num;
-        for (
-          bulk_access_link_exec_tgt = bulk_access_link_first;
-          bulk_access_link_exec_tgt;
-          bulk_access_link_exec_tgt = bulk_access_link_exec_tgt->next
-        ) {
-          if (bulk_access_link_exec_tgt->sequence_num >= sequence_num)
+      if (partition_handler && partition_handler->handlers)
+      {
+        size_t roop_count;
+        ha_spider **handlers = partition_handler->handlers;
+        for (roop_count = 0; roop_count < partition_handler->no_parts;
+          ++roop_count)
+        {
+          if ((error_num = handlers[roop_count]->bulk_access_current(info)))
           {
-            DBUG_ASSERT(
-              bulk_access_link_exec_tgt->sequence_num == sequence_num);
-            break;
+            DBUG_RETURN(error_num);
           }
         }
+      } else {
+        if ((error_num = bulk_access_current(info)))
+        {
+          DBUG_RETURN(error_num);
+        }
       }
-      bulk_access_link_exec_tgt->spider->bulk_access_executing = TRUE;
       break;
     case INFO_KIND_BULK_ACCESS_END:
       DBUG_PRINT("info",("spider INFO_KIND_BULK_ACCESS_END"));
-      bulk_access_started = FALSE;
+      if (partition_handler && partition_handler->handlers)
+      {
+        size_t roop_count;
+        ha_spider **handlers = partition_handler->handlers;
+        for (roop_count = 0; roop_count < partition_handler->no_parts;
+          ++roop_count)
+        {
+          handlers[roop_count]->bulk_access_end();
+        }
+      } else {
+        bulk_access_end();
+      }
       break;
 #endif
     default:
@@ -12694,16 +12595,6 @@ TABLE *ha_spider::get_table()
   DBUG_RETURN(table);
 }
 
-TABLE *ha_spider::get_top_table()
-{
-  DBUG_ENTER("ha_spider::get_top_table");
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  if (set_top_table_fields)
-    DBUG_RETURN(top_table);
-#endif
-  DBUG_RETURN(table);
-}
-
 void ha_spider::set_ft_discard_bitmap()
 {
   DBUG_ENTER("ha_spider::set_ft_discard_bitmap");
@@ -12734,7 +12625,8 @@ void ha_spider::set_ft_discard_bitmap()
             if (!field || !(field = field_exchange(field)))
               continue;
             DBUG_PRINT("info",("spider clear_bit=%u", field->field_index));
-            spider_clear_bit(ft_discard_bitmap, field->field_index);
+            spider_clear_bit(wide_handler->ft_discard_bitmap,
+              field->field_index);
           }
         }
       }
@@ -12760,7 +12652,8 @@ void ha_spider::set_ft_discard_bitmap()
         if (!field || !(field = field_exchange(field)))
           continue;
         DBUG_PRINT("info",("spider field_index=%u", field->field_index));
-        if (!spider_bit_is_set(ft_discard_bitmap, field->field_index))
+        if (!spider_bit_is_set(wide_handler->ft_discard_bitmap,
+          field->field_index))
         {
           bool match_flag = FALSE;
           List_iterator_fast<Item_func_match> fmi(*select_lex->ftfunc_list);
@@ -12786,7 +12679,8 @@ void ha_spider::set_ft_discard_bitmap()
           if (!match_flag)
           {
             DBUG_PRINT("info",("spider set_bit=%u", field->field_index));
-            spider_set_bit(ft_discard_bitmap, field->field_index);
+            spider_set_bit(wide_handler->ft_discard_bitmap,
+              field->field_index);
           }
         }
       }
@@ -12802,18 +12696,19 @@ void ha_spider::set_searched_bitmap()
   for (roop_count = 0; roop_count < (int) ((table_share->fields + 7) / 8);
     roop_count++)
   {
-    searched_bitmap[roop_count] =
+    wide_handler->searched_bitmap[roop_count] =
       ((uchar *) table->read_set->bitmap)[roop_count] |
       ((uchar *) table->write_set->bitmap)[roop_count];
     DBUG_PRINT("info",("spider roop_count=%d", roop_count));
     DBUG_PRINT("info",("spider searched_bitmap=%d",
-      searched_bitmap[roop_count]));
+      wide_handler->searched_bitmap[roop_count]));
     DBUG_PRINT("info",("spider read_set=%d",
       ((uchar *) table->read_set->bitmap)[roop_count]));
     DBUG_PRINT("info",("spider write_set=%d",
       ((uchar *) table->write_set->bitmap)[roop_count]));
   }
-  if (sql_command == SQLCOM_UPDATE || sql_command == SQLCOM_UPDATE_MULTI)
+  if (wide_handler->sql_command == SQLCOM_UPDATE ||
+    wide_handler->sql_command == SQLCOM_UPDATE_MULTI)
   {
     DBUG_PRINT("info",("spider update option start"));
     Item *item;
@@ -12829,7 +12724,7 @@ void ha_spider::set_searched_bitmap()
           DBUG_PRINT("info",("spider field is for different table"));
           continue;
         }
-        spider_set_bit(searched_bitmap, field->field_index);
+        spider_set_bit(wide_handler->searched_bitmap, field->field_index);
         DBUG_PRINT("info",("spider set searched_bitmap=%u",
           field->field_index));
       } else {
@@ -12843,23 +12738,26 @@ void ha_spider::set_searched_bitmap()
 void ha_spider::set_clone_searched_bitmap()
 {
   DBUG_ENTER("ha_spider::set_clone_searched_bitmap");
-  DBUG_PRINT("info",("spider searched_bitmap=%p", searched_bitmap));
+  DBUG_PRINT("info",("spider searched_bitmap=%p",
+    wide_handler->searched_bitmap));
 #ifndef DBUG_OFF
   int roop_count;
   for (roop_count = 0; roop_count < (int) ((table_share->fields + 7) / 8);
     roop_count++)
     DBUG_PRINT("info", ("spider before searched_bitmap is %x",
-      ((uchar *) searched_bitmap)[roop_count]));
+      ((uchar *) wide_handler->searched_bitmap)[roop_count]));
 #endif
-  memcpy(searched_bitmap, pt_clone_source_handler->searched_bitmap,
+  memcpy(wide_handler->searched_bitmap,
+    pt_clone_source_handler->wide_handler->searched_bitmap,
     (table_share->fields + 7) / 8);
 #ifndef DBUG_OFF
   for (roop_count = 0; roop_count < (int) ((table_share->fields + 7) / 8);
     roop_count++)
     DBUG_PRINT("info", ("spider after searched_bitmap is %x",
-      ((uchar *) searched_bitmap)[roop_count]));
+      ((uchar *) wide_handler->searched_bitmap)[roop_count]));
 #endif
-  memcpy(ft_discard_bitmap, pt_clone_source_handler->ft_discard_bitmap,
+  memcpy(wide_handler->ft_discard_bitmap,
+    pt_clone_source_handler->wide_handler->ft_discard_bitmap,
     (table_share->fields + 7) / 8);
   DBUG_VOID_RETURN;
 }
@@ -12891,7 +12789,7 @@ void ha_spider::set_searched_bitmap_from_item_list()
     if (!field || !(field = field_exchange(field)))
       continue;
     DBUG_PRINT("info",("spider field_index=%u", field->field_index));
-    spider_set_bit(searched_bitmap, field->field_index);
+    spider_set_bit(wide_handler->searched_bitmap, field->field_index);
   }
   DBUG_VOID_RETURN;
 }
@@ -12902,9 +12800,9 @@ void ha_spider::set_select_column_mode()
   KEY *key_info;
   KEY_PART_INFO *key_part;
   Field *field;
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::set_select_column_mode");
-  position_bitmap_init = FALSE;
+  wide_handler->position_bitmap_init = FALSE;
 #ifndef DBUG_OFF
   for (roop_count = 0; roop_count < (int) ((table_share->fields + 7) / 8);
     roop_count++)
@@ -12915,72 +12813,48 @@ void ha_spider::set_select_column_mode()
     share->select_column_mode);
   if (select_column_mode)
   {
-    DBUG_PRINT("info",("spider searched_bitmap=%p", searched_bitmap));
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-    if (
-      partition_handler_share &&
-      partition_handler_share->searched_bitmap
-    ) {
-      if (partition_handler_share->searched_bitmap != searched_bitmap)
-      {
-        memcpy(searched_bitmap, partition_handler_share->searched_bitmap,
-          (table_share->fields + 7) / 8);
-        memcpy(ft_discard_bitmap, partition_handler_share->ft_discard_bitmap,
-          (table_share->fields + 7) / 8);
-      }
-      partition_handler_share->between_flg = FALSE;
-      DBUG_PRINT("info",("spider copy searched_bitmap"));
-    } else {
-#endif
-      set_searched_bitmap();
-      set_searched_bitmap_from_item_list();
-      if (result_list.lock_type == F_WRLCK && sql_command != SQLCOM_SELECT)
-      {
+    DBUG_PRINT("info",("spider searched_bitmap=%p",
+      wide_handler->searched_bitmap));
+    set_searched_bitmap();
+    set_searched_bitmap_from_item_list();
+    if (wide_handler->external_lock_type == F_WRLCK &&
+      wide_handler->sql_command != SQLCOM_SELECT)
+    {
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-        uint part_num = 0;
-        if (update_request)
-          part_num = check_partitioned();
+      uint part_num = 0;
+      if (wide_handler->update_request)
+        part_num = check_partitioned();
 #endif
-        if (
+      if (
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-          part_num ||
+        part_num ||
 #endif
-          table_share->primary_key == MAX_KEY
-        ) {
-          /* need all columns */
-          for (roop_count = 0; roop_count < (int) table_share->fields;
-            roop_count++)
-            spider_set_bit(searched_bitmap, roop_count);
-        } else {
-          /* need primary key columns */
-          key_info = &table_share->key_info[table_share->primary_key];
-          key_part = key_info->key_part;
-          for (roop_count = 0;
-            roop_count < (int) spider_user_defined_key_parts(key_info);
-            roop_count++)
-          {
-            field = key_part[roop_count].field;
-            spider_set_bit(searched_bitmap, field->field_index);
-          }
-        }
-#ifndef DBUG_OFF
+        table_share->primary_key == MAX_KEY
+      ) {
+        /* need all columns */
+        for (roop_count = 0; roop_count < (int) table_share->fields;
+          roop_count++)
+          spider_set_bit(wide_handler->searched_bitmap, roop_count);
+      } else {
+        /* need primary key columns */
+        key_info = &table_share->key_info[table_share->primary_key];
+        key_part = key_info->key_part;
         for (roop_count = 0;
-          roop_count < (int) ((table_share->fields + 7) / 8);
+          roop_count < (int) spider_user_defined_key_parts(key_info);
           roop_count++)
-          DBUG_PRINT("info", ("spider change bitmap is %x",
-            searched_bitmap[roop_count]));
-#endif
-      }
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-      if (partition_handler_share)
-      {
-        partition_handler_share->searched_bitmap = searched_bitmap;
-        partition_handler_share->ft_discard_bitmap = ft_discard_bitmap;
-        partition_handler_share->between_flg = TRUE;
-        DBUG_PRINT("info",("spider set searched_bitmap"));
+        {
+          field = key_part[roop_count].field;
+          spider_set_bit(wide_handler->searched_bitmap, field->field_index);
+        }
       }
-    }
+#ifndef DBUG_OFF
+      for (roop_count = 0;
+        roop_count < (int) ((table_share->fields + 7) / 8);
+        roop_count++)
+        DBUG_PRINT("info", ("spider change bitmap is %x",
+          wide_handler->searched_bitmap[roop_count]));
 #endif
+    }
   }
   DBUG_VOID_RETURN;
 }
@@ -12988,55 +12862,55 @@ void ha_spider::set_select_column_mode()
 #ifdef WITH_PARTITION_STORAGE_ENGINE
 void ha_spider::check_select_column(bool rnd)
 {
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::check_select_column");
   select_column_mode = spider_param_select_column_mode(thd,
     share->select_column_mode);
-  if (select_column_mode && partition_handler_share)
+  if (select_column_mode)
   {
     if (!rnd)
     {
-      if (partition_handler_share->between_flg)
+      if (wide_handler->between_flg)
       {
-        memcpy(partition_handler_share->idx_read_bitmap,
+        memcpy(wide_handler->idx_read_bitmap,
           table->read_set->bitmap, (table_share->fields + 7) / 8);
-        memcpy(partition_handler_share->idx_write_bitmap,
+        memcpy(wide_handler->idx_write_bitmap,
           table->write_set->bitmap, (table_share->fields + 7) / 8);
-        partition_handler_share->between_flg = FALSE;
-        partition_handler_share->idx_bitmap_is_set = TRUE;
+        wide_handler->between_flg = FALSE;
+        wide_handler->idx_bitmap_is_set = TRUE;
         DBUG_PRINT("info",("spider set idx_bitmap"));
-      } else if (partition_handler_share->idx_bitmap_is_set)
+      } else if (wide_handler->idx_bitmap_is_set)
       {
         memcpy(table->read_set->bitmap,
-          partition_handler_share->idx_read_bitmap,
+          wide_handler->idx_read_bitmap,
           (table_share->fields + 7) / 8);
         memcpy(table->write_set->bitmap,
-          partition_handler_share->idx_write_bitmap,
+          wide_handler->idx_write_bitmap,
           (table_share->fields + 7) / 8);
         DBUG_PRINT("info",("spider copy idx_bitmap"));
       }
     } else {
       if (
-        !partition_handler_share->rnd_bitmap_is_set &&
+        !wide_handler->rnd_bitmap_is_set &&
         (
-          partition_handler_share->between_flg ||
-          partition_handler_share->idx_bitmap_is_set
+          wide_handler->between_flg ||
+          wide_handler->idx_bitmap_is_set
         )
       ) {
-        memcpy(partition_handler_share->rnd_read_bitmap,
+        memcpy(wide_handler->rnd_read_bitmap,
           table->read_set->bitmap, (table_share->fields + 7) / 8);
-        memcpy(partition_handler_share->rnd_write_bitmap,
+        memcpy(wide_handler->rnd_write_bitmap,
           table->write_set->bitmap, (table_share->fields + 7) / 8);
-        partition_handler_share->between_flg = FALSE;
-        partition_handler_share->rnd_bitmap_is_set = TRUE;
+        wide_handler->between_flg = FALSE;
+        wide_handler->rnd_bitmap_is_set = TRUE;
         DBUG_PRINT("info",("spider set rnd_bitmap"));
-      } else if (partition_handler_share->rnd_bitmap_is_set)
+      } else if (wide_handler->rnd_bitmap_is_set)
       {
         memcpy(table->read_set->bitmap,
-          partition_handler_share->rnd_read_bitmap,
+          wide_handler->rnd_read_bitmap,
           (table_share->fields + 7) / 8);
         memcpy(table->write_set->bitmap,
-          partition_handler_share->rnd_write_bitmap,
+          wide_handler->rnd_write_bitmap,
           (table_share->fields + 7) / 8);
         DBUG_PRINT("info",("spider copy rnd_bitmap"));
       }
@@ -13185,7 +13059,7 @@ void ha_spider::check_distinct_key_query()
 {
   DBUG_ENTER( "ha_spider::check_distinct_key_query" );
 
-  if ( result_list.direct_distinct && !partition_handler_share->handlers &&
+  if ( result_list.direct_distinct && !partition_handler->handlers &&
        result_list.keyread && result_list.check_direct_order_limit )
   {
     // SELECT DISTINCT query using an index in a non-partitioned configuration
@@ -13255,11 +13129,11 @@ int ha_spider::check_ha_range_eof()
     result_list.use_both_key ? "TRUE" : "FALSE"));
   DBUG_PRINT("info",("spider sql_kind[%u]=%u",
     search_link_idx, sql_kind[search_link_idx]));
-  DBUG_PRINT("info",("spider sql_command=%u", sql_command));
+  DBUG_PRINT("info",("spider sql_command=%u", wide_handler->sql_command));
   if (
     result_list.use_both_key &&
     (sql_kind[search_link_idx] & SPIDER_SQL_KIND_HANDLER) &&
-    sql_command != SQLCOM_HA_READ
+    wide_handler->sql_command != SQLCOM_HA_READ
   ) {
     int cmp_result = key_cmp(result_list.key_info->key_part,
       end_key->key, end_key->length);
@@ -13348,8 +13222,8 @@ int ha_spider::drop_tmp_tables()
             need_mons[roop_count]
           ) {
             tmp_error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -13369,7 +13243,7 @@ int ha_spider::drop_tmp_tables()
         if (!tmp_error_num)
         {
           spider_conn_set_timeout_from_share(conn, roop_count,
-            trx->thd, share);
+            wide_handler->trx->thd, share);
           if (dbton_hdl->execute_sql(
             SPIDER_SQL_TYPE_DROP_TMP_TABLE_SQL,
             conn,
@@ -13386,8 +13260,8 @@ int ha_spider::drop_tmp_tables()
               need_mons[roop_count]
             ) {
               tmp_error_num = spider_ping_table_mon_from_table(
-                  trx,
-                  trx->thd,
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
                   share,
                   roop_count,
                   (uint32) share->monitoring_sid[roop_count],
@@ -13508,8 +13382,8 @@ int ha_spider::close_opened_handler(
         need_mons[link_idx]
       ) {
         error_num2 = spider_ping_table_mon_from_table(
-          trx,
-          trx->thd,
+          wide_handler->trx,
+          wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -13527,9 +13401,10 @@ int ha_spider::close_opened_handler(
       error_num = error_num2;
     }
     spider_clear_bit(m_handler_opened, link_idx);
-    if (release_conn)
+    if (release_conn && !conns[link_idx]->join_trx)
     {
-      spider_free_conn_from_trx(trx, conns[link_idx], FALSE, FALSE, NULL);
+      spider_free_conn_from_trx(wide_handler->trx, conns[link_idx],
+        FALSE, FALSE, NULL);
       conns[link_idx] = NULL;
     }
   }
@@ -13544,8 +13419,8 @@ int ha_spider::close_opened_handler(
         need_mons[link_idx]
       ) {
         error_num2 = spider_ping_table_mon_from_table(
-          trx,
-          trx->thd,
+          wide_handler->trx,
+          wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -13563,16 +13438,18 @@ int ha_spider::close_opened_handler(
       error_num = error_num2;
     }
     spider_clear_bit(r_handler_opened, link_idx);
-    if (release_conn)
+    if (release_conn && !hs_r_conns[link_idx]->join_trx)
     {
       if (
         !hs_r_conns[link_idx]->opened_handlers &&
-        trx->trx_hs_r_conn_adjustment == trx_hs_r_conn_adjustment &&
-        spider_param_hs_r_conn_recycle_mode(trx->thd) != 2
+        wide_handler->trx->trx_hs_r_conn_adjustment ==
+          trx_hs_r_conn_adjustment &&
+        spider_param_hs_r_conn_recycle_mode(wide_handler->trx->thd) != 2
       ) {
-        trx->trx_hs_r_conn_adjustment++;
+        wide_handler->trx->trx_hs_r_conn_adjustment++;
       }
-      spider_free_conn_from_trx(trx, hs_r_conns[link_idx], FALSE, FALSE, NULL);
+      spider_free_conn_from_trx(wide_handler->trx, hs_r_conns[link_idx],
+        FALSE, FALSE, NULL);
       hs_r_conns[link_idx] = NULL;
     }
   }
@@ -13586,8 +13463,8 @@ int ha_spider::close_opened_handler(
         need_mons[link_idx]
       ) {
         error_num2 = spider_ping_table_mon_from_table(
-          trx,
-          trx->thd,
+          wide_handler->trx,
+          wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -13605,16 +13482,18 @@ int ha_spider::close_opened_handler(
       error_num = error_num2;
     }
     spider_clear_bit(w_handler_opened, link_idx);
-    if (release_conn)
+    if (release_conn && !hs_w_conns[link_idx]->join_trx)
     {
       if (
         !hs_w_conns[link_idx]->opened_handlers &&
-        trx->trx_hs_w_conn_adjustment == trx_hs_w_conn_adjustment &&
-        spider_param_hs_w_conn_recycle_mode(trx->thd) != 2
+        wide_handler->trx->trx_hs_w_conn_adjustment ==
+          trx_hs_w_conn_adjustment &&
+        spider_param_hs_w_conn_recycle_mode(wide_handler->trx->thd) != 2
       ) {
-        trx->trx_hs_w_conn_adjustment++;
+        wide_handler->trx->trx_hs_w_conn_adjustment++;
       }
-      spider_free_conn_from_trx(trx, hs_w_conns[link_idx], FALSE, FALSE, NULL);
+      spider_free_conn_from_trx(wide_handler->trx, hs_w_conns[link_idx],
+        FALSE, FALSE, NULL);
       hs_w_conns[link_idx] = NULL;
     }
   }
@@ -13688,8 +13567,8 @@ int ha_spider::index_handler_init()
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -13795,8 +13674,8 @@ int ha_spider::rnd_handler_init()
             need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                trx,
-                trx->thd,
+                wide_handler->trx,
+                wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -13858,7 +13737,7 @@ void ha_spider::set_error_mode()
     case SQLCOM_SHOW_ENGINE_STATUS:
     case SQLCOM_SHOW_ENGINE_MUTEX:
     case SQLCOM_SHOW_PROCESSLIST:
-    case SQLCOM_SHOW_MASTER_STAT:
+    case SQLCOM_SHOW_BINLOG_STAT:
     case SQLCOM_SHOW_SLAVE_STAT:
     case SQLCOM_SHOW_GRANTS:
     case SQLCOM_SHOW_CREATE:
@@ -13980,10 +13859,10 @@ void ha_spider::check_pre_call(
   }
   if (
     use_parallel &&
-    thd->query_id != partition_handler_share->parallel_search_query_id
+    thd->query_id != partition_handler->parallel_search_query_id
   ) {
-    partition_handler_share->parallel_search_query_id = thd->query_id;
-    ++trx->parallel_search_count;
+    partition_handler->parallel_search_query_id = thd->query_id;
+    ++wide_handler->trx->parallel_search_count;
   }
   use_pre_call = use_parallel;
   if (!use_pre_call)
@@ -14005,7 +13884,7 @@ void ha_spider::check_pre_call(
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
 void ha_spider::check_insert_dup_update_pushdown()
 {
-  THD *thd = trx->thd;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("ha_spider::check_insert_dup_update_pushdown");
   DBUG_PRINT("info",("spider this=%p", this));
   if (!spider_param_direct_dup_insert(thd, share->direct_dup_insert))
@@ -14013,8 +13892,8 @@ void ha_spider::check_insert_dup_update_pushdown()
     DBUG_PRINT("info",("spider FALSE by direct_dup_insert"));
     DBUG_VOID_RETURN;
   }
-  direct_update_fields = &thd->lex->update_list;
-  direct_update_values = &thd->lex->value_list;
+  wide_handler->direct_update_fields = &thd->lex->update_list;
+  wide_handler->direct_update_values = &thd->lex->value_list;
   if (!append_dup_update_pushdown_sql_part(NULL, 0))
   {
     result_list.insert_dup_update_pushdown = TRUE;
@@ -14041,8 +13920,8 @@ SPIDER_BULK_ACCESS_LINK *ha_spider::create_bulk_access_link()
 */
   if (!(bulk_access_link = (SPIDER_BULK_ACCESS_LINK *)
     spider_bulk_malloc(spider_current_trx, 168, MYF(MY_WME),
-    &bulk_access_link, sizeof(SPIDER_BULK_ACCESS_LINK),
-    &ref, ALIGN_SIZE(ref_length) * 2,
+    &bulk_access_link, (uint) (sizeof(SPIDER_BULK_ACCESS_LINK)),
+    &ref, (uint) (ALIGN_SIZE(ref_length) * 2),
     NullS))
   ) {
     goto error_bulk_malloc;
@@ -14109,14 +13988,14 @@ int ha_spider::sync_from_clone_source(
   if (!synced_from_clone_source)
   {
     DBUG_PRINT("info",("spider synced from clone source all"));
-    trx = spider->trx;
+    wide_handler->trx = spider->wide_handler->trx;
     sql_command = spider->sql_command;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     conn_kinds = spider->conn_kinds;
     memcpy(conn_kind, spider->conn_kind, sizeof(uint) * share->link_count);
 #endif
-    result_list.lock_type = spider->result_list.lock_type;
-    lock_type = spider->lock_type;
+    wide_handler->external_lock_type =
+      spider->wide_handler->external_lock_type;
     selupd_lock_mode = spider->selupd_lock_mode;
     update_request = spider->update_request;
     lock_mode = spider->lock_mode;
@@ -14157,8 +14036,8 @@ int ha_spider::sync_from_clone_source(
     conn_kinds = spider->conn_kinds;
     memcpy(conn_kind, spider->conn_kind, sizeof(uint) * share->link_count);
 #endif
-    result_list.lock_type = spider->result_list.lock_type;
-    lock_type = spider->lock_type;
+    wide_handler->external_lock_type =
+      spider->wide_handler->external_lock_type;
     selupd_lock_mode = spider->selupd_lock_mode;
     update_request = spider->update_request;
     lock_mode = spider->lock_mode;
@@ -14166,7 +14045,8 @@ int ha_spider::sync_from_clone_source(
     insert_delayed = spider->insert_delayed;
     low_priority = spider->low_priority;
 
-    if ((error_num = spider_check_trx_and_get_conn(spider->trx->thd,
+    if ((error_num = spider_check_trx_and_get_conn(
+      spider->wide_handler->trx->thd,
       this, TRUE)))
     {
       DBUG_RETURN(error_num);
@@ -14208,6 +14088,7 @@ void ha_spider::sync_from_clone_source_base(
     dbton_hdl = dbton_handler[dbton_id];
     dbton_hdl2 = spider->dbton_handler[dbton_id];
     dbton_hdl->first_link_idx = dbton_hdl2->first_link_idx;
+    dbton_hdl->strict_group_by = dbton_hdl2->strict_group_by;
   }
   DBUG_VOID_RETURN;
 }
@@ -14223,6 +14104,7 @@ void ha_spider::set_first_link_idx()
     dbton_id = share->use_dbton_ids[roop_count2];
     dbton_hdl = dbton_handler[dbton_id];
     dbton_hdl->first_link_idx = -1;
+    dbton_hdl->strict_group_by = FALSE;
   }
   for (
     roop_count = spider_conn_link_idx_next(share->link_statuses,
@@ -14241,6 +14123,10 @@ void ha_spider::set_first_link_idx()
       {
         dbton_hdl->first_link_idx = roop_count;
       }
+      if (share->strict_group_bys[all_link_idx])
+      {
+        dbton_hdl->strict_group_by = TRUE;
+      }
     }
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     dbton_id = share->hs_dbton_ids[all_link_idx];
@@ -14251,6 +14137,10 @@ void ha_spider::set_first_link_idx()
       {
         dbton_hdl->first_link_idx = roop_count;
       }
+      if (share->strict_group_bys[all_link_idx])
+      {
+        dbton_hdl->strict_group_by = TRUE;
+      }
     }
 #endif
   }
@@ -15159,7 +15049,7 @@ int ha_spider::append_sum_select_sql_part(
       DBUG_RETURN(error_num);
     }
   }
-  trx->direct_aggregate_count++;
+  wide_handler->trx->direct_aggregate_count++;
   DBUG_RETURN(0);
 }
 #endif
@@ -16123,14 +16013,16 @@ int ha_spider::mk_bulk_tmp_table_and_bulk_start()
       if (
         !tmp_table[roop_count] &&
         !(tmp_table[roop_count] = spider_mk_sys_tmp_table(
-          trx->thd, table, &result_list.upd_tmp_tbl_prms[roop_count],
+          wide_handler->trx->thd, table,
+          &result_list.upd_tmp_tbl_prms[roop_count],
           &field_name, result_list.update_sqls[roop_count].charset()))
       )
 #else
       if (
         !tmp_table[roop_count] &&
         !(tmp_table[roop_count] = spider_mk_sys_tmp_table(
-          trx->thd, table, &result_list.upd_tmp_tbl_prms[roop_count], "a",
+          wide_handler->trx->thd, table,
+          &result_list.upd_tmp_tbl_prms[roop_count], "a",
           result_list.update_sqls[roop_count].charset()))
       )
 #endif
@@ -16150,7 +16042,8 @@ error_2:
     if (tmp_table[roop_count - 1])
     {
       tmp_table[roop_count - 1]->file->ha_end_bulk_insert();
-      spider_rm_sys_tmp_table(trx->thd, tmp_table[roop_count - 1],
+      spider_rm_sys_tmp_table(wide_handler->trx->thd,
+        tmp_table[roop_count - 1],
         &result_list.upd_tmp_tbl_prms[roop_count - 1]);
       tmp_table[roop_count - 1] = NULL;
     }
@@ -16179,7 +16072,8 @@ void ha_spider::rm_bulk_tmp_table()
   {
     if (tmp_table[roop_count - 1])
     {
-      spider_rm_sys_tmp_table(trx->thd, tmp_table[roop_count - 1],
+      spider_rm_sys_tmp_table(wide_handler->trx->thd,
+        tmp_table[roop_count - 1],
         &result_list.upd_tmp_tbl_prms[roop_count - 1]);
       tmp_table[roop_count - 1] = NULL;
     }
@@ -16315,3 +16209,512 @@ int ha_spider::set_union_table_name_pos_sql()
   }
   DBUG_RETURN(0);
 }
+/* For each iterated link, have its dbton handler append to the lock list. */
+int ha_spider::append_lock_tables_list()
+{
+  int error_num, roop_count;
+  DBUG_ENTER("ha_spider::append_lock_tables_list");
+  DBUG_PRINT("info",("spider lock_table_type=%u",
+    wide_handler->lock_table_type));
+
+  if ((error_num = spider_check_trx_and_get_conn(wide_handler->trx->thd, this,
+    FALSE)))
+  {
+    DBUG_RETURN(error_num); /* the helper's error is returned unchanged */
+  }
+
+  if (wide_handler->lock_table_type == 1)
+  {
+    for (
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, -1, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY);
+      roop_count < (int) share->link_count;
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, roop_count, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY)
+    ) {
+      SPIDER_CONN *conn = conns[roop_count]; /* no NULL check in this branch */
+      int appended = 0; /* passed by address to the dbton handler below */
+      if ((error_num = dbton_handler[conn->dbton_id]->
+        append_lock_tables_list(conn, roop_count, &appended)))
+      {
+        DBUG_RETURN(error_num);
+      }
+      if (appended)
+      {
+        conn->table_lock = 2; /* lock_tables() acts on table_lock >= 2 */
+      }
+    }
+  } else if (wide_handler->lock_table_type == 2)
+  {
+    for (
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, -1, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY);
+      roop_count < (int) share->link_count;
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, roop_count, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY)
+    ) {
+      if (
+        conns[roop_count] &&
+        conns[roop_count]->table_lock != 1 && /* skip conns marked 1 */
+        spider_param_semi_table_lock(wide_handler->trx->thd,
+          share->semi_table_lock)
+      ) {
+        SPIDER_CONN *conn = conns[roop_count];
+        int appended = 0; /* passed by address to the dbton handler below */
+        if ((error_num = dbton_handler[conn->dbton_id]->
+          append_lock_tables_list(conn, roop_count, &appended)))
+        {
+          DBUG_RETURN(error_num);
+        }
+        if (appended)
+        {
+          conn->table_lock = 3; /* also >= 2; lock_tables() only turns 2 into 1 */
+        }
+      }
+    }
+  }
+  DBUG_RETURN(0); /* also reached when lock_table_type is neither 1 nor 2 */
+}
+/* Per link: start the remote trx if not joined, then lock or unlock tables. */
+int ha_spider::lock_tables()
+{
+  int error_num, roop_count;
+  DBUG_ENTER("ha_spider::lock_tables");
+  DBUG_PRINT("info",("spider lock_table_type=%u",
+    wide_handler->lock_table_type));
+
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  if ((conn_kinds & SPIDER_CONN_KIND_MYSQL))
+  {
+#endif
+    if (!conns[search_link_idx]) /* no connection for the search link */
+    {
+      my_message(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM,
+        ER_SPIDER_REMOTE_SERVER_GONE_AWAY_STR, MYF(0));
+      DBUG_RETURN(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM);
+    }
+    for (
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, -1, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY);
+      roop_count < (int) share->link_count;
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, roop_count, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY)
+    ) {
+      if (wide_handler->sql_command != SQLCOM_UNLOCK_TABLES)
+      {
+        DBUG_PRINT("info",("spider conns[%d]->join_trx=%u",
+          roop_count, conns[roop_count]->join_trx));
+        if (
+          (!conns[roop_count]->join_trx &&
+            (error_num = spider_internal_start_trx_for_connection(this,
+              conns[roop_count],
+              roop_count)))
+        ) {
+          if (
+            share->monitoring_kind[roop_count] &&
+            need_mons[roop_count]
+          ) {
+            /* error_num is overwritten with the monitor call's result */
+            error_num = spider_ping_table_mon_from_table(
+                wide_handler->trx,
+                wide_handler->trx->thd,
+                share,
+                roop_count,
+                (uint32) share->monitoring_sid[roop_count],
+                share->table_name,
+                share->table_name_length,
+                conn_link_idx[roop_count],
+                NULL,
+                0,
+                share->monitoring_kind[roop_count],
+                share->monitoring_limit[roop_count],
+                share->monitoring_flag[roop_count],
+                TRUE
+              );
+          }
+          DBUG_RETURN(check_error_mode(error_num));
+        }
+        reset_first_link_idx();
+      }
+      if (conns[roop_count]->table_lock >= 2) /* 2 or 3: lock was requested */
+      {
+        if (
+          conns[roop_count]->db_conn->have_lock_table_list() &&
+          (error_num = spider_db_lock_tables(this, roop_count))
+        ) {
+          if (
+            share->monitoring_kind[roop_count] &&
+            need_mons[roop_count]
+          ) {
+            /* error_num is overwritten with the monitor call's result */
+            error_num = spider_ping_table_mon_from_table(
+                wide_handler->trx,
+                wide_handler->trx->thd,
+                share,
+                roop_count,
+                (uint32) share->monitoring_sid[roop_count],
+                share->table_name,
+                share->table_name_length,
+                conn_link_idx[roop_count],
+                NULL,
+                0,
+                share->monitoring_kind[roop_count],
+                share->monitoring_limit[roop_count],
+                share->monitoring_flag[roop_count],
+                TRUE
+              );
+          }
+          conns[roop_count]->table_lock = 0; /* lock call failed: clear state */
+          DBUG_RETURN(check_error_mode(error_num));
+        }
+        if (conns[roop_count]->table_lock == 2)
+          conns[roop_count]->table_lock = 1; /* 3 is left as it is */
+      } else if (wide_handler->sql_command == SQLCOM_UNLOCK_TABLES ||
+        spider_param_internal_unlock(wide_handler->trx->thd) == 1)
+      {
+        if (conns[roop_count]->table_lock == 1)
+        {
+          conns[roop_count]->table_lock = 0; /* cleared before the unlock call */
+          if (!conns[roop_count]->trx_start)
+            conns[roop_count]->disable_reconnect = FALSE;
+          if ((error_num = spider_db_unlock_tables(this, roop_count)))
+          {
+            if (
+              share->monitoring_kind[roop_count] &&
+              need_mons[roop_count]
+            ) {
+              /* error_num is overwritten with the monitor call's result */
+              error_num = spider_ping_table_mon_from_table(
+                  wide_handler->trx,
+                  wide_handler->trx->thd,
+                  share,
+                  roop_count,
+                  (uint32) share->monitoring_sid[roop_count],
+                  share->table_name,
+                  share->table_name_length,
+                  conn_link_idx[roop_count],
+                  NULL,
+                  0,
+                  share->monitoring_kind[roop_count],
+                  share->monitoring_limit[roop_count],
+                  share->monitoring_flag[roop_count],
+                  TRUE
+                );
+            }
+            DBUG_RETURN(check_error_mode(error_num));
+          }
+        }
+      }
+    }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  }
+#endif
+  DBUG_RETURN(0);
+}
+/* DML setup on each link: start trx, sync isolation; sets dml_inited at end. */
+int ha_spider::dml_init()
+{
+  int error_num, roop_count;
+  SPIDER_TRX *trx = wide_handler->trx;
+  THD *thd = trx->thd; /* same object as the trx->thd used further down */
+  bool sync_trx_isolation = spider_param_sync_trx_isolation(thd);
+  DBUG_ENTER("ha_spider::dml_init");
+  if (wide_handler->lock_mode == -2) /* NOTE(review): -2 looks like "unset" */
+  {
+    wide_handler->lock_mode = spider_param_selupd_lock_mode(thd,
+      share->selupd_lock_mode);
+  }
+  if ((error_num = check_access_kind_for_connection(thd,
+    (wide_handler->lock_type >= TL_WRITE_ALLOW_WRITE))))
+  {
+    DBUG_RETURN(error_num);
+  }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  if ((conn_kinds & SPIDER_CONN_KIND_MYSQL))
+  {
+#endif
+    if (!conns[search_link_idx]) /* no connection for the search link */
+    {
+      my_message(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM,
+        ER_SPIDER_REMOTE_SERVER_GONE_AWAY_STR, MYF(0));
+      DBUG_RETURN(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM);
+    }
+    if (wide_handler->sql_command == SQLCOM_TRUNCATE)
+      DBUG_RETURN(0); /* returns without setting dml_inited */
+    for (
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, -1, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY);
+      roop_count < (int) share->link_count;
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, roop_count, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY)
+    ) {
+      DBUG_PRINT("info",("spider conns[%d]->join_trx=%u",
+        roop_count, conns[roop_count]->join_trx));
+      if (
+        (!conns[roop_count]->join_trx &&
+          (error_num = spider_internal_start_trx_for_connection(this,
+            conns[roop_count],
+            roop_count)))
+      ) {
+        if (
+          share->monitoring_kind[roop_count] &&
+          need_mons[roop_count]
+        ) {
+          /* error_num is overwritten with the monitor call's result */
+          error_num = spider_ping_table_mon_from_table(
+              trx,
+              trx->thd,
+              share,
+              roop_count,
+              (uint32) share->monitoring_sid[roop_count],
+              share->table_name,
+              share->table_name_length,
+              conn_link_idx[roop_count],
+              NULL,
+              0,
+              share->monitoring_kind[roop_count],
+              share->monitoring_limit[roop_count],
+              share->monitoring_flag[roop_count],
+              TRUE
+            );
+        }
+        DBUG_RETURN(check_error_mode(error_num));
+      }
+      reset_first_link_idx();
+      if (
+        conns[roop_count]->semi_trx_isolation == -2 &&
+        conns[roop_count]->semi_trx_isolation_chk == TRUE &&
+        sync_trx_isolation &&
+        spider_param_semi_trx_isolation(trx->thd) >= 0
+      ) {
+        /* queue the semi isolation level instead of checking it right now */
+        spider_conn_queue_semi_trx_isolation(conns[roop_count],
+          spider_param_semi_trx_isolation(trx->thd));
+      } else {
+        if (sync_trx_isolation)
+        {
+          if ((error_num = spider_check_and_set_trx_isolation(
+            conns[roop_count], &need_mons[roop_count])))
+          {
+            if (
+              share->monitoring_kind[roop_count] &&
+              need_mons[roop_count]
+            ) {
+              /* error_num is overwritten with the monitor call's result */
+              error_num = spider_ping_table_mon_from_table(
+                  trx,
+                  trx->thd,
+                  share,
+                  roop_count,
+                  (uint32) share->monitoring_sid[roop_count],
+                  share->table_name,
+                  share->table_name_length,
+                  conn_link_idx[roop_count],
+                  NULL,
+                  0,
+                  share->monitoring_kind[roop_count],
+                  share->monitoring_limit[roop_count],
+                  share->monitoring_flag[roop_count],
+                  TRUE
+                );
+            }
+            DBUG_RETURN(check_error_mode(error_num));
+          }
+        }
+        conns[roop_count]->semi_trx_isolation = -1; /* set even without sync */
+      }
+    }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  } else {
+    reset_first_link_idx();
+  }
+  if ((conn_kinds & SPIDER_CONN_KIND_HS_READ))
+  {
+    SPIDER_CONN *hs_conn;
+    for (
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, -1, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY);
+      roop_count < (int) share->link_count;
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, roop_count, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY)
+    ) {
+      hs_conn = hs_r_conns[roop_count];
+      if (
+        hs_conn &&
+        hs_conn->hsc_query_id != thd->query_id &&
+        hs_conn->hs_pre_age == hs_conn->hs_age
+      ) {
+        double interval = spider_param_hs_ping_interval(thd);
+        time_t tmp_time = (time_t) time((time_t*) 0);
+        DBUG_PRINT("info",
+          ("spider difftime=%f", difftime(tmp_time, hs_conn->ping_time)));
+        DBUG_PRINT("info", ("spider interval=%f", interval));
+        if (
+          hs_conn->server_lost ||
+          difftime(tmp_time, hs_conn->ping_time) >= interval
+        ) {
+          DBUG_PRINT("info", ("spider hsr[%d] need reconnect", roop_count));
+          hs_conn->hs_pre_age++; /* hs_pre_age != hs_age from here on */
+          hs_conn->ping_time = tmp_time;
+        }
+        hs_conn->hsc_query_id = thd->query_id; /* checked once per query_id */
+      }
+    }
+  }
+  if (
+#if defined(HS_HAS_SQLCOM) && defined(HANDLER_HAS_DIRECT_UPDATE_ROWS)
+    (
+#endif
+      conn_kinds & SPIDER_CONN_KIND_HS_WRITE
+#if defined(HS_HAS_SQLCOM) && defined(HANDLER_HAS_DIRECT_UPDATE_ROWS)
+    ) ||
+    /* for direct_update */
+    wide_handler->sql_command == SQLCOM_HS_UPDATE ||
+    wide_handler->sql_command == SQLCOM_HS_DELETE
+#endif
+  ) {
+    SPIDER_CONN *hs_conn;
+    for (
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, -1, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY);
+      roop_count < (int) share->link_count;
+      roop_count = spider_conn_link_idx_next(share->link_statuses,
+        conn_link_idx, roop_count, share->link_count,
+        SPIDER_LINK_STATUS_RECOVERY)
+    ) {
+      hs_conn = hs_w_conns[roop_count];
+      if (
+        hs_conn &&
+        hs_conn->hsc_query_id != thd->query_id &&
+        hs_conn->hs_pre_age == hs_conn->hs_age
+      ) {
+        double interval = spider_param_hs_ping_interval(thd);
+        time_t tmp_time = (time_t) time((time_t*) 0);
+        DBUG_PRINT("info",
+          ("spider difftime=%f", difftime(tmp_time, hs_conn->ping_time)));
+        DBUG_PRINT("info", ("spider interval=%f", interval));
+        if (
+          hs_conn->server_lost ||
+          difftime(tmp_time, hs_conn->ping_time) >= interval
+        ) {
+          DBUG_PRINT("info", ("spider hsw[%d] need reconnect", roop_count));
+          hs_conn->hs_pre_age++; /* hs_pre_age != hs_age from here on */
+          hs_conn->ping_time = tmp_time;
+        }
+        hs_conn->hsc_query_id = thd->query_id; /* checked once per query_id */
+      }
+    }
+  }
+#endif
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  if (wide_handler->insert_with_update)
+  {
+    check_insert_dup_update_pushdown();
+  }
+#endif
+  dml_inited = TRUE; /* only reached when no step above returned early */
+  DBUG_RETURN(0);
+}
+
+#ifdef HA_CAN_BULK_ACCESS
+/* Begin or extend a bulk-access chain; the current link is returned in *info. */
+int ha_spider::bulk_access_begin(void *info)
+{
+  int error_num; /* result of the clone sync / access-kind checks below */
+  DBUG_ENTER("ha_spider::bulk_access_begin");
+  DBUG_PRINT("info",("spider this=%p", this));
+  if (bulk_access_started)
+  {
+    /* already inside a sequence: step to the next link, creating it on demand */
+    if (!bulk_access_link_current->next)
+    {
+      if (!(bulk_access_link_current->next = create_bulk_access_link()))
+      {
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      }
+      bulk_access_link_current->next->sequence_num =
+        bulk_access_link_current->sequence_num + 1;
+    }
+    bulk_access_link_current = bulk_access_link_current->next;
+  } else {
+    /* first call of a sequence: (re)use the head link, numbered 0 */
+    if (!bulk_access_link_first)
+    {
+      if (!(bulk_access_link_first = create_bulk_access_link()))
+      {
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      }
+      bulk_access_link_first->sequence_num = 0;
+    }
+    bulk_access_link_current = bulk_access_link_first;
+    bulk_access_started = TRUE;
+    bulk_access_executing = FALSE;
+  }
+  bulk_access_link_current->spider->check_access_kind(wide_handler->trx->thd);
+  if (
+    (error_num = bulk_access_link_current->spider->
+      sync_from_clone_source(this)) ||
+    (error_num = bulk_access_link_current->spider->
+      check_access_kind_for_connection(wide_handler->trx->thd,
+        /* lock_type is read through wide_handler, as dml_init() does */
+        (wide_handler->lock_type >= TL_WRITE_ALLOW_WRITE)))
+  ) {
+    DBUG_RETURN(error_num);
+  }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  memset(
+    bulk_access_link_current->spider->result_list.hs_r_bulk_open_index, 0,
+    share->link_bitmap_size);
+  memset(
+    bulk_access_link_current->spider->result_list.hs_w_bulk_open_index, 0,
+    share->link_bitmap_size);
+#endif
+  bulk_access_link_current->spider->bulk_access_executing = FALSE;
+  bulk_access_link_current->spider->bulk_access_pre_called = FALSE;
+  bulk_access_link_current->used = TRUE;
+  bulk_access_link_current->called = FALSE;
+  *((void **) info) = bulk_access_link_current; /* info is treated as void ** */
+  DBUG_RETURN(0);
+}
+/* Make the link passed in info the executing target; always returns 0. */
+int ha_spider::bulk_access_current(
+  void *info
+) {
+  DBUG_ENTER("ha_spider::bulk_access_current");
+  DBUG_PRINT("info",("spider this=%p", this));
+  bulk_access_executing = TRUE;
+  bulk_access_link_exec_tgt = (SPIDER_BULK_ACCESS_LINK *) info;
+  if (bulk_access_link_exec_tgt->spider->pt_clone_source_handler != this)
+  {
+    DBUG_PRINT("info",("spider this=%p", this));
+    DBUG_PRINT("info",("spider pt_clone_source_handler=%p",
+      bulk_access_link_exec_tgt->spider->pt_clone_source_handler));
+    /* partitioned: search this handler's own chain for the same sequence_num */
+    uint sequence_num = bulk_access_link_exec_tgt->sequence_num;
+    for (
+      bulk_access_link_exec_tgt = bulk_access_link_first;
+      bulk_access_link_exec_tgt;
+      bulk_access_link_exec_tgt = bulk_access_link_exec_tgt->next
+    ) {
+      if (bulk_access_link_exec_tgt->sequence_num >= sequence_num)
+      {
+        DBUG_ASSERT(
+          bulk_access_link_exec_tgt->sequence_num == sequence_num);
+        break;
+      }
+    }
+  } /* NOTE(review): target is NULL here if the loop matched nothing - confirm */
+  bulk_access_link_exec_tgt->spider->bulk_access_executing = TRUE;
+  DBUG_RETURN(0);
+}
+/* End the bulk-access sequence; only the bulk_access_started flag is reset. */
+void ha_spider::bulk_access_end()
+{
+  DBUG_ENTER("ha_spider::bulk_access_end");
+  DBUG_PRINT("info",("spider this=%p", this));
+  bulk_access_started = FALSE; /* next bulk_access_begin() takes its else path */
+  DBUG_VOID_RETURN;
+}
+#endif
diff --git a/storage/spider/ha_spider.h b/storage/spider/ha_spider.h
index cb0a2abcc06..3036f8d522a 100644
--- a/storage/spider/ha_spider.h
+++ b/storage/spider/ha_spider.h
@@ -1,5 +1,5 @@
 /* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+   Copyright (C) 2019-2022 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -49,12 +49,10 @@ struct st_spider_ft_info
   String *key;
 };
 
-class ha_spider: public handler
+class ha_spider final : public handler
 {
 public:
-  THR_LOCK_DATA      lock;
   SPIDER_SHARE       *share;
-  SPIDER_TRX         *trx;
   ulonglong          spider_thread_id;
   ulonglong          trx_conn_adjustment;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
@@ -90,18 +88,15 @@ public:
   int                search_link_idx;
   int                result_link_idx;
   SPIDER_RESULT_LIST result_list;
-  SPIDER_CONDITION   *condition;
   spider_string      *blob_buff;
-  uchar              *searched_bitmap;
-  uchar              *ft_discard_bitmap;
-  bool               position_bitmap_init;
-  uchar              *position_bitmap;
   SPIDER_POSITION    *pushed_pos;
   SPIDER_POSITION    pushed_pos_buf;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  SPIDER_PARTITION_HANDLER_SHARE *partition_handler_share;
-  ha_spider          *pt_handler_share_creator;
+  SPIDER_PARTITION_HANDLER *partition_handler;
 #endif
+  bool                wide_handler_owner = FALSE;
+  SPIDER_WIDE_HANDLER *wide_handler = NULL;
+
 #ifdef HA_CAN_BULK_ACCESS
   int                pre_direct_init_result;
   bool               is_bulk_access_clone;
@@ -116,10 +111,8 @@ public:
   bool               init_ha_mem_root;
   MEM_ROOT           ha_mem_root;
 */
-  ulonglong          external_lock_cnt;
 #endif
   bool               is_clone;
-  bool               clone_bitmap_init;
   ha_spider          *pt_clone_source_handler;
   ha_spider          *pt_clone_last_searcher;
   bool               use_index_merge;
@@ -161,22 +154,11 @@ public:
 
   ha_spider          *next;
 
+  bool               dml_inited;
   bool               rnd_scan_and_first;
-  bool               quick_mode;
-  bool               keyread;
-  bool               ignore_dup_key;
-  bool               write_can_replace;
-  bool               insert_with_update;
-  bool               low_priority;
-  bool               high_priority;
-  bool               insert_delayed;
   bool               use_pre_call;
   bool               use_pre_action;
   bool               pre_bitmap_checked;
-  enum thr_lock_type lock_type;
-  int                lock_mode;
-  uint               sql_command;
-  int                selupd_lock_mode;
   bool               bulk_insert;
 #ifdef HANDLER_HAS_NEED_INFO_FOR_AUTO_INC
   bool               info_auto_called;
@@ -189,12 +171,9 @@ public:
   int                store_error_num;
   uint               dup_key_idx;
   int                select_column_mode;
-  bool               update_request;
   bool               pk_update;
   bool               force_auto_increment;
   int                bka_mode;
-  bool               cond_check;
-  int                cond_check_error;
   int                error_mode;
   ulonglong          store_last_insert_id;
 
@@ -216,14 +195,7 @@ public:
   uint32             **hs_w_ret_fields;
   size_t             *hs_r_ret_fields_num;
   size_t             *hs_w_ret_fields_num;
-  uint32             *hs_pushed_ret_fields;
-  size_t             hs_pushed_ret_fields_num;
-  size_t             hs_pushed_ret_fields_size;
-  size_t             hs_pushed_lcl_fields_num;
   uchar              *tmp_column_bitmap;
-  bool               hs_increment;
-  bool               hs_decrement;
-  uint32             hs_pushed_strref_num;
 #endif
 #endif
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
@@ -232,11 +204,6 @@ public:
   bool               maybe_do_hs_direct_update;
 #endif
   uint               direct_update_kinds;
-  List<Item>         *direct_update_fields;
-  List<Item>         *direct_update_values;
-#endif
-#ifdef INFO_KIND_FORCE_LIMIT_BEGIN
-  longlong           info_limit;
 #endif
   spider_index_rnd_init prev_index_rnd_init;
 #ifdef HANDLER_HAS_DIRECT_AGGREGATE
@@ -283,10 +250,13 @@ public:
     uint test_if_locked
   );
   int close();
-  int check_access_kind(
+  int check_access_kind_for_connection(
     THD *thd,
     bool write_request
   );
+  void check_access_kind(
+    THD *thd
+  );
 #ifdef HA_CAN_BULK_ACCESS
   int additional_lock(
     THD *thd,
@@ -302,6 +272,10 @@ public:
     THD *thd,
     int lock_type
   );
+  int start_stmt(
+    THD *thd,
+    thr_lock_type lock_type
+  );
   int reset();
   int extra(
     enum ha_extra_function operation
@@ -514,8 +488,9 @@ public:
   );
   ha_rows records_in_range(
     uint inx,
-    key_range *start_key,
-    key_range *end_key
+    const key_range *start_key,
+    const key_range *end_key,
+    page_range *pages
   );
   int check_crd();
   int pre_records();
@@ -526,6 +501,7 @@ public:
 #endif
   const char *table_type() const;
   ulonglong table_flags() const;
+  ulong table_flags_for_partition();
   const char *index_type(
     uint key_number
   );
@@ -610,6 +586,11 @@ public:
   );
 #endif
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  bool check_direct_update_sql_part(
+    st_select_lex *select_lex,
+    longlong select_limit,
+    longlong offset_limit
+  );
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS_WITH_HS
 #ifdef SPIDER_MDEV_16246
   inline int direct_update_rows_init(
@@ -734,6 +715,11 @@ public:
     const uchar *buf
   );
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  bool check_direct_delete_sql_part(
+    st_select_lex *select_lex,
+    longlong select_limit,
+    longlong offset_limit
+  );
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS_WITH_HS
   inline int direct_delete_rows_init()
   {
@@ -870,9 +856,6 @@ public:
     int error_num,
     uint flags
   );
-  Field *get_top_table_field(
-    uint16 field_index
-  );
   Field *field_exchange(
     Field *field
   );
@@ -888,7 +871,6 @@ public:
   void return_record_by_parent();
 #endif
   TABLE *get_table();
-  TABLE *get_top_table();
   void set_ft_discard_bitmap();
   void set_searched_bitmap();
   void set_clone_searched_bitmap();
@@ -1256,4 +1238,16 @@ public:
 #endif
   int init_union_table_name_pos_sql();
   int set_union_table_name_pos_sql();
+  int append_lock_tables_list();
+  int lock_tables();
+  int dml_init();
+#ifdef HA_CAN_BULK_ACCESS
+  int bulk_access_begin(
+    void *info
+  );
+  int bulk_access_current(
+    void *info
+  );
+  void bulk_access_end();
+#endif
 };
diff --git a/storage/spider/hs_client/config.cpp b/storage/spider/hs_client/config.cpp
index 97d479220e0..0003c3fd645 100644
--- a/storage/spider/hs_client/config.cpp
+++ b/storage/spider/hs_client/config.cpp
@@ -35,8 +35,8 @@ conf_get_key(
 
 config::config()
 {
-  if (my_hash_init(&conf_hash, &my_charset_bin, 32, 0, 0,
-    (my_hash_get_key) conf_get_key, 0, 0))
+  if (my_hash_init(PSI_INSTRUMENT_ME, &conf_hash, &my_charset_bin, 32, 0, 0,
+                   (my_hash_get_key) conf_get_key, 0, 0))
     init = FALSE;
   else
     init = TRUE;
diff --git a/storage/spider/hs_client/hs_compat.h b/storage/spider/hs_client/hs_compat.h
index 8505d7978b7..fb9b02ad4f0 100644
--- a/storage/spider/hs_client/hs_compat.h
+++ b/storage/spider/hs_client/hs_compat.h
@@ -16,7 +16,12 @@
 #ifndef HS_COMPAT_H
 #define HS_COMPAT_H
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100213
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100500
+#define SPD_INIT_DYNAMIC_ARRAY2(A, B, C, D, E, F) \
+  my_init_dynamic_array2(PSI_INSTRUMENT_ME, A, B, C, D, E, F)
+#define SPD_INIT_ALLOC_ROOT(A, B, C, D) \
+  init_alloc_root(PSI_INSTRUMENT_ME, A, B, C, D)
+#elif defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100213
 #define SPD_INIT_DYNAMIC_ARRAY2(A, B, C, D, E, F) \
   my_init_dynamic_array2(A, B, C, D, E, F)
 #define SPD_INIT_ALLOC_ROOT(A, B, C, D) \
diff --git a/storage/spider/mysql-test/spider/bugfix/include/insert_select_deinit.inc b/storage/spider/mysql-test/spider/bugfix/include/insert_select_deinit.inc
new file mode 100644
index 00000000000..b8b2f200689
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/include/insert_select_deinit.inc
@@ -0,0 +1,16 @@
+--connection master_1
+set session spider_direct_dup_insert= @old_spider_direct_dup_insert;
+--let $MASTER_1_COMMENT_2_1= $MASTER_1_COMMENT_2_1_BACKUP
+--let $MASTER_1_COMMENT_2_1_2= $MASTER_1_COMMENT_2_1_2_BACKUP
+--let $CHILD2_1_DROP_TABLES= $CHILD2_1_DROP_TABLES_BACKUP
+--let $CHILD2_1_DROP_TABLES2= $CHILD2_1_DROP_TABLES2_BACKUP
+--let $CHILD2_1_CREATE_TABLES= $CHILD2_1_CREATE_TABLES_BACKUP
+--let $CHILD2_1_CREATE_TABLES2= $CHILD2_1_CREATE_TABLES2_BACKUP
+--let $CHILD2_1_SELECT_TABLES= $CHILD2_1_SELECT_TABLES_BACKUP
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
diff --git a/storage/spider/mysql-test/spider/bugfix/include/insert_select_init.inc b/storage/spider/mysql-test/spider/bugfix/include/insert_select_init.inc
new file mode 100644
index 00000000000..62a8821a3c9
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/include/insert_select_init.inc
@@ -0,0 +1,43 @@
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../t/test_init.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
+--let $MASTER_1_COMMENT_2_1_BACKUP= $MASTER_1_COMMENT_2_1
+let $MASTER_1_COMMENT_2_1=
+  COMMENT='table "tbl_a", srv "s_2_1"';
+--let $MASTER_1_COMMENT_2_1_2_BACKUP= $MASTER_1_COMMENT_2_1_2
+let $MASTER_1_COMMENT_2_1_2=
+  COMMENT='table "tbl_b", srv "s_2_1"';
+--let $CHILD2_1_DROP_TABLES_BACKUP= $CHILD2_1_DROP_TABLES
+let $CHILD2_1_DROP_TABLES=
+  DROP TABLE IF EXISTS tbl_a;
+--let $CHILD2_1_DROP_TABLES2_BACKUP= $CHILD2_1_DROP_TABLES2
+let $CHILD2_1_DROP_TABLES2=
+  DROP TABLE IF EXISTS tbl_b;
+--let $CHILD2_1_CREATE_TABLES_BACKUP= $CHILD2_1_CREATE_TABLES
+let $CHILD2_1_CREATE_TABLES=
+  CREATE TABLE tbl_a (
+    skey int NOT NULL,
+    dt date NOT NULL,
+    tm time NOT NULL,
+    KEY idx1 (skey,dt,tm)
+  ) $CHILD2_1_ENGINE $CHILD2_1_CHARSET;
+--let $CHILD2_1_CREATE_TABLES2_BACKUP= $CHILD2_1_CREATE_TABLES2
+let $CHILD2_1_CREATE_TABLES2=
+  CREATE TABLE tbl_b (
+    pkey int NOT NULL,
+    dt datetime NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (dt)
+  ) $CHILD2_1_ENGINE $CHILD2_1_CHARSET;
+--let $CHILD2_1_SELECT_TABLES_BACKUP= $CHILD2_1_SELECT_TABLES
+let $CHILD2_1_SELECT_TABLES=
+  SELECT pkey, dt FROM tbl_b ORDER BY pkey;
+let $CHILD2_1_SELECT_ARGUMENT1=
+  SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
+--connection master_1
+set @old_spider_direct_dup_insert= @@spider_direct_dup_insert;
+set session spider_direct_dup_insert= 1;
diff --git a/storage/spider/mysql-test/spider/bugfix/include/self_reference_deinit.inc b/storage/spider/mysql-test/spider/bugfix/include/self_reference_deinit.inc
new file mode 100644
index 00000000000..34376d50584
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/include/self_reference_deinit.inc
@@ -0,0 +1,10 @@
+--connection master_1
+set spider_same_server_link= @old_spider_same_server_link;
+--let $MASTER_1_COMMENT_2_1= $MASTER_1_COMMENT_2_1_BACKUP
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
diff --git a/storage/spider/mysql-test/spider/bugfix/include/self_reference_init.inc b/storage/spider/mysql-test/spider/bugfix/include/self_reference_init.inc
new file mode 100644
index 00000000000..b7e2c4d02ca
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/include/self_reference_init.inc
@@ -0,0 +1,13 @@
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../t/test_init.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
+--let $MASTER_1_COMMENT_2_1_BACKUP= $MASTER_1_COMMENT_2_1
+let $MASTER_1_COMMENT_2_1=
+  COMMENT='table "tbl_a", host "127.0.0.1", port "$MASTER_1_MYPORT", user "root"';
+--connection master_1
+set @old_spider_same_server_link= @@spider_same_server_link;
+set spider_same_server_link= ON;
diff --git a/storage/spider/mysql-test/spider/bugfix/include/strict_group_by_deinit.inc b/storage/spider/mysql-test/spider/bugfix/include/strict_group_by_deinit.inc
new file mode 100644
index 00000000000..2b0f9cbb701
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/include/strict_group_by_deinit.inc
@@ -0,0 +1,18 @@
+--let $MASTER_1_COMMENT_2_1= $MASTER_1_COMMENT_2_1_BACKUP
+--let $CHILD2_1_DROP_TABLES= $CHILD2_1_DROP_TABLES_BACKUP
+--let $CHILD2_1_CREATE_TABLES= $CHILD2_1_CREATE_TABLES_BACKUP
+--let $CHILD2_1_SELECT_TABLES= $CHILD2_1_SELECT_TABLES_BACKUP
+--let $CHILD2_2_DROP_TABLES= $CHILD2_2_DROP_TABLES_BACKUP
+--let $CHILD2_2_CREATE_TABLES= $CHILD2_2_CREATE_TABLES_BACKUP
+--let $CHILD2_2_SELECT_TABLES= $CHILD2_2_SELECT_TABLES_BACKUP
+--connection child2_1
+set global sql_mode= @old_sql_mode;
+--connection master_1
+set session spider_sync_sql_mode= @old_spider_sync_sql_mode;
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
diff --git a/storage/spider/mysql-test/spider/bugfix/include/strict_group_by_init.inc b/storage/spider/mysql-test/spider/bugfix/include/strict_group_by_init.inc
new file mode 100644
index 00000000000..15c2a9eed7c
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/include/strict_group_by_init.inc
@@ -0,0 +1,60 @@
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../t/test_init.inc
+if (!$HAVE_PARTITION)
+{
+  --source strict_group_by_deinit.inc
+  --enable_result_log
+  --enable_query_log
+  --enable_warnings
+  skip Test requires partitioning;
+}
+--enable_result_log
+--enable_query_log
+--enable_warnings
+--let $MASTER_1_COMMENT_2_1_BACKUP= $MASTER_1_COMMENT_2_1
+let $MASTER_1_COMMENT_2_1=
+  COMMENT='table "tbl_a"'
+  PARTITION BY KEY(pkey) (
+    PARTITION pt1 COMMENT='srv "s_2_1"',
+    PARTITION pt2 COMMENT='srv "s_2_2"'
+  );
+--let $CHILD2_1_DROP_TABLES_BACKUP= $CHILD2_1_DROP_TABLES
+let $CHILD2_1_DROP_TABLES=
+  DROP TABLE IF EXISTS tbl_a;
+--let $CHILD2_1_CREATE_TABLES_BACKUP= $CHILD2_1_CREATE_TABLES
+let $CHILD2_1_CREATE_TABLES=
+  CREATE TABLE tbl_a (
+    pkey int NOT NULL,
+    skey int NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (skey)
+  ) $CHILD2_1_ENGINE $CHILD2_1_CHARSET;
+--let $CHILD2_1_SELECT_TABLES_BACKUP= $CHILD2_1_SELECT_TABLES
+let $CHILD2_1_SELECT_TABLES=
+  SELECT pkey, skey FROM tbl_a ORDER BY pkey;
+let $CHILD2_1_SELECT_ARGUMENT1=
+  SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
+--let $CHILD2_2_DROP_TABLES_BACKUP= $CHILD2_2_DROP_TABLES
+let $CHILD2_2_DROP_TABLES=
+  DROP TABLE IF EXISTS tbl_a;
+--let $CHILD2_2_CREATE_TABLES_BACKUP= $CHILD2_2_CREATE_TABLES
+let $CHILD2_2_CREATE_TABLES=
+  CREATE TABLE tbl_a (
+    pkey int NOT NULL,
+    skey int NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (skey)
+  ) $CHILD2_2_ENGINE $CHILD2_2_CHARSET;
+--let $CHILD2_2_SELECT_TABLES_BACKUP= $CHILD2_2_SELECT_TABLES
+let $CHILD2_2_SELECT_TABLES=
+  SELECT pkey, skey FROM tbl_a ORDER BY pkey;
+let $CHILD2_2_SELECT_ARGUMENT1=
+  SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
+--connection master_1
+set @old_spider_sync_sql_mode= @@spider_sync_sql_mode;
+set session spider_sync_sql_mode= FALSE;
+--connection child2_1
+set @old_sql_mode= @@sql_mode;
+set global sql_mode= 'ONLY_FULL_GROUP_BY';
diff --git a/storage/spider/mysql-test/spider/bugfix/r/insert_select.result b/storage/spider/mysql-test/spider/bugfix/r/insert_select.result
new file mode 100644
index 00000000000..0783995e287
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/insert_select.result
@@ -0,0 +1,102 @@
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+connection master_1;
+set @old_spider_direct_dup_insert= @@spider_direct_dup_insert;
+set session spider_direct_dup_insert= 1;
+
+this test is for MDEV-16399
+
+drop and create databases
+connection master_1;
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+connection child2_1;
+SET @old_log_output = @@global.log_output;
+SET GLOBAL log_output = 'TABLE,FILE';
+CREATE DATABASE auto_test_remote;
+USE auto_test_remote;
+
+create table and insert
+connection child2_1;
+CHILD2_1_CREATE_TABLES
+CHILD2_1_CREATE_TABLES2
+TRUNCATE TABLE mysql.general_log;
+connection master_1;
+CREATE TABLE tbl_a (
+skey int NOT NULL,
+dt date NOT NULL,
+tm time NOT NULL,
+KEY idx1 (skey,dt,tm)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1
+CREATE TABLE tbl_b (
+pkey int NOT NULL,
+dt datetime NOT NULL,
+PRIMARY KEY (pkey),
+KEY idx1 (dt)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1_2
+INSERT INTO tbl_a (skey, dt, tm) VALUES (0, '2012-01-01', '12:00:00'),(1, '2012-02-01', '12:00:00'),(2, '2012-03-01', '12:00:00'),(3, '2012-04-01', '12:00:00'),(4, '2012-05-01', '12:00:00'),(5, '2012-06-01', '12:00:00'),(6, '2012-07-01', '12:00:00'),(7, '2012-08-01', '12:00:00'),(8, '2012-09-01', '12:00:00'),(9, '2012-10-01', '12:00:00');
+INSERT INTO tbl_a (skey, dt, tm) VALUES (0, '2013-01-01', '13:00:00'),(1, '2013-02-01', '13:00:00'),(2, '2013-03-01', '13:00:00'),(3, '2013-04-01', '13:00:00'),(4, '2013-05-01', '13:00:00'),(5, '2013-06-01', '13:00:00'),(6, '2013-07-01', '13:00:00'),(7, '2013-08-01', '13:00:00'),(8, '2013-09-01', '13:00:00'),(9, '2013-10-01', '13:00:00');
+INSERT INTO tbl_a (skey, dt, tm) VALUES (0, '2012-11-01', '11:00:00'),(1, '2012-12-01', '11:00:00'),(2, '2012-11-30', '11:00:00'),(3, '2012-11-29', '11:00:00'),(4, '2012-11-28', '11:00:00'),(5, '2012-11-27', '11:00:00'),(6, '2012-11-26', '11:00:00'),(7, '2012-11-25', '11:00:00'),(8, '2012-11-24', '11:00:00'),(9, '2012-11-23', '11:00:00');
+FLUSH TABLES;
+
+select test 1
+connection child2_1;
+TRUNCATE TABLE mysql.general_log;
+connection master_1;
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 4 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 3 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 2 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 1 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 0 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 9 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 8 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 7 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 6 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 5 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+connection child2_1;
+SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
+argument
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 4) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 3) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 2) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 1) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 0) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 9) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 8) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 7) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 6) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+select t0.`skey` `skey`,cast((concat(t0.`dt` , _latin1' ' , t0.`tm`)) as datetime) `CAST(CONCAT(dt, ' ', tm) AS datetime)` from `auto_test_remote`.`tbl_a` t0 where ((t0.`skey` = 5) and (t0.`dt` > _latin1'2012-11-21')) lock in share mode
+SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
+SELECT pkey, dt FROM tbl_b ORDER BY pkey;
+pkey	dt
+0	2013-01-01 13:00:00
+1	2012-12-01 11:00:00
+2	2012-11-30 11:00:00
+3	2012-11-29 11:00:00
+4	2012-11-28 11:00:00
+5	2012-11-27 11:00:00
+6	2012-11-26 11:00:00
+7	2012-11-25 11:00:00
+8	2012-11-24 11:00:00
+9	2012-11-23 11:00:00
+
+deinit
+connection master_1;
+DROP DATABASE IF EXISTS auto_test_local;
+connection child2_1;
+DROP DATABASE IF EXISTS auto_test_remote;
+SET GLOBAL log_output = @old_log_output;
+connection master_1;
+set session spider_direct_dup_insert= @old_spider_direct_dup_insert;
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+
+end of test
diff --git a/storage/spider/mysql-test/spider/bugfix/r/mdev_22246.result b/storage/spider/mysql-test/spider/bugfix/r/mdev_22246.result
index 749c750e018..0254d8bfd1c 100644
--- a/storage/spider/mysql-test/spider/bugfix/r/mdev_22246.result
+++ b/storage/spider/mysql-test/spider/bugfix/r/mdev_22246.result
@@ -47,10 +47,10 @@ SELECT * FROM tbl_a;
 id	node
 2	DB-G1
 1	DB-G0
-SELECT * FROM tbl_a WHERE id != 0;
+SELECT * FROM tbl_a WHERE id <0 || id >0;
 id	node
-2	DB-G1
 1	DB-G0
+2	DB-G1
 connection child2_1;
 SELECT * FROM tbl_a ORDER BY id;
 id	node
diff --git a/storage/spider/mysql-test/spider/bugfix/r/mdev_25684.result b/storage/spider/mysql-test/spider/bugfix/r/mdev_25684.result
new file mode 100644
index 00000000000..8d4bea79476
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/mdev_25684.result
@@ -0,0 +1,18 @@
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+
+MDEV-25684 Crash in THD::find_temporary_table while calling spider_direct_sql UDF without temporary table created
+
+connection master_1;
+SELECT SPIDER_DIRECT_SQL('SELECT * FROM s', 'non_existing_temporary_table', 'srv "s_2_1"');
+ERROR HY000: Temporary table 'test.non_existing_temporary_table' is not found
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
diff --git a/storage/spider/mysql-test/spider/bugfix/r/mdev_26158.result b/storage/spider/mysql-test/spider/bugfix/r/mdev_26158.result
new file mode 100644
index 00000000000..2870dab2702
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/mdev_26158.result
@@ -0,0 +1,27 @@
+#
+# MDEV-26158 SIGSEGV in spider_free_mem from ha_spider::open on INSERT
+#
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+connection master_1;
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+CREATE TABLE t (
+c INT
+) ENGINE=Spider DEFAULT CHARSET=utf8 COMMENT='table "tbl_a"'
+PARTITION BY LIST COLUMNS(`c`) (
+PARTITION `pt1` DEFAULT COMMENT = 'srv "s_2_1"'
+);
+INSERT INTO t SELECT * FROM t;
+ERROR 42000: Unknown database 'auto_test_remote'
+DROP DATABASE auto_test_local;
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
diff --git a/storage/spider/mysql-test/spider/bugfix/r/mdev_26539.result b/storage/spider/mysql-test/spider/bugfix/r/mdev_26539.result
new file mode 100644
index 00000000000..4e195fddfad
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/mdev_26539.result
@@ -0,0 +1,36 @@
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+#
+# MDEV-26539 SIGSEGV in spider_check_and_set_trx_isolation and I_P_List_iterator from THD::drop_temporary_table (10.5.3 opt only) on ALTER
+#
+connection child2_1;
+CREATE DATABASE auto_test_remote;
+USE auto_test_remote;
+CREATE TABLE tbl_a (
+c INT
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+connection master_1;
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+CREATE TABLE tbl_a (
+c INT
+) ENGINE=Spider DEFAULT CHARSET=utf8 COMMENT='table "tbl_a"' PARTITION BY LIST COLUMNS (c) (
+PARTITION pt1 DEFAULT COMMENT = 'srv "s_2_1"'
+);
+INSERT INTO tbl_a VALUES (1);
+ALTER TABLE tbl_a CHECK PARTITION ALL;
+Table	Op	Msg_type	Msg_text
+auto_test_local.tbl_a	check	status	OK
+DROP DATABASE auto_test_local;
+connection child2_1;
+DROP DATABASE auto_test_remote;
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
diff --git a/storage/spider/mysql-test/spider/bugfix/r/mdev_26582.result b/storage/spider/mysql-test/spider/bugfix/r/mdev_26582.result
new file mode 100644
index 00000000000..54a4fc44b48
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/mdev_26582.result
@@ -0,0 +1,11 @@
+#
+# MDEV-26582 SIGSEGV in spider_db_bulk_insert and spider_db_connect and spider_db_before_query, and hang in "End of update loop" / "Reset for next command" query states
+#
+CREATE DATABASE IF NOT EXISTS auto_test_local;
+USE auto_test_local;
+CREATE TABLE t (i CHAR) ENGINE=SPIDER;
+INSERT INTO t VALUES (0);
+ERROR HY000: Unable to connect to foreign data source: localhost
+INSERT t SELECT 1 ON DUPLICATE KEY UPDATE c=1;
+ERROR 42S22: Unknown column 'c' in 'field list'
+DROP DATABASE IF EXISTS auto_test_local;
diff --git a/storage/spider/mysql-test/spider/bugfix/r/mdev_27240.result b/storage/spider/mysql-test/spider/bugfix/r/mdev_27240.result
new file mode 100644
index 00000000000..9dd247337ee
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/mdev_27240.result
@@ -0,0 +1,16 @@
+for master_1
+for child2
+for child3
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+CREATE TABLE tbl_a (a INT KEY) ENGINE=SPIDER;
+SELECT a.z FROM tbl_a AS a,tbl_a b WHERE a.z=b.z;
+ERROR 42S22: Unknown column 'a.z' in 'field list'
+ALTER TABLE tbl_a CHANGE c c INT;
+ERROR 42S22: Unknown column 'c' in 'tbl_a'
+LOCK TABLE tbl_a READ;
+ERROR HY000: Unable to connect to foreign data source: localhost
+DROP DATABASE auto_test_local;
+for master_1
+for child2
+for child3
diff --git a/storage/spider/mysql-test/spider/bugfix/r/self_reference.result b/storage/spider/mysql-test/spider/bugfix/r/self_reference.result
new file mode 100644
index 00000000000..1ce9c60a93f
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/self_reference.result
@@ -0,0 +1,36 @@
+for master_1
+for child2
+for child3
+connection master_1;
+set @old_spider_same_server_link= @@spider_same_server_link;
+set spider_same_server_link= ON;
+
+this test is for MDEV-6268
+
+drop and create databases
+connection master_1;
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+
+create table
+connection master_1;
+CREATE TABLE tbl_a (
+pkey int NOT NULL,
+PRIMARY KEY (pkey)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1
+
+select test 1
+connection master_1;
+SELECT pkey FROM tbl_a;
+ERROR HY000: An infinite loop is detected when opening table auto_test_local.tbl_a
+
+deinit
+connection master_1;
+DROP DATABASE IF EXISTS auto_test_local;
+connection master_1;
+set spider_same_server_link= @old_spider_same_server_link;
+for master_1
+for child2
+for child3
+
+end of test
diff --git a/storage/spider/mysql-test/spider/bugfix/r/slave_trx_isolation.result b/storage/spider/mysql-test/spider/bugfix/r/slave_trx_isolation.result
index d7a0c1044a7..d50da8a7613 100644
--- a/storage/spider/mysql-test/spider/bugfix/r/slave_trx_isolation.result
+++ b/storage/spider/mysql-test/spider/bugfix/r/slave_trx_isolation.result
@@ -48,7 +48,7 @@ SET SESSION sql_log_bin= 0;
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%set %';
 argument
-set session time_zone = '+00:00'
+set session time_zone = '+00:00';set @`spider_lc_./auto_test_remote/tbl_a` = '-xxxxxxxxxxxx-xxxxx-./auto_test_local/tbl_a-'
 SET NAMES utf8
 set session transaction isolation level read committed;set session autocommit = 1;set session wait_timeout = 604800;set session sql_mode = 'strict_trans_tables,error_for_division_by_zero,no_auto_create_user,no_engine_substitution';start transaction
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%set %'
diff --git a/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mariadb.result b/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mariadb.result
index 5048fbb423c..1bf6fbccbc9 100644
--- a/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mariadb.result
+++ b/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mariadb.result
@@ -50,7 +50,7 @@ pkey
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%sql_mode%';
 argument
-set session transaction isolation level repeatable read;set session autocommit = 1;set session sql_log_off = 0;set session wait_timeout = 604800;set session sql_mode = 'real_as_float,ignore_bad_table_options,no_unsigned_subtraction,no_dir_in_create,no_auto_value_on_zero,strict_trans_tables,strict_all_tables,no_zero_in_date,no_zero_date,allow_invalid_dates,error_for_division_by_zero,no_auto_create_user,high_not_precedence,no_engine_substitution,pad_char_to_full_length,empty_string_is_null,simultaneous_assignment,time_round_fractional';set session time_zone = '+00:00';start transaction
+set session transaction isolation level repeatable read;set session autocommit = 1;set session sql_log_off = 0;set session wait_timeout = 604800;set session sql_mode = 'real_as_float,ignore_bad_table_options,no_unsigned_subtraction,no_dir_in_create,no_auto_value_on_zero,strict_trans_tables,strict_all_tables,no_zero_in_date,no_zero_date,allow_invalid_dates,error_for_division_by_zero,no_auto_create_user,high_not_precedence,no_engine_substitution,pad_char_to_full_length,empty_string_is_null,simultaneous_assignment,time_round_fractional';set session time_zone = '+00:00';set @`spider_lc_./auto_test_remote/tbl_a` = '-xxxxxxxxxxxx-xxxxx-./auto_test_local/tbl_a-';start transaction
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%sql_mode%'
 SELECT pkey FROM tbl_a ORDER BY pkey;
 pkey
diff --git a/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mysql.result b/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mysql.result
index 08f9a6007aa..3ec96a66031 100644
--- a/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mysql.result
+++ b/storage/spider/mysql-test/spider/bugfix/r/sql_mode_mysql.result
@@ -50,7 +50,7 @@ pkey
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%sql_mode%';
 argument
-set session transaction isolation level repeatable read;set session autocommit = 1;set session sql_log_off = 0;set session wait_timeout = 604800;set session sql_mode = 'real_as_float,ignore_bad_table_options,no_unsigned_subtraction,no_dir_in_create,no_auto_value_on_zero,strict_trans_tables,strict_all_tables,no_zero_in_date,no_zero_date,allow_invalid_dates,error_for_division_by_zero,no_auto_create_user,high_not_precedence,no_engine_substitution,pad_char_to_full_length';set session time_zone = '+00:00';start transaction
+set session transaction isolation level repeatable read;set session autocommit = 1;set session sql_log_off = 0;set session wait_timeout = 604800;set session sql_mode = 'real_as_float,ignore_bad_table_options,no_unsigned_subtraction,no_dir_in_create,no_auto_value_on_zero,strict_trans_tables,strict_all_tables,no_zero_in_date,no_zero_date,allow_invalid_dates,error_for_division_by_zero,no_auto_create_user,high_not_precedence,no_engine_substitution,pad_char_to_full_length';set session time_zone = '+00:00';set @`spider_lc_./auto_test_remote/tbl_a` = '-xxxxxxxxxxxx-xxxxx-./auto_test_local/tbl_a-';start transaction
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%sql_mode%'
 SELECT pkey FROM tbl_a ORDER BY pkey;
 pkey
diff --git a/storage/spider/mysql-test/spider/bugfix/r/strict_group_by.result b/storage/spider/mysql-test/spider/bugfix/r/strict_group_by.result
new file mode 100644
index 00000000000..f2287dea65a
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/r/strict_group_by.result
@@ -0,0 +1,124 @@
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+connection master_1;
+set @old_spider_sync_sql_mode= @@spider_sync_sql_mode;
+set session spider_sync_sql_mode= FALSE;
+connection child2_1;
+set @old_sql_mode= @@sql_mode;
+set global sql_mode= 'ONLY_FULL_GROUP_BY';
+
+this test is for MDEV-18988
+
+drop and create databases
+connection master_1;
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+connection child2_1;
+SET @old_log_output = @@global.log_output;
+SET GLOBAL log_output = 'TABLE,FILE';
+CREATE DATABASE auto_test_remote;
+USE auto_test_remote;
+connection child2_2;
+SET @old_log_output = @@global.log_output;
+SET GLOBAL log_output = 'TABLE,FILE';
+CREATE DATABASE auto_test_remote2;
+USE auto_test_remote2;
+
+create table and insert
+connection child2_1;
+CHILD2_1_CREATE_TABLES
+TRUNCATE TABLE mysql.general_log;
+connection child2_2;
+CHILD2_2_CREATE_TABLES
+TRUNCATE TABLE mysql.general_log;
+connection master_1;
+CREATE TABLE tbl_a (
+pkey int NOT NULL,
+skey int NOT NULL,
+PRIMARY KEY (pkey),
+KEY idx1 (skey)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1
+INSERT INTO tbl_a (pkey,skey) VALUES (0,0),(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(9,9);
+INSERT INTO tbl_a (pkey,skey) VALUES (10,10),(11,11),(12,12),(13,13),(14,14),(15,15),(16,16),(17,17),(18,18),(19,19);
+INSERT INTO tbl_a (pkey,skey) VALUES (20,5),(21,6),(22,7),(23,8),(24,9),(25,10),(26,11),(27,12),(28,13),(29,14);
+
+select test 1
+connection child2_1;
+TRUNCATE TABLE mysql.general_log;
+connection child2_2;
+TRUNCATE TABLE mysql.general_log;
+connection master_1;
+FLUSH TABLES;
+SELECT count(pkey) cnt, skey FROM tbl_a;
+cnt	skey
+30	1
+connection child2_1;
+SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
+argument
+select count(`pkey`),min(`pkey`),min(`skey`) from `auto_test_remote`.`tbl_a`
+SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
+SELECT pkey, skey FROM tbl_a ORDER BY pkey;
+pkey	skey
+1	1
+3	3
+5	5
+7	7
+9	9
+11	11
+13	13
+15	15
+17	17
+19	19
+21	6
+23	8
+25	10
+27	12
+29	14
+connection child2_2;
+SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
+argument
+select count(`pkey`),min(`pkey`),min(`skey`) from `auto_test_remote2`.`tbl_a`
+SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
+SELECT pkey, skey FROM tbl_a ORDER BY pkey;
+pkey	skey
+0	0
+2	2
+4	4
+6	6
+8	8
+10	10
+12	12
+14	14
+16	16
+18	18
+20	5
+22	7
+24	9
+26	11
+28	13
+
+deinit
+connection master_1;
+DROP DATABASE IF EXISTS auto_test_local;
+connection child2_1;
+DROP DATABASE IF EXISTS auto_test_remote;
+SET GLOBAL log_output = @old_log_output;
+connection child2_2;
+DROP DATABASE IF EXISTS auto_test_remote2;
+SET GLOBAL log_output = @old_log_output;
+connection child2_1;
+set global sql_mode= @old_sql_mode;
+connection master_1;
+set session spider_sync_sql_mode= @old_spider_sync_sql_mode;
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+
+end of test
diff --git a/storage/spider/mysql-test/spider/bugfix/t/insert_select.cnf b/storage/spider/mysql-test/spider/bugfix/t/insert_select.cnf
new file mode 100644
index 00000000000..05dfd8a0bce
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/insert_select.cnf
@@ -0,0 +1,3 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
+!include ../my_2_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/insert_select.test b/storage/spider/mysql-test/spider/bugfix/t/insert_select.test
new file mode 100644
index 00000000000..381e72f7b1b
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/insert_select.test
@@ -0,0 +1,99 @@
+--source ../include/insert_select_init.inc
+--echo
+--echo this test is for MDEV-16399
+--echo
+--echo drop and create databases
+
+--connection master_1
+--disable_warnings
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+
+--connection child2_1
+SET @old_log_output = @@global.log_output;
+SET GLOBAL log_output = 'TABLE,FILE';
+CREATE DATABASE auto_test_remote;
+USE auto_test_remote;
+--enable_warnings
+
+--echo
+--echo create table and insert
+
+--connection child2_1
+--disable_query_log
+echo CHILD2_1_CREATE_TABLES;
+eval $CHILD2_1_CREATE_TABLES;
+echo CHILD2_1_CREATE_TABLES2;
+eval $CHILD2_1_CREATE_TABLES2;
+--enable_query_log
+TRUNCATE TABLE mysql.general_log;
+
+--connection master_1
+--disable_query_log
+echo CREATE TABLE tbl_a (
+    skey int NOT NULL,
+    dt date NOT NULL,
+    tm time NOT NULL,
+    KEY idx1 (skey,dt,tm)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1;
+eval CREATE TABLE tbl_a (
+    skey int NOT NULL,
+    dt date NOT NULL,
+    tm time NOT NULL,
+    KEY idx1 (skey,dt,tm)
+) $MASTER_1_ENGINE $MASTER_1_CHARSET $MASTER_1_COMMENT_2_1;
+echo CREATE TABLE tbl_b (
+    pkey int NOT NULL,
+    dt datetime NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (dt)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1_2;
+eval CREATE TABLE tbl_b (
+    pkey int NOT NULL,
+    dt datetime NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (dt)
+) $MASTER_1_ENGINE $MASTER_1_CHARSET $MASTER_1_COMMENT_2_1_2;
+--enable_query_log
+INSERT INTO tbl_a (skey, dt, tm) VALUES (0, '2012-01-01', '12:00:00'),(1, '2012-02-01', '12:00:00'),(2, '2012-03-01', '12:00:00'),(3, '2012-04-01', '12:00:00'),(4, '2012-05-01', '12:00:00'),(5, '2012-06-01', '12:00:00'),(6, '2012-07-01', '12:00:00'),(7, '2012-08-01', '12:00:00'),(8, '2012-09-01', '12:00:00'),(9, '2012-10-01', '12:00:00');
+INSERT INTO tbl_a (skey, dt, tm) VALUES (0, '2013-01-01', '13:00:00'),(1, '2013-02-01', '13:00:00'),(2, '2013-03-01', '13:00:00'),(3, '2013-04-01', '13:00:00'),(4, '2013-05-01', '13:00:00'),(5, '2013-06-01', '13:00:00'),(6, '2013-07-01', '13:00:00'),(7, '2013-08-01', '13:00:00'),(8, '2013-09-01', '13:00:00'),(9, '2013-10-01', '13:00:00');
+INSERT INTO tbl_a (skey, dt, tm) VALUES (0, '2012-11-01', '11:00:00'),(1, '2012-12-01', '11:00:00'),(2, '2012-11-30', '11:00:00'),(3, '2012-11-29', '11:00:00'),(4, '2012-11-28', '11:00:00'),(5, '2012-11-27', '11:00:00'),(6, '2012-11-26', '11:00:00'),(7, '2012-11-25', '11:00:00'),(8, '2012-11-24', '11:00:00'),(9, '2012-11-23', '11:00:00');
+FLUSH TABLES;
+
+--echo
+--echo select test 1
+
+--connection child2_1
+TRUNCATE TABLE mysql.general_log;
+
+--connection master_1
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 4 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 3 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 2 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 1 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 0 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 9 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 8 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 7 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 6 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+INSERT IGNORE INTO tbl_b (SELECT skey, CAST(CONCAT(dt, ' ', tm) AS datetime) FROM tbl_a WHERE skey = 5 AND dt > DATE_ADD('2012-12-01', INTERVAL -10 DAY));
+
+--connection child2_1
+eval $CHILD2_1_SELECT_ARGUMENT1;
+eval $CHILD2_1_SELECT_TABLES;
+
+--echo
+--echo deinit
+--disable_warnings
+
+--connection master_1
+DROP DATABASE IF EXISTS auto_test_local;
+
+--connection child2_1
+DROP DATABASE IF EXISTS auto_test_remote;
+SET GLOBAL log_output = @old_log_output;
+
+--enable_warnings
+--source ../include/insert_select_deinit.inc
+--echo
+--echo end of test
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_22246.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_22246.test
index 63b04c14e11..9e58bc1a836 100644
--- a/storage/spider/mysql-test/spider/bugfix/t/mdev_22246.test
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_22246.test
@@ -64,7 +64,7 @@ TRUNCATE TABLE mysql.general_log;
 
 --connection master_1
 SELECT * FROM tbl_a;
-SELECT * FROM tbl_a WHERE id != 0;
+SELECT * FROM tbl_a WHERE id <0 || id >0;
 
 --connection child2_1
 eval $CHILD2_1_SELECT_TABLES;
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_25684.cnf b/storage/spider/mysql-test/spider/bugfix/t/mdev_25684.cnf
new file mode 100644
index 00000000000..05dfd8a0bce
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_25684.cnf
@@ -0,0 +1,3 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
+!include ../my_2_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_25684.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_25684.test
new file mode 100644
index 00000000000..243031e52dd
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_25684.test
@@ -0,0 +1,19 @@
+--disable_query_log
+--disable_result_log
+--source ../../t/test_init.inc
+--enable_result_log
+--enable_query_log
+
+--echo
+--echo MDEV-25684 Crash in THD::find_temporary_table while calling spider_direct_sql UDF without temporary table created
+--echo
+
+--connection master_1
+--error 12703
+SELECT SPIDER_DIRECT_SQL('SELECT * FROM s', 'non_existing_temporary_table', 'srv "s_2_1"');
+
+--disable_query_log
+--disable_result_log
+--source ../../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_26158.cnf b/storage/spider/mysql-test/spider/bugfix/t/mdev_26158.cnf
new file mode 100644
index 00000000000..05dfd8a0bce
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_26158.cnf
@@ -0,0 +1,3 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
+!include ../my_2_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_26158.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_26158.test
new file mode 100644
index 00000000000..0484d2b6652
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_26158.test
@@ -0,0 +1,31 @@
+--echo #
+--echo # MDEV-26158 SIGSEGV in spider_free_mem from ha_spider::open on INSERT
+--echo #
+
+--disable_query_log
+--disable_result_log
+--source ../../t/test_init.inc
+--enable_result_log
+--enable_query_log
+
+--connection master_1
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+
+eval CREATE TABLE t (
+    c INT
+) $MASTER_1_ENGINE $MASTER_1_CHARSET COMMENT='table "tbl_a"'
+PARTITION BY LIST COLUMNS(`c`) (
+    PARTITION `pt1` DEFAULT COMMENT = 'srv "s_2_1"'
+);
+
+--error ER_BAD_DB_ERROR
+INSERT INTO t SELECT * FROM t;
+
+DROP DATABASE auto_test_local;
+
+--disable_query_log
+--disable_result_log
+--source ../../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_26539.cnf b/storage/spider/mysql-test/spider/bugfix/t/mdev_26539.cnf
new file mode 100644
index 00000000000..05dfd8a0bce
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_26539.cnf
@@ -0,0 +1,3 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
+!include ../my_2_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_26539.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_26539.test
new file mode 100644
index 00000000000..f2561f8c9a5
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_26539.test
@@ -0,0 +1,40 @@
+--disable_query_log
+--disable_result_log
+--source ../../t/test_init.inc
+--enable_result_log
+--enable_query_log
+
+--echo #
+--echo # MDEV-26539 SIGSEGV in spider_check_and_set_trx_isolation and I_P_List_iterator from THD::drop_temporary_table (10.5.3 opt only) on ALTER
+--echo #
+
+--connection child2_1
+CREATE DATABASE auto_test_remote;
+USE auto_test_remote;
+eval CREATE TABLE tbl_a (
+    c INT
+) $CHILD2_1_ENGINE $CHILD2_1_CHARSET;
+
+--connection master_1
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+
+eval CREATE TABLE tbl_a (
+    c INT
+) $MASTER_1_ENGINE $MASTER_1_CHARSET COMMENT='table "tbl_a"' PARTITION BY LIST COLUMNS (c) (
+    PARTITION pt1 DEFAULT COMMENT = 'srv "s_2_1"'
+);
+
+INSERT INTO tbl_a VALUES (1);
+ALTER TABLE tbl_a CHECK PARTITION ALL;
+
+DROP DATABASE auto_test_local;
+
+--connection child2_1
+DROP DATABASE auto_test_remote;
+
+--disable_query_log
+--disable_result_log
+--source ../../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_26582.cnf b/storage/spider/mysql-test/spider/bugfix/t/mdev_26582.cnf
new file mode 100644
index 00000000000..b0853e32654
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_26582.cnf
@@ -0,0 +1,2 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_26582.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_26582.test
new file mode 100644
index 00000000000..714a5e1c853
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_26582.test
@@ -0,0 +1,27 @@
+--echo #
+--echo # MDEV-26582 SIGSEGV in spider_db_bulk_insert and spider_db_connect and spider_db_before_query, and hang in "End of update loop" / "Reset for next command" query states
+--echo #
+
+# NOTE: The bug does not reproduce if we import ../../t/test_init.inc instead.
+--disable_query_log
+--disable_result_log
+--source ../../include/init_spider.inc
+--enable_result_log
+--enable_query_log
+
+CREATE DATABASE IF NOT EXISTS auto_test_local;
+USE auto_test_local;
+
+CREATE TABLE t (i CHAR) ENGINE=SPIDER;
+--error ER_CONNECT_TO_FOREIGN_DATA_SOURCE
+INSERT INTO t VALUES (0);
+--error ER_BAD_FIELD_ERROR
+INSERT t SELECT 1 ON DUPLICATE KEY UPDATE c=1;
+
+DROP DATABASE IF EXISTS auto_test_local;
+
+--disable_query_log
+--disable_result_log
+--source ../../include/deinit_spider.inc
+--enable_result_log
+--enable_query_log
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_27240.cnf b/storage/spider/mysql-test/spider/bugfix/t/mdev_27240.cnf
new file mode 100644
index 00000000000..b0853e32654
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_27240.cnf
@@ -0,0 +1,2 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_27240.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_27240.test
new file mode 100644
index 00000000000..552ce3aa6de
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_27240.test
@@ -0,0 +1,28 @@
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../../t/test_init.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
+
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+
+CREATE TABLE tbl_a (a INT KEY) ENGINE=SPIDER;
+--error ER_BAD_FIELD_ERROR
+SELECT a.z FROM tbl_a AS a,tbl_a b WHERE a.z=b.z;
+--error ER_BAD_FIELD_ERROR
+ALTER TABLE tbl_a CHANGE c c INT;
+--error ER_CONNECT_TO_FOREIGN_DATA_SOURCE
+LOCK TABLE tbl_a READ;
+
+DROP DATABASE auto_test_local;
+
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source ../../t/test_deinit.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
diff --git a/storage/spider/mysql-test/spider/bugfix/t/self_reference.cnf b/storage/spider/mysql-test/spider/bugfix/t/self_reference.cnf
new file mode 100644
index 00000000000..b0853e32654
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/self_reference.cnf
@@ -0,0 +1,2 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/self_reference.test b/storage/spider/mysql-test/spider/bugfix/t/self_reference.test
new file mode 100644
index 00000000000..b93cf8ef715
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/self_reference.test
@@ -0,0 +1,45 @@
+--source ../include/self_reference_init.inc
+--echo
+--echo this test is for MDEV-6268
+--echo
+--echo drop and create databases
+
+--connection master_1
+--disable_warnings
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+--enable_warnings
+
+--echo
+--echo create table
+
+--connection master_1
+--disable_query_log
+echo CREATE TABLE tbl_a (
+    pkey int NOT NULL,
+    PRIMARY KEY (pkey)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1;
+eval CREATE TABLE tbl_a (
+    pkey int NOT NULL,
+    PRIMARY KEY (pkey)
+) $MASTER_1_ENGINE $MASTER_1_CHARSET $MASTER_1_COMMENT_2_1;
+--enable_query_log
+
+--echo
+--echo select test 1
+
+--connection master_1
+--error 12719
+SELECT pkey FROM tbl_a;
+
+--echo
+--echo deinit
+--disable_warnings
+
+--connection master_1
+DROP DATABASE IF EXISTS auto_test_local;
+
+--enable_warnings
+--source ../include/self_reference_deinit.inc
+--echo
+--echo end of test
diff --git a/storage/spider/mysql-test/spider/bugfix/t/slave_trx_isolation.test b/storage/spider/mysql-test/spider/bugfix/t/slave_trx_isolation.test
index 652fbb1c11c..c608ae018ea 100644
--- a/storage/spider/mysql-test/spider/bugfix/t/slave_trx_isolation.test
+++ b/storage/spider/mysql-test/spider/bugfix/t/slave_trx_isolation.test
@@ -70,6 +70,7 @@ sync_with_master;
 SET SESSION sql_log_bin= 0;
 
 --connection child2_1
+--replace_regex /-[0-9a-f]{12}-[0-9a-f]+-/-xxxxxxxxxxxx-xxxxx-/
 eval $CHILD2_1_SELECT_ARGUMENT1;
 eval $CHILD2_1_SELECT_TABLES;
 
diff --git a/storage/spider/mysql-test/spider/bugfix/t/sql_mode.inc b/storage/spider/mysql-test/spider/bugfix/t/sql_mode.inc
index ae7c15c5081..f094b1f80a3 100644
--- a/storage/spider/mysql-test/spider/bugfix/t/sql_mode.inc
+++ b/storage/spider/mysql-test/spider/bugfix/t/sql_mode.inc
@@ -48,6 +48,7 @@ TRUNCATE TABLE mysql.general_log;
 SELECT * FROM tbl_a ORDER BY pkey;
 
 --connection child2_1
+--replace_regex /-[0-9a-f]{12}-[0-9a-f]+-/-xxxxxxxxxxxx-xxxxx-/
 eval $CHILD2_1_SELECT_ARGUMENT1;
 eval $CHILD2_1_SELECT_TABLES;
 
diff --git a/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.cnf b/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.cnf
new file mode 100644
index 00000000000..e0ffb99c38e
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.cnf
@@ -0,0 +1,4 @@
+!include include/default_mysqld.cnf
+!include ../my_1_1.cnf
+!include ../my_2_1.cnf
+!include ../my_2_2.cnf
diff --git a/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.test b/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.test
new file mode 100644
index 00000000000..8b8da46d56c
--- /dev/null
+++ b/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.test
@@ -0,0 +1,98 @@
+--source ../include/strict_group_by_init.inc
+--echo
+--echo this test is for MDEV-18988
+--echo
+--echo drop and create databases
+--connection master_1
+--disable_warnings
+CREATE DATABASE auto_test_local;
+USE auto_test_local;
+
+--connection child2_1
+SET @old_log_output = @@global.log_output;
+SET GLOBAL log_output = 'TABLE,FILE';
+CREATE DATABASE auto_test_remote;
+USE auto_test_remote;
+
+--connection child2_2
+SET @old_log_output = @@global.log_output;
+SET GLOBAL log_output = 'TABLE,FILE';
+CREATE DATABASE auto_test_remote2;
+USE auto_test_remote2;
+--enable_warnings
+
+--echo
+--echo create table and insert
+
+--connection child2_1
+--disable_query_log
+echo CHILD2_1_CREATE_TABLES;
+eval $CHILD2_1_CREATE_TABLES;
+--enable_query_log
+TRUNCATE TABLE mysql.general_log;
+
+--connection child2_2
+--disable_query_log
+echo CHILD2_2_CREATE_TABLES;
+eval $CHILD2_2_CREATE_TABLES;
+--enable_query_log
+TRUNCATE TABLE mysql.general_log;
+
+--connection master_1
+--disable_query_log
+echo CREATE TABLE tbl_a (
+    pkey int NOT NULL,
+    skey int NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (skey)
+) MASTER_1_ENGINE MASTER_1_CHARSET MASTER_1_COMMENT_2_1;
+eval CREATE TABLE tbl_a (
+    pkey int NOT NULL,
+    skey int NOT NULL,
+    PRIMARY KEY (pkey),
+    KEY idx1 (skey)
+) $MASTER_1_ENGINE $MASTER_1_CHARSET $MASTER_1_COMMENT_2_1;
+--enable_query_log
+INSERT INTO tbl_a (pkey,skey) VALUES (0,0),(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(9,9);
+INSERT INTO tbl_a (pkey,skey) VALUES (10,10),(11,11),(12,12),(13,13),(14,14),(15,15),(16,16),(17,17),(18,18),(19,19);
+INSERT INTO tbl_a (pkey,skey) VALUES (20,5),(21,6),(22,7),(23,8),(24,9),(25,10),(26,11),(27,12),(28,13),(29,14);
+
+--echo
+--echo select test 1
+
+--connection child2_1
+TRUNCATE TABLE mysql.general_log;
+
+--connection child2_2
+TRUNCATE TABLE mysql.general_log;
+
+--connection master_1
+FLUSH TABLES;
+SELECT count(pkey) cnt, skey FROM tbl_a;
+
+--connection child2_1
+eval $CHILD2_1_SELECT_ARGUMENT1;
+eval $CHILD2_1_SELECT_TABLES;
+
+--connection child2_2
+eval $CHILD2_2_SELECT_ARGUMENT1;
+eval $CHILD2_2_SELECT_TABLES;
+
+--echo
+--echo deinit
+--disable_warnings
+--connection master_1
+DROP DATABASE IF EXISTS auto_test_local;
+
+--connection child2_1
+DROP DATABASE IF EXISTS auto_test_remote;
+SET GLOBAL log_output = @old_log_output;
+
+--connection child2_2
+DROP DATABASE IF EXISTS auto_test_remote2;
+SET GLOBAL log_output = @old_log_output;
+
+--enable_warnings
+--source ../include/strict_group_by_deinit.inc
+--echo
+--echo end of test
diff --git a/storage/spider/mysql-test/spider/include/deinit_spider.inc b/storage/spider/mysql-test/spider/include/deinit_spider.inc
index 51cc075edaa..dd474c59bc7 100644
--- a/storage/spider/mysql-test/spider/include/deinit_spider.inc
+++ b/storage/spider/mysql-test/spider/include/deinit_spider.inc
@@ -33,6 +33,7 @@ DROP FUNCTION spider_copy_tables;
 DROP FUNCTION spider_ping_table;
 DROP FUNCTION spider_bg_direct_sql;
 DROP FUNCTION spider_direct_sql;
+UNINSTALL PLUGIN spider_wrapper_protocols;
 UNINSTALL PLUGIN spider_alloc_mem;
 UNINSTALL PLUGIN spider;
 DROP TABLE IF EXISTS mysql.spider_xa;
@@ -46,6 +47,10 @@ DROP TABLE IF EXISTS mysql.spider_table_sts;
 DROP TABLE IF EXISTS mysql.spider_table_crd;
 if ($VERSION_COMPILE_OS_WIN)
 {
+  if ($MASTER_1_MYPORT)
+  {
+    DROP SERVER s_1;
+  }
   if ($CHILD2_1_MYPORT)
   {
     DROP SERVER s_2_1;
@@ -73,6 +78,10 @@ if ($VERSION_COMPILE_OS_WIN)
 }
 if (!$VERSION_COMPILE_OS_WIN)
 {
+  if ($MASTER_1_MYSOCK)
+  {
+    DROP SERVER s_1;
+  }
   if ($CHILD2_1_MYSOCK)
   {
     DROP SERVER s_2_1;
diff --git a/storage/spider/mysql-test/spider/include/init_spider.inc b/storage/spider/mysql-test/spider/include/init_spider.inc
index 69d1fae425e..6a22ffcc1fd 100644
--- a/storage/spider/mysql-test/spider/include/init_spider.inc
+++ b/storage/spider/mysql-test/spider/include/init_spider.inc
@@ -3,6 +3,16 @@ let $VERSION_COMPILE_OS_WIN=
 if ($VERSION_COMPILE_OS_WIN)
 {
   INSTALL PLUGIN spider SONAME 'ha_spider.dll';
+  if ($MASTER_1_MYPORT)
+  {
+    eval CREATE SERVER s_1 FOREIGN DATA WRAPPER mysql OPTIONS (
+      HOST 'localhost',
+      DATABASE 'auto_test_local',
+      USER 'root',
+      PASSWORD '',
+      PORT $MASTER_1_MYPORT
+    );
+  }
   if ($CHILD2_1_MYPORT)
   {
     eval CREATE SERVER s_2_1 FOREIGN DATA WRAPPER mysql OPTIONS (
@@ -67,6 +77,16 @@ if ($VERSION_COMPILE_OS_WIN)
 if (!$VERSION_COMPILE_OS_WIN)
 {
   INSTALL PLUGIN spider SONAME 'ha_spider.so';
+  if ($MASTER_1_MYSOCK)
+  {
+    eval CREATE SERVER s_1 FOREIGN DATA WRAPPER mysql OPTIONS (
+      HOST 'localhost',
+      DATABASE 'auto_test_local',
+      USER 'root',
+      PASSWORD '',
+      SOCKET '$MASTER_1_MYSOCK'
+    );
+  }
   if ($CHILD2_1_MYSOCK)
   {
     eval CREATE SERVER s_2_1 FOREIGN DATA WRAPPER mysql OPTIONS (
@@ -131,6 +151,11 @@ if (!$VERSION_COMPILE_OS_WIN)
 
 let $SERVER_NAME=
   `SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(version(), '-', 2), '-', -1)`;
+if (`SELECT IF('$SERVER_NAME' REGEXP '^[0-9]+\$', 1, 0)`)
+{
+  let $SERVER_NAME=
+    `SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(version(), '-', 3), '-', -1)`;
+}
 let $SERVER_MAJOR_VERSION=
   `SELECT SUBSTRING_INDEX(version(), '.', 1)`;
 let $SERVER_MINOR_VERSION=
diff --git a/storage/spider/mysql-test/spider/r/direct_left_join_nullable.result b/storage/spider/mysql-test/spider/r/direct_left_join_nullable.result
index e646bc9bf38..194a6b31cba 100644
--- a/storage/spider/mysql-test/spider/r/direct_left_join_nullable.result
+++ b/storage/spider/mysql-test/spider/r/direct_left_join_nullable.result
@@ -87,7 +87,7 @@ a	b	c	a
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
 argument
-select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t0 left join `auto_test_remote`.`ta_r_auto_inc` t1 on ((t1.`a` = t0.`a`) and (t0.`a` is not null)) left join `auto_test_remote`.`ta_r_3` t2 on (t2.`c` = t1.`c`) left join `auto_test_remote`.`ta_r` t3 on ((t3.`b` = t2.`b`) and (t2.`b` is not null)) where 1 order by t0.`a` desc
+select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t0 left join `auto_test_remote`.`ta_r_auto_inc` t1 on (t1.`a` = t0.`a`) left join `auto_test_remote`.`ta_r_3` t2 on (t2.`c` = t1.`c`) left join `auto_test_remote`.`ta_r` t3 on (t3.`b` = t2.`b`) where 1 order by t0.`a` desc
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
 SELECT a, b, date_format(c, '%Y-%m-%d %H:%i:%s') FROM ta_r ORDER BY a;
 a	b	date_format(c, '%Y-%m-%d %H:%i:%s')
diff --git a/storage/spider/mysql-test/spider/r/direct_left_right_join_nullable.result b/storage/spider/mysql-test/spider/r/direct_left_right_join_nullable.result
index f3c6e189444..e6720c1113f 100644
--- a/storage/spider/mysql-test/spider/r/direct_left_right_join_nullable.result
+++ b/storage/spider/mysql-test/spider/r/direct_left_right_join_nullable.result
@@ -87,7 +87,7 @@ NULL	NULL	NULL	3
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
 argument
-select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t3 left join (`auto_test_remote`.`ta_r_auto_inc` t2 join `auto_test_remote`.`ta_r_3` t1 join `auto_test_remote`.`ta_r` t0) on ((t2.`b` = t3.`b`) and (t2.`c` = t1.`c`) and (t0.`a` = t1.`a`) and (t1.`a` is not null) and (t3.`b` is not null)) where 1 order by t3.`a` desc
+select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t3 left join (`auto_test_remote`.`ta_r_auto_inc` t2 join `auto_test_remote`.`ta_r_3` t1 join `auto_test_remote`.`ta_r` t0) on ((t2.`b` = t3.`b`) and (t2.`c` = t1.`c`) and (t0.`a` = t1.`a`) and (t1.`a` is not null)) where 1 order by t3.`a` desc
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
 SELECT a, b, date_format(c, '%Y-%m-%d %H:%i:%s') FROM ta_r ORDER BY a;
 a	b	date_format(c, '%Y-%m-%d %H:%i:%s')
diff --git a/storage/spider/mysql-test/spider/r/direct_right_join_nullable.result b/storage/spider/mysql-test/spider/r/direct_right_join_nullable.result
index a05781cb6d6..a0b44c95cee 100644
--- a/storage/spider/mysql-test/spider/r/direct_right_join_nullable.result
+++ b/storage/spider/mysql-test/spider/r/direct_right_join_nullable.result
@@ -87,7 +87,7 @@ NULL	c	2000-01-03 00:00:00	3
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
 argument
-select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t3 left join `auto_test_remote`.`ta_r_auto_inc` t2 on ((t2.`b` = t3.`b`) and (t3.`b` is not null)) left join `auto_test_remote`.`ta_r_3` t1 on (t1.`c` = t2.`c`) left join `auto_test_remote`.`ta_r` t0 on ((t0.`a` = t1.`a`) and (t1.`a` is not null)) where 1 order by t3.`a` desc
+select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t3 left join `auto_test_remote`.`ta_r_auto_inc` t2 on (t2.`b` = t3.`b`) left join `auto_test_remote`.`ta_r_3` t1 on (t1.`c` = t2.`c`) left join `auto_test_remote`.`ta_r` t0 on ((t0.`a` = t1.`a`) and (t1.`a` is not null)) where 1 order by t3.`a` desc
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
 SELECT a, b, date_format(c, '%Y-%m-%d %H:%i:%s') FROM ta_r ORDER BY a;
 a	b	date_format(c, '%Y-%m-%d %H:%i:%s')
diff --git a/storage/spider/mysql-test/spider/r/direct_right_left_right_join_nullable.result b/storage/spider/mysql-test/spider/r/direct_right_left_right_join_nullable.result
index 48cd9c2c75f..0ee74cae7a3 100644
--- a/storage/spider/mysql-test/spider/r/direct_right_left_right_join_nullable.result
+++ b/storage/spider/mysql-test/spider/r/direct_right_left_right_join_nullable.result
@@ -87,7 +87,7 @@ NULL	c	2000-01-03 00:00:00	3
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %';
 argument
-select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t3 left join (`auto_test_remote`.`ta_r_auto_inc` t2 join `auto_test_remote`.`ta_r_3` t1 left join `auto_test_remote`.`ta_r` t0 on ((t0.`a` = t1.`a`) and (t1.`a` is not null))) on ((t2.`b` = t3.`b`) and (t2.`c` = t1.`c`) and (t3.`b` is not null)) where 1 order by t3.`a` desc
+select t0.`a` `a`,t2.`b` `b`,t2.`c` `c`,t3.`a` `a` from `auto_test_remote`.`ta_r_no_idx` t3 left join (`auto_test_remote`.`ta_r_auto_inc` t2 join `auto_test_remote`.`ta_r_3` t1 left join `auto_test_remote`.`ta_r` t0 on ((t0.`a` = t1.`a`) and (t1.`a` is not null))) on ((t2.`b` = t3.`b`) and (t2.`c` = t1.`c`)) where 1 order by t3.`a` desc
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%select %'
 SELECT a, b, date_format(c, '%Y-%m-%d %H:%i:%s') FROM ta_r ORDER BY a;
 a	b	date_format(c, '%Y-%m-%d %H:%i:%s')
diff --git a/storage/spider/mysql-test/spider/r/slave_trx_isolation.result b/storage/spider/mysql-test/spider/r/slave_trx_isolation.result
index e68b4a2c82d..a9b88d50628 100644
--- a/storage/spider/mysql-test/spider/r/slave_trx_isolation.result
+++ b/storage/spider/mysql-test/spider/r/slave_trx_isolation.result
@@ -51,7 +51,7 @@ SET SESSION sql_log_bin= 0;
 connection child2_1;
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%set %';
 argument
-set session time_zone = '+00:00'
+set session time_zone = '+00:00';set @`spider_lc_./auto_test_remote/tbl_a` = '-xxxxxxxxxxxx-xxxxx-./auto_test_local/tbl_a-'
 SET NAMES utf8
 set session transaction isolation level read committed;set session autocommit = 1;set session wait_timeout = 604800;set session sql_mode = 'strict_trans_tables,error_for_division_by_zero,no_auto_create_user,no_engine_substitution';start transaction
 SELECT argument FROM mysql.general_log WHERE argument LIKE '%set %'
diff --git a/storage/spider/mysql-test/spider/r/spider_wrapper_protocols.result b/storage/spider/mysql-test/spider/r/spider_wrapper_protocols.result
new file mode 100644
index 00000000000..f1a7b6208ac
--- /dev/null
+++ b/storage/spider/mysql-test/spider/r/spider_wrapper_protocols.result
@@ -0,0 +1,22 @@
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+child3_1
+child3_2
+child3_3
+connection master_1;
+SELECT * FROM information_schema.spider_wrapper_protocols;
+for master_1
+for child2
+child2_1
+child2_2
+child2_3
+for child3
+child3_1
+child3_2
+child3_3
+
+end of test
diff --git a/storage/spider/mysql-test/spider/t/slave_trx_isolation.test b/storage/spider/mysql-test/spider/t/slave_trx_isolation.test
index 507e5340779..f7f41358f51 100644
--- a/storage/spider/mysql-test/spider/t/slave_trx_isolation.test
+++ b/storage/spider/mysql-test/spider/t/slave_trx_isolation.test
@@ -108,6 +108,7 @@ if ($USE_CHILD_GROUP2)
   --connection child2_1
   if ($USE_GENERAL_LOG)
   {
+    --replace_regex /-[0-9a-f]{12}-[0-9a-f]+-/-xxxxxxxxxxxx-xxxxx-/
     eval $CHILD2_1_SELECT_ARGUMENT1;
   }
   eval $CHILD2_1_SELECT_TABLES;
diff --git a/storage/spider/mysql-test/spider/t/spider_wrapper_protocols.test b/storage/spider/mysql-test/spider/t/spider_wrapper_protocols.test
new file mode 100644
index 00000000000..d0753cf73c4
--- /dev/null
+++ b/storage/spider/mysql-test/spider/t/spider_wrapper_protocols.test
@@ -0,0 +1,16 @@
+--disable_warnings
+--disable_query_log
+--disable_result_log
+--source test_init.inc
+--enable_query_log
+
+--connection master_1
+SELECT * FROM information_schema.spider_wrapper_protocols;
+
+--disable_query_log
+--source test_deinit.inc
+--enable_result_log
+--enable_query_log
+--enable_warnings
+--echo
+--echo end of test
diff --git a/storage/spider/scripts/install_spider.sql b/storage/spider/scripts/install_spider.sql
deleted file mode 100644
index 403bd99fd68..00000000000
--- a/storage/spider/scripts/install_spider.sql
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (C) 2010-2019 Kentoku Shiba
-# 
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; version 2 of the License.
-# 
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-# 
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1335  USA
-
-drop procedure if exists mysql.spider_plugin_installer;
-delimiter //
-create procedure mysql.spider_plugin_installer()
-begin
-  set @win_plugin := IF(@@version_compile_os like 'Win%', 1, 0);
-  set @have_spider_i_s_plugin := 0;
-  select @have_spider_i_s_plugin := 1 from INFORMATION_SCHEMA.plugins where PLUGIN_NAME = 'SPIDER';
-  set @have_spider_plugin := 0;
-  select @have_spider_plugin := 1 from mysql.plugin where name = 'spider';
-  if @have_spider_i_s_plugin = 0 then
-    if @have_spider_plugin = 1 then
-      -- spider plugin is present in mysql.plugin but not in
-      -- information_schema.plugins.  Remove spider plugin entry
-      -- in mysql.plugin first.
-      delete from mysql.plugin where name = 'spider';
-    end if;
-    -- Install spider plugin
-    if @win_plugin = 0 then 
-      install plugin spider soname 'ha_spider.so';
-    else
-      install plugin spider soname 'ha_spider.dll';
-    end if;
-  end if;
-end;//
-delimiter ;
-call mysql.spider_plugin_installer;
-drop procedure mysql.spider_plugin_installer;
diff --git a/storage/spider/spd_conn.cc b/storage/spider/spd_conn.cc
index 1e973493ba8..4b7ece6f265 100644
--- a/storage/spider/spd_conn.cc
+++ b/storage/spider/spd_conn.cc
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008-2019 Kentoku Shiba
+/* Copyright (C) 2008-2020 Kentoku Shiba
    Copyright (C) 2019, 2020, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -26,6 +26,7 @@
 #include "probes_mysql.h"
 #include "sql_class.h"
 #include "sql_partition.h"
+#include "sql_table.h"
 #include "tztime.h"
 #endif
 #include "spd_err.h"
@@ -56,6 +57,8 @@ inline void SPIDER_set_next_thread_id(THD *A)
 
 extern handlerton *spider_hton_ptr;
 extern SPIDER_DBTON spider_dbton[SPIDER_DBTON_SIZE];
+extern struct charset_info_st *spd_charset_utf8mb3_bin;
+extern LEX_CSTRING spider_unique_id;
 pthread_mutex_t spider_conn_id_mutex;
 pthread_mutex_t spider_ipport_conn_mutex;
 ulonglong spider_conn_id = 1;
@@ -66,6 +69,7 @@ extern pthread_attr_t spider_pt_attr;
 #ifdef HAVE_PSI_INTERFACE
 extern PSI_mutex_key spd_key_mutex_mta_conn;
 extern PSI_mutex_key spd_key_mutex_conn_i;
+extern PSI_mutex_key spd_key_mutex_conn_loop_check;
 extern PSI_cond_key spd_key_cond_conn_i;
 #ifndef WITHOUT_SPIDER_BG_SEARCH
 extern PSI_mutex_key spd_key_mutex_bg_conn_chain;
@@ -145,6 +149,102 @@ uchar *spider_ipport_conn_get_key(
   DBUG_RETURN((uchar*) ip_port->key);
 }
 
+static uchar *spider_loop_check_full_get_key(  /* get-key callback used for conn->loop_checked */
+  SPIDER_CONN_LOOP_CHECK *ptr,  /* hash element whose key is requested */
+  size_t *length,  /* out: receives the key length */
+  my_bool not_used __attribute__ ((unused))
+) {
+  DBUG_ENTER("spider_loop_check_full_get_key");
+  *length = ptr->full_name.length;  /* the key is the element's full_name */
+  DBUG_RETURN((uchar*) ptr->full_name.str);
+}
+
+static uchar *spider_loop_check_to_get_key(  /* get-key callback used for conn->loop_check_queue */
+  SPIDER_CONN_LOOP_CHECK *ptr,  /* hash element whose key is requested */
+  size_t *length,  /* out: receives the key length */
+  my_bool not_used __attribute__ ((unused))
+) {
+  DBUG_ENTER("spider_loop_check_to_get_key");
+  *length = ptr->to_name.length;  /* the key is the element's to_name */
+  DBUG_RETURN((uchar*) ptr->to_name.str);
+}
+
+int spider_conn_init(  /* set up conn's loop-check mutex and its two loop-check hashes */
+  SPIDER_CONN *conn
+) {  /* returns 0 on success, HA_ERR_OUT_OF_MEM on any failure */
+  int error_num = HA_ERR_OUT_OF_MEM;  /* every failure path below returns this value */
+  DBUG_ENTER("spider_conn_init");
+#if MYSQL_VERSION_ID < 50500
+  if (pthread_mutex_init(&conn->loop_check_mutex, MY_MUTEX_INIT_FAST))
+#else
+  if (mysql_mutex_init(spd_key_mutex_conn_loop_check, &conn->loop_check_mutex,
+    MY_MUTEX_INIT_FAST))
+#endif
+  {
+    goto error_loop_check_mutex_init;
+  }
+  if (  /* hash keyed by full_name (see spider_loop_check_full_get_key) */
+    my_hash_init(PSI_INSTRUMENT_ME, &conn->loop_checked, spd_charset_utf8mb3_bin, 32, 0, 0,
+                   (my_hash_get_key) spider_loop_check_full_get_key, 0, 0)
+  ) {
+    goto error_loop_checked_hash_init;
+  }
+  spider_alloc_calc_mem_init(conn->loop_checked, 268);  /* NOTE(review): presumably sets conn->loop_checked_id -- confirm in macro */
+  spider_alloc_calc_mem(spider_current_trx,
+    conn->loop_checked,
+    conn->loop_checked.array.max_element *
+    conn->loop_checked.array.size_of_element);
+  if (  /* hash keyed by to_name (see spider_loop_check_to_get_key) */
+    my_hash_init(PSI_INSTRUMENT_ME, &conn->loop_check_queue, spd_charset_utf8mb3_bin, 32, 0, 0,
+                   (my_hash_get_key) spider_loop_check_to_get_key, 0, 0)
+  ) {
+    goto error_loop_check_queue_hash_init;
+  }
+  spider_alloc_calc_mem_init(conn->loop_check_queue, 269);  /* NOTE(review): presumably sets conn->loop_check_queue_id -- confirm in macro */
+  spider_alloc_calc_mem(spider_current_trx,
+    conn->loop_check_queue,
+    conn->loop_check_queue.array.max_element *
+    conn->loop_check_queue.array.size_of_element);
+  DBUG_RETURN(0);
+
+error_loop_check_queue_hash_init:  /* second hash failed: undo the first hash and its accounting */
+  spider_free_mem_calc(spider_current_trx,
+    conn->loop_checked_id,
+    conn->loop_checked.array.max_element *
+    conn->loop_checked.array.size_of_element);
+  my_hash_free(&conn->loop_checked);
+error_loop_checked_hash_init:  /* first hash failed: only the mutex needs undoing */
+  pthread_mutex_destroy(&conn->loop_check_mutex);
+error_loop_check_mutex_init:  /* mutex init failed: nothing to undo */
+  DBUG_RETURN(error_num);
+}
+
+void spider_conn_done(  /* tear down conn's loop-check hashes and mutex, freeing loop_checked elements */
+  SPIDER_CONN *conn
+) {
+  SPIDER_CONN_LOOP_CHECK *lcptr;
+  DBUG_ENTER("spider_conn_done");
+  uint l = 0;  /* index of the next loop_checked element to free */
+  while ((lcptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_element(  /* stop when my_hash_element() yields NULL */
+    &conn->loop_checked, l)))
+  {
+    spider_free(spider_current_trx, lcptr, MYF(0));  /* element stays in the hash; the hash is freed below */
+    ++l;
+  }
+  spider_free_mem_calc(spider_current_trx,
+    conn->loop_check_queue_id,
+    conn->loop_check_queue.array.max_element *
+    conn->loop_check_queue.array.size_of_element);
+  my_hash_free(&conn->loop_check_queue);  /* NOTE(review): queue elements are not freed here; presumably shared with loop_checked -- confirm */
+  spider_free_mem_calc(spider_current_trx,
+    conn->loop_checked_id,
+    conn->loop_checked.array.max_element *
+    conn->loop_checked.array.size_of_element);
+  my_hash_free(&conn->loop_checked);
+  pthread_mutex_destroy(&conn->loop_check_mutex);  /* destroyed last, after both hashes are gone */
+  DBUG_VOID_RETURN;
+}
+
 int spider_reset_conn_setted_parameter(
   SPIDER_CONN *conn,
   THD *thd
@@ -183,7 +283,7 @@ int spider_reset_conn_setted_parameter(
     conn->default_database.length(default_database_length);
   } else
     conn->default_database.length(0);
-  DBUG_RETURN(0);
+  DBUG_RETURN(spider_conn_reset_queue_loop_check(conn));
 }
 
 int spider_free_conn_alloc(
@@ -199,6 +299,7 @@ int spider_free_conn_alloc(
     delete conn->db_conn;
     conn->db_conn = NULL;
   }
+  spider_conn_done(conn);
   DBUG_ASSERT(!conn->mta_conn_mutex_file_pos.file_name);
   pthread_mutex_destroy(&conn->mta_conn_mutex);
   conn->default_database.free();
@@ -312,6 +413,10 @@ void spider_free_conn_from_trx(
         }
       } else {
         /* conn_recycle_mode == 0 */
+        if (conn->quick_target)
+        {
+          spider_db_free_result((ha_spider *) conn->quick_target, TRUE);
+        }
         spider_free_conn(conn);
       }
     } else if (roop_count)
@@ -459,8 +564,9 @@ SPIDER_CONN *spider_create_conn(
   SPIDER_CONN *conn;
   SPIDER_IP_PORT_CONN *ip_port_conn;
   char *tmp_name, *tmp_host, *tmp_username, *tmp_password, *tmp_socket;
-  char *tmp_wrapper, *tmp_ssl_ca, *tmp_ssl_capath, *tmp_ssl_cert;
+  char *tmp_wrapper, *tmp_db, *tmp_ssl_ca, *tmp_ssl_capath, *tmp_ssl_cert;
   char *tmp_ssl_cipher, *tmp_ssl_key, *tmp_default_file, *tmp_default_group;
+  char *tmp_dsn;
   DBUG_ENTER("spider_create_conn");
 
   if (unlikely(!UTC))
@@ -474,32 +580,45 @@ SPIDER_CONN *spider_create_conn(
   if (conn_kind == SPIDER_CONN_KIND_MYSQL)
   {
 #endif
+    bool tables_on_different_db_are_joinable;
+    if (share->sql_dbton_ids[link_idx] != SPIDER_DBTON_SIZE)
+    {
+      tables_on_different_db_are_joinable =
+        spider_dbton[share->sql_dbton_ids[link_idx]].db_util->
+          tables_on_different_db_are_joinable();
+    } else {
+      tables_on_different_db_are_joinable = TRUE;
+    }
     if (!(conn = (SPIDER_CONN *)
       spider_bulk_malloc(spider_current_trx, 18, MYF(MY_WME | MY_ZEROFILL),
-        &conn, sizeof(*conn),
-        &tmp_name, share->conn_keys_lengths[link_idx] + 1,
-        &tmp_host, share->tgt_hosts_lengths[link_idx] + 1,
+        &conn, (uint) (sizeof(*conn)),
+        &tmp_name, (uint) (share->conn_keys_lengths[link_idx] + 1),
+        &tmp_host, (uint) (share->tgt_hosts_lengths[link_idx] + 1),
         &tmp_username,
-          share->tgt_usernames_lengths[link_idx] + 1,
+          (uint) (share->tgt_usernames_lengths[link_idx] + 1),
         &tmp_password,
-          share->tgt_passwords_lengths[link_idx] + 1,
-        &tmp_socket, share->tgt_sockets_lengths[link_idx] + 1,
+          (uint) (share->tgt_passwords_lengths[link_idx] + 1),
+        &tmp_socket, (uint) (share->tgt_sockets_lengths[link_idx] + 1),
         &tmp_wrapper,
-          share->tgt_wrappers_lengths[link_idx] + 1,
-        &tmp_ssl_ca, share->tgt_ssl_cas_lengths[link_idx] + 1,
+          (uint) (share->tgt_wrappers_lengths[link_idx] + 1),
+        &tmp_db, (uint) (tables_on_different_db_are_joinable ?
+          0 : share->tgt_dbs_lengths[link_idx] + 1),
+        &tmp_ssl_ca, (uint) (share->tgt_ssl_cas_lengths[link_idx] + 1),
         &tmp_ssl_capath,
-          share->tgt_ssl_capaths_lengths[link_idx] + 1,
+          (uint) (share->tgt_ssl_capaths_lengths[link_idx] + 1),
         &tmp_ssl_cert,
-          share->tgt_ssl_certs_lengths[link_idx] + 1,
+          (uint) (share->tgt_ssl_certs_lengths[link_idx] + 1),
         &tmp_ssl_cipher,
-          share->tgt_ssl_ciphers_lengths[link_idx] + 1,
+          (uint) (share->tgt_ssl_ciphers_lengths[link_idx] + 1),
         &tmp_ssl_key,
-          share->tgt_ssl_keys_lengths[link_idx] + 1,
+          (uint) (share->tgt_ssl_keys_lengths[link_idx] + 1),
         &tmp_default_file,
-          share->tgt_default_files_lengths[link_idx] + 1,
+          (uint) (share->tgt_default_files_lengths[link_idx] + 1),
         &tmp_default_group,
-          share->tgt_default_groups_lengths[link_idx] + 1,
-        &need_mon, sizeof(int),
+          (uint) (share->tgt_default_groups_lengths[link_idx] + 1),
+        &tmp_dsn,
+          (uint) (share->tgt_dsns_lengths[link_idx] + 1),
+        &need_mon, (uint) (sizeof(int)),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -534,6 +653,13 @@ SPIDER_CONN *spider_create_conn(
     conn->tgt_wrapper = tmp_wrapper;
     memcpy(conn->tgt_wrapper, share->tgt_wrappers[link_idx],
       share->tgt_wrappers_lengths[link_idx]);
+    if (!tables_on_different_db_are_joinable)
+    {
+      conn->tgt_db_length = share->tgt_dbs_lengths[link_idx];
+      conn->tgt_db = tmp_db;
+      memcpy(conn->tgt_db, share->tgt_dbs[link_idx],
+        share->tgt_dbs_lengths[link_idx]);
+    }
     conn->tgt_ssl_ca_length = share->tgt_ssl_cas_lengths[link_idx];
     if (conn->tgt_ssl_ca_length)
     {
@@ -591,6 +717,15 @@ SPIDER_CONN *spider_create_conn(
         share->tgt_default_groups_lengths[link_idx]);
     } else
       conn->tgt_default_group = NULL;
+    conn->tgt_dsn_length =
+      share->tgt_dsns_lengths[link_idx];
+    if (conn->tgt_dsn_length)
+    {
+      conn->tgt_dsn = tmp_dsn;
+      memcpy(conn->tgt_dsn, share->tgt_dsns[link_idx],
+        share->tgt_dsns_lengths[link_idx]);
+    } else
+      conn->tgt_dsn = NULL;
     conn->tgt_port = share->tgt_ports[link_idx];
     conn->tgt_ssl_vsc = share->tgt_ssl_vscs[link_idx];
     conn->dbton_id = share->sql_dbton_ids[link_idx];
@@ -598,13 +733,13 @@ SPIDER_CONN *spider_create_conn(
   } else if (conn_kind == SPIDER_CONN_KIND_HS_READ) {
     if (!(conn = (SPIDER_CONN *)
       spider_bulk_malloc(spider_current_trx, 19, MYF(MY_WME | MY_ZEROFILL),
-        &conn, sizeof(*conn),
-        &tmp_name, share->hs_read_conn_keys_lengths[link_idx] + 1,
-        &tmp_host, share->tgt_hosts_lengths[link_idx] + 1,
-        &tmp_socket, share->hs_read_socks_lengths[link_idx] + 1,
+        &conn, (uint) (sizeof(*conn)),
+        &tmp_name, (uint) (share->hs_read_conn_keys_lengths[link_idx] + 1),
+        &tmp_host, (uint) (share->tgt_hosts_lengths[link_idx] + 1),
+        &tmp_socket, (uint) (share->hs_read_socks_lengths[link_idx] + 1),
         &tmp_wrapper,
-          share->tgt_wrappers_lengths[link_idx] + 1,
-        &need_mon, sizeof(int),
+          (uint) (share->tgt_wrappers_lengths[link_idx] + 1),
+        &need_mon, (uint) (sizeof(int)),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -640,13 +775,13 @@ SPIDER_CONN *spider_create_conn(
   } else {
     if (!(conn = (SPIDER_CONN *)
       spider_bulk_malloc(spider_current_trx, 20, MYF(MY_WME | MY_ZEROFILL),
-        &conn, sizeof(*conn),
-        &tmp_name, share->hs_write_conn_keys_lengths[link_idx] + 1,
-        &tmp_host, share->tgt_hosts_lengths[link_idx] + 1,
-        &tmp_socket, share->hs_write_socks_lengths[link_idx] + 1,
+        &conn, (uint) (sizeof(*conn)),
+        &tmp_name, (uint) (share->hs_write_conn_keys_lengths[link_idx] + 1),
+        &tmp_host, (uint) (share->tgt_hosts_lengths[link_idx] + 1),
+        &tmp_socket, (uint) (share->hs_write_socks_lengths[link_idx] + 1),
         &tmp_wrapper,
-          share->tgt_wrappers_lengths[link_idx] + 1,
-        &need_mon, sizeof(int),
+          (uint) (share->tgt_wrappers_lengths[link_idx] + 1),
+        &need_mon, (uint) (sizeof(int)),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -737,6 +872,11 @@ SPIDER_CONN *spider_create_conn(
     goto error_mta_conn_mutex_init;
   }
 
+  if (unlikely((*error_num = spider_conn_init(conn))))
+  {
+    goto error_conn_init;
+  }
+
   spider_conn_queue_connect(share, conn, link_idx);
   conn->ping_time = (time_t) time((time_t*) 0);
   conn->connect_error_time = conn->ping_time;
@@ -788,12 +928,10 @@ SPIDER_CONN *spider_create_conn(
 
   DBUG_RETURN(conn);
 
-/*
-error_init_lock_table_hash:
-  DBUG_ASSERT(!conn->mta_conn_mutex_file_pos.file_name);
-  pthread_mutex_destroy(&conn->mta_conn_mutex);
-*/
 error_too_many_ipport_count:
+  spider_conn_done(conn);
+error_conn_init:
+  pthread_mutex_destroy(&conn->mta_conn_mutex);
 error_mta_conn_mutex_init:
 error_db_conn_init:
   delete conn->db_conn;
@@ -957,7 +1095,7 @@ SPIDER_CONN *spider_get_conn(
 
           }
           else
-          {	/* did not enable conncetion pool , create_conn */
+          { /* did not enable conncetion pool , create_conn */
             DBUG_PRINT("info",("spider create new conn"));
             if (!(conn = spider_create_conn(share, spider, link_idx,
               base_link_idx, conn_kind, error_num)))
@@ -1228,6 +1366,20 @@ SPIDER_CONN *spider_get_conn(
       conn->queued_ping = FALSE;
   }
 
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  if (conn_kind == SPIDER_CONN_KIND_MYSQL)
+  {
+#endif
+    if (unlikely(spider && spider->wide_handler->top_share &&
+      (*error_num = spider_conn_queue_loop_check(
+        conn, spider, base_link_idx))))
+    {
+      goto error;
+    }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  }
+#endif
+
   DBUG_PRINT("info",("spider conn=%p", conn));
   DBUG_RETURN(conn);
 
@@ -1283,7 +1435,7 @@ int spider_check_and_get_casual_read_conn(
     if (
       !(spider->conns[link_idx] =
         spider_get_conn(spider->share, link_idx,
-          spider->conn_keys[link_idx], spider->trx,
+          spider->conn_keys[link_idx], spider->wide_handler->trx,
           spider, FALSE, TRUE, SPIDER_CONN_KIND_MYSQL,
           &error_num))
     ) {
@@ -1480,6 +1632,424 @@ void spider_conn_queue_UTC_time_zone(
   DBUG_VOID_RETURN;
 }
 
+/* Queue lcptr on conn->loop_check_queue, or merge it into the entry already
+   queued for the same to_name. Returns 0 or HA_ERR_OUT_OF_MEM; on failure the
+   current lcptr is freed. The caller holds conn->loop_check_mutex. */
+int spider_conn_queue_and_merge_loop_check(
+  SPIDER_CONN *conn,
+  SPIDER_CONN_LOOP_CHECK *lcptr
+) {
+  int error_num = HA_ERR_OUT_OF_MEM;
+  char *tmp_name, *from_name, *cur_name, *to_name, *full_name, *from_value,
+    *merged_value;
+  SPIDER_CONN_LOOP_CHECK *lcqptr, *lcrptr;
+  DBUG_ENTER("spider_conn_queue_and_merge_loop_check");
+  DBUG_PRINT("info", ("spider conn=%p", conn));
+#ifdef SPIDER_HAS_HASH_VALUE_TYPE
+  if (unlikely(!(lcqptr = (SPIDER_CONN_LOOP_CHECK *)
+    my_hash_search_using_hash_value(&conn->loop_check_queue,
+    lcptr->hash_value_to,
+    (uchar *) lcptr->to_name.str, lcptr->to_name.length))))
+#else
+  if (unlikely(!(lcqptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_search(
+    &conn->loop_check_queue,
+    (uchar *) lcptr->to_name.str, lcptr->to_name.length))))
+#endif
+  {
+    DBUG_PRINT("info", ("spider create merged_value and insert"));
+    lcptr->merged_value.length = spider_unique_id.length +
+      lcptr->cur_name.length + lcptr->from_value.length + 1;
+    tmp_name = (char *) lcptr->merged_value.str; /* filled in place */
+    memcpy(tmp_name, spider_unique_id.str, spider_unique_id.length);
+    tmp_name += spider_unique_id.length;
+    memcpy(tmp_name, lcptr->cur_name.str, lcptr->cur_name.length);
+    tmp_name += lcptr->cur_name.length;
+    *tmp_name++ = '-';
+    memcpy(tmp_name, lcptr->from_value.str, lcptr->from_value.length + 1);
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+    if (unlikely(my_hash_insert_with_hash_value(&conn->loop_check_queue,
+      lcptr->hash_value_to, (uchar *) lcptr)))
+#else
+    if (unlikely(my_hash_insert(&conn->loop_check_queue, (uchar *) lcptr)))
+#endif
+      goto error_hash_insert_queue;
+    lcptr->flag |= SPIDER_LOP_CHK_QUEUED;
+  } else {
+    DBUG_PRINT("info", ("spider append merged_value and replace"));
+    if (unlikely(!spider_bulk_malloc(spider_current_trx, 271, MYF(MY_WME),
+      &lcrptr, (uint) (sizeof(SPIDER_CONN_LOOP_CHECK)),
+      &from_name, (uint) (lcqptr->from_name.length + 1),
+      &cur_name, (uint) (lcqptr->cur_name.length + 1),
+      &to_name, (uint) (lcqptr->to_name.length + 1),
+      &full_name, (uint) (lcqptr->full_name.length + 1),
+      &from_value, (uint) (lcqptr->from_value.length + 1),
+      &merged_value, (uint) (lcqptr->merged_value.length +
+        spider_unique_id.length + lcptr->cur_name.length +
+        lcptr->from_value.length + 2),
+      NullS)
+    )) {
+      goto error_alloc_loop_check_replace;
+    }
+#ifdef SPIDER_HAS_HASH_VALUE_TYPE
+    lcrptr->hash_value_to = lcqptr->hash_value_to;
+    lcrptr->hash_value_full = lcqptr->hash_value_full;
+#endif
+    lcrptr->from_name.str = from_name;
+    lcrptr->from_name.length = lcqptr->from_name.length;
+    memcpy(from_name, lcqptr->from_name.str, lcqptr->from_name.length + 1);
+    lcrptr->cur_name.str = cur_name;
+    lcrptr->cur_name.length = lcqptr->cur_name.length;
+    memcpy(cur_name, lcqptr->cur_name.str, lcqptr->cur_name.length + 1);
+    lcrptr->to_name.str = to_name;
+    lcrptr->to_name.length = lcqptr->to_name.length;
+    memcpy(to_name, lcqptr->to_name.str, lcqptr->to_name.length + 1);
+    lcrptr->full_name.str = full_name;
+    lcrptr->full_name.length = lcqptr->full_name.length;
+    memcpy(full_name, lcqptr->full_name.str, lcqptr->full_name.length + 1);
+    lcrptr->from_value.str = from_value;
+    lcrptr->from_value.length = lcqptr->from_value.length;
+    memcpy(from_value, lcqptr->from_value.str, lcqptr->from_value.length + 1);
+    lcrptr->merged_value.str = merged_value;
+    /* NOTE(review): length excludes the text appended below - confirm. */
+    lcrptr->merged_value.length = lcqptr->merged_value.length;
+    memcpy(merged_value,
+      lcqptr->merged_value.str, lcqptr->merged_value.length);
+    merged_value += lcqptr->merged_value.length;
+    memcpy(merged_value, spider_unique_id.str, spider_unique_id.length);
+    merged_value += spider_unique_id.length;
+    memcpy(merged_value, lcptr->cur_name.str, lcptr->cur_name.length);
+    merged_value += lcptr->cur_name.length;
+    *merged_value++ = '-';
+    memcpy(merged_value, lcptr->from_value.str, lcptr->from_value.length + 1);
+
+    DBUG_PRINT("info", ("spider free lcqptr"));
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+    my_hash_delete_with_hash_value(&conn->loop_checked,
+      lcqptr->hash_value_full, (uchar *) lcqptr);
+    my_hash_delete_with_hash_value(&conn->loop_check_queue,
+      lcqptr->hash_value_to, (uchar *) lcqptr);
+#else
+    my_hash_delete(&conn->loop_checked, (uchar*) lcqptr);
+    my_hash_delete(&conn->loop_check_queue, (uchar*) lcqptr);
+#endif
+    spider_free(spider_current_trx, lcqptr, MYF(0));
+
+    lcptr = lcrptr; /* the replacement takes the queued entry's place */
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+    if (unlikely(my_hash_insert_with_hash_value(&conn->loop_checked,
+      lcptr->hash_value_full, (uchar *) lcptr)))
+#else
+    if (unlikely(my_hash_insert(&conn->loop_checked, (uchar *) lcptr)))
+#endif
+    {
+      goto error_hash_insert;
+    }
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+    if (unlikely(my_hash_insert_with_hash_value(&conn->loop_check_queue,
+      lcptr->hash_value_to, (uchar *) lcptr)))
+#else
+    if (unlikely(my_hash_insert(&conn->loop_check_queue, (uchar *) lcptr)))
+#endif
+    {
+      goto error_hash_insert_queue;
+    }
+    lcptr->flag = SPIDER_LOP_CHK_MERAGED;
+    lcptr->next = NULL;
+    if (!conn->loop_check_meraged_first)
+    {
+      conn->loop_check_meraged_first = lcptr;
+      conn->loop_check_meraged_last = lcptr;
+    } else {
+      conn->loop_check_meraged_last->next = lcptr;
+      conn->loop_check_meraged_last = lcptr;
+    }
+  }
+  DBUG_RETURN(0);
+
+error_alloc_loop_check_replace:
+error_hash_insert_queue:
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+  my_hash_delete_with_hash_value(&conn->loop_checked,
+    lcptr->hash_value_full, (uchar *) lcptr);
+#else
+  my_hash_delete(&conn->loop_checked, (uchar*) lcptr);
+#endif
+error_hash_insert:
+  spider_free(spider_current_trx, lcptr, MYF(0));
+  /* loop_check_mutex was locked by the caller, so it is not unlocked here. */
+  DBUG_RETURN(error_num);
+}
+
+/* Free every unflagged entry of conn->loop_checked, then clear the flag of
+   each entry on the ignored list and queue/merge it again.
+   Returns 0, or the error of the first entry that could not be queued. */
+int spider_conn_reset_queue_loop_check(
+  SPIDER_CONN *conn
+) {
+  int error_num;
+  SPIDER_CONN_LOOP_CHECK *lcptr, *next;
+  DBUG_ENTER("spider_conn_reset_queue_loop_check");
+  uint l = 0;
+  pthread_mutex_lock(&conn->loop_check_mutex);
+  while ((lcptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_element(
+    &conn->loop_checked, l)))
+  {
+    if (!lcptr->flag)
+    {
+      DBUG_PRINT("info", ("spider free lcptr"));
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+      my_hash_delete_with_hash_value(&conn->loop_checked,
+        lcptr->hash_value_full, (uchar *) lcptr);
+#else
+      my_hash_delete(&conn->loop_checked, (uchar*) lcptr);
+#endif
+      spider_free(spider_current_trx, lcptr, MYF(0));
+    }
+    /* NOTE(review): a delete may move another record into slot l - confirm. */
+    ++l;
+  }
+
+  lcptr = conn->loop_check_ignored_first;
+  while (lcptr)
+  {
+    /* Read the link first: the callee frees lcptr when it fails. */
+    next = lcptr->next;
+    lcptr->flag = 0;
+    if ((error_num = spider_conn_queue_and_merge_loop_check(conn, lcptr)))
+      goto error_queue_and_merge;
+    lcptr = next;
+  }
+  conn->loop_check_meraged_first = NULL;
+  pthread_mutex_unlock(&conn->loop_check_mutex);
+  DBUG_RETURN(0);
+
+error_queue_and_merge:
+  for (lcptr = next; lcptr; lcptr = lcptr->next)
+    lcptr->flag = 0;
+  conn->loop_check_meraged_first = NULL;
+  pthread_mutex_unlock(&conn->loop_check_mutex);
+  DBUG_RETURN(error_num);
+}
+
+/* Register the loop-check entry for link_idx on conn: a new or changed entry
+   is inserted into loop_checked and queued/merged; an unchanged, unflagged
+   one is appended to the ignored list. Returns 0 or an error code. */
+int spider_conn_queue_loop_check(
+  SPIDER_CONN *conn,
+  ha_spider *spider,
+  int link_idx
+) {
+  int error_num = HA_ERR_OUT_OF_MEM;
+  uint conn_link_idx = spider->conn_link_idx[link_idx], buf_sz;
+  char path[FN_REFLEN + 1];
+  char *tmp_name, *from_name, *cur_name, *to_name, *full_name, *from_value,
+    *merged_value;
+  user_var_entry *loop_check;
+  char *loop_check_buf;
+  THD *thd = spider->wide_handler->trx->thd;
+  TABLE_SHARE *top_share = spider->wide_handler->top_share;
+  SPIDER_SHARE *share = spider->share;
+  SPIDER_CONN_LOOP_CHECK *lcptr;
+  LEX_CSTRING lex_str, from_str, to_str;
+  DBUG_ENTER("spider_conn_queue_loop_check");
+  DBUG_PRINT("info", ("spider conn=%p", conn));
+  lex_str.length = top_share->path.length + SPIDER_SQL_LOP_CHK_PRM_PRF_LEN;
+  buf_sz = lex_str.length + 2;
+  loop_check_buf = (char *) my_alloca(buf_sz);
+  if (unlikely(!loop_check_buf))
+  {
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+  }
+  lex_str.str = loop_check_buf; /* variable name: prefix + top table path */
+  memcpy(loop_check_buf,
+    SPIDER_SQL_LOP_CHK_PRM_PRF_STR, SPIDER_SQL_LOP_CHK_PRM_PRF_LEN);
+  memcpy(loop_check_buf + SPIDER_SQL_LOP_CHK_PRM_PRF_LEN,
+    top_share->path.str, top_share->path.length);
+  loop_check_buf[lex_str.length] = '\0';
+  DBUG_PRINT("info", ("spider param name=%s", lex_str.str));
+  loop_check = get_variable(&thd->user_vars, &lex_str, FALSE);
+  if (!loop_check || loop_check->type != STRING_RESULT)
+  {
+    DBUG_PRINT("info", ("spider client is not Spider"));
+    lex_str.str = "";
+    lex_str.length = 0;
+    from_str.str = "";
+    from_str.length = 0;
+  } else {
+    lex_str.str = loop_check->value; /* lex_str now holds the value */
+    lex_str.length = loop_check->length;
+    DBUG_PRINT("info", ("spider from_str=%s", lex_str.str));
+    if (unlikely(!(tmp_name = strchr(loop_check->value, '-'))))
+    {
+      DBUG_PRINT("info", ("spider invalid value for loop checking 1"));
+      from_str.str = "";
+      from_str.length = 0;
+    }
+    else if (unlikely(!(tmp_name = strchr(tmp_name + 1, '-'))))
+    {
+      DBUG_PRINT("info", ("spider invalid value for loop checking 2"));
+      from_str.str = "";
+      from_str.length = 0;
+    }
+    else if (unlikely(!(tmp_name = strchr(tmp_name + 1, '-'))))
+    {
+      DBUG_PRINT("info", ("spider invalid value for loop checking 3"));
+      from_str.str = "";
+      from_str.length = 0;
+    }
+    else if (unlikely(!(tmp_name = strchr(tmp_name + 1, '-'))))
+    {
+      DBUG_PRINT("info", ("spider invalid value for loop checking 4"));
+      from_str.str = "";
+      from_str.length = 0;
+    }
+    else
+    {
+      from_str.str = lex_str.str;
+      from_str.length = tmp_name - lex_str.str + 1; /* through the 4th '-' */
+    }
+  }
+  my_afree(loop_check_buf);
+
+  to_str.length = build_table_filename(path, FN_REFLEN,
+    share->tgt_dbs[conn_link_idx] ? share->tgt_dbs[conn_link_idx] : "",
+    share->tgt_table_names[conn_link_idx], "", 0);
+  to_str.str = path;
+  DBUG_PRINT("info", ("spider to=%s", to_str.str));
+  buf_sz = from_str.length + top_share->path.length + to_str.length + 3;
+  loop_check_buf = (char *) my_alloca(buf_sz); /* two '-' and the NUL */
+  if (unlikely(!loop_check_buf))
+  {
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+  }
+  DBUG_PRINT("info", ("spider top_share->path=%s", top_share->path.str));
+  memcpy(loop_check_buf, from_str.str, from_str.length);
+  tmp_name = loop_check_buf + from_str.length;
+  *tmp_name++ = '-';
+  memcpy(tmp_name, top_share->path.str, top_share->path.length);
+  tmp_name += top_share->path.length;
+  *tmp_name++ = '-';
+  memcpy(tmp_name, to_str.str, to_str.length);
+  tmp_name += to_str.length;
+  *tmp_name = '\0'; /* key: from_str '-' top path '-' to_str */
+#ifdef SPIDER_HAS_HASH_VALUE_TYPE
+  my_hash_value_type hash_value = my_calc_hash(&conn->loop_checked,
+    (uchar *) loop_check_buf, buf_sz - 1);
+#endif
+  pthread_mutex_lock(&conn->loop_check_mutex);
+#ifdef SPIDER_HAS_HASH_VALUE_TYPE
+  lcptr = (SPIDER_CONN_LOOP_CHECK *)
+    my_hash_search_using_hash_value(&conn->loop_checked, hash_value,
+    (uchar *) loop_check_buf, buf_sz - 1);
+#else
+  lcptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_search(
+    &conn->loop_checked, (uchar *) loop_check_buf, buf_sz - 1);
+#endif
+  if (unlikely(
+    !lcptr ||
+    (
+      !lcptr->flag &&
+      (
+        lcptr->from_value.length != lex_str.length ||
+        memcmp(lcptr->from_value.str, lex_str.str, lex_str.length)
+      )
+    )
+  ))
+  {
+    if (unlikely(lcptr))
+    {
+      DBUG_PRINT("info", ("spider free lcptr"));
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+      my_hash_delete_with_hash_value(&conn->loop_checked,
+        lcptr->hash_value_full, (uchar *) lcptr);
+#else
+      my_hash_delete(&conn->loop_checked, (uchar*) lcptr);
+#endif
+      spider_free(spider_current_trx, lcptr, MYF(0));
+    }
+    DBUG_PRINT("info", ("spider alloc_lcptr"));
+    if (unlikely(!spider_bulk_malloc(spider_current_trx, 272, MYF(MY_WME),
+      &lcptr, (uint) (sizeof(SPIDER_CONN_LOOP_CHECK)),
+      &from_name, (uint) (from_str.length + 1),
+      &cur_name, (uint) (top_share->path.length + 1),
+      &to_name, (uint) (to_str.length + 1),
+      &full_name, (uint) (buf_sz),
+      &from_value, (uint) (lex_str.length + 1),
+      &merged_value, (uint) (spider_unique_id.length + top_share->path.length +
+        lex_str.length + 2),
+      NullS)
+    )) {
+      my_afree(loop_check_buf);
+      goto error_alloc_loop_check;
+    }
+    lcptr->flag = 0;
+    lcptr->from_name.str = from_name;
+    lcptr->from_name.length = from_str.length;
+    memcpy(from_name, from_str.str, from_str.length);
+    from_name[from_str.length] = '\0'; /* from_str may be a prefix only */
+    lcptr->cur_name.str = cur_name;
+    lcptr->cur_name.length = top_share->path.length;
+    memcpy(cur_name, top_share->path.str, top_share->path.length + 1);
+    lcptr->to_name.str = to_name;
+    lcptr->to_name.length = to_str.length;
+    memcpy(to_name, to_str.str, to_str.length + 1);
+    lcptr->full_name.str = full_name;
+    lcptr->full_name.length = buf_sz - 1;
+    memcpy(full_name, loop_check_buf, buf_sz);
+    lcptr->from_value.str = from_value;
+    lcptr->from_value.length = lex_str.length;
+    memcpy(from_value, lex_str.str, lex_str.length + 1);
+    lcptr->merged_value.str = merged_value; /* filled when it is queued */
+#ifdef SPIDER_HAS_HASH_VALUE_TYPE
+    lcptr->hash_value_to = my_calc_hash(&conn->loop_checked,
+      (uchar *) to_str.str, to_str.length);
+    lcptr->hash_value_full = hash_value;
+#endif
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+    if (unlikely(my_hash_insert_with_hash_value(&conn->loop_checked,
+      lcptr->hash_value_full, (uchar *) lcptr)))
+#else
+    if (unlikely(my_hash_insert(&conn->loop_checked, (uchar *) lcptr)))
+#endif
+    {
+      my_afree(loop_check_buf);
+      goto error_hash_insert;
+    }
+  } else {
+    if (!lcptr->flag)
+    {
+      DBUG_PRINT("info", ("spider add to ignored list"));
+      lcptr->flag |= SPIDER_LOP_CHK_IGNORED;
+      lcptr->next = NULL;
+      if (!conn->loop_check_ignored_first)
+      {
+        conn->loop_check_ignored_first = lcptr;
+        conn->loop_check_ignored_last = lcptr;
+      } else {
+        conn->loop_check_ignored_last->next = lcptr;
+        conn->loop_check_ignored_last = lcptr;
+      }
+    }
+    pthread_mutex_unlock(&conn->loop_check_mutex);
+    my_afree(loop_check_buf);
+    DBUG_PRINT("info", ("spider be sent or queued already"));
+    DBUG_RETURN(0);
+  }
+  my_afree(loop_check_buf);
+
+  if ((error_num = spider_conn_queue_and_merge_loop_check(conn, lcptr)))
+    goto error_queue_and_merge;
+  pthread_mutex_unlock(&conn->loop_check_mutex);
+  DBUG_RETURN(0);
+
+error_hash_insert:
+  spider_free(spider_current_trx, lcptr, MYF(0));
+error_alloc_loop_check: /* the mutex is held on this path too */
+error_queue_and_merge:
+  pthread_mutex_unlock(&conn->loop_check_mutex);
+  DBUG_RETURN(error_num);
+}
+
 void spider_conn_queue_start_transaction(
   SPIDER_CONN *conn
 ) {
@@ -1745,7 +2315,8 @@ int spider_set_conn_bg_param(
   int error_num, roop_count, bgs_mode;
   SPIDER_SHARE *share = spider->share;
   SPIDER_RESULT_LIST *result_list = &spider->result_list;
-  THD *thd = spider->trx->thd;
+  SPIDER_WIDE_HANDLER *wide_handler = spider->wide_handler;
+  THD *thd = wide_handler->trx->thd;
   DBUG_ENTER("spider_set_conn_bg_param");
   DBUG_PRINT("info",("spider spider=%p", spider));
   bgs_mode =
@@ -1754,10 +2325,11 @@ int spider_set_conn_bg_param(
     result_list->bgs_phase = 0;
   else if (
     bgs_mode <= 2 &&
-    (result_list->lock_type == F_WRLCK || spider->lock_mode == 2)
+    (wide_handler->external_lock_type == F_WRLCK ||
+      wide_handler->lock_mode == 2)
   )
     result_list->bgs_phase = 0;
-  else if (bgs_mode <= 1 && spider->lock_mode == 1)
+  else if (bgs_mode <= 1 && wide_handler->lock_mode == 1)
     result_list->bgs_phase = 0;
   else {
     result_list->bgs_phase = 1;
@@ -1804,12 +2376,12 @@ int spider_set_conn_bg_param(
       for (
         roop_count = spider_conn_link_idx_next(share->link_statuses,
           spider->conn_link_idx, -1, share->link_count,
-          spider->lock_mode ?
+          spider->wide_handler->lock_mode ?
           SPIDER_LINK_STATUS_RECOVERY : SPIDER_LINK_STATUS_OK);
         roop_count < (int) share->link_count;
         roop_count = spider_conn_link_idx_next(share->link_statuses,
           spider->conn_link_idx, roop_count, share->link_count,
-          spider->lock_mode ?
+          spider->wide_handler->lock_mode ?
           SPIDER_LINK_STATUS_RECOVERY : SPIDER_LINK_STATUS_OK)
       ) {
         if ((error_num = spider_create_conn_thread(spider->conns[roop_count])))
@@ -2291,6 +2863,21 @@ int spider_bg_conn_search(
             }
           }
           result_list->bgs_phase = 2;
+          if (conn->db_conn->limit_mode() == 1)
+          {
+            conn->db_conn->set_limit(result_list->limit_num);
+            if (!discard_result)
+            {
+              if ((error_num = spider_db_store_result_for_reuse_cursor(
+                spider, link_idx, result_list->table)))
+              {
+                pthread_mutex_unlock(&conn->bg_conn_mutex);
+                DBUG_RETURN(error_num);
+              }
+            }
+            pthread_mutex_unlock(&conn->bg_conn_mutex);
+            DBUG_RETURN(0);
+          }
         }
         result_list->bgs_working = TRUE;
         conn->bg_search = TRUE;
@@ -2425,6 +3012,21 @@ int spider_bg_conn_search(
               DBUG_RETURN(error_num);
             }
           }
+          if (conn->db_conn->limit_mode() == 1)
+          {
+            conn->db_conn->set_limit(result_list->limit_num);
+            if (!discard_result)
+            {
+              if ((error_num = spider_db_store_result_for_reuse_cursor(
+                spider, link_idx, result_list->table)))
+              {
+                pthread_mutex_unlock(&conn->bg_conn_mutex);
+                DBUG_RETURN(error_num);
+              }
+            }
+            pthread_mutex_unlock(&conn->bg_conn_mutex);
+            DBUG_RETURN(0);
+          }
         }
         conn->bg_target = spider;
         conn->link_idx = link_idx;
@@ -2519,7 +3121,7 @@ void *spider_bg_conn_action(
     pthread_cond_signal(&conn->bg_conn_sync_cond);
     pthread_mutex_unlock(&conn->bg_conn_sync_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     DBUG_RETURN(NULL);
@@ -2583,7 +3185,7 @@ void *spider_bg_conn_action(
       pthread_mutex_unlock(&conn->bg_conn_mutex);
       pthread_mutex_unlock(&conn->bg_conn_sync_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-      my_pthread_setspecific_ptr(THR_THD, NULL);
+      set_current_thd(nullptr);
 #endif
       my_thread_end();
       DBUG_RETURN(NULL);
@@ -2691,7 +3293,7 @@ void *spider_bg_conn_action(
                   conn->link_idx);
                 result_list->tmp_tables_created = TRUE;
                 spider_conn_set_timeout_from_share(conn, conn->link_idx,
-                  spider->trx->thd, share);
+                  spider->wide_handler->trx->thd, share);
                 if (dbton_handler->execute_sql(
                   SPIDER_SQL_TYPE_TMP_SQL,
                   conn,
@@ -2709,7 +3311,7 @@ void *spider_bg_conn_action(
               if (!result_list->bgs_error)
               {
                 spider_conn_set_timeout_from_share(conn, conn->link_idx,
-                  spider->trx->thd, share);
+                  spider->wide_handler->trx->thd, share);
                 if (dbton_handler->execute_sql(
                   sql_type,
                   conn,
@@ -2971,6 +3573,7 @@ void *spider_bg_sts_action(
   SPIDER_TRX *trx;
   int error_num = 0, roop_count;
   ha_spider spider;
+  SPIDER_WIDE_HANDLER wide_handler;
   int *need_mons;
   SPIDER_CONN **conns;
   uint *conn_link_idx;
@@ -3057,14 +3660,15 @@ void *spider_bg_sts_action(
     share->bg_sts_init = FALSE;
     pthread_mutex_unlock(&share->sts_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     my_afree(need_mons);
     DBUG_RETURN(NULL);
   }
   share->bg_sts_thd = thd;
-  spider.trx = trx;
+  spider.wide_handler = &wide_handler;
+  wide_handler.trx = trx;
   spider.share = share;
   spider.conns = conns;
   spider.conn_link_idx = conn_link_idx;
@@ -3117,7 +3721,7 @@ void *spider_bg_sts_action(
     share->bg_sts_init = FALSE;
     pthread_mutex_unlock(&share->sts_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     my_afree(need_mons);
@@ -3146,7 +3750,7 @@ void *spider_bg_sts_action(
       pthread_cond_signal(&share->bg_sts_sync_cond);
       pthread_mutex_unlock(&share->sts_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-      my_pthread_setspecific_ptr(THR_THD, NULL);
+      set_current_thd(nullptr);
 #endif
       my_thread_end();
       my_afree(need_mons);
@@ -3336,6 +3940,7 @@ void *spider_bg_crd_action(
   SPIDER_TRX *trx;
   int error_num = 0, roop_count;
   ha_spider spider;
+  SPIDER_WIDE_HANDLER wide_handler;
   TABLE table;
   int *need_mons;
   SPIDER_CONN **conns;
@@ -3423,7 +4028,7 @@ void *spider_bg_crd_action(
     share->bg_crd_init = FALSE;
     pthread_mutex_unlock(&share->crd_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     my_afree(need_mons);
@@ -3433,7 +4038,8 @@ void *spider_bg_crd_action(
   table.s = share->table_share;
   table.field = share->table_share->field;
   table.key_info = share->table_share->key_info;
-  spider.trx = trx;
+  spider.wide_handler = &wide_handler;
+  wide_handler.trx = trx;
   spider.change_table_ptr(&table, share->table_share);
   spider.share = share;
   spider.conns = conns;
@@ -3487,7 +4093,7 @@ void *spider_bg_crd_action(
     share->bg_crd_init = FALSE;
     pthread_mutex_unlock(&share->crd_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     my_afree(need_mons);
@@ -3516,7 +4122,7 @@ void *spider_bg_crd_action(
       pthread_cond_signal(&share->bg_crd_sync_cond);
       pthread_mutex_unlock(&share->crd_mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-      my_pthread_setspecific_ptr(THR_THD, NULL);
+      set_current_thd(nullptr);
 #endif
       my_thread_end();
       my_afree(need_mons);
@@ -3687,13 +4293,16 @@ int spider_create_mon_threads(
       }
       if (!(share->bg_mon_thds = (THD **)
         spider_bulk_malloc(spider_current_trx, 23, MYF(MY_WME | MY_ZEROFILL),
-          &share->bg_mon_thds, sizeof(THD *) * share->all_link_count,
-          &share->bg_mon_threads, sizeof(pthread_t) * share->all_link_count,
-          &share->bg_mon_mutexes, sizeof(pthread_mutex_t) *
-            share->all_link_count,
-          &share->bg_mon_conds, sizeof(pthread_cond_t) * share->all_link_count,
+          &share->bg_mon_thds,
+            (uint) (sizeof(THD *) * share->all_link_count),
+          &share->bg_mon_threads,
+            (uint) (sizeof(pthread_t) * share->all_link_count),
+          &share->bg_mon_mutexes,
+            (uint) (sizeof(pthread_mutex_t) * share->all_link_count),
+          &share->bg_mon_conds,
+            (uint) (sizeof(pthread_cond_t) * share->all_link_count),
           &share->bg_mon_sleep_conds,
-            sizeof(pthread_cond_t) * share->all_link_count,
+            (uint) (sizeof(pthread_cond_t) * share->all_link_count),
           NullS))
       ) {
         error_num = HA_ERR_OUT_OF_MEM;
@@ -3912,7 +4521,7 @@ void *spider_bg_mon_action(
     pthread_cond_signal(&share->bg_mon_conds[link_idx]);
     pthread_mutex_unlock(&share->bg_mon_mutexes[link_idx]);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     DBUG_RETURN(NULL);
@@ -3951,7 +4560,7 @@ void *spider_bg_mon_action(
       spider_free_trx(trx, TRUE);
       delete thd;
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-      my_pthread_setspecific_ptr(THR_THD, NULL);
+      set_current_thd(nullptr);
 #endif
       my_thread_end();
       DBUG_RETURN(NULL);
@@ -4116,11 +4725,12 @@ int spider_conn_get_link_status(
 int spider_conn_lock_mode(
   ha_spider *spider
 ) {
-  SPIDER_RESULT_LIST *result_list = &spider->result_list;
+  SPIDER_WIDE_HANDLER *wide_handler = spider->wide_handler; /* source of the lock type and lock mode read below */
   DBUG_ENTER("spider_conn_lock_mode");
-  if (result_list->lock_type == F_WRLCK || spider->lock_mode == 2)
+  if (wide_handler->external_lock_type == F_WRLCK ||
+    wide_handler->lock_mode == 2) /* write lock or lock_mode 2: exclusive */
     DBUG_RETURN(SPIDER_LOCK_MODE_EXCLUSIVE);
-  else if (spider->lock_mode == 1)
+  else if (wide_handler->lock_mode == 1) /* lock_mode 1: shared */
     DBUG_RETURN(SPIDER_LOCK_MODE_SHARED);
   DBUG_RETURN(SPIDER_LOCK_MODE_NO_LOCK);
 }
@@ -4143,7 +4753,7 @@ bool spider_conn_use_handler(
   int lock_mode,
   int link_idx
 ) {
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   int use_handler = spider_param_use_handler(thd,
     spider->share->use_handlers[link_idx]);
   DBUG_ENTER("spider_conn_use_handler");
@@ -4216,7 +4826,7 @@ bool spider_conn_use_handler(
     DBUG_RETURN(FALSE);
   }
   if (
-    spider->sql_command == SQLCOM_HA_READ &&
+    spider->wide_handler->sql_command == SQLCOM_HA_READ &&
     (
       !(use_handler & 2) ||
       (
@@ -4231,7 +4841,7 @@ bool spider_conn_use_handler(
     DBUG_RETURN(TRUE);
   }
   if (
-    spider->sql_command != SQLCOM_HA_READ &&
+    spider->wide_handler->sql_command != SQLCOM_HA_READ &&
     lock_mode == SPIDER_LOCK_MODE_NO_LOCK &&
     spider_param_sync_trx_isolation(thd) &&
     thd_tx_isolation(thd) != ISO_SERIALIZABLE &&
@@ -4553,7 +5163,8 @@ SPIDER_IP_PORT_CONN* spider_create_ipport_conn(SPIDER_CONN *conn)
   DBUG_ENTER("spider_create_ipport_conn");
   if (conn)
   {
-    SPIDER_IP_PORT_CONN *ret = (SPIDER_IP_PORT_CONN *) my_malloc(sizeof(*ret), MY_ZEROFILL | MY_WME);
+    SPIDER_IP_PORT_CONN *ret = (SPIDER_IP_PORT_CONN *)
+      my_malloc(PSI_INSTRUMENT_ME, sizeof(*ret), MY_ZEROFILL | MY_WME);
     if (!ret)
     {
       goto err_return_direct;
@@ -4587,8 +5198,8 @@ SPIDER_IP_PORT_CONN* spider_create_ipport_conn(SPIDER_CONN *conn)
       goto err_malloc_key;
     }
 
-    ret->key = (char *) my_malloc(ret->key_len + conn->tgt_host_length + 1,
-      MY_ZEROFILL | MY_WME);
+    ret->key = (char *) my_malloc(PSI_INSTRUMENT_ME, ret->key_len +
+                             conn->tgt_host_length + 1, MY_ZEROFILL | MY_WME);
     if (!ret->key) {
       pthread_cond_destroy(&ret->cond);
       pthread_mutex_destroy(&ret->mutex);
diff --git a/storage/spider/spd_conn.h b/storage/spider/spd_conn.h
index 1612593a1cb..92da278eecc 100644
--- a/storage/spider/spd_conn.h
+++ b/storage/spider/spd_conn.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2019-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -26,6 +26,26 @@
 #define SPIDER_SIMPLE_CHECKSUM_TABLE      4
 #endif
 
+#define SPIDER_LOP_CHK_QUEUED             (1 << 0)
+#define SPIDER_LOP_CHK_MERAGED            (1 << 1)
+#define SPIDER_LOP_CHK_IGNORED            (1 << 2)
+
+typedef struct st_spider_conn_loop_check
+{
+  uint               flag;
+#ifdef SPIDER_HAS_HASH_VALUE_TYPE
+  my_hash_value_type hash_value_to;
+  my_hash_value_type hash_value_full;
+#endif
+  LEX_CSTRING        from_name;
+  LEX_CSTRING        cur_name;
+  LEX_CSTRING        to_name;
+  LEX_CSTRING        full_name;
+  LEX_CSTRING        from_value;
+  LEX_CSTRING        merged_value;
+  st_spider_conn_loop_check *next;
+} SPIDER_CONN_LOOP_CHECK;
+
 uchar *spider_conn_get_key(
   SPIDER_CONN *conn,
   size_t *length,
@@ -38,6 +58,14 @@ uchar *spider_ipport_conn_get_key(
   my_bool not_used __attribute__ ((unused))
 );
 
+int spider_conn_init(
+  SPIDER_CONN *conn
+);
+
+void spider_conn_done(
+  SPIDER_CONN *conn
+);
+
 int spider_reset_conn_setted_parameter(
   SPIDER_CONN *conn,
   THD *thd
@@ -155,6 +183,21 @@ void spider_conn_queue_UTC_time_zone(
   SPIDER_CONN *conn
 );
 
+int spider_conn_queue_and_merge_loop_check(
+  SPIDER_CONN *conn,
+  SPIDER_CONN_LOOP_CHECK *lcptr
+);
+
+int spider_conn_reset_queue_loop_check(
+  SPIDER_CONN *conn
+);
+
+int spider_conn_queue_loop_check(
+  SPIDER_CONN *conn,
+  ha_spider *spider,
+  int link_idx
+);
+
 void spider_conn_queue_start_transaction(
   SPIDER_CONN *conn
 );
diff --git a/storage/spider/spd_copy_tables.cc b/storage/spider/spd_copy_tables.cc
index 319b02462b1..51e3b920eea 100644
--- a/storage/spider/spd_copy_tables.cc
+++ b/storage/spider/spd_copy_tables.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2009-2018 Kentoku Shiba
+/* Copyright (C) 2009-2019 Kentoku Shiba
+   Copyright (C) 2019 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -352,11 +353,7 @@ int spider_udf_get_copy_tgt_tables(
 ) {
   int error_num, roop_count;
   TABLE *table_tables = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   char table_key[MAX_KEY_LENGTH];
   SPIDER_COPY_TABLE_CONN *table_conn = NULL, *src_table_conn_prev = NULL,
     *dst_table_conn_prev = NULL;
@@ -389,12 +386,15 @@ int spider_udf_get_copy_tgt_tables(
   do {
     if (!(table_conn = (SPIDER_COPY_TABLE_CONN *)
       spider_bulk_malloc(spider_current_trx, 25, MYF(MY_WME | MY_ZEROFILL),
-        &table_conn, sizeof(SPIDER_COPY_TABLE_CONN),
-        &tmp_share, sizeof(SPIDER_SHARE),
-        &tmp_connect_info, sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT,
-        &tmp_connect_info_length, sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT,
-        &tmp_long, sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT,
-        &tmp_longlong, sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT,
+        &table_conn, (uint) (sizeof(SPIDER_COPY_TABLE_CONN)),
+        &tmp_share, (uint) (sizeof(SPIDER_SHARE)),
+        &tmp_connect_info,
+          (uint) (sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT),
+        &tmp_connect_info_length,
+          (uint) (sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT),
+        &tmp_long, (uint) (sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT),
+        &tmp_longlong,
+          (uint) (sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT),
         NullS))
     ) {
       spider_sys_index_end(table_tables);
@@ -705,12 +705,12 @@ int spider_udf_copy_tables_create_table_list(
   if (!(copy_tables->link_idxs[0] = (int *)
     spider_bulk_malloc(spider_current_trx, 26, MYF(MY_WME | MY_ZEROFILL),
       &copy_tables->link_idxs[0],
-        sizeof(int) * copy_tables->link_idx_count[0],
+        (uint) (sizeof(int) * copy_tables->link_idx_count[0]),
       &copy_tables->link_idxs[1],
-        sizeof(int) * copy_tables->link_idx_count[1],
-      &tmp_name_ptr, sizeof(char) * (
+        (uint) (sizeof(int) * copy_tables->link_idx_count[1]),
+      &tmp_name_ptr, (uint) (sizeof(char) * (
         spider_table_name_length * 2 + copy_tables->database_length + 3
-      ),
+      )),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -835,6 +835,7 @@ long long spider_copy_tables_body(
   spider_string *tmp_sql = NULL;
   SPIDER_COPY_TABLE_CONN *table_conn, *src_tbl_conn, *dst_tbl_conn;
   SPIDER_CONN *tmp_conn;
+  SPIDER_WIDE_HANDLER *wide_handler;
   spider_db_copy_table *select_ct, *insert_ct;
   MEM_ROOT mem_root;
   longlong bulk_insert_rows;
@@ -905,7 +906,7 @@ long long spider_copy_tables_body(
 
   if (!(copy_tables = (SPIDER_COPY_TABLES *)
     spider_bulk_malloc(spider_current_trx, 27, MYF(MY_WME | MY_ZEROFILL),
-      &copy_tables, sizeof(SPIDER_COPY_TABLES),
+      &copy_tables, (uint) (sizeof(SPIDER_COPY_TABLES)),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -993,7 +994,7 @@ long long spider_copy_tables_body(
 #if MYSQL_VERSION_ID < 50500
   if (open_and_lock_tables(thd, table_list))
 #else
-  table_list->mdl_request.init(
+    MDL_REQUEST_INIT(&table_list->mdl_request,
     MDL_key::TABLE,
     SPIDER_TABLE_LIST_db_str(table_list),
     SPIDER_TABLE_LIST_table_name_str(table_list),
@@ -1124,13 +1125,15 @@ long long spider_copy_tables_body(
         __func__, __FILE__, __LINE__, MYF(MY_WME | MY_ZEROFILL),
         &tmp_spider->dbton_handler,
           sizeof(spider_db_handler *) * SPIDER_DBTON_SIZE,
+        &wide_handler, sizeof(SPIDER_WIDE_HANDLER),
         NullS))
     ) {
       my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
       goto error;
     }
     tmp_spider->share = table_conn->share;
-    tmp_spider->trx = copy_tables->trx;
+    tmp_spider->wide_handler = wide_handler;
+    wide_handler->trx = copy_tables->trx;
 /*
     if (spider_db_append_set_names(table_conn->share))
     {
@@ -1143,7 +1146,7 @@ long long spider_copy_tables_body(
     tmp_sql[roop_count].set_charset(copy_tables->access_charset);
     tmp_spider->result_list.sqls = &tmp_sql[roop_count];
     tmp_spider->need_mons = &table_conn->need_mon;
-    tmp_spider->lock_type = TL_READ;
+    tmp_spider->wide_handler->lock_type = TL_READ;
     tmp_spider->conn_link_idx = &tmp_conn_link_idx;
     uint dbton_id = tmp_spider->share->use_dbton_ids[0];
     if (!(tmp_spider->dbton_handler[dbton_id] =
@@ -1168,13 +1171,15 @@ long long spider_copy_tables_body(
         __func__, __FILE__, __LINE__, MYF(MY_WME | MY_ZEROFILL),
         &tmp_spider->dbton_handler,
           sizeof(spider_db_handler *) * SPIDER_DBTON_SIZE,
+        &wide_handler, sizeof(SPIDER_WIDE_HANDLER),
         NullS))
     ) {
       my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
       goto error;
     }
     tmp_spider->share = table_conn->share;
-    tmp_spider->trx = copy_tables->trx;
+    tmp_spider->wide_handler = wide_handler;
+    wide_handler->trx = copy_tables->trx;
 /*
     if (spider_db_append_set_names(table_conn->share))
     {
@@ -1187,7 +1192,7 @@ long long spider_copy_tables_body(
     tmp_sql[roop_count].set_charset(copy_tables->access_charset);
     tmp_spider->result_list.sqls = &tmp_sql[roop_count];
     tmp_spider->need_mons = &table_conn->need_mon;
-    tmp_spider->lock_type = TL_WRITE;
+    tmp_spider->wide_handler->lock_type = TL_WRITE;
     tmp_spider->conn_link_idx = &tmp_conn_link_idx;
     uint dbton_id = tmp_spider->share->use_dbton_ids[0];
     if (!(tmp_spider->dbton_handler[dbton_id] =
diff --git a/storage/spider/spd_db_conn.cc b/storage/spider/spd_db_conn.cc
index be9ee324ee1..d2cce0ba6d0 100644
--- a/storage/spider/spd_db_conn.cc
+++ b/storage/spider/spd_db_conn.cc
@@ -257,9 +257,9 @@ int spider_db_ping(
 ) {
   DBUG_ENTER("spider_db_ping");
 #ifndef DBUG_OFF
-  if (spider->trx->thd)
+  if (spider->wide_handler->trx->thd)
     DBUG_PRINT("info", ("spider thd->query_id is %lld",
-      spider->trx->thd->query_id));
+      spider->wide_handler->trx->thd->query_id));
 #endif
   DBUG_RETURN(spider_db_ping_internal(spider->share, conn,
     spider->conn_link_idx[link_idx], &spider->need_mons[link_idx]));
@@ -383,6 +383,12 @@ int spider_db_conn_queue_action(
           append_time_zone(&sql_str, conn->queued_time_zone_val))
       ) ||
       (
+        conn->loop_check_queue.records &&
+        conn->db_conn->set_loop_check_in_bulk_sql() &&
+        (error_num = spider_dbton[conn->dbton_id].db_util->
+          append_loop_check(&sql_str, conn))
+      ) ||
+      (
         conn->queued_trx_start &&
         conn->db_conn->trx_start_in_bulk_sql() &&
         (error_num = spider_dbton[conn->dbton_id].db_util->
@@ -430,8 +436,8 @@ int spider_db_conn_queue_action(
         (!conn->queued_autocommit_val && conn->autocommit != 0)
       ) &&
       !conn->db_conn->set_autocommit_in_bulk_sql() &&
-      (error_num = spider_dbton[conn->dbton_id].db_util->
-        append_autocommit(&sql_str, conn->queued_autocommit_val))
+      (error_num = conn->db_conn->
+        set_autocommit(conn->queued_autocommit_val, (int *) conn->need_mon))
     ) {
       DBUG_RETURN(error_num);
     }
@@ -442,8 +448,8 @@ int spider_db_conn_queue_action(
         (!conn->queued_sql_log_off_val && conn->sql_log_off != 0)
       ) &&
       !conn->db_conn->set_sql_log_off_in_bulk_sql() &&
-      (error_num = spider_dbton[conn->dbton_id].db_util->
-        append_sql_log_off(&sql_str, conn->queued_sql_log_off_val))
+      (error_num = conn->db_conn->
+        set_sql_log_off(conn->queued_sql_log_off_val, (int *) conn->need_mon))
     ) {
       DBUG_RETURN(error_num);
     }
@@ -451,8 +457,9 @@ int spider_db_conn_queue_action(
       conn->queued_wait_timeout &&
       conn->queued_wait_timeout_val != conn->wait_timeout &&
       !conn->db_conn->set_wait_timeout_in_bulk_sql() &&
-      (error_num = spider_dbton[conn->dbton_id].db_util->
-        append_wait_timeout(&sql_str, conn->queued_wait_timeout_val))
+      (error_num = conn->db_conn->
+        set_wait_timeout(conn->queued_wait_timeout_val,
+        (int *) conn->need_mon))
     ) {
       DBUG_RETURN(error_num);
     }
@@ -460,8 +467,9 @@ int spider_db_conn_queue_action(
       conn->queued_sql_mode &&
       conn->queued_sql_mode_val != conn->sql_mode &&
       !conn->db_conn->set_sql_mode_in_bulk_sql() &&
-      (error_num = spider_dbton[conn->dbton_id].db_util->
-        append_sql_mode(&sql_str, conn->queued_sql_mode_val))
+      (error_num = conn->db_conn->
+        set_sql_mode(conn->queued_sql_mode_val,
+        (int *) conn->need_mon))
     ) {
       DBUG_RETURN(error_num);
     }
@@ -469,8 +477,16 @@ int spider_db_conn_queue_action(
       conn->queued_time_zone &&
       conn->queued_time_zone_val != conn->time_zone &&
       !conn->db_conn->set_time_zone_in_bulk_sql() &&
-      (error_num = spider_dbton[conn->dbton_id].db_util->
-        append_time_zone(&sql_str, conn->queued_time_zone_val))
+      (error_num = conn->db_conn->
+        set_time_zone(conn->queued_time_zone_val,
+        (int *) conn->need_mon))
+    ) {
+      DBUG_RETURN(error_num);
+    }
+    if (
+      conn->loop_check_queue.records &&
+      !conn->db_conn->set_loop_check_in_bulk_sql() &&
+      (error_num = conn->db_conn->set_loop_check((int *) conn->need_mon))
     ) {
       DBUG_RETURN(error_num);
     }
@@ -580,6 +596,11 @@ int spider_db_conn_queue_action(
       DBUG_PRINT("info", ("spider conn->time_zone=%p",
         conn->time_zone));
     }
+
+    if (conn->loop_check_queue.records)
+    {
+      conn->db_conn->fin_loop_check();
+    }
     spider_conn_clear_queue(conn);
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   } else if (conn->server_lost)
@@ -916,6 +937,7 @@ int spider_db_set_names_internal(
     }
     if (
       spider_param_use_default_database(trx->thd) &&
+      share->tgt_dbs[all_link_idx] &&
       (
         !conn->default_database.length() ||
         conn->default_database.length() !=
@@ -952,7 +974,8 @@ int spider_db_set_names(
   int link_idx
 ) {
   DBUG_ENTER("spider_db_set_names");
-  DBUG_RETURN(spider_db_set_names_internal(spider->trx, spider->share, conn,
+  DBUG_RETURN(spider_db_set_names_internal(spider->wide_handler->trx,
+    spider->share, conn,
     spider->conn_link_idx[link_idx], &spider->need_mons[link_idx]));
 }
 
@@ -977,8 +1000,8 @@ int spider_db_query_with_set_names(
       spider->need_mons[link_idx]
     ) {
       error_num = spider_ping_table_mon_from_table(
-          spider->trx,
-          spider->trx->thd,
+          spider->wide_handler->trx,
+          spider->wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -995,7 +1018,8 @@ int spider_db_query_with_set_names(
     }
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (dbton_hdl->execute_sql(
     sql_type,
@@ -1009,8 +1033,8 @@ int spider_db_query_with_set_names(
       spider->need_mons[link_idx]
     ) {
       error_num = spider_ping_table_mon_from_table(
-          spider->trx,
-          spider->trx->thd,
+          spider->wide_handler->trx,
+          spider->wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -1059,8 +1083,8 @@ int spider_db_query_for_bulk_update(
       spider->need_mons[link_idx]
     ) {
       error_num = spider_ping_table_mon_from_table(
-          spider->trx,
-          spider->trx->thd,
+          spider->wide_handler->trx,
+          spider->wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -1077,7 +1101,8 @@ int spider_db_query_for_bulk_update(
     }
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   spider_db_handler *dbton_hdl = spider->dbton_handler[conn->dbton_id];
   if (dbton_hdl->execute_sql(
@@ -1099,8 +1124,8 @@ int spider_db_query_for_bulk_update(
       spider->need_mons[link_idx]
     ) {
       error_num = spider_ping_table_mon_from_table(
-          spider->trx,
-          spider->trx->thd,
+          spider->wide_handler->trx,
+          spider->wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -1116,7 +1141,7 @@ int spider_db_query_for_bulk_update(
         );
     }
     if (
-      spider->ignore_dup_key &&
+      spider->wide_handler->ignore_dup_key &&
       (
         error_num == ER_DUP_ENTRY ||
         error_num == ER_DUP_KEY ||
@@ -1124,7 +1149,7 @@ int spider_db_query_for_bulk_update(
       )
     ) {
       ++(*dup_key_found);
-      spider->trx->thd->clear_error();
+      spider->wide_handler->trx->thd->clear_error();
       DBUG_RETURN(0);
     }
     DBUG_RETURN(error_num);
@@ -1146,8 +1171,8 @@ int spider_db_query_for_bulk_update(
       spider->need_mons[link_idx]
     ) {
       error_num = spider_ping_table_mon_from_table(
-          spider->trx,
-          spider->trx->thd,
+          spider->wide_handler->trx,
+          spider->wide_handler->trx->thd,
           share,
           link_idx,
           (uint32) share->monitoring_sid[link_idx],
@@ -1439,7 +1464,7 @@ int spider_db_append_name_with_quote_str_internal(
   {
     head_code = *name;
 #ifdef SPIDER_HAS_MY_CHARLEN
-    if ((length = my_charlen(cs, name, name_end)) < 1)
+    if ((length = my_ci_charlen(cs, (const uchar *) name, (const uchar *) name_end)) < 1)
 #else
     if (!(length = my_mbcharlen(cs, (uchar) head_code)))
 #endif
@@ -2789,7 +2814,7 @@ int spider_db_fetch_for_item_sum_func(
 ) {
   int error_num;
   SPIDER_SHARE *share = spider->share;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   DBUG_ENTER("spider_db_fetch_for_item_sum_func");
   DBUG_PRINT("info",("spider Sumfunctype = %d", item_sum->sum_func()));
   switch (item_sum->sum_func())
@@ -2826,7 +2851,8 @@ int spider_db_fetch_for_item_sum_func(
           if (!spider->direct_aggregate_item_first)
           {
             if (!spider_bulk_malloc(spider_current_trx, 240, MYF(MY_WME),
-              &spider->direct_aggregate_item_first, sizeof(SPIDER_ITEM_HLD),
+              &spider->direct_aggregate_item_first,
+              (uint) (sizeof(SPIDER_ITEM_HLD)),
               NullS)
             ) {
               DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -2845,7 +2871,7 @@ int spider_db_fetch_for_item_sum_func(
           {
             if (!spider_bulk_malloc(spider_current_trx, 241, MYF(MY_WME),
               &spider->direct_aggregate_item_current->next,
-              sizeof(SPIDER_ITEM_HLD), NullS)
+              (uint) (sizeof(SPIDER_ITEM_HLD)), NullS)
             ) {
               DBUG_RETURN(HA_ERR_OUT_OF_MEM);
             }
@@ -3516,7 +3542,8 @@ int spider_db_fetch_minimum_columns(
   ) {
     DBUG_PRINT("info", ("spider field_index %u", (*field)->field_index));
     DBUG_PRINT("info", ("spider searched_bitmap %u",
-      spider_bit_is_set(spider->searched_bitmap, (*field)->field_index)));
+      spider_bit_is_set(spider->wide_handler->searched_bitmap,
+      (*field)->field_index)));
     DBUG_PRINT("info", ("spider read_set %u",
       bitmap_is_set(table->read_set, (*field)->field_index)));
     DBUG_PRINT("info", ("spider write_set %u",
@@ -3698,7 +3725,7 @@ int spider_db_free_result(
   SPIDER_RESULT *result;
   SPIDER_RESULT *prev;
   SPIDER_SHARE *share = spider->share;
-  SPIDER_TRX *trx = spider->trx;
+  SPIDER_TRX *trx = spider->wide_handler->trx;
   SPIDER_POSITION *position;
   int roop_count, error_num;
   DBUG_ENTER("spider_db_free_result");
@@ -4008,7 +4035,8 @@ int spider_db_store_result(
 #endif
           result_list->quick_phase == 2
         ) {
-          if (result_list->low_mem_read)
+          if (result_list->low_mem_read &&
+            result_list->current->result->limit_mode() == 0)
           {
             do {
               spider_db_free_one_result(result_list,
@@ -4058,8 +4086,9 @@ int spider_db_store_result(
         spider_clear_bit(spider->db_request_phase, link_idx);
       }
       st_spider_db_request_key request_key;
-      request_key.spider_thread_id = spider->trx->spider_thread_id;
-      request_key.query_id = spider->trx->thd->query_id;
+      request_key.spider_thread_id =
+        spider->wide_handler->trx->spider_thread_id;
+      request_key.query_id = spider->wide_handler->trx->thd->query_id;
       request_key.handler = spider;
       request_key.request_id = spider->db_request_id[link_idx];
       request_key.next = NULL;
@@ -4161,12 +4190,13 @@ int spider_db_store_result(
           spider_clear_bit(spider->db_request_phase, link_idx);
         }
         st_spider_db_request_key request_key;
-        request_key.spider_thread_id = spider->trx->spider_thread_id;
-        request_key.query_id = spider->trx->thd->query_id;
+        request_key.spider_thread_id =
+          spider->wide_handler->trx->spider_thread_id;
+        request_key.query_id = spider->wide_handler->trx->thd->query_id;
         request_key.handler = spider;
         request_key.request_id = spider->db_request_id[link_idx];
         request_key.next = NULL;
-        if (!(current->result = conn->db_conn->use_result(&request_key,
+        if (!(current->result = conn->db_conn->use_result(spider, &request_key,
           &error_num)))
         {
           if (!error_num)
@@ -4245,8 +4275,8 @@ int spider_db_store_result(
       current->field_count = field_count;
       if (!(position = (SPIDER_POSITION *)
         spider_bulk_malloc(spider_current_trx, 7, MYF(MY_WME | MY_ZEROFILL),
-          &position, sizeof(SPIDER_POSITION) * page_size,
-          &tmp_row, sizeof(char*) * field_count,
+          &position, (uint) (sizeof(SPIDER_POSITION) * page_size),
+          &tmp_row, (uint) (sizeof(SPIDER_DB_ROW) * field_count),
           NullS))
       )
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -4372,15 +4402,20 @@ int spider_db_store_result(
         result_list->quick_mode == 3 ||
         result_list->limit_num == roop_count
       ) {
-        current->result->free_result();
-        if (!current->result_tmp_tbl)
-        {
-          delete current->result;
-          current->result = NULL;
+        if (
+          result_list->limit_num != roop_count ||
+          conn->db_conn->limit_mode() != 1
+        ) {
+          current->result->free_result();
+          if (!current->result_tmp_tbl)
+          {
+            delete current->result;
+            current->result = NULL;
+          }
+          DBUG_PRINT("info", ("spider conn[%p]->quick_target=NULL", conn));
+          conn->quick_target = NULL;
+          spider->quick_targets[link_idx] = NULL;
         }
-        DBUG_PRINT("info", ("spider conn[%p]->quick_target=NULL", conn));
-        conn->quick_target = NULL;
-        spider->quick_targets[link_idx] = NULL;
       }
 #ifndef WITHOUT_SPIDER_BG_SEARCH
       DBUG_PRINT("info", ("spider bgs_phase=%d", result_list->bgs_phase));
@@ -4429,8 +4464,8 @@ int spider_db_store_result(
       spider_clear_bit(spider->db_request_phase, link_idx);
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = spider->db_request_id[link_idx];
     request_key.next = NULL;
@@ -4464,6 +4499,382 @@ int spider_db_store_result(
   DBUG_RETURN(0);
 }
 
+int spider_db_store_result_for_reuse_cursor( /* select/append the next SPIDER_RESULT node and fill it from the previous node's result object */
+  ha_spider *spider, /* handler whose result_list, conns[], quick_targets[] and db_request_phase are used */
+  int link_idx, /* index into the handler's per-link arrays */
+  TABLE *table /* status is set on an empty fetch; also passed to spider_mk_sys_tmp_table_for_result() */
+) {
+  int error_num; /* carries fetch_row()/store_to_tmp_table() failures to the caller */
+  SPIDER_CONN *conn;
+  SPIDER_RESULT_LIST *result_list = &spider->result_list;
+  SPIDER_RESULT *current; /* node that this call fills */
+  DBUG_ENTER("spider_db_store_result_for_reuse_cursor");
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  DBUG_ASSERT(spider->conn_kind[link_idx] == SPIDER_CONN_KIND_MYSQL);
+#endif
+  conn = spider->conns[link_idx];
+  DBUG_PRINT("info",("spider conn->connection_id=%llu",
+    conn->connection_id));
+  DBUG_PRINT("info",("spider spider->connection_ids[%d]=%llu",
+    link_idx, spider->connection_ids[link_idx]));
+  if (conn->connection_id != spider->connection_ids[link_idx]) /* ids differ: report "remote server gone away" */
+  {
+    my_message(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM,
+      ER_SPIDER_REMOTE_SERVER_GONE_AWAY_STR, MYF(0));
+    DBUG_RETURN(ER_SPIDER_REMOTE_SERVER_GONE_AWAY_NUM);
+  }
+  if (!result_list->current) /* no current node yet: start from the head of the list */
+  {
+    if (!result_list->first) /* empty list: allocate a zero-filled head node */
+    {
+      if (!(result_list->first = (SPIDER_RESULT *)
+        spider_malloc(spider_current_trx, 4, sizeof(*result_list->first),
+          MYF(MY_WME | MY_ZEROFILL)))
+      ) {
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      }
+      TMP_TABLE_PARAM *tmp_tbl_prm = (TMP_TABLE_PARAM *)
+        &result_list->first->result_tmp_tbl_prm;
+      tmp_tbl_prm->init();
+      tmp_tbl_prm->field_count = 3; /* presumably the three columns ("a", "b", "c") requested for the tmp table below - confirm */
+      result_list->last = result_list->first;
+      result_list->current = result_list->first;
+    } else {
+      result_list->current = result_list->first;
+    }
+    result_list->bgs_current = result_list->current;
+    current = (SPIDER_RESULT*) result_list->current;
+  } else {
+    if (
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+      result_list->bgs_phase > 0 ||
+#endif
+      result_list->quick_phase > 0
+    ) { /* advance bgs_current, appending a node when it is already the last one */
+      if (result_list->bgs_current == result_list->last)
+      {
+        if (!(result_list->last = (SPIDER_RESULT *)
+          spider_malloc(spider_current_trx, 5, sizeof(*result_list->last),
+             MYF(MY_WME | MY_ZEROFILL)))
+        ) {
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        }
+        TMP_TABLE_PARAM *tmp_tbl_prm = (TMP_TABLE_PARAM *)
+          &result_list->last->result_tmp_tbl_prm;
+        tmp_tbl_prm->init();
+        tmp_tbl_prm->field_count = 3;
+        result_list->bgs_current->next = result_list->last; /* link the new node behind bgs_current */
+        result_list->last->prev = result_list->bgs_current;
+        result_list->bgs_current = result_list->last;
+      } else {
+        result_list->bgs_current = result_list->bgs_current->next;
+      }
+      if (
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+        result_list->bgs_phase == 1 ||
+#endif
+        result_list->quick_phase == 2
+      ) { /* in these phases current follows bgs_current and quick_phase is reset */
+        result_list->current = result_list->bgs_current;
+        result_list->quick_phase = 0;
+      }
+      current = (SPIDER_RESULT*) result_list->bgs_current;
+    } else { /* advance current, appending a node when it is already the last one */
+      if (result_list->current == result_list->last)
+      {
+        if (!(result_list->last = (SPIDER_RESULT *)
+          spider_malloc(spider_current_trx, 6, sizeof(*result_list->last),
+            MYF(MY_WME | MY_ZEROFILL)))
+        ) {
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        }
+        TMP_TABLE_PARAM *tmp_tbl_prm = (TMP_TABLE_PARAM *)
+          &result_list->last->result_tmp_tbl_prm;
+        tmp_tbl_prm->init();
+        tmp_tbl_prm->field_count = 3;
+        result_list->current->next = result_list->last; /* link the new node behind current */
+        result_list->last->prev = result_list->current;
+        result_list->current = result_list->last;
+      } else {
+        result_list->current = result_list->current->next;
+      }
+      result_list->bgs_current = result_list->current; /* bgs_current tracks current on this path */
+      current = (SPIDER_RESULT*) result_list->current;
+    }
+  }
+
+  if (result_list->quick_mode == 0) /* quick_mode 0: rows are not copied into positions here; only counters are updated */
+  {
+    if (spider_bit_is_set(spider->db_request_phase, link_idx))
+    {
+      spider_clear_bit(spider->db_request_phase, link_idx);
+    }
+    current->result = current->prev->result; /* NOTE(review): current->prev is not asserted non-NULL here, unlike the else branch below */
+    current->result->set_limit(result_list->limit_num);
+    current->record_num = current->result->num_rows();
+    current->dbton_id = current->result->dbton_id;
+    result_list->record_num += current->record_num;
+    DBUG_PRINT("info",("spider current->record_num=%lld",
+      current->record_num));
+    DBUG_PRINT("info",("spider result_list->record_num=%lld",
+      result_list->record_num));
+    DBUG_PRINT("info",("spider result_list->internal_limit=%lld",
+      result_list->internal_limit));
+    DBUG_PRINT("info",("spider result_list->split_read=%lld",
+      result_list->split_read));
+    if (
+      result_list->internal_limit <= result_list->record_num ||
+      result_list->split_read > current->record_num
+    ) { /* internal_limit reached, or fewer rows than split_read came back: mark finished */
+      DBUG_PRINT("info",("spider set finish_flg point 2"));
+      DBUG_PRINT("info",("spider current->finish_flg = TRUE"));
+      DBUG_PRINT("info",("spider result_list->finish_flg = TRUE"));
+      current->finish_flg = TRUE;
+      result_list->finish_flg = TRUE;
+    }
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+    if (result_list->bgs_phase <= 1)
+    {
+#endif
+      result_list->current_row_num = 0; /* reset unconditionally when built without background search */
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+    }
+#endif
+  } else { /* quick_mode != 0: rows are cloned into a position page (and possibly a tmp table) */
+    DBUG_ASSERT(current->prev);
+    DBUG_ASSERT(current->prev->result);
+    /* has_result() for case of result with result_tmp_tbl */
+    if (current->prev->result->has_result())
+    {
+      current->result = current->prev->result; /* take the result over from the previous node ... */
+      current->result->set_limit(result_list->limit_num);
+      current->prev->result = NULL; /* ... and detach it there */
+      result_list->limit_num -= current->prev->record_num; /* subtract the previous node's record_num from the limit */
+    } else {
+      if (spider_bit_is_set(spider->db_request_phase, link_idx))
+      {
+        spider_clear_bit(spider->db_request_phase, link_idx);
+      }
+      current->result = current->prev->result; /* NOTE(review): prev->result stays set, so both nodes alias one object that may be deleted below - confirm prev is not used afterwards */
+      current->result->set_limit(result_list->limit_num);
+      DBUG_PRINT("info", ("spider conn[%p]->quick_target=%p", conn, spider));
+      conn->quick_target = spider;
+      spider->quick_targets[link_idx] = spider;
+    }
+    current->dbton_id = current->result->dbton_id;
+    SPIDER_DB_ROW *row;
+    if (!(row = current->result->fetch_row())) /* no first row: mark finished and release the result */
+    {
+      error_num = current->result->get_errno(); /* read before the result is freed */
+      DBUG_PRINT("info",("spider set finish_flg point 3"));
+      DBUG_PRINT("info",("spider current->finish_flg = TRUE"));
+      DBUG_PRINT("info",("spider result_list->finish_flg = TRUE"));
+      current->finish_flg = TRUE;
+      result_list->finish_flg = TRUE;
+      current->result->free_result();
+      delete current->result;
+      current->result = NULL;
+      DBUG_PRINT("info", ("spider conn[%p]->quick_target=NULL", conn));
+      conn->quick_target = NULL;
+      spider->quick_targets[link_idx] = NULL;
+      if (
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+        result_list->bgs_phase <= 1 &&
+#endif
+        result_list->quick_phase == 0
+      ) {
+        result_list->current_row_num = 0;
+        table->status = STATUS_NOT_FOUND;
+      }
+      if (error_num && error_num != HA_ERR_END_OF_FILE) /* end-of-file is not reported as an error */
+        DBUG_RETURN(error_num);
+      /* This shouldn't return HA_ERR_END_OF_FILE */
+      DBUG_RETURN(0);
+    }
+    SPIDER_DB_ROW *tmp_row;
+    uint field_count = current->result->num_fields();
+    SPIDER_POSITION *position;
+    longlong page_size; /* number of position slots to allocate for this node */
+    int roop_count = 0; /* rows consumed so far (cloned or stored to the tmp table) */
+    if (!result_list->quick_page_size)
+    {
+      if (result_list->quick_mode == 3)
+      {
+        page_size = 0; /* with page_size 0 the clone loop below does not run */
+      } else {
+        result_list->quick_page_size = result_list->limit_num;
+        page_size = result_list->limit_num;
+      }
+    } else {
+      page_size = /* the smaller of limit_num and quick_page_size */
+        result_list->limit_num < result_list->quick_page_size ?
+        result_list->limit_num : result_list->quick_page_size;
+    }
+    current->field_count = field_count;
+    if (!(position = (SPIDER_POSITION *)
+      spider_bulk_malloc(spider_current_trx, 7, MYF(MY_WME | MY_ZEROFILL),
+        &position, (uint) (sizeof(SPIDER_POSITION) * page_size),
+        &tmp_row, (uint) (sizeof(SPIDER_DB_ROW) * field_count),
+        NullS))
+    )
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    current->pos_page_size = (int) page_size;
+    current->first_position = position;
+    current->tmp_tbl_row = tmp_row;
+    if (result_list->quick_mode == 3)
+    {
+      while (page_size > roop_count && row) /* byte budget is checked before the row is cloned */
+      {
+        if (result_list->quick_page_byte < row->get_byte_size())
+        {
+          current->pos_page_size = roop_count; /* budget exhausted: shrink the page to the rows cloned so far */
+          page_size = roop_count;
+          result_list->quick_page_size = roop_count;
+          result_list->quick_page_byte = 0;
+          break; /* row stays non-NULL and uncloned */
+        } else {
+          result_list->quick_page_byte -= row->get_byte_size();
+        }
+        if (!(position->row = row->clone()))
+        {
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* NOTE(review): nothing is released on this exit - presumably the caller cleans up; confirm */
+        }
+        position++;
+        roop_count++;
+        row = current->result->fetch_row();
+      }
+    } else {
+      do { /* here the row is cloned first and the byte budget is checked afterwards */
+        if (!(position->row = row->clone()))
+        {
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        }
+        position++;
+        roop_count++;
+        if (result_list->quick_page_byte < row->get_byte_size())
+        {
+          current->pos_page_size = roop_count;
+          page_size = roop_count;
+          result_list->quick_page_size = roop_count;
+          result_list->quick_page_byte = 0;
+          break;
+        } else {
+          result_list->quick_page_byte -= row->get_byte_size();
+        }
+      } while (
+        page_size > roop_count &&
+        (row = current->result->fetch_row())
+      );
+    }
+    if (
+      result_list->quick_mode == 3 &&
+      page_size == roop_count &&
+      result_list->limit_num > roop_count &&
+      row
+    ) { /* page is full, limit not reached and a row is pending: store the rest in a temporary table */
+      THD *thd = current_thd;
+      char buf[MAX_FIELD_WIDTH];
+      spider_string tmp_str(buf, MAX_FIELD_WIDTH, &my_charset_bin);
+      tmp_str.init_calc_mem(120);
+
+      DBUG_PRINT("info",("spider store result to temporary table"));
+      DBUG_ASSERT(!current->result_tmp_tbl);
+#ifdef SPIDER_use_LEX_CSTRING_for_Field_blob_constructor
+      LEX_CSTRING field_name1 = {STRING_WITH_LEN("a")};
+      LEX_CSTRING field_name2 = {STRING_WITH_LEN("b")};
+      LEX_CSTRING field_name3 = {STRING_WITH_LEN("c")};
+      if (!(current->result_tmp_tbl = spider_mk_sys_tmp_table_for_result(
+        thd, table, &current->result_tmp_tbl_prm, &field_name1, &field_name2,
+        &field_name3, &my_charset_bin)))
+#else
+      if (!(current->result_tmp_tbl = spider_mk_sys_tmp_table_for_result(
+        thd, table, &current->result_tmp_tbl_prm, "a", "b", "c",
+        &my_charset_bin)))
+#endif
+      {
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      }
+      current->result_tmp_tbl_thd = thd;
+      TABLE *tmp_tbl = current->result_tmp_tbl;
+      tmp_tbl->file->extra(HA_EXTRA_WRITE_CACHE);
+      tmp_tbl->file->ha_start_bulk_insert((ha_rows) 0);
+      do { /* starts with the pending row, then keeps fetching until limit_num rows or no more rows */
+        if ((error_num = row->store_to_tmp_table(tmp_tbl, &tmp_str)))
+        {
+          tmp_tbl->file->ha_end_bulk_insert(); /* close the bulk insert before reporting the error */
+          DBUG_RETURN(error_num);
+        }
+        roop_count++;
+      } while (
+        result_list->limit_num > roop_count &&
+        (row = current->result->fetch_row())
+      );
+      tmp_tbl->file->ha_end_bulk_insert();
+      page_size = result_list->limit_num; /* so the page_size > roop_count test below detects a short read */
+    }
+    current->record_num = roop_count;
+    result_list->record_num += roop_count;
+    if (
+      result_list->internal_limit <= result_list->record_num ||
+      page_size > roop_count ||
+      (
+        result_list->quick_mode == 3 &&
+        result_list->limit_num > roop_count
+      )
+    ) { /* limit reached or fewer rows than requested: mark finished and release the result */
+      DBUG_PRINT("info",("spider set finish_flg point 4"));
+      DBUG_PRINT("info",("spider current->finish_flg = TRUE"));
+      DBUG_PRINT("info",("spider result_list->finish_flg = TRUE"));
+      current->finish_flg = TRUE;
+      result_list->finish_flg = TRUE;
+      current->result->free_result();
+      if (!current->result_tmp_tbl) /* the result object is deleted only when no tmp table was created */
+      {
+        delete current->result;
+        current->result = NULL;
+      }
+      DBUG_PRINT("info", ("spider conn[%p]->quick_target=NULL", conn));
+      conn->quick_target = NULL;
+      spider->quick_targets[link_idx] = NULL;
+    } else if (
+      result_list->quick_mode == 3 ||
+      result_list->limit_num == roop_count
+    ) {
+      if (result_list->limit_num != roop_count) /* when exactly limit_num rows were read the result is left open (not freed) */
+      {
+        current->result->free_result();
+        if (!current->result_tmp_tbl)
+        {
+          delete current->result;
+          current->result = NULL;
+        }
+        DBUG_PRINT("info", ("spider conn[%p]->quick_target=NULL", conn));
+        conn->quick_target = NULL;
+        spider->quick_targets[link_idx] = NULL;
+      }
+    }
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+    DBUG_PRINT("info", ("spider bgs_phase=%d", result_list->bgs_phase));
+#endif
+    DBUG_PRINT("info", ("spider quick_phase=%d", result_list->quick_phase));
+    if (
+#ifndef WITHOUT_SPIDER_BG_SEARCH
+      result_list->bgs_phase <= 1 &&
+#endif
+      result_list->quick_phase == 0
+    ) {
+      result_list->current_row_num = 0; /* restart row numbering for the newly filled node */
+    }
+    DBUG_PRINT("info", ("spider result_list->current=%p", result_list->current));
+    DBUG_PRINT("info", ("spider current=%p", current));
+    DBUG_PRINT("info", ("spider first_position=%p", current->first_position));
+    DBUG_PRINT("info", ("spider current_row_num=%lld", result_list->current_row_num));
+    DBUG_PRINT("info", ("spider first_position[]=%p", &current->first_position[result_list->current_row_num]));
+    DBUG_PRINT("info", ("spider row=%p", current->first_position[result_list->current_row_num].row));
+  }
+  DBUG_RETURN(0); /* success; earlier exits return HA_ERR_OUT_OF_MEM, the gone-away error, or error_num */
+}
+
 void spider_db_discard_result(
   ha_spider *spider,
   int link_idx,
@@ -4477,12 +4888,12 @@ void spider_db_discard_result(
     spider_clear_bit(spider->db_request_phase, link_idx);
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = spider->db_request_id[link_idx];
   request_key.next = NULL;
-  if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+  if ((result = conn->db_conn->use_result(spider, &request_key, &error_num)))
   {
     result->free_result();
     delete result;
@@ -4503,8 +4914,8 @@ void spider_db_discard_multiple_result(
   {
     spider_clear_bit(spider->db_request_phase, link_idx);
   }
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = spider->db_request_id[link_idx];
   request_key.next = NULL;
@@ -4512,7 +4923,7 @@ void spider_db_discard_multiple_result(
   {
     if (!conn->db_conn->cmp_request_key_to_snd(&request_key))
       break;
-    if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+    if ((result = conn->db_conn->use_result(spider, &request_key, &error_num)))
     {
       result->free_result();
       delete result;
@@ -4800,73 +5211,90 @@ int spider_db_seek_next(
                 pthread_mutex_lock(&conn->mta_conn_mutex);
                 SPIDER_SET_FILE_POS(&conn->mta_conn_mutex_file_pos);
               }
-              conn->need_mon = &spider->need_mons[link_idx];
-              DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
-              DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
-              conn->mta_conn_mutex_lock_already = TRUE;
-              conn->mta_conn_mutex_unlock_later = TRUE;
-              if ((error_num = spider_db_set_names(spider, conn, link_idx)))
+              if (conn->db_conn->limit_mode() == 1)
               {
-                DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-                DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-                conn->mta_conn_mutex_lock_already = FALSE;
-                conn->mta_conn_mutex_unlock_later = FALSE;
+                conn->db_conn->set_limit(result_list->limit_num);
+                if (fields->is_first_link_ok_chain(link_idx_chain))
+                {
+                  if ((error_num = spider_db_store_result_for_reuse_cursor(
+                    spider, link_idx, table)))
+                  {
+                    SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+                    pthread_mutex_unlock(&conn->bg_conn_mutex);
+                    DBUG_RETURN(error_num);
+                  }
+                }
                 SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
-                pthread_mutex_unlock(&conn->mta_conn_mutex);
-                if (
-                  spider->need_mons[link_idx]
+                pthread_mutex_unlock(&conn->bg_conn_mutex);
+              } else {
+                conn->need_mon = &spider->need_mons[link_idx];
+                DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
+                DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
+                conn->mta_conn_mutex_lock_already = TRUE;
+                conn->mta_conn_mutex_unlock_later = TRUE;
+                if ((error_num = spider_db_set_names(spider, conn, link_idx)))
+                {
+                  DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+                  DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+                  conn->mta_conn_mutex_lock_already = FALSE;
+                  conn->mta_conn_mutex_unlock_later = FALSE;
+                  SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+                  pthread_mutex_unlock(&conn->mta_conn_mutex);
+                  if (
+                    spider->need_mons[link_idx]
+                  ) {
+                    error_num = fields->ping_table_mon_from_table(link_idx_chain);
+                  }
+                  DBUG_PRINT("info",("spider error_num 7a=%d", error_num));
+                  DBUG_RETURN(error_num);
+                }
+                spider_conn_set_timeout_from_share(conn, link_idx,
+                  spider->wide_handler->trx->thd, share);
+                if (dbton_handler->execute_sql(
+                  sql_type,
+                  conn,
+                  result_list->quick_mode,
+                  &spider->need_mons[link_idx])
                 ) {
-                  error_num = fields->ping_table_mon_from_table(link_idx_chain);
+                  DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+                  DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+                  conn->mta_conn_mutex_lock_already = FALSE;
+                  conn->mta_conn_mutex_unlock_later = FALSE;
+                  error_num = spider_db_errorno(conn);
+                  if (
+                    spider->need_mons[link_idx]
+                  ) {
+                    error_num = fields->ping_table_mon_from_table(link_idx_chain);
+                  }
+                  DBUG_PRINT("info",("spider error_num 8a=%d", error_num));
+                  DBUG_RETURN(error_num);
                 }
-                DBUG_PRINT("info",("spider error_num 7a=%d", error_num));
-                DBUG_RETURN(error_num);
-              }
-              spider_conn_set_timeout_from_share(conn, link_idx,
-                spider->trx->thd, share);
-              if (dbton_handler->execute_sql(
-                sql_type,
-                conn,
-                result_list->quick_mode,
-                &spider->need_mons[link_idx])
-              ) {
+                spider->connection_ids[link_idx] = conn->connection_id;
                 DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
                 DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
                 conn->mta_conn_mutex_lock_already = FALSE;
                 conn->mta_conn_mutex_unlock_later = FALSE;
-                error_num = spider_db_errorno(conn);
-                if (
-                  spider->need_mons[link_idx]
-                ) {
-                  error_num = fields->ping_table_mon_from_table(link_idx_chain);
-                }
-                DBUG_PRINT("info",("spider error_num 8a=%d", error_num));
-                DBUG_RETURN(error_num);
-              }
-              spider->connection_ids[link_idx] = conn->connection_id;
-              DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-              DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-              conn->mta_conn_mutex_lock_already = FALSE;
-              conn->mta_conn_mutex_unlock_later = FALSE;
-              if (fields->is_first_link_ok_chain(link_idx_chain))
-              {
-                if ((error_num = spider_db_store_result(spider, link_idx,
-                  table)))
+                if (fields->is_first_link_ok_chain(link_idx_chain))
                 {
-                  if (
-                    error_num != HA_ERR_END_OF_FILE &&
-                    spider->need_mons[link_idx]
-                  ) {
-                    error_num =
-                      fields->ping_table_mon_from_table(link_idx_chain);
+                  if ((error_num = spider_db_store_result(spider, link_idx,
+                    table)))
+                  {
+                    if (
+                      error_num != HA_ERR_END_OF_FILE &&
+                      spider->need_mons[link_idx]
+                    ) {
+                      error_num =
+                        fields->ping_table_mon_from_table(link_idx_chain);
+                    }
+                    DBUG_PRINT("info",("spider error_num 9a=%d", error_num));
+                    DBUG_RETURN(error_num);
                   }
-                  DBUG_PRINT("info",("spider error_num 9a=%d", error_num));
-                  DBUG_RETURN(error_num);
+                  spider->result_link_idx = link_ok;
+                } else {
+                  spider_db_discard_result(spider, link_idx, conn);
+                  SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+                  pthread_mutex_unlock(&conn->mta_conn_mutex);
                 }
-                spider->result_link_idx = link_ok;
-              } else {
-                spider_db_discard_result(spider, link_idx, conn);
-                SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
-                pthread_mutex_unlock(&conn->mta_conn_mutex);
               }
             }
           } else {
@@ -4908,98 +5336,79 @@ int spider_db_seek_next(
                 pthread_mutex_lock(&conn->mta_conn_mutex);
                 SPIDER_SET_FILE_POS(&conn->mta_conn_mutex_file_pos);
               }
-              conn->need_mon = &spider->need_mons[roop_count];
-              DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
-              DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
-              conn->mta_conn_mutex_lock_already = TRUE;
-              conn->mta_conn_mutex_unlock_later = TRUE;
-              if ((error_num = spider_db_set_names(spider, conn, roop_count)))
+              if (conn->db_conn->limit_mode() == 1)
               {
-                DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-                DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-                conn->mta_conn_mutex_lock_already = FALSE;
-                conn->mta_conn_mutex_unlock_later = FALSE;
+                conn->db_conn->set_limit(result_list->limit_num);
+                if (roop_count == link_ok)
+                {
+                  if ((error_num = spider_db_store_result_for_reuse_cursor(
+                    spider, link_idx, table)))
+                  {
+                    SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+                    pthread_mutex_unlock(&conn->bg_conn_mutex);
+                    DBUG_RETURN(error_num);
+                  }
+                }
                 SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
-                pthread_mutex_unlock(&conn->mta_conn_mutex);
-                if (
-                  share->monitoring_kind[roop_count] &&
-                  spider->need_mons[roop_count]
-                ) {
-                  error_num = spider_ping_table_mon_from_table(
-                      spider->trx,
-                      spider->trx->thd,
-                      share,
-                      roop_count,
-                      (uint32) share->monitoring_sid[roop_count],
-                      share->table_name,
-                      share->table_name_length,
-                      spider->conn_link_idx[roop_count],
-                      NULL,
-                      0,
-                      share->monitoring_kind[roop_count],
-                      share->monitoring_limit[roop_count],
-                      share->monitoring_flag[roop_count],
-                      TRUE
-                    );
+                pthread_mutex_unlock(&conn->bg_conn_mutex);
+              } else {
+                conn->need_mon = &spider->need_mons[roop_count];
+                DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
+                DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
+                conn->mta_conn_mutex_lock_already = TRUE;
+                conn->mta_conn_mutex_unlock_later = TRUE;
+                if ((error_num = spider_db_set_names(spider, conn, roop_count)))
+                {
+                  DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+                  DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+                  conn->mta_conn_mutex_lock_already = FALSE;
+                  conn->mta_conn_mutex_unlock_later = FALSE;
+                  SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+                  pthread_mutex_unlock(&conn->mta_conn_mutex);
+                  if (
+                    share->monitoring_kind[roop_count] &&
+                    spider->need_mons[roop_count]
+                  ) {
+                    error_num = spider_ping_table_mon_from_table(
+                        spider->wide_handler->trx,
+                        spider->wide_handler->trx->thd,
+                        share,
+                        roop_count,
+                        (uint32) share->monitoring_sid[roop_count],
+                        share->table_name,
+                        share->table_name_length,
+                        spider->conn_link_idx[roop_count],
+                        NULL,
+                        0,
+                        share->monitoring_kind[roop_count],
+                        share->monitoring_limit[roop_count],
+                        share->monitoring_flag[roop_count],
+                        TRUE
+                      );
+                  }
+                  DBUG_PRINT("info",("spider error_num 7=%d", error_num));
+                  DBUG_RETURN(error_num);
                 }
-                DBUG_PRINT("info",("spider error_num 7=%d", error_num));
-                DBUG_RETURN(error_num);
-              }
-              spider_conn_set_timeout_from_share(conn, roop_count,
-                spider->trx->thd, share);
-              if (dbton_handler->execute_sql(
-                sql_type,
-                conn,
-                result_list->quick_mode,
-                &spider->need_mons[roop_count])
-              ) {
-                DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-                DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-                conn->mta_conn_mutex_lock_already = FALSE;
-                conn->mta_conn_mutex_unlock_later = FALSE;
-                error_num = spider_db_errorno(conn);
-                if (
-                  share->monitoring_kind[roop_count] &&
-                  spider->need_mons[roop_count]
+                spider_conn_set_timeout_from_share(conn, roop_count,
+                  spider->wide_handler->trx->thd, share);
+                if (dbton_handler->execute_sql(
+                  sql_type,
+                  conn,
+                  result_list->quick_mode,
+                  &spider->need_mons[roop_count])
                 ) {
-                  error_num = spider_ping_table_mon_from_table(
-                      spider->trx,
-                      spider->trx->thd,
-                      share,
-                      roop_count,
-                      (uint32) share->monitoring_sid[roop_count],
-                      share->table_name,
-                      share->table_name_length,
-                      spider->conn_link_idx[roop_count],
-                      NULL,
-                      0,
-                      share->monitoring_kind[roop_count],
-                      share->monitoring_limit[roop_count],
-                      share->monitoring_flag[roop_count],
-                      TRUE
-                    );
-                }
-                DBUG_PRINT("info",("spider error_num 8=%d", error_num));
-                DBUG_RETURN(error_num);
-              }
-              spider->connection_ids[roop_count] = conn->connection_id;
-              DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-              DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-              conn->mta_conn_mutex_lock_already = FALSE;
-              conn->mta_conn_mutex_unlock_later = FALSE;
-              if (roop_count == link_ok)
-              {
-                if ((error_num = spider_db_store_result(spider, roop_count,
-                  table)))
-                {
+                  DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+                  DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+                  conn->mta_conn_mutex_lock_already = FALSE;
+                  conn->mta_conn_mutex_unlock_later = FALSE;
+                  error_num = spider_db_errorno(conn);
                   if (
-                    error_num != HA_ERR_END_OF_FILE &&
                     share->monitoring_kind[roop_count] &&
                     spider->need_mons[roop_count]
                   ) {
                     error_num = spider_ping_table_mon_from_table(
-                        spider->trx,
-                        spider->trx->thd,
+                        spider->wide_handler->trx,
+                        spider->wide_handler->trx->thd,
                         share,
                         roop_count,
                         (uint32) share->monitoring_sid[roop_count],
@@ -5014,14 +5423,50 @@ int spider_db_seek_next(
                         TRUE
                       );
                   }
-                  DBUG_PRINT("info",("spider error_num 9=%d", error_num));
+                  DBUG_PRINT("info",("spider error_num 8=%d", error_num));
                   DBUG_RETURN(error_num);
                 }
-                spider->result_link_idx = link_ok;
-              } else {
-                spider_db_discard_result(spider, roop_count, conn);
-                SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
-                pthread_mutex_unlock(&conn->mta_conn_mutex);
+                spider->connection_ids[roop_count] = conn->connection_id;
+                DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+                DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+                conn->mta_conn_mutex_lock_already = FALSE;
+                conn->mta_conn_mutex_unlock_later = FALSE;
+                if (roop_count == link_ok)
+                {
+                  if ((error_num = spider_db_store_result(spider, roop_count,
+                    table)))
+                  {
+                    if (
+                      error_num != HA_ERR_END_OF_FILE &&
+                      share->monitoring_kind[roop_count] &&
+                      spider->need_mons[roop_count]
+                    ) {
+                      error_num = spider_ping_table_mon_from_table(
+                          spider->wide_handler->trx,
+                          spider->wide_handler->trx->thd,
+                          share,
+                          roop_count,
+                          (uint32) share->monitoring_sid[roop_count],
+                          share->table_name,
+                          share->table_name_length,
+                          spider->conn_link_idx[roop_count],
+                          NULL,
+                          0,
+                          share->monitoring_kind[roop_count],
+                          share->monitoring_limit[roop_count],
+                          share->monitoring_flag[roop_count],
+                          TRUE
+                        );
+                    }
+                    DBUG_PRINT("info",("spider error_num 9=%d", error_num));
+                    DBUG_RETURN(error_num);
+                  }
+                  spider->result_link_idx = link_ok;
+                } else {
+                  spider_db_discard_result(spider, roop_count, conn);
+                  SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+                  pthread_mutex_unlock(&conn->mta_conn_mutex);
+                }
               }
             }
 #ifdef SPIDER_HAS_GROUP_BY_HANDLER
@@ -5178,95 +5623,79 @@ int spider_db_seek_last(
         SPIDER_SET_FILE_POS(&conn->mta_conn_mutex_file_pos);
       }
       DBUG_PRINT("info",("spider sql_type=%lu", sql_type));
-      conn->need_mon = &spider->need_mons[roop_count];
-      DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
-      DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
-      conn->mta_conn_mutex_lock_already = TRUE;
-      conn->mta_conn_mutex_unlock_later = TRUE;
-      if ((error_num = spider_db_set_names(spider, conn, roop_count)))
+      if (conn->db_conn->limit_mode() == 1)
       {
-        DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-        DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-        conn->mta_conn_mutex_lock_already = FALSE;
-        conn->mta_conn_mutex_unlock_later = FALSE;
+        conn->db_conn->set_limit(result_list->limit_num);
+        if (roop_count == link_ok)
+        {
+          if ((error_num = spider_db_store_result_for_reuse_cursor(
+            spider, roop_count, table)))
+          {
+            SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+            pthread_mutex_unlock(&conn->bg_conn_mutex);
+            DBUG_RETURN(error_num);
+          }
+        }
         SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
-        pthread_mutex_unlock(&conn->mta_conn_mutex);
-        if (
-          share->monitoring_kind[roop_count] &&
-          spider->need_mons[roop_count]
-        ) {
-          error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
-              share,
-              roop_count,
-              (uint32) share->monitoring_sid[roop_count],
-              share->table_name,
-              share->table_name_length,
-              spider->conn_link_idx[roop_count],
-              NULL,
-              0,
-              share->monitoring_kind[roop_count],
-              share->monitoring_limit[roop_count],
-              share->monitoring_flag[roop_count],
-              TRUE
-            );
+        pthread_mutex_unlock(&conn->bg_conn_mutex);
+      } else {
+        conn->need_mon = &spider->need_mons[roop_count];
+        DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
+        DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
+        conn->mta_conn_mutex_lock_already = TRUE;
+        conn->mta_conn_mutex_unlock_later = TRUE;
+        if ((error_num = spider_db_set_names(spider, conn, roop_count)))
+        {
+          DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+          DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+          conn->mta_conn_mutex_lock_already = FALSE;
+          conn->mta_conn_mutex_unlock_later = FALSE;
+          SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+          pthread_mutex_unlock(&conn->mta_conn_mutex);
+          if (
+            share->monitoring_kind[roop_count] &&
+            spider->need_mons[roop_count]
+          ) {
+            error_num = spider_ping_table_mon_from_table(
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
+                share,
+                roop_count,
+                (uint32) share->monitoring_sid[roop_count],
+                share->table_name,
+                share->table_name_length,
+                spider->conn_link_idx[roop_count],
+                NULL,
+                0,
+                share->monitoring_kind[roop_count],
+                share->monitoring_limit[roop_count],
+                share->monitoring_flag[roop_count],
+                TRUE
+              );
+          }
+          DBUG_RETURN(error_num);
         }
-        DBUG_RETURN(error_num);
-      }
-      spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
-        share);
-      if (dbton_handler->execute_sql(
-        sql_type,
-        conn,
-        result_list->quick_mode,
-        &spider->need_mons[roop_count])
-      ) {
-        DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-        DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-        conn->mta_conn_mutex_lock_already = FALSE;
-        conn->mta_conn_mutex_unlock_later = FALSE;
-        error_num = spider_db_errorno(conn);
-        if (
-          share->monitoring_kind[roop_count] &&
-          spider->need_mons[roop_count]
+        spider_conn_set_timeout_from_share(conn, roop_count,
+          spider->wide_handler->trx->thd,
+          share);
+        if (dbton_handler->execute_sql(
+          sql_type,
+          conn,
+          result_list->quick_mode,
+          &spider->need_mons[roop_count])
         ) {
-          error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
-              share,
-              roop_count,
-              (uint32) share->monitoring_sid[roop_count],
-              share->table_name,
-              share->table_name_length,
-              spider->conn_link_idx[roop_count],
-              NULL,
-              0,
-              share->monitoring_kind[roop_count],
-              share->monitoring_limit[roop_count],
-              share->monitoring_flag[roop_count],
-              TRUE
-            );
-        }
-        DBUG_RETURN(error_num);
-      }
-      spider->connection_ids[roop_count] = conn->connection_id;
-      DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
-      DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
-      conn->mta_conn_mutex_lock_already = FALSE;
-      conn->mta_conn_mutex_unlock_later = FALSE;
-      if (roop_count == link_ok)
-      {
-        if ((error_num = spider_db_store_result(spider, roop_count, table)))
-        {
+          DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+          DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+          conn->mta_conn_mutex_lock_already = FALSE;
+          conn->mta_conn_mutex_unlock_later = FALSE;
+          error_num = spider_db_errorno(conn);
           if (
-            error_num != HA_ERR_END_OF_FILE &&
             share->monitoring_kind[roop_count] &&
             spider->need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -5283,11 +5712,45 @@ int spider_db_seek_last(
           }
           DBUG_RETURN(error_num);
         }
-        spider->result_link_idx = link_ok;
-      } else {
-        spider_db_discard_result(spider, roop_count, conn);
-        SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
-        pthread_mutex_unlock(&conn->mta_conn_mutex);
+        spider->connection_ids[roop_count] = conn->connection_id;
+        DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+        DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+        conn->mta_conn_mutex_lock_already = FALSE;
+        conn->mta_conn_mutex_unlock_later = FALSE;
+        if (roop_count == link_ok)
+        {
+          if ((error_num = spider_db_store_result(spider, roop_count, table)))
+          {
+            if (
+              error_num != HA_ERR_END_OF_FILE &&
+              share->monitoring_kind[roop_count] &&
+              spider->need_mons[roop_count]
+            ) {
+              error_num = spider_ping_table_mon_from_table(
+                  spider->wide_handler->trx,
+                  spider->wide_handler->trx->thd,
+                  share,
+                  roop_count,
+                  (uint32) share->monitoring_sid[roop_count],
+                  share->table_name,
+                  share->table_name_length,
+                  spider->conn_link_idx[roop_count],
+                  NULL,
+                  0,
+                  share->monitoring_kind[roop_count],
+                  share->monitoring_limit[roop_count],
+                  share->monitoring_flag[roop_count],
+                  TRUE
+                );
+            }
+            DBUG_RETURN(error_num);
+          }
+          spider->result_link_idx = link_ok;
+        } else {
+          spider_db_discard_result(spider, roop_count, conn);
+          SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+          pthread_mutex_unlock(&conn->mta_conn_mutex);
+        }
       }
     }
     result_list->current_row_num = result_list->current->record_num - 1;
@@ -5413,8 +5876,8 @@ int spider_db_seek_last(
         spider->need_mons[roop_count]
       ) {
         error_num = spider_ping_table_mon_from_table(
-            spider->trx,
-            spider->trx->thd,
+            spider->wide_handler->trx,
+            spider->wide_handler->trx->thd,
             share,
             roop_count,
             (uint32) share->monitoring_sid[roop_count],
@@ -5431,7 +5894,8 @@ int spider_db_seek_last(
       }
       DBUG_RETURN(error_num);
     }
-    spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, roop_count,
+      spider->wide_handler->trx->thd,
       share);
     if (dbton_handler->execute_sql(
       sql_type,
@@ -5449,8 +5913,8 @@ int spider_db_seek_last(
         spider->need_mons[roop_count]
       ) {
         error_num = spider_ping_table_mon_from_table(
-            spider->trx,
-            spider->trx->thd,
+            spider->wide_handler->trx,
+            spider->wide_handler->trx->thd,
             share,
             roop_count,
             (uint32) share->monitoring_sid[roop_count],
@@ -5482,8 +5946,8 @@ int spider_db_seek_last(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -5582,7 +6046,7 @@ void spider_db_create_position(
   pos->direct_aggregate = result_list->direct_aggregate;
 #endif
   pos->sql_kind = spider->sql_kind[spider->result_link_idx];
-  pos->position_bitmap = spider->position_bitmap;
+  pos->position_bitmap = spider->wide_handler->position_bitmap;
   pos->ft_first = spider->ft_first;
   pos->ft_current = spider->ft_current;
   pos->result = current;
@@ -5954,7 +6418,7 @@ int spider_db_simple_action(
   bool pre_call
 ) {
   int error_num;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   SPIDER_CONN *conn;
   DBUG_ENTER("spider_db_simple_action");
   if (pre_call)
@@ -6026,14 +6490,19 @@ void spider_db_set_cardinarity(
     {
       key_part = &key_info->key_part[roop_count2];
       field = key_part->field;
-      rec_per_key = (ha_rows) share->stat.records /
-        share->cardinality[field->field_index];
-      if (rec_per_key > ~(ulong) 0)
-        key_info->rec_per_key[roop_count2] = ~(ulong) 0;
-      else if (rec_per_key == 0)
+      if (share->cardinality[field->field_index])
+      {
+        rec_per_key = (ha_rows) share->stat.records /
+          share->cardinality[field->field_index];
+        if (rec_per_key > ~(ulong) 0)
+          key_info->rec_per_key[roop_count2] = ~(ulong) 0;
+        else if (rec_per_key == 0)
+          key_info->rec_per_key[roop_count2] = 1;
+        else
+          key_info->rec_per_key[roop_count2] = (ulong) rec_per_key;
+      } else {
         key_info->rec_per_key[roop_count2] = 1;
-      else
-        key_info->rec_per_key[roop_count2] = (ulong) rec_per_key;
+      }
       DBUG_PRINT("info",
         ("spider column id=%d", field->field_index));
       DBUG_PRINT("info",
@@ -6066,8 +6535,8 @@ int spider_db_show_index(
 }
 
 ha_rows spider_db_explain_select(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ha_spider *spider,
   int link_idx
 ) {
@@ -6104,7 +6573,8 @@ int spider_db_bulk_insert_init(
       SPIDER_LINK_STATUS_RECOVERY)
   ) {
     if (spider->conns[roop_count])
-      spider->conns[roop_count]->ignore_dup_key = spider->ignore_dup_key;
+      spider->conns[roop_count]->ignore_dup_key =
+        spider->wide_handler->ignore_dup_key;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     if (
       spider_conn_use_handler(spider, spider->lock_mode, roop_count) &&
@@ -6124,8 +6594,8 @@ int spider_db_bulk_insert_init(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -6145,7 +6615,8 @@ int spider_db_bulk_insert_init(
       spider->set_handler_opened(roop_count);
     }
 #else
-    spider_conn_use_handler(spider, spider->lock_mode, roop_count);
+    spider_conn_use_handler(spider, spider->wide_handler->lock_mode,
+      roop_count);
 #endif
   }
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
@@ -6179,7 +6650,7 @@ int spider_db_bulk_insert(
   SPIDER_RESULT_LIST *result_list = &spider->result_list;
 #endif
   SPIDER_SHARE *share = spider->share;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   DBUG_ENTER("spider_db_bulk_insert");
 
   if (!bulk_end)
@@ -6190,7 +6661,11 @@ int spider_db_bulk_insert(
 #endif
       if ((error_num = spider->append_insert_values_sql_part(
         SPIDER_SQL_TYPE_INSERT_SQL)))
+      {
+        if (spider->sql_kinds & SPIDER_SQL_KIND_SQL)
+          spider->set_insert_to_pos_sql(SPIDER_SQL_TYPE_INSERT_SQL);
         DBUG_RETURN(error_num);
+      }
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     }
     if (spider->sql_kinds & SPIDER_SQL_KIND_HS)
@@ -6210,6 +6685,8 @@ int spider_db_bulk_insert(
     if ((error_num = spider->append_insert_terminator_sql_part(
       SPIDER_SQL_TYPE_INSERT_SQL)))
     {
+      if (spider->sql_kinds & SPIDER_SQL_KIND_SQL)
+        spider->set_insert_to_pos_sql(SPIDER_SQL_TYPE_INSERT_SQL);
       DBUG_RETURN(error_num);
     }
 #ifdef HA_CAN_BULK_ACCESS
@@ -6244,6 +6721,8 @@ int spider_db_bulk_insert(
           if ((error_num = dbton_handler->set_sql_for_exec(sql_type,
             roop_count2)))
           {
+            if (spider->sql_kinds & SPIDER_SQL_KIND_SQL)
+              spider->set_insert_to_pos_sql(SPIDER_SQL_TYPE_INSERT_SQL);
             if (dbton_handler->need_lock_before_set_sql_for_exec(sql_type))
             {
               SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
@@ -6273,6 +6752,8 @@ int spider_db_bulk_insert(
         conn->mta_conn_mutex_unlock_later = TRUE;
         if ((error_num = spider_db_set_names(spider, conn, roop_count2)))
         {
+          if (spider->sql_kinds & SPIDER_SQL_KIND_SQL)
+            spider->set_insert_to_pos_sql(SPIDER_SQL_TYPE_INSERT_SQL);
           DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
           DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
           conn->mta_conn_mutex_lock_already = FALSE;
@@ -6284,8 +6765,8 @@ int spider_db_bulk_insert(
             spider->need_mons[roop_count2]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count2,
                 (uint32) share->monitoring_sid[roop_count2],
@@ -6302,7 +6783,8 @@ int spider_db_bulk_insert(
           }
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, roop_count2, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, roop_count2,
+          spider->wide_handler->trx->thd,
           share);
         if (dbton_handler->execute_sql(
           sql_type,
@@ -6331,8 +6813,8 @@ int spider_db_bulk_insert(
             spider->need_mons[roop_count2]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count2,
                 (uint32) share->monitoring_sid[roop_count2],
@@ -6374,12 +6856,14 @@ int spider_db_bulk_insert(
               spider_clear_bit(spider->db_request_phase, roop_count2);
             }
             st_spider_db_request_key request_key;
-            request_key.spider_thread_id = spider->trx->spider_thread_id;
-            request_key.query_id = spider->trx->thd->query_id;
+            request_key.spider_thread_id =
+              spider->wide_handler->trx->spider_thread_id;
+            request_key.query_id = spider->wide_handler->trx->thd->query_id;
             request_key.handler = spider;
             request_key.request_id = spider->db_request_id[roop_count2];
             request_key.next = NULL;
-            if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+            if ((result = conn->db_conn->use_result(spider, &request_key,
+              &error_num)))
             {
               result->free_result();
               delete result;
@@ -6482,7 +6966,7 @@ int spider_db_bulk_bulk_insert(
   SPIDER_SHARE *share = spider->share;
   SPIDER_CONN *conn, *first_insert_conn = NULL;
   TABLE *table = spider->get_table();
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   DBUG_ENTER("spider_db_bulk_bulk_insert");
   for (
     roop_count2 = spider_conn_link_idx_next(share->link_statuses,
@@ -6536,12 +7020,14 @@ int spider_db_bulk_bulk_insert(
           spider_clear_bit(spider->db_request_phase, roop_count2);
         }
         st_spider_db_request_key request_key;
-        request_key.spider_thread_id = spider->trx->spider_thread_id;
-        request_key.query_id = spider->trx->thd->query_id;
+        request_key.spider_thread_id =
+          spider->wide_handler->trx->spider_thread_id;
+        request_key.query_id = spider->wide_handler->trx->thd->query_id;
         request_key.handler = spider;
         request_key.request_id = spider->db_request_id[roop_count2];
         request_key.next = NULL;
-        if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+        if ((result = conn->db_conn->use_result(spider, &request_key,
+          &error_num)))
         {
           result->free_result();
           delete result;
@@ -6613,7 +7099,7 @@ int spider_db_update_auto_increment(
   int link_idx
 ) {
   int roop_count;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   ulonglong last_insert_id, affected_rows;
   SPIDER_SHARE *share = spider->share;
   TABLE *table = spider->get_table();
@@ -6786,7 +7272,7 @@ int spider_db_bulk_update_end(
   ha_rows *dup_key_found
 ) {
   int error_num = 0, error_num2, roop_count;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   SPIDER_SHARE *share = spider->share;
   SPIDER_CONN *conn;
   bool is_error = thd->is_error();
@@ -6977,7 +7463,7 @@ int spider_db_update(
     conn = spider->conns[roop_count];
     spider_db_handler *dbton_hdl = spider->dbton_handler[conn->dbton_id];
 #if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100000
-    conn->ignore_dup_key = spider->ignore_dup_key;
+    conn->ignore_dup_key = spider->wide_handler->ignore_dup_key;
 #endif
     pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
     if (dbton_hdl->need_lock_before_set_sql_for_exec(
@@ -7021,8 +7507,8 @@ int spider_db_update(
         spider->need_mons[roop_count]
       ) {
         error_num = spider_ping_table_mon_from_table(
-            spider->trx,
-            spider->trx->thd,
+            spider->wide_handler->trx,
+            spider->wide_handler->trx->thd,
             share,
             roop_count,
             (uint32) share->monitoring_sid[roop_count],
@@ -7039,7 +7525,8 @@ int spider_db_update(
       }
       DBUG_RETURN(error_num);
     }
-    spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, roop_count,
+      spider->wide_handler->trx->thd,
       share);
     if (dbton_hdl->execute_sql(
       SPIDER_SQL_TYPE_UPDATE_SQL,
@@ -7060,8 +7547,8 @@ int spider_db_update(
         spider->need_mons[roop_count]
       ) {
         error_num = spider_ping_table_mon_from_table(
-            spider->trx,
-            spider->trx->thd,
+            spider->wide_handler->trx,
+            spider->wide_handler->trx->thd,
             share,
             roop_count,
             (uint32) share->monitoring_sid[roop_count],
@@ -7096,7 +7583,8 @@ int spider_db_update(
         pthread_mutex_unlock(&conn->mta_conn_mutex);
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, roop_count,
+        spider->wide_handler->trx->thd,
         share);
       if (dbton_hdl->execute_sql(
         SPIDER_SQL_TYPE_INSERT_SQL,
@@ -7117,8 +7605,8 @@ int spider_db_update(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -7204,7 +7692,7 @@ int spider_db_direct_update(
   SQL access -> SQL remote access with dirct_update
     spider->do_direct_update &&
     spider->direct_update_kinds == SPIDER_SQL_KIND_SQL &&
-    spider->direct_update_fields
+    spider->wide_handler->direct_update_fields
 
   Handlersocket access -> SQL remote access with dirct_update
     spider->do_direct_update &&
@@ -7352,7 +7840,7 @@ int spider_db_direct_update(
     if (spider->is_bulk_access_clone)
     {
       spider->connection_ids[roop_count] = conn->connection_id;
-      spider_trx_add_bulk_access_conn(spider->trx, conn);
+      spider_trx_add_bulk_access_conn(spider->wide_handler->trx, conn);
     } else {
 #endif
       conn->need_mon = &spider->need_mons[roop_count];
@@ -7373,8 +7861,8 @@ int spider_db_direct_update(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -7391,7 +7879,8 @@ int spider_db_direct_update(
         }
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, roop_count,
+        spider->wide_handler->trx->thd,
         share);
       if (
         (error_num = dbton_hdl->execute_sql(
@@ -7415,8 +7904,8 @@ int spider_db_direct_update(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -7453,12 +7942,14 @@ int spider_db_direct_update(
           spider_clear_bit(spider->db_request_phase, roop_count);
         }
         st_spider_db_request_key request_key;
-        request_key.spider_thread_id = spider->trx->spider_thread_id;
-        request_key.query_id = spider->trx->thd->query_id;
+        request_key.spider_thread_id =
+          spider->wide_handler->trx->spider_thread_id;
+        request_key.query_id = spider->wide_handler->trx->thd->query_id;
         request_key.handler = spider;
         request_key.request_id = spider->db_request_id[roop_count];
         request_key.next = NULL;
-        if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+        if ((result = conn->db_conn->use_result(spider, &request_key,
+          &error_num)))
         {
           if (!counted)
           {
@@ -7537,7 +8028,7 @@ int spider_db_direct_update(
   SQL access -> SQL remote access with dirct_update
     spider->do_direct_update &&
     spider->direct_update_kinds == SPIDER_SQL_KIND_SQL &&
-    spider->direct_update_fields
+    spider->wide_handler->direct_update_fields
 */
 
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
@@ -7628,7 +8119,7 @@ int spider_db_direct_update(
     if (spider->is_bulk_access_clone)
     {
       spider->connection_ids[roop_count] = conn->connection_id;
-      spider_trx_add_bulk_access_conn(spider->trx, conn);
+      spider_trx_add_bulk_access_conn(spider->wide_handler->trx, conn);
     } else {
 #endif
       conn->need_mon = &spider->need_mons[roop_count];
@@ -7649,8 +8140,8 @@ int spider_db_direct_update(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -7667,7 +8158,8 @@ int spider_db_direct_update(
         }
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, roop_count,
+        spider->wide_handler->trx->thd,
         share);
       if (
         (error_num = dbton_hdl->execute_sql(
@@ -7676,7 +8168,8 @@ int spider_db_direct_update(
           -1,
           &spider->need_mons[roop_count])
         ) &&
-        (error_num != HA_ERR_FOUND_DUPP_KEY || !spider->ignore_dup_key)
+        (error_num != HA_ERR_FOUND_DUPP_KEY ||
+          !spider->wide_handler->ignore_dup_key)
       ) {
         DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
         DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
@@ -7691,8 +8184,8 @@ int spider_db_direct_update(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -7797,12 +8290,14 @@ int spider_db_bulk_direct_update(
         spider_clear_bit(spider->db_request_phase, roop_count);
       }
       st_spider_db_request_key request_key;
-      request_key.spider_thread_id = spider->trx->spider_thread_id;
-      request_key.query_id = spider->trx->thd->query_id;
+      request_key.spider_thread_id =
+        spider->wide_handler->trx->spider_thread_id;
+      request_key.query_id = spider->wide_handler->trx->thd->query_id;
       request_key.handler = spider;
       request_key.request_id = spider->db_request_id[roop_count];
       request_key.next = NULL;
-      if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+      if ((result = conn->db_conn->use_result(spider, &request_key,
+        &error_num)))
       {
         if (!counted)
         {
@@ -8068,7 +8563,7 @@ int spider_db_direct_delete(
     if (spider->is_bulk_access_clone)
     {
       spider->connection_ids[roop_count] = conn->connection_id;
-      spider_trx_add_bulk_access_conn(spider->trx, conn);
+      spider_trx_add_bulk_access_conn(spider->wide_handler->trx, conn);
     } else {
 #endif
       conn->need_mon = &spider->need_mons[roop_count];
@@ -8089,8 +8584,8 @@ int spider_db_direct_delete(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8107,7 +8602,8 @@ int spider_db_direct_delete(
         }
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, roop_count,
+        spider->wide_handler->trx->thd,
         share);
       if (dbton_hdl->execute_sql(
         sql_type,
@@ -8125,8 +8621,8 @@ int spider_db_direct_delete(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8165,12 +8661,14 @@ int spider_db_direct_delete(
           spider_clear_bit(spider->db_request_phase, roop_count);
         }
         st_spider_db_request_key request_key;
-        request_key.spider_thread_id = spider->trx->spider_thread_id;
-        request_key.query_id = spider->trx->thd->query_id;
+        request_key.spider_thread_id =
+          spider->wide_handler->trx->spider_thread_id;
+        request_key.query_id = spider->wide_handler->trx->thd->query_id;
         request_key.handler = spider;
         request_key.request_id = spider->db_request_id[roop_count];
         request_key.next = NULL;
-        if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+        if ((result = conn->db_conn->use_result(spider, &request_key,
+          &error_num)))
         {
           if (!counted)
           {
@@ -8304,7 +8802,7 @@ int spider_db_direct_delete(
     if (spider->is_bulk_access_clone)
     {
       spider->connection_ids[roop_count] = conn->connection_id;
-      spider_trx_add_bulk_access_conn(spider->trx, conn);
+      spider_trx_add_bulk_access_conn(spider->wide_handler->trx, conn);
     } else {
 #endif
       conn->need_mon = &spider->need_mons[roop_count];
@@ -8325,8 +8823,8 @@ int spider_db_direct_delete(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8343,7 +8841,8 @@ int spider_db_direct_delete(
         }
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, roop_count,
+        spider->wide_handler->trx->thd,
         share);
       if (dbton_hdl->execute_sql(
         sql_type,
@@ -8361,8 +8860,8 @@ int spider_db_direct_delete(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8458,7 +8957,8 @@ int spider_db_delete_all_rows(
     DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
-    spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, roop_count,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, roop_count)) ||
@@ -8489,8 +8989,8 @@ int spider_db_delete_all_rows(
             spider->need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8520,8 +9020,8 @@ int spider_db_delete_all_rows(
             spider->need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8538,7 +9038,8 @@ int spider_db_delete_all_rows(
           }
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, roop_count, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, roop_count,
+          spider->wide_handler->trx->thd,
           share);
         if (dbton_hdl->execute_sql(
           SPIDER_SQL_TYPE_DELETE_SQL,
@@ -8556,8 +9057,8 @@ int spider_db_delete_all_rows(
             spider->need_mons[roop_count]
           ) {
             error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -8586,8 +9087,8 @@ int spider_db_delete_all_rows(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8626,7 +9127,7 @@ int spider_db_disable_keys(
   spider_db_handler *dbton_hdl;
   DBUG_ENTER("spider_db_disable_keys");
   if (
-    spider_param_internal_optimize(spider->trx->thd,
+    spider_param_internal_optimize(spider->wide_handler->trx->thd,
       share->internal_optimize) == 1
   ) {
     for (
@@ -8647,8 +9148,8 @@ int spider_db_disable_keys(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8679,7 +9180,7 @@ int spider_db_enable_keys(
   spider_db_handler *dbton_hdl;
   DBUG_ENTER("spider_db_enable_keys");
   if (
-    spider_param_internal_optimize(spider->trx->thd,
+    spider_param_internal_optimize(spider->wide_handler->trx->thd,
       share->internal_optimize) == 1
   ) {
     for (
@@ -8700,8 +9201,8 @@ int spider_db_enable_keys(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8733,7 +9234,7 @@ int spider_db_check_table(
   spider_db_handler *dbton_hdl;
   DBUG_ENTER("spider_db_check_table");
   if (
-    spider_param_internal_optimize(spider->trx->thd,
+    spider_param_internal_optimize(spider->wide_handler->trx->thd,
       share->internal_optimize) == 1
   ) {
     for (
@@ -8754,8 +9255,8 @@ int spider_db_check_table(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8787,7 +9288,7 @@ int spider_db_repair_table(
   spider_db_handler *dbton_hdl;
   DBUG_ENTER("spider_db_repair_table");
   if (
-    spider_param_internal_optimize(spider->trx->thd,
+    spider_param_internal_optimize(spider->wide_handler->trx->thd,
       share->internal_optimize) == 1
   ) {
     for (
@@ -8808,8 +9309,8 @@ int spider_db_repair_table(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8840,7 +9341,7 @@ int spider_db_analyze_table(
   spider_db_handler *dbton_hdl;
   DBUG_ENTER("spider_db_analyze_table");
   if (
-    spider_param_internal_optimize(spider->trx->thd,
+    spider_param_internal_optimize(spider->wide_handler->trx->thd,
       share->internal_optimize) == 1
   ) {
     for (
@@ -8861,8 +9362,8 @@ int spider_db_analyze_table(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8893,7 +9394,7 @@ int spider_db_optimize_table(
   spider_db_handler *dbton_hdl;
   DBUG_ENTER("spider_db_optimize_table");
   if (
-    spider_param_internal_optimize(spider->trx->thd,
+    spider_param_internal_optimize(spider->wide_handler->trx->thd,
       share->internal_optimize) == 1
   ) {
     for (
@@ -8914,8 +9415,8 @@ int spider_db_optimize_table(
           spider->need_mons[roop_count]
         ) {
           error_num = spider_ping_table_mon_from_table(
-              spider->trx,
-              spider->trx->thd,
+              spider->wide_handler->trx,
+              spider->wide_handler->trx->thd,
               share,
               roop_count,
               (uint32) share->monitoring_sid[roop_count],
@@ -8964,8 +9465,8 @@ int spider_db_flush_tables(
         spider->need_mons[roop_count]
       ) {
         error_num = spider_ping_table_mon_from_table(
-            spider->trx,
-            spider->trx->thd,
+            spider->wide_handler->trx,
+            spider->wide_handler->trx->thd,
             share,
             roop_count,
             (uint32) share->monitoring_sid[roop_count],
@@ -9012,8 +9513,8 @@ int spider_db_flush_logs(
         spider->need_mons[roop_count]
       ) {
         error_num = spider_ping_table_mon_from_table(
-            spider->trx,
-            spider->trx->thd,
+            spider->wide_handler->trx,
+            spider->wide_handler->trx->thd,
             share,
             roop_count,
             (uint32) share->monitoring_sid[roop_count],
@@ -9192,7 +9693,7 @@ int spider_db_print_item_type_default(
   spider_string *str
 ) {
   DBUG_ENTER("spider_db_print_item_type_default");
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   SPIDER_SHARE *share = spider->share;
   if (spider_param_skip_default_condition(thd,
     share->skip_default_condition))
@@ -9565,7 +10066,7 @@ int spider_db_open_item_ref(
     if (
       (*(item_ref->ref))->type() != Item::CACHE_ITEM &&
       item_ref->ref_type() != Item_ref::VIEW_REF &&
-      !item_ref->table_name &&
+      !item_ref->table_name.str &&
       item_ref->name.str &&
       item_ref->alias_name_used
     )
@@ -9691,9 +10192,9 @@ int spider_db_open_item_string(
   if (str)
   {
     THD *thd = NULL;
-    TABLE *table;
-    MY_BITMAP *saved_map;
-    Time_zone *saved_time_zone;
+    TABLE *UNINIT_VAR(table);
+    MY_BITMAP *saved_map = NULL;
+    Time_zone *UNINIT_VAR(saved_time_zone);
     String str_value;
     char tmp_buf[MAX_FIELD_WIDTH];
     spider_string tmp_str(tmp_buf, MAX_FIELD_WIDTH, str->charset());
@@ -9735,7 +10236,9 @@ int spider_db_open_item_string(
           goto end;
         }
       }
-      if (str->charset() != tmp_str2->charset())
+      DBUG_PRINT("info",("spider dbton_id=%u", dbton_id));
+      if (str->charset() != tmp_str2->charset() &&
+        spider_dbton[dbton_id].db_util->append_charset_name_before_string())
       {
         if ((error_num = spider_db_append_charset_name_before_string(str,
           tmp_str2->charset())))
@@ -10054,12 +10557,13 @@ int spider_db_append_condition(
         DBUG_RETURN(error_num);
     }
   } else {
-    if (spider->cond_check)
-      DBUG_RETURN(spider->cond_check_error);
-    spider->cond_check = TRUE;
-    if ((spider->cond_check_error = spider->append_condition_sql_part(
-      NULL, 0, SPIDER_SQL_TYPE_SELECT_SQL, TRUE)))
-      DBUG_RETURN(spider->cond_check_error);
+    if (spider->wide_handler->cond_check)
+      DBUG_RETURN(spider->wide_handler->cond_check_error);
+    spider->wide_handler->cond_check = TRUE;
+    if ((spider->wide_handler->cond_check_error =
+      spider->append_condition_sql_part(
+        NULL, 0, SPIDER_SQL_TYPE_SELECT_SQL, TRUE)))
+      DBUG_RETURN(spider->wide_handler->cond_check_error);
   }
   DBUG_RETURN(0);
 }
@@ -10076,8 +10580,8 @@ int spider_db_append_update_columns(
 ) {
   int error_num;
   bool add_comma = FALSE;
-  List_iterator_fast<Item> fi(*spider->direct_update_fields),
-    vi(*spider->direct_update_values);
+  List_iterator_fast<Item> fi(*spider->wide_handler->direct_update_fields),
+    vi(*spider->wide_handler->direct_update_values);
   Item *field, *value;
   DBUG_ENTER("spider_db_append_update_columns");
   while ((field = fi++))
@@ -10121,6 +10625,31 @@ int spider_db_append_update_columns(
 }
 #endif
 
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+bool spider_db_check_select_colum_in_group( /* is field in group_list? */
+  st_select_lex *select_lex, /* select whose group_list is scanned */
+  Field *field /* field to look for; matched by pointer equality */
+) {
+  ORDER *group;
+  DBUG_ENTER("spider_db_check_select_colum_in_group");
+  for (group = (ORDER *) select_lex->group_list.first; group;
+    group = group->next)
+  {
+    Item *item = *group->item;
+    if (item->type() == Item::FIELD_ITEM) /* other item types are ignored */
+    {
+      Item_field *item_field = (Item_field *) item;
+      if (item_field->field == field)
+      {
+        /* field is itself a group_list item: it can be used directly */
+        DBUG_RETURN(TRUE);
+      }
+    }
+  }
+  DBUG_RETURN(FALSE); /* no FIELD_ITEM entry in group_list uses this field */
+}
+#endif
+
 uint spider_db_check_ft_idx(
   Item_func *item_func,
   ha_spider *spider
@@ -10388,8 +10917,8 @@ int spider_db_udf_direct_sql(
 #else
   if (direct_sql->real_table_used)
   {
-    if (spider_sys_open_tables(c_thd, &direct_sql->table_list_first,
-      &direct_sql->open_tables_backup))
+    if (spider_sys_open_and_lock_tables(c_thd, &direct_sql->table_list_first,
+                               &direct_sql->open_tables_backup))
     {
       direct_sql->real_table_used = FALSE;
       DBUG_RETURN(my_errno);
@@ -10419,7 +10948,7 @@ int spider_db_udf_direct_sql(
       need_trx_end = TRUE;
     } else {
 #endif
-      if (c_thd->transaction.stmt.ha_list)
+      if (c_thd->transaction->stmt.ha_list)
         need_trx_end = FALSE;
       else
         need_trx_end = TRUE;
@@ -10516,7 +11045,7 @@ int spider_db_udf_direct_sql(
         request_key.handler = direct_sql;
         request_key.request_id = 1;
         request_key.next = NULL;
-        if ((result = conn->db_conn->use_result(&request_key, &error_num)))
+        if ((result = conn->db_conn->use_result(NULL, &request_key, &error_num)))
         {
           end_of_file = FALSE;
           if (roop_count >= 0)
@@ -10655,7 +11184,7 @@ int spider_db_udf_direct_sql(
               if (table->file->has_transactions())
 #endif
               {
-                table->file->ha_external_lock(table->in_use, F_UNLCK);
+                table->file->ha_external_unlock(table->in_use);
 #if MYSQL_VERSION_ID < 50500
 #else
                 if (
@@ -10751,10 +11280,12 @@ int spider_db_udf_direct_sql_select_db(
   SPIDER_DB_CONN *db_conn = conn->db_conn;
   DBUG_ENTER("spider_db_udf_direct_sql_select_db");
   pthread_mutex_assert_owner(&conn->mta_conn_mutex);
+  if (
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-  if (direct_sql->access_mode == 0)
-  {
+    direct_sql->access_mode == 0 &&
 #endif
+    spider_dbton[conn->dbton_id].db_util->database_has_default_value()
+  ) {
     DBUG_ASSERT(conn->mta_conn_mutex_file_pos.file_name);
     if (
       !conn->default_database.length() ||
@@ -10786,9 +11317,7 @@ int spider_db_udf_direct_sql_select_db(
         direct_sql->tgt_default_db_name_length + 1);
       conn->default_database.length(direct_sql->tgt_default_db_name_length);
     }
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   }
-#endif
   DBUG_RETURN(0);
 }
 
@@ -10882,10 +11411,12 @@ int spider_db_udf_ping_table(
     int need_mon = 0;
     uint tmp_conn_link_idx = 0;
     ha_spider spider;
+    SPIDER_WIDE_HANDLER wide_handler;
     uchar db_request_phase = 0;
     ulonglong db_request_id = 0;
     spider.share = share;
-    spider.trx = trx;
+    spider.wide_handler = &wide_handler;
+    wide_handler.trx = trx;
     spider.need_mons = &need_mon;
     spider.conn_link_idx = &tmp_conn_link_idx;
     spider.db_request_phase = &db_request_phase;
@@ -11131,28 +11662,41 @@ int spider_db_udf_ping_table_append_select(
   str->q_append(SPIDER_SQL_SELECT_STR, SPIDER_SQL_SELECT_LEN);
   str->q_append(SPIDER_SQL_ONE_STR, SPIDER_SQL_ONE_LEN);
   str->q_append(SPIDER_SQL_FROM_STR, SPIDER_SQL_FROM_LEN);
-  if ((error_num = spider_db_append_name_with_quote_str(str,
-    share->tgt_dbs[0], dbton_id)))
-    DBUG_RETURN(error_num);
-  if (str->reserve(SPIDER_SQL_DOT_LEN))
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-  str->q_append(SPIDER_SQL_DOT_STR, SPIDER_SQL_DOT_LEN);
+  if (share->tgt_dbs[0])
+  {
+    if ((error_num = spider_db_append_name_with_quote_str(str,
+      share->tgt_dbs[0], dbton_id)))
+      DBUG_RETURN(error_num);
+    if (str->reserve(SPIDER_SQL_DOT_LEN))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    str->q_append(SPIDER_SQL_DOT_STR, SPIDER_SQL_DOT_LEN);
+  }
   if ((error_num = spider_db_append_name_with_quote_str(str,
     share->tgt_table_names[0], share->sql_dbton_ids[0])))
     DBUG_RETURN(error_num);
 
-  limit_str_length = my_sprintf(limit_str, (limit_str, "%lld", limit));
-  if (str->reserve(
-    (use_where ? (where_str->length() * 2) : 0) +
-    SPIDER_SQL_LIMIT_LEN + limit_str_length
-  ))
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-  if (use_where)
+  if (spider_dbton[dbton_id].db_util->limit_mode() == 1)
   {
-    str->append_escape_string(where_str->ptr(), where_str->length());
+    if (use_where)
+    {
+      if (str->reserve(where_str->length() * 2))
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      str->append_escape_string(where_str->ptr(), where_str->length());
+    }
+  } else {
+    limit_str_length = my_sprintf(limit_str, (limit_str, "%lld", limit));
+    if (str->reserve(
+      (use_where ? (where_str->length() * 2) : 0) +
+      SPIDER_SQL_LIMIT_LEN + limit_str_length
+    ))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    if (use_where)
+    {
+      str->append_escape_string(where_str->ptr(), where_str->length());
+    }
+    str->q_append(SPIDER_SQL_LIMIT_STR, SPIDER_SQL_LIMIT_LEN);
+    str->q_append(limit_str, limit_str_length);
   }
-  str->q_append(SPIDER_SQL_LIMIT_STR, SPIDER_SQL_LIMIT_LEN);
-  str->q_append(limit_str, limit_str_length);
   DBUG_RETURN(0);
 }
 
@@ -11181,6 +11725,7 @@ int spider_db_udf_ping_table_mon_next(
   int init_sql_alloc_size =
     spider_param_init_sql_alloc_size(thd, share->init_sql_alloc_size);
   ha_spider spider;
+  SPIDER_WIDE_HANDLER wide_handler;
   SPIDER_TRX trx;
   DBUG_ENTER("spider_db_udf_ping_table_mon_next");
   char *sql_buf = (char *) my_alloca(init_sql_alloc_size);
@@ -11195,7 +11740,8 @@ int spider_db_udf_ping_table_mon_next(
   sql_str.length(0);
   trx.thd = thd;
   spider.share = share;
-  spider.trx = &trx;
+  spider.wide_handler = &wide_handler;
+  wide_handler.trx = &trx;
   spider.need_mons = &need_mon;
   spider.conn_link_idx = &tmp_conn_link_idx;
 
@@ -11351,8 +11897,8 @@ int spider_db_udf_copy_tables(
   DBUG_ENTER("spider_db_udf_copy_tables");
   if (!(last_row_pos = (ulong *)
     spider_bulk_malloc(spider_current_trx, 30, MYF(MY_WME),
-      &last_row_pos, sizeof(ulong) * table->s->fields,
-      &last_lengths, sizeof(ulong) * table->s->fields,
+      &last_row_pos, (uint) (sizeof(ulong) * table->s->fields),
+      &last_lengths, (uint) (sizeof(ulong) * table->s->fields),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -11517,7 +12063,7 @@ int spider_db_udf_copy_tables(
       request_key.handler = copy_tables;
       request_key.request_id = 1;
       request_key.next = NULL;
-      if ((result = tmp_conn->db_conn->use_result(&request_key, &error_num)))
+      if ((result = tmp_conn->db_conn->use_result(NULL, &request_key, &error_num)))
       {
         SPIDER_DB_ROW *row;
         roop_count = 0;
@@ -11918,7 +12464,8 @@ int spider_db_open_handler(
       goto error;
     }
 
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (dbton_hdl->execute_sql(
       SPIDER_SQL_TYPE_HANDLER,
@@ -11997,10 +12544,11 @@ int spider_db_open_handler(
     if (spider->is_bulk_access_clone && !spider->bulk_access_executing)
     {
       spider->connection_ids[link_idx] = conn->connection_id;
-      spider_trx_add_bulk_access_conn(spider->trx, conn);
+      spider_trx_add_bulk_access_conn(spider->wide_handler->trx, conn);
     } else {
 #endif
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         share);
       if (dbton_hdl->execute_sql(
         SPIDER_SQL_TYPE_SELECT_HS,
@@ -12018,12 +12566,14 @@ int spider_db_open_handler(
         spider_clear_bit(spider->db_request_phase, link_idx);
       }
       st_spider_db_request_key request_key;
-      request_key.spider_thread_id = spider->trx->spider_thread_id;
-      request_key.query_id = spider->trx->thd->query_id;
+      request_key.spider_thread_id =
+        spider->wide_handler->trx->spider_thread_id;
+      request_key.query_id = spider->wide_handler->trx->thd->query_id;
       request_key.handler = spider;
       request_key.request_id = spider->db_request_id[link_idx];
       request_key.next = NULL;
-      if (!(result = conn->db_conn->use_result(&request_key, &error_num)))
+      if (!(result = conn->db_conn->use_result(spider, &request_key,
+        &error_num)))
       {
         if (!error_num)
         {
@@ -12156,12 +12706,13 @@ int spider_db_bulk_open_handler(
       spider_clear_bit(spider->db_request_phase, link_idx);
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = spider->db_request_id[link_idx];
     request_key.next = NULL;
-    if (!(result = conn->db_conn->use_result(&request_key, &error_num)))
+    if (!(result = conn->db_conn->use_result(spider, &request_key,
+      &error_num)))
     {
       if (!error_num)
       {
@@ -12216,7 +12767,8 @@ int spider_db_close_handler(
         DBUG_RETURN(error_num);
       }
 
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         spider->share);
       if (dbton_hdl->execute_sql(
         SPIDER_SQL_TYPE_HANDLER,
@@ -12292,4 +12844,3 @@ bool spider_db_conn_is_network_error(
   }
   DBUG_RETURN(FALSE);
 }
-
diff --git a/storage/spider/spd_db_conn.h b/storage/spider/spd_db_conn.h
index e820851d257..8af820c7dfa 100644
--- a/storage/spider/spd_db_conn.h
+++ b/storage/spider/spd_db_conn.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2019-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -40,6 +40,10 @@
 #define SPIDER_SQL_OPEN_PAREN_LEN (sizeof(SPIDER_SQL_OPEN_PAREN_STR) - 1)
 #define SPIDER_SQL_CLOSE_PAREN_STR ")"
 #define SPIDER_SQL_CLOSE_PAREN_LEN (sizeof(SPIDER_SQL_CLOSE_PAREN_STR) - 1)
+#define SPIDER_SQL_OPEN_BRACE_STR "{"
+#define SPIDER_SQL_OPEN_BRACE_LEN (sizeof(SPIDER_SQL_OPEN_BRACE_STR) - 1)
+#define SPIDER_SQL_CLOSE_BRACE_STR "}"
+#define SPIDER_SQL_CLOSE_BRACE_LEN (sizeof(SPIDER_SQL_CLOSE_BRACE_STR) - 1)
 #define SPIDER_SQL_COMMA_STR ","
 #define SPIDER_SQL_COMMA_LEN (sizeof(SPIDER_SQL_COMMA_STR) - 1)
 #define SPIDER_SQL_UNION_ALL_STR ")union all("
@@ -617,6 +621,12 @@ int spider_db_store_result(
   TABLE *table
 );
 
+int spider_db_store_result_for_reuse_cursor(
+  ha_spider *spider,
+  int link_idx,
+  TABLE *table
+);
+
 void spider_db_discard_result(
   ha_spider *spider,
   int link_idx,
@@ -741,8 +751,8 @@ int spider_db_show_index(
 );
 
 ha_rows spider_db_explain_select(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ha_spider *spider,
   int link_idx
 );
@@ -1061,6 +1071,13 @@ int spider_db_append_update_columns(
 );
 #endif
 
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+bool spider_db_check_select_colum_in_group(
+  st_select_lex *select_lex,
+  Field *field
+);
+#endif
+
 uint spider_db_check_ft_idx(
   Item_func *item_func,
   ha_spider *spider
diff --git a/storage/spider/spd_db_handlersocket.cc b/storage/spider/spd_db_handlersocket.cc
index b8e4c0e705a..b3e73b856a3 100644
--- a/storage/spider/spd_db_handlersocket.cc
+++ b/storage/spider/spd_db_handlersocket.cc
@@ -127,7 +127,10 @@ SPIDER_DBTON spider_dbton_handlersocket = {
   NULL,
   spider_handlersocket_create_conn,
   spider_handlersocket_support_direct_join,
-  &spider_db_handlersocket_utility
+  &spider_db_handlersocket_utility,
+  "For communicating using the handlersocket protocol",
+  "0.1.0",
+  SPIDER_MATURITY_BETA
 };
 
 #ifndef HANDLERSOCKET_MYSQL_UTIL
@@ -505,8 +508,8 @@ SPIDER_DB_ROW *spider_db_handlersocket_row::clone()
     DBUG_RETURN(NULL);
   }
   if (!spider_bulk_malloc(spider_current_trx, 169, MYF(MY_WME),
-    &clone_row->hs_row, sizeof(SPIDER_HS_STRING_REF) * field_count,
-    &tmp_char, row_size,
+    &clone_row->hs_row, (uint) (sizeof(SPIDER_HS_STRING_REF) * field_count),
+    &tmp_char, (uint) (row_size),
     NullS)
   ) {
     delete clone_row;
@@ -1438,6 +1441,7 @@ spider_db_result *spider_db_handlersocket::store_result(
 }
 
 spider_db_result *spider_db_handlersocket::use_result(
+  ha_spider *spider,
   st_spider_db_request_key *request_key,
   int *error_num
 ) {
@@ -5421,8 +5425,8 @@ int spider_handlersocket_handler::append_delete_all_rows_part(
 }
 
 int spider_handlersocket_handler::append_explain_select_part(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ulong sql_type,
   int link_idx
 ) {
@@ -6167,8 +6171,8 @@ void spider_handlersocket_handler::minimum_select_bitmap_create()
   {
     uint field_index = (*field_p)->field_index;
     if (
-      spider_bit_is_set(spider->searched_bitmap, field_index) |
-      bitmap_is_set(table->read_set, field_index) |
+      spider_bit_is_set(spider->searched_bitmap, field_index) ||
+      bitmap_is_set(table->read_set, field_index) ||
       bitmap_is_set(table->write_set, field_index)
     ) {
       spider_set_bit(minimum_select_bitmap, field_index);
diff --git a/storage/spider/spd_db_handlersocket.h b/storage/spider/spd_db_handlersocket.h
index 19a4a391ed6..f7a454b66af 100644
--- a/storage/spider/spd_db_handlersocket.h
+++ b/storage/spider/spd_db_handlersocket.h
@@ -330,6 +330,7 @@ public:
     int *error_num
   );
   spider_db_result *use_result(
+    ha_spider *spider,
     st_spider_db_request_key *request_key,
     int *error_num
   );
@@ -843,8 +844,8 @@ public:
     ulong sql_type
   );
   int append_explain_select_part(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     ulong sql_type,
     int link_idx
   );
@@ -966,8 +967,8 @@ public:
     ulonglong &last_insert_id
   );
   ha_rows explain_select(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     int link_idx
   );
   int lock_tables(
diff --git a/storage/spider/spd_db_include.cc b/storage/spider/spd_db_include.cc
index 7f600142187..b9a0532d1b9 100644
--- a/storage/spider/spd_db_include.cc
+++ b/storage/spider/spd_db_include.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2018-2019 Kentoku Shiba
+/* Copyright (C) 2018-2020 Kentoku Shiba
+   Copyright (C) 2018-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -31,6 +32,9 @@
 #include "spd_err.h"
 #include "spd_db_include.h"
 #include "spd_include.h"
+#include "spd_conn.h"
+
+extern SPIDER_DBTON spider_dbton[SPIDER_DBTON_SIZE];
 
 spider_db_result::spider_db_result(
   SPIDER_DB_CONN *in_db_conn
@@ -51,6 +55,13 @@ int spider_db_result::fetch_table_checksum(
 }
 #endif
 
+uint spider_db_result::limit_mode()
+{ /* Returns limit_mode() of the db_util stored in spider_dbton[] for this result's dbton_id. */
+  DBUG_ENTER("spider_db_result::limit_mode");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(spider_dbton[dbton_id].db_util->limit_mode());
+}
+
 spider_db_conn::spider_db_conn(
   SPIDER_CONN *in_conn
 ) : conn(in_conn), dbton_id(in_conn->dbton_id)
@@ -60,6 +71,107 @@ spider_db_conn::spider_db_conn(
   DBUG_VOID_RETURN;
 }
 
+bool spider_db_conn::set_loop_check_in_bulk_sql()
+{ /* Default implementation: always returns FALSE. */
+  DBUG_ENTER("spider_db_conn::set_loop_check_in_bulk_sql");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(FALSE);
+}
+
+int spider_db_conn::set_loop_check(
+  int *need_mon
+) {
+  DBUG_ENTER("spider_db_conn::set_loop_check");
+  DBUG_PRINT("info",("spider this=%p", this));
+  /* default implementation: ignores need_mon, does no work, returns 0 */
+  DBUG_RETURN(0);
+}
+
+int spider_db_conn::fin_loop_check()
+{
+  /* Clears the flag of every loop-check entry reachable from conn, empties
+     the queue hash and detaches both entry lists.  Always returns 0. */
+  st_spider_conn_loop_check *entry;
+  DBUG_ENTER("spider_db_conn::fin_loop_check");
+  DBUG_PRINT("info",("spider this=%p", this));
+  if (conn->loop_check_queue.records)
+  {
+    for (uint idx = 0;
+      (entry = (SPIDER_CONN_LOOP_CHECK *) my_hash_element(
+        &conn->loop_check_queue, idx));
+      ++idx)
+    {
+      entry->flag = 0;
+    }
+    my_hash_reset(&conn->loop_check_queue);
+  }
+  /* entries chained from loop_check_ignored_first */
+  for (entry = conn->loop_check_ignored_first; entry; entry = entry->next)
+  {
+    entry->flag = 0;
+  }
+  conn->loop_check_ignored_first = NULL;
+  /* entries chained from loop_check_meraged_first */
+  for (entry = conn->loop_check_meraged_first; entry; entry = entry->next)
+  {
+    entry->flag = 0;
+  }
+  conn->loop_check_meraged_first = NULL;
+  DBUG_RETURN(0);
+}
+
+uint spider_db_conn::limit_mode()
+{ /* Returns limit_mode() of the db_util stored in spider_dbton[] for this connection's dbton_id. */
+  DBUG_ENTER("spider_db_conn::limit_mode");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(spider_dbton[dbton_id].db_util->limit_mode());
+}
+
+int spider_db_util::append_loop_check(
+  spider_string *str,
+  SPIDER_CONN *conn
+) {
+  DBUG_ENTER("spider_db_util::append_loop_check");
+  DBUG_PRINT("info",("spider this=%p", this));
+  /* default implementation: str and conn are left untouched, returns 0 */
+  DBUG_RETURN(0);
+}
+
+bool spider_db_util::tables_on_different_db_are_joinable()
+{ /* Default implementation: always returns TRUE. */
+  DBUG_ENTER("spider_db_util::tables_on_different_db_are_joinable");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(TRUE);
+}
+
+bool spider_db_util::socket_has_default_value()
+{ /* Default implementation: always returns TRUE. */
+  DBUG_ENTER("spider_db_util::socket_has_default_value");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(TRUE);
+}
+
+bool spider_db_util::database_has_default_value()
+{ /* Default implementation: always returns TRUE (tested by spider_db_udf_direct_sql_select_db()). */
+  DBUG_ENTER("spider_db_util::database_has_default_value");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(TRUE);
+}
+
+bool spider_db_util::append_charset_name_before_string()
+{ /* Default implementation: always returns FALSE. */
+  DBUG_ENTER("spider_db_util::append_charset_name_before_string");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(FALSE);
+}
+
+uint spider_db_util::limit_mode()
+{ /* Default mode is 0; spider_db_udf_ping_table_append_select() omits the LIMIT clause only for mode 1. */
+  DBUG_ENTER("spider_db_util::limit_mode");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(0);
+}
+
 #ifdef HA_HAS_CHECKSUM_EXTENDED
 bool spider_db_share::checksum_support()
 {
@@ -76,3 +188,39 @@ int spider_db_handler::checksum_table(
   DBUG_RETURN(0);
 }
 #endif
+
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+bool spider_db_handler::check_direct_update(
+  st_select_lex *select_lex,
+  longlong select_limit,
+  longlong offset_limit
+) {
+  DBUG_ENTER("spider_db_handler::check_direct_update");
+  DBUG_PRINT("info",("spider this=%p", this));
+  /* TRUE as soon as one of these holds, tested in this order: the limit
+     differs from 9223372036854775807, the offset is non-zero, or the
+     select has ORDER BY elements.  FALSE otherwise. */
+  DBUG_RETURN(
+    select_limit != 9223372036854775807LL ||
+    offset_limit != 0 ||
+    select_lex->order_list.elements
+  );
+}
+
+bool spider_db_handler::check_direct_delete(
+  st_select_lex *select_lex,
+  longlong select_limit,
+  longlong offset_limit
+) {
+  DBUG_ENTER("spider_db_handler::check_direct_delete");
+  DBUG_PRINT("info",("spider this=%p", this));
+  /* TRUE as soon as one of these holds, tested in this order: the limit
+     differs from 9223372036854775807, the offset is non-zero, or the
+     select has ORDER BY elements.  FALSE otherwise. */
+  DBUG_RETURN(
+    select_limit != 9223372036854775807LL ||
+    offset_limit != 0 ||
+    select_lex->order_list.elements
+  );
+}
+#endif
diff --git a/storage/spider/spd_db_include.h b/storage/spider/spd_db_include.h
index 9b005ba7ccd..41e24e06c21 100644
--- a/storage/spider/spd_db_include.h
+++ b/storage/spider/spd_db_include.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 2008-2018 Kentoku Shiba
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2019-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -20,6 +21,10 @@
 
 #define SPIDER_DBTON_SIZE 15
 
+#ifndef SIZEOF_STORED_DOUBLE
+#define SIZEOF_STORED_DOUBLE 8
+#endif
+
 #define SPIDER_DB_WRAPPER_MYSQL "mysql"
 #define SPIDER_DB_WRAPPER_MARIADB "mariadb"
 
@@ -89,11 +94,15 @@ typedef st_spider_result SPIDER_RESULT;
 
 #define SPIDER_SQL_SEMICOLON_STR ";"
 #define SPIDER_SQL_SEMICOLON_LEN sizeof(SPIDER_SQL_SEMICOLON_STR) - 1
+#define SPIDER_SQL_COLON_STR ":"
+#define SPIDER_SQL_COLON_LEN (sizeof(SPIDER_SQL_COLON_STR) - 1) /* length without the NUL; parenthesized so it expands as one operand */
 #define SPIDER_SQL_VALUE_QUOTE_STR "'"
 #define SPIDER_SQL_VALUE_QUOTE_LEN (sizeof(SPIDER_SQL_VALUE_QUOTE_STR) - 1)
 
 #define SPIDER_SQL_DOT_STR "."
 #define SPIDER_SQL_DOT_LEN (sizeof(SPIDER_SQL_DOT_STR) - 1)
+#define SPIDER_SQL_HYPHEN_STR "-"
+#define SPIDER_SQL_HYPHEN_LEN (sizeof(SPIDER_SQL_HYPHEN_STR) - 1)
 
 #define SPIDER_SQL_EQUAL_STR " = "
 #define SPIDER_SQL_EQUAL_LEN (sizeof(SPIDER_SQL_EQUAL_STR) - 1)
@@ -209,6 +218,11 @@ typedef st_spider_result SPIDER_RESULT;
 #define SPIDER_SQL_CONNECTION_LEN (sizeof(SPIDER_SQL_CONNECTION_STR) - 1)
 #define SPIDER_SQL_LCL_NAME_QUOTE_STR "`"
 #define SPIDER_SQL_LCL_NAME_QUOTE_LEN (sizeof(SPIDER_SQL_LCL_NAME_QUOTE_STR) - 1)
+#define SPIDER_SQL_MIN_STR "min"
+#define SPIDER_SQL_MIN_LEN (sizeof(SPIDER_SQL_MIN_STR) - 1)
+
+#define SPIDER_SQL_LOP_CHK_PRM_PRF_STR "spider_lc_"
+#define SPIDER_SQL_LOP_CHK_PRM_PRF_LEN (sizeof(SPIDER_SQL_LOP_CHK_PRM_PRF_STR) - 1)
 
 #define SPIDER_CONN_KIND_MYSQL (1 << 0)
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
@@ -870,6 +884,10 @@ public:
     spider_string *str,
     Time_zone *time_zone
   ) = 0;
+  virtual int append_loop_check(
+    spider_string *str,
+    SPIDER_CONN *conn
+  );
   virtual int append_start_transaction(
     spider_string *str
   ) = 0;
@@ -940,6 +958,11 @@ public:
     spider_string *str
   ) = 0;
 #endif
+  virtual bool tables_on_different_db_are_joinable();
+  virtual bool socket_has_default_value();
+  virtual bool database_has_default_value();
+  virtual bool append_charset_name_before_string();
+  virtual uint limit_mode();
 };
 
 class spider_db_row
@@ -990,12 +1013,12 @@ public:
 
 class spider_db_result
 {
-protected:
-  SPIDER_DB_CONN *db_conn;
 public:
-  uint dbton_id;
+  SPIDER_DB_CONN *db_conn;
+  uint           dbton_id;
   spider_db_result(SPIDER_DB_CONN *in_db_conn);
   virtual ~spider_db_result() {}
+  virtual void set_limit(longlong value) {}
   virtual bool has_result() = 0;
   virtual void free_result() = 0;
   virtual SPIDER_DB_ROW *current_row() = 0;
@@ -1050,19 +1073,20 @@ public:
     CHARSET_INFO *access_charset
   ) = 0;
 #endif
+  virtual uint limit_mode();
 };
 
 class spider_db_conn
 {
-protected:
-  SPIDER_CONN    *conn;
 public:
-  uint dbton_id;
+  SPIDER_CONN    *conn;
+  uint           dbton_id;
   spider_db_conn(
     SPIDER_CONN *in_conn
   );
   virtual ~spider_db_conn() {}
   virtual int init() = 0;
+  virtual void set_limit(longlong value) {}
   virtual bool is_connected() = 0;
   virtual void bg_connect() = 0;
   virtual int connect(
@@ -1101,6 +1125,7 @@ public:
     int *error_num
   ) = 0;
   virtual spider_db_result *use_result(
+    ha_spider *spider,
     st_spider_db_request_key *request_key,
     int *error_num
   ) = 0;
@@ -1182,6 +1207,11 @@ public:
     Time_zone *time_zone,
     int *need_mon
   ) = 0;
+  virtual bool set_loop_check_in_bulk_sql();
+  virtual int set_loop_check(
+    int *need_mon
+  );
+  virtual int fin_loop_check();
   virtual int show_master_status(
     SPIDER_TRX *trx,
     SPIDER_SHARE *share,
@@ -1264,6 +1294,7 @@ public:
   virtual bool cmp_request_key_to_snd(
     st_spider_db_request_key *request_key
   ) = 0;
+  virtual uint limit_mode();
 };
 
 class spider_db_share
@@ -1323,6 +1354,7 @@ public:
 #ifdef SPIDER_HAS_GROUP_BY_HANDLER
   SPIDER_LINK_IDX_CHAIN *link_idx_chain;
 #endif
+  bool strict_group_by;
   bool no_where_cond;
   spider_db_handler(ha_spider *spider, spider_db_share *db_share) :
     dbton_id(db_share->dbton_id), spider(spider), db_share(db_share),
@@ -1567,8 +1599,8 @@ public:
     ulong sql_type
   ) = 0;
   virtual int append_explain_select_part(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     ulong sql_type,
     int link_idx
   ) = 0;
@@ -1685,8 +1717,8 @@ public:
     ulonglong &last_insert_id
   ) = 0;
   virtual ha_rows explain_select(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     int link_idx
   ) = 0;
   virtual int lock_tables(
@@ -1805,6 +1837,18 @@ public:
     ulong sql_type
   ) = 0;
 #endif
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  virtual bool check_direct_update(
+    st_select_lex *select_lex,
+    longlong select_limit,
+    longlong offset_limit
+  );
+  virtual bool check_direct_delete(
+    st_select_lex *select_lex,
+    longlong select_limit,
+    longlong offset_limit
+  );
+#endif
 };
 
 class spider_db_copy_table
@@ -1880,6 +1924,23 @@ enum spider_db_access_type
   SPIDER_DB_ACCESS_TYPE_NOSQL
 };
 
+#define SPIDER_MATURITY_UNKNOWN      0
+#define SPIDER_MATURITY_EXPERIMENTAL 1
+#define SPIDER_MATURITY_ALPHA        2
+#define SPIDER_MATURITY_BETA         3
+#define SPIDER_MATURITY_GAMMA        4
+#define SPIDER_MATURITY_STABLE       5
+/* Display names for the levels above, stored in SPIDER_MATURITY_* order. */
+static const LEX_CSTRING maturity_name[] =
+{
+  { STRING_WITH_LEN("Unknown") },
+  { STRING_WITH_LEN("Experimental") },
+  { STRING_WITH_LEN("Alpha") },
+  { STRING_WITH_LEN("Beta") },
+  { STRING_WITH_LEN("Gamma") },
+  { STRING_WITH_LEN("Stable") }
+};
+
 typedef struct st_spider_dbton
 {
   uint dbton_id;
@@ -1895,6 +1956,9 @@ typedef struct st_spider_dbton
   SPIDER_DB_CONN *(*create_db_conn)(SPIDER_CONN *conn);
   bool (*support_direct_join)();
   spider_db_util *db_util;
+  const char *descr;
+  const char *version_info;
+  unsigned int maturity;
 } SPIDER_DBTON;
 
 typedef struct st_spider_position
@@ -2042,7 +2106,6 @@ typedef struct st_spider_result_list
 #endif
     int                   quick_phase;
   bool                    keyread;
-  int                     lock_type;
   TABLE                   *table;
 #ifndef WITHOUT_SPIDER_BG_SEARCH
   volatile int            bgs_error;
diff --git a/storage/spider/spd_db_mysql.cc b/storage/spider/spd_db_mysql.cc
index 3425a97049f..5c41947cdeb 100644
--- a/storage/spider/spd_db_mysql.cc
+++ b/storage/spider/spd_db_mysql.cc
@@ -1,4 +1,5 @@
 /* Copyright (C) 2012-2018 Kentoku Shiba
+   Copyright (c) 2020, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -47,7 +48,7 @@
 #include "spd_sys_table.h"
 #include "spd_table.h"
 
-extern struct charset_info_st *spd_charset_utf8_bin;
+extern struct charset_info_st *spd_charset_utf8mb3_bin;
 extern bool volatile *spd_abort_loop;
 
 extern handlerton *spider_hton_ptr;
@@ -99,6 +100,9 @@ static const char *name_quote_str = SPIDER_SQL_NAME_QUOTE_STR;
 #define SPIDER_SQL_TIME_ZONE_STR "set session time_zone = '"
 #define SPIDER_SQL_TIME_ZONE_LEN sizeof(SPIDER_SQL_TIME_ZONE_STR) - 1
 
+#define SPIDER_SQL_SET_USER_VAL_STR "set @`"
+#define SPIDER_SQL_SET_USER_VAL_LEN (sizeof(SPIDER_SQL_SET_USER_VAL_STR) - 1) /* length without the NUL; parenthesized so it expands as one operand */
+
 #define SPIDER_SQL_COMMIT_STR "commit"
 #define SPIDER_SQL_COMMIT_LEN sizeof(SPIDER_SQL_COMMIT_STR) - 1
 #define SPIDER_SQL_ROLLBACK_STR "rollback"
@@ -325,7 +329,10 @@ SPIDER_DBTON spider_dbton_mysql = {
   spider_mysql_create_copy_table,
   spider_mysql_create_conn,
   spider_mysql_support_direct_join,
-  &spider_db_mysql_utility
+  &spider_db_mysql_utility,
+  "For communicating to MySQL using native protocol",
+  "3.4.0",
+  SPIDER_MATURITY_STABLE
 };
 
 SPIDER_DBTON spider_dbton_mariadb = {
@@ -339,7 +346,10 @@ SPIDER_DBTON spider_dbton_mariadb = {
   spider_mariadb_create_copy_table,
   spider_mariadb_create_conn,
   spider_mariadb_support_direct_join,
-  &spider_db_mariadb_utility
+  &spider_db_mariadb_utility,
+  "For communicating to MariaDB using native protocol",
+  "3.4.0",
+  SPIDER_MATURITY_STABLE
 };
 
 spider_db_mbase_row::spider_db_mbase_row(
@@ -545,9 +555,9 @@ SPIDER_DB_ROW *spider_db_mbase_row::clone()
     row_size = record_size + field_count;
   }
   if (!spider_bulk_malloc(spider_current_trx, 29, MYF(MY_WME),
-    &clone_row->row, sizeof(char*) * field_count,
-    &tmp_char, row_size,
-    &clone_row->lengths, sizeof(ulong) * field_count,
+    &clone_row->row, (uint) (sizeof(char*) * field_count),
+    &tmp_char, (uint) (row_size),
+    &clone_row->lengths, (uint) (sizeof(ulong) * field_count),
     NullS)
   ) {
     delete clone_row;
@@ -1865,7 +1875,7 @@ int spider_db_mbase::init()
   DBUG_ENTER("spider_db_mbase::init");
   DBUG_PRINT("info",("spider this=%p", this));
   if (
-    my_hash_init(&lock_table_hash, spd_charset_utf8_bin, 32, 0, 0,
+    my_hash_init(PSI_INSTRUMENT_ME, &lock_table_hash, spd_charset_utf8mb3_bin, 32, 0, 0,
       (my_hash_get_key) spider_link_get_key, 0, 0)
   ) {
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -2279,19 +2289,23 @@ bool spider_db_mbase::is_xa_nota_error(
   DBUG_RETURN(xa_nota);
 }
 
-void spider_db_mbase::print_warnings(
+int spider_db_mbase::print_warnings(
   struct tm *l_time
 ) {
+  int error_num = 0;
   DBUG_ENTER("spider_db_mbase::print_warnings");
   DBUG_PRINT("info",("spider this=%p", this));
   if (db_conn->status == MYSQL_STATUS_READY)
   {
+    if (
 #if MYSQL_VERSION_ID < 50500
-    if (!(db_conn->last_used_con->server_status & SERVER_MORE_RESULTS_EXISTS))
+      !(db_conn->last_used_con->server_status & SERVER_MORE_RESULTS_EXISTS) &&
+      db_conn->last_used_con->warning_count
 #else
-    if (!(db_conn->server_status & SERVER_MORE_RESULTS_EXISTS))
+      !(db_conn->server_status & SERVER_MORE_RESULTS_EXISTS) &&
+      db_conn->warning_count
 #endif
-    {
+    ) {
       if (
         spider_param_dry_access() ||
         !mysql_real_query(db_conn, SPIDER_SQL_SHOW_WARNINGS_STR,
@@ -2309,7 +2323,7 @@ void spider_db_mbase::print_warnings(
           {
             if (res)
               mysql_free_result(res);
-            DBUG_VOID_RETURN;
+            DBUG_RETURN(0);
           }
           /* no record is ok */
         }
@@ -2317,24 +2331,40 @@ void spider_db_mbase::print_warnings(
         if (num_fields != 3)
         {
           mysql_free_result(res);
-          DBUG_VOID_RETURN;
+          DBUG_RETURN(0);
         }
-        while (row)
+        if (l_time)
         {
-          fprintf(stderr, "%04d%02d%02d %02d:%02d:%02d [WARN SPIDER RESULT] "
-            "from [%s] %ld to %ld: %s %s %s\n",
-            l_time->tm_year + 1900, l_time->tm_mon + 1, l_time->tm_mday,
-            l_time->tm_hour, l_time->tm_min, l_time->tm_sec,
-            conn->tgt_host, (ulong) db_conn->thread_id,
-            (ulong) current_thd->thread_id, row[0], row[1], row[2]);
-          row = mysql_fetch_row(res);
+          while (row)
+          {
+            fprintf(stderr, "%04d%02d%02d %02d:%02d:%02d [WARN SPIDER RESULT] "
+              "from [%s] %ld to %ld: %s %s %s\n",
+              l_time->tm_year + 1900, l_time->tm_mon + 1, l_time->tm_mday,
+              l_time->tm_hour, l_time->tm_min, l_time->tm_sec,
+              conn->tgt_host, (ulong) db_conn->thread_id,
+              (ulong) current_thd->thread_id, row[0], row[1], row[2]);
+            row = mysql_fetch_row(res);
+          }
+        } else {
+          for (; row; row = mysql_fetch_row(res))
+          {
+            DBUG_PRINT("info",("spider row[0]=%s", row[0]));
+            DBUG_PRINT("info",("spider row[1]=%s", row[1]));
+            DBUG_PRINT("info",("spider row[2]=%s", row[2]));
+            longlong res_num =
+              (longlong) my_strtoll10(row[1], (char**) NULL, &error_num);
+            DBUG_PRINT("info",("spider res_num=%lld", res_num));
+            /* row[2] comes from the remote server: never use it as a format */
+            my_printf_error((int) res_num, "%s", MYF(0), row[2]);
+            error_num = (int) res_num;
+          }
         }
         if (res)
           mysql_free_result(res);
       }
     }
   }
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(error_num);
 }
 
 spider_db_result *spider_db_mbase::store_result(
@@ -2366,6 +2396,7 @@ spider_db_result *spider_db_mbase::store_result(
 }
 
 spider_db_result *spider_db_mbase::use_result(
+  ha_spider *spider,
   st_spider_db_request_key *request_key,
   int *error_num
 ) {
@@ -3333,6 +3364,115 @@ int spider_db_mbase::set_time_zone(
   DBUG_RETURN(0);
 }
 
+bool spider_db_mbase::set_loop_check_in_bulk_sql()  /* Always answers TRUE; reads no connection state. */
+{
+  DBUG_ENTER("spider_db_mbase::set_loop_check_in_bulk_sql");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(TRUE);  /* NOTE(review): presumably lets callers batch the loop-check SETs into bulk SQL - confirm */
+}
+
+int spider_db_mbase::set_loop_check(  /* Drains conn->loop_check_queue, running one statement per entry. */
+  int *need_mon  /* stored in conn->need_mon and forwarded to spider_db_query() */
+) {  /* returns 0 once the queue is empty, HA_ERR_OUT_OF_MEM, or spider_db_errorno(conn) */
+  SPIDER_CONN_LOOP_CHECK *lcptr;
+  char sql_buf[MAX_FIELD_WIDTH];
+  spider_string sql_str(sql_buf, sizeof(sql_buf), &my_charset_bin);
+  DBUG_ENTER("spider_db_mbase::set_loop_check");
+  DBUG_PRINT("info",("spider this=%p", this));
+  sql_str.init_calc_mem(270);
+  while ((lcptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_element(
+    &conn->loop_check_queue, 0)))  /* always index 0: each handled entry is deleted at the loop end */
+  {
+    sql_str.length(0);  /* start every statement from an empty buffer */
+    if (sql_str.reserve(SPIDER_SQL_SET_USER_VAL_LEN +  /* room for all eight pieces appended below */
+      SPIDER_SQL_LOP_CHK_PRM_PRF_LEN + lcptr->to_name.length +
+      SPIDER_SQL_NAME_QUOTE_LEN + SPIDER_SQL_EQUAL_LEN +
+      SPIDER_SQL_VALUE_QUOTE_LEN +
+      lcptr->merged_value.length + SPIDER_SQL_VALUE_QUOTE_LEN))
+    {
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);  /* mutex not taken yet, so nothing to release */
+    }
+    sql_str.q_append(SPIDER_SQL_SET_USER_VAL_STR, SPIDER_SQL_SET_USER_VAL_LEN);
+    sql_str.q_append(SPIDER_SQL_LOP_CHK_PRM_PRF_STR,
+      SPIDER_SQL_LOP_CHK_PRM_PRF_LEN);
+    sql_str.q_append(lcptr->to_name.str, lcptr->to_name.length);
+    sql_str.q_append(SPIDER_SQL_NAME_QUOTE_STR, SPIDER_SQL_NAME_QUOTE_LEN);
+    sql_str.q_append(SPIDER_SQL_EQUAL_STR, SPIDER_SQL_EQUAL_LEN);
+    sql_str.q_append(SPIDER_SQL_VALUE_QUOTE_STR, SPIDER_SQL_VALUE_QUOTE_LEN);
+    sql_str.q_append(lcptr->merged_value.str, lcptr->merged_value.length);
+    sql_str.q_append(SPIDER_SQL_VALUE_QUOTE_STR, SPIDER_SQL_VALUE_QUOTE_LEN);
+    /* Run the statement while holding the connection mutex. */
+    pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
+    pthread_mutex_lock(&conn->mta_conn_mutex);
+    SPIDER_SET_FILE_POS(&conn->mta_conn_mutex_file_pos);
+    conn->need_mon = need_mon;
+    DBUG_ASSERT(!conn->mta_conn_mutex_lock_already);
+    DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
+    conn->mta_conn_mutex_lock_already = TRUE;  /* NOTE(review): presumably tells callees the lock is held - confirm */
+    conn->mta_conn_mutex_unlock_later = TRUE;  /* NOTE(review): presumably defers unlocking to this function - confirm */
+    if (spider_db_query(
+      conn,
+      sql_str.ptr(),
+      sql_str.length(),
+      -1,
+      need_mon)
+    ) {
+      DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+      DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+      conn->mta_conn_mutex_lock_already = FALSE;
+      conn->mta_conn_mutex_unlock_later = FALSE;
+      DBUG_RETURN(spider_db_errorno(conn));  /* no unlock here; NOTE(review): presumably spider_db_errorno() releases it - confirm */
+    }
+    DBUG_ASSERT(conn->mta_conn_mutex_lock_already);
+    DBUG_ASSERT(conn->mta_conn_mutex_unlock_later);
+    conn->mta_conn_mutex_lock_already = FALSE;
+    conn->mta_conn_mutex_unlock_later = FALSE;
+    SPIDER_CLEAR_FILE_POS(&conn->mta_conn_mutex_file_pos);
+    pthread_mutex_unlock(&conn->mta_conn_mutex);
+    /* Query succeeded: remove the entry so the next lookup of index 0 yields another one. */
+#ifdef HASH_UPDATE_WITH_HASH_VALUE
+    my_hash_delete_with_hash_value(&conn->loop_check_queue,
+      lcptr->hash_value, (uchar *) lcptr);
+#else
+    my_hash_delete(&conn->loop_check_queue, (uchar*) lcptr);
+#endif
+  }
+  DBUG_RETURN(0);  /* reached only when my_hash_element() found no entry at index 0 */
+}
+
+int spider_db_mbase::fin_loop_check()  /* Zeroes every entry's flag and empties the queue and both lists; returns 0. */
+{
+  st_spider_conn_loop_check *lcptr;
+  DBUG_ENTER("spider_db_mbase::fin_loop_check");
+  DBUG_PRINT("info",("spider this=%p", this));
+  if (conn->loop_check_queue.records)  /* skip the hash walk when nothing is queued */
+  {
+    uint l = 0;
+    while ((lcptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_element(
+      &conn->loop_check_queue, l)))  /* visit entries by index; none are removed inside the loop */
+    {
+      lcptr->flag = 0;
+      ++l;
+    }
+    my_hash_reset(&conn->loop_check_queue);  /* NOTE(review): presumably empties the hash in one go - confirm entry ownership */
+  }
+  lcptr = conn->loop_check_ignored_first;  /* first of two lists chained through ->next */
+  while (lcptr)
+  {
+    lcptr->flag = 0;
+    lcptr = lcptr->next;
+  }
+  conn->loop_check_ignored_first = NULL;  /* only the head is cleared; the nodes are not freed here */
+  lcptr = conn->loop_check_meraged_first;  /* second list, handled the same way */
+  while (lcptr)
+  {
+    lcptr->flag = 0;
+    lcptr = lcptr->next;
+  }
+  conn->loop_check_meraged_first = NULL;  /* only the head is cleared; the nodes are not freed here */
+  DBUG_RETURN(0);
+}
+
 int spider_db_mbase::exec_simple_sql_with_result(
   SPIDER_TRX *trx,
   SPIDER_SHARE *share,
@@ -3724,7 +3864,7 @@ int spider_db_mbase::append_lock_tables(
   {
     tmp_spider = tmp_link_for_hash->spider;
     tmp_link_idx = tmp_link_for_hash->link_idx;
-    switch (tmp_spider->lock_type)
+    switch (tmp_spider->wide_handler->lock_type)
     {
       case TL_READ:
         lock_type = SPIDER_DB_TABLE_LOCK_READ_LOCAL;
@@ -3740,7 +3880,8 @@ int spider_db_mbase::append_lock_tables(
         break;
       default:
         // no lock
-        DBUG_PRINT("info",("spider lock_type=%d", tmp_spider->lock_type));
+        DBUG_PRINT("info",("spider lock_type=%d",
+          tmp_spider->wide_handler->lock_type));
         DBUG_RETURN(0);
     }
     conn_link_idx = tmp_spider->conn_link_idx[tmp_link_idx];
@@ -5494,6 +5635,47 @@ int spider_db_mbase_util::append_time_zone(
   DBUG_RETURN(0);
 }
 
+int spider_db_mbase_util::append_loop_check(  /* Appends one statement per conn->loop_check_queue entry to str. */
+  spider_string *str,  /* output buffer; existing content is kept and extended */
+  SPIDER_CONN *conn  /* connection whose loop_check_queue is read (entries are not removed) */
+) {  /* returns 0, or HA_ERR_OUT_OF_MEM when str->reserve() fails */
+  SPIDER_CONN_LOOP_CHECK *lcptr;
+  DBUG_ENTER("spider_db_mbase_util::append_loop_check");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_PRINT("info",("spider str=%s", str->c_ptr_safe()));
+  uint l = 0;
+  while ((lcptr = (SPIDER_CONN_LOOP_CHECK *) my_hash_element(
+    &conn->loop_check_queue, l)))  /* walk the queue by index until no element is returned */
+  {
+    DBUG_PRINT("info",("spider lcptr=%p", lcptr));
+    if (str->reserve(SPIDER_SQL_SEMICOLON_LEN + SPIDER_SQL_SET_USER_VAL_LEN +  /* semicolon counted even if not appended */
+      SPIDER_SQL_LOP_CHK_PRM_PRF_LEN + lcptr->to_name.length +
+      SPIDER_SQL_NAME_QUOTE_LEN + SPIDER_SQL_EQUAL_LEN +
+      SPIDER_SQL_VALUE_QUOTE_LEN +
+      lcptr->merged_value.length + SPIDER_SQL_VALUE_QUOTE_LEN))
+    {
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    }
+    if (str->length())  /* separator only when something already precedes this statement */
+    {
+      str->q_append(SPIDER_SQL_SEMICOLON_STR, SPIDER_SQL_SEMICOLON_LEN);
+    }
+    str->q_append(SPIDER_SQL_SET_USER_VAL_STR, SPIDER_SQL_SET_USER_VAL_LEN);
+    str->q_append(SPIDER_SQL_LOP_CHK_PRM_PRF_STR,
+      SPIDER_SQL_LOP_CHK_PRM_PRF_LEN);
+    str->q_append(lcptr->to_name.str, lcptr->to_name.length);
+    str->q_append(SPIDER_SQL_NAME_QUOTE_STR, SPIDER_SQL_NAME_QUOTE_LEN);
+    str->q_append(SPIDER_SQL_EQUAL_STR, SPIDER_SQL_EQUAL_LEN);
+    str->q_append(SPIDER_SQL_VALUE_QUOTE_STR, SPIDER_SQL_VALUE_QUOTE_LEN);
+    str->q_append(lcptr->merged_value.str, lcptr->merged_value.length);
+    str->q_append(SPIDER_SQL_VALUE_QUOTE_STR, SPIDER_SQL_VALUE_QUOTE_LEN);
+    /* Advance to the next queue element; the current one stays in the hash. */
+    ++l;
+    DBUG_PRINT("info",("spider str=%s", str->c_ptr_safe()));
+  }
+  DBUG_RETURN(0);
+}
+
 int spider_db_mbase_util::append_start_transaction(
   spider_string *str
 ) {
@@ -6478,7 +6660,8 @@ int spider_db_mbase_util::open_item_func(
       break;
     case Item_func::FUNC_SP:
     case Item_func::UDF_FUNC:
-      use_pushdown_udf = spider_param_use_pushdown_udf(spider->trx->thd,
+      use_pushdown_udf = spider_param_use_pushdown_udf(
+        spider->wide_handler->trx->thd,
         spider->share->use_pushdown_udf);
       if (!use_pushdown_udf)
         /*
@@ -6707,7 +6890,7 @@ int spider_db_mbase_util::open_item_func(
       last_str_length = SPIDER_SQL_CLOSE_PAREN_LEN;
       break;
     default:
-      THD *thd = spider->trx->thd;
+      THD *thd = spider->wide_handler->trx->thd;
       SPIDER_SHARE *share = spider->share;
       if (spider_param_skip_default_condition(thd,
         share->skip_default_condition))
@@ -7545,6 +7728,13 @@ int spider_db_mbase_util::append_having(
 }
 #endif
 
+bool spider_db_mbase_util::append_charset_name_before_string()  /* Always answers TRUE; takes no input. */
+{
+  DBUG_ENTER("spider_db_mbase_util::append_charset_name_before_string");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(TRUE);  /* NOTE(review): presumably means string literals get a charset-name prefix for this backend - confirm */
+}
+
 spider_mbase_share::spider_mbase_share(
   st_spider_share *share,
   uint dbton_id,
@@ -8411,9 +8601,11 @@ int spider_mbase_share::discover_table_structure(
     if (!conn->disable_reconnect)
     {
       ha_spider tmp_spider;
+      SPIDER_WIDE_HANDLER wide_handler;
       int need_mon = 0;
       uint tmp_conn_link_idx = 0;
-      tmp_spider.trx = trx;
+      tmp_spider.wide_handler = &wide_handler;
+      wide_handler.trx = trx;
       tmp_spider.share = spider_share;
       tmp_spider.need_mons = &need_mon;
       tmp_spider.conn_link_idx = &tmp_conn_link_idx;
@@ -8729,7 +8921,7 @@ spider_mariadb_handler::~spider_mariadb_handler()
 int spider_mbase_handler::init()
 {
   uint roop_count;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   st_spider_share *share = spider->share;
   int init_sql_alloc_size =
     spider_param_init_sql_alloc_size(thd, share->init_sql_alloc_size);
@@ -9165,7 +9357,7 @@ int spider_mbase_handler::append_create_tmp_bka_table(
 ) {
   int error_num;
   SPIDER_SHARE *share = spider->share;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   char *bka_engine = spider_param_bka_engine(thd, share->bka_engine);
   uint bka_engine_length = strlen(bka_engine),
     cset_length = strlen(table_charset->csname),
@@ -9567,10 +9759,10 @@ int spider_mbase_handler::append_insert(
   direct_insert_kind = SPIDER_SQL_DIRECT_INSERT_KIND_INSERT;
   if (
     (
-      spider->write_can_replace ||
+      spider->wide_handler->write_can_replace ||
       /* for direct_dup_insert without patch for partition */
-      spider->sql_command == SQLCOM_REPLACE ||
-      spider->sql_command == SQLCOM_REPLACE_SELECT
+      spider->wide_handler->sql_command == SQLCOM_REPLACE ||
+      spider->wide_handler->sql_command == SQLCOM_REPLACE_SELECT
     ) &&
     spider->direct_dup_insert
   ) {
@@ -9583,13 +9775,13 @@ int spider_mbase_handler::append_insert(
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     str->q_append(SPIDER_SQL_INSERT_STR, SPIDER_SQL_INSERT_LEN);
   }
-  if (spider->low_priority)
+  if (spider->wide_handler->low_priority)
   {
     if (str->reserve(SPIDER_SQL_LOW_PRIORITY_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     str->q_append(SPIDER_SQL_LOW_PRIORITY_STR, SPIDER_SQL_LOW_PRIORITY_LEN);
   }
-  else if (spider->insert_delayed)
+  else if (spider->wide_handler->insert_delayed)
   {
     if (share->internal_delayed)
     {
@@ -9599,28 +9791,28 @@ int spider_mbase_handler::append_insert(
     }
   }
   else if (
-    spider->lock_type >= TL_WRITE &&
-    !spider->write_can_replace &&
+    spider->wide_handler->lock_type >= TL_WRITE &&
+    !spider->wide_handler->write_can_replace &&
     /* for direct_dup_insert without patch for partition */
-    spider->sql_command != SQLCOM_REPLACE &&
-    spider->sql_command != SQLCOM_REPLACE_SELECT
+    spider->wide_handler->sql_command != SQLCOM_REPLACE &&
+    spider->wide_handler->sql_command != SQLCOM_REPLACE_SELECT
   ) {
     if (str->reserve(SPIDER_SQL_HIGH_PRIORITY_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     str->q_append(SPIDER_SQL_HIGH_PRIORITY_STR, SPIDER_SQL_HIGH_PRIORITY_LEN);
   }
   if (
-    spider->ignore_dup_key &&
+    spider->wide_handler->ignore_dup_key &&
     spider->direct_dup_insert &&
-    !spider->write_can_replace &&
+    !spider->wide_handler->write_can_replace &&
 #ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
-    (!spider->insert_with_update || !dup_update_sql.length()) &&
+    (!spider->wide_handler->insert_with_update || !dup_update_sql.length()) &&
 #else
-    !spider->insert_with_update &&
+    !spider->wide_handler->insert_with_update &&
 #endif
     /* for direct_dup_insert without patch for partition */
-    spider->sql_command != SQLCOM_REPLACE &&
-    spider->sql_command != SQLCOM_REPLACE_SELECT
+    spider->wide_handler->sql_command != SQLCOM_REPLACE &&
+    spider->wide_handler->sql_command != SQLCOM_REPLACE_SELECT
   ) {
     direct_insert_kind = SPIDER_SQL_DIRECT_INSERT_KIND_IGNORE;
     if (str->reserve(SPIDER_SQL_SQL_IGNORE_LEN))
@@ -9647,15 +9839,15 @@ int spider_mbase_handler::append_update(
   if (str->reserve(SPIDER_SQL_UPDATE_LEN))
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
   str->q_append(SPIDER_SQL_UPDATE_STR, SPIDER_SQL_UPDATE_LEN);
-  if (spider->low_priority)
+  if (spider->wide_handler->low_priority)
   {
     if (str->reserve(SPIDER_SQL_LOW_PRIORITY_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     str->q_append(SPIDER_SQL_LOW_PRIORITY_STR, SPIDER_SQL_LOW_PRIORITY_LEN);
   }
   if (
-    spider->ignore_dup_key &&
-    !spider->insert_with_update
+    spider->wide_handler->ignore_dup_key &&
+    !spider->wide_handler->insert_with_update
   ) {
     if (str->reserve(SPIDER_SQL_SQL_IGNORE_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -9686,20 +9878,20 @@ int spider_mbase_handler::append_delete(
   if (str->reserve(SPIDER_SQL_DELETE_LEN))
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
   str->q_append(SPIDER_SQL_DELETE_STR, SPIDER_SQL_DELETE_LEN);
-  if (spider->low_priority)
+  if (spider->wide_handler->low_priority)
   {
     if (str->reserve(SPIDER_SQL_LOW_PRIORITY_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     str->q_append(SPIDER_SQL_LOW_PRIORITY_STR, SPIDER_SQL_LOW_PRIORITY_LEN);
   }
-  if (spider->quick_mode)
+  if (spider->wide_handler->quick_mode)
   {
     if (str->reserve(SPIDER_SQL_SQL_QUICK_MODE_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     str->q_append(SPIDER_SQL_SQL_QUICK_MODE_STR,
       SPIDER_SQL_SQL_QUICK_MODE_LEN);
   }
-  if (spider->ignore_dup_key)
+  if (spider->wide_handler->ignore_dup_key)
   {
     if (str->reserve(SPIDER_SQL_SQL_IGNORE_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -9861,7 +10053,7 @@ int spider_mbase_handler::append_direct_update_set(
   DBUG_ENTER("spider_mbase_handler::append_direct_update_set");
   if (
     spider->direct_update_kinds == SPIDER_SQL_KIND_SQL &&
-    spider->direct_update_fields
+    spider->wide_handler->direct_update_fields
   ) {
     if (str->reserve(SPIDER_SQL_SET_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -10004,6 +10196,7 @@ int spider_mbase_handler::append_select(
   ulong sql_type
 ) {
   SPIDER_RESULT_LIST *result_list = &spider->result_list;
+  SPIDER_WIDE_HANDLER *wide_handler = spider->wide_handler;
   DBUG_ENTER("spider_mbase_handler::append_select");
   if (sql_type == SPIDER_SQL_TYPE_HANDLER)
   {
@@ -10020,13 +10213,15 @@ int spider_mbase_handler::append_select(
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
       str->q_append(SPIDER_SQL_DISTINCT_STR, SPIDER_SQL_DISTINCT_LEN);
     }
-    if (result_list->lock_type != F_WRLCK && spider->lock_mode < 1)
+    if (wide_handler->external_lock_type != F_WRLCK &&
+      wide_handler->lock_mode < 1)
     {
       /* no lock */
 #ifdef SPIDER_SQL_CACHE_IS_IN_LEX
-      LEX *lex = spider->trx->thd->lex;
+      LEX *lex = wide_handler->trx->thd->lex;
 #else
-      st_select_lex *select_lex = &spider->trx->thd->lex->select_lex;
+      st_select_lex *select_lex =
+        &wide_handler->trx->thd->lex->select_lex;
 #endif
       if (
 #ifdef SPIDER_SQL_CACHE_IS_IN_LEX
@@ -10064,7 +10259,7 @@ int spider_mbase_handler::append_select(
           SPIDER_SQL_SQL_NO_CACHE_LEN);
       }
     }
-    if (spider->high_priority)
+    if (wide_handler->high_priority)
     {
       if (str->reserve(SPIDER_SQL_HIGH_PRIORITY_LEN))
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -10097,10 +10292,58 @@ int spider_mbase_handler::append_table_select_part(
 int spider_mbase_handler::append_table_select(
   spider_string *str
 ) {
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  st_select_lex *select_lex = NULL;  /* non-NULL selects the per-column path below */
+  bool sgb = (spider->result_list.direct_aggregate &&  /* direct aggregate in use and strict_group_by resolves to 1 */
+    spider_param_strict_group_by(current_thd, (strict_group_by ? 1 : 0)) == 1);
+#endif
   DBUG_ENTER("spider_mbase_handler::append_table_select");
-  table_name_pos = str->length() + mysql_share->table_select_pos;
-  if (str->append(*(mysql_share->table_select)))
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  if (sgb)
+  {
+    select_lex = spider_get_select_lex(spider);
+    JOIN *join = select_lex->join;  /* NOTE(review): select_lex and join are dereferenced unchecked - confirm they cannot be NULL */
+    if (!(*join->sum_funcs) && !select_lex->group_list.elements)  /* no sum function and no GROUP BY item */
+    {
+      select_lex = NULL;  /* fall back to the prebuilt column list */
+    }
+  }
+  if (select_lex)
+  {
+    TABLE *table = spider->get_table();
+    Field **field;
+    int field_length;
+    for (field = table->field; *field; field++)  /* every column of the table, NULL-terminated array */
+    {
+      field_length =
+        mysql_share->column_name_str[(*field)->field_index].length();
+      if (!spider_db_check_select_colum_in_group(select_lex, *field))  /* check failed: wrap the column */
+      {
+        if (str->reserve(SPIDER_SQL_MIN_LEN + SPIDER_SQL_OPEN_PAREN_LEN +  /* includes the comma appended after the if/else */
+          field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+          SPIDER_SQL_CLOSE_PAREN_LEN + SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        str->q_append(SPIDER_SQL_MIN_STR, SPIDER_SQL_MIN_LEN);  /* NOTE(review): presumably the text "min" - confirm */
+        str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN);
+        mysql_share->append_column_name(str, (*field)->field_index);
+        str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN);
+      } else {
+        if (str->reserve(field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+          SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        mysql_share->append_column_name(str, (*field)->field_index);  /* column name emitted without a wrapper */
+      }
+      str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
+    }
+    str->length(str->length() - SPIDER_SQL_COMMA_LEN);  /* drop the comma left after the last column */
+  } else {
+#endif
+    table_name_pos = str->length() + mysql_share->table_select_pos;  /* NOTE(review): only this branch updates table_name_pos */
+    if (str->append(*(mysql_share->table_select)))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  }
+#endif
   DBUG_RETURN(0);
 }
 
 int spider_mbase_handler::append_key_select(
   spider_string *str,
   uint idx
 ) {
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  st_select_lex *select_lex = NULL;  /* non-NULL selects the per-key-part path below */
+  bool sgb = (spider->result_list.direct_aggregate &&  /* direct aggregate in use and strict_group_by resolves to 1 */
+    spider_param_strict_group_by(current_thd, (strict_group_by ? 1 : 0)) == 1);
+#endif
   DBUG_ENTER("spider_mbase_handler::append_key_select");
-  table_name_pos = str->length() + mysql_share->key_select_pos[idx];
-  if (str->append(mysql_share->key_select[idx]))
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  if (sgb)
+  {
+    select_lex = spider_get_select_lex(spider);
+    JOIN *join = select_lex->join;  /* NOTE(review): select_lex and join are dereferenced unchecked - confirm they cannot be NULL */
+    if (!(*join->sum_funcs) && !select_lex->group_list.elements)  /* no sum function and no GROUP BY item */
+    {
+      select_lex = NULL;  /* fall back to the prebuilt key column list */
+    }
+  }
+  if (select_lex)
+  {
+    TABLE *table = spider->get_table();
+    KEY *key_info = &table->key_info[idx];  /* the index whose columns are selected */
+    KEY_PART_INFO *key_part;
+    Field *field;
+    uint part_num;
+    int field_length;
+    for (key_part = key_info->key_part, part_num = 0;
+      part_num < spider_user_defined_key_parts(key_info);
+      key_part++, part_num++)
+    {
+      field = key_part->field;
+      field_length = mysql_share->column_name_str[field->field_index].length();
+      if (!spider_db_check_select_colum_in_group(select_lex, field))  /* check failed: wrap the column */
+      {
+        if (str->reserve(SPIDER_SQL_MIN_LEN + SPIDER_SQL_OPEN_PAREN_LEN +  /* includes the comma appended after the if/else */
+          field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+          SPIDER_SQL_CLOSE_PAREN_LEN + SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        str->q_append(SPIDER_SQL_MIN_STR, SPIDER_SQL_MIN_LEN);  /* NOTE(review): presumably the text "min" - confirm */
+        str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN);
+        mysql_share->append_column_name(str, field->field_index);
+        str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN);
+      } else {
+        if (str->reserve(field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+          SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        mysql_share->append_column_name(str, field->field_index);  /* column name emitted without a wrapper */
+      }
+      str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
+    }
+    str->length(str->length() - SPIDER_SQL_COMMA_LEN);  /* drop the comma left after the last key part */
+  } else {
+#endif
+    table_name_pos = str->length() + mysql_share->key_select_pos[idx];  /* NOTE(review): only this branch updates table_name_pos */
+    if (str->append(mysql_share->key_select[idx]))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  }
+#endif
   DBUG_RETURN(0);
 }
 
@@ -10162,7 +10458,23 @@ int spider_mbase_handler::append_minimum_select(
   Field **field;
   int field_length;
   bool appended = FALSE;
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  st_select_lex *select_lex = NULL;
+  bool sgb = (spider->result_list.direct_aggregate &&
+    spider_param_strict_group_by(current_thd, (strict_group_by ? 1 : 0)) == 1);
+#endif
   DBUG_ENTER("spider_mbase_handler::append_minimum_select");
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  if (sgb)
+  {
+    select_lex = spider_get_select_lex(spider);
+    JOIN *join = select_lex->join;
+    if (!(*join->sum_funcs) && !select_lex->group_list.elements)
+    {
+      select_lex = NULL;
+    }
+  }
+#endif
   minimum_select_bitmap_create();
   for (field = table->field; *field; field++)
   {
@@ -10173,10 +10485,27 @@ int spider_mbase_handler::append_minimum_select(
 */
       field_length =
         mysql_share->column_name_str[(*field)->field_index].length();
-      if (str->reserve(field_length +
-        /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
-        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-      mysql_share->append_column_name(str, (*field)->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+      if (select_lex &&
+        !spider_db_check_select_colum_in_group(select_lex, *field))
+      {
+        if (str->reserve(SPIDER_SQL_MIN_LEN + SPIDER_SQL_OPEN_PAREN_LEN +
+          field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+          SPIDER_SQL_CLOSE_PAREN_LEN + SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        str->q_append(SPIDER_SQL_MIN_STR, SPIDER_SQL_MIN_LEN);
+        str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN);
+        mysql_share->append_column_name(str, (*field)->field_index);
+        str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN);
+      } else {
+#endif
+        if (str->reserve(field_length +
+          /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        mysql_share->append_column_name(str, (*field)->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+      }
+#endif
       str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
       appended = TRUE;
     }
@@ -10199,16 +10528,50 @@ int spider_mbase_handler::append_table_select_with_alias(
   TABLE *table = spider->get_table();
   Field **field;
   int field_length;
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  st_select_lex *select_lex = NULL;
+  bool sgb = (spider->result_list.direct_aggregate &&
+    spider_param_strict_group_by(current_thd, (strict_group_by ? 1 : 0)) == 1);
+#endif
   DBUG_ENTER("spider_mbase_handler::append_table_select_with_alias");
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  if (sgb)
+  {
+    select_lex = spider_get_select_lex(spider);
+    JOIN *join = select_lex->join;
+    if (!(*join->sum_funcs) && !select_lex->group_list.elements)
+    {
+      select_lex = NULL;
+    }
+  }
+#endif
   for (field = table->field; *field; field++)
   {
     field_length =
       mysql_share->column_name_str[(*field)->field_index].length();
-    if (str->reserve(alias_length + field_length +
-      /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
-      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-    str->q_append(alias, alias_length);
-    mysql_share->append_column_name(str, (*field)->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+    if (select_lex &&
+      !spider_db_check_select_colum_in_group(select_lex, *field))
+    {
+      if (str->reserve(SPIDER_SQL_MIN_LEN + SPIDER_SQL_OPEN_PAREN_LEN +
+        alias_length + field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+        SPIDER_SQL_CLOSE_PAREN_LEN + SPIDER_SQL_COMMA_LEN))
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      str->q_append(SPIDER_SQL_MIN_STR, SPIDER_SQL_MIN_LEN);
+      str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN);
+      str->q_append(alias, alias_length);
+      mysql_share->append_column_name(str, (*field)->field_index);
+      str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN);
+    } else {
+#endif
+      if (str->reserve(alias_length + field_length +
+        /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      str->q_append(alias, alias_length);
+      mysql_share->append_column_name(str, (*field)->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+    }
+#endif
     str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
   }
   str->length(str->length() - SPIDER_SQL_COMMA_LEN);
@@ -10225,17 +10588,51 @@ int spider_mbase_handler::append_key_select_with_alias(
   Field *field;
   uint part_num;
   int field_length;
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  st_select_lex *select_lex = NULL;
+  bool sgb = (spider->result_list.direct_aggregate &&
+    spider_param_strict_group_by(current_thd, (strict_group_by ? 1 : 0)) == 1);
+#endif
   DBUG_ENTER("spider_mbase_handler::append_key_select_with_alias");
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  if (sgb)
+  {
+    select_lex = spider_get_select_lex(spider);
+    JOIN *join = select_lex->join;
+    if (!(*join->sum_funcs) && !select_lex->group_list.elements)
+    {
+      select_lex = NULL;
+    }
+  }
+#endif
   for (key_part = key_info->key_part, part_num = 0;
     part_num < spider_user_defined_key_parts(key_info); key_part++, part_num++)
   {
     field = key_part->field;
     field_length = mysql_share->column_name_str[field->field_index].length();
-    if (str->reserve(alias_length + field_length +
-      /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
-      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-    str->q_append(alias, alias_length);
-    mysql_share->append_column_name(str, field->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+    if (select_lex &&
+      !spider_db_check_select_colum_in_group(select_lex, field))
+    {
+      if (str->reserve(SPIDER_SQL_MIN_LEN + SPIDER_SQL_OPEN_PAREN_LEN +
+        alias_length + field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+        SPIDER_SQL_CLOSE_PAREN_LEN + SPIDER_SQL_COMMA_LEN))
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      str->q_append(SPIDER_SQL_MIN_STR, SPIDER_SQL_MIN_LEN);
+      str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN);
+      str->q_append(alias, alias_length);
+      mysql_share->append_column_name(str, field->field_index);
+      str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN);
+    } else {
+#endif
+      if (str->reserve(alias_length + field_length +
+        /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+      str->q_append(alias, alias_length);
+      mysql_share->append_column_name(str, field->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+    }
+#endif
     str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
   }
   str->length(str->length() - SPIDER_SQL_COMMA_LEN);
@@ -10251,7 +10648,23 @@ int spider_mbase_handler::append_minimum_select_with_alias(
   Field **field;
   int field_length;
   bool appended = FALSE;
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  st_select_lex *select_lex = NULL;
+  bool sgb = (spider->result_list.direct_aggregate &&
+    spider_param_strict_group_by(current_thd, (strict_group_by ? 1 : 0)) == 1);
+#endif
   DBUG_ENTER("spider_mbase_handler::append_minimum_select_with_alias");
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+  if (sgb)
+  {
+    select_lex = spider_get_select_lex(spider);
+    JOIN *join = select_lex->join;
+    if (!(*join->sum_funcs) && !select_lex->group_list.elements)
+    {
+      select_lex = NULL;
+    }
+  }
+#endif
   minimum_select_bitmap_create();
   for (field = table->field; *field; field++)
   {
@@ -10262,11 +10675,29 @@ int spider_mbase_handler::append_minimum_select_with_alias(
 */
       field_length =
         mysql_share->column_name_str[(*field)->field_index].length();
-      if (str->reserve(alias_length + field_length +
-        /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
-        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-      str->q_append(alias, alias_length);
-      mysql_share->append_column_name(str, (*field)->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+      if (select_lex &&
+        !spider_db_check_select_colum_in_group(select_lex, *field))
+      {
+        if (str->reserve(SPIDER_SQL_MIN_LEN + SPIDER_SQL_OPEN_PAREN_LEN +
+          alias_length + field_length + /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 +
+          SPIDER_SQL_CLOSE_PAREN_LEN + SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        str->q_append(SPIDER_SQL_MIN_STR, SPIDER_SQL_MIN_LEN);
+        str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN);
+        str->q_append(alias, alias_length);
+        mysql_share->append_column_name(str, (*field)->field_index);
+        str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN);
+      } else {
+#endif
+        if (str->reserve(alias_length + field_length +
+          /* SPIDER_SQL_NAME_QUOTE_LEN */ 2 + SPIDER_SQL_COMMA_LEN))
+          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        str->q_append(alias, alias_length);
+        mysql_share->append_column_name(str, (*field)->field_index);
+#ifdef HANDLER_HAS_DIRECT_AGGREGATE
+      }
+#endif
       str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
       appended = TRUE;
     }
@@ -11062,7 +11493,7 @@ int spider_mbase_handler::append_update_where(
 ) {
   uint field_name_length;
   Field **field;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   SPIDER_SHARE *share = spider->share;
   bool no_pk = (table->s->primary_key == MAX_KEY);
   DBUG_ENTER("spider_mbase_handler::append_update_where");
@@ -11228,7 +11659,7 @@ int spider_mbase_handler::append_condition_part(
         ha_where_pos = str->length();
 
         if (
-          spider->sql_command == SQLCOM_HA_READ ||
+          spider->wide_handler->sql_command == SQLCOM_HA_READ ||
           !spider->result_list.use_both_key
         ) {
           if (sql_part2.length())
@@ -11257,7 +11688,7 @@ int spider_mbase_handler::append_condition(
   ulong sql_type
 ) {
   int error_num, restart_pos = 0, start_where_pos;
-  SPIDER_CONDITION *tmp_cond = spider->condition;
+  SPIDER_CONDITION *tmp_cond = spider->wide_handler->condition;
   DBUG_ENTER("spider_mbase_handler::append_condition");
   if (str && start_where)
   {
@@ -11268,7 +11699,7 @@ int spider_mbase_handler::append_condition(
 
   if (spider->is_clone && !tmp_cond)
   {
-    tmp_cond = spider->pt_clone_source_handler->condition;
+    tmp_cond = spider->pt_clone_source_handler->wide_handler->condition;
   }
 
   while (tmp_cond)
@@ -12135,13 +12566,13 @@ int spider_mbase_handler::append_limit(
     str->q_append(SPIDER_SQL_LIMIT_STR, SPIDER_SQL_LIMIT_LEN);
     if (offset)
     {
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, offset);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, offset);
       str->q_append(buf, length);
       str->q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
     }
-    length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-      &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
+    length = (uint32) (my_charset_bin.longlong10_to_str)(
+      buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
     str->q_append(buf, length);
   }
   DBUG_RETURN(0);
@@ -12740,7 +13171,7 @@ int spider_mbase_handler::append_from(
     str->q_append(SPIDER_SQL_FROM_STR, SPIDER_SQL_FROM_LEN);
     table_name_pos = str->length();
     append_table_name_with_adjusting(str, link_idx, sql_type);
-    if(spider_param_index_hint_pushdown(spider->trx->thd))
+    if(spider_param_index_hint_pushdown(spider->wide_handler->trx->thd))
     {
       if((error_num = append_index_hint(str, link_idx, sql_type)))
       {
@@ -12821,7 +13252,8 @@ int spider_mbase_handler::append_optimize_table(
 ) {
   SPIDER_SHARE *share = spider->share;
   int conn_link_idx = spider->conn_link_idx[link_idx];
-  int local_length = spider_param_internal_optimize_local(spider->trx->thd,
+  int local_length = spider_param_internal_optimize_local(
+    spider->wide_handler->trx->thd,
     share->internal_optimize_local) * SPIDER_SQL_SQL_LOCAL_LEN;
   DBUG_ENTER("spider_mbase_handler::append_optimize_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -12866,7 +13298,8 @@ int spider_mbase_handler::append_analyze_table(
 ) {
   SPIDER_SHARE *share = spider->share;
   int conn_link_idx = spider->conn_link_idx[link_idx];
-  int local_length = spider_param_internal_optimize_local(spider->trx->thd,
+  int local_length = spider_param_internal_optimize_local(
+    spider->wide_handler->trx->thd,
     share->internal_optimize_local) * SPIDER_SQL_SQL_LOCAL_LEN;
   DBUG_ENTER("spider_mbase_handler::append_analyze_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -12913,7 +13346,8 @@ int spider_mbase_handler::append_repair_table(
 ) {
   SPIDER_SHARE *share = spider->share;
   int conn_link_idx = spider->conn_link_idx[link_idx];
-  int local_length = spider_param_internal_optimize_local(spider->trx->thd,
+  int local_length = spider_param_internal_optimize_local(
+    spider->wide_handler->trx->thd,
     share->internal_optimize_local) * SPIDER_SQL_SQL_LOCAL_LEN;
   DBUG_ENTER("spider_mbase_handler::append_repair_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -13123,7 +13557,7 @@ int spider_mbase_handler::append_delete_all_rows(
   int error_num;
   DBUG_ENTER("spider_mbase_handler::append_delete_all_rows");
   DBUG_PRINT("info",("spider this=%p", this));
-  if (spider->sql_command == SQLCOM_TRUNCATE)
+  if (spider->wide_handler->sql_command == SQLCOM_TRUNCATE)
   {
     if ((error_num = append_truncate(str, sql_type, first_link_idx)))
       DBUG_RETURN(error_num);
@@ -13155,8 +13589,8 @@ int spider_mbase_handler::append_truncate(
 }
 
 int spider_mbase_handler::append_explain_select_part(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ulong sql_type,
   int link_idx
 ) {
@@ -13179,8 +13613,8 @@ int spider_mbase_handler::append_explain_select_part(
 
 int spider_mbase_handler::append_explain_select(
   spider_string *str,
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ulong sql_type,
   int link_idx
 ) {
@@ -13429,7 +13863,7 @@ bool spider_mbase_handler::bulk_tmp_table_created()
 
 int spider_mbase_handler::mk_bulk_tmp_table_and_bulk_start()
 {
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   TABLE *table = spider->get_table();
   DBUG_ENTER("spider_mbase_handler::mk_bulk_tmp_table_and_bulk_start");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -13458,7 +13892,8 @@ void spider_mbase_handler::rm_bulk_tmp_table()
   DBUG_PRINT("info",("spider this=%p", this));
   if (upd_tmp_tbl)
   {
-    spider_rm_sys_tmp_table(spider->trx->thd, upd_tmp_tbl, &upd_tmp_tbl_prm);
+    spider_rm_sys_tmp_table(spider->wide_handler->trx->thd, upd_tmp_tbl,
+      &upd_tmp_tbl_prm);
     upd_tmp_tbl = NULL;
   }
   DBUG_VOID_RETURN;
@@ -13532,9 +13967,12 @@ int spider_mbase_handler::append_lock_tables_list(
   spider_db_mbase *db_conn = (spider_db_mbase *) conn->db_conn;
   DBUG_ENTER("spider_mbase_handler::append_lock_tables_list");
   DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_PRINT("info",("spider db_conn=%p", db_conn));
   tmp_link_for_hash2 = &link_for_hash[link_idx];
   tmp_link_for_hash2->db_table_str =
     &mysql_share->db_table_str[conn_link_idx];
+  DBUG_PRINT("info",("spider db_table_str=%s",
+    tmp_link_for_hash2->db_table_str->c_ptr_safe()));
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
   tmp_link_for_hash2->db_table_str_hash_value =
     mysql_share->db_table_str_hash_value[conn_link_idx];
@@ -13555,7 +13993,8 @@ int spider_mbase_handler::append_lock_tables_list(
       DBUG_RETURN(error_num);
     *appended = 1;
   } else {
-    if (tmp_link_for_hash->spider->lock_type < spider->lock_type)
+    if (tmp_link_for_hash->spider->wide_handler->lock_type <
+      spider->wide_handler->lock_type)
     {
 #ifdef HASH_UPDATE_WITH_HASH_VALUE
       my_hash_delete_with_hash_value(
@@ -13595,7 +14034,7 @@ int spider_mbase_handler::append_lock_tables_list(
 int spider_mbase_handler::realloc_sql(
   ulong *realloced
 ) {
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   st_spider_share *share = spider->share;
   int init_sql_alloc_size =
     spider_param_init_sql_alloc_size(thd, share->init_sql_alloc_size);
@@ -14055,7 +14494,8 @@ int spider_mbase_handler::show_table_status(
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
     conn->disable_connect_retry = TRUE;
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -14096,7 +14536,8 @@ int spider_mbase_handler::show_table_status(
           pthread_mutex_unlock(&conn->mta_conn_mutex);
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, link_idx,
+          spider->wide_handler->trx->thd,
           share);
         if (spider_db_query(
           conn,
@@ -14124,8 +14565,8 @@ int spider_mbase_handler::show_table_status(
       }
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = 1;
     request_key.next = NULL;
@@ -14211,7 +14652,8 @@ int spider_mbase_handler::show_table_status(
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
     conn->disable_connect_retry = TRUE;
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -14252,7 +14694,8 @@ int spider_mbase_handler::show_table_status(
           pthread_mutex_unlock(&conn->mta_conn_mutex);
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, link_idx,
+          spider->wide_handler->trx->thd,
           share);
         if (spider_db_query(
           conn,
@@ -14280,8 +14723,8 @@ int spider_mbase_handler::show_table_status(
       }
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = 1;
     request_key.next = NULL;
@@ -14346,6 +14789,10 @@ int spider_mbase_handler::show_table_status(
       DBUG_RETURN(error_num);
     }
   }
+  if ((error_num = ((spider_db_mbase *) conn->db_conn)->print_warnings(NULL)))
+  {
+    DBUG_RETURN(error_num);
+  }
   if (share->static_records_for_status != -1)
   {
     share->stat.records = (ha_rows) share->static_records_for_status;
@@ -14395,7 +14842,8 @@ int spider_mbase_handler::show_index(
     DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -14434,7 +14882,8 @@ int spider_mbase_handler::show_index(
           pthread_mutex_unlock(&conn->mta_conn_mutex);
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, link_idx,
+          spider->wide_handler->trx->thd,
           share);
         if (spider_db_query(
           conn,
@@ -14460,8 +14909,8 @@ int spider_mbase_handler::show_index(
       }
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = 1;
     request_key.next = NULL;
@@ -14543,7 +14992,8 @@ int spider_mbase_handler::show_index(
     DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -14582,7 +15032,8 @@ int spider_mbase_handler::show_index(
           pthread_mutex_unlock(&conn->mta_conn_mutex);
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, link_idx,
+          spider->wide_handler->trx->thd,
           share);
         if (spider_db_query(
           conn,
@@ -14608,8 +15059,8 @@ int spider_mbase_handler::show_index(
       }
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = 1;
     request_key.next = NULL;
@@ -14746,7 +15197,8 @@ int spider_mbase_handler::simple_action(
   DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
   conn->mta_conn_mutex_lock_already = TRUE;
   conn->mta_conn_mutex_unlock_later = TRUE;
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (
     (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -14787,7 +15239,8 @@ int spider_mbase_handler::simple_action(
         DBUG_PRINT("info", ("spider error_num=%d 2", error_num));
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         share);
       if (spider_db_query(
         conn,
@@ -14815,8 +15268,8 @@ int spider_mbase_handler::simple_action(
     }
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = 1;
   request_key.next = NULL;
@@ -14886,7 +15339,7 @@ int spider_mbase_handler::show_records(
     DBUG_PRINT("info", ("spider error_num=%d", error_num));
     DBUG_RETURN(error_num);
   }
-  spider->trx->direct_aggregate_count++;
+  spider->wide_handler->trx->direct_aggregate_count++;
   DBUG_RETURN(0);
 }
 
@@ -14911,8 +15364,8 @@ int spider_mbase_handler::show_last_insert_id(
 }
 
 ha_rows spider_mbase_handler::explain_select(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   int link_idx
 ) {
   int error_num;
@@ -14938,7 +15391,8 @@ ha_rows spider_mbase_handler::explain_select(
   DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
   conn->mta_conn_mutex_lock_already = TRUE;
   conn->mta_conn_mutex_unlock_later = TRUE;
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     spider->share);
   if (
     (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -14981,7 +15435,8 @@ ha_rows spider_mbase_handler::explain_select(
         pthread_mutex_unlock(&conn->mta_conn_mutex);
         DBUG_RETURN(HA_POS_ERROR);
       }
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         spider->share);
       if (spider_db_query(
         conn,
@@ -15014,8 +15469,8 @@ ha_rows spider_mbase_handler::explain_select(
     }
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = 1;
   request_key.next = NULL;
@@ -15095,7 +15550,8 @@ int spider_mbase_handler::lock_tables(
       pthread_mutex_unlock(&conn->mta_conn_mutex);
       DBUG_RETURN(error_num);
     }
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       spider->share);
     if (spider_db_query(
       conn,
@@ -15120,7 +15576,7 @@ int spider_mbase_handler::lock_tables(
   if (!conn->table_locked)
   {
     conn->table_locked = TRUE;
-    spider->trx->locked_connections++;
+    spider->wide_handler->trx->locked_connections++;
   }
   DBUG_RETURN(0);
 }
@@ -15135,7 +15591,7 @@ int spider_mbase_handler::unlock_tables(
   {
     spider_string *str = &sql;
     conn->table_locked = FALSE;
-    spider->trx->locked_connections--;
+    spider->wide_handler->trx->locked_connections--;
 
     str->length(0);
     if ((error_num = conn->db_conn->append_unlock_tables(str)))
@@ -15144,7 +15600,8 @@ int spider_mbase_handler::unlock_tables(
     }
     if (str->length())
     {
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         spider->share);
       pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
       pthread_mutex_lock(&conn->mta_conn_mutex);
@@ -15211,7 +15668,8 @@ int spider_mbase_handler::disable_keys(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -15269,7 +15727,8 @@ int spider_mbase_handler::enable_keys(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -15328,7 +15787,8 @@ int spider_mbase_handler::check_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -15387,7 +15847,8 @@ int spider_mbase_handler::repair_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -15445,7 +15906,8 @@ int spider_mbase_handler::analyze_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -15503,7 +15965,8 @@ int spider_mbase_handler::optimize_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -15544,7 +16007,8 @@ int spider_mbase_handler::flush_tables(
   {
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
   pthread_mutex_lock(&conn->mta_conn_mutex);
@@ -15585,7 +16049,8 @@ int spider_mbase_handler::flush_logs(
   SPIDER_SHARE *share = spider->share;
   DBUG_ENTER("spider_mbase_handler::flush_logs");
   DBUG_PRINT("info",("spider this=%p", this));
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
   pthread_mutex_lock(&conn->mta_conn_mutex);
@@ -15723,16 +16188,17 @@ void spider_mbase_handler::minimum_select_bitmap_create()
       }
     }
   }
-  DBUG_PRINT("info",("spider searched_bitmap=%p", spider->searched_bitmap));
+  DBUG_PRINT("info",("spider searched_bitmap=%p",
+    spider->wide_handler->searched_bitmap));
   for (field_p = table->field; *field_p; field_p++)
   {
     uint field_index = (*field_p)->field_index;
     DBUG_PRINT("info",("spider field_index=%u", field_index));
     DBUG_PRINT("info",("spider ft_discard_bitmap=%s",
-      spider_bit_is_set(spider->ft_discard_bitmap, field_index) ?
+      spider_bit_is_set(spider->wide_handler->ft_discard_bitmap, field_index) ?
         "TRUE" : "FALSE"));
     DBUG_PRINT("info",("spider searched_bitmap=%s",
-      spider_bit_is_set(spider->searched_bitmap, field_index) ?
+      spider_bit_is_set(spider->wide_handler->searched_bitmap, field_index) ?
         "TRUE" : "FALSE"));
     DBUG_PRINT("info",("spider read_set=%s",
       bitmap_is_set(table->read_set, field_index) ?
@@ -15741,10 +16207,10 @@ void spider_mbase_handler::minimum_select_bitmap_create()
       bitmap_is_set(table->write_set, field_index) ?
         "TRUE" : "FALSE"));
     if (
-      spider_bit_is_set(spider->ft_discard_bitmap, field_index) &
+      spider_bit_is_set(spider->wide_handler->ft_discard_bitmap, field_index) &&
       (
-        spider_bit_is_set(spider->searched_bitmap, field_index) |
-        bitmap_is_set(table->read_set, field_index) |
+        spider_bit_is_set(spider->wide_handler->searched_bitmap, field_index) ||
+        bitmap_is_set(table->read_set, field_index) ||
         bitmap_is_set(table->write_set, field_index)
       )
     ) {
@@ -15792,7 +16258,7 @@ int spider_mbase_handler::init_union_table_name_pos()
   if (!union_table_name_pos_first)
   {
     if (!spider_bulk_malloc(spider_current_trx, 236, MYF(MY_WME),
-      &union_table_name_pos_first, sizeof(SPIDER_INT_HLD),
+      &union_table_name_pos_first, (uint) (sizeof(SPIDER_INT_HLD)),
       NullS)
     ) {
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -15813,7 +16279,7 @@ int spider_mbase_handler::set_union_table_name_pos()
     if (!union_table_name_pos_current->next)
     {
       if (!spider_bulk_malloc(spider_current_trx, 237, MYF(MY_WME),
-        &union_table_name_pos_current->next, sizeof(SPIDER_INT_HLD),
+        &union_table_name_pos_current->next, (uint) (sizeof(SPIDER_INT_HLD)),
         NullS)
       ) {
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -16181,6 +16647,28 @@ int spider_mbase_handler::append_order_by(
 }
 #endif
 
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+bool spider_mbase_handler::check_direct_update(
+  st_select_lex *select_lex,
+  longlong select_limit,
+  longlong offset_limit
+) {
+  DBUG_ENTER("spider_mbase_handler::check_direct_update");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(FALSE);
+}
+
+bool spider_mbase_handler::check_direct_delete(
+  st_select_lex *select_lex,
+  longlong select_limit,
+  longlong offset_limit
+) {
+  DBUG_ENTER("spider_mbase_handler::check_direct_delete");
+  DBUG_PRINT("info",("spider this=%p", this));
+  DBUG_RETURN(FALSE);
+}
+#endif
+
 spider_mbase_copy_table::spider_mbase_copy_table(
   spider_mbase_share *db_share
 ) : spider_db_copy_table(
@@ -16514,13 +17002,13 @@ int spider_mbase_copy_table::append_limit(
     sql.q_append(SPIDER_SQL_LIMIT_STR, SPIDER_SQL_LIMIT_LEN);
     if (offset)
     {
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, offset);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, offset);
       sql.q_append(buf, length);
       sql.q_append(SPIDER_SQL_COMMA_STR, SPIDER_SQL_COMMA_LEN);
     }
-    length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-      &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
+    length = (uint32) (my_charset_bin.longlong10_to_str)(
+      buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
     sql.q_append(buf, length);
   }
   DBUG_RETURN(0);
diff --git a/storage/spider/spd_db_mysql.h b/storage/spider/spd_db_mysql.h
index 626bb4d5624..45565154136 100644
--- a/storage/spider/spd_db_mysql.h
+++ b/storage/spider/spd_db_mysql.h
@@ -91,6 +91,10 @@ public:
     spider_string *str,
     Time_zone *time_zone
   );
+  int append_loop_check(
+    spider_string *str,
+    SPIDER_CONN *conn
+  );
   int append_start_transaction(
     spider_string *str
   );
@@ -195,6 +199,7 @@ public:
     spider_string *str
   );
 #endif
+  bool append_charset_name_before_string();
 };
 
 class spider_db_mysql_util: public spider_db_mbase_util
@@ -439,7 +444,7 @@ public:
   bool is_xa_nota_error(
     int error_num
   );
-  void print_warnings(
+  int print_warnings(
     struct tm *l_time
   );
   spider_db_result *store_result(
@@ -448,6 +453,7 @@ public:
     int *error_num
   );
   spider_db_result *use_result(
+    ha_spider *spider,
     st_spider_db_request_key *request_key,
     int *error_num
   );
@@ -529,6 +535,11 @@ public:
     Time_zone *time_zone,
     int *need_mon
   );
+  bool set_loop_check_in_bulk_sql();
+  int set_loop_check(
+    int *need_mon
+  );
+  int fin_loop_check();
   int exec_simple_sql_with_result(
     SPIDER_TRX *trx,
     SPIDER_SHARE *share,
@@ -1381,15 +1392,15 @@ public:
     int link_idx
   );
   int append_explain_select_part(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     ulong sql_type,
     int link_idx
   );
   int append_explain_select(
     spider_string *str,
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     ulong sql_type,
     int link_idx
   );
@@ -1518,8 +1529,8 @@ public:
     ulonglong &last_insert_id
   );
   ha_rows explain_select(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     int link_idx
   );
   int lock_tables(
@@ -1663,6 +1674,18 @@ public:
     spider_fields *fields
   );
 #endif
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  bool check_direct_update(
+    st_select_lex *select_lex,
+    longlong select_limit,
+    longlong offset_limit
+  );
+  bool check_direct_delete(
+    st_select_lex *select_lex,
+    longlong select_limit,
+    longlong offset_limit
+  );
+#endif
 };
 
 class spider_mysql_handler: public spider_mbase_handler
diff --git a/storage/spider/spd_db_oracle.cc b/storage/spider/spd_db_oracle.cc
index b7561ccb96e..e2a7e5941c1 100644
--- a/storage/spider/spd_db_oracle.cc
+++ b/storage/spider/spd_db_oracle.cc
@@ -1,4 +1,5 @@
 /* Copyright (C) 2012-2018 Kentoku Shiba
+   Copyright (c) 2020, MariaDB Corporation.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -48,7 +49,7 @@
 #include "spd_sys_table.h"
 #include "spd_table.h"
 
-extern struct charset_info_st *spd_charset_utf8_bin;
+extern struct charset_info_st *spd_charset_utf8mb3_bin;
 
 extern handlerton *spider_hton_ptr;
 extern pthread_mutex_t spider_open_conn_mutex;
@@ -335,7 +336,10 @@ SPIDER_DBTON spider_dbton_oracle = {
   spider_oracle_create_copy_table,
   spider_oracle_create_conn,
   spider_oracle_support_direct_join,
-  &spider_db_oracle_utility
+  &spider_db_oracle_utility,
+  "For communicating Oracle using native protocol",
+  "1.0.0",
+  SPIDER_MATURITY_BETA
 };
 
 spider_db_oracle_row::spider_db_oracle_row() :
@@ -588,16 +592,16 @@ int spider_db_oracle_row::init()
   if (
     !(ind = (sb2 *)
       spider_bulk_malloc(spider_current_trx, 161, MYF(MY_WME | MY_ZEROFILL),
-        &ind, sizeof(sb2) * field_count,
-        &rlen, sizeof(ub2) * field_count,
-        &coltp, sizeof(ub2) * field_count,
-        &colsz, sizeof(ub2) * field_count,
-        &row_size, sizeof(ulong) * field_count,
-        &val, sizeof(char *) * field_count,
-        &tmp_val, MAX_FIELD_WIDTH * field_count,
-        &defnp, sizeof(OCIDefine *) * field_count,
-        &lobhp, sizeof(OCILobLocator *) * field_count,
-        &colhp, sizeof(OCIParam *) * field_count,
+        &ind, (uint) (sizeof(sb2) * field_count),
+        &rlen, (uint) (sizeof(ub2) * field_count),
+        &coltp, (uint) (sizeof(ub2) * field_count),
+        &colsz, (uint) (sizeof(ub2) * field_count),
+        &row_size, (uint) (sizeof(ulong) * field_count),
+        &val, (uint) (sizeof(char *) * field_count),
+        &tmp_val, (uint) (MAX_FIELD_WIDTH * field_count),
+        &defnp, (uint) (sizeof(OCIDefine *) * field_count),
+        &lobhp, (uint) (sizeof(OCILobLocator *) * field_count),
+        &colhp, (uint) (sizeof(OCIParam *) * field_count),
         NullS)
     ) ||
     !(val_str = new spider_string[field_count])
@@ -1148,7 +1152,7 @@ int spider_db_oracle::init()
   DBUG_ENTER("spider_db_oracle::init");
   DBUG_PRINT("info",("spider this=%p", this));
   if (
-    my_hash_init(&lock_table_hash, spd_charset_utf8_bin, 32, 0, 0,
+    my_hash_init(&lock_table_hash, spd_charset_utf8mb3_bin, 32, 0, 0,
       (my_hash_get_key) spider_link_get_key, 0, 0)
   ) {
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -1699,6 +1703,7 @@ spider_db_result *spider_db_oracle::store_result(
 }
 
 spider_db_result *spider_db_oracle::use_result(
+  ha_spider *spider,
   st_spider_db_request_key *request_key,
   int *error_num
 ) {
@@ -2384,7 +2389,7 @@ int spider_db_oracle::append_lock_tables(
 
     tmp_spider = tmp_link_for_hash->spider;
     tmp_link_idx = tmp_link_for_hash->link_idx;
-    switch (tmp_spider->lock_type)
+    switch (tmp_spider->wide_handler->lock_type)
     {
       case TL_READ:
         lock_type = SPIDER_DB_TABLE_LOCK_READ_LOCAL;
@@ -2400,7 +2405,8 @@ int spider_db_oracle::append_lock_tables(
         break;
       default:
         // no lock
-        DBUG_PRINT("info",("spider lock_type=%d", tmp_spider->lock_type));
+        DBUG_PRINT("info",("spider lock_type=%d",
+          tmp_spider->wide_handler->lock_type));
         DBUG_RETURN(0);
     }
     conn_link_idx = tmp_spider->conn_link_idx[tmp_link_idx];
@@ -4084,7 +4090,8 @@ int spider_db_oracle_util::open_item_func(
       }
       break;
     case Item_func::UDF_FUNC:
-      use_pushdown_udf = spider_param_use_pushdown_udf(spider->trx->thd,
+      use_pushdown_udf = spider_param_use_pushdown_udf(
+        spider->wide_handler->trx->thd,
         spider->share->use_pushdown_udf);
       if (!use_pushdown_udf)
         DBUG_RETURN(ER_SPIDER_COND_SKIP_NUM);
@@ -4225,7 +4232,7 @@ int spider_db_oracle_util::open_item_func(
       DBUG_RETURN(ER_SPIDER_COND_SKIP_NUM);
 #endif
     default:
-      THD *thd = spider->trx->thd;
+      THD *thd = spider->wide_handler->trx->thd;
       SPIDER_SHARE *share = spider->share;
       if (spider_param_skip_default_condition(thd,
         share->skip_default_condition))
@@ -5556,7 +5563,7 @@ spider_oracle_handler::~spider_oracle_handler()
 int spider_oracle_handler::init()
 {
   uint roop_count;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   st_spider_share *share = spider->share;
   int init_sql_alloc_size =
     spider_param_init_sql_alloc_size(thd, share->init_sql_alloc_size);
@@ -5938,7 +5945,7 @@ int spider_oracle_handler::append_create_tmp_bka_table(
 ) {
   int error_num;
   SPIDER_SHARE *share = spider->share;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   char *bka_engine = spider_param_bka_engine(thd, share->bka_engine);
   uint bka_engine_length = strlen(bka_engine),
     cset_length = strlen(table_charset->csname);
@@ -6548,7 +6555,7 @@ int spider_oracle_handler::append_direct_update_set(
   DBUG_ENTER("spider_oracle_handler::append_direct_update_set");
   if (
     spider->direct_update_kinds == SPIDER_SQL_KIND_SQL &&
-    spider->direct_update_fields
+    spider->wide_handler->direct_update_fields
   ) {
     if (str->reserve(SPIDER_SQL_SET_LEN))
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -6656,8 +6663,8 @@ int spider_oracle_handler::append_update_columns(
   uint alias_length
 ) {
   int error_num;
-  List_iterator_fast<Item> fi(*spider->direct_update_fields),
-    vi(*spider->direct_update_values);
+  List_iterator_fast<Item> fi(*spider->wide_handler->direct_update_fields),
+    vi(*spider->wide_handler->direct_update_values);
   Item *field, *value;
   DBUG_ENTER("spider_oracle_handler::append_update_columns");
   while ((field = fi++))
@@ -7687,7 +7694,7 @@ int spider_oracle_handler::append_update_where(
 ) {
   uint field_name_length;
   Field **field;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   SPIDER_SHARE *share = spider->share;
   bool no_pk = (table->s->primary_key == MAX_KEY);
   DBUG_ENTER("spider_oracle_handler::append_update_where");
@@ -9179,20 +9186,20 @@ int spider_oracle_handler::append_limit(
         ((SPIDER_LONGLONG_LEN) * 2)))
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
       str->q_append(SPIDER_SQL_BETWEEN_STR, SPIDER_SQL_BETWEEN_LEN);
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, offset + 1);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, offset + 1);
       str->q_append(buf, length);
       str->q_append(SPIDER_SQL_AND_STR, SPIDER_SQL_AND_LEN);
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, limit + offset);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, limit + offset);
       str->q_append(buf, length);
     } else {
       if (str->reserve(SPIDER_SQL_HS_LTEQUAL_LEN +
         (SPIDER_LONGLONG_LEN)))
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
       str->q_append(SPIDER_SQL_HS_LTEQUAL_STR, SPIDER_SQL_HS_LTEQUAL_LEN);
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
       str->q_append(buf, length);
     }
     if (update_rownum_appended)
@@ -9885,7 +9892,8 @@ int spider_oracle_handler::append_optimize_table(
 ) {
   SPIDER_SHARE *share = spider->share;
   int conn_link_idx = spider->conn_link_idx[link_idx];
-  int local_length = spider_param_internal_optimize_local(spider->trx->thd,
+  int local_length = spider_param_internal_optimize_local(
+    spider->wide_handler->trx->thd,
     share->internal_optimize_local) * SPIDER_SQL_SQL_LOCAL_LEN;
   DBUG_ENTER("spider_oracle_handler::append_optimize_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -9930,7 +9938,8 @@ int spider_oracle_handler::append_analyze_table(
 ) {
   SPIDER_SHARE *share = spider->share;
   int conn_link_idx = spider->conn_link_idx[link_idx];
-  int local_length = spider_param_internal_optimize_local(spider->trx->thd,
+  int local_length = spider_param_internal_optimize_local(
+    spider->wide_handler->trx->thd,
     share->internal_optimize_local) * SPIDER_SQL_SQL_LOCAL_LEN;
   DBUG_ENTER("spider_oracle_handler::append_analyze_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -9977,7 +9986,8 @@ int spider_oracle_handler::append_repair_table(
 ) {
   SPIDER_SHARE *share = spider->share;
   int conn_link_idx = spider->conn_link_idx[link_idx];
-  int local_length = spider_param_internal_optimize_local(spider->trx->thd,
+  int local_length = spider_param_internal_optimize_local(
+    spider->wide_handler->trx->thd,
     share->internal_optimize_local) * SPIDER_SQL_SQL_LOCAL_LEN;
   DBUG_ENTER("spider_oracle_handler::append_repair_table");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -10219,8 +10229,8 @@ int spider_oracle_handler::append_truncate(
 }
 
 int spider_oracle_handler::append_explain_select_part(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ulong sql_type,
   int link_idx
 ) {
@@ -10243,8 +10253,8 @@ int spider_oracle_handler::append_explain_select_part(
 
 int spider_oracle_handler::append_explain_select(
   spider_string *str,
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   ulong sql_type,
   int link_idx
 ) {
@@ -10494,7 +10504,7 @@ bool spider_oracle_handler::bulk_tmp_table_created()
 
 int spider_oracle_handler::mk_bulk_tmp_table_and_bulk_start()
 {
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   TABLE *table = spider->get_table();
   DBUG_ENTER("spider_oracle_handler::mk_bulk_tmp_table_and_bulk_start");
   DBUG_PRINT("info",("spider this=%p", this));
@@ -10523,7 +10533,8 @@ void spider_oracle_handler::rm_bulk_tmp_table()
   DBUG_PRINT("info",("spider this=%p", this));
   if (upd_tmp_tbl)
   {
-    spider_rm_sys_tmp_table(spider->trx->thd, upd_tmp_tbl, &upd_tmp_tbl_prm);
+    spider_rm_sys_tmp_table(spider->wide_handler->trx->thd, upd_tmp_tbl,
+      &upd_tmp_tbl_prm);
     upd_tmp_tbl = NULL;
   }
   DBUG_VOID_RETURN;
@@ -10620,7 +10631,8 @@ int spider_oracle_handler::append_lock_tables_list(
       DBUG_RETURN(error_num);
     *appended = 1;
   } else {
-    if (tmp_link_for_hash->spider->lock_type < spider->lock_type)
+    if (tmp_link_for_hash->spider->wide_handler->lock_type <
+      spider->wide_handler->lock_type)
     {
 #ifdef HASH_UPDATE_WITH_HASH_VALUE
       my_hash_delete_with_hash_value(
@@ -10660,7 +10672,7 @@ int spider_oracle_handler::append_lock_tables_list(
 int spider_oracle_handler::realloc_sql(
   ulong *realloced
 ) {
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   st_spider_share *share = spider->share;
   int init_sql_alloc_size =
     spider_param_init_sql_alloc_size(thd, share->init_sql_alloc_size);
@@ -11211,7 +11223,8 @@ int spider_oracle_handler::show_table_status(
     DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -11250,7 +11263,8 @@ int spider_oracle_handler::show_table_status(
           pthread_mutex_unlock(&conn->mta_conn_mutex);
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, link_idx,
+          spider->wide_handler->trx->thd,
           share);
         if (spider_db_query(
           conn,
@@ -11276,8 +11290,8 @@ int spider_oracle_handler::show_table_status(
       }
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = 1;
     request_key.next = NULL;
@@ -11361,7 +11375,8 @@ int spider_oracle_handler::show_index(
     DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
     conn->mta_conn_mutex_lock_already = TRUE;
     conn->mta_conn_mutex_unlock_later = TRUE;
-    spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+    spider_conn_set_timeout_from_share(conn, link_idx,
+      spider->wide_handler->trx->thd,
       share);
     if (
       (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -11400,7 +11415,8 @@ int spider_oracle_handler::show_index(
           pthread_mutex_unlock(&conn->mta_conn_mutex);
           DBUG_RETURN(error_num);
         }
-        spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+        spider_conn_set_timeout_from_share(conn, link_idx,
+          spider->wide_handler->trx->thd,
           share);
         if (spider_db_query(
           conn,
@@ -11426,8 +11442,8 @@ int spider_oracle_handler::show_index(
       }
     }
     st_spider_db_request_key request_key;
-    request_key.spider_thread_id = spider->trx->spider_thread_id;
-    request_key.query_id = spider->trx->thd->query_id;
+    request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+    request_key.query_id = spider->wide_handler->trx->thd->query_id;
     request_key.handler = spider;
     request_key.request_id = 1;
     request_key.next = NULL;
@@ -11500,7 +11516,8 @@ int spider_oracle_handler::show_records(
   DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
   conn->mta_conn_mutex_lock_already = TRUE;
   conn->mta_conn_mutex_unlock_later = TRUE;
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (
     (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -11541,7 +11558,8 @@ int spider_oracle_handler::show_records(
         DBUG_PRINT("info", ("spider error_num=%d 2", error_num));
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         share);
       if (spider_db_query(
         conn,
@@ -11569,8 +11587,8 @@ int spider_oracle_handler::show_records(
     }
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = 1;
   request_key.next = NULL;
@@ -11614,7 +11632,7 @@ int spider_oracle_handler::show_records(
     DBUG_PRINT("info", ("spider error_num=%d 7", error_num));
     DBUG_RETURN(error_num);
   }
-  spider->trx->direct_aggregate_count++;
+  spider->wide_handler->trx->direct_aggregate_count++;
   DBUG_RETURN(0);
 }
 
@@ -11639,7 +11657,8 @@ int spider_oracle_handler::show_autoinc(
   DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
   conn->mta_conn_mutex_lock_already = TRUE;
   conn->mta_conn_mutex_unlock_later = TRUE;
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (
     (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -11680,7 +11699,8 @@ int spider_oracle_handler::show_autoinc(
         DBUG_PRINT("info", ("spider error_num=%d 2", error_num));
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         share);
       if (spider_db_query(
         conn,
@@ -11708,8 +11728,8 @@ int spider_oracle_handler::show_autoinc(
     }
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = 1;
   request_key.next = NULL;
@@ -11795,8 +11815,8 @@ int spider_oracle_handler::show_last_insert_id(
     DBUG_RETURN(error_num);
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = 1;
   request_key.next = NULL;
@@ -11828,8 +11848,8 @@ int spider_oracle_handler::show_last_insert_id(
 }
 
 ha_rows spider_oracle_handler::explain_select(
-  key_range *start_key,
-  key_range *end_key,
+  const key_range *start_key,
+  const key_range *end_key,
   int link_idx
 ) {
   int error_num;
@@ -11855,7 +11875,8 @@ ha_rows spider_oracle_handler::explain_select(
   DBUG_ASSERT(!conn->mta_conn_mutex_unlock_later);
   conn->mta_conn_mutex_lock_already = TRUE;
   conn->mta_conn_mutex_unlock_later = TRUE;
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     spider->share);
   if (
     (error_num = spider_db_set_names(spider, conn, link_idx)) ||
@@ -11898,7 +11919,8 @@ ha_rows spider_oracle_handler::explain_select(
         pthread_mutex_unlock(&conn->mta_conn_mutex);
         DBUG_RETURN(HA_POS_ERROR);
       }
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         spider->share);
       if (spider_db_query(
         conn,
@@ -11931,8 +11953,8 @@ ha_rows spider_oracle_handler::explain_select(
     }
   }
   st_spider_db_request_key request_key;
-  request_key.spider_thread_id = spider->trx->spider_thread_id;
-  request_key.query_id = spider->trx->thd->query_id;
+  request_key.spider_thread_id = spider->wide_handler->trx->spider_thread_id;
+  request_key.query_id = spider->wide_handler->trx->thd->query_id;
   request_key.handler = spider;
   request_key.request_id = 1;
   request_key.next = NULL;
@@ -12013,7 +12035,8 @@ int spider_oracle_handler::lock_tables(
         pthread_mutex_unlock(&conn->mta_conn_mutex);
         DBUG_RETURN(error_num);
       }
-      spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+      spider_conn_set_timeout_from_share(conn, link_idx,
+        spider->wide_handler->trx->thd,
         spider->share);
       if (spider_db_query(
         conn,
@@ -12038,7 +12061,7 @@ int spider_oracle_handler::lock_tables(
     if (!conn->table_locked)
     {
       conn->table_locked = TRUE;
-      spider->trx->locked_connections++;
+      spider->wide_handler->trx->locked_connections++;
     }
   } while (str->length());
   DBUG_RETURN(0);
@@ -12093,7 +12116,8 @@ int spider_oracle_handler::disable_keys(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -12151,7 +12175,8 @@ int spider_oracle_handler::enable_keys(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -12210,7 +12235,8 @@ int spider_oracle_handler::check_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -12269,7 +12295,8 @@ int spider_oracle_handler::repair_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -12327,7 +12354,8 @@ int spider_oracle_handler::analyze_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -12385,7 +12413,8 @@ int spider_oracle_handler::optimize_table(
     pthread_mutex_unlock(&conn->mta_conn_mutex);
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   if (spider_db_query(
     conn,
@@ -12426,7 +12455,8 @@ int spider_oracle_handler::flush_tables(
   {
     DBUG_RETURN(error_num);
   }
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
   pthread_mutex_lock(&conn->mta_conn_mutex);
@@ -12467,7 +12497,8 @@ int spider_oracle_handler::flush_logs(
   SPIDER_SHARE *share = spider->share;
   DBUG_ENTER("spider_oracle_handler::flush_logs");
   DBUG_PRINT("info",("spider this=%p", this));
-  spider_conn_set_timeout_from_share(conn, link_idx, spider->trx->thd,
+  spider_conn_set_timeout_from_share(conn, link_idx,
+    spider->wide_handler->trx->thd,
     share);
   pthread_mutex_assert_not_owner(&conn->mta_conn_mutex);
   pthread_mutex_lock(&conn->mta_conn_mutex);
@@ -12608,8 +12639,8 @@ void spider_oracle_handler::minimum_select_bitmap_create()
   {
     uint field_index = (*field_p)->field_index;
     if (
-      spider_bit_is_set(spider->searched_bitmap, field_index) |
-      bitmap_is_set(table->read_set, field_index) |
+      spider_bit_is_set(spider->searched_bitmap, field_index) ||
+      bitmap_is_set(table->read_set, field_index) ||
       bitmap_is_set(table->write_set, field_index)
     ) {
       spider_set_bit(minimum_select_bitmap, field_index);
@@ -12655,7 +12686,7 @@ int spider_oracle_handler::init_union_table_name_pos()
   if (!union_table_name_pos_first)
   {
     if (!spider_bulk_malloc(spider_current_trx, 238, MYF(MY_WME),
-      &union_table_name_pos_first, sizeof(SPIDER_INT_HLD),
+      &union_table_name_pos_first, (uint) (sizeof(SPIDER_INT_HLD)),
       NullS)
     ) {
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -12676,7 +12707,7 @@ int spider_oracle_handler::set_union_table_name_pos()
     if (!union_table_name_pos_current->next)
     {
       if (!spider_bulk_malloc(spider_current_trx, 239, MYF(MY_WME),
-        &union_table_name_pos_current->next, sizeof(SPIDER_INT_HLD),
+        &union_table_name_pos_current->next, (uint) (sizeof(SPIDER_INT_HLD)),
         NullS)
       ) {
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
@@ -13460,20 +13491,20 @@ int spider_oracle_copy_table::append_limit(
         ((SPIDER_LONGLONG_LEN) * 2)))
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
       sql.q_append(SPIDER_SQL_BETWEEN_STR, SPIDER_SQL_BETWEEN_LEN);
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, offset);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, offset);
       sql.q_append(buf, length);
       sql.q_append(SPIDER_SQL_AND_STR, SPIDER_SQL_AND_LEN);
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
       sql.q_append(buf, length);
     } else {
       if (sql.reserve(SPIDER_SQL_HS_LTEQUAL_LEN +
         (SPIDER_LONGLONG_LEN)))
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
       sql.q_append(SPIDER_SQL_HS_LTEQUAL_STR, SPIDER_SQL_HS_LTEQUAL_LEN);
-      length = (uint32) (my_charset_bin.cset->longlong10_to_str)(
-        &my_charset_bin, buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
+      length = (uint32) (my_charset_bin.longlong10_to_str)(
+        buf, SPIDER_LONGLONG_LEN + 1, -10, limit);
       sql.q_append(buf, length);
     }
   }
diff --git a/storage/spider/spd_db_oracle.h b/storage/spider/spd_db_oracle.h
index ebdc23a9bfa..c8c1522599f 100644
--- a/storage/spider/spd_db_oracle.h
+++ b/storage/spider/spd_db_oracle.h
@@ -380,6 +380,7 @@ public:
     int *error_num
   );
   spider_db_result *use_result(
+    ha_spider *spider,
     st_spider_db_request_key *request_key,
     int *error_num
   );
@@ -1266,15 +1267,15 @@ public:
     int link_idx
   );
   int append_explain_select_part(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     ulong sql_type,
     int link_idx
   );
   int append_explain_select(
     spider_string *str,
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     ulong sql_type,
     int link_idx
   );
@@ -1397,8 +1398,8 @@ public:
     ulonglong &last_insert_id
   );
   ha_rows explain_select(
-    key_range *start_key,
-    key_range *end_key,
+    const key_range *start_key,
+    const key_range *end_key,
     int link_idx
   );
   int lock_tables(
diff --git a/storage/spider/spd_direct_sql.cc b/storage/spider/spd_direct_sql.cc
index 09f23046455..1486cbece9e 100644
--- a/storage/spider/spd_direct_sql.cc
+++ b/storage/spider/spd_direct_sql.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2009-2018 Kentoku Shiba
+/* Copyright (C) 2009-2020 Kentoku Shiba
+   Copyright (C) 2019-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -117,31 +118,32 @@ int spider_udf_direct_sql_create_table_list(
 #if MYSQL_VERSION_ID < 50500
   if (!(direct_sql->db_names = (char**)
     spider_bulk_malloc(spider_current_trx, 31, MYF(MY_WME | MY_ZEROFILL),
-      &direct_sql->db_names, sizeof(char*) * table_count,
-      &direct_sql->table_names, sizeof(char*) * table_count,
-      &direct_sql->tables, sizeof(TABLE*) * table_count,
-      &tmp_name_ptr, sizeof(char) * (
+      &direct_sql->db_names, (uint) (sizeof(char*) * table_count),
+      &direct_sql->table_names, (uint) (sizeof(char*) * table_count),
+      &direct_sql->tables, (uint) (sizeof(TABLE*) * table_count),
+      &tmp_name_ptr, (uint) (sizeof(char) * (
         table_name_list_length +
         thd->db_length * table_count +
         2 * table_count
-      ),
-      &direct_sql->iop, sizeof(int) * table_count,
+      )),
+      &direct_sql->iop, (uint) (sizeof(int) * table_count),
       NullS))
   )
 #else
   if (!(direct_sql->db_names = (char**)
     spider_bulk_malloc(spider_current_trx, 31, MYF(MY_WME | MY_ZEROFILL),
-      &direct_sql->db_names, sizeof(char*) * table_count,
-      &direct_sql->table_names, sizeof(char*) * table_count,
-      &direct_sql->tables, sizeof(TABLE*) * table_count,
-      &tmp_name_ptr, sizeof(char) * (
+      &direct_sql->db_names, (uint) (sizeof(char*) * table_count),
+      &direct_sql->table_names, (uint) (sizeof(char*) * table_count),
+      &direct_sql->tables, (uint) (sizeof(TABLE*) * table_count),
+      &tmp_name_ptr, (uint) (sizeof(char) * (
         table_name_list_length +
         SPIDER_THD_db_length(thd) * table_count +
         2 * table_count
-      ),
-      &direct_sql->iop, sizeof(int) * table_count,
-      &direct_sql->table_list, sizeof(TABLE_LIST) * table_count,
-      &direct_sql->real_table_bitmap, sizeof(uchar) * ((table_count + 7) / 8),
+      )),
+      &direct_sql->iop, (uint) (sizeof(int) * table_count),
+      &direct_sql->table_list, (uint) (sizeof(TABLE_LIST) * table_count),
+      &direct_sql->real_table_bitmap,
+        (uint) (sizeof(uchar) * ((table_count + 7) / 8)),
       NullS))
   )
 #endif
@@ -204,17 +206,79 @@ int spider_udf_direct_sql_create_conn_key(
   char *tmp_name, port_str[6];
   DBUG_ENTER("spider_udf_direct_sql_create_conn_key");
 
-  /* tgt_db not use */
+  uint roop_count2;
+  bool tables_on_different_db_are_joinable = TRUE;
+  direct_sql->dbton_id = SPIDER_DBTON_SIZE;
+  DBUG_PRINT("info",("spider direct_sql->tgt_wrapper=%s",
+    direct_sql->tgt_wrapper));
+  for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
+  {
+    DBUG_PRINT("info",("spider spider_dbton[%d].wrapper=%s", roop_count2,
+      spider_dbton[roop_count2].wrapper ?
+        spider_dbton[roop_count2].wrapper : "NULL"));
+    if (
+      spider_dbton[roop_count2].wrapper &&
+      !strcmp(direct_sql->tgt_wrapper, spider_dbton[roop_count2].wrapper)
+    ) {
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+      if (direct_sql->access_mode == 0)
+      {
+#endif
+        if (spider_dbton[roop_count2].db_access_type ==
+          SPIDER_DB_ACCESS_TYPE_SQL)
+        {
+          direct_sql->dbton_id = roop_count2;
+          break;
+        }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+      } else {
+        if (spider_dbton[roop_count2].db_access_type ==
+          SPIDER_DB_ACCESS_TYPE_NOSQL)
+        {
+          direct_sql->dbton_id = roop_count2;
+          break;
+        }
+      }
+#endif
+    }
+  }
+  if (direct_sql->dbton_id == SPIDER_DBTON_SIZE)
+  {
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+    if (direct_sql->access_mode == 0)
+    {
+#endif
+      my_printf_error(
+        ER_SPIDER_SQL_WRAPPER_IS_INVALID_NUM,
+        ER_SPIDER_SQL_WRAPPER_IS_INVALID_STR,
+        MYF(0), direct_sql->tgt_wrapper);
+      DBUG_RETURN(ER_SPIDER_SQL_WRAPPER_IS_INVALID_NUM);
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+    } else {
+      my_printf_error(
+        ER_SPIDER_NOSQL_WRAPPER_IS_INVALID_NUM,
+        ER_SPIDER_NOSQL_WRAPPER_IS_INVALID_STR,
+        MYF(0), direct_sql->tgt_wrapper);
+      DBUG_RETURN(ER_SPIDER_NOSQL_WRAPPER_IS_INVALID_NUM);
+    }
+#endif
+  }
+
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   if (direct_sql->access_mode == 0)
   {
 #endif
+    tables_on_different_db_are_joinable =
+      spider_dbton[direct_sql->dbton_id].db_util->
+        tables_on_different_db_are_joinable();
     direct_sql->conn_key_length
       = 1
       + direct_sql->tgt_wrapper_length + 1
       + direct_sql->tgt_host_length + 1
       + 5 + 1
       + direct_sql->tgt_socket_length + 1
+      + (tables_on_different_db_are_joinable ?
+        0 : direct_sql->tgt_default_db_name_length + 1)
       + direct_sql->tgt_username_length + 1
       + direct_sql->tgt_password_length + 1
       + direct_sql->tgt_ssl_ca_length + 1
@@ -224,7 +288,8 @@ int spider_udf_direct_sql_create_conn_key(
       + direct_sql->tgt_ssl_key_length + 1
       + 1 + 1
       + direct_sql->tgt_default_file_length + 1
-      + direct_sql->tgt_default_group_length;
+      + direct_sql->tgt_default_group_length + 1
+      + direct_sql->tgt_dsn_length;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   } else {
     direct_sql->conn_key_length
@@ -261,6 +326,16 @@ int spider_udf_direct_sql_create_conn_key(
   if (direct_sql->access_mode == 0)
   {
 #endif
+    if (!tables_on_different_db_are_joinable)
+    {
+      if (direct_sql->tgt_default_db_name)
+      {
+        DBUG_PRINT("info",("spider tgt_default_db_name=%s",
+          direct_sql->tgt_default_db_name));
+        tmp_name = strmov(tmp_name + 1, direct_sql->tgt_default_db_name);
+      } else
+        tmp_name++;
+    }
     if (direct_sql->tgt_username)
     {
       DBUG_PRINT("info",("spider tgt_username=%s", direct_sql->tgt_username));
@@ -321,65 +396,16 @@ int spider_udf_direct_sql_create_conn_key(
       tmp_name = strmov(tmp_name + 1, direct_sql->tgt_default_group);
     } else
       tmp_name++;
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-  }
-#endif
-  uint roop_count2;
-  direct_sql->dbton_id = SPIDER_DBTON_SIZE;
-  DBUG_PRINT("info",("spider direct_sql->tgt_wrapper=%s",
-    direct_sql->tgt_wrapper));
-  for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
-  {
-    DBUG_PRINT("info",("spider spider_dbton[%d].wrapper=%s", roop_count2,
-      spider_dbton[roop_count2].wrapper ?
-        spider_dbton[roop_count2].wrapper : "NULL"));
-    if (
-      spider_dbton[roop_count2].wrapper &&
-      !strcmp(direct_sql->tgt_wrapper, spider_dbton[roop_count2].wrapper)
-    ) {
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-      if (direct_sql->access_mode == 0)
-      {
-#endif
-        if (spider_dbton[roop_count2].db_access_type ==
-          SPIDER_DB_ACCESS_TYPE_SQL)
-        {
-          direct_sql->dbton_id = roop_count2;
-          break;
-        }
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-      } else {
-        if (spider_dbton[roop_count2].db_access_type ==
-          SPIDER_DB_ACCESS_TYPE_NOSQL)
-        {
-          direct_sql->dbton_id = roop_count2;
-          break;
-        }
-      }
-#endif
-    }
-  }
-  if (direct_sql->dbton_id == SPIDER_DBTON_SIZE)
-  {
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-    if (direct_sql->access_mode == 0)
+    if (direct_sql->tgt_dsn)
     {
-#endif
-      my_printf_error(
-        ER_SPIDER_SQL_WRAPPER_IS_INVALID_NUM,
-        ER_SPIDER_SQL_WRAPPER_IS_INVALID_STR,
-        MYF(0), direct_sql->tgt_wrapper);
-      DBUG_RETURN(ER_SPIDER_SQL_WRAPPER_IS_INVALID_NUM);
+      DBUG_PRINT("info",("spider tgt_dsn=%s",
+        direct_sql->tgt_dsn));
+      tmp_name = strmov(tmp_name + 1, direct_sql->tgt_dsn);
+    } else
+      tmp_name++;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-    } else {
-      my_printf_error(
-        ER_SPIDER_NOSQL_WRAPPER_IS_INVALID_NUM,
-        ER_SPIDER_NOSQL_WRAPPER_IS_INVALID_STR,
-        MYF(0), direct_sql->tgt_wrapper);
-      DBUG_RETURN(ER_SPIDER_NOSQL_WRAPPER_IS_INVALID_NUM);
-    }
-#endif
   }
+#endif
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
   direct_sql->conn_key_hash_value = my_calc_hash(&spider_open_connections,
     (uchar*) direct_sql->conn_key, direct_sql->conn_key_length);
@@ -394,9 +420,11 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
   SPIDER_CONN *conn;
   SPIDER_IP_PORT_CONN *ip_port_conn;
   char *tmp_name, *tmp_host, *tmp_username, *tmp_password, *tmp_socket;
-  char *tmp_wrapper, *tmp_ssl_ca, *tmp_ssl_capath, *tmp_ssl_cert;
+  char *tmp_wrapper, *tmp_db, *tmp_ssl_ca, *tmp_ssl_capath, *tmp_ssl_cert;
   char *tmp_ssl_cipher, *tmp_ssl_key, *tmp_default_file, *tmp_default_group;
+  char *tmp_dsn;
   int *need_mon;
+  bool tables_on_different_db_are_joinable = TRUE;
   DBUG_ENTER("spider_udf_direct_sql_create_conn");
 
   if (unlikely(!UTC))
@@ -410,25 +438,32 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
   if (direct_sql->access_mode == 0)
   {
 #endif
+    tables_on_different_db_are_joinable =
+      spider_dbton[direct_sql->dbton_id].db_util->
+        tables_on_different_db_are_joinable();
     if (!(conn = (SPIDER_CONN *)
       spider_bulk_malloc(spider_current_trx, 32, MYF(MY_WME | MY_ZEROFILL),
-        &conn, sizeof(*conn),
-        &tmp_name, direct_sql->conn_key_length + 1,
-        &tmp_host, direct_sql->tgt_host_length + 1,
-        &tmp_username, direct_sql->tgt_username_length + 1,
-        &tmp_password, direct_sql->tgt_password_length + 1,
-        &tmp_socket, direct_sql->tgt_socket_length + 1,
-        &tmp_wrapper, direct_sql->tgt_wrapper_length + 1,
-        &tmp_ssl_ca, direct_sql->tgt_ssl_ca_length + 1,
-        &tmp_ssl_capath, direct_sql->tgt_ssl_capath_length + 1,
-        &tmp_ssl_cert, direct_sql->tgt_ssl_cert_length + 1,
-        &tmp_ssl_cipher, direct_sql->tgt_ssl_cipher_length + 1,
-        &tmp_ssl_key, direct_sql->tgt_ssl_key_length + 1,
+        &conn, (uint) (sizeof(*conn)),
+        &tmp_name, (uint) (direct_sql->conn_key_length + 1),
+        &tmp_host, (uint) (direct_sql->tgt_host_length + 1),
+        &tmp_username, (uint) (direct_sql->tgt_username_length + 1),
+        &tmp_password, (uint) (direct_sql->tgt_password_length + 1),
+        &tmp_socket, (uint) (direct_sql->tgt_socket_length + 1),
+        &tmp_wrapper, (uint) (direct_sql->tgt_wrapper_length + 1),
+        &tmp_db, (uint) (tables_on_different_db_are_joinable ?
+          0 : direct_sql->tgt_default_db_name_length + 1),
+        &tmp_ssl_ca, (uint) (direct_sql->tgt_ssl_ca_length + 1),
+        &tmp_ssl_capath, (uint) (direct_sql->tgt_ssl_capath_length + 1),
+        &tmp_ssl_cert, (uint) (direct_sql->tgt_ssl_cert_length + 1),
+        &tmp_ssl_cipher, (uint) (direct_sql->tgt_ssl_cipher_length + 1),
+        &tmp_ssl_key, (uint) (direct_sql->tgt_ssl_key_length + 1),
         &tmp_default_file,
-          direct_sql->tgt_default_file_length + 1,
+          (uint) (direct_sql->tgt_default_file_length + 1),
         &tmp_default_group,
-          direct_sql->tgt_default_group_length + 1,
-        &need_mon, sizeof(int),
+          (uint) (direct_sql->tgt_default_group_length + 1),
+        &tmp_dsn,
+          (uint) (direct_sql->tgt_dsn_length + 1),
+        &need_mon, (uint) (sizeof(int)),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -439,12 +474,12 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
   } else {
     if (!(conn = (SPIDER_CONN *)
       spider_bulk_malloc(spider_current_trx, 33, MYF(MY_WME | MY_ZEROFILL),
-        &conn, sizeof(*conn),
-        &tmp_name, direct_sql->conn_key_length + 1,
-        &tmp_host, direct_sql->tgt_host_length + 1,
-        &tmp_socket, direct_sql->tgt_socket_length + 1,
-        &tmp_wrapper, direct_sql->tgt_wrapper_length + 1,
-        &need_mon, sizeof(int),
+        &conn, (uint) (sizeof(*conn)),
+        &tmp_name, (uint) (direct_sql->conn_key_length + 1),
+        &tmp_host, (uint) (direct_sql->tgt_host_length + 1),
+        &tmp_socket, (uint) (direct_sql->tgt_socket_length + 1),
+        &tmp_wrapper, (uint) (direct_sql->tgt_wrapper_length + 1),
+        &need_mon, (uint) (sizeof(int)),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -473,6 +508,13 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
     conn->tgt_socket = tmp_socket;
     memcpy(conn->tgt_socket, direct_sql->tgt_socket,
       direct_sql->tgt_socket_length);
+    if (!tables_on_different_db_are_joinable)
+    {
+      conn->tgt_db_length = direct_sql->tgt_default_db_name_length;
+      conn->tgt_db = tmp_db;
+      memcpy(conn->tgt_db, direct_sql->tgt_default_db_name,
+        direct_sql->tgt_default_db_name_length);
+    }
     conn->tgt_username_length = direct_sql->tgt_username_length;
     conn->tgt_username = tmp_username;
     memcpy(conn->tgt_username, direct_sql->tgt_username,
@@ -537,6 +579,14 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
         direct_sql->tgt_default_group_length);
     } else
       conn->tgt_default_group = NULL;
+    conn->tgt_dsn_length = direct_sql->tgt_dsn_length;
+    if (conn->tgt_dsn_length)
+    {
+      conn->tgt_dsn = tmp_dsn;
+      memcpy(conn->tgt_dsn, direct_sql->tgt_dsn,
+        direct_sql->tgt_dsn_length);
+    } else
+      conn->tgt_dsn = NULL;
     conn->tgt_ssl_vsc = direct_sql->tgt_ssl_vsc;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   } else {
@@ -593,6 +643,11 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
     goto error_mta_conn_mutex_init;
   }
 
+  if (unlikely((*error_num = spider_conn_init(conn))))
+  {
+    goto error_conn_init;
+  }
+
   if ((*error_num = spider_db_udf_direct_sql_connect(direct_sql, conn)))
     goto error;
   conn->ping_time = (time_t) time((time_t*) 0);
@@ -646,8 +701,10 @@ SPIDER_CONN *spider_udf_direct_sql_create_conn(
 
 error:
   DBUG_ASSERT(!conn->mta_conn_mutex_file_pos.file_name);
-  pthread_mutex_destroy(&conn->mta_conn_mutex);
 error_too_many_ipport_count:
+  spider_conn_done(conn);
+error_conn_init:
+  pthread_mutex_destroy(&conn->mta_conn_mutex);
 error_mta_conn_mutex_init:
 error_db_conn_init:
   delete conn->db_conn;
@@ -1199,6 +1256,7 @@ int spider_udf_parse_direct_sql_param(
         SPIDER_PARAM_INT("cto", connect_timeout, 0);
         SPIDER_PARAM_STR("dff", tgt_default_file);
         SPIDER_PARAM_STR("dfg", tgt_default_group);
+        SPIDER_PARAM_STR("dsn", tgt_dsn);
         SPIDER_PARAM_LONGLONG("prt", priority, 0);
         SPIDER_PARAM_INT("rto", net_read_timeout, 0);
         SPIDER_PARAM_STR("sca", tgt_ssl_ca);
@@ -1328,6 +1386,10 @@ int spider_udf_set_direct_sql_param_default(
   SPIDER_TRX *trx,
   SPIDER_DIRECT_SQL *direct_sql
 ) {
+  bool check_socket;
+  bool check_database;
+  bool socket_has_default_value;
+  bool database_has_default_value;
   int error_num, roop_count;
   DBUG_ENTER("spider_udf_set_direct_sql_param_default");
   if (direct_sql->server_name)
@@ -1336,8 +1398,66 @@ int spider_udf_set_direct_sql_param_default(
       DBUG_RETURN(error_num);
   }
 
+  if (
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+    direct_sql->access_mode == 0 &&
+#endif
+    !direct_sql->tgt_socket &&
+    (!direct_sql->tgt_host || !strcmp(direct_sql->tgt_host, my_localhost))
+  ) {
+    check_socket = TRUE;
+  } else {
+    check_socket = FALSE;
+  }
   if (!direct_sql->tgt_default_db_name)
   {
+    check_database = TRUE;
+  } else {
+    check_database = FALSE;
+  }
+  if (check_socket || check_database)
+  {
+    socket_has_default_value = check_socket;
+    database_has_default_value = check_database;
+    if (direct_sql->tgt_wrapper)
+    {
+      for (roop_count = 0; roop_count < SPIDER_DBTON_SIZE; roop_count++)
+      {
+        DBUG_PRINT("info",("spider direct_sql->tgt_wrapper=%s",
+          direct_sql->tgt_wrapper));
+        DBUG_PRINT("info",("spider spider_dbton[%d].wrapper=%s", roop_count,
+          spider_dbton[roop_count].wrapper ?
+            spider_dbton[roop_count].wrapper : "NULL"));
+        if (
+          spider_dbton[roop_count].wrapper &&
+          !strcmp(direct_sql->tgt_wrapper,
+            spider_dbton[roop_count].wrapper)
+        ) {
+          if (spider_dbton[roop_count].db_access_type ==
+            SPIDER_DB_ACCESS_TYPE_SQL)
+          {
+            if (check_socket)
+            {
+              socket_has_default_value = spider_dbton[roop_count].
+                db_util->socket_has_default_value();
+            }
+            if (check_database)
+            {
+              database_has_default_value = spider_dbton[roop_count].
+                db_util->database_has_default_value();
+            }
+            break;
+          }
+        }
+      }
+    }
+  } else {
+    socket_has_default_value = FALSE;
+    database_has_default_value = FALSE;
+  }
+
+  if (database_has_default_value)
+  {
     DBUG_PRINT("info",("spider create default tgt_default_db_name"));
     direct_sql->tgt_default_db_name_length = SPIDER_THD_db_length(trx->thd);
     if (
@@ -1432,13 +1552,8 @@ int spider_udf_set_direct_sql_param_default(
   if (direct_sql->tgt_ssl_vsc == -1)
     direct_sql->tgt_ssl_vsc = 0;
 
-  if (
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-    direct_sql->access_mode == 0 &&
-#endif
-    !direct_sql->tgt_socket &&
-    !strcmp(direct_sql->tgt_host, my_localhost)
-  ) {
+  if (socket_has_default_value)
+  {
     DBUG_PRINT("info",("spider create default tgt_socket"));
     direct_sql->tgt_socket_length = strlen((char *) MYSQL_UNIX_ADDR);
     if (
@@ -1565,6 +1680,10 @@ void spider_udf_free_direct_sql_alloc(
   {
     spider_free(spider_current_trx, direct_sql->tgt_default_group, MYF(0));
   }
+  if (direct_sql->tgt_dsn)
+  {
+    spider_free(spider_current_trx, direct_sql->tgt_dsn, MYF(0));
+  }
   if (direct_sql->conn_key)
   {
     spider_free(spider_current_trx, direct_sql->conn_key, MYF(0));
@@ -1601,8 +1720,8 @@ long long spider_direct_sql_body(
   SPIDER_BACKUP_DASTATUS;
   if (!(direct_sql = (SPIDER_DIRECT_SQL *)
     spider_bulk_malloc(spider_current_trx, 34, MYF(MY_WME | MY_ZEROFILL),
-      &direct_sql, sizeof(SPIDER_DIRECT_SQL),
-      &sql, sizeof(char) * args->lengths[0],
+      &direct_sql, (uint) (sizeof(SPIDER_DIRECT_SQL)),
+      &sql, (uint) (sizeof(char) * args->lengths[0]),
       NullS))
   ) {
     error_num = HA_ERR_OUT_OF_MEM;
@@ -1748,7 +1867,7 @@ long long spider_direct_sql_body(
         SPIDER_TABLE_LIST_table_name_length(&table_list),
         SPIDER_TABLE_LIST_table_name_str(&table_list), TL_WRITE);
 #endif
-      tables->mdl_request.init(MDL_key::TABLE,
+      MDL_REQUEST_INIT(&tables->mdl_request, MDL_key::TABLE,
         SPIDER_TABLE_LIST_db_str(&table_list),
         SPIDER_TABLE_LIST_table_name_str(&table_list),
         MDL_SHARED_WRITE, MDL_TRANSACTION);
@@ -1810,7 +1929,7 @@ long long spider_direct_sql_body(
       if (conn->bg_init)
         pthread_mutex_unlock(&conn->bg_conn_mutex);
       if (direct_sql->modified_non_trans_table)
-        thd->transaction.stmt.modified_non_trans_table = TRUE;
+        thd->transaction->stmt.modified_non_trans_table = TRUE;
       if (error_num == HA_ERR_OUT_OF_MEM)
         my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
       goto error;
@@ -1818,7 +1937,7 @@ long long spider_direct_sql_body(
     if (conn->bg_init)
       pthread_mutex_unlock(&conn->bg_conn_mutex);
     if (direct_sql->modified_non_trans_table)
-      thd->transaction.stmt.modified_non_trans_table = TRUE;
+      thd->transaction->stmt.modified_non_trans_table = TRUE;
 #ifndef WITHOUT_SPIDER_BG_SEARCH
   }
   if (!bg)
@@ -1927,7 +2046,7 @@ void spider_direct_sql_deinit_body(
     if (bg_direct_sql->modified_non_trans_table)
     {
       THD *thd = current_thd;
-      thd->transaction.stmt.modified_non_trans_table = TRUE;
+      thd->transaction->stmt.modified_non_trans_table = TRUE;
     }
     pthread_cond_destroy(&bg_direct_sql->bg_cond);
     pthread_mutex_destroy(&bg_direct_sql->bg_mutex);
@@ -1958,7 +2077,7 @@ long long spider_direct_sql_bg_end(
     pthread_cond_wait(&bg_direct_sql->bg_cond, &bg_direct_sql->bg_mutex);
   pthread_mutex_unlock(&bg_direct_sql->bg_mutex);
   if (bg_direct_sql->modified_non_trans_table)
-    thd->transaction.stmt.modified_non_trans_table = TRUE;
+    thd->transaction->stmt.modified_non_trans_table = TRUE;
   if (bg_direct_sql->bg_error)
   {
     my_message(bg_direct_sql->bg_error, bg_direct_sql->bg_error_msg, MYF(0));
diff --git a/storage/spider/spd_environ.h b/storage/spider/spd_environ.h
index 728cc7e1781..320b0edeb30 100644
--- a/storage/spider/spd_environ.h
+++ b/storage/spider/spd_environ.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 2008-2018 Kentoku Shiba & 2017 MariaDB corp
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2017-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -23,12 +24,12 @@
 #define SPIDER_HANDLER_START_BULK_INSERT_HAS_FLAGS
 #endif
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >=	100100
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100100
 #define SPIDER_SUPPORT_CREATE_OR_REPLACE_TABLE
 #define SPIDER_NET_HAS_THD
 #endif
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >=	100211
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100211
 #define HANDLER_HAS_TOP_TABLE_FIELDS
 #define HANDLER_HAS_DIRECT_UPDATE_ROWS
 #define HANDLER_HAS_DIRECT_AGGREGATE
@@ -39,18 +40,22 @@
 #define HANDLER_HAS_CAN_USE_FOR_AUTO_INC_INIT
 #endif
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >=	100300
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100300
 #define SPIDER_UPDATE_ROW_HAS_CONST_NEW_DATA
 #endif
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >=	100309
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100309
 #define SPIDER_MDEV_16246
 #endif
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >=	100400
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100400
 #define SPIDER_USE_CONST_ITEM_FOR_STRING_INT_REAL_DECIMAL_DATE_ITEM
 #define SPIDER_SQL_CACHE_IS_IN_LEX
 #define SPIDER_LIKE_FUNC_HAS_GET_NEGATED
 #define HA_HAS_CHECKSUM_EXTENDED
 #endif
+
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100500
+#define SPIDER_I_S_USE_SHOW_FOR_COLUMN /* tested by spd_i_s.cc to select the Column()/CEnd() form of its ST_FIELD_INFO tables */
+#endif
 #endif /* SPD_ENVIRON_INCLUDED */
diff --git a/storage/spider/spd_err.h b/storage/spider/spd_err.h
index 9889fcfa7fb..60b2a084714 100644
--- a/storage/spider/spd_err.h
+++ b/storage/spider/spd_err.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 2008-2017 Kentoku Shiba
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2019-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -120,13 +121,19 @@
 #define ER_SPIDER_ORACLE_STR "Error from Oracle %d %d %s"
 #define ER_SPIDER_ORACLE_NUM 12712
 #define ER_SPIDER_ORACLE_ERR "Oracle error"
+#define ER_SPIDER_DATASOURCE_STR "Error from %s %d %s %s" /* format arguments, in order: string, int, string, string */
+#define ER_SPIDER_DATASOURCE_NUM 12712 /* NOTE(review): same value as ER_SPIDER_ORACLE_NUM above -- confirm the shared code is intended */
 #define ER_SPIDER_CON_COUNT_ERROR 12713
 #define ER_SPIDER_CON_COUNT_ERROR_STR "Too many connections between spider and remote"
 #define ER_SPIDER_TABLE_OPEN_TIMEOUT_NUM 12714
 #define ER_SPIDER_TABLE_OPEN_TIMEOUT_STR "Table %s.%s open timeout"
+#define ER_SPIDER_INFINITE_LOOP_NUM 12719
+#define ER_SPIDER_INFINITE_LOOP_STR "An infinite loop is detected when opening table %s.%s" /* two string arguments; presumably database and table name -- confirm at the call site */
 #define ER_SPIDER_SAME_SERVER_LINK_NUM 12720
 #define ER_SPIDER_SAME_SERVER_LINK_STR1 "Host:%s and Socket:%s aim self server. Please change spider_same_server_link parameter if this link is required."
 #define ER_SPIDER_SAME_SERVER_LINK_STR2 "Host:%s and Port:%ld aim self server. Please change spider_same_server_link parameter if this link is required."
+#define ER_SPIDER_CANT_NUM 12721
+#define ER_SPIDER_CANT_STR1 "Can't %s%d" /* a string immediately followed by an int; the format has no separator between them */
 #define ER_SPIDER_COND_SKIP_NUM 12801
 
 #define ER_SPIDER_UNKNOWN_NUM 12500
diff --git a/storage/spider/spd_group_by_handler.cc b/storage/spider/spd_group_by_handler.cc
index de041897239..30d81b5a4ae 100644
--- a/storage/spider/spd_group_by_handler.cc
+++ b/storage/spider/spd_group_by_handler.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2008-2018 Kentoku Shiba
+/* Copyright (C) 2008-2019 Kentoku Shiba
+   Copyright (C) 2019 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -637,9 +638,9 @@ SPIDER_CONN_HOLDER *spider_fields::create_conn_holder(
   DBUG_PRINT("info",("spider this=%p", this));
   return_conn_holder = (SPIDER_CONN_HOLDER *)
     spider_bulk_malloc(spider_current_trx, 252, MYF(MY_WME | MY_ZEROFILL),
-      &return_conn_holder, sizeof(SPIDER_CONN_HOLDER),
+      &return_conn_holder, (uint) (sizeof(SPIDER_CONN_HOLDER)),
       &table_link_idx_holder,
-        table_count * sizeof(SPIDER_TABLE_LINK_IDX_HOLDER),
+        (uint) (table_count * sizeof(SPIDER_TABLE_LINK_IDX_HOLDER)),
       NullS
     );
   if (!return_conn_holder)
@@ -794,7 +795,7 @@ void spider_fields::choose_a_conn(
   SPIDER_CONN_HOLDER *conn_holder;
   longlong balance_total = 0, balance_val;
   double rand_val;
-  THD *thd = table_holder[0].spider->trx->thd;
+  THD *thd = table_holder[0].spider->wide_handler->trx->thd;
   DBUG_ENTER("spider_fields::choose_a_conn");
   DBUG_PRINT("info",("spider this=%p", this));
   for (current_conn_holder = first_conn_holder; current_conn_holder;
@@ -1146,8 +1147,8 @@ int spider_fields::ping_table_mon_from_table(
     if (tmp_share->monitoring_kind[tmp_link_idx])
     {
       error_num_buf = spider_ping_table_mon_from_table(
-          tmp_spider->trx,
-          tmp_spider->trx->thd,
+          tmp_spider->wide_handler->trx,
+          tmp_spider->wide_handler->trx->thd,
           tmp_share,
           tmp_link_idx,
           (uint32) tmp_share->monitoring_sid[tmp_link_idx],
@@ -1180,7 +1181,7 @@ spider_group_by_handler::spider_group_by_handler(
   fields->set_pos_to_first_table_holder();
   SPIDER_TABLE_HOLDER *table_holder = fields->get_next_table_holder();
   spider = table_holder->spider;
-  trx = spider->trx;
+  trx = spider->wide_handler->trx;
   DBUG_VOID_RETURN;
 }
 
@@ -1938,6 +1939,12 @@ group_by_handler *spider_create_group_by_handler(
     delete fields;
     DBUG_RETURN(NULL);
   }
+  if (spider->dml_init())
+  {
+    DBUG_PRINT("info",("spider can not init for dml"));
+    delete fields;
+    DBUG_RETURN(NULL);
+  }
   for (
     roop_count = spider_conn_link_idx_next(share->link_statuses,
       spider->conn_link_idx, -1, share->link_count,
@@ -2020,6 +2027,12 @@ group_by_handler *spider_create_group_by_handler(
     }
     DBUG_PRINT("info",("spider s->db=%s", from->table->s->db.str));
     DBUG_PRINT("info",("spider s->table_name=%s", from->table->s->table_name.str));
+    if (spider->dml_init())
+    {
+      DBUG_PRINT("info",("spider can not init for dml"));
+      delete fields;
+      DBUG_RETURN(NULL);
+    }
     for (
       roop_count = spider_conn_link_idx_next(share->link_statuses,
         spider->conn_link_idx, -1, share->link_count,
diff --git a/storage/spider/spd_i_s.cc b/storage/spider/spd_i_s.cc
index c43c666601f..24000f3e6c1 100644
--- a/storage/spider/spd_i_s.cc
+++ b/storage/spider/spd_i_s.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2018 Kentoku Shiba
+/* Copyright (C) 2012-2020 Kentoku Shiba
+   Copyright (C) 2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -44,6 +45,21 @@ extern ulonglong  spider_free_mem_count[SPIDER_MEM_CALC_LIST_NUM];
 static struct st_mysql_storage_engine spider_i_s_info =
 { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION };
 
+namespace Show {
+#ifdef SPIDER_I_S_USE_SHOW_FOR_COLUMN
+static ST_FIELD_INFO spider_i_s_alloc_mem_fields_info[] = // column list in Column()/CEnd() form; compiled when SPIDER_I_S_USE_SHOW_FOR_COLUMN is defined
+{
+  Column("ID",                ULong(10),     NOT_NULL, "id"), // the only NOT_NULL column of this table
+  Column("FUNC_NAME",         Varchar(64),   NULLABLE, "func_name"),
+  Column("FILE_NAME",         Varchar(64),   NULLABLE, "file_name"),
+  Column("LINE_NO",           ULong(10),     NULLABLE, "line_no"),
+  Column("TOTAL_ALLOC_MEM",   ULonglong(20), NULLABLE, "total_alloc_mem"),
+  Column("CURRENT_ALLOC_MEM", SLonglong(20), NULLABLE, "current_alloc_mem"), // the only signed (SLonglong) column
+  Column("ALLOC_MEM_COUNT",   ULonglong(20), NULLABLE, "alloc_mem_count"),
+  Column("FREE_MEM_COUNT",    ULonglong(20), NULLABLE, "free_mem_count"),
+  CEnd() // closes the list; the #else table ends with an all-NULL row instead
+};
+#else
 static ST_FIELD_INFO spider_i_s_alloc_mem_fields_info[] =
 {
   {"ID", 10, MYSQL_TYPE_LONG, 0, MY_I_S_UNSIGNED, "id", SKIP_OPEN_TABLE},
@@ -63,6 +79,8 @@ static ST_FIELD_INFO spider_i_s_alloc_mem_fields_info[] =
     MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL, "free_mem_count", SKIP_OPEN_TABLE},
   {NULL, 0,  MYSQL_TYPE_STRING, 0, 0, NULL, 0}
 };
+#endif
+} // namespace Show
 
 static int spider_i_s_alloc_mem_fill_table(
   THD *thd,
@@ -117,7 +135,7 @@ static int spider_i_s_alloc_mem_init(
 ) {
   ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p;
   DBUG_ENTER("spider_i_s_alloc_mem_init");
-  schema->fields_info = spider_i_s_alloc_mem_fields_info;
+  schema->fields_info = Show::spider_i_s_alloc_mem_fields_info;
   schema->fill_table = spider_i_s_alloc_mem_fill_table;
   schema->idx_field1 = 0;
   DBUG_RETURN(0);
@@ -167,3 +185,127 @@ struct st_maria_plugin spider_i_s_alloc_mem_maria =
   MariaDB_PLUGIN_MATURITY_STABLE,
 };
 #endif
+
+extern SPIDER_DBTON spider_dbton[SPIDER_DBTON_SIZE];
+
+namespace Show { // both variants live in Show so the init function can always name Show::spider_i_s_wrapper_protocols_fields_info
+#ifdef SPIDER_I_S_USE_SHOW_FOR_COLUMN
+static ST_FIELD_INFO spider_i_s_wrapper_protocols_fields_info[] = // four columns; the fill function stores them by index 0..3 in this order
+{
+  Column("WRAPPER_NAME",        Varchar(NAME_CHAR_LEN), NOT_NULL, ""),
+  Column("WRAPPER_VERSION",     Varchar(20),            NOT_NULL, ""),
+  Column("WRAPPER_DESCRIPTION", Longtext(65535),        NULLABLE, ""), // the only NULLABLE column
+  Column("WRAPPER_MATURITY",    Varchar(12),            NOT_NULL, ""),
+  CEnd()
+};
+#else
+static ST_FIELD_INFO spider_i_s_wrapper_protocols_fields_info[] = // brace-initialized form of the same four columns
+{
+  {"WRAPPER_NAME", NAME_CHAR_LEN, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE},
+  {"WRAPPER_VERSION", 20, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE},
+  {"WRAPPER_DESCRIPTION", 65535, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE}, // NOTE(review): the literal 1 sits where the alloc_mem table puts its MY_I_S_* flags; presumably MY_I_S_MAYBE_NULL -- confirm
+  {"WRAPPER_MATURITY", 12, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE},
+  {0, 0,  MYSQL_TYPE_STRING, 0, 0, 0, 0} // terminating row with a null name
+};
+#endif
+} // namespace Show
+
+static int spider_i_s_wrapper_protocols_fill_table( // fill_table callback: stores one row for every spider_dbton slot that has a wrapper name
+  THD *thd, // only forwarded to schema_table_store_record()
+  TABLE_LIST *tables, // rows are written into tables->table
+  COND *cond // not referenced in this function
+) { // returns 0 after all slots were visited, 1 if storing a row reported non-zero
+  uint roop_count;
+  SPIDER_DBTON *dbton;
+  TABLE *table = tables->table;
+  DBUG_ENTER("spider_i_s_wrapper_protocols_fill_table");
+  for (roop_count = 0; roop_count < SPIDER_DBTON_SIZE; roop_count++)
+  {
+    dbton = &spider_dbton[roop_count];
+    if (!dbton->wrapper) // a slot with no wrapper name produces no row
+    {
+      continue;
+    }
+    table->field[0]->store(dbton->wrapper, // field 0 <- wrapper name
+      strlen(dbton->wrapper), system_charset_info);
+    table->field[1]->store(dbton->version_info, // field 1 <- version string; NOTE(review): passed to strlen() unchecked, assumes it is never NULL -- confirm
+      strlen(dbton->version_info), system_charset_info);
+    if (dbton->descr)
+    {
+      table->field[2]->set_notnull(); // mark field 2 non-NULL before storing the description
+      table->field[2]->store(dbton->descr,
+        strlen(dbton->descr), system_charset_info);
+    } else {
+      table->field[2]->set_null(); // no description: field 2 is left NULL for this row
+    }
+    if (dbton->maturity <= SPIDER_MATURITY_STABLE)
+    {
+      table->field[3]->store(maturity_name[dbton->maturity].str, // field 3 <- maturity_name entry indexed by the maturity value
+        maturity_name[dbton->maturity].length, system_charset_info);
+    } else {
+      table->field[3]->store(maturity_name[0].str, // values above SPIDER_MATURITY_STABLE fall back to entry 0
+        maturity_name[0].length, system_charset_info);
+    }
+    if (schema_table_store_record(thd, table)) // a non-zero result stops the fill immediately
+    {
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+static int spider_i_s_wrapper_protocols_init( // wires the column list and fill callback into the schema table; always returns 0
+  void *p // cast to ST_SCHEMA_TABLE * below
+) {
+  ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p;
+  DBUG_ENTER("spider_i_s_wrapper_protocols_init");
+  schema->fields_info = Show::spider_i_s_wrapper_protocols_fields_info; // Show:: holds whichever variant the preprocessor selected
+  schema->fill_table = spider_i_s_wrapper_protocols_fill_table;
+  schema->idx_field1 = 0; // same value spider_i_s_alloc_mem_init assigns
+  DBUG_RETURN(0);
+}
+
+static int spider_i_s_wrapper_protocols_deinit( // counterpart of the init function; does no work and returns 0
+  void *p // not referenced
+) {
+  DBUG_ENTER("spider_i_s_wrapper_protocols_deinit");
+  DBUG_RETURN(0);
+}
+
+struct st_mysql_plugin spider_i_s_wrapper_protocols = // st_mysql_plugin descriptor for the SPIDER_WRAPPER_PROTOCOLS information-schema plugin
+{
+  MYSQL_INFORMATION_SCHEMA_PLUGIN,
+  &spider_i_s_info, // file-level st_mysql_storage_engine object defined earlier in this file
+  "SPIDER_WRAPPER_PROTOCOLS",
+  "Kentoku Shiba, MariaDB Corp",
+  "Available wrapper protocols of Spider",
+  PLUGIN_LICENSE_GPL,
+  spider_i_s_wrapper_protocols_init,
+  spider_i_s_wrapper_protocols_deinit,
+  0x0001, // NOTE(review): presumably the plugin version slot; differs from the 0x0100 in the st_maria_plugin descriptor -- confirm
+  NULL,
+  NULL,
+  NULL,
+#if MYSQL_VERSION_ID >= 50600
+  0, // extra trailing member, only initialized for MYSQL_VERSION_ID >= 50600
+#endif
+};
+
+#ifdef MARIADB_BASE_VERSION
+struct st_maria_plugin spider_i_s_wrapper_protocols_maria = // st_maria_plugin descriptor for the same plugin; only compiled for MariaDB builds
+{
+  MYSQL_INFORMATION_SCHEMA_PLUGIN,
+  &spider_i_s_info,
+  "SPIDER_WRAPPER_PROTOCOLS", // same name, author and description strings as the st_mysql_plugin descriptor
+  "Kentoku Shiba, MariaDB Corp",
+  "Available wrapper protocols of Spider",
+  PLUGIN_LICENSE_GPL,
+  spider_i_s_wrapper_protocols_init,
+  spider_i_s_wrapper_protocols_deinit,
+  0x0100, // NOTE(review): presumably the numeric plugin version -- confirm against st_maria_plugin
+  NULL,
+  NULL,
+  "1.0", // NOTE(review): presumably the version string shown to users -- confirm
+  MariaDB_PLUGIN_MATURITY_STABLE,
+};
+#endif
diff --git a/storage/spider/spd_include.h b/storage/spider/spd_include.h
index 95064fa4c72..46d0fb1ce96 100644
--- a/storage/spider/spd_include.h
+++ b/storage/spider/spd_include.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2019-2022 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -232,7 +232,7 @@ const char SPIDER_empty_string = "";
 #define SPIDER_HAS_HASH_VALUE_TYPE
 #endif
 
-#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >=	100400
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100400
 #define SPIDER_date_mode_t(A) date_mode_t(A)
 #define SPIDER_str_to_datetime(A,B,C,D,E) str_to_datetime_or_date(A,B,C,D,E)
 #define SPIDER_get_linkage(A) A->get_linkage()
@@ -242,6 +242,33 @@ const char SPIDER_empty_string = "";
 #define SPIDER_get_linkage(A) A->linkage
 #endif
 
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100500
+typedef start_new_trans *SPIDER_Open_tables_backup; // a pointer type in this branch; the other two branches alias an object type
+#elif MYSQL_VERSION_ID < 50500
+typedef Open_tables_state SPIDER_Open_tables_backup;
+#else
+typedef Open_tables_backup SPIDER_Open_tables_backup;
+#endif
+
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100500 /* this variant stores a new start_new_trans(A) in *(B) and may leave the calling function through DBUG_RETURN(C); restore calls restore_old_transaction() and deletes *(B) */
+#define SPIDER_reset_n_backup_open_tables_state(A,B,C) do { \
+  if (!(*(B) = new start_new_trans(A))) \
+  { \
+    DBUG_RETURN(C); \
+  } \
+} while (0)
+#define SPIDER_restore_backup_open_tables_state(A,B) do { \
+  (*(B))->restore_old_transaction(); \
+  delete *(B); \
+} while (0)
+#define SPIDER_sys_close_thread_tables(A) (A)->commit_whole_transaction_and_close_tables()
+#else /* fallback variant: plain forwarding to members of A and to close_thread_tables(); argument C is ignored */
+#define SPIDER_REQUIRE_DEFINE_FOR_SECONDARY_OPEN_TABLES_BACKUP
+#define SPIDER_reset_n_backup_open_tables_state(A,B,C) (A)->reset_n_backup_open_tables_state(B)
+#define SPIDER_restore_backup_open_tables_state(A,B) (A)->restore_backup_open_tables_state(B)
+#define SPIDER_sys_close_thread_tables(A) close_thread_tables(A)
+#endif
+
 #define spider_bitmap_size(A) ((A + 7) / 8)
 #define spider_set_bit(BITMAP, BIT) \
   ((BITMAP)[(BIT) / 8] |= (1 << ((BIT) & 7)))
@@ -260,12 +287,12 @@ const char SPIDER_empty_string = "";
 #define SPIDER_LINK_MON_DRAW_FEW_MON         1
 #define SPIDER_LINK_MON_DRAW                 2
 
-#define SPIDER_TMP_SHARE_CHAR_PTR_COUNT     20
-#define SPIDER_TMP_SHARE_UINT_COUNT         17
-#define SPIDER_TMP_SHARE_LONG_COUNT         19
+#define SPIDER_TMP_SHARE_CHAR_PTR_COUNT     21
+#define SPIDER_TMP_SHARE_UINT_COUNT         SPIDER_TMP_SHARE_CHAR_PTR_COUNT /* defined in terms of the char* count rather than as its own literal */
+#define SPIDER_TMP_SHARE_LONG_COUNT         20
 #define SPIDER_TMP_SHARE_LONGLONG_COUNT      3
 
-#define SPIDER_MEM_CALC_LIST_NUM           268
+#define SPIDER_MEM_CALC_LIST_NUM           314
 #define SPIDER_CONN_META_BUF_LEN           64
 
 #define SPIDER_BACKUP_DASTATUS \
@@ -355,6 +382,7 @@ typedef struct st_spider_alter_table
   char               **tmp_tgt_ssl_keys;
   char               **tmp_tgt_default_files;
   char               **tmp_tgt_default_groups;
+  char               **tmp_tgt_dsns;
   char               **tmp_static_link_ids;
   long               *tmp_tgt_ports;
   long               *tmp_tgt_ssl_vscs;
@@ -376,6 +404,7 @@ typedef struct st_spider_alter_table
   uint               *tmp_tgt_ssl_keys_lengths;
   uint               *tmp_tgt_default_files_lengths;
   uint               *tmp_tgt_default_groups_lengths;
+  uint               *tmp_tgt_dsns_lengths;
   uint               *tmp_static_link_ids_lengths;
 
   uint               tmp_server_names_charlen;
@@ -393,6 +422,7 @@ typedef struct st_spider_alter_table
   uint               tmp_tgt_ssl_keys_charlen;
   uint               tmp_tgt_default_files_charlen;
   uint               tmp_tgt_default_groups_charlen;
+  uint               tmp_tgt_dsns_charlen;
   uint               tmp_static_link_ids_charlen;
 
   uint               tmp_server_names_length;
@@ -410,6 +440,7 @@ typedef struct st_spider_alter_table
   uint               tmp_tgt_ssl_keys_length;
   uint               tmp_tgt_default_files_length;
   uint               tmp_tgt_default_groups_length;
+  uint               tmp_tgt_dsns_length;
   uint               tmp_static_link_ids_length;
   uint               tmp_tgt_ports_length;
   uint               tmp_tgt_ssl_vscs_length;
@@ -417,6 +448,8 @@ typedef struct st_spider_alter_table
   uint               tmp_link_statuses_length;
 } SPIDER_ALTER_TABLE;
 
+typedef struct st_spider_conn_loop_check SPIDER_CONN_LOOP_CHECK; /* forward declaration; SPIDER_CONN below holds pointers of this type */
+
 /* database connection */
 typedef struct st_spider_conn
 {
@@ -484,6 +517,7 @@ typedef struct st_spider_conn
   char               *tgt_password;
   char               *tgt_socket;
   char               *tgt_wrapper;
+  char               *tgt_db; /* for not joinable tables on different db */
   char               *tgt_ssl_ca;
   char               *tgt_ssl_capath;
   char               *tgt_ssl_cert;
@@ -491,6 +525,7 @@ typedef struct st_spider_conn
   char               *tgt_ssl_key;
   char               *tgt_default_file;
   char               *tgt_default_group;
+  char               *tgt_dsn;
   long               tgt_port;
   long               tgt_ssl_vsc;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
@@ -503,6 +538,7 @@ typedef struct st_spider_conn
   uint               tgt_password_length;
   uint               tgt_socket_length;
   uint               tgt_wrapper_length;
+  uint               tgt_db_length;
   uint               tgt_ssl_ca_length;
   uint               tgt_ssl_capath_length;
   uint               tgt_ssl_cert_length;
@@ -510,6 +546,7 @@ typedef struct st_spider_conn
   uint               tgt_ssl_key_length;
   uint               tgt_default_file_length;
   uint               tgt_default_group_length;
+  uint               tgt_dsn_length;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   uint               hs_sock_length;
 #endif
@@ -607,6 +644,22 @@ typedef struct st_spider_conn
   SPIDER_LINK_IDX_CHAIN *link_idx_chain;
 #endif
   SPIDER_IP_PORT_CONN *ip_port_conn;
+
+  pthread_mutex_t    loop_check_mutex;
+  HASH               loop_checked;
+  uint               loop_checked_id;
+  const char         *loop_checked_func_name;
+  const char         *loop_checked_file_name;
+  ulong              loop_checked_line_no;
+  HASH               loop_check_queue;
+  uint               loop_check_queue_id;
+  const char         *loop_check_queue_func_name;
+  const char         *loop_check_queue_file_name;
+  ulong              loop_check_queue_line_no;
+  SPIDER_CONN_LOOP_CHECK *loop_check_ignored_first;
+  SPIDER_CONN_LOOP_CHECK *loop_check_ignored_last;
+  SPIDER_CONN_LOOP_CHECK *loop_check_meraged_first;
+  SPIDER_CONN_LOOP_CHECK *loop_check_meraged_last;
 } SPIDER_CONN;
 
 typedef struct st_spider_lgtm_tblhnd_share
@@ -623,28 +676,18 @@ typedef struct st_spider_lgtm_tblhnd_share
 } SPIDER_LGTM_TBLHND_SHARE;
 
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-typedef struct st_spider_patition_handler_share
+typedef struct st_spider_patition_handler // pointed to by SPIDER_WIDE_HANDLER::partition_handler; tag spelling ("patition") is as written in the source
 {
-  uint               use_count;
+  bool               clone_bitmap_init;
+  query_id_t         parallel_search_query_id;
+  uint               no_parts;
   TABLE              *table;
-#ifdef SPIDER_HAS_HASH_VALUE_TYPE
-  my_hash_value_type table_hash_value;
+  ha_spider          *owner;
+  ha_spider          **handlers; // NOTE(review): presumably no_parts entries -- confirm where it is allocated
+} SPIDER_PARTITION_HANDLER;
 #endif
-  void               *creator;
-  void               **handlers;
-  uchar              *searched_bitmap;
-  uchar              *ft_discard_bitmap;
-  uchar              *idx_read_bitmap;
-  uchar              *idx_write_bitmap;
-  uchar              *rnd_read_bitmap;
-  uchar              *rnd_write_bitmap;
-  bool               between_flg;
-  bool               idx_bitmap_is_set;
-  bool               rnd_bitmap_is_set;
-  query_id_t         parallel_search_query_id;
-} SPIDER_PARTITION_HANDLER_SHARE;
 
-typedef struct st_spider_patition_share
+typedef struct st_spider_wide_share
 {
   char               *table_name;
   uint               table_name_length;
@@ -652,14 +695,9 @@ typedef struct st_spider_patition_share
   my_hash_value_type table_path_hash_value;
 #endif
   uint               use_count;
+  THR_LOCK           lock;
   pthread_mutex_t    sts_mutex;
   pthread_mutex_t    crd_mutex;
-  pthread_mutex_t    pt_handler_mutex;
-  HASH               pt_handler_hash;
-  uint               pt_handler_hash_id;
-  const char         *pt_handler_hash_func_name;
-  const char         *pt_handler_hash_file_name;
-  ulong              pt_handler_hash_line_no;
 
   volatile bool      sts_init;
   volatile bool      crd_init;
@@ -668,11 +706,101 @@ typedef struct st_spider_patition_share
   ha_statistics      stat;
 
   longlong           *cardinality;
-/*
-  volatile SPIDER_PARTITION_HANDLER_SHARE *partition_handler_share;
-*/
-} SPIDER_PARTITION_SHARE;
+} SPIDER_WIDE_SHARE;
+
+enum spider_hnd_stage { // type of SPIDER_WIDE_HANDLER::stage
+  SPD_HND_STAGE_NONE, // first enumerator, so its value is 0
+  SPD_HND_STAGE_STORE_LOCK, // NOTE(review): the remaining names mirror handler entry points (store_lock, external_lock, ...) -- presumably one tag per call; confirm in ha_spider
+  SPD_HND_STAGE_EXTERNAL_LOCK,
+  SPD_HND_STAGE_START_STMT,
+  SPD_HND_STAGE_EXTRA,
+  SPD_HND_STAGE_COND_PUSH,
+  SPD_HND_STAGE_COND_POP,
+  SPD_HND_STAGE_INFO_PUSH,
+  SPD_HND_STAGE_SET_TOP_TABLE_AND_FIELDS,
+  SPD_HND_STAGE_CLEAR_TOP_TABLE_FIELDS
+};
+
+typedef struct st_spider_wide_handler // NOTE(review): presumably the object behind spider->wide_handler used in spd_group_by_handler.cc -- confirm in ha_spider
+{
+  spider_hnd_stage   stage; // one of the SPD_HND_STAGE_* values
+  handler            *stage_executor;
+  THR_LOCK_DATA      lock;
+  SPIDER_TRX         *trx; // read as spider->wide_handler->trx by spd_group_by_handler.cc
+  uchar              *searched_bitmap;
+  uchar              *ft_discard_bitmap;
+  uchar              *position_bitmap;
+  uchar              *idx_read_bitmap;
+  uchar              *idx_write_bitmap;
+  uchar              *rnd_read_bitmap;
+  uchar              *rnd_write_bitmap;
+  SPIDER_CONDITION   *condition;
+  void               *owner;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  uint32             *hs_pushed_ret_fields; // hs_* members exist only when both preprocessor conditions hold
+#endif
 #endif
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  SPIDER_PARTITION_HANDLER *partition_handler; // member exists only with WITH_PARTITION_STORAGE_ENGINE
+#endif
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  List<Item>         *direct_update_fields;
+  List<Item>         *direct_update_values;
+#endif
+  TABLE_SHARE        *top_share;
+  enum thr_lock_type lock_type;
+  uchar              lock_table_type;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  uint32             hs_pushed_strref_num;
+#endif
+#endif
+  int                lock_mode;
+  int                external_lock_type;
+  int                cond_check_error;
+  uint               sql_command;
+  uint               top_table_fields;
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+#ifdef INFO_KIND_FORCE_LIMIT_BEGIN
+  longlong           info_limit; // needs both HANDLER_HAS_DIRECT_UPDATE_ROWS and INFO_KIND_FORCE_LIMIT_BEGIN
+#endif
+#endif
+#ifdef HA_CAN_BULK_ACCESS
+  ulonglong          external_lock_cnt;
+#endif
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  size_t             hs_pushed_ret_fields_num;
+  size_t             hs_pushed_ret_fields_size;
+  size_t             hs_pushed_lcl_fields_num;
+#endif
+#endif
+  bool               between_flg;
+  bool               idx_bitmap_is_set;
+  bool               rnd_bitmap_is_set;
+  bool               position_bitmap_init;
+  bool               semi_trx_isolation_chk;
+  bool               semi_trx_chk;
+  bool               low_priority;
+  bool               high_priority;
+  bool               insert_delayed;
+  bool               consistent_snapshot;
+  bool               quick_mode;
+  bool               keyread;
+  bool               update_request;
+  bool               ignore_dup_key;
+  bool               write_can_replace;
+  bool               insert_with_update;
+  bool               cond_check;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+#ifdef HANDLER_HAS_DIRECT_UPDATE_ROWS
+  bool               hs_increment;
+  bool               hs_decrement;
+#endif
+#endif
+  bool               semi_table_lock;
+} SPIDER_WIDE_HANDLER;
 
 typedef struct st_spider_transaction
 {
@@ -809,7 +937,6 @@ typedef struct st_spider_share
 /*
   pthread_mutex_t    auto_increment_mutex;
 */
-  THR_LOCK           lock;
   TABLE_SHARE        *table_share;
   SPIDER_LGTM_TBLHND_SHARE *lgtm_tblhnd_share;
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
@@ -946,6 +1073,7 @@ typedef struct st_spider_share
   int                bulk_size;
   int                bulk_update_mode;
   int                bulk_update_size;
+  int                buffer_size;
   int                internal_optimize;
   int                internal_optimize_local;
   double             scan_rate;
@@ -1016,6 +1144,7 @@ typedef struct st_spider_share
   char               **tgt_ssl_keys;
   char               **tgt_default_files;
   char               **tgt_default_groups;
+  char               **tgt_dsns;
   char               **static_link_ids;
   char               **tgt_pk_names;
   char               **tgt_sequence_names;
@@ -1054,6 +1183,7 @@ typedef struct st_spider_share
   long               *net_write_timeouts;
   long               *access_balances;
   long               *bka_table_name_types;
+  long               *strict_group_bys;
 
   uint               *server_names_lengths;
   uint               *tgt_table_names_lengths;
@@ -1070,6 +1200,7 @@ typedef struct st_spider_share
   uint               *tgt_ssl_keys_lengths;
   uint               *tgt_default_files_lengths;
   uint               *tgt_default_groups_lengths;
+  uint               *tgt_dsns_lengths;
   uint               *static_link_ids_lengths;
   uint               *tgt_pk_names_lengths;
   uint               *tgt_sequence_names_lengths;
@@ -1100,6 +1231,7 @@ typedef struct st_spider_share
   uint               tgt_ssl_keys_charlen;
   uint               tgt_default_files_charlen;
   uint               tgt_default_groups_charlen;
+  uint               tgt_dsns_charlen;
   uint               static_link_ids_charlen;
   uint               tgt_pk_names_charlen;
   uint               tgt_sequence_names_charlen;
@@ -1126,6 +1258,7 @@ typedef struct st_spider_share
   uint               tgt_ssl_keys_length;
   uint               tgt_default_files_length;
   uint               tgt_default_groups_length;
+  uint               tgt_dsns_length;
   uint               static_link_ids_length;
   uint               tgt_pk_names_length;
   uint               tgt_sequence_names_length;
@@ -1164,6 +1297,7 @@ typedef struct st_spider_share
   uint               net_write_timeouts_length;
   uint               access_balances_length;
   uint               bka_table_name_types_length;
+  uint               strict_group_bys_length;
 
   /* for dbton */
   uchar              dbton_bitmap[spider_bitmap_size(SPIDER_DBTON_SIZE)];
@@ -1181,9 +1315,7 @@ typedef struct st_spider_share
 #endif
 
   SPIDER_ALTER_TABLE alter_table;
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  SPIDER_PARTITION_SHARE *partition_share;
-#endif
+  SPIDER_WIDE_SHARE  *wide_share;
 } SPIDER_SHARE;
 
 typedef struct st_spider_link_pack
@@ -1220,7 +1352,7 @@ typedef struct st_spider_direct_sql
   TABLE_LIST           *table_list_first;
   TABLE_LIST           *table_list;
   uchar                *real_table_bitmap;
-  Open_tables_backup   open_tables_backup;
+  SPIDER_Open_tables_backup open_tables_backup;
   THD                  *open_tables_thd;
 #endif
 
@@ -1262,6 +1394,7 @@ typedef struct st_spider_direct_sql
   char                 *tgt_ssl_key;
   char                 *tgt_default_file;
   char                 *tgt_default_group;
+  char                 *tgt_dsn;
   char                 *conn_key;
   long                 tgt_port;
   long                 tgt_ssl_vsc;
@@ -1280,6 +1413,7 @@ typedef struct st_spider_direct_sql
   uint                 tgt_ssl_key_length;
   uint                 tgt_default_file_length;
   uint                 tgt_default_group_length;
+  uint                 tgt_dsn_length;
   uint                 conn_key_length;
   uint                 dbton_id;
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
diff --git a/storage/spider/spd_init_query.h b/storage/spider/spd_init_query.h
index 4c58f8d80a4..f1fc558ab98 100644
--- a/storage/spider/spd_init_query.h
+++ b/storage/spider/spd_init_query.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 2010-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+/* Copyright (C) 2010-2020 Kentoku Shiba
+   Copyright (C) 2019-2020 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -51,6 +51,7 @@ static LEX_STRING spider_init_queries[] = {
     "  ssl_verify_server_cert tinyint not null default 0,"
     "  default_file text,"
     "  default_group char(64) default null,"
+    "  dsn char(64) default null,"
     "  key idx1 (data, format_id, gtrid_length, host)"
     ") engine=MyISAM default charset=utf8 collate=utf8_bin"
   )},
@@ -74,6 +75,7 @@ static LEX_STRING spider_init_queries[] = {
     "  ssl_verify_server_cert tinyint not null default 0,"
     "  default_file text,"
     "  default_group char(64) default null,"
+    "  dsn char(64) default null,"
     "  thread_id int default null,"
     "  status char(8) not null default '',"
     "  failed_time timestamp not null default current_timestamp,"
@@ -102,6 +104,7 @@ static LEX_STRING spider_init_queries[] = {
     "  monitoring_binlog_pos_at_failing tinyint not null default 0,"
     "  default_file text,"
     "  default_group char(64) default null,"
+    "  dsn char(64) default null,"
     "  tgt_db_name char(64) default null,"
     "  tgt_table_name char(64) default null,"
     "  link_status tinyint not null default 1,"
@@ -133,6 +136,7 @@ static LEX_STRING spider_init_queries[] = {
     "  ssl_verify_server_cert tinyint not null default 0,"
     "  default_file text,"
     "  default_group char(64) default null,"
+    "  dsn char(64) default null,"
     "  primary key (db_name, table_name, link_id, sid)"
     ") engine=MyISAM default charset=utf8 collate=utf8_bin"
   )},
@@ -214,6 +218,10 @@ static LEX_STRING spider_init_queries[] = {
     "begin"
     "  select substring_index(substring_index(version(), '-', 2), '-', -1)"
     "    into @server_name;"
+    "  if @server_name regexp '^[0-9]+$' then"
+    "    select substring_index(substring_index(version(), '-', 3), '-', -1)"
+    "      into @server_name;"
+    "  end if;"
     "  select substring_index(version(), '.', 1)"
     "    into @server_major_version;"
     "  select substring_index(substring_index(version(), '.', 2), '.', -1)"
@@ -591,6 +599,21 @@ static LEX_STRING spider_init_queries[] = {
     "      primary key (db_name, table_name, table_id, partition_id)"
     "    ) engine=Aria transactional=1 default charset=utf8 collate=utf8_bin;"
     "  end if;"
+/*
+  Fix for version 3.4
+*/
+    "  call mysql.spider_fix_one_table('spider_link_mon_servers', 'dsn',"
+    "   'alter table mysql.spider_link_mon_servers"
+    "    add column dsn char(64) default null after default_group');"
+    "  call mysql.spider_fix_one_table('spider_tables', 'dsn',"
+    "   'alter table mysql.spider_tables"
+    "    add column dsn char(64) default null after default_group');"
+    "  call mysql.spider_fix_one_table('spider_xa_failed_log', 'dsn',"
+    "   'alter table mysql.spider_xa_failed_log"
+    "    add column dsn char(64) default null after default_group');"
+    "  call mysql.spider_fix_one_table('spider_xa_member', 'dsn',"
+    "   'alter table mysql.spider_xa_member"
+    "    add column dsn char(64) default null after default_group');"
     "end;"
   )},
   {C_STRING_WITH_LEN(
@@ -663,6 +686,31 @@ static LEX_STRING spider_init_queries[] = {
     "      install plugin spider_alloc_mem soname 'ha_spider.dll';"
     "    end if;"
     "  end if;"
+/*
+  Install spider_wrapper_protocols plugin
+*/
+    "  set @have_spider_i_s_wrapper_protocols_plugin := 0;"
+    "  select @have_spider_i_s_wrapper_protocols_plugin := 1"
+    "    from INFORMATION_SCHEMA.plugins"
+    "    where PLUGIN_NAME = 'SPIDER_WRAPPER_PROTOCOLS';"
+    "  set @have_spider_wrapper_protocols_plugin := 0;"
+    "  select @have_spider_wrapper_protocols_plugin := 1 from mysql.plugin"
+    "    where name = 'spider_wrapper_protocols';"
+    "  if @have_spider_i_s_wrapper_protocols_plugin = 0 then"
+    "    if @have_spider_wrapper_protocols_plugin = 1 then"
+    "      /*"
+    "        spider_wrapper_protocols plugin is present in mysql.plugin but not in"
+    "        information_schema.plugins. Remove spider_wrapper_protocols plugin entry"
+    "        in mysql.plugin first."
+    "      */"
+    "      delete from mysql.plugin where name = 'spider_wrapper_protocols';"
+    "    end if;"
+    "    if @win_plugin = 0 then "
+    "      install plugin spider_wrapper_protocols soname 'ha_spider.so';"
+    "    else"
+    "      install plugin spider_wrapper_protocols soname 'ha_spider.dll';"
+    "    end if;"
+    "  end if;"
     "  set @have_spider_direct_sql_udf := 0;"
     "  select @have_spider_direct_sql_udf := 1 from mysql.func"
     "    where name = 'spider_direct_sql';"
diff --git a/storage/spider/spd_malloc.cc b/storage/spider/spd_malloc.cc
index 40b37ff4377..a9438f2ac58 100644
--- a/storage/spider/spd_malloc.cc
+++ b/storage/spider/spd_malloc.cc
@@ -203,7 +203,7 @@ void *spider_alloc_mem(
   uchar *ptr;
   DBUG_ENTER("spider_alloc_mem");
   size += ALIGN_SIZE(sizeof(uint)) + ALIGN_SIZE(sizeof(uint));
-  if (!(ptr = (uchar *) my_malloc(size, my_flags)))
+  if (!(ptr = (uchar *) my_malloc(PSI_INSTRUMENT_ME, size, my_flags)))
     DBUG_RETURN(NULL);
 
   spider_alloc_mem_calc(trx, id, func_name, file_name, line_no, size);
@@ -233,7 +233,7 @@ void *spider_bulk_alloc_mem(
     total_size += ALIGN_SIZE(va_arg(args, uint));
   va_end(args);
 
-  if (!(top_ptr = (uchar *) my_malloc(total_size, my_flags)))
+  if (!(top_ptr = (uchar *) my_malloc(PSI_INSTRUMENT_ME, total_size, my_flags)))
     DBUG_RETURN(NULL);
 
   spider_alloc_mem_calc(trx, id, func_name, file_name, line_no, total_size);
@@ -476,9 +476,10 @@ char *spider_string::c_ptr_safe()
 
 LEX_STRING spider_string::lex_string() const
 {
+  LEX_STRING res= { (char*) str.ptr(), str.length() }; /* no copy made */
   DBUG_ENTER("spider_string::lex_string");
   DBUG_PRINT("info",("spider this=%p", this));
-  DBUG_RETURN(str.lex_string());
+  DBUG_RETURN(res);
 }
 
 void spider_string::set(
diff --git a/storage/spider/spd_param.cc b/storage/spider/spd_param.cc
index 81540a1ef91..b23877ca92a 100644
--- a/storage/spider/spd_param.cc
+++ b/storage/spider/spd_param.cc
@@ -35,8 +35,10 @@
 #include "spd_trx.h"
 
 extern struct st_mysql_plugin spider_i_s_alloc_mem;
+extern struct st_mysql_plugin spider_i_s_wrapper_protocols;
 #ifdef MARIADB_BASE_VERSION
 extern struct st_maria_plugin spider_i_s_alloc_mem_maria;
+extern struct st_maria_plugin spider_i_s_wrapper_protocols_maria;
 #endif
 
 extern volatile ulonglong spider_mon_table_cache_version;
@@ -1053,6 +1055,31 @@ int spider_param_bulk_update_size(
 
 /*
  -1 :use table parameter
+  0-:buffer size
+ */
+static MYSQL_THDVAR_INT(
+  buffer_size, /* name: read back through THDVAR(thd, buffer_size) */
+  PLUGIN_VAR_RQCMDARG, /* opt */
+  "Buffer size", /* comment */
+  NULL, /* check */
+  NULL, /* update */
+  -1, /* def: -1 means "use table parameter" */
+  -1, /* min */
+  2147483647, /* max: 2^31 - 1 */
+  0 /* blk */
+);
+
+int spider_param_buffer_size(
+  THD *thd,
+  int buffer_size
+) {
+  DBUG_ENTER("spider_param_buffer_size");
+  const int session_value = THDVAR(thd, buffer_size); /* -1: use argument */
+  DBUG_RETURN(session_value != -1 ? session_value : buffer_size);
+}
+
+/*
+ -1 :use table parameter
   0 :off
   1 :on
  */
@@ -3425,6 +3452,32 @@ bool spider_param_sync_sql_mode(
   DBUG_RETURN(THDVAR(thd, sync_sql_mode));
 }
 
+/*
+ -1 : use table parameter
+  0 : not strict
+  1 : strict
+ */
+static MYSQL_THDVAR_INT(
+  strict_group_by, /* name: read back through THDVAR(thd, strict_group_by) */
+  PLUGIN_VAR_RQCMDARG, /* opt */
+  "Use columns in select clause strictly for group by clause", /* comment */
+  NULL, /* check */
+  NULL, /* update */
+  -1, /* def: -1 means "use table parameter" */
+  -1, /* min */
+  1, /* max */
+  0 /* blk */
+);
+
+int spider_param_strict_group_by(
+  THD *thd,
+  int strict_group_by
+) {
+  DBUG_ENTER("spider_param_strict_group_by");
+  const int session_value = THDVAR(thd, strict_group_by); /* -1: use argument */
+  DBUG_RETURN(session_value != -1 ? session_value : strict_group_by);
+}
+
 static struct st_mysql_storage_engine spider_storage_engine =
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
@@ -3463,6 +3516,7 @@ static struct st_mysql_sys_var* spider_system_variables[] = {
   MYSQL_SYSVAR(bulk_size),
   MYSQL_SYSVAR(bulk_update_mode),
   MYSQL_SYSVAR(bulk_update_size),
+  MYSQL_SYSVAR(buffer_size),
   MYSQL_SYSVAR(internal_optimize),
   MYSQL_SYSVAR(internal_optimize_local),
   MYSQL_SYSVAR(use_flash_logs),
@@ -3578,6 +3632,7 @@ static struct st_mysql_sys_var* spider_system_variables[] = {
   MYSQL_SYSVAR(remote_wait_timeout),
   MYSQL_SYSVAR(wait_timeout),
   MYSQL_SYSVAR(sync_sql_mode),
+  MYSQL_SYSVAR(strict_group_by),
   NULL
 };
 
@@ -3599,7 +3654,8 @@ mysql_declare_plugin(spider)
   0,
 #endif
 },
-spider_i_s_alloc_mem
+spider_i_s_alloc_mem,
+spider_i_s_wrapper_protocols
 mysql_declare_plugin_end;
 
 #ifdef MARIADB_BASE_VERSION
@@ -3619,6 +3675,7 @@ maria_declare_plugin(spider)
   SPIDER_DETAIL_VERSION,
   MariaDB_PLUGIN_MATURITY_STABLE
 },
-spider_i_s_alloc_mem_maria
+spider_i_s_alloc_mem_maria,
+spider_i_s_wrapper_protocols_maria
 maria_declare_plugin_end;
 #endif
diff --git a/storage/spider/spd_param.h b/storage/spider/spd_param.h
index 9ffb9e8c278..c3a79cec065 100644
--- a/storage/spider/spd_param.h
+++ b/storage/spider/spd_param.h
@@ -126,6 +126,10 @@ int spider_param_bulk_update_size(
   THD *thd,
   int bulk_update_size
 );
+int spider_param_buffer_size(
+  THD *thd,
+  int buffer_size
+);
 int spider_param_internal_optimize(
   THD *thd,
   int internal_optimize
@@ -384,6 +388,7 @@ my_bool spider_param_index_hint_pushdown(
 );
 uint spider_param_max_connections();
 uint spider_param_conn_wait_timeout();
+uint spider_param_internal_lock_wait_timeout();
 uint spider_param_log_result_errors();
 uint spider_param_log_result_error_with_sql();
 uint spider_param_internal_xa_id_type(
@@ -431,3 +436,7 @@ int spider_param_wait_timeout(
 bool spider_param_sync_sql_mode(
   THD *thd
 );
+int spider_param_strict_group_by(
+  THD *thd,
+  int strict_group_by
+);
diff --git a/storage/spider/spd_ping_table.cc b/storage/spider/spd_ping_table.cc
index 431d46063c3..f220a9d97c2 100644
--- a/storage/spider/spd_ping_table.cc
+++ b/storage/spider/spd_ping_table.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2009-2018 Kentoku Shiba
+/* Copyright (C) 2009-2019 Kentoku Shiba
+   Copyright (C) 2019 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -289,11 +290,7 @@ int spider_get_ping_table_mon(
 ) {
   int error_num;
   TABLE *table_link_mon = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   char table_key[MAX_KEY_LENGTH];
   SPIDER_TABLE_MON *table_mon, *table_mon_prev = NULL;
   SPIDER_SHARE *tmp_share;
@@ -367,12 +364,15 @@ create_table_mon:
   do {
     if (!(table_mon = (SPIDER_TABLE_MON *)
       spider_bulk_malloc(spider_current_trx, 35, MYF(MY_WME | MY_ZEROFILL),
-        &table_mon, sizeof(SPIDER_TABLE_MON),
-        &tmp_share, sizeof(SPIDER_SHARE),
-        &tmp_connect_info, sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT,
-        &tmp_connect_info_length, sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT,
-        &tmp_long, sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT,
-        &tmp_longlong, sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT,
+        &table_mon, (uint) (sizeof(SPIDER_TABLE_MON)),
+        &tmp_share, (uint) (sizeof(SPIDER_SHARE)),
+        &tmp_connect_info,
+          (uint) (sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT),
+        &tmp_connect_info_length,
+          (uint) (sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT),
+        &tmp_long, (uint) (sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT),
+        &tmp_longlong,
+          (uint) (sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT),
         NullS))
     ) {
       spider_sys_index_end(table_link_mon);
@@ -471,11 +471,7 @@ SPIDER_TABLE_MON_LIST *spider_get_ping_table_tgt(
   int *error_num
 ) {
   TABLE *table_tables = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   char table_key[MAX_KEY_LENGTH];
 
   SPIDER_TABLE_MON_LIST *table_mon_list = NULL;
@@ -491,13 +487,17 @@ SPIDER_TABLE_MON_LIST *spider_get_ping_table_tgt(
   SPD_INIT_ALLOC_ROOT(&mem_root, 4096, 0, MYF(MY_WME));
   if (!(table_mon_list = (SPIDER_TABLE_MON_LIST *)
     spider_bulk_malloc(spider_current_trx, 36, MYF(MY_WME | MY_ZEROFILL),
-      &table_mon_list, sizeof(SPIDER_TABLE_MON_LIST),
-      &tmp_share, sizeof(SPIDER_SHARE),
-      &tmp_connect_info, sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT,
-      &tmp_connect_info_length, sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT,
-      &tmp_long, sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT,
-      &tmp_longlong, sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT,
-      &key_str, str->length() + 1,
+      &table_mon_list, (uint) (sizeof(SPIDER_TABLE_MON_LIST)),
+      &tmp_share, (uint) (sizeof(SPIDER_SHARE)),
+      &tmp_connect_info,
+        (uint) (sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT),
+      &tmp_connect_info_length,
+        (uint) (sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT),
+      &tmp_long,
+        (uint) (sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT),
+      &tmp_longlong,
+        (uint) (sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT),
+      &key_str, (uint) (str->length() + 1),
       NullS))
   ) {
     my_error(HA_ERR_OUT_OF_MEM, MYF(0));
@@ -690,12 +690,9 @@ int spider_get_ping_table_gtid_pos(
   int error_num, source_link_idx, need_mon;
   char table_key[MAX_KEY_LENGTH];
   TABLE *table_tables, *table_gtid_pos;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup_tables;
-  Open_tables_state open_tables_backup_gtid_pos;
-#else
-  Open_tables_backup open_tables_backup_tables;
-  Open_tables_backup open_tables_backup_gtid_pos;
+  SPIDER_Open_tables_backup open_tables_backup_tables;
+#ifdef SPIDER_REQUIRE_DEFINE_FOR_SECONDARY_OPEN_TABLES_BACKUP
+  SPIDER_Open_tables_backup open_tables_backup_gtid_pos;
 #endif
   MEM_ROOT mem_root;
   long link_status;
@@ -713,6 +710,7 @@ int spider_get_ping_table_gtid_pos(
       db_name = setted db_name and
       table_name = setted table_name
   */
+#ifdef SPIDER_REQUIRE_DEFINE_FOR_SECONDARY_OPEN_TABLES_BACKUP
   if (
     !(table_tables = spider_open_sys_table(
       thd, SPIDER_SYS_TABLES_TABLE_NAME_STR,
@@ -728,6 +726,45 @@ int spider_get_ping_table_gtid_pos(
       &open_tables_backup_gtid_pos, need_lock, &error_num))
   )
     goto error_open_table_gtid_pos;
+#else
+  TABLE_LIST tables_tables;
+  TABLE_LIST tables_gtid_pos;
+  TABLE_LIST *tables = &tables_tables;
+  LEX_CSTRING db_name =
+  {
+    "mysql",
+    sizeof("mysql") - 1
+  };
+  LEX_CSTRING tbl_name_tables =
+  {
+    SPIDER_SYS_TABLES_TABLE_NAME_STR,
+    SPIDER_SYS_TABLES_TABLE_NAME_LEN
+  };
+  LEX_CSTRING tbl_name_gtid_pos =
+  {
+    SPIDER_SYS_POS_FOR_RECOVERY_TABLE_NAME_STR,
+    SPIDER_SYS_POS_FOR_RECOVERY_TABLE_NAME_LEN
+  };
+  tables_tables.init_one_table(&db_name, &tbl_name_tables, 0, TL_READ);
+  tables_gtid_pos.init_one_table(&db_name, &tbl_name_gtid_pos, 0, TL_READ);
+  MDL_REQUEST_INIT(&tables_tables.mdl_request, MDL_key::TABLE,
+    SPIDER_TABLE_LIST_db_str(&tables_tables),
+    SPIDER_TABLE_LIST_table_name_str(&tables_tables),
+    MDL_SHARED_READ, MDL_TRANSACTION);
+  MDL_REQUEST_INIT(&tables_gtid_pos.mdl_request, MDL_key::TABLE,
+    SPIDER_TABLE_LIST_db_str(&tables_gtid_pos),
+    SPIDER_TABLE_LIST_table_name_str(&tables_gtid_pos),
+    MDL_SHARED_READ, MDL_TRANSACTION);
+  tables_tables.next_global = &tables_gtid_pos;
+  if (spider_sys_open_and_lock_tables(thd, &tables,
+    &open_tables_backup_tables))
+  {
+    error_num = my_errno;
+    goto error_open_table_tables;
+  }
+  table_tables = tables_tables.table;
+  table_gtid_pos = tables_gtid_pos.table;
+#endif
 
   table_tables->use_all_columns();
   table_gtid_pos->use_all_columns();
@@ -814,8 +851,10 @@ int spider_get_ping_table_gtid_pos(
   {
     goto error_sys_index_end;
   }
-  spider_close_sys_table(thd, table_gtid_pos, &open_tables_backup_gtid_pos,
-    need_lock);
+#ifdef SPIDER_REQUIRE_DEFINE_FOR_SECONDARY_OPEN_TABLES_BACKUP
+  spider_close_sys_table(thd, table_gtid_pos,
+    &open_tables_backup_gtid_pos, need_lock);
+#endif
   spider_close_sys_table(thd, table_tables, &open_tables_backup_tables,
     need_lock);
 
@@ -827,9 +866,12 @@ error_get_sys_tables_link_status:
   spider_sys_index_end(table_tables);
 error_sys_index_end:
 error_get_sys_table_by_idx:
-  spider_close_sys_table(thd, table_gtid_pos, &open_tables_backup_gtid_pos,
+#ifdef SPIDER_REQUIRE_DEFINE_FOR_SECONDARY_OPEN_TABLES_BACKUP
+  spider_close_sys_table(thd, table_gtid_pos,
+    &open_tables_backup_gtid_pos,
     need_lock);
 error_open_table_gtid_pos:
+#endif
   spider_close_sys_table(thd, table_tables, &open_tables_backup_tables,
     need_lock);
 error_open_table_tables:
@@ -842,12 +884,9 @@ int spider_init_ping_table_mon_cache(
   bool need_lock
 ) {
   int error_num, same;
+  uint old_elements;
   TABLE *table_link_mon = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   SPIDER_MON_KEY mon_key;
   DBUG_ENTER("spider_init_ping_table_mon_cache");
 
@@ -891,11 +930,23 @@ int spider_init_ping_table_mon_cache(
         {
           mon_key.sort = spider_calc_for_sort(3, mon_key.db_name,
             mon_key.table_name, mon_key.link_id);
+          old_elements = spider_mon_table_cache.max_element;
           if (push_dynamic(&spider_mon_table_cache, (uchar *) &mon_key))
           {
             error_num = HA_ERR_OUT_OF_MEM;
             goto error_push_dynamic;
           }
+          if (spider_mon_table_cache.max_element != old_elements)
+          {
+            spider_free_mem_calc(spider_current_trx,
+              spider_mon_table_cache_id,
+              old_elements *
+              spider_mon_table_cache.size_of_element);
+            spider_alloc_calc_mem(spider_current_trx,
+              spider_mon_table_cache,
+              spider_mon_table_cache.max_element *
+              spider_mon_table_cache.size_of_element);
+          }
         }
 
         if ((error_num = spider_sys_index_next(table_link_mon)))
@@ -915,12 +966,16 @@ int spider_init_ping_table_mon_cache(
       (uchar *) dynamic_element(&spider_mon_table_cache, 0, SPIDER_MON_KEY *),
       spider_mon_table_cache.elements, sizeof(SPIDER_MON_KEY),
       (qsort_cmp) spider_compare_for_sort);
-    uint old_elements = spider_mon_table_cache.max_element;
+    old_elements = spider_mon_table_cache.max_element;
     freeze_size(&spider_mon_table_cache);
-    if (spider_mon_table_cache.max_element < old_elements)
+    if (spider_mon_table_cache.max_element != old_elements)
     {
       spider_free_mem_calc(spider_current_trx,
         spider_mon_table_cache_id,
+        old_elements *
+        spider_mon_table_cache.size_of_element);
+      spider_alloc_calc_mem(spider_current_trx,
+        spider_mon_table_cache,
         spider_mon_table_cache.max_element *
         spider_mon_table_cache.size_of_element);
     }
diff --git a/storage/spider/spd_sys_table.cc b/storage/spider/spd_sys_table.cc
index ada48c4982d..a467c2c00fa 100644
--- a/storage/spider/spd_sys_table.cc
+++ b/storage/spider/spd_sys_table.cc
@@ -39,6 +39,113 @@
 extern handlerton *spider_hton_ptr;
 extern Time_zone *spd_tz_system;
 
+#define SPIDER_XA_FORMAT_ID_POS                               0
+#define SPIDER_XA_GTRID_LENGTH_POS                            1
+#define SPIDER_XA_BQUAL_LENGTH_POS                            2
+#define SPIDER_XA_DATA_POS                                    3
+#define SPIDER_XA_STATUS_POS                                  4
+/* XA member field positions; XA_FAILED_LOG_* continue the numbering (19-21) */
+#define SPIDER_XA_MEMBER_FORMAT_ID_POS                        0
+#define SPIDER_XA_MEMBER_GTRID_LENGTH_POS                     1
+#define SPIDER_XA_MEMBER_BQUAL_LENGTH_POS                     2
+#define SPIDER_XA_MEMBER_DATA_POS                             3
+#define SPIDER_XA_MEMBER_SCHEME_POS                           4
+#define SPIDER_XA_MEMBER_HOST_POS                             5
+#define SPIDER_XA_MEMBER_PORT_POS                             6
+#define SPIDER_XA_MEMBER_SOCKET_POS                           7
+#define SPIDER_XA_MEMBER_USERNAME_POS                         8
+#define SPIDER_XA_MEMBER_PASSWORD_POS                         9
+#define SPIDER_XA_MEMBER_SSL_CA_POS                          10
+#define SPIDER_XA_MEMBER_SSL_CAPATH_POS                      11
+#define SPIDER_XA_MEMBER_SSL_CERT_POS                        12
+#define SPIDER_XA_MEMBER_SSL_CIPHER_POS                      13
+#define SPIDER_XA_MEMBER_SSL_KEY_POS                         14
+#define SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS          15
+#define SPIDER_XA_MEMBER_DEFAULT_FILE_POS                    16
+#define SPIDER_XA_MEMBER_DEFAULT_GROUP_POS                   17
+#define SPIDER_XA_MEMBER_DSN_POS                             18
+#define SPIDER_XA_FAILED_LOG_THREAD_ID_POS                   19
+#define SPIDER_XA_FAILED_LOG_STATUS_POS                      20
+#define SPIDER_XA_FAILED_LOG_FAILED_TIME_POS                 21
+/* SPIDER_TABLES_*: presumably column order of mysql.spider_tables - confirm */
+#define SPIDER_TABLES_DB_NAME_POS                             0
+#define SPIDER_TABLES_TABLE_NAME_POS                          1
+#define SPIDER_TABLES_LINK_ID_POS                             2
+#define SPIDER_TABLES_PRIORITY_POS                            3
+#define SPIDER_TABLES_SERVER_POS                              4
+#define SPIDER_TABLES_SCHEME_POS                              5
+#define SPIDER_TABLES_HOST_POS                                6
+#define SPIDER_TABLES_PORT_POS                                7
+#define SPIDER_TABLES_SOCKET_POS                              8
+#define SPIDER_TABLES_USERNAME_POS                            9
+#define SPIDER_TABLES_PASSWORD_POS                           10
+#define SPIDER_TABLES_SSL_CA_POS                             11
+#define SPIDER_TABLES_SSL_CAPATH_POS                         12
+#define SPIDER_TABLES_SSL_CERT_POS                           13
+#define SPIDER_TABLES_SSL_CIPHER_POS                         14
+#define SPIDER_TABLES_SSL_KEY_POS                            15
+#define SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS             16
+#define SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS   17
+#define SPIDER_TABLES_DEFAULT_FILE_POS                       18
+#define SPIDER_TABLES_DEFAULT_GROUP_POS                      19
+#define SPIDER_TABLES_DSN_POS                                20
+#define SPIDER_TABLES_TGT_DB_NAME_POS                        21
+#define SPIDER_TABLES_TGT_TABLE_NAME_POS                     22
+#define SPIDER_TABLES_LINK_STATUS_POS                        23
+#define SPIDER_TABLES_BLOCK_STATUS_POS                       24
+#define SPIDER_TABLES_STATIC_LINK_ID_POS                     25
+/* SPIDER_LINK_MON_SERVERS_*: positions 0-19, dsn is last (19) */
+#define SPIDER_LINK_MON_SERVERS_DB_NAME_POS                   0
+#define SPIDER_LINK_MON_SERVERS_TABLE_NAME_POS                1
+#define SPIDER_LINK_MON_SERVERS_LINK_ID_POS                   2
+#define SPIDER_LINK_MON_SERVERS_SID_POS                       3
+#define SPIDER_LINK_MON_SERVERS_SERVER_POS                    4
+#define SPIDER_LINK_MON_SERVERS_SCHEME_POS                    5
+#define SPIDER_LINK_MON_SERVERS_HOST_POS                      6
+#define SPIDER_LINK_MON_SERVERS_PORT_POS                      7
+#define SPIDER_LINK_MON_SERVERS_SOCKET_POS                    8
+#define SPIDER_LINK_MON_SERVERS_USERNAME_POS                  9
+#define SPIDER_LINK_MON_SERVERS_PASSWORD_POS                 10
+#define SPIDER_LINK_MON_SERVERS_SSL_CA_POS                   11
+#define SPIDER_LINK_MON_SERVERS_SSL_CAPATH_POS               12
+#define SPIDER_LINK_MON_SERVERS_SSL_CERT_POS                 13
+#define SPIDER_LINK_MON_SERVERS_SSL_CIPHER_POS               14
+#define SPIDER_LINK_MON_SERVERS_SSL_KEY_POS                  15
+#define SPIDER_LINK_MON_SERVERS_SSL_VERIFY_SERVER_CERT_POS   16
+#define SPIDER_LINK_MON_SERVERS_DEFAULT_FILE_POS             17
+#define SPIDER_LINK_MON_SERVERS_DEFAULT_GROUP_POS            18
+#define SPIDER_LINK_MON_SERVERS_DSN_POS                      19
+/* SPIDER_LINK_FAILED_LOG_*: positions 0-3 */
+#define SPIDER_LINK_FAILED_LOG_DB_NAME_POS                    0
+#define SPIDER_LINK_FAILED_LOG_TABLE_NAME_POS                 1
+#define SPIDER_LINK_FAILED_LOG_LINK_ID_POS                    2
+#define SPIDER_LINK_FAILED_LOG_FAILED_TIME_POS                3
+/* SPIDER_TABLE_POSITION_FOR_RECOVERY_*: positions 0-6 */
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_DB_NAME_POS        0
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_TABLE_NAME_POS     1
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_FAILED_LINK_ID_POS 2
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_SOURCE_LINK_ID_POS 3
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_FILE_POS           4
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_POSITION_POS       5
+#define SPIDER_TABLE_POSITION_FOR_RECOVERY_GTID_POS           6
+/* SPIDER_TABLE_STS_*: positions 0-10 */
+#define SPIDER_TABLE_STS_DB_NAME_POS                          0
+#define SPIDER_TABLE_STS_TABLE_NAME_POS                       1
+#define SPIDER_TABLE_STS_DATA_FILE_LENGTH_POS                 2
+#define SPIDER_TABLE_STS_MAX_DATA_FILE_LENGTH_POS             3
+#define SPIDER_TABLE_STS_INDEX_FILE_LENGTH_POS                4
+#define SPIDER_TABLE_STS_RECORDS_POS                          5
+#define SPIDER_TABLE_STS_MEAN_REC_LENGTH_POS                  6
+#define SPIDER_TABLE_STS_CHECK_TIME_POS                       7
+#define SPIDER_TABLE_STS_CREATE_TIME_POS                      8
+#define SPIDER_TABLE_STS_UPDATE_TIME_POS                      9
+#define SPIDER_TABLE_STS_CHECKSUM_POS                        10
+/* SPIDER_TABLE_CRD_*: positions 0-3 */
+#define SPIDER_TABLE_CRD_DB_NAME_POS                          0
+#define SPIDER_TABLE_CRD_TABLE_NAME_POS                       1
+#define SPIDER_TABLE_CRD_KEY_SEQ_POS                          2
+#define SPIDER_TABLE_CRD_CARDINALITY_POS                      3
+
 /**
   Insert a Spider system table row.
 
@@ -121,28 +228,15 @@ inline int spider_delete_sys_table_row(TABLE *table, int record_number = 0,
   return error_num;
 }
 
-#if MYSQL_VERSION_ID < 50500
 TABLE *spider_open_sys_table(
   THD *thd,
   const char *table_name,
   int table_name_length,
   bool write,
-  Open_tables_state *open_tables_backup,
+  SPIDER_Open_tables_backup *open_tables_backup,
   bool need_lock,
   int *error_num
-)
-#else
-TABLE *spider_open_sys_table(
-  THD *thd,
-  const char *table_name,
-  int table_name_length,
-  bool write,
-  Open_tables_backup *open_tables_backup,
-  bool need_lock,
-  int *error_num
-)
-#endif
-{
+) {
   TABLE *table;
   TABLE_LIST tables;
 #if MYSQL_VERSION_ID < 50500
@@ -199,7 +293,7 @@ TABLE *spider_open_sys_table(
     }
 #if MYSQL_VERSION_ID < 50500
   } else {
-    thd->reset_n_backup_open_tables_state(open_tables_backup);
+    SPIDER_reset_n_backup_open_tables_state(thd, open_tables_backup, NULL);
 
     if (!(table = (TABLE*) spider_malloc(spider_current_trx, 12,
       sizeof(*table), MYF(MY_WME))))
@@ -494,28 +588,18 @@ TABLE *spider_open_sys_table(
 error:
   spider_free(spider_current_trx, table, MYF(0));
 error_malloc:
-  thd->restore_backup_open_tables_state(open_tables_backup);
+  SPIDER_restore_backup_open_tables_state(thd, open_tables_backup);
 #endif
 error_col_num_chk:
   DBUG_RETURN(NULL);
 }
 
-#if MYSQL_VERSION_ID < 50500
-void spider_close_sys_table(
-  THD *thd,
-  TABLE *table,
-  Open_tables_state *open_tables_backup,
-  bool need_lock
-)
-#else
 void spider_close_sys_table(
   THD *thd,
   TABLE *table,
-  Open_tables_backup *open_tables_backup,
+  SPIDER_Open_tables_backup *open_tables_backup,
   bool need_lock
-)
-#endif
-{
+) {
   DBUG_ENTER("spider_close_sys_table");
 #if MYSQL_VERSION_ID < 50500
   if (need_lock)
@@ -525,7 +609,7 @@ void spider_close_sys_table(
     table->file->ha_reset();
     closefrm(table, TRUE);
     spider_free(spider_current_trx, table, MYF(0));
-    thd->restore_backup_open_tables_state(open_tables_backup);
+    SPIDER_restore_backup_open_tables_state(thd, open_tables_backup);
   }
 #else
   spider_sys_close_table(thd, open_tables_backup);
@@ -535,20 +619,28 @@ void spider_close_sys_table(
 
 #if MYSQL_VERSION_ID < 50500
 #else
-bool spider_sys_open_tables(
+bool spider_sys_open_and_lock_tables(
   THD *thd,
   TABLE_LIST **tables,
-  Open_tables_backup *open_tables_backup
+  SPIDER_Open_tables_backup *open_tables_backup
 ) {
   uint counter;
+  uint flags = MYSQL_OPEN_IGNORE_GLOBAL_READ_LOCK |
+    MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY | MYSQL_OPEN_IGNORE_FLUSH |
+    MYSQL_LOCK_IGNORE_TIMEOUT | MYSQL_LOCK_LOG_TABLE;
   ulonglong utime_after_lock_backup = thd->utime_after_lock;
-  DBUG_ENTER("spider_sys_open_tables");
-  thd->reset_n_backup_open_tables_state(open_tables_backup);
-  if (open_tables(thd, tables, &counter,
-    MYSQL_OPEN_IGNORE_GLOBAL_READ_LOCK | MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY |
-    MYSQL_OPEN_IGNORE_FLUSH | MYSQL_LOCK_IGNORE_TIMEOUT | MYSQL_LOCK_LOG_TABLE
-  )) {
-    thd->restore_backup_open_tables_state(open_tables_backup);
+  DBUG_ENTER("spider_sys_open_and_lock_tables");
+  SPIDER_reset_n_backup_open_tables_state(thd, open_tables_backup, TRUE);
+  if (open_tables(thd, tables, &counter, flags))
+  {
+    SPIDER_restore_backup_open_tables_state(thd, open_tables_backup);
+    thd->utime_after_lock = utime_after_lock_backup;
+    DBUG_RETURN(TRUE);
+  }
+  if (lock_tables(thd, *tables, counter, flags))
+  {
+    SPIDER_sys_close_thread_tables(thd);
+    SPIDER_restore_backup_open_tables_state(thd, open_tables_backup);
     thd->utime_after_lock = utime_after_lock_backup;
     DBUG_RETURN(TRUE);
   }
@@ -559,13 +651,15 @@ bool spider_sys_open_tables(
 TABLE *spider_sys_open_table(
   THD *thd,
   TABLE_LIST *tables,
-  Open_tables_backup *open_tables_backup
+  SPIDER_Open_tables_backup *open_tables_backup
 ) {
   TABLE *table;
   ulonglong utime_after_lock_backup = thd->utime_after_lock;
   DBUG_ENTER("spider_sys_open_table");
   if (open_tables_backup)
-    thd->reset_n_backup_open_tables_state(open_tables_backup);
+  {
+    SPIDER_reset_n_backup_open_tables_state(thd, open_tables_backup, NULL);
+  }
   if ((table = open_ltable(thd, tables, tables->lock_type,
     MYSQL_OPEN_IGNORE_GLOBAL_READ_LOCK | MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY |
     MYSQL_OPEN_IGNORE_FLUSH | MYSQL_LOCK_IGNORE_TIMEOUT | MYSQL_LOCK_LOG_TABLE
@@ -573,18 +667,23 @@ TABLE *spider_sys_open_table(
     table->use_all_columns();
     table->s->no_replicate = 1;
   } else if (open_tables_backup)
-    thd->restore_backup_open_tables_state(open_tables_backup);
+  {
+    SPIDER_restore_backup_open_tables_state(thd, open_tables_backup);
+  }
   thd->utime_after_lock = utime_after_lock_backup;
   DBUG_RETURN(table);
 }
 
 void spider_sys_close_table(
   THD *thd,
-  Open_tables_backup *open_tables_backup
+  SPIDER_Open_tables_backup *open_tables_backup
 ) {
   DBUG_ENTER("spider_sys_close_table");
-  close_thread_tables(thd);
-  thd->restore_backup_open_tables_state(open_tables_backup);
+  if (open_tables_backup)
+  {
+    SPIDER_sys_close_thread_tables(thd);
+    SPIDER_restore_backup_open_tables_state(thd, open_tables_backup);
+  }
   DBUG_VOID_RETURN;
 }
 #endif
@@ -827,9 +926,9 @@ void spider_store_xa_pk(
   XID *xid
 ) {
   DBUG_ENTER("spider_store_xa_pk");
-  table->field[0]->store(xid->formatID);
-  table->field[1]->store(xid->gtrid_length);
-  table->field[3]->store(
+  table->field[SPIDER_XA_FORMAT_ID_POS]->store(xid->formatID);
+  table->field[SPIDER_XA_GTRID_LENGTH_POS]->store(xid->gtrid_length);
+  table->field[SPIDER_XA_DATA_POS]->store(
     xid->data,
     (uint) xid->gtrid_length + xid->bqual_length,
     system_charset_info);
@@ -841,7 +940,7 @@ void spider_store_xa_bqual_length(
   XID *xid
 ) {
   DBUG_ENTER("spider_store_xa_bqual_length");
-  table->field[2]->store(xid->bqual_length);
+  table->field[SPIDER_XA_BQUAL_LENGTH_POS]->store(xid->bqual_length);
   DBUG_VOID_RETURN;
 }
 
@@ -850,7 +949,7 @@ void spider_store_xa_status(
   const char *status
 ) {
   DBUG_ENTER("spider_store_xa_status");
-  table->field[4]->store(
+  table->field[SPIDER_XA_STATUS_POS]->store(
     status,
     (uint) strlen(status),
     system_charset_info);
@@ -863,19 +962,19 @@ void spider_store_xa_member_pk(
   SPIDER_CONN *conn
 ) {
   DBUG_ENTER("spider_store_xa_member_pk");
-  table->field[0]->store(xid->formatID);
-  table->field[1]->store(xid->gtrid_length);
-  table->field[3]->store(
+  table->field[SPIDER_XA_MEMBER_FORMAT_ID_POS]->store(xid->formatID);
+  table->field[SPIDER_XA_MEMBER_GTRID_LENGTH_POS]->store(xid->gtrid_length);
+  table->field[SPIDER_XA_MEMBER_DATA_POS]->store(
     xid->data,
     (uint) xid->gtrid_length + xid->bqual_length,
     system_charset_info);
-  table->field[5]->store(
+  table->field[SPIDER_XA_MEMBER_HOST_POS]->store(
     conn->tgt_host,
     (uint) conn->tgt_host_length,
     system_charset_info);
-  table->field[6]->store(
+  table->field[SPIDER_XA_MEMBER_PORT_POS]->store(
     conn->tgt_port);
-  table->field[7]->store(
+  table->field[SPIDER_XA_MEMBER_SOCKET_POS]->store(
     conn->tgt_socket,
     (uint) conn->tgt_socket_length,
     system_charset_info);
@@ -888,104 +987,115 @@ void spider_store_xa_member_info(
   SPIDER_CONN *conn
 ) {
   DBUG_ENTER("spider_store_xa_member_info");
-  table->field[2]->store(xid->bqual_length);
-  table->field[4]->store(
+  table->field[SPIDER_XA_MEMBER_BQUAL_LENGTH_POS]->store(xid->bqual_length);
+  table->field[SPIDER_XA_MEMBER_SCHEME_POS]->store(
     conn->tgt_wrapper,
     (uint) conn->tgt_wrapper_length,
     system_charset_info);
-  table->field[8]->store(
+  table->field[SPIDER_XA_MEMBER_USERNAME_POS]->store(
     conn->tgt_username,
     (uint) conn->tgt_username_length,
     system_charset_info);
-  table->field[9]->store(
+  table->field[SPIDER_XA_MEMBER_PASSWORD_POS]->store(
     conn->tgt_password,
     (uint) conn->tgt_password_length,
     system_charset_info);
   if (conn->tgt_ssl_ca)
   {
-    table->field[10]->set_notnull();
-    table->field[10]->store(
+    table->field[SPIDER_XA_MEMBER_SSL_CA_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_SSL_CA_POS]->store(
       conn->tgt_ssl_ca,
       (uint) conn->tgt_ssl_ca_length,
       system_charset_info);
   } else {
-    table->field[10]->set_null();
-    table->field[10]->reset();
+    table->field[SPIDER_XA_MEMBER_SSL_CA_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_SSL_CA_POS]->reset();
   }
   if (conn->tgt_ssl_capath)
   {
-    table->field[11]->set_notnull();
-    table->field[11]->store(
+    table->field[SPIDER_XA_MEMBER_SSL_CAPATH_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_SSL_CAPATH_POS]->store(
       conn->tgt_ssl_capath,
       (uint) conn->tgt_ssl_capath_length,
       system_charset_info);
   } else {
-    table->field[11]->set_null();
-    table->field[11]->reset();
+    table->field[SPIDER_XA_MEMBER_SSL_CAPATH_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_SSL_CAPATH_POS]->reset();
   }
   if (conn->tgt_ssl_cert)
   {
-    table->field[12]->set_notnull();
-    table->field[12]->store(
+    table->field[SPIDER_XA_MEMBER_SSL_CERT_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_SSL_CERT_POS]->store(
       conn->tgt_ssl_cert,
       (uint) conn->tgt_ssl_cert_length,
       system_charset_info);
   } else {
-    table->field[12]->set_null();
-    table->field[12]->reset();
+    table->field[SPIDER_XA_MEMBER_SSL_CERT_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_SSL_CERT_POS]->reset();
   }
   if (conn->tgt_ssl_cipher)
   {
-    table->field[13]->set_notnull();
-    table->field[13]->store(
+    table->field[SPIDER_XA_MEMBER_SSL_CIPHER_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_SSL_CIPHER_POS]->store(
       conn->tgt_ssl_cipher,
       (uint) conn->tgt_ssl_cipher_length,
       system_charset_info);
   } else {
-    table->field[13]->set_null();
-    table->field[13]->reset();
+    table->field[SPIDER_XA_MEMBER_SSL_CIPHER_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_SSL_CIPHER_POS]->reset();
   }
   if (conn->tgt_ssl_key)
   {
-    table->field[14]->set_notnull();
-    table->field[14]->store(
+    table->field[SPIDER_XA_MEMBER_SSL_KEY_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_SSL_KEY_POS]->store(
       conn->tgt_ssl_key,
       (uint) conn->tgt_ssl_key_length,
       system_charset_info);
   } else {
-    table->field[14]->set_null();
-    table->field[14]->reset();
+    table->field[SPIDER_XA_MEMBER_SSL_KEY_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_SSL_KEY_POS]->reset();
   }
   if (conn->tgt_ssl_vsc >= 0)
   {
-    table->field[15]->set_notnull();
-    table->field[15]->store(
+    table->field[SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS]->store(
       conn->tgt_ssl_vsc);
   } else {
-    table->field[15]->set_null();
-    table->field[15]->reset();
+    table->field[SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS]->reset();
   }
   if (conn->tgt_default_file)
   {
-    table->field[16]->set_notnull();
-    table->field[16]->store(
+    table->field[SPIDER_XA_MEMBER_DEFAULT_FILE_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_DEFAULT_FILE_POS]->store(
       conn->tgt_default_file,
       (uint) conn->tgt_default_file_length,
       system_charset_info);
   } else {
-    table->field[16]->set_null();
-    table->field[16]->reset();
+    table->field[SPIDER_XA_MEMBER_DEFAULT_FILE_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_DEFAULT_FILE_POS]->reset();
   }
   if (conn->tgt_default_group)
   {
-    table->field[17]->set_notnull();
-    table->field[17]->store(
+    table->field[SPIDER_XA_MEMBER_DEFAULT_GROUP_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_DEFAULT_GROUP_POS]->store(
       conn->tgt_default_group,
       (uint) conn->tgt_default_group_length,
       system_charset_info);
   } else {
-    table->field[17]->set_null();
-    table->field[17]->reset();
+    table->field[SPIDER_XA_MEMBER_DEFAULT_GROUP_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_DEFAULT_GROUP_POS]->reset();
+  }
+  if (conn->tgt_dsn)
+  {
+    table->field[SPIDER_XA_MEMBER_DSN_POS]->set_notnull();
+    table->field[SPIDER_XA_MEMBER_DSN_POS]->store(
+      conn->tgt_dsn,
+      (uint) conn->tgt_dsn_length,
+      system_charset_info);
+  } else {
+    table->field[SPIDER_XA_MEMBER_DSN_POS]->set_null();
+    table->field[SPIDER_XA_MEMBER_DSN_POS]->reset();
   }
   DBUG_VOID_RETURN;
 }
@@ -1016,18 +1126,20 @@ void spider_store_tables_name(
     ptr_table = "";
     ptr_diff_table = 1;
   }
-  table->field[0]->store(
+  table->field[SPIDER_TABLES_DB_NAME_POS]->store(
     ptr_db,
     (uint)(ptr_diff_table - 1),
     system_charset_info);
-  DBUG_PRINT("info",("spider field[0]->null_bit = %d",
-    table->field[0]->null_bit));
-  table->field[1]->store(
+  DBUG_PRINT("info",("spider field[%u]->null_bit = %d",
+    SPIDER_TABLES_DB_NAME_POS,
+    table->field[SPIDER_TABLES_DB_NAME_POS]->null_bit));
+  table->field[SPIDER_TABLES_TABLE_NAME_POS]->store(
     ptr_table,
     (uint) ((my_ptrdiff_t) name_length - ptr_diff_db - ptr_diff_table),
     system_charset_info);
-  DBUG_PRINT("info",("spider field[1]->null_bit = %d",
-    table->field[1]->null_bit));
+  DBUG_PRINT("info",("spider field[%u]->null_bit = %d",
+    SPIDER_TABLES_TABLE_NAME_POS,
+    table->field[SPIDER_TABLES_TABLE_NAME_POS]->null_bit));
   DBUG_VOID_RETURN;
 }
 
@@ -1039,18 +1151,20 @@ void spider_store_db_and_table_name(
   const uint table_name_length
 ) {
   DBUG_ENTER("spider_store_db_and_table_name");
-  table->field[0]->store(
+  table->field[SPIDER_TABLES_DB_NAME_POS]->store(
     db_name,
     db_name_length,
     system_charset_info);
-  DBUG_PRINT("info",("spider field[0]->null_bit = %d",
-    table->field[0]->null_bit));
-  table->field[1]->store(
+  DBUG_PRINT("info",("spider field[%u]->null_bit = %d",
+    SPIDER_TABLES_DB_NAME_POS,
+    table->field[SPIDER_TABLES_DB_NAME_POS]->null_bit));
+  table->field[SPIDER_TABLES_TABLE_NAME_POS]->store(
     table_name,
     table_name_length,
     system_charset_info);
-  DBUG_PRINT("info",("spider field[1]->null_bit = %d",
-    table->field[1]->null_bit));
+  DBUG_PRINT("info",("spider field[%u]->null_bit = %d",
+    SPIDER_TABLES_TABLE_NAME_POS,
+    table->field[SPIDER_TABLES_TABLE_NAME_POS]->null_bit));
   DBUG_VOID_RETURN;
 }
 
@@ -1059,8 +1173,8 @@ void spider_store_tables_link_idx(
   int link_idx
 ) {
   DBUG_ENTER("spider_store_tables_link_idx");
-  table->field[2]->set_notnull();
-  table->field[2]->store(link_idx);
+  table->field[SPIDER_TABLES_LINK_ID_POS]->set_notnull();
+  table->field[SPIDER_TABLES_LINK_ID_POS]->store(link_idx);
   DBUG_VOID_RETURN;
 }
 
@@ -1070,12 +1184,13 @@ void spider_store_tables_link_idx_str(
   const uint link_idx_length
 ) {
   DBUG_ENTER("spider_store_tables_link_idx_str");
-  table->field[2]->store(
+  table->field[SPIDER_TABLES_LINK_ID_POS]->store(
     link_idx,
     link_idx_length,
     system_charset_info);
-  DBUG_PRINT("info",("spider field[2]->null_bit = %d",
-    table->field[2]->null_bit));
+  DBUG_PRINT("info",("spider field[%u]->null_bit = %d",
+    SPIDER_TABLES_LINK_ID_POS,
+    table->field[SPIDER_TABLES_LINK_ID_POS]->null_bit));
   DBUG_VOID_RETURN;
 }
 
@@ -1087,14 +1202,14 @@ void spider_store_tables_static_link_id(
   DBUG_ENTER("spider_store_tables_static_link_id");
   if (static_link_id)
   {
-    table->field[24]->set_notnull();
-    table->field[24]->store(
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->set_notnull();
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->store(
       static_link_id,
       static_link_id_length,
       system_charset_info);
   } else {
-    table->field[24]->set_null();
-    table->field[24]->reset();
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->set_null();
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->reset();
   }
   DBUG_VOID_RETURN;
 }
@@ -1105,7 +1220,7 @@ void spider_store_tables_priority(
 ) {
   DBUG_ENTER("spider_store_tables_priority");
   DBUG_PRINT("info",("spider priority = %lld", priority));
-  table->field[3]->store(priority, FALSE);
+  table->field[SPIDER_TABLES_PRIORITY_POS]->store(priority, FALSE);
   DBUG_VOID_RETURN;
 }
 
@@ -1117,209 +1232,221 @@ void spider_store_tables_connect_info(
   DBUG_ENTER("spider_store_tables_connect_info");
   if (alter_table->tmp_server_names[link_idx])
   {
-    table->field[4]->set_notnull();
-    table->field[4]->store(
+    table->field[SPIDER_TABLES_SERVER_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SERVER_POS]->store(
       alter_table->tmp_server_names[link_idx],
       (uint) alter_table->tmp_server_names_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[4]->set_null();
-    table->field[4]->reset();
+    table->field[SPIDER_TABLES_SERVER_POS]->set_null();
+    table->field[SPIDER_TABLES_SERVER_POS]->reset();
   }
   if (alter_table->tmp_tgt_wrappers[link_idx])
   {
-    table->field[5]->set_notnull();
-    table->field[5]->store(
+    table->field[SPIDER_TABLES_SCHEME_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SCHEME_POS]->store(
       alter_table->tmp_tgt_wrappers[link_idx],
       (uint) alter_table->tmp_tgt_wrappers_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[5]->set_null();
-    table->field[5]->reset();
+    table->field[SPIDER_TABLES_SCHEME_POS]->set_null();
+    table->field[SPIDER_TABLES_SCHEME_POS]->reset();
   }
   if (alter_table->tmp_tgt_hosts[link_idx])
   {
-    table->field[6]->set_notnull();
-    table->field[6]->store(
+    table->field[SPIDER_TABLES_HOST_POS]->set_notnull();
+    table->field[SPIDER_TABLES_HOST_POS]->store(
       alter_table->tmp_tgt_hosts[link_idx],
       (uint) alter_table->tmp_tgt_hosts_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[6]->set_null();
-    table->field[6]->reset();
+    table->field[SPIDER_TABLES_HOST_POS]->set_null();
+    table->field[SPIDER_TABLES_HOST_POS]->reset();
   }
   if (alter_table->tmp_tgt_ports[link_idx] >= 0)
   {
-    table->field[7]->set_notnull();
-    table->field[7]->store(
+    table->field[SPIDER_TABLES_PORT_POS]->set_notnull();
+    table->field[SPIDER_TABLES_PORT_POS]->store(
       alter_table->tmp_tgt_ports[link_idx]);
   } else {
-    table->field[7]->set_null();
-    table->field[7]->reset();
+    table->field[SPIDER_TABLES_PORT_POS]->set_null();
+    table->field[SPIDER_TABLES_PORT_POS]->reset();
   }
   if (alter_table->tmp_tgt_sockets[link_idx])
   {
-    table->field[8]->set_notnull();
-    table->field[8]->store(
+    table->field[SPIDER_TABLES_SOCKET_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SOCKET_POS]->store(
       alter_table->tmp_tgt_sockets[link_idx],
       (uint) alter_table->tmp_tgt_sockets_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[8]->set_null();
-    table->field[8]->reset();
+    table->field[SPIDER_TABLES_SOCKET_POS]->set_null();
+    table->field[SPIDER_TABLES_SOCKET_POS]->reset();
   }
   if (alter_table->tmp_tgt_usernames[link_idx])
   {
-    table->field[9]->set_notnull();
-    table->field[9]->store(
+    table->field[SPIDER_TABLES_USERNAME_POS]->set_notnull();
+    table->field[SPIDER_TABLES_USERNAME_POS]->store(
       alter_table->tmp_tgt_usernames[link_idx],
       (uint) alter_table->tmp_tgt_usernames_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[9]->set_null();
-    table->field[9]->reset();
+    table->field[SPIDER_TABLES_USERNAME_POS]->set_null();
+    table->field[SPIDER_TABLES_USERNAME_POS]->reset();
   }
   if (alter_table->tmp_tgt_passwords[link_idx])
   {
-    table->field[10]->set_notnull();
-    table->field[10]->store(
+    table->field[SPIDER_TABLES_PASSWORD_POS]->set_notnull();
+    table->field[SPIDER_TABLES_PASSWORD_POS]->store(
       alter_table->tmp_tgt_passwords[link_idx],
       (uint) alter_table->tmp_tgt_passwords_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[10]->set_null();
-    table->field[10]->reset();
+    table->field[SPIDER_TABLES_PASSWORD_POS]->set_null();
+    table->field[SPIDER_TABLES_PASSWORD_POS]->reset();
   }
   if (alter_table->tmp_tgt_ssl_cas[link_idx])
   {
-    table->field[11]->set_notnull();
-    table->field[11]->store(
+    table->field[SPIDER_TABLES_SSL_CA_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SSL_CA_POS]->store(
       alter_table->tmp_tgt_ssl_cas[link_idx],
       (uint) alter_table->tmp_tgt_ssl_cas_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[11]->set_null();
-    table->field[11]->reset();
+    table->field[SPIDER_TABLES_SSL_CA_POS]->set_null();
+    table->field[SPIDER_TABLES_SSL_CA_POS]->reset();
   }
   if (alter_table->tmp_tgt_ssl_capaths[link_idx])
   {
-    table->field[12]->set_notnull();
-    table->field[12]->store(
+    table->field[SPIDER_TABLES_SSL_CAPATH_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SSL_CAPATH_POS]->store(
       alter_table->tmp_tgt_ssl_capaths[link_idx],
       (uint) alter_table->tmp_tgt_ssl_capaths_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[12]->set_null();
-    table->field[12]->reset();
+    table->field[SPIDER_TABLES_SSL_CAPATH_POS]->set_null();
+    table->field[SPIDER_TABLES_SSL_CAPATH_POS]->reset();
   }
   if (alter_table->tmp_tgt_ssl_certs[link_idx])
   {
-    table->field[13]->set_notnull();
-    table->field[13]->store(
+    table->field[SPIDER_TABLES_SSL_CERT_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SSL_CERT_POS]->store(
       alter_table->tmp_tgt_ssl_certs[link_idx],
       (uint) alter_table->tmp_tgt_ssl_certs_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[13]->set_null();
-    table->field[13]->reset();
+    table->field[SPIDER_TABLES_SSL_CERT_POS]->set_null();
+    table->field[SPIDER_TABLES_SSL_CERT_POS]->reset();
   }
   if (alter_table->tmp_tgt_ssl_ciphers[link_idx])
   {
-    table->field[14]->set_notnull();
-    table->field[14]->store(
+    table->field[SPIDER_TABLES_SSL_CIPHER_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SSL_CIPHER_POS]->store(
       alter_table->tmp_tgt_ssl_ciphers[link_idx],
       (uint) alter_table->tmp_tgt_ssl_ciphers_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[14]->set_null();
-    table->field[14]->reset();
+    table->field[SPIDER_TABLES_SSL_CIPHER_POS]->set_null();
+    table->field[SPIDER_TABLES_SSL_CIPHER_POS]->reset();
   }
   if (alter_table->tmp_tgt_ssl_keys[link_idx])
   {
-    table->field[15]->set_notnull();
-    table->field[15]->store(
+    table->field[SPIDER_TABLES_SSL_KEY_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SSL_KEY_POS]->store(
       alter_table->tmp_tgt_ssl_keys[link_idx],
       (uint) alter_table->tmp_tgt_ssl_keys_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[15]->set_null();
-    table->field[15]->reset();
+    table->field[SPIDER_TABLES_SSL_KEY_POS]->set_null();
+    table->field[SPIDER_TABLES_SSL_KEY_POS]->reset();
   }
   if (alter_table->tmp_tgt_ssl_vscs[link_idx] >= 0)
   {
-    table->field[16]->set_notnull();
-    table->field[16]->store(
+    table->field[SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS]->set_notnull();
+    table->field[SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS]->store(
       alter_table->tmp_tgt_ssl_vscs[link_idx]);
   } else {
-    table->field[16]->set_null();
-    table->field[16]->reset();
+    table->field[SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS]->set_null();
+    table->field[SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS]->reset();
   }
-  table->field[17]->set_notnull();
+  table->field[SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS]->
+    set_notnull();
   if (alter_table->tmp_monitoring_binlog_pos_at_failing[link_idx] >= 0)
   {
-    table->field[17]->store(
+    table->field[SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS]->store(
       alter_table->tmp_monitoring_binlog_pos_at_failing[link_idx]);
   } else {
-    table->field[17]->store(0);
+    table->field[SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS]->store(0);
   }
   if (alter_table->tmp_tgt_default_files[link_idx])
   {
-    table->field[18]->set_notnull();
-    table->field[18]->store(
+    table->field[SPIDER_TABLES_DEFAULT_FILE_POS]->set_notnull();
+    table->field[SPIDER_TABLES_DEFAULT_FILE_POS]->store(
       alter_table->tmp_tgt_default_files[link_idx],
       (uint) alter_table->tmp_tgt_default_files_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[18]->set_null();
-    table->field[18]->reset();
+    table->field[SPIDER_TABLES_DEFAULT_FILE_POS]->set_null();
+    table->field[SPIDER_TABLES_DEFAULT_FILE_POS]->reset();
   }
   if (alter_table->tmp_tgt_default_groups[link_idx])
   {
-    table->field[19]->set_notnull();
-    table->field[19]->store(
+    table->field[SPIDER_TABLES_DEFAULT_GROUP_POS]->set_notnull();
+    table->field[SPIDER_TABLES_DEFAULT_GROUP_POS]->store(
       alter_table->tmp_tgt_default_groups[link_idx],
       (uint) alter_table->tmp_tgt_default_groups_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[19]->set_null();
-    table->field[19]->reset();
+    table->field[SPIDER_TABLES_DEFAULT_GROUP_POS]->set_null();
+    table->field[SPIDER_TABLES_DEFAULT_GROUP_POS]->reset();
+  }
+  if (alter_table->tmp_tgt_dsns[link_idx])
+  {
+    table->field[SPIDER_TABLES_DSN_POS]->set_notnull();
+    table->field[SPIDER_TABLES_DSN_POS]->store(
+      alter_table->tmp_tgt_dsns[link_idx],
+      (uint) alter_table->tmp_tgt_dsns_lengths[link_idx],
+      system_charset_info);
+  } else {
+    table->field[SPIDER_TABLES_DSN_POS]->set_null();
+    table->field[SPIDER_TABLES_DSN_POS]->reset();
   }
   if (alter_table->tmp_tgt_dbs[link_idx])
   {
-    table->field[20]->set_notnull();
-    table->field[20]->store(
+    table->field[SPIDER_TABLES_TGT_DB_NAME_POS]->set_notnull();
+    table->field[SPIDER_TABLES_TGT_DB_NAME_POS]->store(
       alter_table->tmp_tgt_dbs[link_idx],
       (uint) alter_table->tmp_tgt_dbs_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[20]->set_null();
-    table->field[20]->reset();
+    table->field[SPIDER_TABLES_TGT_DB_NAME_POS]->set_null();
+    table->field[SPIDER_TABLES_TGT_DB_NAME_POS]->reset();
   }
   if (alter_table->tmp_tgt_table_names[link_idx])
   {
-    table->field[21]->set_notnull();
-    table->field[21]->store(
+    table->field[SPIDER_TABLES_TGT_TABLE_NAME_POS]->set_notnull();
+    table->field[SPIDER_TABLES_TGT_TABLE_NAME_POS]->store(
       alter_table->tmp_tgt_table_names[link_idx],
       (uint) alter_table->tmp_tgt_table_names_lengths[link_idx],
       system_charset_info);
   } else {
-    table->field[21]->set_null();
-    table->field[21]->reset();
+    table->field[SPIDER_TABLES_TGT_TABLE_NAME_POS]->set_null();
+    table->field[SPIDER_TABLES_TGT_TABLE_NAME_POS]->reset();
   }
-  table->field[23]->store((longlong) 0, FALSE);
+  table->field[SPIDER_TABLES_BLOCK_STATUS_POS]->store((longlong) 0, FALSE);
   if (alter_table->tmp_static_link_ids[link_idx])
   {
     DBUG_PRINT("info",("spider static_link_id[%d] = %s",
       link_idx, alter_table->tmp_static_link_ids[link_idx]));
-    table->field[24]->set_notnull();
-    table->field[24]->store(
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->set_notnull();
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->store(
       alter_table->tmp_static_link_ids[link_idx],
       (uint) alter_table->tmp_static_link_ids_lengths[link_idx],
       system_charset_info);
   } else {
     DBUG_PRINT("info",("spider static_link_id[%d] = NULL", link_idx));
-    table->field[24]->set_null();
-    table->field[24]->reset();
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->set_null();
+    table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->reset();
   }
   DBUG_VOID_RETURN;
 }
@@ -1331,17 +1458,7 @@ void spider_store_tables_link_status(
   DBUG_ENTER("spider_store_tables_link_status");
   DBUG_PRINT("info",("spider link_status = %ld", link_status));
   if (link_status > SPIDER_LINK_STATUS_NO_CHANGE)
-    table->field[22]->store(link_status, FALSE);
-  DBUG_VOID_RETURN;
-}
-
-void spider_store_link_chk_server_id(
-  TABLE *table,
-  uint32 server_id
-) {
-  DBUG_ENTER("spider_store_link_chk_server_id");
-  table->field[3]->set_notnull();
-  table->field[3]->store(server_id);
+    table->field[SPIDER_TABLES_LINK_STATUS_POS]->store(link_status, FALSE);
   DBUG_VOID_RETURN;
 }
 
@@ -1350,8 +1467,10 @@ void spider_store_binlog_pos_failed_link_idx(
   int failed_link_idx
 ) {
   DBUG_ENTER("spider_store_binlog_pos_failed_link_idx");
-  table->field[2]->set_notnull();
-  table->field[2]->store(failed_link_idx);
+  table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_FAILED_LINK_ID_POS]->
+    set_notnull();
+  table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_FAILED_LINK_ID_POS]->
+    store(failed_link_idx);
   DBUG_VOID_RETURN;
 }
 
@@ -1360,8 +1479,10 @@ void spider_store_binlog_pos_source_link_idx(
   int source_link_idx
 ) {
   DBUG_ENTER("spider_store_binlog_pos_source_link_idx");
-  table->field[3]->set_notnull();
-  table->field[3]->store(source_link_idx);
+  table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_SOURCE_LINK_ID_POS]->
+    set_notnull();
+  table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_SOURCE_LINK_ID_POS]->
+    store(source_link_idx);
   DBUG_VOID_RETURN;
 }
 
@@ -1377,22 +1498,25 @@ void spider_store_binlog_pos_binlog_file(
   if (!file_name)
   {
     DBUG_PRINT("info",("spider file_name is NULL"));
-    table->field[4]->set_null();
-    table->field[4]->reset();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_FILE_POS]->set_null();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_FILE_POS]->reset();
   } else {
     DBUG_PRINT("info",("spider file_name = %s", file_name));
-    table->field[4]->set_notnull();
-    table->field[4]->store(file_name, file_name_length, binlog_pos_cs);
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_FILE_POS]->set_notnull();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_FILE_POS]->store(
+      file_name, file_name_length, binlog_pos_cs);
   }
   if (!position)
   {
     DBUG_PRINT("info",("spider position is NULL"));
-    table->field[5]->set_null();
-    table->field[5]->reset();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_POSITION_POS]->set_null();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_POSITION_POS]->reset();
   } else {
     DBUG_PRINT("info",("spider position = %s", position));
-    table->field[5]->set_notnull();
-    table->field[5]->store(position, position_length, binlog_pos_cs);
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_POSITION_POS]->
+      set_notnull();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_POSITION_POS]->store(
+      position, position_length, binlog_pos_cs);
   }
   DBUG_VOID_RETURN;
 }
@@ -1407,12 +1531,13 @@ void spider_store_binlog_pos_gtid(
   if (!gtid)
   {
     DBUG_PRINT("info",("spider gtid is NULL"));
-    table->field[6]->set_null();
-    table->field[6]->reset();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_GTID_POS]->set_null();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_GTID_POS]->reset();
   } else {
     DBUG_PRINT("info",("spider gtid = %s", gtid));
-    table->field[6]->set_notnull();
-    table->field[6]->store(gtid, gtid_length, binlog_pos_cs);
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_GTID_POS]->set_notnull();
+    table->field[SPIDER_TABLE_POSITION_FOR_RECOVERY_GTID_POS]->store(
+      gtid, gtid_length, binlog_pos_cs);
   }
   DBUG_VOID_RETURN;
 }
@@ -1423,24 +1548,30 @@ void spider_store_table_sts_info(
 ) {
   MYSQL_TIME mysql_time;
   DBUG_ENTER("spider_store_table_sts_info");
-  table->field[2]->store((longlong) stat->data_file_length, TRUE);
-  table->field[3]->store((longlong) stat->max_data_file_length, TRUE);
-  table->field[4]->store((longlong) stat->index_file_length, TRUE);
-  table->field[5]->store((longlong) stat->records, TRUE);
-  table->field[6]->store((longlong) stat->mean_rec_length, TRUE);
+  table->field[SPIDER_TABLE_STS_DATA_FILE_LENGTH_POS]->store(
+    (longlong) stat->data_file_length, TRUE);
+  table->field[SPIDER_TABLE_STS_MAX_DATA_FILE_LENGTH_POS]->store(
+    (longlong) stat->max_data_file_length, TRUE);
+  table->field[SPIDER_TABLE_STS_INDEX_FILE_LENGTH_POS]->store(
+    (longlong) stat->index_file_length, TRUE);
+  table->field[SPIDER_TABLE_STS_RECORDS_POS]->store(
+    (longlong) stat->records, TRUE);
+  table->field[SPIDER_TABLE_STS_MEAN_REC_LENGTH_POS]->store(
+    (longlong) stat->mean_rec_length, TRUE);
   spd_tz_system->gmt_sec_to_TIME(&mysql_time, (my_time_t) stat->check_time);
-  table->field[7]->store_time(&mysql_time);
+  table->field[SPIDER_TABLE_STS_CHECK_TIME_POS]->store_time(&mysql_time);
   spd_tz_system->gmt_sec_to_TIME(&mysql_time, (my_time_t) stat->create_time);
-  table->field[8]->store_time(&mysql_time);
+  table->field[SPIDER_TABLE_STS_CREATE_TIME_POS]->store_time(&mysql_time);
   spd_tz_system->gmt_sec_to_TIME(&mysql_time, (my_time_t) stat->update_time);
-  table->field[9]->store_time(&mysql_time);
+  table->field[SPIDER_TABLE_STS_UPDATE_TIME_POS]->store_time(&mysql_time);
   if (stat->checksum_null)
   {
-    table->field[10]->set_null();
-    table->field[10]->reset();
+    table->field[SPIDER_TABLE_STS_CHECKSUM_POS]->set_null();
+    table->field[SPIDER_TABLE_STS_CHECKSUM_POS]->reset();
   } else {
-    table->field[10]->set_notnull();
-    table->field[10]->store((longlong) stat->checksum, TRUE);
+    table->field[SPIDER_TABLE_STS_CHECKSUM_POS]->set_notnull();
+    table->field[SPIDER_TABLE_STS_CHECKSUM_POS]->store(
+      (longlong) stat->checksum, TRUE);
   }
   DBUG_VOID_RETURN;
 }
@@ -1451,8 +1582,9 @@ void spider_store_table_crd_info(
   longlong *cardinality
 ) {
   DBUG_ENTER("spider_store_table_crd_info");
-  table->field[2]->store((longlong) *seq, TRUE);
-  table->field[3]->store((longlong) *cardinality, FALSE);
+  table->field[SPIDER_TABLE_CRD_KEY_SEQ_POS]->store((longlong) *seq, TRUE);
+  table->field[SPIDER_TABLE_CRD_CARDINALITY_POS]->store(
+    (longlong) *cardinality, FALSE);
   DBUG_VOID_RETURN;
 }
 
@@ -1652,7 +1784,8 @@ int spider_log_tables_link_failed(
   spider_store_tables_link_idx(table, link_idx);
 #if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100000
 #else
-  if (table->field[3] == table->timestamp_field)
+  if (table->field[SPIDER_LINK_FAILED_LOG_FAILED_TIME_POS] ==
+    table->timestamp_field)
     table->timestamp_field->set_time();
 #endif
   if ((error_num = spider_write_sys_table_row(table)))
@@ -1676,20 +1809,22 @@ int spider_log_xa_failed(
   spider_store_xa_member_info(table, xid, conn);
   if (thd)
   {
-    table->field[18]->set_notnull();
-    table->field[18]->store(thd->thread_id, TRUE);
+    table->field[SPIDER_XA_FAILED_LOG_THREAD_ID_POS]->set_notnull();
+    table->field[SPIDER_XA_FAILED_LOG_THREAD_ID_POS]->store(
+      thd->thread_id, TRUE);
   } else {
-    table->field[18]->set_null();
-    table->field[18]->reset();
+    table->field[SPIDER_XA_FAILED_LOG_THREAD_ID_POS]->set_null();
+    table->field[SPIDER_XA_FAILED_LOG_THREAD_ID_POS]->reset();
   }
-  table->field[19]->store(
+  table->field[SPIDER_XA_FAILED_LOG_STATUS_POS]->store(
     status,
     (uint) strlen(status),
     system_charset_info);
 
 #if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID >= 100000
 #else
-  if (table->field[20] == table->timestamp_field)
+  if (table->field[SPIDER_XA_FAILED_LOG_FAILED_TIME_POS] ==
+    table->timestamp_field)
     table->timestamp_field->set_time();
 #endif
   if ((error_num = spider_write_sys_table_row(table)))
@@ -2079,25 +2214,25 @@ int spider_get_sys_xid(
 ) {
   char *ptr;
   DBUG_ENTER("spider_get_sys_xid");
-  ptr = get_field(mem_root, table->field[0]);
+  ptr = get_field(mem_root, table->field[SPIDER_XA_FORMAT_ID_POS]);
   if (ptr)
   {
     xid->formatID = atoi(ptr);
   } else
     xid->formatID = 0;
-  ptr = get_field(mem_root, table->field[1]);
+  ptr = get_field(mem_root, table->field[SPIDER_XA_GTRID_LENGTH_POS]);
   if (ptr)
   {
     xid->gtrid_length = atoi(ptr);
   } else
     xid->gtrid_length = 0;
-  ptr = get_field(mem_root, table->field[2]);
+  ptr = get_field(mem_root, table->field[SPIDER_XA_BQUAL_LENGTH_POS]);
   if (ptr)
   {
     xid->bqual_length = atoi(ptr);
   } else
     xid->bqual_length = 0;
-  ptr = get_field(mem_root, table->field[3]);
+  ptr = get_field(mem_root, table->field[SPIDER_XA_DATA_POS]);
   if (ptr)
   {
     strmov(xid->data, ptr);
@@ -2113,7 +2248,7 @@ int spider_get_sys_server_info(
 ) {
   char *ptr;
   DBUG_ENTER("spider_get_sys_server_info");
-  if ((ptr = get_field(mem_root, table->field[4])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SCHEME_POS])))
   {
     share->tgt_wrappers_lengths[link_idx] = strlen(ptr);
     share->tgt_wrappers[link_idx] = spider_create_string(ptr,
@@ -2122,7 +2257,7 @@ int spider_get_sys_server_info(
     share->tgt_wrappers_lengths[link_idx] = 0;
     share->tgt_wrappers[link_idx] = NULL;
   }
-  if ((ptr = get_field(mem_root, table->field[5])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_HOST_POS])))
   {
     share->tgt_hosts_lengths[link_idx] = strlen(ptr);
     share->tgt_hosts[link_idx] = spider_create_string(ptr,
@@ -2131,12 +2266,12 @@ int spider_get_sys_server_info(
     share->tgt_hosts_lengths[link_idx] = 0;
     share->tgt_hosts[link_idx] = NULL;
   }
-  if ((ptr = get_field(mem_root, table->field[6])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_PORT_POS])))
   {
     share->tgt_ports[link_idx] = atol(ptr);
   } else
     share->tgt_ports[link_idx] = MYSQL_PORT;
-  if ((ptr = get_field(mem_root, table->field[7])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SOCKET_POS])))
   {
     share->tgt_sockets_lengths[link_idx] = strlen(ptr);
     share->tgt_sockets[link_idx] = spider_create_string(ptr,
@@ -2145,7 +2280,7 @@ int spider_get_sys_server_info(
     share->tgt_sockets_lengths[link_idx] = 0;
     share->tgt_sockets[link_idx] = NULL;
   }
-  if ((ptr = get_field(mem_root, table->field[8])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_USERNAME_POS])))
   {
     share->tgt_usernames_lengths[link_idx] = strlen(ptr);
     share->tgt_usernames[link_idx] =
@@ -2154,7 +2289,7 @@ int spider_get_sys_server_info(
     share->tgt_usernames_lengths[link_idx] = 0;
     share->tgt_usernames[link_idx] = NULL;
   }
-  if ((ptr = get_field(mem_root, table->field[9])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_PASSWORD_POS])))
   {
     share->tgt_passwords_lengths[link_idx] = strlen(ptr);
     share->tgt_passwords[link_idx] =
@@ -2164,8 +2299,8 @@ int spider_get_sys_server_info(
     share->tgt_passwords[link_idx] = NULL;
   }
   if (
-    !table->field[10]->is_null() &&
-    (ptr = get_field(mem_root, table->field[10]))
+    !table->field[SPIDER_XA_MEMBER_SSL_CA_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SSL_CA_POS]))
   ) {
     share->tgt_ssl_cas_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_cas[link_idx] =
@@ -2175,8 +2310,8 @@ int spider_get_sys_server_info(
     share->tgt_ssl_cas[link_idx] = NULL;
   }
   if (
-    !table->field[11]->is_null() &&
-    (ptr = get_field(mem_root, table->field[11]))
+    !table->field[SPIDER_XA_MEMBER_SSL_CAPATH_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SSL_CAPATH_POS]))
   ) {
     share->tgt_ssl_capaths_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_capaths[link_idx] =
@@ -2186,8 +2321,8 @@ int spider_get_sys_server_info(
     share->tgt_ssl_capaths[link_idx] = NULL;
   }
   if (
-    !table->field[12]->is_null() &&
-    (ptr = get_field(mem_root, table->field[12]))
+    !table->field[SPIDER_XA_MEMBER_SSL_CERT_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SSL_CERT_POS]))
   ) {
     share->tgt_ssl_certs_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_certs[link_idx] =
@@ -2197,8 +2332,8 @@ int spider_get_sys_server_info(
     share->tgt_ssl_certs[link_idx] = NULL;
   }
   if (
-    !table->field[13]->is_null() &&
-    (ptr = get_field(mem_root, table->field[13]))
+    !table->field[SPIDER_XA_MEMBER_SSL_CIPHER_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SSL_CIPHER_POS]))
   ) {
     share->tgt_ssl_ciphers_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_ciphers[link_idx] =
@@ -2208,8 +2343,8 @@ int spider_get_sys_server_info(
     share->tgt_ssl_ciphers[link_idx] = NULL;
   }
   if (
-    !table->field[14]->is_null() &&
-    (ptr = get_field(mem_root, table->field[14]))
+    !table->field[SPIDER_XA_MEMBER_SSL_KEY_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_SSL_KEY_POS]))
   ) {
     share->tgt_ssl_keys_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_keys[link_idx] =
@@ -2219,15 +2354,17 @@ int spider_get_sys_server_info(
     share->tgt_ssl_keys[link_idx] = NULL;
   }
   if (
-    !table->field[15]->is_null() &&
-    (ptr = get_field(mem_root, table->field[15]))
+    !table->field[SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->
+      field[SPIDER_XA_MEMBER_SSL_VERIFY_SERVER_CERT_POS]))
   ) {
     share->tgt_ssl_vscs[link_idx] = atol(ptr);
   } else
     share->tgt_ssl_vscs[link_idx] = 0;
   if (
-    !table->field[16]->is_null() &&
-    (ptr = get_field(mem_root, table->field[16]))
+    !table->field[SPIDER_XA_MEMBER_DEFAULT_FILE_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->
+      field[SPIDER_XA_MEMBER_DEFAULT_FILE_POS]))
   ) {
     share->tgt_default_files_lengths[link_idx] = strlen(ptr);
     share->tgt_default_files[link_idx] =
@@ -2237,8 +2374,9 @@ int spider_get_sys_server_info(
     share->tgt_default_files[link_idx] = NULL;
   }
   if (
-    !table->field[17]->is_null() &&
-    (ptr = get_field(mem_root, table->field[17]))
+    !table->field[SPIDER_XA_MEMBER_DEFAULT_GROUP_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->
+      field[SPIDER_XA_MEMBER_DEFAULT_GROUP_POS]))
   ) {
     share->tgt_default_groups_lengths[link_idx] = strlen(ptr);
     share->tgt_default_groups[link_idx] =
@@ -2247,6 +2385,17 @@ int spider_get_sys_server_info(
     share->tgt_default_groups_lengths[link_idx] = 0;
     share->tgt_default_groups[link_idx] = NULL;
   }
+  if (
+    !table->field[SPIDER_XA_MEMBER_DSN_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_XA_MEMBER_DSN_POS]))
+  ) {
+    share->tgt_dsns_lengths[link_idx] = strlen(ptr);
+    share->tgt_dsns[link_idx] =
+      spider_create_string(ptr, share->tgt_dsns_lengths[link_idx]);
+  } else {
+    share->tgt_dsns_lengths[link_idx] = 0;
+    share->tgt_dsns[link_idx] = NULL;
+  }
   DBUG_RETURN(0);
 }
 
@@ -2261,7 +2410,7 @@ int spider_check_sys_xa_status(
   char *ptr;
   int error_num;
   DBUG_ENTER("spider_check_sys_xa_status");
-  ptr = get_field(mem_root, table->field[4]);
+  ptr = get_field(mem_root, table->field[SPIDER_XA_STATUS_POS]);
   if (ptr)
   {
     if (
@@ -2285,13 +2434,13 @@ int spider_get_sys_tables(
 ) {
   char *ptr;
   DBUG_ENTER("spider_get_sys_tables");
-  if ((ptr = get_field(mem_root, table->field[0])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_TABLES_DB_NAME_POS])))
   {
     *db_name = spider_create_string(ptr, strlen(ptr));
   } else {
     *db_name = NULL;
   }
-  if ((ptr = get_field(mem_root, table->field[1])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_TABLES_TABLE_NAME_POS])))
   {
     *table_name = spider_create_string(ptr, strlen(ptr));
   } else {
@@ -2309,55 +2458,67 @@ int spider_get_sys_tables_connect_info(
   char *ptr;
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_tables_connect_info");
-  if ((ptr = get_field(mem_root, table->field[3])))
+  DBUG_PRINT("info",("spider link_idx:%d", link_idx));
+  if ((ptr = get_field(mem_root, table->field[SPIDER_TABLES_PRIORITY_POS])))
   {
     share->priority = my_strtoll10(ptr, (char**) NULL, &error_num);
   } else
     share->priority = 1000000;
+  DBUG_PRINT("info",("spider priority:%lld", share->priority));
   if (
-    !table->field[4]->is_null() &&
-    (ptr = get_field(mem_root, table->field[4]))
+    !table->field[SPIDER_TABLES_SERVER_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SERVER_POS]))
   ) {
     share->server_names_lengths[link_idx] = strlen(ptr);
     share->server_names[link_idx] =
       spider_create_string(ptr, share->server_names_lengths[link_idx]);
+    DBUG_PRINT("info",("spider server_name:%s",
+      share->server_names[link_idx]));
   } else {
     share->server_names_lengths[link_idx] = 0;
     share->server_names[link_idx] = NULL;
+    DBUG_PRINT("info",("spider server_name is NULL"));
   }
   if (
-    !table->field[5]->is_null() &&
-    (ptr = get_field(mem_root, table->field[5]))
+    !table->field[SPIDER_TABLES_SCHEME_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SCHEME_POS]))
   ) {
     share->tgt_wrappers_lengths[link_idx] = strlen(ptr);
     share->tgt_wrappers[link_idx] =
       spider_create_string(ptr, share->tgt_wrappers_lengths[link_idx]);
+    DBUG_PRINT("info",("spider tgt_wrapper:%s",
+      share->tgt_wrappers[link_idx]));
   } else {
     share->tgt_wrappers_lengths[link_idx] = 0;
     share->tgt_wrappers[link_idx] = NULL;
+    DBUG_PRINT("info",("spider tgt_wrapper is NULL"));
   }
   if (
-    !table->field[6]->is_null() &&
-    (ptr = get_field(mem_root, table->field[6]))
+    !table->field[SPIDER_TABLES_HOST_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_HOST_POS]))
   ) {
     share->tgt_hosts_lengths[link_idx] = strlen(ptr);
     share->tgt_hosts[link_idx] =
       spider_create_string(ptr, share->tgt_hosts_lengths[link_idx]);
+    DBUG_PRINT("info",("spider tgt_host:%s",
+      share->tgt_hosts[link_idx]));
   } else {
     share->tgt_hosts_lengths[link_idx] = 0;
     share->tgt_hosts[link_idx] = NULL;
+    DBUG_PRINT("info",("spider tgt_host is NULL"));
   }
   if (
-    !table->field[7]->is_null() &&
-    (ptr = get_field(mem_root, table->field[7]))
+    !table->field[SPIDER_TABLES_PORT_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_PORT_POS]))
   ) {
     share->tgt_ports[link_idx] = atol(ptr);
   } else {
     share->tgt_ports[link_idx] = -1;
   }
+  DBUG_PRINT("info",("spider port:%ld", share->tgt_ports[link_idx]));
   if (
-    !table->field[8]->is_null() &&
-    (ptr = get_field(mem_root, table->field[8]))
+    !table->field[SPIDER_TABLES_SOCKET_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SOCKET_POS]))
   ) {
     share->tgt_sockets_lengths[link_idx] = strlen(ptr);
     share->tgt_sockets[link_idx] =
@@ -2367,8 +2528,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_sockets[link_idx] = NULL;
   }
   if (
-    !table->field[9]->is_null() &&
-    (ptr = get_field(mem_root, table->field[9]))
+    !table->field[SPIDER_TABLES_USERNAME_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_USERNAME_POS]))
   ) {
     share->tgt_usernames_lengths[link_idx] = strlen(ptr);
     share->tgt_usernames[link_idx] =
@@ -2378,8 +2539,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_usernames[link_idx] = NULL;
   }
   if (
-    !table->field[10]->is_null() &&
-    (ptr = get_field(mem_root, table->field[10]))
+    !table->field[SPIDER_TABLES_PASSWORD_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_PASSWORD_POS]))
   ) {
     share->tgt_passwords_lengths[link_idx] = strlen(ptr);
     share->tgt_passwords[link_idx] =
@@ -2389,8 +2550,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_passwords[link_idx] = NULL;
   }
   if (
-    !table->field[11]->is_null() &&
-    (ptr = get_field(mem_root, table->field[11]))
+    !table->field[SPIDER_TABLES_SSL_CA_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SSL_CA_POS]))
   ) {
     share->tgt_ssl_cas_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_cas[link_idx] =
@@ -2400,8 +2561,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_ssl_cas[link_idx] = NULL;
   }
   if (
-    !table->field[12]->is_null() &&
-    (ptr = get_field(mem_root, table->field[12]))
+    !table->field[SPIDER_TABLES_SSL_CAPATH_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SSL_CAPATH_POS]))
   ) {
     share->tgt_ssl_capaths_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_capaths[link_idx] =
@@ -2411,8 +2572,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_ssl_capaths[link_idx] = NULL;
   }
   if (
-    !table->field[13]->is_null() &&
-    (ptr = get_field(mem_root, table->field[13]))
+    !table->field[SPIDER_TABLES_SSL_CERT_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SSL_CERT_POS]))
   ) {
     share->tgt_ssl_certs_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_certs[link_idx] =
@@ -2422,8 +2583,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_ssl_certs[link_idx] = NULL;
   }
   if (
-    !table->field[14]->is_null() &&
-    (ptr = get_field(mem_root, table->field[14]))
+    !table->field[SPIDER_TABLES_SSL_CIPHER_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SSL_CIPHER_POS]))
   ) {
     share->tgt_ssl_ciphers_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_ciphers[link_idx] =
@@ -2433,8 +2594,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_ssl_ciphers[link_idx] = NULL;
   }
   if (
-    !table->field[15]->is_null() &&
-    (ptr = get_field(mem_root, table->field[15]))
+    !table->field[SPIDER_TABLES_SSL_KEY_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_SSL_KEY_POS]))
   ) {
     share->tgt_ssl_keys_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_keys[link_idx] =
@@ -2444,22 +2605,25 @@ int spider_get_sys_tables_connect_info(
     share->tgt_ssl_keys[link_idx] = NULL;
   }
   if (
-    !table->field[16]->is_null() &&
-    (ptr = get_field(mem_root, table->field[16]))
+    !table->field[SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_TABLES_SSL_VERIFY_SERVER_CERT_POS]))
   ) {
     share->tgt_ssl_vscs[link_idx] = atol(ptr);
   } else
     share->tgt_ssl_vscs[link_idx] = -1;
   if (
-    !table->field[17]->is_null() &&
-    (ptr = get_field(mem_root, table->field[17]))
+    !table->field[SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS]->
+      is_null() &&
+    (ptr = get_field(mem_root, table->
+      field[SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS]))
   ) {
     share->monitoring_binlog_pos_at_failing[link_idx] = atol(ptr);
   } else
     share->monitoring_binlog_pos_at_failing[link_idx] = 0;
   if (
-    !table->field[18]->is_null() &&
-    (ptr = get_field(mem_root, table->field[18]))
+    !table->field[SPIDER_TABLES_DEFAULT_FILE_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_DEFAULT_FILE_POS]))
   ) {
     share->tgt_default_files_lengths[link_idx] = strlen(ptr);
     share->tgt_default_files[link_idx] =
@@ -2469,8 +2633,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_default_files[link_idx] = NULL;
   }
   if (
-    !table->field[19]->is_null() &&
-    (ptr = get_field(mem_root, table->field[19]))
+    !table->field[SPIDER_TABLES_DEFAULT_GROUP_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_DEFAULT_GROUP_POS]))
   ) {
     share->tgt_default_groups_lengths[link_idx] = strlen(ptr);
     share->tgt_default_groups[link_idx] =
@@ -2480,8 +2644,19 @@ int spider_get_sys_tables_connect_info(
     share->tgt_default_groups[link_idx] = NULL;
   }
   if (
-    !table->field[20]->is_null() &&
-    (ptr = get_field(mem_root, table->field[20]))
+    !table->field[SPIDER_TABLES_DSN_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_DSN_POS]))
+  ) {
+    share->tgt_dsns_lengths[link_idx] = strlen(ptr);
+    share->tgt_dsns[link_idx] =
+      spider_create_string(ptr, share->tgt_dsns_lengths[link_idx]);
+  } else {
+    share->tgt_dsns_lengths[link_idx] = 0;
+    share->tgt_dsns[link_idx] = NULL;
+  }
+  if (
+    !table->field[SPIDER_TABLES_TGT_DB_NAME_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_TGT_DB_NAME_POS]))
   ) {
     share->tgt_dbs_lengths[link_idx] = strlen(ptr);
     share->tgt_dbs[link_idx] =
@@ -2491,8 +2666,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_dbs[link_idx] = NULL;
   }
   if (
-    !table->field[21]->is_null() &&
-    (ptr = get_field(mem_root, table->field[21]))
+    !table->field[SPIDER_TABLES_TGT_TABLE_NAME_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_TGT_TABLE_NAME_POS]))
   ) {
     share->tgt_table_names_lengths[link_idx] = strlen(ptr);
     share->tgt_table_names[link_idx] =
@@ -2502,8 +2677,8 @@ int spider_get_sys_tables_connect_info(
     share->tgt_table_names[link_idx] = NULL;
   }
   if (
-    !table->field[24]->is_null() &&
-    (ptr = get_field(mem_root, table->field[24]))
+    !table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]))
   ) {
     share->static_link_ids_lengths[link_idx] = strlen(ptr);
     share->static_link_ids[link_idx] =
@@ -2523,7 +2698,8 @@ int spider_get_sys_tables_monitoring_binlog_pos_at_failing(
   char *ptr;
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_tables_monitoring_binlog_pos_at_failing");
-  if ((ptr = get_field(mem_root, table->field[17])))
+  if ((ptr = get_field(mem_root, table->
+    field[SPIDER_TABLES_MONITORING_BINLOG_POS_AT_FAILING_POS])))
     *monitoring_binlog_pos_at_failing = (long) my_strtoll10(ptr, (char**) NULL,
       &error_num);
   else
@@ -2542,7 +2718,7 @@ int spider_get_sys_tables_link_status(
   char *ptr;
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_tables_link_status");
-  if ((ptr = get_field(mem_root, table->field[22])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_TABLES_LINK_STATUS_POS])))
   {
     share->link_statuses[link_idx] =
       (long) my_strtoll10(ptr, (char**) NULL, &error_num);
@@ -2561,7 +2737,7 @@ int spider_get_sys_tables_link_status(
   char *ptr;
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_tables_link_status");
-  if ((ptr = get_field(mem_root, table->field[22])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_TABLES_LINK_STATUS_POS])))
     *link_status = (long) my_strtoll10(ptr, (char**) NULL, &error_num);
   else
     *link_status = 1;
@@ -2577,7 +2753,7 @@ int spider_get_sys_tables_link_idx(
   char *ptr;
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_tables_link_idx");
-  if ((ptr = get_field(mem_root, table->field[2])))
+  if ((ptr = get_field(mem_root, table->field[SPIDER_TABLES_LINK_ID_POS])))
     *link_idx = (int) my_strtoll10(ptr, (char**) NULL, &error_num);
   else
     *link_idx = 1;
@@ -2595,8 +2771,9 @@ int spider_get_sys_tables_static_link_id(
   DBUG_ENTER("spider_get_sys_tables_static_link_id");
   *static_link_id = NULL;
   if (
-    !table->field[24]->is_null() &&
-    (*static_link_id = get_field(mem_root, table->field[24]))
+    !table->field[SPIDER_TABLES_STATIC_LINK_ID_POS]->is_null() &&
+    (*static_link_id = get_field(mem_root, table->
+      field[SPIDER_TABLES_STATIC_LINK_ID_POS]))
   ) {
     *static_link_id_length = strlen(*static_link_id);
   } else {
@@ -2618,12 +2795,18 @@ void spider_get_sys_table_sts_info(
 #endif
   long not_used_long;
   DBUG_ENTER("spider_get_sys_table_sts_info");
-  stat->data_file_length = (ulonglong) table->field[2]->val_int();
-  stat->max_data_file_length = (ulonglong) table->field[3]->val_int();
-  stat->index_file_length = (ulonglong) table->field[4]->val_int();
-  stat->records = (ha_rows) table->field[5]->val_int();
-  stat->mean_rec_length = (ulong) table->field[6]->val_int();
-  table->field[7]->get_date(&mysql_time, SPIDER_date_mode_t(0));
+  stat->data_file_length = (ulonglong) table->
+    field[SPIDER_TABLE_STS_DATA_FILE_LENGTH_POS]->val_int();
+  stat->max_data_file_length = (ulonglong) table->
+    field[SPIDER_TABLE_STS_MAX_DATA_FILE_LENGTH_POS]->val_int();
+  stat->index_file_length = (ulonglong) table->
+    field[SPIDER_TABLE_STS_INDEX_FILE_LENGTH_POS]->val_int();
+  stat->records = (ha_rows) table->
+    field[SPIDER_TABLE_STS_RECORDS_POS]->val_int();
+  stat->mean_rec_length = (ulong) table->
+    field[SPIDER_TABLE_STS_MEAN_REC_LENGTH_POS]->val_int();
+  table->field[SPIDER_TABLE_STS_CHECK_TIME_POS]->get_date(&mysql_time,
+    SPIDER_date_mode_t(0));
 #ifdef MARIADB_BASE_VERSION
   stat->check_time = (time_t) my_system_gmt_sec(&mysql_time,
     &not_used_long, &not_used_uint);
@@ -2631,7 +2814,8 @@ void spider_get_sys_table_sts_info(
   stat->check_time = (time_t) my_system_gmt_sec(&mysql_time,
     &not_used_long, &not_used_my_bool);
 #endif
-  table->field[8]->get_date(&mysql_time, SPIDER_date_mode_t(0));
+  table->field[SPIDER_TABLE_STS_CREATE_TIME_POS]->get_date(&mysql_time,
+    SPIDER_date_mode_t(0));
 #ifdef MARIADB_BASE_VERSION
   stat->create_time = (time_t) my_system_gmt_sec(&mysql_time,
     &not_used_long, &not_used_uint);
@@ -2639,7 +2823,8 @@ void spider_get_sys_table_sts_info(
   stat->create_time = (time_t) my_system_gmt_sec(&mysql_time,
     &not_used_long, &not_used_my_bool);
 #endif
-  table->field[9]->get_date(&mysql_time, SPIDER_date_mode_t(0));
+  table->field[SPIDER_TABLE_STS_UPDATE_TIME_POS]->get_date(&mysql_time,
+    SPIDER_date_mode_t(0));
 #ifdef MARIADB_BASE_VERSION
   stat->update_time = (time_t) my_system_gmt_sec(&mysql_time,
     &not_used_long, &not_used_uint);
@@ -2647,13 +2832,14 @@ void spider_get_sys_table_sts_info(
   stat->update_time = (time_t) my_system_gmt_sec(&mysql_time,
     &not_used_long, &not_used_my_bool);
 #endif
-  if (table->field[10]->is_null())
+  if (table->field[SPIDER_TABLE_STS_CHECKSUM_POS]->is_null())
   {
     stat->checksum_null = TRUE;
     stat->checksum = 0;
   } else {
     stat->checksum_null = FALSE;
-    stat->checksum = (ha_checksum) table->field[10]->val_int();
+    stat->checksum = (ha_checksum) table->
+      field[SPIDER_TABLE_STS_CHECKSUM_POS]->val_int();
   }
   DBUG_VOID_RETURN;
 }
@@ -2665,10 +2851,11 @@ void spider_get_sys_table_crd_info(
 ) {
   uint seq;
   DBUG_ENTER("spider_get_sys_table_crd_info");
-  seq = (uint) table->field[2]->val_int();
+  seq = (uint) table->field[SPIDER_TABLE_CRD_KEY_SEQ_POS]->val_int();
   if (seq < number_of_keys)
   {
-    cardinality[seq] = (longlong) table->field[3]->val_int();
+    cardinality[seq] = (longlong) table->
+      field[SPIDER_TABLE_CRD_CARDINALITY_POS]->val_int();
   }
   DBUG_VOID_RETURN;
 }
@@ -2683,11 +2870,7 @@ int spider_sys_update_tables_link_status(
 ) {
   int error_num;
   TABLE *table_tables = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_update_tables_link_status");
   if (
     !(table_tables = spider_open_sys_table(
@@ -2721,11 +2904,7 @@ int spider_sys_log_tables_link_failed(
 ) {
   int error_num;
   TABLE *table_tables = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_log_tables_link_failed");
   if (
     !(table_tables = spider_open_sys_table(
@@ -2760,11 +2939,7 @@ int spider_sys_log_xa_failed(
 ) {
   int error_num;
   TABLE *table_tables = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_log_xa_failed");
   if (
     !(table_tables = spider_open_sys_table(
@@ -2797,9 +2972,9 @@ int spider_get_sys_link_mon_key(
   uint db_name_length, table_name_length, link_id_length;
   DBUG_ENTER("spider_get_sys_link_mon_key");
   if (
-    table->field[0]->is_null() ||
-    table->field[1]->is_null() ||
-    table->field[2]->is_null()
+    table->field[SPIDER_LINK_MON_SERVERS_DB_NAME_POS]->is_null() ||
+    table->field[SPIDER_LINK_MON_SERVERS_TABLE_NAME_POS]->is_null() ||
+    table->field[SPIDER_LINK_MON_SERVERS_LINK_ID_POS]->is_null()
   ) {
     my_printf_error(ER_SPIDER_SYS_TABLE_VERSION_NUM,
       ER_SPIDER_SYS_TABLE_VERSION_STR, MYF(0),
@@ -2808,9 +2983,12 @@ int spider_get_sys_link_mon_key(
   }
 
   if (
-    !(db_name = get_field(mem_root, table->field[0])) ||
-    !(table_name = get_field(mem_root, table->field[1])) ||
-    !(link_id = get_field(mem_root, table->field[2]))
+    !(db_name = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_DB_NAME_POS])) ||
+    !(table_name = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_TABLE_NAME_POS])) ||
+    !(link_id = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_LINK_ID_POS]))
   )
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
 
@@ -2860,7 +3038,8 @@ int spider_get_sys_link_mon_server_id(
   char *ptr;
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_link_mon_server_id");
-  if ((ptr = get_field(mem_root, table->field[3])))
+  if ((ptr = get_field(mem_root,
+    table->field[SPIDER_LINK_MON_SERVERS_SID_POS])))
     *server_id = (uint32) my_strtoll10(ptr, (char**) NULL, &error_num);
   else
     *server_id = ~(uint32) 0;
@@ -2877,8 +3056,9 @@ int spider_get_sys_link_mon_connect_info(
   int error_num = 0;
   DBUG_ENTER("spider_get_sys_link_mon_connect_info");
   if (
-    !table->field[4]->is_null() &&
-    (ptr = get_field(mem_root, table->field[4]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SERVER_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SERVER_POS]))
   ) {
     share->server_names_lengths[link_idx] = strlen(ptr);
     share->server_names[link_idx] =
@@ -2888,8 +3068,9 @@ int spider_get_sys_link_mon_connect_info(
     share->server_names[link_idx] = NULL;
   }
   if (
-    !table->field[5]->is_null() &&
-    (ptr = get_field(mem_root, table->field[5]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SCHEME_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SCHEME_POS]))
   ) {
     share->tgt_wrappers_lengths[link_idx] = strlen(ptr);
     share->tgt_wrappers[link_idx] =
@@ -2899,8 +3080,8 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_wrappers[link_idx] = NULL;
   }
   if (
-    !table->field[6]->is_null() &&
-    (ptr = get_field(mem_root, table->field[6]))
+    !table->field[SPIDER_LINK_MON_SERVERS_HOST_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_LINK_MON_SERVERS_HOST_POS]))
   ) {
     share->tgt_hosts_lengths[link_idx] = strlen(ptr);
     share->tgt_hosts[link_idx] =
@@ -2910,16 +3091,17 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_hosts[link_idx] = NULL;
   }
   if (
-    !table->field[7]->is_null() &&
-    (ptr = get_field(mem_root, table->field[7]))
+    !table->field[SPIDER_LINK_MON_SERVERS_PORT_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_LINK_MON_SERVERS_PORT_POS]))
   ) {
     share->tgt_ports[link_idx] = atol(ptr);
   } else {
     share->tgt_ports[link_idx] = -1;
   }
   if (
-    !table->field[8]->is_null() &&
-    (ptr = get_field(mem_root, table->field[8]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SOCKET_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SOCKET_POS]))
   ) {
     share->tgt_sockets_lengths[link_idx] = strlen(ptr);
     share->tgt_sockets[link_idx] =
@@ -2929,8 +3111,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_sockets[link_idx] = NULL;
   }
   if (
-    !table->field[9]->is_null() &&
-    (ptr = get_field(mem_root, table->field[9]))
+    !table->field[SPIDER_LINK_MON_SERVERS_USERNAME_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_USERNAME_POS]))
   ) {
     share->tgt_usernames_lengths[link_idx] = strlen(ptr);
     share->tgt_usernames[link_idx] =
@@ -2940,8 +3123,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_usernames[link_idx] = NULL;
   }
   if (
-    !table->field[10]->is_null() &&
-    (ptr = get_field(mem_root, table->field[10]))
+    !table->field[SPIDER_LINK_MON_SERVERS_PASSWORD_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_PASSWORD_POS]))
   ) {
     share->tgt_passwords_lengths[link_idx] = strlen(ptr);
     share->tgt_passwords[link_idx] =
@@ -2951,8 +3135,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_passwords[link_idx] = NULL;
   }
   if (
-    !table->field[11]->is_null() &&
-    (ptr = get_field(mem_root, table->field[11]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SSL_CA_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SSL_CA_POS]))
   ) {
     share->tgt_ssl_cas_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_cas[link_idx] =
@@ -2962,8 +3147,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_ssl_cas[link_idx] = NULL;
   }
   if (
-    !table->field[12]->is_null() &&
-    (ptr = get_field(mem_root, table->field[12]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SSL_CAPATH_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SSL_CAPATH_POS]))
   ) {
     share->tgt_ssl_capaths_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_capaths[link_idx] =
@@ -2973,8 +3159,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_ssl_capaths[link_idx] = NULL;
   }
   if (
-    !table->field[13]->is_null() &&
-    (ptr = get_field(mem_root, table->field[13]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SSL_CERT_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SSL_CERT_POS]))
   ) {
     share->tgt_ssl_certs_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_certs[link_idx] =
@@ -2984,8 +3171,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_ssl_certs[link_idx] = NULL;
   }
   if (
-    !table->field[14]->is_null() &&
-    (ptr = get_field(mem_root, table->field[14]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SSL_CIPHER_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SSL_CIPHER_POS]))
   ) {
     share->tgt_ssl_ciphers_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_ciphers[link_idx] =
@@ -2995,8 +3183,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_ssl_ciphers[link_idx] = NULL;
   }
   if (
-    !table->field[15]->is_null() &&
-    (ptr = get_field(mem_root, table->field[15]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SSL_KEY_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SSL_KEY_POS]))
   ) {
     share->tgt_ssl_keys_lengths[link_idx] = strlen(ptr);
     share->tgt_ssl_keys[link_idx] =
@@ -3006,15 +3195,18 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_ssl_keys[link_idx] = NULL;
   }
   if (
-    !table->field[16]->is_null() &&
-    (ptr = get_field(mem_root, table->field[16]))
+    !table->field[SPIDER_LINK_MON_SERVERS_SSL_VERIFY_SERVER_CERT_POS]->
+      is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_SSL_VERIFY_SERVER_CERT_POS]))
   ) {
     share->tgt_ssl_vscs[link_idx] = atol(ptr);
   } else
     share->tgt_ssl_vscs[link_idx] = -1;
   if (
-    !table->field[17]->is_null() &&
-    (ptr = get_field(mem_root, table->field[17]))
+    !table->field[SPIDER_LINK_MON_SERVERS_DEFAULT_FILE_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_DEFAULT_FILE_POS]))
   ) {
     share->tgt_default_files_lengths[link_idx] = strlen(ptr);
     share->tgt_default_files[link_idx] =
@@ -3024,8 +3216,9 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_default_files[link_idx] = NULL;
   }
   if (
-    !table->field[18]->is_null() &&
-    (ptr = get_field(mem_root, table->field[18]))
+    !table->field[SPIDER_LINK_MON_SERVERS_DEFAULT_GROUP_POS]->is_null() &&
+    (ptr = get_field(mem_root,
+      table->field[SPIDER_LINK_MON_SERVERS_DEFAULT_GROUP_POS]))
   ) {
     share->tgt_default_groups_lengths[link_idx] = strlen(ptr);
     share->tgt_default_groups[link_idx] =
@@ -3034,6 +3227,17 @@ int spider_get_sys_link_mon_connect_info(
     share->tgt_default_groups_lengths[link_idx] = 0;
     share->tgt_default_groups[link_idx] = NULL;
   }
+  if (
+    !table->field[SPIDER_LINK_MON_SERVERS_DSN_POS]->is_null() &&
+    (ptr = get_field(mem_root, table->field[SPIDER_LINK_MON_SERVERS_DSN_POS]))
+  ) {
+    share->tgt_dsns_lengths[link_idx] = strlen(ptr);
+    share->tgt_dsns[link_idx] =
+      spider_create_string(ptr, share->tgt_dsns_lengths[link_idx]);
+  } else {
+    share->tgt_dsns_lengths[link_idx] = 0;
+    share->tgt_dsns[link_idx] = NULL;
+  }
   DBUG_RETURN(error_num);
 }
 
@@ -3080,11 +3284,7 @@ int spider_sys_insert_or_update_table_sts(
 ) {
   int error_num;
   TABLE *table_sts = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_insert_or_update_table_sts");
   if (
     !(table_sts = spider_open_sys_table(
@@ -3121,11 +3321,7 @@ int spider_sys_insert_or_update_table_crd(
 ) {
   int error_num;
   TABLE *table_crd = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_insert_or_update_table_crd");
   if (
     !(table_crd = spider_open_sys_table(
@@ -3161,11 +3357,7 @@ int spider_sys_delete_table_sts(
 ) {
   int error_num;
   TABLE *table_sts = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_delete_table_sts");
   if (
     !(table_sts = spider_open_sys_table(
@@ -3199,11 +3391,7 @@ int spider_sys_delete_table_crd(
 ) {
   int error_num;
   TABLE *table_crd = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_delete_table_crd");
   if (
     !(table_crd = spider_open_sys_table(
@@ -3239,11 +3427,7 @@ int spider_sys_get_table_sts(
   int error_num;
   char table_key[MAX_KEY_LENGTH];
   TABLE *table_sts = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_get_table_sts");
   if (
     !(table_sts = spider_open_sys_table(
@@ -3292,12 +3476,9 @@ int spider_sys_get_table_crd(
   char table_key[MAX_KEY_LENGTH];
   bool index_inited = FALSE;
   TABLE *table_crd = NULL;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_sys_get_table_crd");
+
   if (
     !(table_crd = spider_open_sys_table(
       thd, SPIDER_SYS_TABLE_CRD_TABLE_NAME_STR,
@@ -3342,6 +3523,7 @@ int spider_sys_get_table_crd(
 error:
   if (index_inited)
     spider_sys_index_end(table_crd);
+
   if (table_crd)
     spider_close_sys_table(thd, table_crd, &open_tables_backup, need_lock);
   DBUG_RETURN(error_num);
@@ -3636,6 +3818,10 @@ TABLE *spider_find_temporary_table(
 ) {
   DBUG_ENTER("spider_find_temporary_table");
 #ifdef SPIDER_open_temporary_table
+  if (!thd->has_temporary_tables())
+  {
+    DBUG_RETURN(NULL);
+  }
   if (thd->open_temporary_table(table_list))
   {
     DBUG_RETURN(NULL);
diff --git a/storage/spider/spd_sys_table.h b/storage/spider/spd_sys_table.h
index df933ec61b8..d5faf6793d4 100644
--- a/storage/spider/spd_sys_table.h
+++ b/storage/spider/spd_sys_table.h
@@ -51,15 +51,15 @@
 #define SPIDER_SYS_XA_COL_CNT 5
 #define SPIDER_SYS_XA_PK_COL_CNT 3
 #define SPIDER_SYS_XA_IDX1_COL_CNT 1
-#define SPIDER_SYS_XA_MEMBER_COL_CNT 18
+#define SPIDER_SYS_XA_MEMBER_COL_CNT 19
 #define SPIDER_SYS_XA_MEMBER_PK_COL_CNT 6
-#define SPIDER_SYS_TABLES_COL_CNT 25
+#define SPIDER_SYS_TABLES_COL_CNT 26
 #define SPIDER_SYS_TABLES_PK_COL_CNT 3
 #define SPIDER_SYS_TABLES_IDX1_COL_CNT 1
 #define SPIDER_SYS_TABLES_UIDX1_COL_CNT 3
-#define SPIDER_SYS_LINK_MON_TABLE_COL_CNT 19
+#define SPIDER_SYS_LINK_MON_TABLE_COL_CNT 20
 #define SPIDER_SYS_LINK_FAILED_TABLE_COL_CNT 4
-#define SPIDER_SYS_XA_FAILED_TABLE_COL_CNT 21
+#define SPIDER_SYS_XA_FAILED_TABLE_COL_CNT 22
 #define SPIDER_SYS_POS_FOR_RECOVERY_TABLE_COL_CNT 7
 #define SPIDER_SYS_TABLE_STS_COL_CNT 11
 #define SPIDER_SYS_TABLE_STS_PK_COL_CNT 2
@@ -86,13 +86,12 @@ public:
   uint link_id_length;
 };
 
-#if MYSQL_VERSION_ID < 50500
 TABLE *spider_open_sys_table(
   THD *thd,
   const char *table_name,
   int table_name_length,
   bool write,
-  Open_tables_state *open_tables_backup,
+  SPIDER_Open_tables_backup *open_tables_backup,
   bool need_lock,
   int *error_num
 );
@@ -100,42 +99,27 @@ TABLE *spider_open_sys_table(
 void spider_close_sys_table(
   THD *thd,
   TABLE *table,
-  Open_tables_state *open_tables_backup,
+  SPIDER_Open_tables_backup *open_tables_backup,
   bool need_lock
 );
-#else
-TABLE *spider_open_sys_table(
-  THD *thd,
-  const char *table_name,
-  int table_name_length,
-  bool write,
-  Open_tables_backup *open_tables_backup,
-  bool need_lock,
-  int *error_num
-);
 
-void spider_close_sys_table(
-  THD *thd,
-  TABLE *table,
-  Open_tables_backup *open_tables_backup,
-  bool need_lock
-);
-
-bool spider_sys_open_tables(
+#if MYSQL_VERSION_ID < 50500
+#else
+bool spider_sys_open_and_lock_tables(
   THD *thd,
   TABLE_LIST **tables,
-  Open_tables_backup *open_tables_backup
+  SPIDER_Open_tables_backup *open_tables_backup
 );
 
 TABLE *spider_sys_open_table(
   THD *thd,
   TABLE_LIST *tables,
-  Open_tables_backup *open_tables_backup
+  SPIDER_Open_tables_backup *open_tables_backup
 );
 
 void spider_sys_close_table(
   THD *thd,
-  Open_tables_backup *open_tables_backup
+  SPIDER_Open_tables_backup *open_tables_backup
 );
 #endif
 
@@ -274,11 +258,6 @@ void spider_store_tables_link_status(
   long link_status
 );
 
-void spider_store_link_chk_server_id(
-  TABLE *table,
-  uint32 server_id
-);
-
 void spider_store_binlog_pos_failed_link_idx(
   TABLE *table,
   int failed_link_idx
diff --git a/storage/spider/spd_table.cc b/storage/spider/spd_table.cc
index 495e1483ee6..608e0ca811e 100644
--- a/storage/spider/spd_table.cc
+++ b/storage/spider/spd_table.cc
@@ -1,5 +1,5 @@
-/* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+/* Copyright (C) 2008-2020 Kentoku Shiba
+   Copyright (C) 2019-2022 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -122,7 +122,7 @@ uint *spd_db_att_xid_cache_split_num;
 pthread_mutex_t *spd_db_att_LOCK_xid_cache;
 HASH *spd_db_att_xid_cache;
 #endif
-struct charset_info_st *spd_charset_utf8_bin;
+struct charset_info_st *spd_charset_utf8mb3_bin;
 const char **spd_defaults_extra_file;
 const char **spd_defaults_file;
 const char **spd_mysqld_unix_port;
@@ -152,7 +152,7 @@ SPIDER_THREAD *spider_table_crd_threads;
 PSI_mutex_key spd_key_mutex_tbl;
 PSI_mutex_key spd_key_mutex_init_error_tbl;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-PSI_mutex_key spd_key_mutex_pt_share;
+PSI_mutex_key spd_key_mutex_wide_share;
 #endif
 PSI_mutex_key spd_key_mutex_lgtm_tblhnd_share;
 PSI_mutex_key spd_key_mutex_conn;
@@ -182,9 +182,8 @@ PSI_mutex_key spd_key_mutex_share_sts;
 PSI_mutex_key spd_key_mutex_share_crd;
 PSI_mutex_key spd_key_mutex_share_auto_increment;
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-PSI_mutex_key spd_key_mutex_pt_share_sts;
-PSI_mutex_key spd_key_mutex_pt_share_crd;
-PSI_mutex_key spd_key_mutex_pt_handler;
+PSI_mutex_key spd_key_mutex_wide_share_sts;
+PSI_mutex_key spd_key_mutex_wide_share_crd;
 #endif
 PSI_mutex_key spd_key_mutex_udf_table;
 PSI_mutex_key spd_key_mutex_mem_calc;
@@ -196,13 +195,14 @@ PSI_mutex_key spd_key_mutex_conn_i;
 PSI_mutex_key spd_key_mutex_bg_stss;
 PSI_mutex_key spd_key_mutex_bg_crds;
 #endif
+PSI_mutex_key spd_key_mutex_conn_loop_check;
 
 static PSI_mutex_info all_spider_mutexes[]=
 {
   { &spd_key_mutex_tbl, "tbl", PSI_FLAG_GLOBAL},
   { &spd_key_mutex_init_error_tbl, "init_error_tbl", PSI_FLAG_GLOBAL},
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  { &spd_key_mutex_pt_share, "pt_share", PSI_FLAG_GLOBAL},
+  { &spd_key_mutex_wide_share, "wide_share", PSI_FLAG_GLOBAL},
 #endif
   { &spd_key_mutex_lgtm_tblhnd_share, "lgtm_tblhnd_share", PSI_FLAG_GLOBAL},
   { &spd_key_mutex_conn, "conn", PSI_FLAG_GLOBAL},
@@ -241,11 +241,11 @@ static PSI_mutex_info all_spider_mutexes[]=
   { &spd_key_mutex_share_crd, "share_crd", 0},
   { &spd_key_mutex_share_auto_increment, "share_auto_increment", 0},
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  { &spd_key_mutex_pt_share_sts, "pt_share_sts", 0},
-  { &spd_key_mutex_pt_share_crd, "pt_share_crd", 0},
-  { &spd_key_mutex_pt_handler, "pt_handler", 0},
+  { &spd_key_mutex_wide_share_sts, "wide_share_sts", 0},
+  { &spd_key_mutex_wide_share_crd, "wide_share_crd", 0},
 #endif
   { &spd_key_mutex_udf_table, "udf_table", 0},
+  { &spd_key_mutex_conn_loop_check, "conn_loop_check", 0},
 };
 
 #ifndef WITHOUT_SPIDER_BG_SEARCH
@@ -365,12 +365,12 @@ extern pthread_mutex_t spider_conn_id_mutex;
 extern pthread_mutex_t spider_ipport_conn_mutex;
 
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-HASH spider_open_pt_share;
-uint spider_open_pt_share_id;
-const char *spider_open_pt_share_func_name;
-const char *spider_open_pt_share_file_name;
-ulong spider_open_pt_share_line_no;
-pthread_mutex_t spider_pt_share_mutex;
+HASH spider_open_wide_share;
+uint spider_open_wide_share_id;
+const char *spider_open_wide_share_func_name;
+const char *spider_open_wide_share_file_name;
+ulong spider_open_wide_share_line_no;
+pthread_mutex_t spider_wide_share_mutex;
 #endif
 
 HASH spider_lgtm_tblhnd_share_hash;
@@ -404,6 +404,9 @@ extern ulonglong  spider_free_mem_count[SPIDER_MEM_CALC_LIST_NUM];
 static char spider_wild_many = '%', spider_wild_one = '_',
   spider_wild_prefix='\\';
 
+static char spider_unique_id_buf[1 + 12 + 1 + (16 * 2) + 1 + 1];
+LEX_CSTRING spider_unique_id;
+
 // for spider_open_tables
 uchar *spider_tbl_get_key(
   SPIDER_SHARE *share,
@@ -415,28 +418,16 @@ uchar *spider_tbl_get_key(
   DBUG_RETURN((uchar*) share->table_name);
 }
 
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-uchar *spider_pt_share_get_key(
-  SPIDER_PARTITION_SHARE *share,
+uchar *spider_wide_share_get_key(
+  SPIDER_WIDE_SHARE *share,
   size_t *length,
   my_bool not_used __attribute__ ((unused))
 ) {
-  DBUG_ENTER("spider_pt_share_get_key");
+  DBUG_ENTER("spider_wide_share_get_key");
   *length = share->table_name_length;
   DBUG_RETURN((uchar*) share->table_name);
 }
 
-uchar *spider_pt_handler_share_get_key(
-  SPIDER_PARTITION_HANDLER_SHARE *share,
-  size_t *length,
-  my_bool not_used __attribute__ ((unused))
-) {
-  DBUG_ENTER("spider_pt_handler_share_get_key");
-  *length = sizeof(TABLE *);
-  DBUG_RETURN((uchar*) &share->table);
-}
-#endif
-
 uchar *spider_lgtm_tblhnd_share_hash_get_key(
   SPIDER_LGTM_TBLHND_SHARE *share,
   size_t *length,
@@ -794,6 +785,17 @@ int spider_free_share_alloc(
     }
     spider_free(spider_current_trx, share->tgt_default_groups, MYF(0));
   }
+  if (share->tgt_dsns)
+  {
+    for (roop_count = 0; roop_count < (int) share->tgt_dsns_length;
+      roop_count++)
+    {
+      if (share->tgt_dsns[roop_count])
+        spider_free(spider_current_trx, share->tgt_dsns[roop_count],
+          MYF(0));
+    }
+    spider_free(spider_current_trx, share->tgt_dsns, MYF(0));
+  }
   if (share->tgt_pk_names)
   {
     for (roop_count = 0; roop_count < (int) share->tgt_pk_names_length;
@@ -897,6 +899,8 @@ int spider_free_share_alloc(
     spider_free(spider_current_trx, share->access_balances, MYF(0));
   if (share->bka_table_name_types)
     spider_free(spider_current_trx, share->bka_table_name_types, MYF(0));
+  if (share->strict_group_bys)
+    spider_free(spider_current_trx, share->strict_group_bys, MYF(0));
 #ifndef WITHOUT_SPIDER_BG_SEARCH
   if (share->monitoring_bg_interval)
     spider_free(spider_current_trx, share->monitoring_bg_interval, MYF(0));
@@ -913,10 +917,8 @@ int spider_free_share_alloc(
     delete [] share->key_hint;
     share->key_hint = NULL;
   }
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (share->partition_share)
-    spider_free_pt_share(share->partition_share);
-#endif
+  if (share->wide_share)
+    spider_free_wide_share(share->wide_share);
   DBUG_RETURN(0);
 }
 
@@ -999,6 +1001,11 @@ void spider_free_tmp_share_alloc(
     spider_free(spider_current_trx, share->tgt_default_groups[0], MYF(0));
     share->tgt_default_groups[0] = NULL;
   }
+  if (share->tgt_dsns && share->tgt_dsns[0])
+  {
+    spider_free(spider_current_trx, share->tgt_dsns[0], MYF(0));
+    share->tgt_dsns[0] = NULL;
+  }
   if (share->tgt_pk_names && share->tgt_pk_names[0])
   {
     spider_free(spider_current_trx, share->tgt_pk_names[0], MYF(0));
@@ -1052,7 +1059,7 @@ char *spider_get_string_between_quote(
   SPIDER_PARAM_STRING_PARSE *param_string_parse
 ) {
   char *start_ptr, *end_ptr, *tmp_ptr, *esc_ptr;
-  bool find_flg = FALSE, esc_flg = FALSE;
+  bool find_flg = FALSE;
   DBUG_ENTER("spider_get_string_between_quote");
 
   start_ptr = strchr(ptr, '\'');
@@ -1072,11 +1079,9 @@ char *spider_get_string_between_quote(
           find_flg = TRUE;
         else if (esc_ptr == end_ptr - 1)
         {
-          esc_flg = TRUE;
           tmp_ptr = end_ptr + 1;
           break;
         } else {
-          esc_flg = TRUE;
           esc_ptr += 2;
         }
       }
@@ -1097,11 +1102,9 @@ char *spider_get_string_between_quote(
           find_flg = TRUE;
         else if (esc_ptr == end_ptr - 1)
         {
-          esc_flg = TRUE;
           tmp_ptr = end_ptr + 1;
           break;
         } else {
-          esc_flg = TRUE;
           esc_ptr += 2;
         }
       }
@@ -1110,36 +1113,6 @@ char *spider_get_string_between_quote(
     DBUG_RETURN(NULL);
 
   *end_ptr = '\0';
-  if (esc_flg)
-  {
-    esc_ptr = start_ptr;
-    while (TRUE)
-    {
-      esc_ptr = strchr(esc_ptr, '\\');
-      if (!esc_ptr)
-        break;
-      switch(*(esc_ptr + 1))
-      {
-        case 'b':
-          *esc_ptr = '\b';
-          break;
-        case 'n':
-          *esc_ptr = '\n';
-          break;
-        case 'r':
-          *esc_ptr = '\r';
-          break;
-        case 't':
-          *esc_ptr = '\t';
-          break;
-        default:
-          *esc_ptr = *(esc_ptr + 1);
-          break;
-      }
-      esc_ptr++;
-      strcpy(esc_ptr, esc_ptr + 1);
-    }
-  }
 
   if (param_string_parse)
     param_string_parse->set_param_value(start_ptr, start_ptr + strlen(start_ptr) + 1);
@@ -1165,7 +1138,7 @@ int spider_create_string_list(
   SPIDER_PARAM_STRING_PARSE *param_string_parse
 ) {
   int roop_count;
-  char *tmp_ptr, *tmp_ptr2, *tmp_ptr3, *esc_ptr;
+  char *tmp_ptr, *tmp_ptr2, *tmp_ptr3, *tmp_ptr4, *esc_ptr;
   bool find_flg = FALSE;
   DBUG_ENTER("spider_create_string_list");
 
@@ -1187,22 +1160,30 @@ int spider_create_string_list(
     DBUG_RETURN(0);
   }
 
+  bool last_esc_flg = FALSE;
   while (TRUE)
   {
     if ((tmp_ptr2 = strchr(tmp_ptr, ' ')))
     {
+      find_flg = FALSE;
+      last_esc_flg = FALSE;
       esc_ptr = tmp_ptr;
       while (!find_flg)
       {
         esc_ptr = strchr(esc_ptr, '\\');
         if (!esc_ptr || esc_ptr > tmp_ptr2)
+        {
           find_flg = TRUE;
+        }
         else if (esc_ptr == tmp_ptr2 - 1)
         {
+          last_esc_flg = TRUE;
           tmp_ptr = tmp_ptr2 + 1;
           break;
-        } else
+        } else {
+          last_esc_flg = TRUE;
           esc_ptr += 2;
+        }
       }
       if (find_flg)
       {
@@ -1217,8 +1198,8 @@ int spider_create_string_list(
 
   if (!(*string_list = (char**)
     spider_bulk_malloc(spider_current_trx, 37, MYF(MY_WME | MY_ZEROFILL),
-      string_list, sizeof(char*) * (*list_length),
-      string_length_list, sizeof(int) * (*list_length),
+      string_list, (uint) (sizeof(char*) * (*list_length)),
+      string_length_list, (uint) (sizeof(int) * (*list_length)),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -1235,6 +1216,8 @@ int spider_create_string_list(
 
   for (roop_count = 0; roop_count < (int) *list_length - 1; roop_count++)
   {
+    bool esc_flg = FALSE;
+    find_flg = FALSE;
     while (TRUE)
     {
       tmp_ptr2 = strchr(tmp_ptr, ' ');
@@ -1244,13 +1227,18 @@ int spider_create_string_list(
       {
         esc_ptr = strchr(esc_ptr, '\\');
         if (!esc_ptr || esc_ptr > tmp_ptr2)
+        {
           find_flg = TRUE;
+        }
         else if (esc_ptr == tmp_ptr2 - 1)
         {
+          esc_flg = TRUE;
           tmp_ptr = tmp_ptr2 + 1;
           break;
-        } else
+        } else {
+          esc_flg = TRUE;
           esc_ptr += 2;
+        }
       }
       if (find_flg)
         break;
@@ -1270,6 +1258,43 @@ int spider_create_string_list(
       my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
     }
+
+    if (esc_flg)
+    {
+      esc_ptr = (*string_list)[roop_count];
+      while (TRUE)
+      {
+        esc_ptr = strchr(esc_ptr, '\\');
+        if (!esc_ptr)
+          break;
+        switch(*(esc_ptr + 1))
+        {
+          case 'b':
+            *esc_ptr = '\b';
+            break;
+          case 'n':
+            *esc_ptr = '\n';
+            break;
+          case 'r':
+            *esc_ptr = '\r';
+            break;
+          case 't':
+            *esc_ptr = '\t';
+            break;
+          default:
+            *esc_ptr = *(esc_ptr + 1);
+            break;
+        }
+        esc_ptr++;
+        tmp_ptr4 = esc_ptr;
+        do
+        {
+          *tmp_ptr4 = *(tmp_ptr4 + 1);
+          tmp_ptr4++;
+        } while (*tmp_ptr4);
+        (*string_length_list)[roop_count] -= 1;
+      }
+    }
     DBUG_PRINT("info",("spider string_list[%d]=%s", roop_count,
       (*string_list)[roop_count]));
     tmp_ptr3 = tmp_ptr;
@@ -1281,6 +1306,42 @@ int spider_create_string_list(
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
   }
+  if (last_esc_flg)
+  {
+    esc_ptr = (*string_list)[roop_count];
+    while (TRUE)
+    {
+      esc_ptr = strchr(esc_ptr, '\\');
+      if (!esc_ptr)
+        break;
+      switch(*(esc_ptr + 1))
+      {
+        case 'b':
+          *esc_ptr = '\b';
+          break;
+        case 'n':
+          *esc_ptr = '\n';
+          break;
+        case 'r':
+          *esc_ptr = '\r';
+          break;
+        case 't':
+          *esc_ptr = '\t';
+          break;
+        default:
+          *esc_ptr = *(esc_ptr + 1);
+          break;
+      }
+      esc_ptr++;
+      tmp_ptr4 = esc_ptr;
+      do
+      {
+        *tmp_ptr4 = *(tmp_ptr4 + 1);
+        tmp_ptr4++;
+      } while (*tmp_ptr4);
+      (*string_length_list)[roop_count] -= 1;
+    }
+  }
 
   param_string_parse->set_param_value(tmp_ptr3,
                                       tmp_ptr3 + strlen(tmp_ptr3) + 1);
@@ -1336,7 +1397,7 @@ int spider_create_long_list(
 
   if (!(*long_list = (long*)
     spider_bulk_malloc(spider_current_trx, 38, MYF(MY_WME | MY_ZEROFILL),
-      long_list, sizeof(long) * (*list_length),
+      long_list, (uint) (sizeof(long) * (*list_length)),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -1420,7 +1481,7 @@ int spider_create_longlong_list(
 
   if (!(*longlong_list = (longlong *)
     spider_bulk_malloc(spider_current_trx, 39, MYF(MY_WME | MY_ZEROFILL),
-      longlong_list, sizeof(longlong) * (*list_length),
+      longlong_list, (uint) (sizeof(longlong) * (*list_length)),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -1491,8 +1552,8 @@ int spider_increase_string_list(
 
   if (!(tmp_str_list = (char**)
     spider_bulk_malloc(spider_current_trx, 40, MYF(MY_WME | MY_ZEROFILL),
-      &tmp_str_list, sizeof(char*) * link_count,
-      &tmp_length_list, sizeof(uint) * link_count,
+      &tmp_str_list, (uint) (sizeof(char*) * link_count),
+      &tmp_length_list, (uint) (sizeof(uint) * link_count),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -1554,8 +1615,8 @@ int spider_increase_null_string_list(
 
   if (!(tmp_str_list = (char**)
     spider_bulk_malloc(spider_current_trx, 247, MYF(MY_WME | MY_ZEROFILL),
-      &tmp_str_list, sizeof(char*) * link_count,
-      &tmp_length_list, sizeof(uint) * link_count,
+      &tmp_str_list, (uint) (sizeof(char*) * link_count),
+      &tmp_length_list, (uint) (sizeof(uint) * link_count),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -1612,7 +1673,7 @@ int spider_increase_long_list(
 
   if (!(tmp_long_list = (long*)
     spider_bulk_malloc(spider_current_trx, 41, MYF(MY_WME | MY_ZEROFILL),
-      &tmp_long_list, sizeof(long) * link_count,
+      &tmp_long_list, (uint) (sizeof(long) * link_count),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -1657,7 +1718,7 @@ int spider_increase_longlong_list(
 
   if (!(tmp_longlong_list = (longlong*)
     spider_bulk_malloc(spider_current_trx, 42, MYF(MY_WME | MY_ZEROFILL),
-      &tmp_longlong_list, sizeof(longlong) * link_count,
+      &tmp_longlong_list, (uint) (sizeof(longlong) * link_count),
       NullS))
   ) {
     my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
@@ -2047,6 +2108,7 @@ int spider_parse_connect_info(
   share->bulk_size = -1;
   share->bulk_update_mode = -1;
   share->bulk_update_size = -1;
+  share->buffer_size = -1;
   share->internal_optimize = -1;
   share->internal_optimize_local = -1;
   share->scan_rate = -1;
@@ -2215,6 +2277,7 @@ int spider_parse_connect_info(
 #ifdef HA_CAN_BULK_ACCESS
           SPIDER_PARAM_INT_WITH_MAX("baf", bulk_access_free, 0, 1);
 #endif
+          SPIDER_PARAM_INT("bfz", buffer_size, 0);
 #ifndef WITHOUT_SPIDER_BG_SEARCH
           SPIDER_PARAM_LONGLONG("bfr", bgs_first_read, 0);
           SPIDER_PARAM_INT("bmd", bgs_mode, 0);
@@ -2245,6 +2308,7 @@ int spider_parse_connect_info(
           SPIDER_PARAM_STR_LIST("dff", tgt_default_files);
           SPIDER_PARAM_STR_LIST("dfg", tgt_default_groups);
           SPIDER_PARAM_LONGLONG("dol", direct_order_limit, 0);
+          SPIDER_PARAM_STR_LIST("dsn", tgt_dsns);
           SPIDER_PARAM_INT_WITH_MAX("erm", error_read_mode, 0, 1);
           SPIDER_PARAM_INT_WITH_MAX("ewm", error_write_mode, 0, 1);
 #ifdef HA_CAN_FORCE_BULK_DELETE
@@ -2313,6 +2377,7 @@ int spider_parse_connect_info(
           SPIDER_PARAM_STR_LIST("scp", tgt_ssl_capaths);
           SPIDER_PARAM_STR_LIST("scr", tgt_ssl_certs);
           SPIDER_PARAM_INT_WITH_MAX("sdc", skip_default_condition, 0, 1);
+          SPIDER_PARAM_LONG_LIST_WITH_MAX("sgb", strict_group_bys, 0, 1);
           SPIDER_PARAM_DOUBLE("siv", sts_interval, 0);
           SPIDER_PARAM_STR_LIST("sky", tgt_ssl_keys);
           SPIDER_PARAM_STR_LIST("sli", static_link_ids);
@@ -2424,6 +2489,7 @@ int spider_parse_connect_info(
           SPIDER_PARAM_LONG_LIST_WITH_MAX("use_hs_read", use_hs_reads, 0, 1);
 #endif
           SPIDER_PARAM_INT_WITH_MAX("casual_read", casual_read, 0, 63);
+          SPIDER_PARAM_INT("buffer_size", buffer_size, 0);
           error_num = connect_string_parse.print_param_error();
           goto error;
         case 12:
@@ -2483,6 +2549,8 @@ int spider_parse_connect_info(
 #endif
           SPIDER_PARAM_LONG_LIST_WITH_MAX("connect_timeout", connect_timeouts,
             0, 2147483647);
+          SPIDER_PARAM_LONG_LIST_WITH_MAX("strict_group_by",
+            strict_group_bys, 0, 1);
           SPIDER_PARAM_INT_WITH_MAX("error_read_mode", error_read_mode, 0, 1);
           error_num = connect_string_parse.print_param_error();
           goto error;
@@ -2658,6 +2726,8 @@ int spider_parse_connect_info(
     share->all_link_count = share->tgt_default_files_length;
   if (share->all_link_count < share->tgt_default_groups_length)
     share->all_link_count = share->tgt_default_groups_length;
+  if (share->all_link_count < share->tgt_dsns_length)
+    share->all_link_count = share->tgt_dsns_length;
   if (share->all_link_count < share->tgt_pk_names_length)
     share->all_link_count = share->tgt_pk_names_length;
   if (share->all_link_count < share->tgt_sequence_names_length)
@@ -2716,6 +2786,8 @@ int spider_parse_connect_info(
     share->all_link_count = share->access_balances_length;
   if (share->all_link_count < share->bka_table_name_types_length)
     share->all_link_count = share->bka_table_name_types_length;
+  if (share->all_link_count < share->strict_group_bys_length)
+    share->all_link_count = share->strict_group_bys_length;
   if ((error_num = spider_increase_string_list(
     &share->server_names,
     &share->server_names_lengths,
@@ -2822,6 +2894,13 @@ int spider_parse_connect_info(
     share->all_link_count)))
     goto error;
   if ((error_num = spider_increase_string_list(
+    &share->tgt_dsns,
+    &share->tgt_dsns_lengths,
+    &share->tgt_dsns_length,
+    &share->tgt_dsns_charlen,
+    share->all_link_count)))
+    goto error;
+  if ((error_num = spider_increase_string_list(
     &share->tgt_pk_names,
     &share->tgt_pk_names_lengths,
     &share->tgt_pk_names_length,
@@ -2972,6 +3051,11 @@ int spider_parse_connect_info(
     &share->bka_table_name_types_length,
     share->all_link_count)))
     goto error;
+  if ((error_num = spider_increase_long_list(
+    &share->strict_group_bys,
+    &share->strict_group_bys_length,
+    share->all_link_count)))
+    goto error;
 
   /* copy for tables start */
   share_alter = &share->alter_table;
@@ -2979,17 +3063,81 @@ int spider_parse_connect_info(
   if (!(share_alter->tmp_server_names = (char **)
     spider_bulk_malloc(spider_current_trx, 43, MYF(MY_WME | MY_ZEROFILL),
       &share_alter->tmp_server_names,
-      sizeof(char *) * 16 * share->all_link_count,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_table_names,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_dbs,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_hosts,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_usernames,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_passwords,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_sockets,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_wrappers,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_cas,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_capaths,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_certs,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_ciphers,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_keys,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_default_files,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_default_groups,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_tgt_dsns,
+      (uint) (sizeof(char *) * share->all_link_count),
+      &share_alter->tmp_static_link_ids,
+      (uint) (sizeof(char *) * share->all_link_count),
       &share_alter->tmp_server_names_lengths,
-      sizeof(uint *) * 16 * share->all_link_count,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_table_names_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_dbs_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_hosts_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_usernames_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_passwords_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_sockets_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_wrappers_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_cas_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_capaths_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_certs_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_ciphers_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_ssl_keys_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_default_files_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_default_groups_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_tgt_dsns_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
+      &share_alter->tmp_static_link_ids_lengths,
+      (uint) (sizeof(uint *) * share->all_link_count),
       &share_alter->tmp_tgt_ports,
-      sizeof(long) * share->all_link_count,
+      (uint) (sizeof(long) * share->all_link_count),
       &share_alter->tmp_tgt_ssl_vscs,
-      sizeof(long) * share->all_link_count,
+      (uint) (sizeof(long) * share->all_link_count),
       &share_alter->tmp_monitoring_binlog_pos_at_failing,
-      sizeof(long) * share->all_link_count,
+      (uint) (sizeof(long) * share->all_link_count),
       &share_alter->tmp_link_statuses,
-      sizeof(long) * share->all_link_count,
+      (uint) (sizeof(long) * share->all_link_count),
       NullS))
   ) {
     error_num = HA_ERR_OUT_OF_MEM;
@@ -2999,64 +3147,36 @@ int spider_parse_connect_info(
 
   memcpy(share_alter->tmp_server_names, share->server_names,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_table_names =
-    share_alter->tmp_server_names + share->all_link_count;
   memcpy(share_alter->tmp_tgt_table_names, share->tgt_table_names,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_dbs =
-    share_alter->tmp_tgt_table_names + share->all_link_count;
   memcpy(share_alter->tmp_tgt_dbs, share->tgt_dbs,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_hosts =
-    share_alter->tmp_tgt_dbs + share->all_link_count;
   memcpy(share_alter->tmp_tgt_hosts, share->tgt_hosts,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_usernames =
-    share_alter->tmp_tgt_hosts + share->all_link_count;
   memcpy(share_alter->tmp_tgt_usernames, share->tgt_usernames,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_passwords =
-    share_alter->tmp_tgt_usernames + share->all_link_count;
   memcpy(share_alter->tmp_tgt_passwords, share->tgt_passwords,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_sockets =
-    share_alter->tmp_tgt_passwords + share->all_link_count;
   memcpy(share_alter->tmp_tgt_sockets, share->tgt_sockets,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_wrappers =
-    share_alter->tmp_tgt_sockets + share->all_link_count;
   memcpy(share_alter->tmp_tgt_wrappers, share->tgt_wrappers,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_cas =
-    share_alter->tmp_tgt_wrappers + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_cas, share->tgt_ssl_cas,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_capaths =
-    share_alter->tmp_tgt_ssl_cas + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_capaths, share->tgt_ssl_capaths,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_certs =
-    share_alter->tmp_tgt_ssl_capaths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_certs, share->tgt_ssl_certs,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_ciphers =
-    share_alter->tmp_tgt_ssl_certs + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_ciphers, share->tgt_ssl_ciphers,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_keys =
-    share_alter->tmp_tgt_ssl_ciphers + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_keys, share->tgt_ssl_keys,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_default_files =
-    share_alter->tmp_tgt_ssl_keys + share->all_link_count;
   memcpy(share_alter->tmp_tgt_default_files, share->tgt_default_files,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_tgt_default_groups =
-    share_alter->tmp_tgt_default_files + share->all_link_count;
   memcpy(share_alter->tmp_tgt_default_groups, share->tgt_default_groups,
     sizeof(char *) * share->all_link_count);
-  share_alter->tmp_static_link_ids =
-    share_alter->tmp_tgt_default_groups + share->all_link_count;
+  memcpy(share_alter->tmp_tgt_dsns, share->tgt_dsns,
+    sizeof(char *) * share->all_link_count);
   memcpy(share_alter->tmp_static_link_ids, share->static_link_ids,
     sizeof(char *) * share->all_link_count);
 
@@ -3073,75 +3193,48 @@ int spider_parse_connect_info(
   memcpy(share_alter->tmp_server_names_lengths,
     share->server_names_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_table_names_lengths =
-    share_alter->tmp_server_names_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_table_names_lengths,
     share->tgt_table_names_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_dbs_lengths =
-    share_alter->tmp_tgt_table_names_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_dbs_lengths, share->tgt_dbs_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_hosts_lengths =
-    share_alter->tmp_tgt_dbs_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_hosts_lengths, share->tgt_hosts_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_usernames_lengths =
-    share_alter->tmp_tgt_hosts_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_usernames_lengths,
     share->tgt_usernames_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_passwords_lengths =
-    share_alter->tmp_tgt_usernames_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_passwords_lengths,
     share->tgt_passwords_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_sockets_lengths =
-    share_alter->tmp_tgt_passwords_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_sockets_lengths, share->tgt_sockets_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_wrappers_lengths =
-    share_alter->tmp_tgt_sockets_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_wrappers_lengths,
     share->tgt_wrappers_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_cas_lengths =
-    share_alter->tmp_tgt_wrappers_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_cas_lengths,
     share->tgt_ssl_cas_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_capaths_lengths =
-    share_alter->tmp_tgt_ssl_cas_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_capaths_lengths,
     share->tgt_ssl_capaths_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_certs_lengths =
-    share_alter->tmp_tgt_ssl_capaths_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_certs_lengths,
     share->tgt_ssl_certs_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_ciphers_lengths =
-    share_alter->tmp_tgt_ssl_certs_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_ciphers_lengths,
     share->tgt_ssl_ciphers_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_ssl_keys_lengths =
-    share_alter->tmp_tgt_ssl_ciphers_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_ssl_keys_lengths,
     share->tgt_ssl_keys_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_default_files_lengths =
-    share_alter->tmp_tgt_ssl_keys_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_default_files_lengths,
     share->tgt_default_files_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_tgt_default_groups_lengths =
-    share_alter->tmp_tgt_default_files_lengths + share->all_link_count;
   memcpy(share_alter->tmp_tgt_default_groups_lengths,
     share->tgt_default_groups_lengths,
     sizeof(uint) * share->all_link_count);
-  share_alter->tmp_static_link_ids_lengths =
-    share_alter->tmp_tgt_default_groups_lengths + share->all_link_count;
+  memcpy(share_alter->tmp_tgt_dsns_lengths,
+    share->tgt_dsns_lengths,
+    sizeof(uint) * share->all_link_count);
   memcpy(share_alter->tmp_static_link_ids_lengths,
     share->static_link_ids_lengths,
     sizeof(uint) * share->all_link_count);
@@ -3163,6 +3256,8 @@ int spider_parse_connect_info(
     share->tgt_default_files_charlen;
   share_alter->tmp_tgt_default_groups_charlen =
     share->tgt_default_groups_charlen;
+  share_alter->tmp_tgt_dsns_charlen =
+    share->tgt_dsns_charlen;
   share_alter->tmp_static_link_ids_charlen =
     share->static_link_ids_charlen;
 
@@ -3182,6 +3277,8 @@ int spider_parse_connect_info(
   share_alter->tmp_tgt_default_files_length = share->tgt_default_files_length;
   share_alter->tmp_tgt_default_groups_length =
     share->tgt_default_groups_length;
+  share_alter->tmp_tgt_dsns_length =
+    share->tgt_dsns_length;
   share_alter->tmp_static_link_ids_length =
     share->static_link_ids_length;
   share_alter->tmp_tgt_ports_length = share->tgt_ports_length;
@@ -3405,6 +3502,18 @@ int spider_parse_connect_info(
       }
 
       DBUG_PRINT("info",
+        ("spider tgt_dsns_lengths[%d] = %u", roop_count,
+        share->tgt_dsns_lengths[roop_count]));
+      if (share->tgt_dsns_lengths[roop_count] >
+        SPIDER_CONNECT_INFO_MAX_LEN)
+      { /* dsn is longer than SPIDER_CONNECT_INFO_MAX_LEN: report and bail */
+        error_num = ER_SPIDER_INVALID_CONNECT_INFO_TOO_LONG_NUM;
+        my_printf_error(error_num, ER_SPIDER_INVALID_CONNECT_INFO_TOO_LONG_STR,
+          MYF(0), share->tgt_dsns[roop_count], "dsn");
+        goto error;
+      }
+
+      DBUG_PRINT("info",
         ("spider tgt_pk_names_lengths[%d] = %u", roop_count,
         share->tgt_pk_names_lengths[roop_count]));
       if (share->tgt_pk_names_lengths[roop_count] >
@@ -3500,7 +3609,11 @@ int spider_set_connect_info_default(
 #endif
   TABLE_SHARE *table_share
 ) {
-  int error_num, roop_count;
+  bool check_socket;
+  bool check_database;
+  bool socket_has_default_value;
+  bool database_has_default_value;
+  int error_num, roop_count, roop_count2;
   DBUG_ENTER("spider_set_connect_info_default");
   for (roop_count = 0; roop_count < (int) share->all_link_count; roop_count++)
   {
@@ -3510,6 +3623,64 @@ int spider_set_connect_info_default(
         DBUG_RETURN(error_num);
     }
 
+    if (
+      !share->tgt_sockets[roop_count] &&
+      (
+        !share->tgt_hosts[roop_count] ||
+        !strcmp(share->tgt_hosts[roop_count], my_localhost)
+      )
+    ) {
+      check_socket = TRUE;
+    } else {
+      check_socket = FALSE;
+    }
+    if (!share->tgt_dbs[roop_count] && table_share)
+    {
+      check_database = TRUE;
+    } else {
+      check_database = FALSE;
+    }
+    if (check_socket || check_database)
+    {
+      socket_has_default_value = check_socket;
+      database_has_default_value = check_database;
+      if (share->tgt_wrappers[roop_count])
+      {
+        for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
+        {
+          DBUG_PRINT("info",("spider share->tgt_wrappers[%d]=%s", roop_count,
+            share->tgt_wrappers[roop_count]));
+          DBUG_PRINT("info",("spider spider_dbton[%d].wrapper=%s", roop_count2,
+            spider_dbton[roop_count2].wrapper ?
+              spider_dbton[roop_count2].wrapper : "NULL"));
+          if (
+            spider_dbton[roop_count2].wrapper &&
+            !strcmp(share->tgt_wrappers[roop_count],
+              spider_dbton[roop_count2].wrapper)
+          ) {
+            if (spider_dbton[roop_count2].db_access_type ==
+              SPIDER_DB_ACCESS_TYPE_SQL)
+            {
+              if (check_socket)
+              {
+                socket_has_default_value = spider_dbton[roop_count2].
+                  db_util->socket_has_default_value();
+              }
+              if (check_database)
+              {
+                database_has_default_value = spider_dbton[roop_count2].
+                  db_util->database_has_default_value();
+              }
+              break;
+            }
+          }
+        }
+      }
+    } else {
+      socket_has_default_value = FALSE;
+      database_has_default_value = FALSE;
+    }
+
     if (!share->tgt_wrappers[roop_count])
     {
       DBUG_PRINT("info",("spider create default tgt_wrappers"));
@@ -3536,7 +3707,7 @@ int spider_set_connect_info_default(
       }
     }
 
-    if (!share->tgt_dbs[roop_count] && table_share)
+    if (database_has_default_value)
     {
       DBUG_PRINT("info",("spider create default tgt_dbs"));
       share->tgt_dbs_lengths[roop_count] = table_share->db.length;
@@ -3659,10 +3830,8 @@ int spider_set_connect_info_default(
     if (share->tgt_ssl_vscs[roop_count] == -1)
       share->tgt_ssl_vscs[roop_count] = 0;
 
-    if (
-      !share->tgt_sockets[roop_count] &&
-      !strcmp(share->tgt_hosts[roop_count], my_localhost)
-    ) {
+    if (socket_has_default_value)
+    {
       DBUG_PRINT("info",("spider create default tgt_sockets"));
       share->tgt_sockets_lengths[roop_count] =
         strlen((char *) MYSQL_UNIX_ADDR);
@@ -3751,6 +3920,8 @@ int spider_set_connect_info_default(
       share->access_balances[roop_count] = 100;
     if (share->bka_table_name_types[roop_count] == -1)
       share->bka_table_name_types[roop_count] = 0;
+    if (share->strict_group_bys[roop_count] == -1)
+      share->strict_group_bys[roop_count] = 1;
   }
 
 #ifndef WITHOUT_SPIDER_BG_SEARCH
@@ -3825,6 +3996,8 @@ int spider_set_connect_info_default(
     share->bulk_update_mode = 0;
   if (share->bulk_update_size == -1)
     share->bulk_update_size = 16000;
+  if (share->buffer_size == -1)
+    share->buffer_size = 16000;
   if (share->internal_optimize == -1)
     share->internal_optimize = 0;
   if (share->internal_optimize_local == -1)
@@ -3931,12 +4104,54 @@ int spider_set_connect_info_default_db_table(
   const char *table_name,
   uint table_name_length
 ) {
-  int roop_count;
+  uint roop_count, roop_count2;
+  bool check_database;
+  bool database_has_default_value;
   DBUG_ENTER("spider_set_connect_info_default_db_table");
-  for (roop_count = 0; roop_count < (int) share->link_count; roop_count++)
+  for (roop_count = 0; roop_count < share->link_count; roop_count++)
   {
     if (!share->tgt_dbs[roop_count] && db_name)
     {
+      check_database = TRUE;
+    } else {
+      check_database = FALSE;
+    }
+    if (check_database)
+    {
+      database_has_default_value = check_database;
+      if (share->tgt_wrappers[roop_count])
+      {
+        for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
+        { /* both loop counters are uint in this function, hence %u below */
+          DBUG_PRINT("info",("spider share->tgt_wrappers[%u]=%s", roop_count,
+            share->tgt_wrappers[roop_count]));
+          DBUG_PRINT("info",("spider spider_dbton[%u].wrapper=%s", roop_count2,
+            spider_dbton[roop_count2].wrapper ?
+              spider_dbton[roop_count2].wrapper : "NULL"));
+          if (
+            spider_dbton[roop_count2].wrapper &&
+            !strcmp(share->tgt_wrappers[roop_count],
+              spider_dbton[roop_count2].wrapper)
+          ) {
+            if (spider_dbton[roop_count2].db_access_type ==
+              SPIDER_DB_ACCESS_TYPE_SQL)
+            {
+              if (check_database)
+              {
+                database_has_default_value = spider_dbton[roop_count2].
+                  db_util->database_has_default_value();
+              }
+              break; /* stop at the first SQL-type entry with a matching name */
+            }
+          }
+        }
+      }
+    } else {
+      database_has_default_value = FALSE;
+    }
+
+    if (database_has_default_value)
+    {
       DBUG_PRINT("info",("spider create default tgt_dbs"));
       share->tgt_dbs_lengths[roop_count] = db_name_length;
       if (
@@ -4020,16 +4235,19 @@ int spider_create_conn_keys(
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   char *tmp_hs_r_name, *tmp_hs_w_name;
 #endif
+  uint length_base = sizeof(uint) * share->all_link_count;
   uint *conn_keys_lengths;
+  uint *sql_dbton_ids;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  uint *hs_dbton_ids;
   uint *hs_r_conn_keys_lengths;
   uint *hs_w_conn_keys_lengths;
 #endif
   DBUG_ENTER("spider_create_conn_keys");
   char *ptr;
-  uint length = sizeof(uint) * share->all_link_count;
+  uint length = length_base * 2;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-  length += (sizeof(uint) * share->all_link_count) * 2;
+  length += length_base * 3;
 #endif
   ptr = (char *) my_alloca(length);
   if (!ptr)
@@ -4037,10 +4255,14 @@ int spider_create_conn_keys(
     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
   }
   conn_keys_lengths = (uint *) ptr;
-  ptr += (sizeof(uint) * share->all_link_count);
+  ptr += length_base;
+  sql_dbton_ids = (uint *) ptr;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+  ptr += length_base;
+  hs_dbton_ids = (uint *) ptr;
+  ptr += length_base;
   hs_r_conn_keys_lengths = (uint *) ptr;
-  ptr += (sizeof(uint) * share->all_link_count);
+  ptr += length_base;
   hs_w_conn_keys_lengths = (uint *) ptr;
 #endif
 
@@ -4051,13 +4273,76 @@ int spider_create_conn_keys(
 #endif
   for (roop_count = 0; roop_count < (int) share->all_link_count; roop_count++)
   {
-    /* tgt_db not use */
+    bool get_sql_id = FALSE;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+    bool get_nosql_id = FALSE;
+#endif
+    for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
+    {
+      DBUG_PRINT("info",("spider share->tgt_wrappers[%d]=%s", roop_count,
+        share->tgt_wrappers[roop_count]));
+      DBUG_PRINT("info",("spider spider_dbton[%d].wrapper=%s", roop_count2,
+        spider_dbton[roop_count2].wrapper ?
+          spider_dbton[roop_count2].wrapper : "NULL"));
+      if (
+        spider_dbton[roop_count2].wrapper &&
+        !strcmp(share->tgt_wrappers[roop_count],
+          spider_dbton[roop_count2].wrapper)
+      ) {
+        spider_set_bit(share->dbton_bitmap, roop_count2);
+        if (
+          !get_sql_id &&
+          spider_dbton[roop_count2].db_access_type == SPIDER_DB_ACCESS_TYPE_SQL
+        ) {
+          sql_dbton_ids[roop_count] = roop_count2;
+          get_sql_id = TRUE;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+          if (get_nosql_id)
+#endif
+            break;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+          else
+            continue;
+#endif
+        }
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+        if (
+          !get_nosql_id &&
+          spider_dbton[roop_count2].db_access_type ==
+            SPIDER_DB_ACCESS_TYPE_NOSQL
+        ) {
+          hs_dbton_ids[roop_count] = roop_count2;
+          get_nosql_id = TRUE;
+          if (get_sql_id)
+            break;
+        }
+#endif
+      }
+    }
+    if (!get_sql_id)
+      sql_dbton_ids[roop_count] = SPIDER_DBTON_SIZE;
+#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
+    if (!get_nosql_id)
+      hs_dbton_ids[roop_count] = SPIDER_DBTON_SIZE;
+#endif
+
+    bool tables_on_different_db_are_joinable;
+    if (get_sql_id)
+    {
+      tables_on_different_db_are_joinable =
+        spider_dbton[sql_dbton_ids[roop_count]].db_util->
+          tables_on_different_db_are_joinable();
+    } else {
+      tables_on_different_db_are_joinable = TRUE;
+    }
     conn_keys_lengths[roop_count]
       = 1
       + share->tgt_wrappers_lengths[roop_count] + 1
       + share->tgt_hosts_lengths[roop_count] + 1
       + 5 + 1
       + share->tgt_sockets_lengths[roop_count] + 1
+      + (tables_on_different_db_are_joinable ?
+        0 : share->tgt_dbs_lengths[roop_count] + 1)
       + share->tgt_usernames_lengths[roop_count] + 1
       + share->tgt_passwords_lengths[roop_count] + 1
       + share->tgt_ssl_cas_lengths[roop_count] + 1
@@ -4067,7 +4352,8 @@ int spider_create_conn_keys(
       + share->tgt_ssl_keys_lengths[roop_count] + 1
       + 1 + 1
       + share->tgt_default_files_lengths[roop_count] + 1
-      + share->tgt_default_groups_lengths[roop_count];
+      + share->tgt_default_groups_lengths[roop_count] + 1
+      + share->tgt_dsns_lengths[roop_count];
     share->conn_keys_charlen += conn_keys_lengths[roop_count] + 2;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     hs_r_conn_keys_lengths[roop_count]
@@ -4092,7 +4378,7 @@ int spider_create_conn_keys(
     spider_bulk_alloc_mem(spider_current_trx, 45,
       __func__, __FILE__, __LINE__, MYF(MY_WME | MY_ZEROFILL),
       &share->conn_keys, sizeof(char *) * share->all_link_count,
-      &share->conn_keys_lengths, sizeof(uint) * share->all_link_count,
+      &share->conn_keys_lengths, length_base,
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
       &share->conn_keys_hash_value,
         sizeof(my_hash_value_type) * share->all_link_count,
@@ -4100,23 +4386,23 @@ int spider_create_conn_keys(
       &tmp_name, sizeof(char) * share->conn_keys_charlen,
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
       &share->hs_read_conn_keys, sizeof(char *) * share->all_link_count,
-      &share->hs_read_conn_keys_lengths, sizeof(uint) * share->all_link_count,
+      &share->hs_read_conn_keys_lengths, length_base,
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
       &share->hs_read_conn_keys_hash_value,
         sizeof(my_hash_value_type) * share->all_link_count,
 #endif
       &tmp_hs_r_name, sizeof(char) * share->hs_read_conn_keys_charlen,
       &share->hs_write_conn_keys, sizeof(char *) * share->all_link_count,
-      &share->hs_write_conn_keys_lengths, sizeof(uint) * share->all_link_count,
+      &share->hs_write_conn_keys_lengths, length_base,
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
       &share->hs_write_conn_keys_hash_value,
         sizeof(my_hash_value_type) * share->all_link_count,
 #endif
       &tmp_hs_w_name, sizeof(char) * share->hs_write_conn_keys_charlen,
 #endif
-      &share->sql_dbton_ids, sizeof(uint) * share->all_link_count,
+      &share->sql_dbton_ids, length_base,
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-      &share->hs_dbton_ids, sizeof(uint) * share->all_link_count,
+      &share->hs_dbton_ids, length_base,
 #endif
       NullS))
   ) {
@@ -4125,20 +4411,32 @@ int spider_create_conn_keys(
   }
   share->conn_keys_length = share->all_link_count;
   memcpy(share->conn_keys_lengths, conn_keys_lengths,
-    sizeof(uint) * share->all_link_count);
+    length_base);
+  memcpy(share->sql_dbton_ids, sql_dbton_ids, length_base);
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   share->hs_read_conn_keys_length = share->all_link_count;
   share->hs_write_conn_keys_length = share->all_link_count;
   memcpy(share->hs_read_conn_keys_lengths, hs_r_conn_keys_lengths,
-    sizeof(uint) * share->all_link_count);
+    length_base);
   memcpy(share->hs_write_conn_keys_lengths, hs_w_conn_keys_lengths,
-    sizeof(uint) * share->all_link_count);
+    length_base);
+  memcpy(share->hs_dbton_ids, hs_dbton_ids, length_base);
 #endif
 
   my_afree(conn_keys_lengths);
 
   for (roop_count = 0; roop_count < (int) share->all_link_count; roop_count++)
   {
+    bool tables_on_different_db_are_joinable;
+    if (share->sql_dbton_ids[roop_count] != SPIDER_DBTON_SIZE)
+    {
+      tables_on_different_db_are_joinable =
+        spider_dbton[share->sql_dbton_ids[roop_count]].db_util->
+          tables_on_different_db_are_joinable();
+    } else {
+      tables_on_different_db_are_joinable = TRUE;
+    }
+
     share->conn_keys[roop_count] = tmp_name;
     *tmp_name = '0';
     DBUG_PRINT("info",("spider tgt_wrappers[%d]=%s", roop_count,
@@ -4157,6 +4455,16 @@ int spider_create_conn_keys(
       tmp_name = strmov(tmp_name + 1, share->tgt_sockets[roop_count]);
     } else
       tmp_name++;
+    if (!tables_on_different_db_are_joinable)
+    {
+      if (share->tgt_dbs[roop_count])
+      {
+        DBUG_PRINT("info",("spider tgt_dbs[%d]=%s", roop_count,
+          share->tgt_dbs[roop_count]));
+        tmp_name = strmov(tmp_name + 1, share->tgt_dbs[roop_count]);
+      } else
+        tmp_name++;
+    }
     if (share->tgt_usernames[roop_count])
     {
       DBUG_PRINT("info",("spider tgt_usernames[%d]=%s", roop_count,
@@ -4222,6 +4530,13 @@ int spider_create_conn_keys(
       tmp_name = strmov(tmp_name + 1, share->tgt_default_groups[roop_count]);
     } else
       tmp_name++;
+    if (share->tgt_dsns[roop_count])
+    {
+      DBUG_PRINT("info",("spider tgt_dsns[%d]=%s", roop_count,
+        share->tgt_dsns[roop_count]));
+      tmp_name = strmov(tmp_name + 1, share->tgt_dsns[roop_count]);
+    } else
+      tmp_name++;
     tmp_name++;
     tmp_name++;
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
@@ -4285,59 +4600,6 @@ int spider_create_conn_keys(
       share->hs_write_conn_keys_lengths[roop_count]);
 #endif
 #endif
-
-    bool get_sql_id = FALSE;
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-    bool get_nosql_id = FALSE;
-#endif
-    for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
-    {
-      DBUG_PRINT("info",("spider share->tgt_wrappers[%d]=%s", roop_count,
-        share->tgt_wrappers[roop_count]));
-      DBUG_PRINT("info",("spider spider_dbton[%d].wrapper=%s", roop_count2,
-        spider_dbton[roop_count2].wrapper ?
-          spider_dbton[roop_count2].wrapper : "NULL"));
-      if (
-        spider_dbton[roop_count2].wrapper &&
-        !strcmp(share->tgt_wrappers[roop_count],
-          spider_dbton[roop_count2].wrapper)
-      ) {
-        spider_set_bit(share->dbton_bitmap, roop_count2);
-        if (
-          !get_sql_id &&
-          spider_dbton[roop_count2].db_access_type == SPIDER_DB_ACCESS_TYPE_SQL
-        ) {
-          share->sql_dbton_ids[roop_count] = roop_count2;
-          get_sql_id = TRUE;
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-          if (get_nosql_id)
-#endif
-            break;
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-          else
-            continue;
-#endif
-        }
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-        if (
-          !get_nosql_id &&
-          spider_dbton[roop_count2].db_access_type ==
-            SPIDER_DB_ACCESS_TYPE_NOSQL
-        ) {
-          share->hs_dbton_ids[roop_count] = roop_count2;
-          get_nosql_id = TRUE;
-          if (get_sql_id)
-            break;
-        }
-#endif
-      }
-    }
-    if (!get_sql_id)
-      share->sql_dbton_ids[roop_count] = SPIDER_DBTON_SIZE;
-#if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-    if (!get_nosql_id)
-      share->hs_dbton_ids[roop_count] = SPIDER_DBTON_SIZE;
-#endif
   }
   for (roop_count2 = 0; roop_count2 < SPIDER_DBTON_SIZE; roop_count2++)
   {
@@ -4394,13 +4656,17 @@ SPIDER_SHARE *spider_create_share(
   bitmap_size = spider_bitmap_size(table_share->fields);
   if (!(share = (SPIDER_SHARE *)
     spider_bulk_malloc(spider_current_trx, 46, MYF(MY_WME | MY_ZEROFILL),
-      &share, sizeof(*share),
-      &tmp_name, length + 1,
-      &tmp_static_key_cardinality, sizeof(*tmp_static_key_cardinality) * table_share->keys,
-      &tmp_cardinality, sizeof(*tmp_cardinality) * table_share->fields,
-      &tmp_cardinality_upd, sizeof(*tmp_cardinality_upd) * bitmap_size,
-      &tmp_table_mon_mutex_bitmap, sizeof(*tmp_table_mon_mutex_bitmap) *
-        ((spider_param_udf_table_mon_mutex_count() + 7) / 8),
+      &share, (uint) (sizeof(*share)),
+      &tmp_name, (uint) (length + 1),
+      &tmp_static_key_cardinality,
+        (uint) (sizeof(*tmp_static_key_cardinality) * table_share->keys),
+      &tmp_cardinality,
+        (uint) (sizeof(*tmp_cardinality) * table_share->fields),
+      &tmp_cardinality_upd,
+        (uint) (sizeof(*tmp_cardinality_upd) * bitmap_size),
+      &tmp_table_mon_mutex_bitmap,
+        (uint) (sizeof(*tmp_table_mon_mutex_bitmap) *
+          ((spider_param_udf_table_mon_mutex_count() + 7) / 8)),
       NullS))
   ) {
     *error_num = HA_ERR_OUT_OF_MEM;
@@ -4513,8 +4779,6 @@ SPIDER_SHARE *spider_create_share(
     goto error_init_crd_mutex;
   }
 
-  thr_lock_init(&share->lock);
-
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
   if (!(share->lgtm_tblhnd_share =
     spider_get_lgtm_tblhnd_share(tmp_name, length, hash_value, FALSE, TRUE,
@@ -4527,11 +4791,9 @@ SPIDER_SHARE *spider_create_share(
     goto error_get_lgtm_tblhnd_share;
   }
 
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (!(share->partition_share =
-    spider_get_pt_share(share, table_share, error_num)))
-    goto error_get_pt_share;
-#endif
+  if (!(share->wide_share =
+    spider_get_wide_share(share, table_share, error_num)))
+    goto error_get_wide_share;
 
   for (roop_count = 0; roop_count < SPIDER_DBTON_SIZE; roop_count++)
   {
@@ -4579,12 +4841,9 @@ error_init_dbton:
       share->dbton_share[roop_count] = NULL;
     }
   }
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-  spider_free_pt_share(share->partition_share);
-error_get_pt_share:
-#endif
+  spider_free_wide_share(share->wide_share);
+error_get_wide_share:
 error_get_lgtm_tblhnd_share:
-  thr_lock_delete(&share->lock);
   pthread_mutex_destroy(&share->crd_mutex);
 error_init_crd_mutex:
   pthread_mutex_destroy(&share->sts_mutex);
@@ -4610,7 +4869,7 @@ SPIDER_SHARE *spider_get_share(
   SPIDER_SHARE *share;
   TABLE_SHARE *table_share = table->s;
   SPIDER_RESULT_LIST *result_list = &spider->result_list;
-  uint length, tmp_conn_link_idx = 0;
+  uint length, tmp_conn_link_idx = 0, buf_sz;
   char *tmp_name, *tmp_cid;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   char *tmp_hs_r_name, *tmp_hs_w_name;
@@ -4634,24 +4893,67 @@ SPIDER_SHARE *spider_get_share(
   int semi_table_lock_conn;
   int search_link_idx;
   uint sql_command = thd_sql_command(thd);
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   MEM_ROOT mem_root;
   TABLE *table_tables = NULL;
   bool init_mem_root = FALSE;
   bool same_server_link;
   int load_sts_at_startup;
   int load_crd_at_startup;
+  user_var_entry *loop_check;
+  char *loop_check_buf;
+  TABLE_SHARE *top_share;
+  LEX_CSTRING lex_str;
   DBUG_ENTER("spider_get_share");
-
+  top_share = spider->wide_handler->top_share;
   length = (uint) strlen(table_name);
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
   my_hash_value_type hash_value = my_calc_hash(&spider_open_tables,
     (uchar*) table_name, length);
 #endif
+  if (top_share)
+  {
+    lex_str.length = top_share->path.length + SPIDER_SQL_LOP_CHK_PRM_PRF_LEN;
+    buf_sz = spider_unique_id.length > SPIDER_SQL_LOP_CHK_PRM_PRF_LEN ?
+      top_share->path.length + spider_unique_id.length + 2 :
+      lex_str.length + 2;
+    loop_check_buf = (char *) my_alloca(buf_sz);
+    if (unlikely(!loop_check_buf))
+    {
+      *error_num = HA_ERR_OUT_OF_MEM;
+      DBUG_RETURN(NULL);
+    }
+    lex_str.str = loop_check_buf + buf_sz - lex_str.length - 2;
+    memcpy((void *) lex_str.str,
+      SPIDER_SQL_LOP_CHK_PRM_PRF_STR, SPIDER_SQL_LOP_CHK_PRM_PRF_LEN);
+    memcpy((void *) (lex_str.str + SPIDER_SQL_LOP_CHK_PRM_PRF_LEN),
+      top_share->path.str, top_share->path.length);
+    ((char *) lex_str.str)[lex_str.length] = '\0';
+    DBUG_PRINT("info",("spider loop check param name=%s", lex_str.str));
+    loop_check = get_variable(&thd->user_vars, &lex_str, FALSE);
+    if (loop_check && loop_check->type == STRING_RESULT)
+    {
+      lex_str.length = top_share->path.length + spider_unique_id.length + 1;
+      lex_str.str = loop_check_buf + buf_sz - top_share->path.length -
+        spider_unique_id.length - 2;
+      memcpy((void *) lex_str.str, spider_unique_id.str,
+        spider_unique_id.length);
+      ((char *) lex_str.str)[lex_str.length - 1] = '-';
+      ((char *) lex_str.str)[lex_str.length] = '\0';
+      DBUG_PRINT("info",("spider loop check key=%s", lex_str.str));
+      DBUG_PRINT("info",("spider loop check param value=%s",
+        loop_check->value));
+      if (unlikely(strstr(loop_check->value, lex_str.str)))
+      {
+        *error_num = ER_SPIDER_INFINITE_LOOP_NUM;
+        my_printf_error(*error_num, ER_SPIDER_INFINITE_LOOP_STR, MYF(0),
+          top_share->db.str, top_share->table_name.str);
+        my_afree(loop_check_buf);
+        DBUG_RETURN(NULL);
+      }
+    }
+    my_afree(loop_check_buf);
+  }
   pthread_mutex_lock(&spider_tbl_mutex);
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
   if (!(share = (SPIDER_SHARE*) my_hash_search_using_hash_value(
@@ -4727,6 +5029,7 @@ SPIDER_SHARE *spider_get_share(
         {
           SPD_INIT_ALLOC_ROOT(&mem_root, 4096, 0, MYF(MY_WME));
           init_mem_root = TRUE;
+
           if (
             !(table_tables = spider_open_sys_table(
               thd, SPIDER_SYS_TABLES_TABLE_NAME_STR,
@@ -4767,21 +5070,21 @@ SPIDER_SHARE *spider_get_share(
               share->init_error_time = (time_t) time((time_t*) 0);
               share->init = TRUE;
               spider_free_share(share);
-              goto error_get_link_statuses;
+              spider_close_sys_table(thd, table_tables,
+                &open_tables_backup, FALSE);
+              table_tables = NULL;
+              goto error_open_sys_table;
             }
           } else {
             memcpy(share->alter_table.tmp_link_statuses, share->link_statuses,
               sizeof(long) * share->all_link_count);
             share->link_status_init = TRUE;
           }
-        }
-        share->have_recovery_link = spider_conn_check_recovery_link(share);
-        if (table_tables)
-        {
           spider_close_sys_table(thd, table_tables,
             &open_tables_backup, FALSE);
           table_tables = NULL;
         }
+        share->have_recovery_link = spider_conn_check_recovery_link(share);
         if (init_mem_root)
         {
           free_root(&mem_root, MYF(0));
@@ -4806,7 +5109,7 @@ SPIDER_SHARE *spider_get_share(
     else
       first_byte = '0';
 
-    if (!(spider->trx = spider_get_trx(thd, TRUE, error_num)))
+    if (!(spider->wide_handler->trx = spider_get_trx(thd, TRUE, error_num)))
     {
       share->init_error = TRUE;
       share->init_error_time = (time_t) time((time_t*) 0);
@@ -4823,7 +5126,7 @@ SPIDER_SHARE *spider_get_share(
       if (!share->sts_spider_init)
       {
         if ((*error_num = spider_create_spider_object_for_share(
-          spider->trx, share, &share->sts_spider)))
+          spider->wide_handler->trx, share, &share->sts_spider)))
         {
           pthread_mutex_unlock(&share->mutex);
           share->init_error = TRUE;
@@ -4851,7 +5154,7 @@ SPIDER_SHARE *spider_get_share(
       if (!share->crd_spider_init)
       {
         if ((*error_num = spider_create_spider_object_for_share(
-          spider->trx, share, &share->crd_spider)))
+          spider->wide_handler->trx, share, &share->crd_spider)))
         {
           pthread_mutex_unlock(&share->mutex);
           share->init_error = TRUE;
@@ -4879,7 +5182,8 @@ SPIDER_SHARE *spider_get_share(
       sql_command != SQLCOM_DROP_TABLE &&
       sql_command != SQLCOM_ALTER_TABLE &&
       sql_command != SQLCOM_SHOW_CREATE &&
-      (*error_num = spider_create_mon_threads(spider->trx, share))
+      (*error_num = spider_create_mon_threads(spider->wide_handler->trx,
+        share))
     ) {
       share->init_error = TRUE;
       share->init_error_time = (time_t) time((time_t*) 0);
@@ -5050,7 +5354,8 @@ SPIDER_SHARE *spider_get_share(
         if (
           !(spider->conns[roop_count] =
             spider_get_conn(share, roop_count, spider->conn_keys[roop_count],
-              spider->trx, spider, FALSE, TRUE, SPIDER_CONN_KIND_MYSQL,
+              spider->wide_handler->trx, spider, FALSE, TRUE,
+              SPIDER_CONN_KIND_MYSQL,
               error_num))
         ) {
           if (
@@ -5058,8 +5363,8 @@ SPIDER_SHARE *spider_get_share(
             spider->need_mons[roop_count]
           ) {
             *error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -5158,7 +5463,7 @@ SPIDER_SHARE *spider_get_share(
       pthread_mutex_lock(&share->sts_mutex);
       pthread_mutex_lock(&share->crd_mutex);
       if ((spider_init_error_table =
-        spider_get_init_error_table(spider->trx, share, FALSE)))
+        spider_get_init_error_table(spider->wide_handler->trx, share, FALSE)))
       {
         DBUG_PRINT("info",("spider diff1=%f",
           difftime(tmp_time, spider_init_error_table->init_error_time)));
@@ -5248,6 +5553,7 @@ SPIDER_SHARE *spider_get_share(
         my_printf_error(ER_SPIDER_TABLE_OPEN_TIMEOUT_NUM,
           ER_SPIDER_TABLE_OPEN_TIMEOUT_STR, MYF(0),
           table_share->db.str, table_share->table_name.str);
+        spider_free_share(share);
         goto error_but_no_delete;
       }
       my_sleep(10000); // wait 10 ms
@@ -5265,6 +5571,7 @@ SPIDER_SHARE *spider_get_share(
       }
       if (!share->link_status_init)
       {
+        DBUG_ASSERT(!table_tables);
         /*
           The link statuses need to be refreshed from the spider_tables table
           if the operation:
@@ -5281,6 +5588,7 @@ SPIDER_SHARE *spider_get_share(
         {
           SPD_INIT_ALLOC_ROOT(&mem_root, 4096, 0, MYF(MY_WME));
           init_mem_root = TRUE;
+
           if (
             !(table_tables = spider_open_sys_table(
               thd, SPIDER_SYS_TABLES_TABLE_NAME_STR,
@@ -5315,21 +5623,21 @@ SPIDER_SHARE *spider_get_share(
               }
               pthread_mutex_unlock(&share->mutex);
               spider_free_share(share);
-              goto error_get_link_statuses;
+              spider_close_sys_table(thd, table_tables,
+                &open_tables_backup, FALSE);
+              table_tables = NULL;
+              goto error_open_sys_table;
             }
           } else {
             memcpy(share->alter_table.tmp_link_statuses, share->link_statuses,
               sizeof(long) * share->all_link_count);
             share->link_status_init = TRUE;
           }
-        }
-        share->have_recovery_link = spider_conn_check_recovery_link(share);
-        if (table_tables)
-        {
           spider_close_sys_table(thd, table_tables,
             &open_tables_backup, FALSE);
           table_tables = NULL;
         }
+        share->have_recovery_link = spider_conn_check_recovery_link(share);
         if (init_mem_root)
         {
           free_root(&mem_root, MYF(0));
@@ -5355,7 +5663,7 @@ SPIDER_SHARE *spider_get_share(
       first_byte = '0';
 
     spider->share = share;
-    if (!(spider->trx = spider_get_trx(thd, TRUE, error_num)))
+    if (!(spider->wide_handler->trx = spider_get_trx(thd, TRUE, error_num)))
     {
       spider_free_share(share);
       goto error_but_no_delete;
@@ -5369,7 +5677,7 @@ SPIDER_SHARE *spider_get_share(
       if (!share->sts_spider_init)
       {
         if ((*error_num = spider_create_spider_object_for_share(
-          spider->trx, share, &share->sts_spider)))
+          spider->wide_handler->trx, share, &share->sts_spider)))
         {
           pthread_mutex_unlock(&share->mutex);
           spider_free_share(share);
@@ -5394,7 +5702,7 @@ SPIDER_SHARE *spider_get_share(
       if (!share->crd_spider_init)
       {
         if ((*error_num = spider_create_spider_object_for_share(
-          spider->trx, share, &share->crd_spider)))
+          spider->wide_handler->trx, share, &share->crd_spider)))
         {
           pthread_mutex_unlock(&share->mutex);
           spider_free_share(share);
@@ -5419,7 +5727,8 @@ SPIDER_SHARE *spider_get_share(
       sql_command != SQLCOM_DROP_TABLE &&
       sql_command != SQLCOM_ALTER_TABLE &&
       sql_command != SQLCOM_SHOW_CREATE &&
-      (*error_num = spider_create_mon_threads(spider->trx, share))
+      (*error_num = spider_create_mon_threads(spider->wide_handler->trx,
+        share))
     ) {
       spider_free_share(share);
       goto error_but_no_delete;
@@ -5581,7 +5890,8 @@ SPIDER_SHARE *spider_get_share(
         if (
           !(spider->conns[roop_count] =
             spider_get_conn(share, roop_count, spider->conn_keys[roop_count],
-              spider->trx, spider, FALSE, TRUE, SPIDER_CONN_KIND_MYSQL,
+              spider->wide_handler->trx, spider, FALSE, TRUE,
+              SPIDER_CONN_KIND_MYSQL,
               error_num))
         ) {
           if (
@@ -5589,8 +5899,8 @@ SPIDER_SHARE *spider_get_share(
             spider->need_mons[roop_count]
           ) {
             *error_num = spider_ping_table_mon_from_table(
-                spider->trx,
-                spider->trx->thd,
+                spider->wide_handler->trx,
+                spider->wide_handler->trx->thd,
                 share,
                 roop_count,
                 (uint32) share->monitoring_sid[roop_count],
@@ -5681,7 +5991,8 @@ SPIDER_SHARE *spider_get_share(
 #endif
           time_t tmp_time = (time_t) time((time_t*) 0);
           if ((spider_init_error_table =
-            spider_get_init_error_table(spider->trx, share, FALSE)))
+            spider_get_init_error_table(spider->wide_handler->trx, share,
+              FALSE)))
           {
             DBUG_PRINT("info",("spider diff2=%f",
               difftime(tmp_time, spider_init_error_table->init_error_time)));
@@ -5772,13 +6083,6 @@ error_hash_insert:
   spider_free_share_resource_only(share);
 error_alloc_share:
   pthread_mutex_unlock(&spider_tbl_mutex);
-error_get_link_statuses:
-  if (table_tables)
-  {
-    spider_close_sys_table(thd, table_tables,
-      &open_tables_backup, FALSE);
-    table_tables = NULL;
-  }
 error_open_sys_table:
 #ifndef WITHOUT_SPIDER_BG_SEARCH
 error_crd_spider_init:
@@ -5798,7 +6102,6 @@ void spider_free_share_resource_only(
 ) {
   DBUG_ENTER("spider_free_share_resource_only");
   spider_free_share_alloc(share);
-  thr_lock_delete(&share->lock);
   pthread_mutex_destroy(&share->crd_mutex);
   pthread_mutex_destroy(&share->sts_mutex);
   pthread_mutex_destroy(&share->mutex);
@@ -5878,7 +6181,6 @@ int spider_free_share(
 #else
     my_hash_delete(&spider_open_tables, (uchar*) share);
 #endif
-    thr_lock_delete(&share->lock);
     pthread_mutex_destroy(&share->crd_mutex);
     pthread_mutex_destroy(&share->sts_mutex);
     pthread_mutex_destroy(&share->mutex);
@@ -5966,8 +6268,8 @@ SPIDER_LGTM_TBLHND_SHARE *spider_get_lgtm_tblhnd_share(
     DBUG_PRINT("info",("spider create new lgtm tblhnd share"));
     if (!(lgtm_tblhnd_share = (SPIDER_LGTM_TBLHND_SHARE *)
       spider_bulk_malloc(spider_current_trx, 244, MYF(MY_WME | MY_ZEROFILL),
-        &lgtm_tblhnd_share, sizeof(*lgtm_tblhnd_share),
-        &tmp_name, table_name_length + 1,
+        &lgtm_tblhnd_share, (uint) (sizeof(*lgtm_tblhnd_share)),
+        &tmp_name, (uint) (table_name_length + 1),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -6050,59 +6352,59 @@ void spider_free_lgtm_tblhnd_share_alloc(
   DBUG_VOID_RETURN;
 }
 
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-SPIDER_PARTITION_SHARE *spider_get_pt_share(
+SPIDER_WIDE_SHARE *spider_get_wide_share(
   SPIDER_SHARE *share,
   TABLE_SHARE *table_share,
   int *error_num
 ) {
-  SPIDER_PARTITION_SHARE *partition_share;
+  SPIDER_WIDE_SHARE *wide_share;
   char *tmp_name;
   longlong *tmp_cardinality;
-  DBUG_ENTER("spider_get_pt_share");
+  DBUG_ENTER("spider_get_wide_share");
 
-  pthread_mutex_lock(&spider_pt_share_mutex);
+  pthread_mutex_lock(&spider_wide_share_mutex);
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
-  if (!(partition_share = (SPIDER_PARTITION_SHARE*)
+  if (!(wide_share = (SPIDER_WIDE_SHARE*)
     my_hash_search_using_hash_value(
-    &spider_open_pt_share, share->table_path_hash_value,
+    &spider_open_wide_share, share->table_path_hash_value,
     (uchar*) table_share->path.str, table_share->path.length)))
 #else
-  if (!(partition_share = (SPIDER_PARTITION_SHARE*) my_hash_search(
-    &spider_open_pt_share,
+  if (!(wide_share = (SPIDER_WIDE_SHARE*) my_hash_search(
+    &spider_open_wide_share,
     (uchar*) table_share->path.str, table_share->path.length)))
 #endif
   {
-    DBUG_PRINT("info",("spider create new pt share"));
-    if (!(partition_share = (SPIDER_PARTITION_SHARE *)
+    DBUG_PRINT("info",("spider create new wide share"));
+    if (!(wide_share = (SPIDER_WIDE_SHARE *)
       spider_bulk_malloc(spider_current_trx, 51, MYF(MY_WME | MY_ZEROFILL),
-        &partition_share, sizeof(*partition_share),
-        &tmp_name, table_share->path.length + 1,
-        &tmp_cardinality, sizeof(*tmp_cardinality) * table_share->fields,
+        &wide_share, sizeof(SPIDER_WIDE_SHARE),
+        &tmp_name, (uint) (table_share->path.length + 1),
+        &tmp_cardinality,
+          (uint) (sizeof(*tmp_cardinality) * table_share->fields),
         NullS))
     ) {
       *error_num = HA_ERR_OUT_OF_MEM;
       goto error_alloc_share;
     }
 
-    partition_share->use_count = 0;
-    partition_share->table_name_length = table_share->path.length;
-    partition_share->table_name = tmp_name;
-    memcpy(partition_share->table_name, table_share->path.str,
-      partition_share->table_name_length);
+    wide_share->use_count = 0;
+    wide_share->table_name_length = table_share->path.length;
+    wide_share->table_name = tmp_name;
+    memcpy(wide_share->table_name, table_share->path.str,
+      wide_share->table_name_length);
 #ifdef SPIDER_HAS_HASH_VALUE_TYPE
-    partition_share->table_path_hash_value = share->table_path_hash_value;
+    wide_share->table_path_hash_value = share->table_path_hash_value;
 #endif
-    partition_share->cardinality = tmp_cardinality;
+    wide_share->cardinality = tmp_cardinality;
 
-    partition_share->crd_get_time = partition_share->sts_get_time =
+    wide_share->crd_get_time = wide_share->sts_get_time =
       share->crd_get_time;
 
 #if MYSQL_VERSION_ID < 50500
-    if (pthread_mutex_init(&partition_share->sts_mutex, MY_MUTEX_INIT_FAST))
+    if (pthread_mutex_init(&wide_share->sts_mutex, MY_MUTEX_INIT_FAST))
 #else
-    if (mysql_mutex_init(spd_key_mutex_pt_share_sts,
-      &partition_share->sts_mutex, MY_MUTEX_INIT_FAST))
+    if (mysql_mutex_init(spd_key_mutex_wide_share_sts,
+      &wide_share->sts_mutex, MY_MUTEX_INIT_FAST))
 #endif
     {
       *error_num = HA_ERR_OUT_OF_MEM;
@@ -6110,153 +6412,116 @@ SPIDER_PARTITION_SHARE *spider_get_pt_share(
     }
 
 #if MYSQL_VERSION_ID < 50500
-    if (pthread_mutex_init(&partition_share->crd_mutex, MY_MUTEX_INIT_FAST))
+    if (pthread_mutex_init(&wide_share->crd_mutex, MY_MUTEX_INIT_FAST))
 #else
-    if (mysql_mutex_init(spd_key_mutex_pt_share_crd,
-      &partition_share->crd_mutex, MY_MUTEX_INIT_FAST))
+    if (mysql_mutex_init(spd_key_mutex_wide_share_crd,
+      &wide_share->crd_mutex, MY_MUTEX_INIT_FAST))
 #endif
     {
       *error_num = HA_ERR_OUT_OF_MEM;
       goto error_init_crd_mutex;
     }
 
-#if MYSQL_VERSION_ID < 50500
-    if (pthread_mutex_init(&partition_share->pt_handler_mutex,
-      MY_MUTEX_INIT_FAST))
-#else
-    if (mysql_mutex_init(spd_key_mutex_pt_handler,
-      &partition_share->pt_handler_mutex, MY_MUTEX_INIT_FAST))
-#endif
-    {
-      *error_num = HA_ERR_OUT_OF_MEM;
-      goto error_init_pt_handler_mutex;
-    }
-
-    if(
-      my_hash_init(&partition_share->pt_handler_hash, spd_charset_utf8_bin,
-        32, 0, 0, (my_hash_get_key) spider_pt_handler_share_get_key, 0, 0)
-    ) {
-      *error_num = HA_ERR_OUT_OF_MEM;
-      goto error_init_pt_handler_hash;
-    }
-    spider_alloc_calc_mem_init(partition_share->pt_handler_hash, 142);
-    spider_alloc_calc_mem(spider_current_trx,
-      partition_share->pt_handler_hash,
-      partition_share->pt_handler_hash.array.max_element *
-      partition_share->pt_handler_hash.array.size_of_element);
+    thr_lock_init(&wide_share->lock);
 
-    uint old_elements = spider_open_pt_share.array.max_element;
+    uint old_elements = spider_open_wide_share.array.max_element;
 #ifdef HASH_UPDATE_WITH_HASH_VALUE
-    if (my_hash_insert_with_hash_value(&spider_open_pt_share,
+    if (my_hash_insert_with_hash_value(&spider_open_wide_share,
       share->table_path_hash_value,
-      (uchar*) partition_share))
+      (uchar*) wide_share))
 #else
-    if (my_hash_insert(&spider_open_pt_share, (uchar*) partition_share))
+    if (my_hash_insert(&spider_open_wide_share, (uchar*) wide_share))
 #endif
     {
       *error_num = HA_ERR_OUT_OF_MEM;
       goto error_hash_insert;
     }
-    if (spider_open_pt_share.array.max_element > old_elements)
+    if (spider_open_wide_share.array.max_element > old_elements)
     {
       spider_alloc_calc_mem(spider_current_trx,
-        spider_open_pt_share,
-        (spider_open_pt_share.array.max_element - old_elements) *
-        spider_open_pt_share.array.size_of_element);
+        spider_open_wide_share,
+        (spider_open_wide_share.array.max_element - old_elements) *
+        spider_open_wide_share.array.size_of_element);
     }
   }
-  partition_share->use_count++;
-  pthread_mutex_unlock(&spider_pt_share_mutex);
+  wide_share->use_count++;
+  pthread_mutex_unlock(&spider_wide_share_mutex);
 
-  DBUG_PRINT("info",("spider partition_share=%p", partition_share));
-  DBUG_RETURN(partition_share);
+  DBUG_PRINT("info",("spider wide_share=%p", wide_share));
+  DBUG_RETURN(wide_share);
 
 error_hash_insert:
-  spider_free_mem_calc(spider_current_trx,
-    partition_share->pt_handler_hash_id,
-    partition_share->pt_handler_hash.array.max_element *
-    partition_share->pt_handler_hash.array.size_of_element);
-  my_hash_free(&partition_share->pt_handler_hash);
-error_init_pt_handler_hash:
-  pthread_mutex_destroy(&partition_share->pt_handler_mutex);
-error_init_pt_handler_mutex:
-  pthread_mutex_destroy(&partition_share->crd_mutex);
+  pthread_mutex_destroy(&wide_share->crd_mutex);
 error_init_crd_mutex:
-  pthread_mutex_destroy(&partition_share->sts_mutex);
+  pthread_mutex_destroy(&wide_share->sts_mutex);
 error_init_sts_mutex:
-  spider_free(spider_current_trx, partition_share, MYF(0));
+  spider_free(spider_current_trx, wide_share, MYF(0));
 error_alloc_share:
-  pthread_mutex_unlock(&spider_pt_share_mutex);
+  pthread_mutex_unlock(&spider_wide_share_mutex);
   DBUG_RETURN(NULL);
 }
 
-int spider_free_pt_share(
-  SPIDER_PARTITION_SHARE *partition_share
+int spider_free_wide_share(
+  SPIDER_WIDE_SHARE *wide_share /* share whose use_count is dropped by one */
 ) {
-  DBUG_ENTER("spider_free_pt_share");
-  pthread_mutex_lock(&spider_pt_share_mutex);
-  if (!--partition_share->use_count)
+  DBUG_ENTER("spider_free_wide_share"); /* function always returns 0 */
+  pthread_mutex_lock(&spider_wide_share_mutex); /* covers count and hash */
+  if (!--wide_share->use_count) /* count hit zero: tear the share down */
   {
+    thr_lock_delete(&wide_share->lock); /* inited in spider_get_wide_share */
 #ifdef HASH_UPDATE_WITH_HASH_VALUE
-    my_hash_delete_with_hash_value(&spider_open_pt_share,
-      partition_share->table_path_hash_value, (uchar*) partition_share);
+    my_hash_delete_with_hash_value(&spider_open_wide_share,
+      wide_share->table_path_hash_value, (uchar*) wide_share);
 #else
-    my_hash_delete(&spider_open_pt_share, (uchar*) partition_share);
+    my_hash_delete(&spider_open_wide_share, (uchar*) wide_share);
 #endif
-    spider_free_mem_calc(spider_current_trx,
-      partition_share->pt_handler_hash_id,
-      partition_share->pt_handler_hash.array.max_element *
-      partition_share->pt_handler_hash.array.size_of_element);
-    my_hash_free(&partition_share->pt_handler_hash);
-    pthread_mutex_destroy(&partition_share->pt_handler_mutex);
-    pthread_mutex_destroy(&partition_share->crd_mutex);
-    pthread_mutex_destroy(&partition_share->sts_mutex);
-    spider_free(spider_current_trx, partition_share, MYF(0));
-  }
-  pthread_mutex_unlock(&spider_pt_share_mutex);
+    pthread_mutex_destroy(&wide_share->crd_mutex);
+    pthread_mutex_destroy(&wide_share->sts_mutex);
+    spider_free(spider_current_trx, wide_share, MYF(0));
+  }
+  pthread_mutex_unlock(&spider_wide_share_mutex);
   DBUG_RETURN(0);
 }
 
-void spider_copy_sts_to_pt_share(
-  SPIDER_PARTITION_SHARE *partition_share,
+void spider_copy_sts_to_wide_share(
+  SPIDER_WIDE_SHARE *wide_share, /* stat member is overwritten */
   SPIDER_SHARE *share
 ) {
-  DBUG_ENTER("spider_copy_sts_to_pt_share");
-  partition_share->stat = share->stat;
+  DBUG_ENTER("spider_copy_sts_to_wide_share"); /* tag = function name */
+  wide_share->stat = share->stat; /* copy direction: share to wide_share */
   DBUG_VOID_RETURN;
 }
 
 void spider_copy_sts_to_share(
   SPIDER_SHARE *share,
-  SPIDER_PARTITION_SHARE *partition_share
+  SPIDER_WIDE_SHARE *wide_share /* source of the stat value */
 ) {
   DBUG_ENTER("spider_copy_sts_to_share");
-  share->stat = partition_share->stat;
+  share->stat = wide_share->stat; /* copy direction: wide_share to share */
   DBUG_VOID_RETURN;
 }
 
-void spider_copy_crd_to_pt_share(
-  SPIDER_PARTITION_SHARE *partition_share,
+void spider_copy_crd_to_wide_share(
+  SPIDER_WIDE_SHARE *wide_share, /* cardinality array is overwritten */
   SPIDER_SHARE *share,
   int fields
 ) {
-  DBUG_ENTER("spider_copy_crd_to_pt_share");
-  memcpy(partition_share->cardinality, share->cardinality,
+  DBUG_ENTER("spider_copy_crd_to_wide_share"); /* fields is not checked */
+  memcpy(wide_share->cardinality, share->cardinality, /* fields longlongs */
     sizeof(longlong) * fields);
   DBUG_VOID_RETURN;
 }
 
 void spider_copy_crd_to_share(
   SPIDER_SHARE *share,
-  SPIDER_PARTITION_SHARE *partition_share,
+  SPIDER_WIDE_SHARE *wide_share, /* cardinality array is the source */
   int fields
 ) {
   DBUG_ENTER("spider_copy_crd_to_share");
-  memcpy(share->cardinality, partition_share->cardinality,
+  memcpy(share->cardinality, wide_share->cardinality, /* fields longlongs */
     sizeof(longlong) * fields);
   DBUG_VOID_RETURN;
 }
-#endif
 
 int spider_open_all_tables(
   SPIDER_TRX *trx,
@@ -6280,11 +6545,7 @@ int spider_open_all_tables(
   long *long_info;
   longlong *longlong_info;
   MEM_ROOT mem_root;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_open_all_tables");
   if (
     !(table_tables = spider_open_sys_table(
@@ -6427,19 +6688,22 @@ int spider_open_all_tables(
         free_root(&mem_root, MYF(0));
         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
       }
-      spider->lock_type = TL_READ_NO_INSERT;
+      spider->wide_handler->lock_type = TL_READ_NO_INSERT;
 
       if (!(share = (SPIDER_SHARE *)
         spider_bulk_malloc(spider_current_trx, 52, MYF(MY_WME | MY_ZEROFILL),
-          &share, sizeof(*share),
-          &connect_info, sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT,
-          &connect_info_length, sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT,
-          &long_info, sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT,
-          &longlong_info, sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT,
-          &conns, sizeof(SPIDER_CONN *),
-          &need_mon, sizeof(int),
-          &spider->conn_link_idx, sizeof(uint),
-          &spider->conn_can_fo, sizeof(uchar),
+          &share, (uint) (sizeof(*share)),
+          &connect_info,
+            (uint) (sizeof(char *) * SPIDER_TMP_SHARE_CHAR_PTR_COUNT),
+          &connect_info_length,
+            (uint) (sizeof(uint) * SPIDER_TMP_SHARE_UINT_COUNT),
+          &long_info, (uint) (sizeof(long) * SPIDER_TMP_SHARE_LONG_COUNT),
+          &longlong_info,
+            (uint) (sizeof(longlong) * SPIDER_TMP_SHARE_LONGLONG_COUNT),
+          &conns, (uint) (sizeof(SPIDER_CONN *)),
+          &need_mon, (uint) (sizeof(int)),
+          &spider->conn_link_idx, (uint) (sizeof(uint)),
+          &spider->conn_can_fo, (uint) (sizeof(uchar)),
           NullS))
       ) {
         delete spider;
@@ -6462,7 +6726,7 @@ int spider_open_all_tables(
       memcpy(longlong_info, &tmp_longlong, sizeof(longlong) *
         SPIDER_TMP_SHARE_LONGLONG_COUNT);
       spider->share = share;
-      spider->trx = trx;
+      spider->wide_handler->trx = trx;
       spider->conns = conns;
       spider->need_mons = need_mon;
       spider->conn_link_idx[0] = 0;
@@ -6636,7 +6900,7 @@ int spider_close_connection(
   }
 
   spider_rollback(spider_hton_ptr, thd, TRUE);
-  spider_free_trx(trx, TRUE);
+  spider_free_trx(trx, TRUE, false);
 
   DBUG_RETURN(0);
 }
@@ -6835,10 +7099,10 @@ int spider_db_done(
   my_hash_free(&spider_lgtm_tblhnd_share_hash);
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   spider_free_mem_calc(spider_current_trx,
-    spider_open_pt_share_id,
-    spider_open_pt_share.array.max_element *
-    spider_open_pt_share.array.size_of_element);
-  my_hash_free(&spider_open_pt_share);
+    spider_open_wide_share_id,
+    spider_open_wide_share.array.max_element *
+    spider_open_wide_share.array.size_of_element);
+  my_hash_free(&spider_open_wide_share);
 #endif
   pthread_mutex_lock(&spider_init_error_tbl_mutex);
   while ((spider_init_error_table = (SPIDER_INIT_ERROR_TABLE*)
@@ -6876,7 +7140,7 @@ int spider_db_done(
   pthread_mutex_destroy(&spider_conn_mutex);
   pthread_mutex_destroy(&spider_lgtm_tblhnd_share_mutex);
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  pthread_mutex_destroy(&spider_pt_share_mutex);
+  pthread_mutex_destroy(&spider_wide_share_mutex);
 #endif
   pthread_mutex_destroy(&spider_init_error_tbl_mutex);
   pthread_mutex_destroy(&spider_conn_id_mutex);
@@ -6926,11 +7190,11 @@ int spider_db_init(
 ) {
   int error_num = HA_ERR_OUT_OF_MEM, roop_count;
   uint dbton_id = 0;
+  uchar addr[6];
   handlerton *spider_hton = (handlerton *)p;
   DBUG_ENTER("spider_db_init");
   spider_hton_ptr = spider_hton;
 
-  spider_hton->state = SHOW_OPTION_YES;
   spider_hton->flags = HTON_NO_FLAGS;
 #ifdef HTON_CAN_READ_CONNECT_STRING_IN_PARTITION
   spider_hton->flags |= HTON_CAN_READ_CONNECT_STRING_IN_PARTITION;
@@ -6968,6 +7232,16 @@ int spider_db_init(
   spider_hton->create_group_by = spider_create_group_by_handler;
 #endif
 
+  if (my_gethwaddr((uchar *) addr))
+  {
+    my_printf_error(ER_SPIDER_CANT_NUM, ER_SPIDER_CANT_STR1, MYF(0),
+      "get hardware address with error ", errno);
+  }
+  spider_unique_id.str = spider_unique_id_buf;
+  spider_unique_id.length = my_sprintf(spider_unique_id_buf,
+    (spider_unique_id_buf, "-%02x%02x%02x%02x%02x%02x-%lx-",
+      addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (ulong) getpid()));
+
   memset(&spider_alloc_func_name, 0, sizeof(spider_alloc_func_name));
   memset(&spider_alloc_file_name, 0, sizeof(spider_alloc_file_name));
   memset(&spider_alloc_line_no, 0, sizeof(spider_alloc_line_no));
@@ -6976,62 +7250,6 @@ int spider_db_init(
   memset(&spider_alloc_mem_count, 0, sizeof(spider_alloc_mem_count));
   memset(&spider_free_mem_count, 0, sizeof(spider_free_mem_count));
 
-#ifdef _WIN32
-  HMODULE current_module = GetModuleHandle(NULL);
-#ifndef SPIDER_HAS_NEXT_THREAD_ID
-  spd_db_att_thread_id = (ulong *)
-    GetProcAddress(current_module, "?thread_id@@3KA");
-#endif
-#ifdef SPIDER_XID_USES_xid_cache_iterate
-#else
-#ifdef XID_CACHE_IS_SPLITTED
-  spd_db_att_xid_cache_split_num = (uint *)
-    GetProcAddress(current_module,
-      "?opt_xid_cache_split_num@@3IA");
-  spd_db_att_LOCK_xid_cache = *((pthread_mutex_t **)
-    GetProcAddress(current_module,
-      "?LOCK_xid_cache@@3PAUst_mysql_mutex@@A"));
-  spd_db_att_xid_cache = *((HASH **)
-    GetProcAddress(current_module, "?xid_cache@@3PAUst_hash@@A"));
-#else
-  spd_db_att_LOCK_xid_cache = (pthread_mutex_t *)
-#if MYSQL_VERSION_ID < 50500
-    GetProcAddress(current_module,
-      "?LOCK_xid_cache@@3U_RTL_CRITICAL_SECTION@@A");
-#else
-    GetProcAddress(current_module,
-      "?LOCK_xid_cache@@3Ust_mysql_mutex@@A");
-#endif
-  spd_db_att_xid_cache = (HASH *)
-    GetProcAddress(current_module, "?xid_cache@@3Ust_hash@@A");
-#endif
-#endif
-  spd_charset_utf8_bin = (struct charset_info_st *)
-    GetProcAddress(current_module, "my_charset_utf8_bin");
-  spd_defaults_extra_file = (const char **)
-    GetProcAddress(current_module, "my_defaults_extra_file");
-  spd_defaults_file = (const char **)
-    GetProcAddress(current_module, "my_defaults_file");
-  spd_mysqld_unix_port = (const char **)
-    GetProcAddress(current_module, "?mysqld_unix_port@@3PADA");
-  spd_mysqld_port = (uint *)
-    GetProcAddress(current_module, "?mysqld_port@@3IA");
-  spd_abort_loop = (bool volatile *)
-    GetProcAddress(current_module, "?abort_loop@@3_NC");
-  spd_tz_system = *(Time_zone **)
-#ifdef _WIN64
-    GetProcAddress(current_module, "?my_tz_SYSTEM@@3PEAVTime_zone@@EA");
-#else
-    GetProcAddress(current_module, "?my_tz_SYSTEM@@3PAVTime_zone@@A");
-#endif
-  spd_mysqld_server_started = (int *)
-    GetProcAddress(current_module, "?mysqld_server_started@@3HA");
-  spd_LOCK_server_started = (pthread_mutex_t *)
-    GetProcAddress(current_module,
-      "?LOCK_server_started@@3Ust_mysql_mutex@@A");
-  spd_COND_server_started = (pthread_cond_t *)
-    GetProcAddress(current_module, "?COND_server_started@@3Ust_mysql_cond@@A");
-#else
 #ifndef SPIDER_HAS_NEXT_THREAD_ID
   spd_db_att_thread_id = &thread_id;
 #endif
@@ -7046,7 +7264,7 @@ int spider_db_init(
   spd_db_att_xid_cache = &xid_cache;
 #endif
 #endif
-  spd_charset_utf8_bin = &my_charset_utf8_bin;
+  spd_charset_utf8mb3_bin = &my_charset_utf8mb3_bin;
   spd_defaults_extra_file = &my_defaults_extra_file;
   spd_defaults_file = &my_defaults_file;
   spd_mysqld_unix_port = (const char **) &mysqld_unix_port;
@@ -7056,7 +7274,6 @@ int spider_db_init(
   spd_mysqld_server_started = &mysqld_server_started;
   spd_LOCK_server_started = &LOCK_server_started;
   spd_COND_server_started = &COND_server_started;
-#endif
 
 #ifdef HAVE_PSI_INTERFACE
   init_spider_psi_keys();
@@ -7110,12 +7327,12 @@ int spider_db_init(
 
 #ifdef WITH_PARTITION_STORAGE_ENGINE
 #if MYSQL_VERSION_ID < 50500
-  if (pthread_mutex_init(&spider_pt_share_mutex, MY_MUTEX_INIT_FAST))
+  if (pthread_mutex_init(&spider_wide_share_mutex, MY_MUTEX_INIT_FAST))
 #else
-  if (mysql_mutex_init(spd_key_mutex_pt_share,
-    &spider_pt_share_mutex, MY_MUTEX_INIT_FAST))
+  if (mysql_mutex_init(spd_key_mutex_wide_share,
+    &spider_wide_share_mutex, MY_MUTEX_INIT_FAST))
 #endif
-    goto error_pt_share_mutex_init;
+    goto error_wide_share_mutex_init;
 
 #endif
 #if MYSQL_VERSION_ID < 50500
@@ -7184,7 +7401,7 @@ int spider_db_init(
 #endif
     goto error_mem_calc_mutex_init;
 
-  if (my_hash_init(&spider_open_tables, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_open_tables, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_tbl_get_key, 0, 0))
     goto error_open_tables_hash_init;
 
@@ -7193,7 +7410,7 @@ int spider_db_init(
     spider_open_tables,
     spider_open_tables.array.max_element *
     spider_open_tables.array.size_of_element);
-  if (my_hash_init(&spider_init_error_tables, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_init_error_tables, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_tbl_get_key, 0, 0))
     goto error_init_error_tables_hash_init;
 
@@ -7203,20 +7420,21 @@ int spider_db_init(
     spider_init_error_tables.array.max_element *
     spider_init_error_tables.array.size_of_element);
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  if (my_hash_init(&spider_open_pt_share, spd_charset_utf8_bin, 32, 0, 0,
-                   (my_hash_get_key) spider_pt_share_get_key, 0, 0))
-    goto error_open_pt_share_hash_init;
+  if(
+    my_hash_init(PSI_INSTRUMENT_ME, &spider_open_wide_share, spd_charset_utf8mb3_bin, 32, 0, 0,
+                   (my_hash_get_key) spider_wide_share_get_key, 0, 0)
+  )
+    goto error_open_wide_share_hash_init;
 
-  spider_alloc_calc_mem_init(spider_open_pt_share, 145);
+  spider_alloc_calc_mem_init(spider_open_wide_share, 145);
   spider_alloc_calc_mem(NULL,
-    spider_open_pt_share,
-    spider_open_pt_share.array.max_element *
-    spider_open_pt_share.array.size_of_element);
-#endif
-  if (my_hash_init(&spider_lgtm_tblhnd_share_hash, spd_charset_utf8_bin,
-                   32, 0, 0,
-                   (my_hash_get_key) spider_lgtm_tblhnd_share_hash_get_key,
-                   0, 0))
+    spider_open_wide_share,
+    spider_open_wide_share.array.max_element *
+    spider_open_wide_share.array.size_of_element);
+#endif
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_lgtm_tblhnd_share_hash,
+                   spd_charset_utf8mb3_bin, 32, 0, 0,
+                   (my_hash_get_key) spider_lgtm_tblhnd_share_hash_get_key, 0, 0))
     goto error_lgtm_tblhnd_share_hash_init;
 
   spider_alloc_calc_mem_init(spider_lgtm_tblhnd_share_hash, 245);
@@ -7224,11 +7442,11 @@ int spider_db_init(
     spider_lgtm_tblhnd_share_hash,
     spider_lgtm_tblhnd_share_hash.array.max_element *
     spider_lgtm_tblhnd_share_hash.array.size_of_element);
-  if (my_hash_init(&spider_open_connections, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_open_connections, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_conn_get_key, 0, 0))
     goto error_open_connections_hash_init;
 
-  if (my_hash_init(&spider_ipport_conns, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_ipport_conns, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_ipport_conn_get_key,
                    spider_free_ipport_conn, 0))
       goto error_ipport_conn__hash_init;
@@ -7239,7 +7457,7 @@ int spider_db_init(
     spider_open_connections.array.max_element *
     spider_open_connections.array.size_of_element);
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-  if (my_hash_init(&spider_hs_r_conn_hash, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_hs_r_conn_hash, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_conn_get_key, 0, 0))
     goto error_hs_r_conn_hash_init;
 
@@ -7248,7 +7466,7 @@ int spider_db_init(
     spider_hs_r_conn_hash,
     spider_hs_r_conn_hash.array.max_element *
     spider_hs_r_conn_hash.array.size_of_element);
-  if (my_hash_init(&spider_hs_w_conn_hash, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_hs_w_conn_hash, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_conn_get_key, 0, 0))
     goto error_hs_w_conn_hash_init;
 
@@ -7258,7 +7476,7 @@ int spider_db_init(
     spider_hs_w_conn_hash.array.max_element *
     spider_hs_w_conn_hash.array.size_of_element);
 #endif
-  if (my_hash_init(&spider_allocated_thds, spd_charset_utf8_bin, 32, 0, 0,
+  if (my_hash_init(PSI_INSTRUMENT_ME, &spider_allocated_thds, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_allocated_thds_get_key, 0, 0))
     goto error_allocated_thds_hash_init;
 
@@ -7280,12 +7498,12 @@ int spider_db_init(
 
   if (!(spider_udf_table_mon_mutexes = (pthread_mutex_t *)
     spider_bulk_malloc(NULL, 53, MYF(MY_WME | MY_ZEROFILL),
-      &spider_udf_table_mon_mutexes, sizeof(pthread_mutex_t) *
-        spider_param_udf_table_mon_mutex_count(),
-      &spider_udf_table_mon_conds, sizeof(pthread_cond_t) *
-        spider_param_udf_table_mon_mutex_count(),
-      &spider_udf_table_mon_list_hash, sizeof(HASH) *
-        spider_param_udf_table_mon_mutex_count(),
+      &spider_udf_table_mon_mutexes, (uint) (sizeof(pthread_mutex_t) *
+        spider_param_udf_table_mon_mutex_count()),
+      &spider_udf_table_mon_conds, (uint) (sizeof(pthread_cond_t) *
+        spider_param_udf_table_mon_mutex_count()),
+      &spider_udf_table_mon_list_hash, (uint) (sizeof(HASH) *
+        spider_param_udf_table_mon_mutex_count()),
       NullS))
   )
     goto error_alloc_mon_mutxes;
@@ -7319,8 +7537,8 @@ int spider_db_init(
     roop_count < (int) spider_param_udf_table_mon_mutex_count();
     roop_count++)
   {
-    if (my_hash_init(&spider_udf_table_mon_list_hash[roop_count],
-      spd_charset_utf8_bin, 32, 0, 0,
+    if (my_hash_init(PSI_INSTRUMENT_ME, &spider_udf_table_mon_list_hash[roop_count],
+      spd_charset_utf8mb3_bin, 32, 0, 0,
       (my_hash_get_key) spider_udf_tbl_mon_list_key, 0, 0))
       goto error_init_udf_table_mon_list_hash;
 
@@ -7334,10 +7552,10 @@ int spider_db_init(
 #ifndef WITHOUT_SPIDER_BG_SEARCH
   if (!(spider_table_sts_threads = (SPIDER_THREAD *)
     spider_bulk_malloc(NULL, 256, MYF(MY_WME | MY_ZEROFILL),
-      &spider_table_sts_threads, sizeof(SPIDER_THREAD) *
-        spider_param_table_sts_thread_count(),
-      &spider_table_crd_threads, sizeof(SPIDER_THREAD) *
-        spider_param_table_crd_thread_count(),
+      &spider_table_sts_threads, (uint) (sizeof(SPIDER_THREAD) *
+        spider_param_table_sts_thread_count()),
+      &spider_table_crd_threads, (uint) (sizeof(SPIDER_THREAD) *
+        spider_param_table_crd_thread_count()),
       NullS))
   )
     goto error_alloc_mon_mutxes;
@@ -7480,11 +7698,11 @@ error_open_connections_hash_init:
 error_lgtm_tblhnd_share_hash_init:
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   spider_free_mem_calc(NULL,
-    spider_open_pt_share_id,
-    spider_open_pt_share.array.max_element *
-    spider_open_pt_share.array.size_of_element);
-  my_hash_free(&spider_open_pt_share);
-error_open_pt_share_hash_init:
+    spider_open_wide_share_id,
+    spider_open_wide_share.array.max_element *
+    spider_open_wide_share.array.size_of_element);
+  my_hash_free(&spider_open_wide_share);
+error_open_wide_share_hash_init:
 #endif
   spider_free_mem_calc(NULL,
     spider_init_error_tables_id,
@@ -7517,8 +7735,8 @@ error_conn_mutex_init:
   pthread_mutex_destroy(&spider_lgtm_tblhnd_share_mutex);
 error_lgtm_tblhnd_share_mutex_init:
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-  pthread_mutex_destroy(&spider_pt_share_mutex);
-error_pt_share_mutex_init:
+  pthread_mutex_destroy(&spider_wide_share_mutex);
+error_wide_share_mutex_init:
 #endif
   pthread_mutex_destroy(&spider_init_error_tbl_mutex);
 error_init_error_tbl_mutex_init:
@@ -7684,7 +7902,9 @@ int spider_get_sts(
   int sts_sync_level,
   uint flag
 ) {
-  int get_type __attribute__ ((unused));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  int get_type;
+#endif
   int error_num = 0;
   bool need_to_get = TRUE;
   DBUG_ENTER("spider_get_sts");
@@ -7696,26 +7916,26 @@ int spider_get_sts(
     /* get */
     get_type = 1;
   } else if (
-    !share->partition_share->sts_init
+    !share->wide_share->sts_init
   ) {
-    pthread_mutex_lock(&share->partition_share->sts_mutex);
-    if (!share->partition_share->sts_init)
+    pthread_mutex_lock(&share->wide_share->sts_mutex);
+    if (!share->wide_share->sts_init)
     {
       /* get after mutex_lock */
       get_type = 2;
     } else {
-      pthread_mutex_unlock(&share->partition_share->sts_mutex);
+      pthread_mutex_unlock(&share->wide_share->sts_mutex);
       /* copy */
       get_type = 0;
     }
   } else if (
-    difftime(share->sts_get_time, share->partition_share->sts_get_time) <
+    difftime(share->sts_get_time, share->wide_share->sts_get_time) <
       sts_interval
   ) {
     /* copy */
     get_type = 0;
   } else if (
-    !pthread_mutex_trylock(&share->partition_share->sts_mutex)
+    !pthread_mutex_trylock(&share->wide_share->sts_mutex)
   ) {
     /* get after mutex_trylock */
     get_type = 3;
@@ -7747,7 +7967,7 @@ int spider_get_sts(
   {
 #ifdef WITH_PARTITION_STORAGE_ENGINE
     if (get_type == 0)
-      spider_copy_sts_to_share(share, share->partition_share);
+      spider_copy_sts_to_share(share, share->wide_share);
     else {
 #endif
       error_num = spider_db_show_table_status(spider, link_idx, sts_mode, flag);
@@ -7757,20 +7977,20 @@ int spider_get_sts(
   }
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   if (get_type >= 2)
-    pthread_mutex_unlock(&share->partition_share->sts_mutex);
+    pthread_mutex_unlock(&share->wide_share->sts_mutex);
 #endif
   if (error_num)
   {
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-    SPIDER_PARTITION_HANDLER_SHARE *partition_handler_share =
-      spider->partition_handler_share;
+    SPIDER_PARTITION_HANDLER *partition_handler =
+      spider->partition_handler;
     if (
-      !share->partition_share->sts_init &&
+      !share->wide_share->sts_init &&
       sts_sync >= sts_sync_level &&
       get_type > 1 &&
-      partition_handler_share &&
-      partition_handler_share->handlers &&
-      partition_handler_share->handlers[0] == spider
+      partition_handler &&
+      partition_handler->handlers &&
+      partition_handler->handlers[0] == spider
     ) {
       int roop_count;
       ha_spider *tmp_spider;
@@ -7778,13 +7998,13 @@ int spider_get_sts(
       double tmp_sts_interval;
       int tmp_sts_mode;
       int tmp_sts_sync;
-      THD *thd = spider->trx->thd;
+      THD *thd = spider->wide_handler->trx->thd;
       for (roop_count = 1;
-        roop_count < (int) partition_handler_share->use_count;
+        roop_count < (int) partition_handler->no_parts;
         roop_count++)
       {
         tmp_spider =
-          (ha_spider *) partition_handler_share->handlers[roop_count];
+          (ha_spider *) partition_handler->handlers[roop_count];
         tmp_share = tmp_spider->share;
         tmp_sts_interval = spider_param_sts_interval(thd, share->sts_interval);
         tmp_sts_mode = spider_param_sts_mode(thd, share->sts_mode);
@@ -7792,12 +8012,12 @@ int spider_get_sts(
         spider_get_sts(tmp_share, tmp_spider->search_link_idx,
           tmp_time, tmp_spider, tmp_sts_interval, tmp_sts_mode, tmp_sts_sync,
           1, flag);
-        if (share->partition_share->sts_init)
+        if (share->wide_share->sts_init)
         {
           error_num = 0;
           thd->clear_error();
           get_type = 0;
-          spider_copy_sts_to_share(share, share->partition_share);
+          spider_copy_sts_to_share(share, share->wide_share);
           break;
         }
       }
@@ -7809,9 +8029,9 @@ int spider_get_sts(
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   if (sts_sync >= sts_sync_level && get_type > 0)
   {
-    spider_copy_sts_to_pt_share(share->partition_share, share);
-    share->partition_share->sts_get_time = tmp_time;
-    share->partition_share->sts_init = TRUE;
+    spider_copy_sts_to_wide_share(share->wide_share, share);
+    share->wide_share->sts_get_time = tmp_time;
+    share->wide_share->sts_init = TRUE;
   }
 #endif
   share->sts_get_time = tmp_time;
@@ -7832,7 +8052,9 @@ int spider_get_crd(
 #endif
   int crd_sync_level
 ) {
-  int get_type __attribute__ ((unused));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  int get_type;
+#endif
   int error_num = 0;
   bool need_to_get = TRUE;
   DBUG_ENTER("spider_get_crd");
@@ -7844,26 +8066,26 @@ int spider_get_crd(
     /* get */
     get_type = 1;
   } else if (
-    !share->partition_share->crd_init
+    !share->wide_share->crd_init
   ) {
-    pthread_mutex_lock(&share->partition_share->crd_mutex);
-    if (!share->partition_share->crd_init)
+    pthread_mutex_lock(&share->wide_share->crd_mutex);
+    if (!share->wide_share->crd_init)
     {
       /* get after mutex_lock */
       get_type = 2;
     } else {
-      pthread_mutex_unlock(&share->partition_share->crd_mutex);
+      pthread_mutex_unlock(&share->wide_share->crd_mutex);
       /* copy */
       get_type = 0;
     }
   } else if (
-    difftime(share->crd_get_time, share->partition_share->crd_get_time) <
+    difftime(share->crd_get_time, share->wide_share->crd_get_time) <
       crd_interval
   ) {
     /* copy */
     get_type = 0;
   } else if (
-    !pthread_mutex_trylock(&share->partition_share->crd_mutex)
+    !pthread_mutex_trylock(&share->wide_share->crd_mutex)
   ) {
     /* get after mutex_trylock */
     get_type = 3;
@@ -7895,7 +8117,7 @@ int spider_get_crd(
   {
 #ifdef WITH_PARTITION_STORAGE_ENGINE
     if (get_type == 0)
-      spider_copy_crd_to_share(share, share->partition_share,
+      spider_copy_crd_to_share(share, share->wide_share,
         table->s->fields);
     else {
 #endif
@@ -7906,20 +8128,20 @@ int spider_get_crd(
   }
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   if (get_type >= 2)
-    pthread_mutex_unlock(&share->partition_share->crd_mutex);
+    pthread_mutex_unlock(&share->wide_share->crd_mutex);
 #endif
   if (error_num)
   {
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-    SPIDER_PARTITION_HANDLER_SHARE *partition_handler_share =
-      spider->partition_handler_share;
+    SPIDER_PARTITION_HANDLER *partition_handler =
+      spider->partition_handler;
     if (
-      !share->partition_share->crd_init &&
+      !share->wide_share->crd_init &&
       crd_sync >= crd_sync_level &&
       get_type > 1 &&
-      partition_handler_share &&
-      partition_handler_share->handlers &&
-      partition_handler_share->handlers[0] == spider
+      partition_handler &&
+      partition_handler->handlers &&
+      partition_handler->handlers[0] == spider
     ) {
       int roop_count;
       ha_spider *tmp_spider;
@@ -7927,13 +8149,13 @@ int spider_get_crd(
       double tmp_crd_interval;
       int tmp_crd_mode;
       int tmp_crd_sync;
-      THD *thd = spider->trx->thd;
+      THD *thd = spider->wide_handler->trx->thd;
       for (roop_count = 1;
-        roop_count < (int) partition_handler_share->use_count;
+        roop_count < (int) partition_handler->no_parts;
         roop_count++)
       {
         tmp_spider =
-          (ha_spider *) partition_handler_share->handlers[roop_count];
+          (ha_spider *) partition_handler->handlers[roop_count];
         tmp_share = tmp_spider->share;
         tmp_crd_interval = spider_param_crd_interval(thd, share->crd_interval);
         tmp_crd_mode = spider_param_crd_mode(thd, share->crd_mode);
@@ -7941,12 +8163,12 @@ int spider_get_crd(
         spider_get_crd(tmp_share, tmp_spider->search_link_idx,
           tmp_time, tmp_spider, table, tmp_crd_interval, tmp_crd_mode,
           tmp_crd_sync, 1);
-        if (share->partition_share->crd_init)
+        if (share->wide_share->crd_init)
         {
           error_num = 0;
           thd->clear_error();
           get_type = 0;
-          spider_copy_crd_to_share(share, share->partition_share,
+          spider_copy_crd_to_share(share, share->wide_share,
             table->s->fields);
           break;
         }
@@ -7959,10 +8181,10 @@ int spider_get_crd(
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   if (crd_sync >= crd_sync_level && get_type > 0)
   {
-    spider_copy_crd_to_pt_share(share->partition_share, share,
+    spider_copy_crd_to_wide_share(share->wide_share, share,
       table->s->fields);
-    share->partition_share->crd_get_time = tmp_time;
-    share->partition_share->crd_init = TRUE;
+    share->wide_share->crd_get_time = tmp_time;
+    share->wide_share->crd_init = TRUE;
   }
 #endif
   share->crd_get_time = tmp_time;
@@ -7975,14 +8197,14 @@ void spider_set_result_list_param(
 ) {
   SPIDER_RESULT_LIST *result_list = &spider->result_list;
   SPIDER_SHARE *share = spider->share;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   DBUG_ENTER("spider_set_result_list_param");
   result_list->internal_offset =
     spider_param_internal_offset(thd, share->internal_offset);
   result_list->internal_limit =
 #ifdef INFO_KIND_FORCE_LIMIT_BEGIN
-    spider->info_limit < 9223372036854775807LL ?
-    spider->info_limit :
+    spider->wide_handler->info_limit < 9223372036854775807LL ?
+    spider->wide_handler->info_limit :
 #endif
     spider_param_internal_limit(thd, share->internal_limit);
   result_list->split_read = spider_split_read_param(spider);
@@ -8033,8 +8255,8 @@ SPIDER_INIT_ERROR_TABLE *spider_get_init_error_table(
     }
     if (!(spider_init_error_table = (SPIDER_INIT_ERROR_TABLE *)
       spider_bulk_malloc(spider_current_trx, 54, MYF(MY_WME | MY_ZEROFILL),
-        &spider_init_error_table, sizeof(*spider_init_error_table),
-        &tmp_name, share->table_name_length + 1,
+        &spider_init_error_table, (uint) (sizeof(*spider_init_error_table)),
+        &tmp_name, (uint) (share->table_name_length + 1),
         NullS))
     ) {
       pthread_mutex_unlock(&spider_init_error_tbl_mutex);
@@ -8261,12 +8483,13 @@ void spider_set_tmp_share_pointer(
   tmp_share->tgt_ssl_keys = &tmp_connect_info[12];
   tmp_share->tgt_default_files = &tmp_connect_info[13];
   tmp_share->tgt_default_groups = &tmp_connect_info[14];
-  tmp_share->tgt_pk_names = &tmp_connect_info[15];
-  tmp_share->tgt_sequence_names = &tmp_connect_info[16];
-  tmp_share->static_link_ids = &tmp_connect_info[17];
+  tmp_share->tgt_dsns = &tmp_connect_info[15];
+  tmp_share->tgt_pk_names = &tmp_connect_info[16];
+  tmp_share->tgt_sequence_names = &tmp_connect_info[17];
+  tmp_share->static_link_ids = &tmp_connect_info[18];
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-  tmp_share->hs_read_socks = &tmp_connect_info[18];
-  tmp_share->hs_write_socks = &tmp_connect_info[19];
+  tmp_share->hs_read_socks = &tmp_connect_info[19];
+  tmp_share->hs_write_socks = &tmp_connect_info[20];
 #endif
   tmp_share->tgt_ports = &tmp_long[0];
   tmp_share->tgt_ssl_vscs = &tmp_long[1];
@@ -8294,6 +8517,7 @@ void spider_set_tmp_share_pointer(
   tmp_long[15] = -1;
   tmp_share->access_balances = &tmp_long[17];
   tmp_share->bka_table_name_types = &tmp_long[18];
+  tmp_share->strict_group_bys = &tmp_long[19];
   tmp_share->monitoring_limit = &tmp_longlong[0];
   tmp_share->monitoring_sid = &tmp_longlong[1];
 #ifndef WITHOUT_SPIDER_BG_SEARCH
@@ -8314,12 +8538,13 @@ void spider_set_tmp_share_pointer(
   tmp_share->tgt_ssl_keys_lengths = &tmp_connect_info_length[12];
   tmp_share->tgt_default_files_lengths = &tmp_connect_info_length[13];
   tmp_share->tgt_default_groups_lengths = &tmp_connect_info_length[14];
-  tmp_share->tgt_pk_names_lengths = &tmp_connect_info_length[15];
-  tmp_share->tgt_sequence_names_lengths = &tmp_connect_info_length[16];
-  tmp_share->static_link_ids_lengths = &tmp_connect_info_length[17];
+  tmp_share->tgt_dsns_lengths = &tmp_connect_info_length[15];
+  tmp_share->tgt_pk_names_lengths = &tmp_connect_info_length[16];
+  tmp_share->tgt_sequence_names_lengths = &tmp_connect_info_length[17];
+  tmp_share->static_link_ids_lengths = &tmp_connect_info_length[18];
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
-  tmp_share->hs_read_socks_lengths = &tmp_connect_info_length[18];
-  tmp_share->hs_write_socks_lengths = &tmp_connect_info_length[19];
+  tmp_share->hs_read_socks_lengths = &tmp_connect_info_length[19];
+  tmp_share->hs_write_socks_lengths = &tmp_connect_info_length[20];
 #endif
   tmp_share->server_names_length = 1;
   tmp_share->tgt_table_names_length = 1;
@@ -8336,6 +8561,7 @@ void spider_set_tmp_share_pointer(
   tmp_share->tgt_ssl_keys_length = 1;
   tmp_share->tgt_default_files_length = 1;
   tmp_share->tgt_default_groups_length = 1;
+  tmp_share->tgt_dsns_length = 1;
   tmp_share->tgt_pk_names_length = 1;
   tmp_share->tgt_sequence_names_length = 1;
   tmp_share->static_link_ids_length = 1;
@@ -8369,6 +8595,7 @@ void spider_set_tmp_share_pointer(
   tmp_share->net_write_timeouts_length = 1;
   tmp_share->access_balances_length = 1;
   tmp_share->bka_table_name_types_length = 1;
+  tmp_share->strict_group_bys_length = 1;
 
 #ifndef WITHOUT_SPIDER_BG_SEARCH
   tmp_share->monitoring_bg_flag[0] = -1;
@@ -8462,7 +8689,7 @@ void spider_free_tmp_dbton_handler(
 TABLE_LIST *spider_get_parent_table_list(
   ha_spider *spider
 ) {
-  TABLE *table = spider->get_top_table();
+  TABLE *table = spider->get_table();
   DBUG_ENTER("spider_get_parent_table_list");
   DBUG_RETURN(table->pos_in_table_list);
 }
@@ -8528,7 +8755,7 @@ longlong spider_split_read_param(
 ) {
   SPIDER_SHARE *share = spider->share;
   SPIDER_RESULT_LIST *result_list = &spider->result_list;
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   st_select_lex *select_lex;
   longlong select_limit;
   longlong offset_limit;
@@ -8537,10 +8764,11 @@ longlong spider_split_read_param(
   DBUG_ENTER("spider_split_read_param");
   result_list->set_split_read_count = 1;
 #ifdef INFO_KIND_FORCE_LIMIT_BEGIN
-  if (spider->info_limit < 9223372036854775807LL)
+  if (spider->wide_handler->info_limit < 9223372036854775807LL)
   {
-    DBUG_PRINT("info",("spider info_limit=%lld", spider->info_limit));
-    longlong info_limit = spider->info_limit;
+    DBUG_PRINT("info",("spider info_limit=%lld",
+      spider->wide_handler->info_limit));
+    longlong info_limit = spider->wide_handler->info_limit;
     result_list->split_read_base = info_limit;
     result_list->semi_split_read = 0;
     result_list->first_read = info_limit;
@@ -8560,7 +8788,8 @@ longlong spider_split_read_param(
   {
     int bulk_update_mode = spider_param_bulk_update_mode(thd,
       share->bulk_update_mode);
-    DBUG_PRINT("info",("spider sql_command=%u", spider->sql_command));
+    DBUG_PRINT("info",("spider sql_command=%u",
+      spider->wide_handler->sql_command));
     DBUG_PRINT("info",("spider bulk_update_mode=%d", bulk_update_mode));
     DBUG_PRINT("info",("spider support_bulk_update_sql=%s",
       spider->support_bulk_update_sql() ? "TRUE" : "FALSE"));
@@ -8568,32 +8797,32 @@ longlong spider_split_read_param(
     bool inserting =
       (
 #ifdef HS_HAS_SQLCOM
-        spider->sql_command == SQLCOM_HS_INSERT ||
+        spider->wide_handler->sql_command == SQLCOM_HS_INSERT ||
 #endif
-        spider->sql_command == SQLCOM_INSERT ||
-        spider->sql_command == SQLCOM_INSERT_SELECT
+        spider->wide_handler->sql_command == SQLCOM_INSERT ||
+        spider->wide_handler->sql_command == SQLCOM_INSERT_SELECT
       );
 #endif
     bool updating =
       (
 #ifdef HS_HAS_SQLCOM
-        spider->sql_command == SQLCOM_HS_UPDATE ||
+        spider->wide_handler->sql_command == SQLCOM_HS_UPDATE ||
 #endif
-        spider->sql_command == SQLCOM_UPDATE ||
-        spider->sql_command == SQLCOM_UPDATE_MULTI
+        spider->wide_handler->sql_command == SQLCOM_UPDATE ||
+        spider->wide_handler->sql_command == SQLCOM_UPDATE_MULTI
       );
     bool deleting =
       (
 #ifdef HS_HAS_SQLCOM
-        spider->sql_command == SQLCOM_HS_DELETE ||
+        spider->wide_handler->sql_command == SQLCOM_HS_DELETE ||
 #endif
-        spider->sql_command == SQLCOM_DELETE ||
-        spider->sql_command == SQLCOM_DELETE_MULTI
+        spider->wide_handler->sql_command == SQLCOM_DELETE ||
+        spider->wide_handler->sql_command == SQLCOM_DELETE_MULTI
       );
     bool replacing =
       (
-        spider->sql_command == SQLCOM_REPLACE ||
-        spider->sql_command == SQLCOM_REPLACE_SELECT
+        spider->wide_handler->sql_command == SQLCOM_REPLACE ||
+        spider->wide_handler->sql_command == SQLCOM_REPLACE_SELECT
       );
     DBUG_PRINT("info",("spider updating=%s", updating ? "TRUE" : "FALSE"));
     DBUG_PRINT("info",("spider deleting=%s", deleting ? "TRUE" : "FALSE"));
@@ -8779,20 +9008,20 @@ void spider_next_split_read_param(
 bool spider_check_direct_order_limit(
   ha_spider *spider
 ) {
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
   SPIDER_SHARE *share = spider->share;
   st_select_lex *select_lex;
   longlong select_limit;
   longlong offset_limit;
   DBUG_ENTER("spider_check_direct_order_limit");
-  if (spider_check_index_merge(spider->get_top_table(),
+  if (spider_check_index_merge(spider->get_table(),
     spider_get_select_lex(spider)))
   {
     DBUG_PRINT("info",("spider set use_index_merge"));
     spider->use_index_merge = TRUE;
   }
   DBUG_PRINT("info",("spider SQLCOM_HA_READ=%s",
-    (spider->sql_command == SQLCOM_HA_READ) ? "TRUE" : "FALSE"));
+    (spider->wide_handler->sql_command == SQLCOM_HA_READ) ? "TRUE" : "FALSE"));
   DBUG_PRINT("info",("spider sql_kinds with SPIDER_SQL_KIND_HANDLER=%s",
     (spider->sql_kinds & SPIDER_SQL_KIND_HANDLER) ? "TRUE" : "FALSE"));
   DBUG_PRINT("info",("spider use_index_merge=%s",
@@ -8804,7 +9033,7 @@ bool spider_check_direct_order_limit(
     spider->is_bulk_access_clone ? "TRUE" : "FALSE"));
 #endif
   if (
-    spider->sql_command != SQLCOM_HA_READ &&
+    spider->wide_handler->sql_command != SQLCOM_HA_READ &&
     !spider->use_index_merge &&
 #ifdef HA_CAN_BULK_ACCESS
     (!spider->is_clone || spider->is_bulk_access_clone)
@@ -8962,7 +9191,7 @@ bool spider_check_direct_order_limit(
       DBUG_PRINT("info",("spider TRUE"));
       spider->result_list.internal_limit = select_limit + offset_limit;
       spider->result_list.split_read = select_limit + offset_limit;
-      spider->trx->direct_order_limit_count++;
+      spider->wide_handler->trx->direct_order_limit_count++;
       DBUG_RETURN(TRUE);
     }
   }
@@ -9041,22 +9270,9 @@ Field *spider_field_exchange(
 #endif
   DBUG_PRINT("info",("spider in field=%p", field));
   DBUG_PRINT("info",("spider in field->table=%p", field->table));
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  if (handler->set_top_table_fields)
-  {
-    DBUG_PRINT("info",("spider top_table=%p", handler->top_table));
-    if (field->table != handler->top_table)
-      DBUG_RETURN(NULL);
-    if (!(field = handler->top_table_field[field->field_index]))
-      DBUG_RETURN(NULL);
-  } else {
-#endif
     DBUG_PRINT("info",("spider table=%p", handler->get_table()));
     if (field->table != handler->get_table())
       DBUG_RETURN(NULL);
-#ifdef HANDLER_HAS_TOP_TABLE_FIELDS
-  }
-#endif
   DBUG_PRINT("info",("spider out field=%p", field));
   DBUG_RETURN(field);
 }
@@ -9066,7 +9282,7 @@ int spider_set_direct_limit_offset(
   ha_spider *spider
 ) {
 #ifndef SPIDER_ENGINE_CONDITION_PUSHDOWN_IS_ALWAYS_ON
-  THD *thd = spider->trx->thd;
+  THD *thd = spider->wide_handler->trx->thd;
 #endif
   st_select_lex *select_lex;
   longlong select_limit;
@@ -9078,10 +9294,11 @@ int spider_set_direct_limit_offset(
     DBUG_RETURN(TRUE);
 
   if (
-    spider->pt_handler_share_creator &&
-    spider->pt_handler_share_creator != spider
+    spider->partition_handler &&
+    !spider->wide_handler_owner
   ) {
-    if (spider->pt_handler_share_creator->result_list.direct_limit_offset == TRUE)
+    if (spider->partition_handler->owner->
+      result_list.direct_limit_offset == TRUE)
     {
       spider->result_list.direct_limit_offset = TRUE;
       DBUG_RETURN(TRUE);
@@ -9091,7 +9308,7 @@ int spider_set_direct_limit_offset(
   }
 
   if (
-    spider->sql_command != SQLCOM_SELECT ||
+    spider->wide_handler->sql_command != SQLCOM_SELECT ||
 #ifdef HANDLER_HAS_DIRECT_AGGREGATE
     spider->result_list.direct_aggregate ||
 #endif
@@ -9134,7 +9351,8 @@ int spider_set_direct_limit_offset(
       OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN) ||
 #endif
 #endif
-    spider->condition   // conditions is null may be no where condition in rand_init
+    // conditions is null may be no where condition in rand_init
+    spider->wide_handler->condition
   )
     DBUG_RETURN(FALSE);
 
@@ -9332,9 +9550,11 @@ int spider_discover_table_structure(
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   partition_info *part_info = thd->work_part_info;
 #endif
-  Open_tables_backup open_tables_backup;
+  SPIDER_Open_tables_backup open_tables_backup;
   TABLE *table_tables;
-  uint str_len __attribute__ ((unused));
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+  uint str_len;
+#endif
   char buf[MAX_FIELD_WIDTH];
   spider_string str(buf, sizeof(buf), system_charset_info);
   DBUG_ENTER("spider_discover_table_structure");
@@ -9689,6 +9909,7 @@ int spider_create_spider_object_for_share(
   char **hs_w_conn_keys;
 #endif
   spider_db_handler **dbton_hdl;
+  SPIDER_WIDE_HANDLER *wide_handler;
   DBUG_ENTER("spider_create_spider_object_for_share");
   DBUG_PRINT("info",("spider trx=%p", trx));
   DBUG_PRINT("info",("spider share=%p", share));
@@ -9710,25 +9931,27 @@ int spider_create_spider_object_for_share(
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
   if (!(need_mons = (int *)
     spider_bulk_malloc(spider_current_trx, 255, MYF(MY_WME | MY_ZEROFILL),
-      &need_mons, (sizeof(int) * share->link_count),
-      &conns, (sizeof(SPIDER_CONN *) * share->link_count),
-      &conn_link_idx, (sizeof(uint) * share->link_count),
-      &conn_can_fo, (sizeof(uchar) * share->link_bitmap_size),
-      &conn_keys, (sizeof(char *) * share->link_count),
-      &hs_r_conn_keys, (sizeof(char *) * share->link_count),
-      &hs_w_conn_keys, (sizeof(char *) * share->link_count),
-      &dbton_hdl, (sizeof(spider_db_handler *) * SPIDER_DBTON_SIZE),
+      &need_mons, (uint) (sizeof(int) * share->link_count),
+      &conns, (uint) (sizeof(SPIDER_CONN *) * share->link_count),
+      &conn_link_idx, (uint) (sizeof(uint) * share->link_count),
+      &conn_can_fo, (uint) (sizeof(uchar) * share->link_bitmap_size),
+      &conn_keys, (uint) (sizeof(char *) * share->link_count),
+      &hs_r_conn_keys, (uint) (sizeof(char *) * share->link_count),
+      &hs_w_conn_keys, (uint) (sizeof(char *) * share->link_count),
+      &dbton_hdl, (uint) (sizeof(spider_db_handler *) * SPIDER_DBTON_SIZE),
+      &wide_handler, (uint) sizeof(SPIDER_WIDE_HANDLER),
       NullS))
   )
 #else
   if (!(need_mons = (int *)
     spider_bulk_malloc(spider_current_trx, 255, MYF(MY_WME | MY_ZEROFILL),
-      &need_mons, (sizeof(int) * share->link_count),
-      &conns, (sizeof(SPIDER_CONN *) * share->link_count),
-      &conn_link_idx, (sizeof(uint) * share->link_count),
-      &conn_can_fo, (sizeof(uchar) * share->link_bitmap_size),
-      &conn_keys, (sizeof(char *) * share->link_count),
-      &dbton_hdl, (sizeof(spider_db_handler *) * SPIDER_DBTON_SIZE),
+      &need_mons, (uint) (sizeof(int) * share->link_count),
+      &conns, (uint) (sizeof(SPIDER_CONN *) * share->link_count),
+      &conn_link_idx, (uint) (sizeof(uint) * share->link_count),
+      &conn_can_fo, (uint) (sizeof(uchar) * share->link_bitmap_size),
+      &conn_keys, (uint) (sizeof(char *) * share->link_count),
+      &dbton_hdl, (uint) (sizeof(spider_db_handler *) * SPIDER_DBTON_SIZE),
+      &wide_handler, (uint) sizeof(SPIDER_WIDE_HANDLER),
       NullS))
   )
 #endif
@@ -9737,7 +9960,8 @@ int spider_create_spider_object_for_share(
     goto error_need_mons_alloc;
   }
   DBUG_PRINT("info",("spider need_mons=%p", need_mons));
-  (*spider)->trx = trx;
+  (*spider)->wide_handler = wide_handler;
+  wide_handler->trx = trx;
   (*spider)->change_table_ptr(&share->table, share->table_share);
   (*spider)->share = share;
   (*spider)->conns = conns;
@@ -10025,7 +10249,7 @@ void *spider_table_bg_sts_action(
     thread->killed = FALSE;
     pthread_mutex_unlock(&thread->mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     DBUG_RETURN(NULL);
@@ -10046,13 +10270,22 @@ void *spider_table_bg_sts_action(
       thd->mysys_var->current_mutex = spd_LOCK_server_started;
       if (!(*spd_mysqld_server_started) && !thd->killed)
       {
-        pthread_cond_wait(spd_COND_server_started, spd_LOCK_server_started);
+        do
+        {
+          struct timespec abstime;
+          set_timespec_nsec(abstime, 1000);
+          error_num = pthread_cond_timedwait(spd_COND_server_started,
+            spd_LOCK_server_started, &abstime);
+        } while (
+          (error_num == ETIMEDOUT || error_num == ETIME) &&
+          !(*spd_mysqld_server_started) && !thd->killed && !thread->killed
+        );
       }
       pthread_mutex_unlock(spd_LOCK_server_started);
       thd->mysys_var->current_cond = &thread->cond;
       thd->mysys_var->current_mutex = &thread->mutex;
     }
-    while (spider_init_queries[i].length && !thd->killed)
+    while (spider_init_queries[i].length && !thd->killed && !thread->killed)
     {
       dispatch_command(COM_QUERY, thd, spider_init_queries[i].str,
         (uint) spider_init_queries[i].length, FALSE, FALSE);
@@ -10092,7 +10325,7 @@ void *spider_table_bg_sts_action(
       pthread_cond_signal(&thread->sync_cond);
       pthread_mutex_unlock(&thread->mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-      my_pthread_setspecific_ptr(THR_THD, NULL);
+      set_current_thd(nullptr);
 #endif
       my_thread_end();
       DBUG_RETURN(NULL);
@@ -10115,7 +10348,7 @@ void *spider_table_bg_sts_action(
     conns = spider->conns;
     if (spider->search_link_idx < 0)
     {
-      spider->trx = trx;
+      spider->wide_handler->trx = trx;
       spider_trx_set_link_idx_for_all(spider);
       spider->search_link_idx = spider_conn_first_link_idx(thd,
         share->link_statuses, share->access_balances, spider->conn_link_idx,
@@ -10224,7 +10457,7 @@ void *spider_table_bg_crd_action(
     thread->killed = FALSE;
     pthread_mutex_unlock(&thread->mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-    my_pthread_setspecific_ptr(THR_THD, NULL);
+    set_current_thd(nullptr);
 #endif
     my_thread_end();
     DBUG_RETURN(NULL);
@@ -10244,7 +10477,7 @@ void *spider_table_bg_crd_action(
       pthread_cond_signal(&thread->sync_cond);
       pthread_mutex_unlock(&thread->mutex);
 #if !defined(MYSQL_DYNAMIC_PLUGIN) || !defined(_WIN32)
-      my_pthread_setspecific_ptr(THR_THD, NULL);
+      set_current_thd(nullptr);
 #endif
       my_thread_end();
       DBUG_RETURN(NULL);
@@ -10268,7 +10501,7 @@ void *spider_table_bg_crd_action(
     conns = spider->conns;
     if (spider->search_link_idx < 0)
     {
-      spider->trx = trx;
+      spider->wide_handler->trx = trx;
       spider_trx_set_link_idx_for_all(spider);
       spider->search_link_idx = spider_conn_first_link_idx(thd,
         share->link_statuses, share->access_balances, spider->conn_link_idx,
@@ -10500,3 +10733,25 @@ void spider_table_remove_share_from_crd_thread(
   DBUG_VOID_RETURN;
 }
 #endif
+
+uchar *spider_duplicate_char(  /* copy src to dst, doubling each esc byte */
+  uchar *dst,   /* output; unchecked, up to 2 * src_lgt bytes are written */
+  uchar esc,    /* byte value that is written twice when seen in src */
+  uchar *src,   /* input bytes; only read by this function */
+  uint src_lgt  /* number of bytes consumed from src */
+) {
+  uchar *ed = src + src_lgt;  /* one past the last input byte */
+  DBUG_ENTER("spider_duplicate_char");
+  while (src < ed)
+  {
+    *dst = *src;
+    if (*src == esc)
+    {
+      ++dst;  /* move to the slot that receives the second copy of esc */
+      *dst = esc;
+    }
+    ++dst;
+    ++src;
+  }
+  DBUG_RETURN(dst);  /* one past the last byte written; no terminator added */
+}
diff --git a/storage/spider/spd_table.h b/storage/spider/spd_table.h
index 063f459ae8d..dc351e73c61 100644
--- a/storage/spider/spd_table.h
+++ b/storage/spider/spd_table.h
@@ -1,5 +1,5 @@
 /* Copyright (C) 2008-2019 Kentoku Shiba
-   Copyright (C) 2019 MariaDB corp
+   Copyright (C) 2019-2022 MariaDB corp
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -313,19 +313,11 @@ uchar *spider_tbl_get_key(
   my_bool not_used __attribute__ ((unused))
 );
 
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-uchar *spider_pt_share_get_key(
-  SPIDER_PARTITION_SHARE *share,
-  size_t *length,
-  my_bool not_used __attribute__ ((unused))
-);
-
-uchar *spider_pt_handler_share_get_key(
-  SPIDER_PARTITION_HANDLER_SHARE *share,
+uchar *spider_wide_share_get_key(
+  SPIDER_WIDE_SHARE *share,
   size_t *length,
   my_bool not_used __attribute__ ((unused))
 );
-#endif
 
 uchar *spider_link_get_key(
   SPIDER_LINK_FOR_HASH *link_for_hash,
@@ -509,39 +501,37 @@ void spider_update_link_status_for_share(
   long link_status
 );
 
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-SPIDER_PARTITION_SHARE *spider_get_pt_share(
+SPIDER_WIDE_SHARE *spider_get_wide_share(
   SPIDER_SHARE *share,
   TABLE_SHARE *table_share,
   int *error_num
 );
 
-int spider_free_pt_share(
-  SPIDER_PARTITION_SHARE *partition_share
+int spider_free_wide_share(
+  SPIDER_WIDE_SHARE *wide_share
 );
 
-void spider_copy_sts_to_pt_share(
-  SPIDER_PARTITION_SHARE *partition_share,
+void spider_copy_sts_to_wide_share(
+  SPIDER_WIDE_SHARE *wide_share,
   SPIDER_SHARE *share
 );
 
 void spider_copy_sts_to_share(
   SPIDER_SHARE *share,
-  SPIDER_PARTITION_SHARE *partition_share
+  SPIDER_WIDE_SHARE *wide_share
 );
 
-void spider_copy_crd_to_pt_share(
-  SPIDER_PARTITION_SHARE *partition_share,
+void spider_copy_crd_to_wide_share(
+  SPIDER_WIDE_SHARE *wide_share,
   SPIDER_SHARE *share,
   int fields
 );
 
 void spider_copy_crd_to_share(
   SPIDER_SHARE *share,
-  SPIDER_PARTITION_SHARE *partition_share,
+  SPIDER_WIDE_SHARE *wide_share,
   int fields
 );
-#endif
 
 int spider_open_all_tables(
   SPIDER_TRX *trx,
@@ -741,8 +731,8 @@ Field *spider_field_exchange(
 #endif
 
 int spider_set_direct_limit_offset(
-                                   ha_spider*		spider
-                                   );
+  ha_spider *spider
+);
 
 bool spider_check_index_merge(
   TABLE *table,
@@ -829,3 +819,9 @@ void spider_table_remove_share_from_crd_thread(
   SPIDER_SHARE *share
 );
 #endif
+uchar *spider_duplicate_char(  /* copy src to dst, doubling each esc byte */
+  uchar *dst,   /* output buffer; may receive up to 2 * src_lgt bytes */
+  uchar esc,    /* byte value to double */
+  uchar *src,   /* input bytes */
+  uint src_lgt  /* number of input bytes */
+);              /* returns the position in dst just past the last byte written */
diff --git a/storage/spider/spd_trx.cc b/storage/spider/spd_trx.cc
index 4f8296f1d01..80658012506 100644
--- a/storage/spider/spd_trx.cc
+++ b/storage/spider/spd_trx.cc
@@ -49,7 +49,7 @@ extern uint *spd_db_att_xid_cache_split_num;
 extern pthread_mutex_t *spd_db_att_LOCK_xid_cache;
 extern HASH *spd_db_att_xid_cache;
 #endif
-extern struct charset_info_st *spd_charset_utf8_bin;
+extern struct charset_info_st *spd_charset_utf8mb3_bin;
 
 extern handlerton *spider_hton_ptr;
 extern SPIDER_DBTON spider_dbton[SPIDER_DBTON_SIZE];
@@ -247,6 +247,7 @@ int spider_trx_another_lock_tables(
   SPIDER_CONN *conn;
   ha_spider tmp_spider;
   SPIDER_SHARE tmp_share;
+  SPIDER_WIDE_HANDLER tmp_wide_handler;
   char sql_buf[MAX_FIELD_WIDTH];
   spider_string sql_str(sql_buf, sizeof(sql_buf), system_charset_info);
   DBUG_ENTER("spider_trx_another_lock_tables");
@@ -255,13 +256,11 @@ int spider_trx_another_lock_tables(
   sql_str.length(0);
   memset((void*)&tmp_spider, 0, sizeof(ha_spider));
   memset((void*)&tmp_share, 0, sizeof(SPIDER_SHARE));
+  memset((void*)&tmp_wide_handler, 0, sizeof(SPIDER_WIDE_HANDLER));
   tmp_spider.share = &tmp_share;
-  tmp_spider.trx = trx;
+  tmp_spider.wide_handler = &tmp_wide_handler;
+  tmp_wide_handler.trx = trx;
   tmp_share.access_charset = system_charset_info;
-/*
-  if ((error_num = spider_db_append_set_names(&tmp_share)))
-    DBUG_RETURN(error_num);
-*/
   tmp_spider.conns = &conn;
   tmp_spider.result_list.sqls = &sql_str;
   tmp_spider.need_mons = &need_mon;
@@ -273,17 +272,11 @@ int spider_trx_another_lock_tables(
       SPIDER_CONN_RESTORE_DASTATUS_AND_RESET_ERROR_NUM;
       if (error_num)
       {
-/*
-        spider_db_free_set_names(&tmp_share);
-*/
         DBUG_RETURN(error_num);
       }
     }
     roop_count++;
   }
-/*
-  spider_db_free_set_names(&tmp_share);
-*/
   DBUG_RETURN(0);
 }
 
@@ -387,10 +380,13 @@ int spider_trx_all_start_trx(
   THD *thd = trx->thd;
   SPIDER_CONN *conn;
   ha_spider tmp_spider;
+  SPIDER_WIDE_HANDLER tmp_wide_handler;
   DBUG_ENTER("spider_trx_all_start_trx");
   SPIDER_BACKUP_DASTATUS;
   memset((void*)&tmp_spider, 0, sizeof(ha_spider));
-  tmp_spider.trx = trx;
+  memset(&tmp_wide_handler, 0, sizeof(SPIDER_WIDE_HANDLER));
+  tmp_spider.wide_handler = &tmp_wide_handler;
+  tmp_wide_handler.trx = trx;
   tmp_spider.need_mons = &need_mon;
   while ((conn = (SPIDER_CONN*) my_hash_element(&trx->trx_conn_hash,
     roop_count)))
@@ -398,7 +394,8 @@ int spider_trx_all_start_trx(
     if (
       (spider_param_sync_trx_isolation(trx->thd) &&
         (error_num = spider_check_and_set_trx_isolation(conn, &need_mon))) ||
-      (error_num = spider_internal_start_trx(&tmp_spider, conn, 0))
+      (error_num = spider_internal_start_trx_for_connection(&tmp_spider,
+        conn, 0))
     ) {
       SPIDER_CONN_RESTORE_DASTATUS_AND_RESET_ERROR_NUM;
       if (error_num)
@@ -418,6 +415,7 @@ int spider_trx_all_flush_logs(
   SPIDER_CONN *conn;
   ha_spider tmp_spider;
   SPIDER_SHARE tmp_share;
+  SPIDER_WIDE_HANDLER tmp_wide_handler;
   long tmp_link_statuses = SPIDER_LINK_STATUS_OK;
   uint conn_link_idx = 0;
   long net_read_timeout = 600;
@@ -425,6 +423,7 @@ int spider_trx_all_flush_logs(
   DBUG_ENTER("spider_trx_all_flush_logs");
   SPIDER_BACKUP_DASTATUS;
   memset((void*)&tmp_spider, 0, sizeof(ha_spider));
+  memset(&tmp_wide_handler, 0, sizeof(SPIDER_WIDE_HANDLER));
   tmp_share.link_count = 1;
   tmp_share.all_link_count = 1;
   tmp_share.link_statuses = &tmp_link_statuses;
@@ -437,7 +436,8 @@ int spider_trx_all_flush_logs(
   tmp_spider.conns = &conn;
   tmp_spider.need_mons = &need_mon;
   tmp_spider.conn_link_idx = &conn_link_idx;
-  tmp_spider.trx = trx;
+  tmp_spider.wide_handler = &tmp_wide_handler;
+  tmp_wide_handler.trx = trx;
   while ((conn = (SPIDER_CONN*) my_hash_element(&trx->trx_conn_hash,
     roop_count)))
   {
@@ -505,6 +505,7 @@ int spider_create_trx_alter_table(
   char **tmp_tgt_ssl_keys;
   char **tmp_tgt_default_files;
   char **tmp_tgt_default_groups;
+  char **tmp_tgt_dsns;
   char **tmp_static_link_ids;
   uint *tmp_server_names_lengths;
   uint *tmp_tgt_table_names_lengths;
@@ -521,6 +522,7 @@ int spider_create_trx_alter_table(
   uint *tmp_tgt_ssl_keys_lengths;
   uint *tmp_tgt_default_files_lengths;
   uint *tmp_tgt_default_groups_lengths;
+  uint *tmp_tgt_dsns_lengths;
   uint *tmp_static_link_ids_lengths;
   long *tmp_tgt_ports;
   long *tmp_tgt_ssl_vscs;
@@ -541,6 +543,7 @@ int spider_create_trx_alter_table(
   char *tmp_tgt_ssl_keys_char;
   char *tmp_tgt_default_files_char;
   char *tmp_tgt_default_groups_char;
+  char *tmp_tgt_dsns_char;
   char *tmp_static_link_ids_char;
   uint old_elements;
 
@@ -549,81 +552,94 @@ int spider_create_trx_alter_table(
 
   if (!(alter_table = (SPIDER_ALTER_TABLE *)
     spider_bulk_malloc(spider_current_trx, 55, MYF(MY_WME | MY_ZEROFILL),
-      &alter_table, sizeof(*alter_table),
-      &tmp_name, sizeof(char) * (share->table_name_length + 1),
-
-      &tmp_server_names, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_table_names, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_dbs, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_hosts, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_usernames, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_passwords, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_sockets, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_wrappers, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_ssl_cas, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_ssl_capaths, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_ssl_certs, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_ssl_ciphers, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_ssl_keys, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_default_files, sizeof(char *) * share->all_link_count,
-      &tmp_tgt_default_groups, sizeof(char *) * share->all_link_count,
-      &tmp_static_link_ids, sizeof(char *) * share->all_link_count,
-
-      &tmp_server_names_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_table_names_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_dbs_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_hosts_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_usernames_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_passwords_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_sockets_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_wrappers_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_ssl_cas_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_ssl_capaths_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_ssl_certs_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_ssl_ciphers_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_ssl_keys_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_default_files_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_tgt_default_groups_lengths, sizeof(uint) * share->all_link_count,
-      &tmp_static_link_ids_lengths, sizeof(uint) * share->all_link_count,
-
-      &tmp_tgt_ports, sizeof(long) * share->all_link_count,
-      &tmp_tgt_ssl_vscs, sizeof(long) * share->all_link_count,
+      &alter_table, (uint) (sizeof(*alter_table)),
+      &tmp_name, (uint) (sizeof(char) * (share->table_name_length + 1)),
+
+      &tmp_server_names, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_table_names, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_dbs, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_hosts, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_usernames, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_passwords, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_sockets, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_wrappers, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_ssl_cas, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_ssl_capaths, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_ssl_certs, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_ssl_ciphers, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_ssl_keys, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_default_files, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_default_groups, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_tgt_dsns, (uint) (sizeof(char *) * share->all_link_count),
+      &tmp_static_link_ids, (uint) (sizeof(char *) * share->all_link_count),
+
+      &tmp_server_names_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_table_names_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_dbs_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_hosts_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_usernames_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_passwords_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_sockets_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_wrappers_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_ssl_cas_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_ssl_capaths_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_ssl_certs_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_ssl_ciphers_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_ssl_keys_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_default_files_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_default_groups_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_tgt_dsns_lengths, (uint) (sizeof(uint) * share->all_link_count),
+      &tmp_static_link_ids_lengths,
+        (uint) (sizeof(uint) * share->all_link_count),
+
+      &tmp_tgt_ports, (uint) (sizeof(long) * share->all_link_count),
+      &tmp_tgt_ssl_vscs, (uint) (sizeof(long) * share->all_link_count),
       &tmp_monitoring_binlog_pos_at_failing,
-        sizeof(long) * share->all_link_count,
-      &tmp_link_statuses, sizeof(long) * share->all_link_count,
-
-      &tmp_server_names_char, sizeof(char) *
-        (share_alter->tmp_server_names_charlen + 1),
-      &tmp_tgt_table_names_char, sizeof(char) *
-        (share_alter->tmp_tgt_table_names_charlen + 1),
-      &tmp_tgt_dbs_char, sizeof(char) *
-        (share_alter->tmp_tgt_dbs_charlen + 1),
-      &tmp_tgt_hosts_char, sizeof(char) *
-        (share_alter->tmp_tgt_hosts_charlen + 1),
-      &tmp_tgt_usernames_char, sizeof(char) *
-        (share_alter->tmp_tgt_usernames_charlen + 1),
-      &tmp_tgt_passwords_char, sizeof(char) *
-        (share_alter->tmp_tgt_passwords_charlen + 1),
-      &tmp_tgt_sockets_char, sizeof(char) *
-        (share_alter->tmp_tgt_sockets_charlen + 1),
-      &tmp_tgt_wrappers_char, sizeof(char) *
-        (share_alter->tmp_tgt_wrappers_charlen + 1),
-      &tmp_tgt_ssl_cas_char, sizeof(char) *
-        (share_alter->tmp_tgt_ssl_cas_charlen + 1),
-      &tmp_tgt_ssl_capaths_char, sizeof(char) *
-        (share_alter->tmp_tgt_ssl_capaths_charlen + 1),
-      &tmp_tgt_ssl_certs_char, sizeof(char) *
-        (share_alter->tmp_tgt_ssl_certs_charlen + 1),
-      &tmp_tgt_ssl_ciphers_char, sizeof(char) *
-        (share_alter->tmp_tgt_ssl_ciphers_charlen + 1),
-      &tmp_tgt_ssl_keys_char, sizeof(char) *
-        (share_alter->tmp_tgt_ssl_keys_charlen + 1),
-      &tmp_tgt_default_files_char, sizeof(char) *
-        (share_alter->tmp_tgt_default_files_charlen + 1),
-      &tmp_tgt_default_groups_char, sizeof(char) *
-        (share_alter->tmp_tgt_default_groups_charlen + 1),
-      &tmp_static_link_ids_char, sizeof(char) *
-        (share_alter->tmp_static_link_ids_charlen + 1),
+        (uint) (sizeof(long) * share->all_link_count),
+      &tmp_link_statuses, (uint) (sizeof(long) * share->all_link_count),
+
+      &tmp_server_names_char, (uint) (sizeof(char) *
+        (share_alter->tmp_server_names_charlen + 1)),
+      &tmp_tgt_table_names_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_table_names_charlen + 1)),
+      &tmp_tgt_dbs_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_dbs_charlen + 1)),
+      &tmp_tgt_hosts_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_hosts_charlen + 1)),
+      &tmp_tgt_usernames_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_usernames_charlen + 1)),
+      &tmp_tgt_passwords_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_passwords_charlen + 1)),
+      &tmp_tgt_sockets_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_sockets_charlen + 1)),
+      &tmp_tgt_wrappers_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_wrappers_charlen + 1)),
+      &tmp_tgt_ssl_cas_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_ssl_cas_charlen + 1)),
+      &tmp_tgt_ssl_capaths_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_ssl_capaths_charlen + 1)),
+      &tmp_tgt_ssl_certs_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_ssl_certs_charlen + 1)),
+      &tmp_tgt_ssl_ciphers_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_ssl_ciphers_charlen + 1)),
+      &tmp_tgt_ssl_keys_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_ssl_keys_charlen + 1)),
+      &tmp_tgt_default_files_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_default_files_charlen + 1)),
+      &tmp_tgt_default_groups_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_default_groups_charlen + 1)),
+      &tmp_tgt_dsns_char, (uint) (sizeof(char) *
+        (share_alter->tmp_tgt_dsns_charlen + 1)),
+      &tmp_static_link_ids_char, (uint) (sizeof(char) *
+        (share_alter->tmp_static_link_ids_charlen + 1)),
       NullS))
   ) {
     error_num = HA_ERR_OUT_OF_MEM;
@@ -657,6 +673,7 @@ int spider_create_trx_alter_table(
   alter_table->tmp_tgt_ssl_keys = tmp_tgt_ssl_keys;
   alter_table->tmp_tgt_default_files = tmp_tgt_default_files;
   alter_table->tmp_tgt_default_groups = tmp_tgt_default_groups;
+  alter_table->tmp_tgt_dsns = tmp_tgt_dsns;
   alter_table->tmp_static_link_ids = tmp_static_link_ids;
 
   alter_table->tmp_tgt_ports = tmp_tgt_ports;
@@ -680,6 +697,7 @@ int spider_create_trx_alter_table(
   alter_table->tmp_tgt_ssl_keys_lengths = tmp_tgt_ssl_keys_lengths;
   alter_table->tmp_tgt_default_files_lengths = tmp_tgt_default_files_lengths;
   alter_table->tmp_tgt_default_groups_lengths = tmp_tgt_default_groups_lengths;
+  alter_table->tmp_tgt_dsns_lengths = tmp_tgt_dsns_lengths;
   alter_table->tmp_static_link_ids_lengths = tmp_static_link_ids_lengths;
 
   for(roop_count = 0; roop_count < (int) share->all_link_count; roop_count++)
@@ -780,6 +798,12 @@ int spider_create_trx_alter_table(
     tmp_tgt_default_groups_char +=
       share_alter->tmp_tgt_default_groups_lengths[roop_count] + 1;
 
+    tmp_tgt_dsns[roop_count] = tmp_tgt_dsns_char;
+    memcpy(tmp_tgt_dsns_char, share_alter->tmp_tgt_dsns[roop_count],
+      sizeof(char) * share_alter->tmp_tgt_dsns_lengths[roop_count]);
+    tmp_tgt_dsns_char +=
+      share_alter->tmp_tgt_dsns_lengths[roop_count] + 1;
+
     if (share_alter->tmp_static_link_ids[roop_count])
     {
       tmp_static_link_ids[roop_count] = tmp_static_link_ids_char;
@@ -833,6 +857,8 @@ int spider_create_trx_alter_table(
   memcpy(tmp_tgt_default_groups_lengths,
     share_alter->tmp_tgt_default_groups_lengths,
     sizeof(uint) * share->all_link_count);
+  memcpy(tmp_tgt_dsns_lengths, share_alter->tmp_tgt_dsns_lengths,
+    sizeof(uint) * share->all_link_count);
   memcpy(tmp_static_link_ids_lengths,
     share_alter->tmp_static_link_ids_lengths,
     sizeof(uint) * share->all_link_count);
@@ -867,6 +893,8 @@ int spider_create_trx_alter_table(
     share_alter->tmp_tgt_default_files_length;
   alter_table->tmp_tgt_default_groups_length =
     share_alter->tmp_tgt_default_groups_length;
+  alter_table->tmp_tgt_dsns_length =
+    share_alter->tmp_tgt_dsns_length;
   alter_table->tmp_static_link_ids_length =
     share_alter->tmp_static_link_ids_length;
   alter_table->tmp_tgt_ports_length =
@@ -1071,6 +1099,16 @@ bool spider_cmp_trx_alter_table(
         )
       ) ||
       (
+        cmp1->tmp_tgt_dsns[roop_count] !=
+          cmp2->tmp_tgt_dsns[roop_count] &&
+        (
+          !cmp1->tmp_tgt_dsns[roop_count] ||
+          !cmp2->tmp_tgt_dsns[roop_count] ||
+          strcmp(cmp1->tmp_tgt_dsns[roop_count],
+            cmp2->tmp_tgt_dsns[roop_count])
+        )
+      ) ||
+      (
         cmp1->tmp_static_link_ids[roop_count] !=
           cmp2->tmp_static_link_ids[roop_count] &&
         (
@@ -1191,6 +1229,7 @@ SPIDER_TRX *spider_get_trx(
   int roop_count = 0, roop_count2;
   SPIDER_TRX *trx;
   SPIDER_SHARE *tmp_share;
+  SPIDER_WIDE_HANDLER *tmp_wide_handler;
   pthread_mutex_t *udf_table_mutexes;
   DBUG_ENTER("spider_get_trx");
 
@@ -1201,10 +1240,11 @@ SPIDER_TRX *spider_get_trx(
     DBUG_PRINT("info",("spider create new trx"));
     if (!(trx = (SPIDER_TRX *)
       spider_bulk_malloc(NULL, 56, MYF(MY_WME | MY_ZEROFILL),
-        &trx, sizeof(*trx),
-        &tmp_share, sizeof(SPIDER_SHARE),
-        &udf_table_mutexes, sizeof(pthread_mutex_t) *
-          spider_param_udf_table_lock_mutex_count(),
+        &trx, (uint) (sizeof(*trx)),
+        &tmp_share, (uint) (sizeof(SPIDER_SHARE)),
+        &tmp_wide_handler, (uint) sizeof(SPIDER_WIDE_HANDLER),
+        &udf_table_mutexes, (uint) (sizeof(pthread_mutex_t) *
+          spider_param_udf_table_lock_mutex_count()),
         NullS))
     )
       goto error_alloc_trx;
@@ -1228,8 +1268,9 @@ SPIDER_TRX *spider_get_trx(
     }
 
     if (
-      my_hash_init(&trx->trx_conn_hash, spd_charset_utf8_bin, 32, 0, 0,
-                   (my_hash_get_key) spider_conn_get_key, 0, 0)
+      my_hash_init(PSI_INSTRUMENT_ME, &trx->trx_conn_hash,
+                   spd_charset_utf8mb3_bin, 32, 0, 0, (my_hash_get_key)
+                   spider_conn_get_key, 0, 0)
     )
       goto error_init_hash;
     spider_alloc_calc_mem_init(trx->trx_conn_hash, 151);
@@ -1240,8 +1281,9 @@ SPIDER_TRX *spider_get_trx(
       trx->trx_conn_hash.array.size_of_element);
 
     if (
-      my_hash_init(&trx->trx_another_conn_hash, spd_charset_utf8_bin, 32, 0, 0,
-                   (my_hash_get_key) spider_conn_get_key, 0, 0)
+      my_hash_init(PSI_INSTRUMENT_ME, &trx->trx_another_conn_hash,
+                   spd_charset_utf8mb3_bin, 32, 0, 0, (my_hash_get_key)
+                   spider_conn_get_key, 0, 0)
     )
       goto error_init_another_hash;
     spider_alloc_calc_mem_init(trx->trx_another_conn_hash, 152);
@@ -1253,7 +1295,7 @@ SPIDER_TRX *spider_get_trx(
 
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     if (
-      my_hash_init(&trx->trx_hs_r_conn_hash, spd_charset_utf8_bin, 32, 0, 0,
+      my_hash_init(&trx->trx_hs_r_conn_hash, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_conn_get_key, 0, 0)
     )
       goto error_hs_r_init_hash;
@@ -1265,7 +1307,7 @@ SPIDER_TRX *spider_get_trx(
       trx->trx_hs_r_conn_hash.array.size_of_element);
 
     if (
-      my_hash_init(&trx->trx_hs_w_conn_hash, spd_charset_utf8_bin, 32, 0, 0,
+      my_hash_init(&trx->trx_hs_w_conn_hash, spd_charset_utf8mb3_bin, 32, 0, 0,
                    (my_hash_get_key) spider_conn_get_key, 0, 0)
     )
       goto error_hs_w_init_hash;
@@ -1279,7 +1321,7 @@ SPIDER_TRX *spider_get_trx(
 
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
     if (
-      my_hash_init(&trx->trx_direct_hs_r_conn_hash, spd_charset_utf8_bin, 32,
+      my_hash_init(&trx->trx_direct_hs_r_conn_hash, spd_charset_utf8mb3_bin, 32,
         0, 0, (my_hash_get_key) spider_conn_get_key, 0, 0)
     )
       goto error_direct_hs_r_init_hash;
@@ -1291,7 +1333,7 @@ SPIDER_TRX *spider_get_trx(
       trx->trx_direct_hs_r_conn_hash.array.size_of_element);
 
     if (
-      my_hash_init(&trx->trx_direct_hs_w_conn_hash, spd_charset_utf8_bin, 32,
+      my_hash_init(&trx->trx_direct_hs_w_conn_hash, spd_charset_utf8mb3_bin, 32,
         0, 0, (my_hash_get_key) spider_conn_get_key, 0, 0)
     )
       goto error_direct_hs_w_init_hash;
@@ -1304,8 +1346,9 @@ SPIDER_TRX *spider_get_trx(
 #endif
 
     if (
-      my_hash_init(&trx->trx_alter_table_hash, spd_charset_utf8_bin, 32, 0, 0,
-                   (my_hash_get_key) spider_alter_tbl_get_key, 0, 0)
+      my_hash_init(PSI_INSTRUMENT_ME, &trx->trx_alter_table_hash,
+                   spd_charset_utf8mb3_bin, 32, 0, 0, (my_hash_get_key)
+                   spider_alter_tbl_get_key, 0, 0)
     )
       goto error_init_alter_hash;
     spider_alloc_calc_mem_init(trx->trx_alter_table_hash, 157);
@@ -1316,8 +1359,9 @@ SPIDER_TRX *spider_get_trx(
       trx->trx_alter_table_hash.array.size_of_element);
 
     if (
-      my_hash_init(&trx->trx_ha_hash, spd_charset_utf8_bin, 32, 0, 0,
-                   (my_hash_get_key) spider_trx_ha_get_key, 0, 0)
+      my_hash_init(PSI_INSTRUMENT_ME, &trx->trx_ha_hash,
+                   spd_charset_utf8mb3_bin, 32, 0, 0, (my_hash_get_key)
+                   spider_trx_ha_get_key, 0, 0)
     )
       goto error_init_trx_ha_hash;
     spider_alloc_calc_mem_init(trx->trx_ha_hash, 158);
@@ -1374,7 +1418,8 @@ SPIDER_TRX *spider_get_trx(
       }
       trx->tmp_spider->need_mons = &trx->tmp_need_mon;
       trx->tmp_spider->share = trx->tmp_share;
-      trx->tmp_spider->trx = trx;
+      trx->tmp_spider->wide_handler = tmp_wide_handler;
+      tmp_wide_handler->trx = trx;
       trx->tmp_spider->dbton_handler = trx->tmp_dbton_handler;
       if (!(trx->tmp_spider->result_list.sqls =
         new spider_string[trx->tmp_share->link_count]))
@@ -1557,7 +1602,8 @@ error_alloc_trx:
 
 int spider_free_trx(
   SPIDER_TRX *trx,
-  bool need_lock
+  bool need_lock,
+  bool reset_ha_data
 ) {
   DBUG_ENTER("spider_free_trx");
   if (trx->thd)
@@ -1575,7 +1621,8 @@ int spider_free_trx(
       if (need_lock)
         pthread_mutex_unlock(&spider_allocated_thds_mutex);
     }
-    thd_set_ha_data(trx->thd, spider_hton_ptr, NULL);
+    if (reset_ha_data)
+      thd_set_ha_data(trx->thd, spider_hton_ptr, NULL);
   }
   spider_free_trx_alloc(trx);
   spider_merge_mem_calc(trx, TRUE);
@@ -1860,27 +1907,14 @@ int spider_start_internal_consistent_snapshot(
 }
 
 int spider_internal_start_trx(
-  ha_spider *spider,
-  SPIDER_CONN *conn,
-  int link_idx
+  ha_spider *spider
 ) {
   int error_num;
-  SPIDER_TRX *trx = spider->trx;
+  SPIDER_TRX *trx = spider->wide_handler->trx;
   THD *thd = trx->thd;
-  bool sync_autocommit = spider_param_sync_autocommit(thd);
-  double ping_interval_at_trx_start =
-    spider_param_ping_interval_at_trx_start(thd);
   bool xa_lock = FALSE;
-  time_t tmp_time = (time_t) time((time_t*) 0);
   DBUG_ENTER("spider_internal_start_trx");
 
-  if (
-    conn->server_lost ||
-    difftime(tmp_time, conn->ping_time) >= ping_interval_at_trx_start
-  ) {
-    spider_conn_queue_ping(spider, conn, link_idx);
-  }
-  conn->disable_reconnect = TRUE;
   if (!trx->trx_start)
   {
     if (!trx->trx_consistent_snapshot)
@@ -1891,19 +1925,7 @@ int spider_internal_start_trx(
       trx->internal_xa_snapshot = spider_param_internal_xa_snapshot(thd);
     }
   }
-  if (
-    (error_num = spider_check_and_set_sql_log_off(thd, conn,
-      &spider->need_mons[link_idx])) ||
-    (error_num = spider_check_and_set_wait_timeout(thd, conn,
-      &spider->need_mons[link_idx])) ||
-    (spider_param_sync_sql_mode(thd) &&
-      (error_num = spider_check_and_set_sql_mode(thd, conn,
-        &spider->need_mons[link_idx]))) ||
-    (sync_autocommit &&
-      (error_num = spider_check_and_set_autocommit(thd, conn,
-        &spider->need_mons[link_idx])))
-  )
-    goto error;
+  spider->wide_handler->consistent_snapshot = FALSE;
   if (trx->trx_consistent_snapshot)
   {
     if (trx->internal_xa && trx->internal_xa_snapshot < 2)
@@ -1914,9 +1936,7 @@ int spider_internal_start_trx(
       goto error;
     } else if (!trx->internal_xa || trx->internal_xa_snapshot == 2)
     {
-      if ((error_num = spider_start_internal_consistent_snapshot(trx, conn,
-        &spider->need_mons[link_idx])))
-        goto error;
+      spider->wide_handler->consistent_snapshot = TRUE;
     }
   }
   DBUG_PRINT("info",("spider trx->trx_start= %s",
@@ -1924,7 +1944,7 @@ int spider_internal_start_trx(
   if (!trx->trx_start)
   {
     if (
-      thd->transaction.xid_state.is_explicit_XA() &&
+      thd->transaction->xid_state.is_explicit_XA() &&
       spider_param_support_xa()
     ) {
       trx->trx_xa = TRUE;
@@ -1935,7 +1955,7 @@ int spider_internal_start_trx(
       !trx->trx_xa &&
       trx->internal_xa &&
       (!trx->trx_consistent_snapshot || trx->internal_xa_snapshot == 3) &&
-      spider->sql_command != SQLCOM_LOCK_TABLES
+      spider->wide_handler->sql_command != SQLCOM_LOCK_TABLES
     ) {
       trx->trx_xa = TRUE;
       trx->xid.formatID = 1;
@@ -1979,15 +1999,63 @@ int spider_internal_start_trx(
       trx->trx_consistent_snapshot ? "TRUE" : "FALSE"));
     if (!trx->trx_consistent_snapshot)
     {
-      trans_register_ha(thd, FALSE, spider_hton_ptr);
+      trans_register_ha(thd, FALSE, spider_hton_ptr, 0);
       if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
-        trans_register_ha(thd, TRUE, spider_hton_ptr);
+        trans_register_ha(thd, TRUE, spider_hton_ptr, 0);
     }
     trx->trx_start = TRUE;
     trx->trx_xa_prepared = FALSE;
     trx->updated_in_this_trx = FALSE;
     DBUG_PRINT("info",("spider trx->updated_in_this_trx=FALSE"));
   }
+  DBUG_RETURN(0);
+
+error:
+  if (xa_lock)
+    spider_xa_unlock(&trx->internal_xid_state);
+  DBUG_RETURN(error_num);
+}
+
+int spider_internal_start_trx_for_connection(
+  ha_spider *spider,
+  SPIDER_CONN *conn,
+  int link_idx
+) {
+  int error_num;
+  SPIDER_TRX *trx = spider->wide_handler->trx;
+  THD *thd = trx->thd;
+  bool sync_autocommit = spider_param_sync_autocommit(thd);
+  double ping_interval_at_trx_start =
+    spider_param_ping_interval_at_trx_start(thd);
+  time_t tmp_time = (time_t) time((time_t*) 0);
+  DBUG_ENTER("spider_internal_start_trx_for_connection");
+  if (
+    conn->server_lost ||
+    difftime(tmp_time, conn->ping_time) >= ping_interval_at_trx_start
+  ) {
+    spider_conn_queue_ping(spider, conn, link_idx);
+  }
+  conn->disable_reconnect = TRUE;
+  if (
+    (error_num = spider_check_and_set_sql_log_off(thd, conn,
+      &spider->need_mons[link_idx])) ||
+    (error_num = spider_check_and_set_wait_timeout(thd, conn,
+      &spider->need_mons[link_idx])) ||
+    (spider_param_sync_sql_mode(thd) &&
+      (error_num = spider_check_and_set_sql_mode(thd, conn,
+        &spider->need_mons[link_idx]))) ||
+    (sync_autocommit &&
+      (error_num = spider_check_and_set_autocommit(thd, conn,
+        &spider->need_mons[link_idx])))
+  )
+    goto error;
+
+  if (spider->wide_handler->consistent_snapshot)
+  {
+    if ((error_num = spider_start_internal_consistent_snapshot(trx, conn,
+      &spider->need_mons[link_idx])))
+      goto error;
+  }
 
   DBUG_PRINT("info",("spider sync_autocommit = %d", sync_autocommit));
   DBUG_PRINT("info",("spider conn->semi_trx_chk = %d", conn->semi_trx_chk));
@@ -2053,8 +2121,6 @@ int spider_internal_start_trx(
   DBUG_RETURN(0);
 
 error:
-  if (xa_lock)
-    spider_xa_unlock(&trx->internal_xid_state);
   DBUG_RETURN(error_num);
 }
 
@@ -2070,11 +2136,7 @@ int spider_internal_xa_commit(
   SPIDER_CONN *conn;
   uint force_commit = spider_param_force_commit(thd);
   MEM_ROOT mem_root;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool table_xa_opened = FALSE;
   bool table_xa_member_opened = FALSE;
   DBUG_ENTER("spider_internal_xa_commit");
@@ -2258,11 +2320,7 @@ int spider_internal_xa_rollback(
   SPIDER_CONN *conn;
   uint force_commit = spider_param_force_commit(thd);
   MEM_ROOT mem_root;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool server_lost = FALSE;
   bool table_xa_opened = FALSE;
   bool table_xa_member_opened = FALSE;
@@ -2500,11 +2558,7 @@ int spider_internal_xa_prepare(
   int error_num;
   SPIDER_CONN *conn;
   uint force_commit = spider_param_force_commit(thd);
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool table_xa_opened = FALSE;
   bool table_xa_member_opened = FALSE;
   DBUG_ENTER("spider_internal_xa_prepare");
@@ -2675,11 +2729,7 @@ int spider_internal_xa_recover(
   int cnt = 0;
   char xa_key[MAX_KEY_LENGTH];
   MEM_ROOT mem_root;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   DBUG_ENTER("spider_internal_xa_recover");
   /*
     select
@@ -2734,50 +2784,24 @@ int spider_initinal_xa_recover(
   uint len
 ) {
   int error_num;
-  static THD *thd = NULL;
-  static TABLE *table_xa = NULL;
-  static READ_RECORD *read_record = NULL;
-#if MYSQL_VERSION_ID < 50500
-  static Open_tables_state *open_tables_backup = NULL;
-#else
-  static Open_tables_backup *open_tables_backup = NULL;
-#endif
+  THD *thd;
+  TABLE *table_xa;
+  READ_RECORD *read_record;
+  SPIDER_Open_tables_backup open_tables_backup;
   int cnt = 0;
   MEM_ROOT mem_root;
   DBUG_ENTER("spider_initinal_xa_recover");
-  if (!open_tables_backup)
-  {
-#if MYSQL_VERSION_ID < 50500
-    if (!(open_tables_backup = new Open_tables_state))
-#else
-    if (!(open_tables_backup = new Open_tables_backup))
-#endif
-    {
-      error_num = HA_ERR_OUT_OF_MEM;
-      goto error_create_state;
-    }
-  }
-  if (!read_record)
+  if (!(read_record = new READ_RECORD))
   {
-    if (!(read_record = new READ_RECORD))
-    {
-      error_num = HA_ERR_OUT_OF_MEM;
-      goto error_create_read_record;
-    }
+    error_num = HA_ERR_OUT_OF_MEM;
+    goto error_create_read_record;
   }
 
-/*
-  if (!thd)
+  if (!(thd = spider_create_tmp_thd()))
   {
-*/
-    if (!(thd = spider_create_tmp_thd()))
-    {
-      error_num = HA_ERR_OUT_OF_MEM;
-      goto error_create_thd;
-    }
-/*
+    error_num = HA_ERR_OUT_OF_MEM;
+    goto error_create_thd;
   }
-*/
 
   /*
     select
@@ -2788,17 +2812,14 @@ int spider_initinal_xa_recover(
     from
       mysql.spider_xa
   */
-  if (!table_xa)
-  {
-    if (
-      !(table_xa = spider_open_sys_table(
-        thd, SPIDER_SYS_XA_TABLE_NAME_STR, SPIDER_SYS_XA_TABLE_NAME_LEN,
-        FALSE, open_tables_backup, TRUE, &error_num))
-    )
-      goto error_open_table;
-    SPIDER_init_read_record(read_record, thd, table_xa, NULL, NULL, TRUE,
-      FALSE, FALSE);
-  }
+  if (
+    !(table_xa = spider_open_sys_table(
+      thd, SPIDER_SYS_XA_TABLE_NAME_STR, SPIDER_SYS_XA_TABLE_NAME_LEN,
+      FALSE, &open_tables_backup, TRUE, &error_num))
+  )
+    goto error_open_table;
+  SPIDER_init_read_record(read_record, thd, table_xa, NULL, NULL, TRUE,
+    FALSE, FALSE);
   SPD_INIT_ALLOC_ROOT(&mem_root, 4096, 0, MYF(MY_WME));
   while ((!(read_record->SPIDER_read_record_read_record(read_record))) &&
     cnt < (int) len)
@@ -2808,30 +2829,15 @@ int spider_initinal_xa_recover(
   }
   free_root(&mem_root, MYF(0));
 
-/*
-  if (cnt < (int) len)
-  {
-*/
-    end_read_record(read_record);
-    spider_close_sys_table(thd, table_xa, open_tables_backup, TRUE);
-    table_xa = NULL;
-    spider_free_tmp_thd(thd);
-    thd = NULL;
-    delete read_record;
-    read_record = NULL;
-    delete open_tables_backup;
-    open_tables_backup = NULL;
-/*
-  }
-*/
-  DBUG_RETURN(cnt);
-
-/*
-error:
-  end_read_record(&read_record_info);
+  end_read_record(read_record);
   spider_close_sys_table(thd, table_xa, &open_tables_backup, TRUE);
   table_xa = NULL;
-*/
+  spider_free_tmp_thd(thd);
+  thd = NULL;
+  delete read_record;
+  read_record = NULL;
+  DBUG_RETURN(cnt);
+
 error_open_table:
   spider_free_tmp_thd(thd);
   thd = NULL;
@@ -2839,9 +2845,6 @@ error_create_thd:
   delete read_record;
   read_record = NULL;
 error_create_read_record:
-  delete open_tables_backup;
-  open_tables_backup = NULL;
-error_create_state:
   DBUG_RETURN(0);
 }
 
@@ -2862,11 +2865,7 @@ int spider_internal_xa_commit_by_xid(
   SPIDER_CONN *conn;
   uint force_commit = spider_param_force_commit(thd);
   MEM_ROOT mem_root;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool table_xa_opened = FALSE;
   bool table_xa_member_opened = FALSE;
   DBUG_ENTER("spider_internal_xa_commit_by_xid");
@@ -3097,11 +3096,7 @@ int spider_internal_xa_rollback_by_xid(
   SPIDER_CONN *conn;
   uint force_commit = spider_param_force_commit(thd);
   MEM_ROOT mem_root;
-#if MYSQL_VERSION_ID < 50500
-  Open_tables_state open_tables_backup;
-#else
-  Open_tables_backup open_tables_backup;
-#endif
+  SPIDER_Open_tables_backup open_tables_backup;
   bool table_xa_opened = FALSE;
   bool table_xa_member_opened = FALSE;
   DBUG_ENTER("spider_internal_xa_rollback_by_xid");
@@ -3336,8 +3331,8 @@ int spider_start_consistent_snapshot(
       trx->trx_consistent_snapshot = TRUE;
       trx->use_consistent_snapshot = TRUE;
       trx->internal_xa_snapshot = spider_param_internal_xa_snapshot(trx->thd);
-      trans_register_ha(trx->thd, FALSE, spider_hton_ptr);
-      trans_register_ha(trx->thd, TRUE, spider_hton_ptr);
+      trans_register_ha(trx->thd, FALSE, spider_hton_ptr, 0);
+      trans_register_ha(trx->thd, TRUE, spider_hton_ptr, 0);
       if (spider_param_use_all_conns_snapshot(trx->thd))
       {
         trx->internal_xa = FALSE;
@@ -3747,12 +3742,10 @@ int spider_check_trx_and_get_conn(
     DBUG_PRINT("info",("spider get trx error"));
     DBUG_RETURN(error_num);
   }
-  spider->trx = trx;
+  spider->wide_handler->trx = trx;
   spider->set_error_mode();
-  if (
-    spider->sql_command != SQLCOM_DROP_TABLE &&
-    spider->sql_command != SQLCOM_ALTER_TABLE
-  ) {
+  if (spider->wide_handler->sql_command != SQLCOM_DROP_TABLE)
+  {
     SPIDER_TRX_HA *trx_ha = spider_check_trx_ha(trx, spider);
     if (!trx_ha || trx_ha->wait_for_reusing)
       spider_trx_set_link_idx_for_all(spider);
@@ -3807,9 +3800,9 @@ int spider_check_trx_and_get_conn(
         SPIDER_LINK_STATUS_NG
     ) {
       DBUG_PRINT("info",(first_byte != *spider->conn_keys[0] ?
-        "spider change conn type" : trx != spider->trx ? "spider change thd" :
-        "spider next trx"));
-      spider->trx = trx;
+        "spider change conn type" : trx != spider->wide_handler->trx ?
+        "spider change thd" : "spider next trx"));
+      spider->wide_handler->trx = trx;
       spider->trx_conn_adjustment = trx->trx_conn_adjustment;
 #if defined(HS_HAS_SQLCOM) && defined(HAVE_HANDLERSOCKET)
       if (use_conn_kind)
@@ -4170,8 +4163,7 @@ THD *spider_create_tmp_thd()
   thd->thread_id = thd->variables.pseudo_thread_id = 0;
 #endif
   thd->thread_stack = (char*) &thd;
-  if (thd->store_globals())
-    DBUG_RETURN(NULL);
+  thd->store_globals();
   lex_start(thd);
   DBUG_RETURN(thd);
 }
@@ -4228,10 +4220,10 @@ int spider_create_trx_ha(
   {
     if (!(trx_ha = (SPIDER_TRX_HA *)
       spider_bulk_malloc(spider_current_trx, 58, MYF(MY_WME),
-        &trx_ha, sizeof(SPIDER_TRX_HA),
-        &tmp_name, sizeof(char *) * (share->table_name_length + 1),
-        &conn_link_idx, sizeof(uint) * share->link_count,
-        &conn_can_fo, sizeof(uchar) * share->link_bitmap_size,
+        &trx_ha, (uint) (sizeof(SPIDER_TRX_HA)),
+        &tmp_name, (uint) (sizeof(char *) * (share->table_name_length + 1)),
+        &conn_link_idx, (uint) (sizeof(uint) * share->link_count),
+        &conn_can_fo, (uint) (sizeof(uchar) * share->link_bitmap_size),
         NullS))
     ) {
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
diff --git a/storage/spider/spd_trx.h b/storage/spider/spd_trx.h
index 3883ec49723..3bf93aada1a 100644
--- a/storage/spider/spd_trx.h
+++ b/storage/spider/spd_trx.h
@@ -80,7 +80,8 @@ SPIDER_TRX *spider_get_trx(
 
 int spider_free_trx(
   SPIDER_TRX *trx,
-  bool need_lock
+  bool need_lock,
+  bool reset_ha_data= true
 );
 
 int spider_check_and_set_trx_isolation(
@@ -125,6 +126,10 @@ int spider_start_internal_consistent_snapshot(
 );
 
 int spider_internal_start_trx(
+  ha_spider *spider
+);
+
+int spider_internal_start_trx_for_connection(
   ha_spider *spider,
   SPIDER_CONN *conn,
   int link_idx
diff --git a/storage/spider/spider.cnf b/storage/spider/spider.cnf
new file mode 100644
index 00000000000..ebc9968b490
--- /dev/null
+++ b/storage/spider/spider.cnf
@@ -0,0 +1,7 @@
+[mariadb]
+#
+# Uncomment the plugin-load-add line below to load the Spider plugin
+#
+#plugin-load-add = ha_spider
+
+# Read more at https://mariadb.com/kb/en/spider/
diff --git a/storage/test_sql_discovery/mysql-test/sql_discovery/simple.result b/storage/test_sql_discovery/mysql-test/sql_discovery/simple.result
index 1feea5e47ee..d63ec136225 100644
--- a/storage/test_sql_discovery/mysql-test/sql_discovery/simple.result
+++ b/storage/test_sql_discovery/mysql-test/sql_discovery/simple.result
@@ -82,7 +82,7 @@ select * from t1;
 ERROR HY000: Engine TEST_SQL_DISCOVERY failed to discover table `test`.`t1` with 'create table t1 (a uint)'
 show warnings;
 Level	Code	Message
-Error	1064	You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near ')' at line 1
+Error	4161	Unknown data type: 'uint'
 Error	1939	Engine TEST_SQL_DISCOVERY failed to discover table `test`.`t1` with 'create table t1 (a uint)'
 set @@test_sql_discovery_statement='t1:create table t1 (a int)';
 select * from t1;
diff --git a/storage/test_sql_discovery/test_sql_discovery.cc b/storage/test_sql_discovery/test_sql_discovery.cc
index 9e7a22368fc..0758d5f503f 100644
--- a/storage/test_sql_discovery/test_sql_discovery.cc
+++ b/storage/test_sql_discovery/test_sql_discovery.cc
@@ -147,11 +147,37 @@ static int discover_table(handlerton *hton, THD* thd, TABLE_SHARE *share)
                                                sql, strlen(sql));
 }
 
+/**
+  Engine drop_table hook: succeeds only if the session's statement
+  variable names the table at @a path.
+
+  The table name is taken from the last component of @a path and compared
+  with the THDVAR "statement" value, which is expected to start with
+  "<table name>:" (e.g. 't1:create table t1 (a int)').
+
+  @param hton  handlerton of this engine (unused)
+  @param path  table path; the part after the last FN_LIBCHAR is the name
+
+  @retval 0       the statement variable starts with "<table name>:"
+  @retval ENOENT  the variable is unset or does not start with that prefix
+*/
+static int drop_table(handlerton *hton, const char *path)
+{
+  /* A path without FN_LIBCHAR is taken as a bare table name. */
+  const char *sep= strrchr(path, FN_LIBCHAR);
+  const char *name= sep ? sep + 1 : path;
+  const size_t name_len= strlen(name);
+  const char *sql= THDVAR(current_thd, statement);
+  /* strncmp() == 0 guarantees sql has at least name_len characters. */
+  return !sql || strncmp(sql, name, name_len) || sql[name_len] != ':'
+    ? ENOENT : 0;
+}
+
 static int init(void *p)
 {
   handlerton *hton = (handlerton *)p;
   hton->create = create_handler;
   hton->discover_table = discover_table;
+  hton->drop_table= drop_table;
   return 0;
 }
 
diff --git a/storage/tokudb/CMakeLists.txt b/storage/tokudb/CMakeLists.txt
index ee91807c619..6de57cd6a97 100644
--- a/storage/tokudb/CMakeLists.txt
+++ b/storage/tokudb/CMakeLists.txt
@@ -43,6 +43,7 @@ SET(TOKUDB_SOURCES
     tokudb_thread.cc
     tokudb_dir_cmd.cc)
 MYSQL_ADD_PLUGIN(tokudb ${TOKUDB_SOURCES} STORAGE_ENGINE MODULE_ONLY
+                 DISABLED
                  COMPONENT tokudb-engine CONFIG ${CMAKE_CURRENT_BINARY_DIR}/tokudb.cnf)
 
 IF(NOT TARGET tokudb)
diff --git a/storage/tokudb/PerconaFT/ft/tests/logcursor-empty-logfile-3.cc b/storage/tokudb/PerconaFT/ft/tests/logcursor-empty-logfile-3.cc
index 8bb9d961fdd..e54ea4955f1 100644
--- a/storage/tokudb/PerconaFT/ft/tests/logcursor-empty-logfile-3.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/logcursor-empty-logfile-3.cc
@@ -174,7 +174,7 @@ test_main (int argc, const char *argv[]) {
         r = toku_logger_open(TOKU_TEST_FILENAME, logger);  assert(r==0);
 
         toku_logger_maybe_trim_log(logger, trim_lsn);
-        assert( toku_logfilemgr_num_logfiles(logger->logfilemgr) == 4 ); // untrimmed log, empty log, plus newly openned log
+        assert( toku_logfilemgr_num_logfiles(logger->logfilemgr) == 4 ); // untrimmed log, empty log, plus newly opened log
 
         r = toku_logger_close(&logger);
     }
diff --git a/storage/tokudb/PerconaFT/third_party/snappy-1.1.2/aclocal.m4 b/storage/tokudb/PerconaFT/third_party/snappy-1.1.2/aclocal.m4
index 881dba8f0cd..10f8e70596b 100644
--- a/storage/tokudb/PerconaFT/third_party/snappy-1.1.2/aclocal.m4
+++ b/storage/tokudb/PerconaFT/third_party/snappy-1.1.2/aclocal.m4
@@ -8681,7 +8681,7 @@ fi[]dnl
 # to PKG_CHECK_MODULES(), but does not set variables or print errors.
 #
 # Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
-# only at the first occurence in configure.ac, so if the first place
+# only at the first occurrence in configure.ac, so if the first place
 # it's called might be skipped (such as if it is within an "if", you
 # have to call PKG_CHECK_EXISTS manually
 # --------------------------------------------------------------
diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc
index f706ab86fc9..cc7911da557 100644
--- a/storage/tokudb/ha_tokudb.cc
+++ b/storage/tokudb/ha_tokudb.cc
@@ -6387,7 +6387,7 @@ int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
             "created master %p",
             trx->all);
         trx->sp_level = trx->all;
-        trans_register_ha(thd, true, tokudb_hton);
+        trans_register_ha(thd, true, tokudb_hton, 0);
     }
     DBUG_PRINT("trans", ("starting transaction stmt"));
     if (trx->stmt) { 
@@ -6429,7 +6429,7 @@ int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
         trx->sp_level,
         trx->stmt);
     reset_stmt_progress(&trx->stmt_progress);
-    trans_register_ha(thd, false, tokudb_hton);
+    trans_register_ha(thd, false, tokudb_hton, 0);
 cleanup:
     return error;
 }
@@ -6594,7 +6594,7 @@ int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
         share->rows_from_locked_table = added_rows - deleted_rows;
     }
     transaction = trx->sub_sp_level;
-    trans_register_ha(thd, false, tokudb_hton);
+    trans_register_ha(thd, false, tokudb_hton, 0);
 cleanup:
     TOKUDB_HANDLER_DBUG_RETURN(error);
 }
@@ -7793,11 +7793,6 @@ double ha_tokudb::scan_time() {
     DBUG_RETURN(ret_val);
 }
 
-bool ha_tokudb::is_clustering_key(uint index)
-{
-    return index == primary_key || key_is_clustering(&table->key_info[index]);
-}
-
 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
 {
     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
@@ -7899,7 +7894,9 @@ double ha_tokudb::index_only_read_time(uint keynr, double records) {
 //      number > 0 - There are approximately number matching rows in the range
 //      HA_POS_ERROR - Something is wrong with the index tree
 //
-ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
+ha_rows ha_tokudb::records_in_range(uint keynr, const key_range* start_key,
+                                    const key_range* end_key,
+                                    page_range *pages) {
     TOKUDB_HANDLER_DBUG_ENTER("%d %p %p", keynr, start_key, end_key);
     DBT *pleft_key, *pright_key;
     DBT left_key, right_key;
diff --git a/storage/tokudb/ha_tokudb.h b/storage/tokudb/ha_tokudb.h
index d74c99a5f8b..5a7027a6b04 100644
--- a/storage/tokudb/ha_tokudb.h
+++ b/storage/tokudb/ha_tokudb.h
@@ -844,7 +844,9 @@ public:
     int external_lock(THD * thd, int lock_type);
     int start_stmt(THD * thd, thr_lock_type lock_type);
 
-    ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);
+    ha_rows records_in_range(uint inx, const key_range * min_key,
+                             const key_range * max_key,
+                             page_range *pages);
 
     uint32_t get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd);
     THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_lock_type lock_type);
@@ -869,10 +871,6 @@ public:
     uint8 table_cache_type() {
         return HA_CACHE_TBL_TRANSACT;
     }
-    bool primary_key_is_clustered() {
-        return true;
-    }
-    bool is_clustering_key(uint index);
     int cmp_ref(const uchar * ref1, const uchar * ref2);
     bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
 
diff --git a/storage/tokudb/ha_tokudb_alter_56.cc b/storage/tokudb/ha_tokudb_alter_56.cc
index 4bdf2cf7bda..233d929a509 100644
--- a/storage/tokudb/ha_tokudb_alter_56.cc
+++ b/storage/tokudb/ha_tokudb_alter_56.cc
@@ -57,7 +57,7 @@ public:
         expand_varchar_update_needed(false),
         expand_fixed_update_needed(false),
         expand_blob_update_needed(false),
-        optimize_needed(false),
+        optimize_needed(false), changed_fields(PSI_INSTRUMENT_MEM),
         table_kc_info(NULL),
         altered_table_kc_info(NULL) {
     }
diff --git a/storage/tokudb/hatoku_cmp.cc b/storage/tokudb/hatoku_cmp.cc
index 5f82d1e48be..51f447da13e 100644
--- a/storage/tokudb/hatoku_cmp.cc
+++ b/storage/tokudb/hatoku_cmp.cc
@@ -6,6 +6,7 @@ This file is part of TokuDB
 
 
 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+Copyright (c) 2020, MariaDB Corporation.
 
     TokuDBis is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License, version 2,
@@ -727,8 +728,7 @@ static inline uchar* pack_toku_blob(
                        max_num_bytes/charset->mbmaxlen : max_num_bytes);
     if (length > local_char_length)
     {
-      local_char_length= my_charpos(
-        charset, 
+      local_char_length= charset->charpos(
         blob_buf, 
         blob_buf+length,
         local_char_length
@@ -817,8 +817,7 @@ static uchar* pack_toku_varstring_from_desc(
                        key_part_length/charset->mbmaxlen : key_part_length);
     if (length > local_char_length)
     {
-      local_char_length= my_charpos(
-        charset, 
+      local_char_length= charset->charpos(
         from_desc, 
         from_desc+length,
         local_char_length
@@ -880,8 +879,7 @@ static inline uchar* pack_toku_varstring(
                        max_num_bytes/charset->mbmaxlen : max_num_bytes);
     if (length > local_char_length)
     {
-      local_char_length= my_charpos(
-        charset, 
+      local_char_length= charset->charpos(
         from_mysql+length_bytes_in_mysql, 
         from_mysql+length_bytes_in_mysql+length,
         local_char_length
@@ -917,8 +915,7 @@ static inline int cmp_toku_string(
 
     charset = get_charset_from_num(charset_number);
 
-    ret_val = charset->coll->strnncollsp(
-        charset,
+    ret_val = charset->strnncollsp(
         a_buf, 
         a_num_bytes,
         b_buf, 
diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc
index 6002a3efd88..c4d2cbf20a0 100644
--- a/storage/tokudb/hatoku_hton.cc
+++ b/storage/tokudb/hatoku_hton.cc
@@ -350,7 +350,6 @@ static int tokudb_init_func(void *p) {
     TOKUDB_SHARE::static_init();
     tokudb::background::initialize();
 
-    tokudb_hton->state = SHOW_OPTION_YES;
     // tokudb_hton->flags= HTON_CAN_RECREATE;  // QQQ this came from skeleton
     tokudb_hton->flags = HTON_CLOSE_CURSORS_AT_COMMIT | HTON_SUPPORTS_EXTENDED_KEYS;
 
diff --git a/storage/tokudb/man/tokuft_logprint.1 b/storage/tokudb/man/tokuft_logprint.1
index c97f7e19f69..c6b4d17e243 100644
--- a/storage/tokudb/man/tokuft_logprint.1
+++ b/storage/tokudb/man/tokuft_logprint.1
@@ -1,6 +1,6 @@
 '\" t
 .\"
-.TH "\FBTOKUFT_LOGPRINT\FR" "1" "28 March 2019" "MariaDB 10\&.4" "MariaDB Database System"
+.TH "\FBTOKUFT_LOGPRINT\FR" "1" "27 June 2019" "MariaDB 10\&.5" "MariaDB Database System"
 .\" -----------------------------------------------------------------
 .\" * set default formatting
 .\" -----------------------------------------------------------------
diff --git a/storage/tokudb/man/tokuftdump.1 b/storage/tokudb/man/tokuftdump.1
index a9c900e0045..04934f754d7 100644
--- a/storage/tokudb/man/tokuftdump.1
+++ b/storage/tokudb/man/tokuftdump.1
@@ -1,6 +1,6 @@
 '\" t
 .\"
-.TH "\FBTOKUFTDUMP\FR" "1" "28 March 2019" "MariaDB 10\&.4" "MariaDB Database System"
+.TH "\FBTOKUFTDUMP\FR" "1" "27 June 2019" "MariaDB 10\&.5" "MariaDB Database System"
 .\" -----------------------------------------------------------------
 .\" * set default formatting
 .\" -----------------------------------------------------------------
@@ -223,7 +223,7 @@ Provide summary info\&.
 .SH "COPYRIGHT"
 .br
 .PP
-Copyright 2016 MariaDB Foundation
+Copyright 2016-2019 MariaDB Foundation
 .PP
 This documentation is free software; you can redistribute it and/or modify it only under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License.
 .PP
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_delete.result b/storage/tokudb/mysql-test/tokudb/r/cluster_delete.result
index f85845232dd..cb8cb1c68bb 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_delete.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_delete.result
@@ -27,7 +27,7 @@ a	b	c	d
 11	110	1100	11000
 explain select * from t1 where c > 850;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ALL	c	NULL	NULL	NULL	NULL;	Using where
+1	SIMPLE	t1	range	c	c	5	NULL	NULL;	Using where
 select * from t1 where c > 850;
 a	b	c	d
 9	90	900	9000
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_key.result b/storage/tokudb/mysql-test/tokudb/r/cluster_key.result
index 4c2fc08cd48..b01aaef6ee7 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_key.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_key.result
@@ -53,7 +53,7 @@ a	b
 11	110
 explain select a,b from t1 where c > 350;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	c	c	4	NULL	NULL;	Using where; Using index
+1	SIMPLE	t1	range	c	c	4	NULL	NULL;	Using where; Using index
 select a,c from t1 where c > 350;
 a	c
 4	400
@@ -160,7 +160,7 @@ a	b
 11	110
 explain select a,b from t1 where c > 350;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	c	c	4	NULL	NULL;	Using where; Using index
+1	SIMPLE	t1	range	c	c	4	NULL	NULL;	Using where; Using index
 select a,c from t1 where c > 350;
 a	c
 4	400
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_query_plan.result b/storage/tokudb/mysql-test/tokudb/r/cluster_query_plan.result
index 6b458b36585..4309b622293 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_query_plan.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_query_plan.result
@@ -22,5 +22,5 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	index	b	b	5	NULL	NULL;	Using where; Using index
 explain select a from t1 where c > 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	c	c	5	NULL	NULL;	Using where; Using index
+1	SIMPLE	t1	range	c	c	5	NULL	NULL;	Using where; Using index
 drop table t1;
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_tokudb_bug_993_2.result b/storage/tokudb/mysql-test/tokudb/r/cluster_tokudb_bug_993_2.result
index 2dcb65cee10..137f2a930ca 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_tokudb_bug_993_2.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_tokudb_bug_993_2.result
@@ -38,7 +38,7 @@ max(a)
 3
 explain select a,b from z1 where a < 3;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	z1	index	a	a	10	NULL	14	Using where; Using index
+1	SIMPLE	z1	range	a	a	5	NULL	9	Using where; Using index
 select max(a) from z1 where a < 3;
 max(a)
 1
diff --git a/storage/tokudb/mysql-test/tokudb/r/type_bit.result b/storage/tokudb/mysql-test/tokudb/r/type_bit.result
index 76a032d99c4..5ef5783d866 100644
--- a/storage/tokudb/mysql-test/tokudb/r/type_bit.result
+++ b/storage/tokudb/mysql-test/tokudb/r/type_bit.result
@@ -675,7 +675,7 @@ INSERT INTO t1(a) VALUES
 (65535),(65525),(65535),(65535),(65535),(65535),(65535),(65535),(65535),(65535);
 EXPLAIN SELECT 1 FROM t1 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	NULL	a	3	NULL	10	Using index
+1	SIMPLE	t1	range	NULL	a	3	NULL	11	Using index for group-by
 SELECT 1 FROM t1 GROUP BY a;
 1
 1
@@ -759,7 +759,7 @@ CREATE TABLE t1 (a BIT(7), b BIT(9), KEY(a, b));
 INSERT INTO t1 VALUES(0, 0), (5, 3), (5, 6), (6, 4), (7, 0);
 EXPLAIN SELECT a+0, b+0 FROM t1 WHERE a > 4 and b < 7 ORDER BY 2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	a	a	5	NULL	5	Using where; Using index; Using filesort
+1	SIMPLE	t1	range	a	a	2	NULL	4	Using where; Using index; Using filesort
 DROP TABLE t1;
 End of 5.0 tests
 create table t1(a bit(7));
diff --git a/storage/tokudb/mysql-test/tokudb/r/type_blob.result b/storage/tokudb/mysql-test/tokudb/r/type_blob.result
index f3d634d84c1..6acfcf0fed3 100644
--- a/storage/tokudb/mysql-test/tokudb/r/type_blob.result
+++ b/storage/tokudb/mysql-test/tokudb/r/type_blob.result
@@ -234,6 +234,8 @@ HELLO MY
 a
 hello
 set big_tables=1;
+Warnings:
+Warning	1287	'@@big_tables' is deprecated and will be removed in a future release
 select distinct t from t1;
 t
 NULL
@@ -315,6 +317,8 @@ HELLO MY
 a
 hello
 set big_tables=0;
+Warnings:
+Warning	1287	'@@big_tables' is deprecated and will be removed in a future release
 select distinct * from t1;
 t	c	b	d
 NULL	NULL	NULL	NULL
diff --git a/storage/tokudb/mysql-test/tokudb/r/type_datetime.result b/storage/tokudb/mysql-test/tokudb/r/type_datetime.result
index dbe93b37271..61aec6f9ea7 100644
--- a/storage/tokudb/mysql-test/tokudb/r/type_datetime.result
+++ b/storage/tokudb/mysql-test/tokudb/r/type_datetime.result
@@ -105,7 +105,7 @@ date	numfacture	expedition
 0000-00-00 00:00:00	1212	0001-00-00 00:00:00
 EXPLAIN SELECT * FROM t1 WHERE expedition='0001-00-00 00:00:00';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ALL	expedition	NULL	NULL	NULL	2	Using where
+1	SIMPLE	t1	ref	expedition	expedition	5	const	2	
 drop table t1;
 create table t1 (a datetime not null, b datetime not null);
 insert into t1 values (now(), now());
diff --git a/storage/tokudb/mysql-test/tokudb/t/bf_delete.test b/storage/tokudb/mysql-test/tokudb/t/bf_delete.test
index a55d78784cc..cdfb5dbc1a6 100644
--- a/storage/tokudb/mysql-test/tokudb/t/bf_delete.test
+++ b/storage/tokudb/mysql-test/tokudb/t/bf_delete.test
@@ -2,6 +2,8 @@
 
 source include/have_tokudb.inc;
 source include/big_test.inc;
+# ASan causes test to timeout
+source include/not_asan.inc;
 set default_storage_engine='tokudb';
 disable_warnings;
 drop table if exists t;
diff --git a/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_with_lock_sps.result b/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_with_lock_sps.result
index 80402df665d..88f28362119 100644
--- a/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_with_lock_sps.result
+++ b/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_with_lock_sps.result
@@ -20,7 +20,7 @@ z	a	b	c
 999	4	40	400
 explain select * from foo where b > 20;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	5	NULL	4	Using where; Using index
+1	SIMPLE	foo	range	b	b	5	NULL	2	Using where; Using index
 select* from foo where b > 10;
 z	a	b	c
 999	2	20	200
@@ -40,7 +40,7 @@ a	b	c
 4	40	400
 explain select * from foo where b > 20;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	5	NULL	4	Using where; Using index
+1	SIMPLE	foo	range	b	b	5	NULL	2	Using where; Using index
 select* from foo where b > 10;
 a	b	c
 2	20	200
@@ -59,7 +59,7 @@ a	b	c	z
 4	40	400	NULL
 explain select * from foo where b > 20;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	5	NULL	4	Using where; Using index
+1	SIMPLE	foo	range	b	b	5	NULL	2	Using where; Using index
 select* from foo where b > 10;
 a	b	c	z
 2	20	200	NULL
@@ -94,7 +94,7 @@ a	b	c
 3	30	300
 4	40	400
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	5	NULL	4	Using where; Using index
+1	SIMPLE	foo	range	b	b	5	NULL	2	Using where; Using index
 a	b	c
 2	20	200
 3	30	300
@@ -110,7 +110,7 @@ a	b	c
 4	40	400
 explain select * from foo where b > 20;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	5	NULL	4	Using where; Using index
+1	SIMPLE	foo	range	b	b	5	NULL	2	Using where; Using index
 select* from foo where b > 10;
 a	b	c
 2	20	200
@@ -154,7 +154,7 @@ a	b	c	g
 4	40	400	NULL
 5	50	500	NULL
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	5	NULL	5	Using where; Using index
+1	SIMPLE	foo	range	b	b	5	NULL	3	Using where; Using index
 set autocommit=on;
 explain select * from foo;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
diff --git a/storage/tokudb/mysql-test/tokudb_alter_table/r/hcr3.result b/storage/tokudb/mysql-test/tokudb_alter_table/r/hcr3.result
index b7aefaa4b8d..5e044461d84 100644
--- a/storage/tokudb/mysql-test/tokudb_alter_table/r/hcr3.result
+++ b/storage/tokudb/mysql-test/tokudb_alter_table/r/hcr3.result
@@ -43,7 +43,7 @@ foo	CREATE TABLE `foo` (
 ) ENGINE=TokuDB DEFAULT CHARSET=latin1
 explain select bb from foo FORCE INDEX (b) where bb > 5;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	b	b	9	NULL	1	Using where; Using index
+1	SIMPLE	foo	range	b	b	9	NULL	1	Using where; Using index
 select bb from foo FORCE INDEX (b) where bb > 5;
 bb
 10
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/r/2383.result b/storage/tokudb/mysql-test/tokudb_bugs/r/2383.result
index bda2cceb7ba..fce13e39504 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/r/2383.result
+++ b/storage/tokudb/mysql-test/tokudb_bugs/r/2383.result
@@ -10,7 +10,7 @@ create table foo (a char (255), key (a))charset=utf8;
 insert into foo values (repeat('A', 255));
 explain select a from foo where a > "a";
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	foo	index	a	a	766	NULL	1	Using where; Using index
+1	SIMPLE	foo	range	a	a	766	NULL	1	Using where; Using index
 select a from foo where a > "a";
 a
 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_background_job_status.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_background_job_status.result
new file mode 100644
index 00000000000..a4e42a10b57
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_background_job_status.result
@@ -0,0 +1,13 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_BACKGROUND_JOB_STATUS;
+Table	Create Table
+TokuDB_background_job_status	CREATE TEMPORARY TABLE `TokuDB_background_job_status` (
+  `id` bigint(0) NOT NULL DEFAULT 0,
+  `database_name` varchar(256) NOT NULL DEFAULT '',
+  `table_name` varchar(256) NOT NULL DEFAULT '',
+  `job_type` varchar(256) NOT NULL DEFAULT '',
+  `job_params` varchar(256) NOT NULL DEFAULT '',
+  `scheduler` varchar(32) NOT NULL DEFAULT '',
+  `scheduled_time` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
+  `started_time` datetime DEFAULT NULL,
+  `status` varchar(1024) DEFAULT NULL
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_file_map.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_file_map.result
new file mode 100644
index 00000000000..1d82039ebf3
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_file_map.result
@@ -0,0 +1,9 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_FILE_MAP;
+Table	Create Table
+TokuDB_file_map	CREATE TEMPORARY TABLE `TokuDB_file_map` (
+  `dictionary_name` varchar(256) NOT NULL DEFAULT '',
+  `internal_file_name` varchar(256) NOT NULL DEFAULT '',
+  `table_schema` varchar(256) NOT NULL DEFAULT '',
+  `table_name` varchar(256) NOT NULL DEFAULT '',
+  `table_dictionary_name` varchar(256) NOT NULL DEFAULT ''
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_fractal_tree_block_map.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_fractal_tree_block_map.result
new file mode 100644
index 00000000000..a90db7a5b96
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_fractal_tree_block_map.result
@@ -0,0 +1,13 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_FRACTAL_TREE_BLOCK_MAP;
+Table	Create Table
+TokuDB_fractal_tree_block_map	CREATE TEMPORARY TABLE `TokuDB_fractal_tree_block_map` (
+  `dictionary_name` varchar(256) NOT NULL DEFAULT '',
+  `internal_file_name` varchar(256) NOT NULL DEFAULT '',
+  `checkpoint_count` bigint(0) NOT NULL DEFAULT 0,
+  `blocknum` bigint(0) NOT NULL DEFAULT 0,
+  `offset` bigint(0) DEFAULT NULL,
+  `size` bigint(0) DEFAULT NULL,
+  `table_schema` varchar(256) NOT NULL DEFAULT '',
+  `table_name` varchar(256) NOT NULL DEFAULT '',
+  `table_dictionary_name` varchar(256) NOT NULL DEFAULT ''
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_fractal_tree_info.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_fractal_tree_info.result
new file mode 100644
index 00000000000..6b071d35b46
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_fractal_tree_info.result
@@ -0,0 +1,13 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_FRACTAL_TREE_INFO;
+Table	Create Table
+TokuDB_fractal_tree_info	CREATE TEMPORARY TABLE `TokuDB_fractal_tree_info` (
+  `dictionary_name` varchar(256) NOT NULL DEFAULT '',
+  `internal_file_name` varchar(256) NOT NULL DEFAULT '',
+  `bt_num_blocks_allocated` bigint(0) NOT NULL DEFAULT 0,
+  `bt_num_blocks_in_use` bigint(0) NOT NULL DEFAULT 0,
+  `bt_size_allocated` bigint(0) NOT NULL DEFAULT 0,
+  `bt_size_in_use` bigint(0) NOT NULL DEFAULT 0,
+  `table_schema` varchar(256) NOT NULL DEFAULT '',
+  `table_name` varchar(256) NOT NULL DEFAULT '',
+  `table_dictionary_name` varchar(256) NOT NULL DEFAULT ''
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_lock_waits.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_lock_waits.result
new file mode 100644
index 00000000000..b90db36691c
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_lock_waits.result
@@ -0,0 +1,13 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_LOCK_WAITS;
+Table	Create Table
+TokuDB_lock_waits	CREATE TEMPORARY TABLE `TokuDB_lock_waits` (
+  `requesting_trx_id` bigint(0) NOT NULL DEFAULT 0,
+  `blocking_trx_id` bigint(0) NOT NULL DEFAULT 0,
+  `lock_waits_dname` varchar(256) NOT NULL DEFAULT '',
+  `lock_waits_key_left` varchar(256) NOT NULL DEFAULT '',
+  `lock_waits_key_right` varchar(256) NOT NULL DEFAULT '',
+  `lock_waits_start_time` bigint(0) NOT NULL DEFAULT 0,
+  `lock_waits_table_schema` varchar(256) NOT NULL DEFAULT '',
+  `lock_waits_table_name` varchar(256) NOT NULL DEFAULT '',
+  `lock_waits_table_dictionary_name` varchar(256) NOT NULL DEFAULT ''
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_locks.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_locks.result
new file mode 100644
index 00000000000..2c1084643d8
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_locks.result
@@ -0,0 +1,12 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_LOCKS;
+Table	Create Table
+TokuDB_locks	CREATE TEMPORARY TABLE `TokuDB_locks` (
+  `locks_trx_id` bigint(0) NOT NULL DEFAULT 0,
+  `locks_mysql_thread_id` bigint(0) NOT NULL DEFAULT 0,
+  `locks_dname` varchar(256) NOT NULL DEFAULT '',
+  `locks_key_left` varchar(256) NOT NULL DEFAULT '',
+  `locks_key_right` varchar(256) NOT NULL DEFAULT '',
+  `locks_table_schema` varchar(256) NOT NULL DEFAULT '',
+  `locks_table_name` varchar(256) NOT NULL DEFAULT '',
+  `locks_table_dictionary_name` varchar(256) NOT NULL DEFAULT ''
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_trx.result b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_trx.result
new file mode 100644
index 00000000000..94c1b0c7642
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/r/tokudb_trx.result
@@ -0,0 +1,7 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_TRX;
+Table	Create Table
+TokuDB_trx	CREATE TEMPORARY TABLE `TokuDB_trx` (
+  `trx_id` bigint(0) NOT NULL DEFAULT 0,
+  `trx_mysql_thread_id` bigint(0) NOT NULL DEFAULT 0,
+  `trx_time` bigint(0) NOT NULL DEFAULT 0
+) ENGINE=MEMORY DEFAULT CHARSET=utf8
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/suite.opt b/storage/tokudb/mysql-test/tokudb_i_s/suite.opt
new file mode 100644
index 00000000000..ea8042b7740
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/suite.opt
@@ -0,0 +1 @@
+--tokudb --plugin-load-add=$HA_TOKUDB_SO --loose-tokudb-check-jemalloc=0
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/suite.pm b/storage/tokudb/mysql-test/tokudb_i_s/suite.pm
new file mode 100644
index 00000000000..a6e01cd6dd4
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/suite.pm
@@ -0,0 +1,18 @@
+# Suite definition for the tokudb_i_s tests.
+# The file's value is the blessed object at the bottom, or the string
+# "No TokuDB engine" when neither HA_TOKUDB_SO nor the tokudb variable is set.
+package My::Suite::TokuDB_i_s;
+use File::Basename;
+@ISA = qw(My::Suite);
+
+# Ensure we can run the TokuDB tests even if hugepages are enabled
+$ENV{TOKU_HUGE_PAGES_OK}=1;
+
+#return "Not run for embedded server" if $::opt_embedded_server;
+return "No TokuDB engine" unless $ENV{HA_TOKUDB_SO} or $::mysqld_variables{tokudb};
+
+# True unless the embedded server is being tested.
+sub is_default { not $::opt_embedded_server }
+
+bless { };
+
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_background_job_status.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_background_job_status.test
new file mode 100644
index 00000000000..7d52e902b03
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_background_job_status.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_BACKGROUND_JOB_STATUS;
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_file_map.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_file_map.test
new file mode 100644
index 00000000000..ef45fa58651
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_file_map.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_FILE_MAP;
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_fractal_tree_block_map.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_fractal_tree_block_map.test
new file mode 100644
index 00000000000..655714c31d4
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_fractal_tree_block_map.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_FRACTAL_TREE_BLOCK_MAP;
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_fractal_tree_info.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_fractal_tree_info.test
new file mode 100644
index 00000000000..d05d20a58f3
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_fractal_tree_info.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_FRACTAL_TREE_INFO;
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_lock_waits.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_lock_waits.test
new file mode 100644
index 00000000000..5a43ec908ef
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_lock_waits.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_LOCK_WAITS;
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_locks.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_locks.test
new file mode 100644
index 00000000000..7975c2f9e6c
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_locks.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_LOCKS;
diff --git a/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_trx.test b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_trx.test
new file mode 100644
index 00000000000..adb4b631809
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_i_s/t/tokudb_trx.test
@@ -0,0 +1 @@
+SHOW CREATE TABLE INFORMATION_SCHEMA.TOKUDB_TRX;
diff --git a/storage/tokudb/mysql-test/tokudb_mariadb/r/mrr.result b/storage/tokudb/mysql-test/tokudb_mariadb/r/mrr.result
index 50e5b1940b0..0dd05170252 100644
--- a/storage/tokudb/mysql-test/tokudb_mariadb/r/mrr.result
+++ b/storage/tokudb/mysql-test/tokudb_mariadb/r/mrr.result
@@ -33,7 +33,7 @@ explain
 SELECT t3.task_id, t3.field  FROM
 t3,t2 WHERE    t3.task_id=t2.task_id AND   t2.type NOT IN (8,11);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	index	PRIMARY,ymtasks_type	ymtasks_type	2	NULL	2	Using where; Using index
+1	SIMPLE	t2	range	PRIMARY,ymtasks_type	ymtasks_type	2	NULL	2	Using where; Using index
 1	SIMPLE	t3	ref	ymtasksoptions_task	ymtasksoptions_task	4	test.t2.task_id	1	
 SELECT t3.task_id, t3.field  FROM
 t3,t2 WHERE    t3.task_id=t2.task_id AND   t2.type NOT IN (8,11);
@@ -84,7 +84,7 @@ explain
 SELECT t3.task_id, t3.field  FROM
 t3,t2 WHERE    t3.task_id=t2.task_id AND   t2.type NOT IN (8,11);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	index	PRIMARY,ymtasks_type	ymtasks_type	2	NULL	2	Using where; Using index
+1	SIMPLE	t2	range	PRIMARY,ymtasks_type	ymtasks_type	2	NULL	2	Using where; Using index
 1	SIMPLE	t3	ref	ymtasksoptions_task	ymtasksoptions_task	4	test.t2.task_id	1	Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
 SELECT t3.task_id, t3.field  FROM
 t3,t2 WHERE    t3.task_id=t2.task_id AND   t2.type NOT IN (8,11);
diff --git a/storage/tokudb/mysql-test/tokudb_mariadb/r/xa.result b/storage/tokudb/mysql-test/tokudb_mariadb/r/xa.result
index 4724a0af926..34233b6fd8d 100644
--- a/storage/tokudb/mysql-test/tokudb_mariadb/r/xa.result
+++ b/storage/tokudb/mysql-test/tokudb_mariadb/r/xa.result
@@ -65,4 +65,5 @@ a
 20
 disconnect con1;
 connection default;
+xa rollback 'testb',0x2030405060,11;
 drop table t1;
diff --git a/storage/tokudb/mysql-test/tokudb_mariadb/t/xa.test b/storage/tokudb/mysql-test/tokudb_mariadb/t/xa.test
index dc5520a39b8..a6be07963f5 100644
--- a/storage/tokudb/mysql-test/tokudb_mariadb/t/xa.test
+++ b/storage/tokudb/mysql-test/tokudb_mariadb/t/xa.test
@@ -68,6 +68,9 @@ xa start 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz';
 select * from t1;
 
 disconnect con1;
+xa recover;
+
 connection default;
+xa rollback 'testb',0x2030405060,11;
 drop table t1;
 
diff --git a/storage/tokudb/mysql-test/tokudb_parts/t/partition_alter4_tokudb.test b/storage/tokudb/mysql-test/tokudb_parts/t/partition_alter4_tokudb.test
index 93a3a0cbdce..e450574634b 100644
--- a/storage/tokudb/mysql-test/tokudb_parts/t/partition_alter4_tokudb.test
+++ b/storage/tokudb/mysql-test/tokudb_parts/t/partition_alter4_tokudb.test
@@ -49,6 +49,8 @@ let $more_pk_ui_tests= 0;
 --source include/big_test.inc
 # Skiping this test from Valgrind execution as per Bug-14627884
 --source include/not_valgrind.inc
+# Don't run with asan as it causes timeouts
+--source include/not_asan.inc
 
 #------------------------------------------------------------------------------#
 # Engine specific settings and requirements
diff --git a/storage/tokudb/tokudb_dir_cmd.cc b/storage/tokudb/tokudb_dir_cmd.cc
index d0da92eab27..871bb712406 100644
--- a/storage/tokudb/tokudb_dir_cmd.cc
+++ b/storage/tokudb/tokudb_dir_cmd.cc
@@ -67,7 +67,7 @@ static int MDL_and_TDC(THD *thd,
                          table);
         return error;
     }
-    tdc_remove_table(thd, TDC_RT_REMOVE_ALL, db, table, false);
+    tdc_remove_table(thd, db, table);
     return error;
 }
 
diff --git a/storage/tokudb/tokudb_information_schema.cc b/storage/tokudb/tokudb_information_schema.cc
index 0b9882060cd..fef72557b18 100644
--- a/storage/tokudb/tokudb_information_schema.cc
+++ b/storage/tokudb/tokudb_information_schema.cc
@@ -56,11 +56,17 @@ st_mysql_information_schema trx_information_schema = {
     MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
 };
 
+using ::Show::Column;  // Show:: helpers used by the *_field_info tables in this file
+using ::Show::SLonglong;
+using ::Show::CEnd;
+using ::Show::Varchar;
+using ::Show::Datetime;
+
 ST_FIELD_INFO trx_field_info[] = {
-    {"trx_id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"trx_mysql_thread_id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"trx_time", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+    Column("trx_id",              SLonglong(0), NOT_NULL),  // was MYSQL_TYPE_LONGLONG, length 0
+    Column("trx_mysql_thread_id", SLonglong(0), NOT_NULL),
+    Column("trx_time",            SLonglong(0), NOT_NULL),
+    CEnd()  // end-of-list entry; replaces the old {NULL, ..., MYSQL_TYPE_NULL, ...} row
 };
 
 struct trx_extra_t {
@@ -152,16 +158,16 @@ st_mysql_information_schema lock_waits_information_schema = {
 };
 
 ST_FIELD_INFO lock_waits_field_info[] = {
-    {"requesting_trx_id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"blocking_trx_id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_dname", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_key_left", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_key_right", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_start_time", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_table_schema", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"lock_waits_table_dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+    Column("requesting_trx_id",                SLonglong(0), NOT_NULL),  // was MYSQL_TYPE_LONGLONG
+    Column("blocking_trx_id",                  SLonglong(0), NOT_NULL),
+    Column("lock_waits_dname",                 Varchar(256), NOT_NULL),  // was MYSQL_TYPE_STRING, 256
+    Column("lock_waits_key_left",              Varchar(256), NOT_NULL),
+    Column("lock_waits_key_right",             Varchar(256), NOT_NULL),
+    Column("lock_waits_start_time",            SLonglong(0), NOT_NULL),
+    Column("lock_waits_table_schema",          Varchar(256), NOT_NULL),
+    Column("lock_waits_table_name",            Varchar(256), NOT_NULL),
+    Column("lock_waits_table_dictionary_name", Varchar(256), NOT_NULL),
+    CEnd()  // end-of-list entry; replaces the old MYSQL_TYPE_NULL row
 };
 
 struct lock_waits_extra_t {
@@ -293,16 +299,16 @@ st_mysql_information_schema locks_information_schema = {
     MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
 };
 
- ST_FIELD_INFO locks_field_info[] = {
-    {"locks_trx_id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_mysql_thread_id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_dname", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_key_left", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_key_right", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_table_schema", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"locks_table_dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+ST_FIELD_INFO locks_field_info[] = {
+    Column("locks_trx_id",                SLonglong(0), NOT_NULL),  // was MYSQL_TYPE_LONGLONG
+    Column("locks_mysql_thread_id",       SLonglong(0), NOT_NULL),
+    Column("locks_dname",                 Varchar(256), NOT_NULL),  // was MYSQL_TYPE_STRING, 256
+    Column("locks_key_left",              Varchar(256), NOT_NULL),
+    Column("locks_key_right",             Varchar(256), NOT_NULL),
+    Column("locks_table_schema",          Varchar(256), NOT_NULL),
+    Column("locks_table_name",            Varchar(256), NOT_NULL),
+    Column("locks_table_dictionary_name", Varchar(256), NOT_NULL),
+    CEnd()  // end-of-list entry; replaces the old MYSQL_TYPE_NULL row
 };
 
 struct locks_extra_t {
@@ -434,12 +440,12 @@ st_mysql_information_schema file_map_information_schema = {
 };
 
 ST_FIELD_INFO file_map_field_info[] = {
-    {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_schema", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+    Column("dictionary_name",       Varchar(256), NOT_NULL),  // was MYSQL_TYPE_STRING, 256
+    Column("internal_file_name",    Varchar(256), NOT_NULL),
+    Column("table_schema",          Varchar(256), NOT_NULL),
+    Column("table_name",            Varchar(256), NOT_NULL),
+    Column("table_dictionary_name", Varchar(256), NOT_NULL),
+    CEnd()  // end-of-list entry; replaces the old MYSQL_TYPE_NULL row
 };
 
 int report_file_map(TABLE* table, THD* thd) {
@@ -581,16 +587,16 @@ st_mysql_information_schema fractal_tree_info_information_schema = {
 };
 
 ST_FIELD_INFO fractal_tree_info_field_info[] = {
-    {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"bt_num_blocks_allocated", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"bt_num_blocks_in_use", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"bt_size_allocated", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"bt_size_in_use", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_schema", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+    Column("dictionary_name",         Varchar(256), NOT_NULL),  // was MYSQL_TYPE_STRING, 256
+    Column("internal_file_name",      Varchar(256), NOT_NULL),
+    Column("bt_num_blocks_allocated", SLonglong(0), NOT_NULL),  // was MYSQL_TYPE_LONGLONG
+    Column("bt_num_blocks_in_use",    SLonglong(0), NOT_NULL),
+    Column("bt_size_allocated",       SLonglong(0), NOT_NULL),
+    Column("bt_size_in_use",          SLonglong(0), NOT_NULL),
+    Column("table_schema",            Varchar(256), NOT_NULL),
+    Column("table_name",              Varchar(256), NOT_NULL),
+    Column("table_dictionary_name",   Varchar(256), NOT_NULL),
+    CEnd()  // end-of-list entry; replaces the old MYSQL_TYPE_NULL row
 };
 
 int report_fractal_tree_info_for_db(
@@ -793,16 +799,16 @@ st_mysql_information_schema fractal_tree_block_map_information_schema = {
 };
 
 ST_FIELD_INFO fractal_tree_block_map_field_info[] = {
-    {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"checkpoint_count", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"blocknum", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"offset", 0, MYSQL_TYPE_LONGLONG, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE },
-    {"size", 0, MYSQL_TYPE_LONGLONG, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE },
-    {"table_schema", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+    Column("dictionary_name",       Varchar(256), NOT_NULL),
+    Column("internal_file_name",    Varchar(256), NOT_NULL),
+    Column("checkpoint_count",      SLonglong(0), NOT_NULL),
+    Column("blocknum",              SLonglong(0), NOT_NULL),
+    Column("offset",                SLonglong(0), NULLABLE),  // was flagged MY_I_S_MAYBE_NULL
+    Column("size",                  SLonglong(0), NULLABLE),  // was flagged MY_I_S_MAYBE_NULL
+    Column("table_schema",          Varchar(256), NOT_NULL),
+    Column("table_name",            Varchar(256), NOT_NULL),
+    Column("table_dictionary_name", Varchar(256), NOT_NULL),
+    CEnd()  // end-of-list entry; replaces the old MYSQL_TYPE_NULL row
 };
 
 struct report_fractal_tree_block_map_iterator_extra_t {
@@ -1089,16 +1095,16 @@ st_mysql_information_schema background_job_status_information_schema = {
 };
 
 ST_FIELD_INFO background_job_status_field_info[] = {
-    {"id", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"database_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"job_type", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"job_params", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"scheduler", 32, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"scheduled_time", 0, MYSQL_TYPE_DATETIME, 0, 0, NULL, SKIP_OPEN_TABLE },
-    {"started_time", 0, MYSQL_TYPE_DATETIME, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE },
-    {"status", 1024, MYSQL_TYPE_STRING, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE },
-    {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
+    Column("id",             SLonglong(0),  NOT_NULL),
+    Column("database_name",  Varchar(256),  NOT_NULL),
+    Column("table_name",     Varchar(256),  NOT_NULL),
+    Column("job_type",       Varchar(256),  NOT_NULL),
+    Column("job_params",     Varchar(256),  NOT_NULL),
+    Column("scheduler",      Varchar(32),   NOT_NULL),
+    Column("scheduled_time", Datetime(0),   NOT_NULL),  // was MYSQL_TYPE_DATETIME, length 0
+    Column("started_time",   Datetime(0),   NULLABLE),  // was flagged MY_I_S_MAYBE_NULL
+    Column("status",         Varchar(1024), NULLABLE),  // was flagged MY_I_S_MAYBE_NULL
+    CEnd()  // end-of-list entry; replaces the old MYSQL_TYPE_NULL row
 };
 
 struct background_job_status_extra {
diff --git a/storage/tokudb/tokudb_memory.h b/storage/tokudb/tokudb_memory.h
index 2687c1cda8e..a12db83e022 100644
--- a/storage/tokudb/tokudb_memory.h
+++ b/storage/tokudb/tokudb_memory.h
@@ -40,31 +40,19 @@ void* multi_malloc(myf myFlags, ...);
 
 
 inline void* malloc(size_t s, myf flags) {
-#if 50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799
     return ::my_malloc(0, s, flags);
-#else
-    return ::my_malloc(s, flags);
-#endif
 }
 inline void* realloc(void* p, size_t s, myf flags) {
     if (s == 0)
         return p;
-#if 50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799
-    return ::my_realloc(0, p, s, flags);
-#else
-    return ::my_realloc(p, s, flags | MY_ALLOW_ZERO_PTR);
-#endif
+    return ::my_realloc(0, p, s, flags | MY_ALLOW_ZERO_PTR); // as removed #else did: NULL p ok
 }
 inline void free(void* ptr) {
     if (ptr)
         ::my_free(ptr);
 }
 inline char* strdup(const char* p, myf flags) {
-#if 50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799
     return ::my_strdup(0, p, flags);
-#else
-    return ::my_strdup(p, flags);
-#endif
 }
 inline void* multi_malloc(myf myFlags, ...) {
     va_list args;